From 28a1f5d4eddab6eb7c9ca77346c6fa9608856dd5 Mon Sep 17 00:00:00 2001
From: Gustaf Rydholm
Date: Mon, 25 Aug 2025 00:06:19 +0200
Subject: Broken state

---
 rag/retriever/parser/pdf.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'rag/retriever/parser/pdf.py')

diff --git a/rag/retriever/parser/pdf.py b/rag/retriever/parser/pdf.py
index 4c5addc..3253dc1 100644
--- a/rag/retriever/parser/pdf.py
+++ b/rag/retriever/parser/pdf.py
@@ -8,8 +8,10 @@ from langchain_community.document_loaders.parsers.pdf import (
     PyPDFParser,
 )
 from langchain_core.documents import Document
+from rag.retriever.encoder import Chunks
 
 
+# TODO: fix the PDFParser, remove langchain
 class PDFParser:
     def __init__(self) -> None:
         self.parser = PyPDFParser(password=None, extract_images=False)
@@ -22,7 +24,7 @@ class PDFParser:
 
     def chunk(
         self, document: List[Document], source: Optional[str] = None
-    ) -> List[Document]:
+    ) -> Chunks:
         splitter = RecursiveCharacterTextSplitter(
             chunk_size=int(os.environ["CHUNK_SIZE"]),
             chunk_overlap=int(os.environ["CHUNK_OVERLAP"]),
@@ -31,4 +33,4 @@ class PDFParser:
         if source is not None:
             for c in chunks:
                 c.metadata["source"] = source
-        return chunks
+        return Chunks(chunks)
-- 
cgit v1.2.3-70-g09d2