diff options
| author | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2025-08-25 00:06:19 +0200 | 
|---|---|---|
| committer | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2025-08-25 00:06:19 +0200 | 
| commit | 28a1f5d4eddab6eb7c9ca77346c6fa9608856dd5 (patch) | |
| tree | 563ffd32f1a6f5705c1fbf6230d5d226fd0e0e48 /rag/retriever/parser | |
| parent | 6afba9079eebe867ac4f1b6073b5277513e7491b (diff) | |
Diffstat (limited to 'rag/retriever/parser')
| -rw-r--r-- | rag/retriever/parser/pdf.py | 6 | 
1 files changed, 4 insertions, 2 deletions
diff --git a/rag/retriever/parser/pdf.py b/rag/retriever/parser/pdf.py index 4c5addc..3253dc1 100644 --- a/rag/retriever/parser/pdf.py +++ b/rag/retriever/parser/pdf.py @@ -8,8 +8,10 @@ from langchain_community.document_loaders.parsers.pdf import (      PyPDFParser,  )  from langchain_core.documents import Document +from rag.retriever.encoder import Chunks +# TODO: fix the PDFParser, remove langchain  class PDFParser:      def __init__(self) -> None:          self.parser = PyPDFParser(password=None, extract_images=False) @@ -22,7 +24,7 @@ class PDFParser:      def chunk(          self, document: List[Document], source: Optional[str] = None -    ) -> List[Document]: +    ) -> Chunks:          splitter = RecursiveCharacterTextSplitter(              chunk_size=int(os.environ["CHUNK_SIZE"]),              chunk_overlap=int(os.environ["CHUNK_OVERLAP"]), @@ -31,4 +33,4 @@ class PDFParser:          if source is not None:              for c in chunks:                  c.metadata["source"] = source -        return chunks +        return Chunks(chunks)  |