diff options
author | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2025-08-25 00:06:19 +0200 |
---|---|---|
committer | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2025-08-25 00:06:19 +0200 |
commit | 28a1f5d4eddab6eb7c9ca77346c6fa9608856dd5 (patch) | |
tree | 563ffd32f1a6f5705c1fbf6230d5d226fd0e0e48 /rag/retriever/parser/pdf.py | |
parent | 6afba9079eebe867ac4f1b6073b5277513e7491b (diff) |
Diffstat (limited to 'rag/retriever/parser/pdf.py')
-rw-r--r-- | rag/retriever/parser/pdf.py | 6 |
1 file changed, 4 insertions, 2 deletions
diff --git a/rag/retriever/parser/pdf.py b/rag/retriever/parser/pdf.py index 4c5addc..3253dc1 100644 --- a/rag/retriever/parser/pdf.py +++ b/rag/retriever/parser/pdf.py @@ -8,8 +8,10 @@ from langchain_community.document_loaders.parsers.pdf import ( PyPDFParser, ) from langchain_core.documents import Document +from rag.retriever.encoder import Chunks +# TODO: fix the PDFParser, remove langchain class PDFParser: def __init__(self) -> None: self.parser = PyPDFParser(password=None, extract_images=False) @@ -22,7 +24,7 @@ class PDFParser: def chunk( self, document: List[Document], source: Optional[str] = None - ) -> List[Document]: + ) -> Chunks: splitter = RecursiveCharacterTextSplitter( chunk_size=int(os.environ["CHUNK_SIZE"]), chunk_overlap=int(os.environ["CHUNK_OVERLAP"]), @@ -31,4 +33,4 @@ class PDFParser: if source is not None: for c in chunks: c.metadata["source"] = source - return chunks + return Chunks(chunks) |