summary | refs | log | tree | commit | diff
path: root/rag/retriever/parser
diff options
context:
space:
mode:
author: Gustaf Rydholm <gustaf.rydholm@gmail.com> 2025-08-25 00:06:19 +0200
committer: Gustaf Rydholm <gustaf.rydholm@gmail.com> 2025-08-25 00:06:19 +0200
commit: 28a1f5d4eddab6eb7c9ca77346c6fa9608856dd5 (patch)
tree: 563ffd32f1a6f5705c1fbf6230d5d226fd0e0e48 /rag/retriever/parser
parent: 6afba9079eebe867ac4f1b6073b5277513e7491b (diff)
Broken state (HEAD, master)
Diffstat (limited to 'rag/retriever/parser')
-rw-r--r-- rag/retriever/parser/pdf.py 6
1 files changed, 4 insertions, 2 deletions
diff --git a/rag/retriever/parser/pdf.py b/rag/retriever/parser/pdf.py
index 4c5addc..3253dc1 100644
--- a/rag/retriever/parser/pdf.py
+++ b/rag/retriever/parser/pdf.py
@@ -8,8 +8,10 @@ from langchain_community.document_loaders.parsers.pdf import (
PyPDFParser,
)
from langchain_core.documents import Document
+from rag.retriever.encoder import Chunks
+# TODO: fix the PDFParser, remove langchain
class PDFParser:
def __init__(self) -> None:
self.parser = PyPDFParser(password=None, extract_images=False)
@@ -22,7 +24,7 @@ class PDFParser:
def chunk(
self, document: List[Document], source: Optional[str] = None
- ) -> List[Document]:
+ ) -> Chunks:
splitter = RecursiveCharacterTextSplitter(
chunk_size=int(os.environ["CHUNK_SIZE"]),
chunk_overlap=int(os.environ["CHUNK_OVERLAP"]),
@@ -31,4 +33,4 @@ class PDFParser:
if source is not None:
for c in chunks:
c.metadata["source"] = source
- return chunks
+ return Chunks(chunks)