summary | refs | log | tree | commit | diff
path: root/rag/parser
diff options
context:
space:
mode:
Diffstat (limited to 'rag/parser')
-rw-r--r-- rag/parser/pdf.py | 13
1 files changed, 9 insertions, 4 deletions
diff --git a/rag/parser/pdf.py b/rag/parser/pdf.py
index cbd86a3..ca9b72d 100644
--- a/rag/parser/pdf.py
+++ b/rag/parser/pdf.py
@@ -1,6 +1,6 @@
import os
from pathlib import Path
-from typing import Iterator
+from typing import Iterator, List, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
@@ -15,15 +15,20 @@ class PDFParser:
self.parser = PyPDFParser(password=None, extract_images=False)
def from_data(self, blob: Blob) -> Iterator[Document]:
- yield from self.parser.parse(blob)
+ return self.parser.parse(blob)
def from_path(self, path: Path) -> Iterator[Document]:
return Blob.from_path(path)
- def chunk(self, content: Iterator[Document]):
+ def chunk(
+ self, document: Iterator[Document], source: Optional[str] = None
+ ) -> List[Document]:
splitter = RecursiveCharacterTextSplitter(
chunk_size=int(os.environ["CHUNK_SIZE"]),
chunk_overlap=int(os.environ["CHUNK_OVERLAP"]),
)
- chunks = splitter.split_documents(content)
+ chunks = splitter.split_documents(document)
+ if source is not None:
+ for c in chunks:
+ c.metadata["source"] = source
return chunks