From d487ef8b04cc7f5ac1491f0638f902fe2abe5ac5 Mon Sep 17 00:00:00 2001
From: Gustaf Rydholm
Date: Mon, 8 Apr 2024 22:28:47 +0200
Subject: Wip refactor

---
 rag/parser/pdf.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'rag/parser')

diff --git a/rag/parser/pdf.py b/rag/parser/pdf.py
index cbd86a3..ca9b72d 100644
--- a/rag/parser/pdf.py
+++ b/rag/parser/pdf.py
@@ -1,6 +1,6 @@
 import os
 from pathlib import Path
-from typing import Iterator
+from typing import Iterator, List, Optional
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.documents import Document
@@ -15,15 +15,20 @@ class PDFParser:
         self.parser = PyPDFParser(password=None, extract_images=False)
 
     def from_data(self, blob: Blob) -> Iterator[Document]:
-        yield from self.parser.parse(blob)
+        return self.parser.parse(blob)
 
     def from_path(self, path: Path) -> Iterator[Document]:
         return Blob.from_path(path)
 
-    def chunk(self, content: Iterator[Document]):
+    def chunk(
+        self, document: Iterator[Document], source: Optional[str] = None
+    ) -> List[Document]:
         splitter = RecursiveCharacterTextSplitter(
             chunk_size=int(os.environ["CHUNK_SIZE"]),
             chunk_overlap=int(os.environ["CHUNK_OVERLAP"]),
         )
-        chunks = splitter.split_documents(content)
+        chunks = splitter.split_documents(document)
+        if source is not None:
+            for c in chunks:
+                c.metadata["source"] = source
         return chunks
-- 
cgit v1.2.3-70-g09d2