diff options
Diffstat (limited to 'rag/retriever/parser/pdf.py')
-rw-r--r-- | rag/retriever/parser/pdf.py | 34 |
1 file changed, 34 insertions, 0 deletions
"""PDF parsing and chunking helpers built on LangChain's ``PyPDFParser``."""

import os
from pathlib import Path
from typing import List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders.parsers.pdf import (
    PyPDFParser,
)
from langchain_community.document_loaders.blob_loaders import Blob


class PDFParser:
    """Turn PDF blobs into LangChain documents and split them into chunks.

    The instance holds a single ``PyPDFParser`` in ``self.parser``,
    configured with no password and with image extraction disabled.
    """

    def __init__(self) -> None:
        """Create the underlying ``PyPDFParser``."""
        self.parser = PyPDFParser(password=None, extract_images=False)

    def from_data(self, blob: Blob) -> List[Document]:
        """Parse *blob* with the wrapped parser and return its documents."""
        documents = self.parser.parse(blob)
        return documents

    def from_path(self, path: Path) -> Blob:
        """Build a ``Blob`` for the file at *path* via ``Blob.from_path``.

        Note that this returns the blob itself, not parsed documents; pass
        the result to :meth:`from_data` to parse it.
        """
        blob = Blob.from_path(path)
        return blob

    def chunk(
        self, document: List[Document], source: Optional[str] = None
    ) -> List[Document]:
        """Split *document* into chunks, optionally tagging their source.

        The splitter is configured from the ``CHUNK_SIZE`` and
        ``CHUNK_OVERLAP`` environment variables, which are read on every
        call.

        Args:
            document: Documents to split.
            source: When not ``None``, written to ``metadata["source"]`` of
                every resulting chunk.

        Returns:
            The chunks produced by the splitter.

        Raises:
            KeyError: If ``CHUNK_SIZE`` or ``CHUNK_OVERLAP`` is not set.
            ValueError: If either variable is not a valid integer.
        """
        # Read size before overlap so a missing/invalid CHUNK_SIZE is
        # reported first.
        size = int(os.environ["CHUNK_SIZE"])
        overlap = int(os.environ["CHUNK_OVERLAP"])
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=size,
            chunk_overlap=overlap,
        )

        pieces = splitter.split_documents(document)
        if source is None:
            return pieces

        for piece in pieces:
            piece.metadata["source"] = source
        return pieces