diff options
Diffstat (limited to 'rag/parser')
-rw-r--r-- | rag/parser/pdf.py | 34 |
1 files changed, 25 insertions, 9 deletions
class PDF:
    """Parse PDF blobs into documents and split those documents into chunks.

    Every blob is first handed to ``DocumentDB.add``; it is parsed only when
    that call returns a truthy value.
    """

    def __init__(self) -> None:
        """Create the document database handle and the PDF parser."""
        self.db = DocumentDB()
        # No password is supplied and embedded images are not extracted.
        self.parser = PyPDFParser(password=None, extract_images=False)

    def from_data(self, blob: "Blob") -> Iterator["Document"]:
        """Yield the documents parsed from *blob*.

        This is a generator: nothing happens (including the call to
        ``self.db.add``) until the result is iterated.

        Args:
            blob: The PDF content, passed unchanged to ``self.db.add`` and
                ``self.parser.parse``.

        Yields:
            Each document produced by ``self.parser.parse(blob)``. Nothing is
            yielded when ``self.db.add(blob)`` returns a falsy value
            (presumably meaning the blob is already stored -- confirm against
            ``DocumentDB.add``).
        """
        if not self.db.add(blob):
            return
        # Only real documents are emitted; a trailing ``None`` sentinel would
        # end up inside whatever consumes the stream (e.g. ``chunk``).
        yield from self.parser.parse(blob)

    def from_path(self, file_path: Path) -> Iterator["Document"]:
        """Load the file at *file_path* as a blob and parse it.

        Args:
            file_path: Location of the PDF file.

        Returns:
            The generator returned by ``from_data`` for the loaded blob.
        """
        # Local import: ``Blob`` is not among the module-level imports.
        from langchain_community.document_loaders.blob_loaders import Blob

        return self.from_data(Blob.from_path(file_path))

    def chunk(self, content: Iterator["Document"]):
        """Split *content* with a ``RecursiveCharacterTextSplitter``.

        The splitter is configured on every call from the ``CHUNK_SIZE`` and
        ``CHUNK_OVERLAP`` environment variables.

        Args:
            content: The documents to split.

        Returns:
            The result of ``splitter.split_documents(content)``.

        Raises:
            KeyError: If ``CHUNK_SIZE`` or ``CHUNK_OVERLAP`` is not set.
            ValueError: If either variable is not a valid integer.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=int(os.environ["CHUNK_SIZE"]),
            chunk_overlap=int(os.environ["CHUNK_OVERLAP"]),
        )
        return splitter.split_documents(content)