"""PDF parsing and chunking helpers."""

import os
from pathlib import Path
from typing import Iterator, List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import (
    PyPDFParser,
)
from langchain_core.documents import Document

from rag.db.document import DocumentDB


class PDF:
    """Parse PDF blobs into documents and split them into chunks.

    A blob is only parsed when ``DocumentDB.add`` returns a truthy value
    for it.
    NOTE(review): presumably ``add`` reports whether the blob was newly
    stored (i.e. not seen before) -- confirm against ``rag.db.document``.
    """

    def __init__(self) -> None:
        self.db = DocumentDB()
        self.parser = PyPDFParser(password=None, extract_images=False)

    def from_data(self, blob) -> Iterator[Document]:
        """Yield the documents parsed from ``blob``.

        Args:
            blob: The PDF content, in the form accepted by both
                ``DocumentDB.add`` and ``PyPDFParser.parse``.

        Yields:
            One ``Document`` per item returned by the parser. Nothing is
            yielded when ``self.db.add(blob)`` is falsy, so the result can
            always be passed straight to :meth:`chunk`.
        """
        # Guard clause: skip parsing entirely when the DB rejects the blob.
        # The generator then simply ends, giving callers an empty iterator
        # instead of a stray ``None`` element mixed in with the documents.
        if not self.db.add(blob):
            return
        yield from self.parser.parse(blob)

    def from_path(self, file_path: Path) -> Iterator[Document]:
        """Load the file at ``file_path`` as a blob and parse it.

        Args:
            file_path: Location of the PDF file.

        Returns:
            The iterator produced by :meth:`from_data` for that file.
        """
        blob = Blob.from_path(file_path)
        return self.from_data(blob)

    def chunk(self, content: Iterator[Document]) -> List[Document]:
        """Split ``content`` into chunks with a recursive character splitter.

        The splitter is configured from the ``CHUNK_SIZE`` and
        ``CHUNK_OVERLAP`` environment variables.

        Args:
            content: Documents to split, e.g. the output of
                :meth:`from_data` or :meth:`from_path`.

        Returns:
            The chunks produced by ``split_documents``.

        Raises:
            KeyError: If ``CHUNK_SIZE`` or ``CHUNK_OVERLAP`` is not set.
            ValueError: If either variable is not a valid integer.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=int(os.environ["CHUNK_SIZE"]),
            chunk_overlap=int(os.environ["CHUNK_OVERLAP"]),
        )
        return splitter.split_documents(content)