summary refs log tree commit diff
path: root/rag/parser
diff options
context:
space:
mode:
Diffstat (limited to 'rag/parser')
-rw-r--r-- rag/parser/pdf.py 34
1 files changed, 25 insertions, 9 deletions
diff --git a/rag/parser/pdf.py b/rag/parser/pdf.py
index 1680a47..22fc4e0 100644
--- a/rag/parser/pdf.py
+++ b/rag/parser/pdf.py
@@ -1,18 +1,34 @@
import os
from pathlib import Path
+from typing import Iterator, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
+from langchain_core.documents import Document
+from langchain_community.document_loaders.parsers.pdf import (
+ PyPDFParser,
+)
+from rag.db.document import DocumentDB
-def parser(filepath: Path):
- content = PyPDFLoader(filepath).load()
- splitter = RecursiveCharacterTextSplitter(
- chunk_size=int(os.environ["CHUNK_SIZE"]),
- chunk_overlap=int(os.environ["CHUNK_OVERLAP"]),
- )
- chunks = splitter.split_documents(content)
- return chunks
+class PDF:
+    """Parse PDF blobs into documents and split them into chunks.
+
+    Each blob is first passed to ``DocumentDB.add``; it is parsed only when
+    that call returns a truthy value.
+    """
+
+    def __init__(self) -> None:
+        """Create the document database handle and the PDF parser."""
+        self.db = DocumentDB()
+        self.parser = PyPDFParser(password=None, extract_images=False)
+
+    def from_data(self, blob) -> Iterator[Document]:
+        """Yield the documents parsed from ``blob``.
+
+        Nothing is yielded when ``self.db.add(blob)`` returns a falsy value
+        (presumably the blob is already stored -- confirm against DocumentDB).
+        This is a generator, so neither the database call nor the parsing
+        happens until the result is iterated.
+        """
+        # No trailing ``yield None``: consumers such as ``chunk`` expect every
+        # item to be a Document.
+        if self.db.add(blob):
+            yield from self.parser.parse(blob)
+
-# TODO: add parser for bytearray
+    def from_path(self, file_path: Path) -> Iterator[Document]:
+        """Load the file at ``file_path`` as a blob and parse it.
+
+        Returns the same lazy iterator of documents as ``from_data``.
+        """
+        # Imported here because the module's import block does not provide
+        # ``Blob``.
+        from langchain_community.document_loaders.blob_loaders import Blob
+
+        return self.from_data(Blob.from_path(file_path))
+
+    def chunk(self, content: Iterator[Document]):
+        """Split ``content`` into chunks with RecursiveCharacterTextSplitter.
+
+        Chunk size and overlap are read from the ``CHUNK_SIZE`` and
+        ``CHUNK_OVERLAP`` environment variables and converted to ``int``.
+
+        Raises:
+            KeyError: if either environment variable is not set.
+            ValueError: if either value is not a valid integer.
+        """
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=int(os.environ["CHUNK_SIZE"]),
+            chunk_overlap=int(os.environ["CHUNK_OVERLAP"]),
+        )
+        return splitter.split_documents(content)