summary | refs | log | tree | commit | diff
path: root/rag/parser
diff options
context:
space:
mode:
author: Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-06 13:15:07 +0200
committer: Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-06 13:15:07 +0200
commit: 052bf63a2c18b1b55013dcf6974228609cc4d76f (patch)
tree: 1846b4c5555ca504bfb638f72bee14249f502577 /rag/parser
parent: d116abc63e350b092c2a7f9e1bb9b54298e21b2d (diff)
Refactor pdf reader
Diffstat (limited to 'rag/parser')
-rw-r--r-- rag/parser/pdf.py 34
1 file changed, 25 insertions, 9 deletions
diff --git a/rag/parser/pdf.py b/rag/parser/pdf.py
index 1680a47..22fc4e0 100644
--- a/rag/parser/pdf.py
+++ b/rag/parser/pdf.py
@@ -1,18 +1,34 @@
import os
from pathlib import Path
+from typing import Iterator, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
+from langchain_core.documents import Document
+from langchain_community.document_loaders.parsers.pdf import (
+ PyPDFParser,
+)
+from rag.db.document import DocumentDB
-def parser(filepath: Path):
- content = PyPDFLoader(filepath).load()
- splitter = RecursiveCharacterTextSplitter(
- chunk_size=int(os.environ["CHUNK_SIZE"]),
- chunk_overlap=int(os.environ["CHUNK_OVERLAP"]),
- )
- chunks = splitter.split_documents(content)
- return chunks
+class PDF:
+ def __init__(self) -> None:
+ self.db = DocumentDB()
+ self.parser = PyPDFParser(password=None, extract_images=False)
+ def from_data(self, blob) -> Optional[Iterator[Document]]:
+ if self.db.add(blob):
+ yield from self.parser.parse(blob)
+ yield None
-# TODO: add parser for bytearray
+ def from_path(self, file_path: Path) -> Optional[Iterator[Document]]:
+ blob = Blob.from_path(file_path)
+ from_data(blob)
+
+ def chunk(self, content: Iterator[Document]):
+ splitter = RecursiveCharacterTextSplitter(
+ chunk_size=int(os.environ["CHUNK_SIZE"]),
+ chunk_overlap=int(os.environ["CHUNK_OVERLAP"]),
+ )
+ chunks = splitter.split_documents(content)
+ return chunks