summary | refs | log | tree | commit | diff
path: root/rag/retriever/parser/pdf.py
diff options
context:
space:
mode:
author: Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-09 00:14:00 +0200
committer: Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-09 00:14:00 +0200
commit: 91ddb3672e514fa9824609ff047d7cab0c65631a (patch)
tree: 009fd82618588d2960b5207128e86875f73cccdc /rag/retriever/parser/pdf.py
parent: d487ef8b04cc7f5ac1491f0638f902fe2abe5ac5 (diff)
Refactor
Diffstat (limited to 'rag/retriever/parser/pdf.py')
-rw-r--r-- rag/retriever/parser/pdf.py 34
1 files changed, 34 insertions, 0 deletions
diff --git a/rag/retriever/parser/pdf.py b/rag/retriever/parser/pdf.py
new file mode 100644
index 0000000..410f027
--- /dev/null
+++ b/rag/retriever/parser/pdf.py
@@ -0,0 +1,34 @@
+import os
+from pathlib import Path
+from typing import List, Optional
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_core.documents import Document
+from langchain_community.document_loaders.parsers.pdf import (
+ PyPDFParser,
+)
+from langchain_community.document_loaders.blob_loaders import Blob
+
+
+class PDFParser:
+    """Parse PDF blobs into ``Document`` objects and split them into chunks.
+
+    Wraps a ``PyPDFParser`` created with no password and with image
+    extraction switched off.
+    """
+
+    def __init__(self) -> None:
+        """Create the wrapped ``PyPDFParser``."""
+        self.parser = PyPDFParser(extract_images=False, password=None)
+
+    def from_data(self, blob: Blob) -> List[Document]:
+        """Parse ``blob`` with the wrapped parser and return its documents."""
+        documents = self.parser.parse(blob)
+        return documents
+
+    def from_path(self, path: Path) -> Blob:
+        """Build a ``Blob`` for the file at ``path`` via ``Blob.from_path``."""
+        blob = Blob.from_path(path)
+        return blob
+
+    def chunk(
+        self, document: List[Document], source: Optional[str] = None
+    ) -> List[Document]:
+        """Split ``document`` into chunks and optionally tag their source.
+
+        Chunk size and overlap are read from the ``CHUNK_SIZE`` and
+        ``CHUNK_OVERLAP`` environment variables and converted with ``int``.
+
+        Args:
+            document: Documents to split.
+            source: When not ``None``, stored under the ``"source"`` key of
+                every returned chunk's metadata.
+
+        Returns:
+            The chunks produced by ``RecursiveCharacterTextSplitter``.
+
+        Raises:
+            KeyError: If either environment variable is not set.
+            ValueError: If either value cannot be converted to ``int``.
+        """
+        # Read size before overlap so a missing variable fails in the same
+        # order as the keyword arguments are listed.
+        size = int(os.environ["CHUNK_SIZE"])
+        overlap = int(os.environ["CHUNK_OVERLAP"])
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=size,
+            chunk_overlap=overlap,
+        )
+        pieces = text_splitter.split_documents(document)
+        if source is None:
+            return pieces
+        for piece in pieces:
+            piece.metadata["source"] = source
+        return pieces