summary | refs | log | tree | commit | diff
path: root/rag/parser
diff options
context:
space:
mode:
author: Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-09 00:14:00 +0200
committer: Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-09 00:14:00 +0200
commit: 91ddb3672e514fa9824609ff047d7cab0c65631a (patch)
tree: 009fd82618588d2960b5207128e86875f73cccdc /rag/parser
parent: d487ef8b04cc7f5ac1491f0638f902fe2abe5ac5 (diff)
Refactor
Diffstat (limited to 'rag/parser')
-rw-r--r-- rag/parser/__init__.py 0
-rw-r--r-- rag/parser/pdf.py 34
2 files changed, 0 insertions, 34 deletions
diff --git a/rag/parser/__init__.py b/rag/parser/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/rag/parser/__init__.py
+++ /dev/null
diff --git a/rag/parser/pdf.py b/rag/parser/pdf.py
deleted file mode 100644
index ca9b72d..0000000
--- a/rag/parser/pdf.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import os
-from pathlib import Path
-from typing import Iterator, List, Optional
-
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_core.documents import Document
-from langchain_community.document_loaders.parsers.pdf import (
- PyPDFParser,
-)
-from langchain_community.document_loaders.blob_loaders import Blob
-
-
-class PDFParser:
- def __init__(self) -> None:
- self.parser = PyPDFParser(password=None, extract_images=False)
-
- def from_data(self, blob: Blob) -> Iterator[Document]:
- return self.parser.parse(blob)
-
- def from_path(self, path: Path) -> Iterator[Document]:
- return Blob.from_path(path)
-
- def chunk(
- self, document: Iterator[Document], source: Optional[str] = None
- ) -> List[Document]:
- splitter = RecursiveCharacterTextSplitter(
- chunk_size=int(os.environ["CHUNK_SIZE"]),
- chunk_overlap=int(os.environ["CHUNK_OVERLAP"]),
- )
- chunks = splitter.split_documents(document)
- if source is not None:
- for c in chunks:
- c.metadata["source"] = source
- return chunks