From 91ddb3672e514fa9824609ff047d7cab0c65631a Mon Sep 17 00:00:00 2001 From: Gustaf Rydholm Date: Tue, 9 Apr 2024 00:14:00 +0200 Subject: Refactor --- rag/parser/__init__.py | 0 rag/parser/pdf.py | 34 ---------------------------------- 2 files changed, 34 deletions(-) delete mode 100644 rag/parser/__init__.py delete mode 100644 rag/parser/pdf.py (limited to 'rag/parser') diff --git a/rag/parser/__init__.py b/rag/parser/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/rag/parser/pdf.py b/rag/parser/pdf.py deleted file mode 100644 index ca9b72d..0000000 --- a/rag/parser/pdf.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -from pathlib import Path -from typing import Iterator, List, Optional - -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_core.documents import Document -from langchain_community.document_loaders.parsers.pdf import ( - PyPDFParser, -) -from langchain_community.document_loaders.blob_loaders import Blob - - -class PDFParser: - def __init__(self) -> None: - self.parser = PyPDFParser(password=None, extract_images=False) - - def from_data(self, blob: Blob) -> Iterator[Document]: - return self.parser.parse(blob) - - def from_path(self, path: Path) -> Iterator[Document]: - return Blob.from_path(path) - - def chunk( - self, document: Iterator[Document], source: Optional[str] = None - ) -> List[Document]: - splitter = RecursiveCharacterTextSplitter( - chunk_size=int(os.environ["CHUNK_SIZE"]), - chunk_overlap=int(os.environ["CHUNK_OVERLAP"]), - ) - chunks = splitter.split_documents(document) - if source is not None: - for c in chunks: - c.metadata["source"] = source - return chunks -- cgit v1.2.3-70-g09d2