summaryrefslogtreecommitdiff
path: root/rag/retriever/parser/pdf.py
blob: 410f027afc88e93b069ae8ccf7fd0a1258df3b0b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import os
from pathlib import Path
from typing import List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders.parsers.pdf import (
    PyPDFParser,
)
from langchain_community.document_loaders.blob_loaders import Blob


class PDFParser:
    """Load, parse and chunk PDF content using LangChain's ``PyPDFParser``."""

    def __init__(self) -> None:
        """Create the underlying parser with no password and image extraction off."""
        self.parser = PyPDFParser(extract_images=False, password=None)

    def from_data(self, blob: Blob) -> List[Document]:
        """Run the underlying parser on ``blob`` and return what it produces.

        Args:
            blob: The blob handed to ``PyPDFParser.parse``.

        Returns:
            The result of ``self.parser.parse(blob)``.
        """
        parsed = self.parser.parse(blob)
        return parsed

    def from_path(self, path: Path) -> Blob:
        """Build a ``Blob`` for ``path`` via ``Blob.from_path``.

        Args:
            path: Location passed through to ``Blob.from_path``.

        Returns:
            The blob created for ``path``.
        """
        pdf_blob = Blob.from_path(path)
        return pdf_blob

    def chunk(
        self, document: List[Document], source: Optional[str] = None
    ) -> List[Document]:
        """Split documents into chunks, optionally tagging each with a source.

        The splitter is configured from the ``CHUNK_SIZE`` and
        ``CHUNK_OVERLAP`` environment variables, which are read on every call.

        Args:
            document: Documents passed to the splitter's ``split_documents``.
            source: When not ``None``, written to ``metadata["source"]`` of
                every resulting chunk, replacing any value already there.

        Returns:
            The chunks produced by the splitter.

        Raises:
            KeyError: If ``CHUNK_SIZE`` or ``CHUNK_OVERLAP`` is not set.
            ValueError: If either variable cannot be converted with ``int``.
        """
        size = int(os.environ["CHUNK_SIZE"])
        overlap = int(os.environ["CHUNK_OVERLAP"])
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=size,
            chunk_overlap=overlap,
        )
        pieces = text_splitter.split_documents(document)

        # Nothing to tag: hand the chunks back untouched.
        if source is None:
            return pieces

        for piece in pieces:
            piece.metadata["source"] = source
        return pieces