summaryrefslogtreecommitdiff
path: root/rag/parser/pdf.py
blob: 1680a475eea51757d8ca56c47c6387466deecfc5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import os
from pathlib import Path

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader


def parser(filepath: Path):
    """Load a PDF file and split its contents into text chunks.

    The chunking parameters are read from the environment on every call:

    * ``CHUNK_SIZE`` -- passed to the splitter as ``chunk_size``.
    * ``CHUNK_OVERLAP`` -- passed to the splitter as ``chunk_overlap``.

    Args:
        filepath: Location of the PDF to load. A ``pathlib.Path`` or a plain
            string path is accepted.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``
        applied to the documents produced by ``PyPDFLoader.load()``.

    Raises:
        KeyError: If ``CHUNK_SIZE`` or ``CHUNK_OVERLAP`` is not set.
        ValueError: If either variable is not a valid integer literal.
    """
    # Hand the loader a plain string path rather than a Path object.
    # NOTE(review): some langchain_community releases treat ``file_path`` as
    # a ``str``; os.fspath() keeps the call working for both str and Path.
    documents = PyPDFLoader(os.fspath(filepath)).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=int(os.environ["CHUNK_SIZE"]),
        chunk_overlap=int(os.environ["CHUNK_OVERLAP"]),
    )
    return splitter.split_documents(documents)


# TODO: add parser for bytearray