blob: a5e46659fd288e12c0579d69eb3b9e140f81fbfb (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
import os
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
def parser(filepath: Path) -> list:
    """Load a PDF and split its contents into chunks.

    The file is read with ``PyPDFLoader`` and the loaded documents are split
    with ``RecursiveCharacterTextSplitter``. Splitter settings come from the
    ``CHUNK_SIZE`` and ``CHUNK_OVERLAP`` environment variables.

    Args:
        filepath: Location of the PDF file to parse.

    Returns:
        The chunks produced by ``splitter.split_documents``.

    Raises:
        KeyError: If ``CHUNK_SIZE`` or ``CHUNK_OVERLAP`` is not set.
        ValueError: If either variable is not a valid integer.
    """
    # Environment values are always strings; the splitter does arithmetic and
    # comparisons on these settings, so convert them to integers up front.
    chunk_size = int(os.environ["CHUNK_SIZE"])
    chunk_overlap = int(os.environ["CHUNK_OVERLAP"])

    # Hand the loader a plain string path.
    # NOTE(review): some loader versions may only accept ``str`` -- confirm.
    content = PyPDFLoader(str(filepath)).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(content)
# TODO: add parser for bytearray
|