blob: 1680a475eea51757d8ca56c47c6387466deecfc5 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
import os
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
def parser(filepath: Path) -> list:
    """Load a PDF file and split its contents into text chunks.

    The chunking parameters are read from the environment on every call:
    ``CHUNK_SIZE`` and ``CHUNK_OVERLAP`` must both be set to integer strings.

    Args:
        filepath: Location of the PDF file to load.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``
        for the documents that ``PyPDFLoader`` loaded from ``filepath``.

    Raises:
        KeyError: If ``CHUNK_SIZE`` or ``CHUNK_OVERLAP`` is not set.
        ValueError: If either variable is not a valid integer.
    """
    # Hand the loader a plain string: some langchain releases perform string
    # operations on the path and reject ``pathlib.Path`` objects.
    content = PyPDFLoader(str(filepath)).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=int(os.environ["CHUNK_SIZE"]),
        chunk_overlap=int(os.environ["CHUNK_OVERLAP"]),
    )
    return splitter.split_documents(content)
# TODO: add parser for bytearray
|