blob: cb63fc4bbf41f347c440b275d3a16cbe7c5e90bd (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Upper bound on the size of each chunk; passed as ``chunk_size`` to
# RecursiveCharacterTextSplitter in ``parser``.
# NOTE(review): presumably measured in characters (the splitter's default
# length function) -- confirm.
CHUNK_SIZE = 1024
# Passed as ``chunk_overlap`` to the same splitter, i.e. how much content
# consecutive chunks are allowed to share (same unit as CHUNK_SIZE).
CHUNK_OVERLAP = 256
def parser(filepath: Path) -> list:
    """Load a PDF file and split its contents into overlapping chunks.

    The file is read with ``PyPDFLoader`` and the loaded documents are then
    split by a ``RecursiveCharacterTextSplitter`` configured with the
    module-level ``CHUNK_SIZE`` and ``CHUNK_OVERLAP``.

    Args:
        filepath: Location of the PDF file to read. A plain string path is
            accepted as well, since the value is converted with ``str()``.

    Returns:
        The list of chunk documents returned by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Hand the loader a plain string: not every PyPDFLoader release accepts
    # ``pathlib.Path`` objects, while ``str`` paths work everywhere.
    pages = PyPDFLoader(str(filepath)).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    return splitter.split_documents(pages)
# TODO: add parser for bytearray
|