diff options
Diffstat (limited to 'rag/parser')
-rw-r--r-- | rag/parser/pdf.py | 10 |
1 files changed, 4 insertions, 6 deletions
diff --git a/rag/parser/pdf.py b/rag/parser/pdf.py index cb63fc4..a5e4665 100644 --- a/rag/parser/pdf.py +++ b/rag/parser/pdf.py @@ -1,16 +1,14 @@ +import os from pathlib import Path -from langchain_community.document_loaders import PyPDFLoader -from langchain.text_splitter import RecursiveCharacterTextSplitter -CHUNK_SIZE = 1024 -CHUNK_OVERLAP = 256 +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.document_loaders import PyPDFLoader def parser(filepath: Path): content = PyPDFLoader(filepath).load() - print(content) splitter = RecursiveCharacterTextSplitter( - chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP + chunk_size=os.environ["CHUNK_SIZE"], chunk_overlap=os.environ["CHUNK_OVERLAP"] ) chunks = splitter.split_documents(content) return chunks |