diff options
author | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2024-04-05 00:42:36 +0200 |
---|---|---|
committer | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2024-04-05 00:42:36 +0200 |
commit | 064cd26f4ffdb28353315a6ecc28bf47cf575071 (patch) | |
tree | 55f52427f8b753e113f13402a26781bb00d366e7 /rag/parser | |
parent | 0a6b7793e982d4c59f3c6ffb947dfe28ca709cdc (diff) |
Format
Diffstat (limited to 'rag/parser')
-rw-r--r-- | rag/parser/pdf.py | 10 |
1 files changed, 4 insertions, 6 deletions
```diff
diff --git a/rag/parser/pdf.py b/rag/parser/pdf.py
index cb63fc4..a5e4665 100644
--- a/rag/parser/pdf.py
+++ b/rag/parser/pdf.py
@@ -1,16 +1,14 @@
+import os
 from pathlib import Path
 
-from langchain_community.document_loaders import PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-CHUNK_SIZE = 1024
-CHUNK_OVERLAP = 256
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader
 
 
 def parser(filepath: Path):
     content = PyPDFLoader(filepath).load()
-    print(content)
     splitter = RecursiveCharacterTextSplitter(
-        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
+        chunk_size=os.environ["CHUNK_SIZE"], chunk_overlap=os.environ["CHUNK_OVERLAP"]
     )
     chunks = splitter.split_documents(content)
     return chunks
```