diff options
author | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2024-04-05 00:42:36 +0200 |
---|---|---|
committer | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2024-04-05 00:42:36 +0200 |
commit | 064cd26f4ffdb28353315a6ecc28bf47cf575071 (patch) | |
tree | 55f52427f8b753e113f13402a26781bb00d366e7 | |
parent | 0a6b7793e982d4c59f3c6ffb947dfe28ca709cdc (diff) |
Format
-rw-r--r-- | rag/db/documents.py | 3 | ||||
-rw-r--r-- | rag/parser/pdf.py | 10 |
2 files changed, 6 insertions, 7 deletions
```diff
diff --git a/rag/db/documents.py b/rag/db/documents.py
index bdbf5a4..3c5489d 100644
--- a/rag/db/documents.py
+++ b/rag/db/documents.py
@@ -1,6 +1,7 @@
+import hashlib
 import os
 from typing import List
-import hashlib
+
 import psycopg
 from langchain_core.documents.base import Document
 
diff --git a/rag/parser/pdf.py b/rag/parser/pdf.py
index cb63fc4..a5e4665 100644
--- a/rag/parser/pdf.py
+++ b/rag/parser/pdf.py
@@ -1,16 +1,14 @@
+import os
 from pathlib import Path
-from langchain_community.document_loaders import PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 
-CHUNK_SIZE = 1024
-CHUNK_OVERLAP = 256
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader
 
 
 def parser(filepath: Path):
     content = PyPDFLoader(filepath).load()
-    print(content)
     splitter = RecursiveCharacterTextSplitter(
-        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
+        chunk_size=os.environ["CHUNK_SIZE"], chunk_overlap=os.environ["CHUNK_OVERLAP"]
     )
     chunks = splitter.split_documents(content)
     return chunks
```