From 064cd26f4ffdb28353315a6ecc28bf47cf575071 Mon Sep 17 00:00:00 2001 From: Gustaf Rydholm Date: Fri, 5 Apr 2024 00:42:36 +0200 Subject: Format --- rag/db/documents.py | 3 ++- rag/parser/pdf.py | 10 ++++------ 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'rag') diff --git a/rag/db/documents.py b/rag/db/documents.py index bdbf5a4..3c5489d 100644 --- a/rag/db/documents.py +++ b/rag/db/documents.py @@ -1,6 +1,7 @@ +import hashlib import os from typing import List -import hashlib + import psycopg from langchain_core.documents.base import Document diff --git a/rag/parser/pdf.py b/rag/parser/pdf.py index cb63fc4..a5e4665 100644 --- a/rag/parser/pdf.py +++ b/rag/parser/pdf.py @@ -1,16 +1,14 @@ +import os from pathlib import Path -from langchain_community.document_loaders import PyPDFLoader -from langchain.text_splitter import RecursiveCharacterTextSplitter -CHUNK_SIZE = 1024 -CHUNK_OVERLAP = 256 +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.document_loaders import PyPDFLoader def parser(filepath: Path): content = PyPDFLoader(filepath).load() - print(content) splitter = RecursiveCharacterTextSplitter( - chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP + chunk_size=os.environ["CHUNK_SIZE"], chunk_overlap=os.environ["CHUNK_OVERLAP"] ) chunks = splitter.split_documents(content) return chunks -- cgit v1.2.3-70-g09d2