summary | refs | log | tree | commit | diff
path: root/rag
diff options
context:
space:
mode:
author Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-05 00:42:36 +0200
committer Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-05 00:42:36 +0200
commit 064cd26f4ffdb28353315a6ecc28bf47cf575071 (patch)
tree 55f52427f8b753e113f13402a26781bb00d366e7 /rag
parent 0a6b7793e982d4c59f3c6ffb947dfe28ca709cdc (diff)
Format
Diffstat (limited to 'rag')
-rw-r--r-- rag/db/documents.py 3
-rw-r--r-- rag/parser/pdf.py 10
2 files changed, 6 insertions, 7 deletions
diff --git a/rag/db/documents.py b/rag/db/documents.py
index bdbf5a4..3c5489d 100644
--- a/rag/db/documents.py
+++ b/rag/db/documents.py
@@ -1,6 +1,7 @@
+import hashlib
import os
from typing import List
-import hashlib
+
import psycopg
from langchain_core.documents.base import Document
diff --git a/rag/parser/pdf.py b/rag/parser/pdf.py
index cb63fc4..a5e4665 100644
--- a/rag/parser/pdf.py
+++ b/rag/parser/pdf.py
@@ -1,16 +1,14 @@
+import os
from pathlib import Path
-from langchain_community.document_loaders import PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-CHUNK_SIZE = 1024
-CHUNK_OVERLAP = 256
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader
def parser(filepath: Path):
content = PyPDFLoader(filepath).load()
- print(content)
splitter = RecursiveCharacterTextSplitter(
- chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
+ chunk_size=os.environ["CHUNK_SIZE"], chunk_overlap=os.environ["CHUNK_OVERLAP"]
)
chunks = splitter.split_documents(content)
return chunks