summary | refs | log | tree | commit | diff
path: root/rag/db/document.py
diff options
context:
space:
mode:
author Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-06 01:21:52 +0200
committer Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-06 01:21:52 +0200
commit 13ac875b2269756045834d7a64e7b35acb9ce0b4 (patch)
tree ab05dc7ba966de66e15cc8249ec2d772a2a4d34d /rag/db/document.py
parent 59c77c93c39755526e3d7649660780584b1c090d (diff)
Rename dbs
Diffstat (limited to 'rag/db/document.py')
-rw-r--r-- rag/db/document.py 59
1 files changed, 59 insertions, 0 deletions
diff --git a/rag/db/document.py b/rag/db/document.py
new file mode 100644
index 0000000..8e4d208
--- /dev/null
+++ b/rag/db/document.py
@@ -0,0 +1,59 @@
+import hashlib
+import os
+from typing import List
+
+import psycopg
+from langchain_core.documents.base import Document
+from loguru import logger as log
+
+# DDL executed by DocumentDB when it is constructed: a single-column table
+# whose primary key is the document hash, so each hash is stored at most once.
+TABLES = """
+CREATE TABLE IF NOT EXISTS document (
+ hash text PRIMARY KEY)
+"""
+
+
+class DocumentDB:
+    """Registry of already-seen documents, keyed by a sha256 content hash.
+
+    Holds one psycopg connection to the PostgreSQL database selected by the
+    ``RAG_DB_NAME`` and ``RAG_DB_USER`` environment variables and stores one
+    hex digest per document in the ``document`` table.
+
+    The instance can be used as a context manager; the connection is closed
+    when the ``with`` block exits.
+    """
+
+    def __init__(self) -> None:
+        """Open the connection and create the table if it is missing.
+
+        Raises:
+            KeyError: if ``RAG_DB_NAME`` or ``RAG_DB_USER`` is not set.
+        """
+        # Keyword arguments instead of an interpolated conninfo string, so
+        # values containing spaces or quotes cannot corrupt the parameters.
+        self.conn = psycopg.connect(
+            dbname=os.environ["RAG_DB_NAME"],
+            user=os.environ["RAG_DB_USER"],
+        )
+        try:
+            self.__configure()
+        except Exception:
+            # Do not leak the connection when table creation fails.
+            self.conn.close()
+            raise
+
+    def __enter__(self) -> "DocumentDB":
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        self.close()
+
+    def close(self) -> None:
+        """Close the underlying database connection."""
+        self.conn.close()
+
+    def __configure(self) -> None:
+        """Run the ``TABLES`` DDL and commit it."""
+        log.debug("Creating documents table if it does not exist...")
+        with self.conn.cursor() as cur:
+            cur.execute(TABLES)
+        self.conn.commit()
+
+    def __hash(self, chunks: List[Document]) -> str:
+        """Return the sha256 hex digest of all chunk texts joined in order.
+
+        The ``page_content`` values are concatenated without a separator,
+        so the digest depends on the combined text only, not on where the
+        chunk boundaries fall.
+        """
+        log.debug("Generating sha256 hash for pdf document")
+        text = "".join(chunk.page_content for chunk in chunks)
+        return hashlib.sha256(text.encode("utf-8")).hexdigest()
+
+    def add_document(self, chunks: List[Document]) -> bool:
+        """Record the document's hash unless it is already stored.
+
+        Args:
+            chunks: the chunks that together make up one document.
+
+        Returns:
+            ``True`` if the hash was already present (nothing was inserted),
+            ``False`` if the hash was newly inserted.
+
+        Raises:
+            psycopg.Error: on a database failure; the transaction is rolled
+                back first so the connection remains usable.
+        """
+        log.debug("Inserting document hash into documents db...")
+        digest = self.__hash(chunks)
+        try:
+            with self.conn.cursor() as cur:
+                # One atomic statement instead of SELECT-then-INSERT: two
+                # concurrent writers can no longer both pass the existence
+                # check and collide on the primary key.
+                cur.execute(
+                    """
+                    INSERT INTO document (hash) VALUES (%s)
+                    ON CONFLICT (hash) DO NOTHING
+                    """,
+                    (digest,),
+                )
+                # rowcount is 1 when the row was inserted, 0 on conflict.
+                inserted = cur.rowcount == 1
+            # Always commit so no transaction is left open on either path.
+            self.conn.commit()
+        except psycopg.Error:
+            self.conn.rollback()
+            raise
+        return not inserted