diff options
author | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2024-04-06 01:21:52 +0200 |
---|---|---|
committer | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2024-04-06 01:21:52 +0200 |
commit | 13ac875b2269756045834d7a64e7b35acb9ce0b4 (patch) | |
tree | ab05dc7ba966de66e15cc8249ec2d772a2a4d34d /rag/db/document.py | |
parent | 59c77c93c39755526e3d7649660780584b1c090d (diff) |
Rename dbs
Diffstat (limited to 'rag/db/document.py')
-rw-r--r-- | rag/db/document.py | 59 |
1 files changed, 59 insertions, 0 deletions
"""Registry of ingested document hashes stored in PostgreSQL."""

import hashlib
import os
from typing import List

import psycopg
from langchain_core.documents.base import Document
from loguru import logger as log

TABLES = """
CREATE TABLE IF NOT EXISTS document (
    hash text PRIMARY KEY)
"""


class DocumentDB:
    """Track which documents have been seen, keyed by a SHA-256 content hash.

    The connection is opened from the ``RAG_DB_NAME`` and ``RAG_DB_USER``
    environment variables, and the ``document`` table is created on first use.
    Instances can be used as context managers; the connection is closed on
    exit.
    """

    def __init__(self) -> None:
        """Connect to the database and make sure the schema exists.

        Raises:
            KeyError: If ``RAG_DB_NAME`` or ``RAG_DB_USER`` is not set.
        """
        self.conn = psycopg.connect(
            f"dbname={os.environ['RAG_DB_NAME']} user={os.environ['RAG_DB_USER']}"
        )
        try:
            self.__configure()
        except Exception:
            # Do not leak the connection when schema creation fails.
            self.conn.close()
            raise

    def __enter__(self) -> "DocumentDB":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def close(self):
        """Close the underlying database connection."""
        self.conn.close()

    def __configure(self):
        """Create the ``document`` table if it does not exist yet."""
        log.debug("Creating documents table if it does not exist...")
        with self.conn.cursor() as cur:
            cur.execute(TABLES)
            self.conn.commit()

    def __hash(self, chunks: List[Document]) -> str:
        """Return the SHA-256 hex digest of the concatenated chunk contents."""
        log.debug("Generating sha256 hash for pdf document")
        digest = hashlib.sha256()
        # Feeding the hasher chunk by chunk yields the same digest as hashing
        # the joined text, without building a second full copy in memory.
        for chunk in chunks:
            digest.update(chunk.page_content.encode())
        return digest.hexdigest()

    def add_document(self, chunks: List[Document]) -> bool:
        """Record the document made of ``chunks`` unless it is already known.

        Args:
            chunks: The document's chunks; their ``page_content`` is hashed
                in order.

        Returns:
            ``True`` if the document hash was already stored, ``False`` if it
            was inserted by this call.
        """
        log.debug("Inserting document hash into documents db...")
        doc_hash = self.__hash(chunks)
        with self.conn.cursor() as cur:
            # A single atomic statement replaces SELECT-then-INSERT, so two
            # concurrent writers cannot both pass the existence check and
            # then collide on the primary key.
            cur.execute(
                """
                INSERT INTO document
                    (hash) VALUES
                    (%s)
                ON CONFLICT (hash) DO NOTHING
                """,
                (doc_hash,),
            )
            # No affected row means the hash was already present.
            already_present = cur.rowcount == 0
        # Always commit so the connection is never left idle in a transaction.
        self.conn.commit()
        return already_present