# Reconstructed from commit 91ddb3672e514fa9824609ff047d7cab0c65631a
# ("Refactor", Gustaf Rydholm, 2024-04-09), which created
# rag/retriever/document.py.
"""Registry of already-seen documents, keyed by the SHA-256 of their content."""

import hashlib
import os

from langchain_community.document_loaders.blob_loaders import Blob
import psycopg
from loguru import logger as log

TABLES = """
CREATE TABLE IF NOT EXISTS document (
    hash text PRIMARY KEY)
"""


class DocumentDB:
    """Store document content hashes in the ``document`` table.

    The connection is opened from the ``DOCUMENT_DB_NAME`` and
    ``DOCUMENT_DB_USER`` environment variables, and the table is created on
    construction if it does not exist yet.

    Instances can be used as context managers; the connection is closed on
    exit.
    """

    def __init__(self) -> None:
        """Connect to the database and make sure the schema exists.

        Raises:
            KeyError: if either environment variable is missing.
        """
        # Keyword arguments let psycopg assemble and escape the connection
        # string, so values containing spaces or quotes cannot corrupt it.
        self.conn = psycopg.connect(
            dbname=os.environ["DOCUMENT_DB_NAME"],
            user=os.environ["DOCUMENT_DB_USER"],
        )
        try:
            self.__configure()
        except Exception:
            # Do not leak the connection when schema setup fails.
            self.conn.close()
            raise

    def __enter__(self) -> "DocumentDB":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying database connection."""
        self.conn.close()

    def __configure(self) -> None:
        """Create the ``document`` table if it does not exist, then commit."""
        log.debug("Creating documents table if it does not exist...")
        with self.conn.cursor() as cur:
            cur.execute(TABLES)
            self.conn.commit()

    def __hash(self, blob: Blob) -> str:
        """Return the hex-encoded SHA-256 digest of the blob's bytes."""
        log.debug("Hashing document...")
        return hashlib.sha256(blob.as_bytes()).hexdigest()

    def add(self, blob: Blob) -> bool:
        """Record the blob's hash; report whether it was new.

        Returns:
            True if the hash was inserted by this call, False if it was
            already present in the table.
        """
        digest = self.__hash(blob)
        log.debug("Inserting document hash into documents db...")
        try:
            with self.conn.cursor() as cur:
                # A single INSERT ... ON CONFLICT is atomic: unlike a SELECT
                # followed by an INSERT, two concurrent writers cannot both
                # decide the hash is missing. RETURNING yields a row only
                # when the insert actually happened.
                cur.execute(
                    """
                    INSERT INTO document (hash)
                    VALUES (%s)
                    ON CONFLICT (hash) DO NOTHING
                    RETURNING hash
                    """,
                    (digest,),
                )
                inserted = cur.fetchone() is not None
            # Commit on every successful call so no transaction is left open.
            self.conn.commit()
        except psycopg.Error:
            # Leave the connection usable for the next call.
            self.conn.rollback()
            raise
        return inserted