summary | refs | log | tree | commit | diff
path: root/rag/db
diff options
context:
space:
mode:
author Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-06 13:15:07 +0200
committer Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-06 13:15:07 +0200
commit 052bf63a2c18b1b55013dcf6974228609cc4d76f (patch)
tree 1846b4c5555ca504bfb638f72bee14249f502577 /rag/db
parent d116abc63e350b092c2a7f9e1bb9b54298e21b2d (diff)
Refactor pdf reader
Diffstat (limited to 'rag/db')
-rw-r--r-- rag/db/document.py 11
1 files changed, 4 insertions, 7 deletions
diff --git a/rag/db/document.py b/rag/db/document.py
index 763eb11..b657e55 100644
--- a/rag/db/document.py
+++ b/rag/db/document.py
@@ -1,9 +1,7 @@
import hashlib
import os
-from typing import List
import psycopg
-from langchain_core.documents.base import Document
from loguru import logger as log
TABLES = """
@@ -28,14 +26,13 @@ class DocumentDB:
cur.execute(TABLES)
self.conn.commit()
- def __hash(self, chunks: List[Document]) -> str:
+ def __hash(self, blob: bytes) -> str:
log.debug("Hashing document...")
- document = str.encode("".join([chunk.page_content for chunk in chunks]))
- return hashlib.sha256(document).hexdigest()
+ return hashlib.sha256(blob).hexdigest()
- def add(self, chunks: List[Document]) -> bool:
+ def add(self, blob: bytes) -> bool:
with self.conn.cursor() as cur:
- hash = self.__hash(chunks)
+ hash = self.__hash(blob)
cur.execute(
"""
SELECT * FROM document