diff options
Diffstat (limited to 'notebooks')
-rw-r--r-- | notebooks/testing.ipynb | 179 |
1 files changed, 77 insertions, 102 deletions
diff --git a/notebooks/testing.ipynb b/notebooks/testing.ipynb index 36f3c16..de5e76e 100644 --- a/notebooks/testing.ipynb +++ b/notebooks/testing.ipynb @@ -2,18 +2,30 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "c1f56ae3-a056-4b31-bcab-27c2c97c00f1", "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", - "from rag.parser.pdf import parser" + "from dotenv import load_dotenv\n", + "import numpy as np\n", + "\n", + "load_dotenv()\n", + "\n", + "from importlib.util import find_spec\n", + "if find_spec(\"rag\") is None:\n", + " import sys\n", + " sys.path.append('..')\n", + "from rag.parser.pdf import parser\n", + "from rag.db.embeddings import Embeddings\n", + "from rag.llm.encoder import Encoder\n", + "from rag.db.documents import Documents" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "6b5cb12e-df7e-4532-b78b-216e11ed6161", "metadata": {}, "outputs": [], @@ -23,17 +35,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "8e378ed2-e346-4360-8a8e-265f0f1d173d", - "metadata": {}, - "outputs": [], - "source": [ - "path" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "f1a57be8-21a2-48d3-b99f-d1bbf7b8780a", "metadata": {}, "outputs": [], @@ -43,48 +45,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "b848627e-1e30-492a-ada6-a899112cd515", - "metadata": {}, - "outputs": [], - "source": [ - "chunks[32]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f01d39c-3310-4d6d-8b7b-f253975b5e37", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "type(chunks[32])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "49bf6768-1350-42ee-8526-2e6bee2dd7bb", - "metadata": {}, - "outputs": [], - "source": [ - "from rag.db.vector import VectorStore" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98516091-7bc5-43c2-9aa7-967e63b795f1", - "metadata": {}, - "outputs": [], - "source": [ - "from rag.llms.encoder import Encoder" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "14408e20-3dec-40b4-9dda-3397beb0c453", "metadata": {}, "outputs": [], @@ -94,47 +55,58 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "e94993c5-6b19-4bac-b7e8-c26a5200a626", "metadata": {}, "outputs": [], "source": [ - "embs = encoder.encode([chunks[33].page_content])" + "embs = encoder.encode_document(chunks)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "b845bb31-0909-42cb-9957-9a8b3bb0b5c4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(1024,)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "embs[0].shape" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "aa279b1a-465e-4820-ab56-b25fc513c0a1", "metadata": {}, "outputs": [], "source": [ - "vec_db = VectorStore()" + "emb_db = Embeddings()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "1dc655de-2359-42ce-b705-76ec06c5f72f", "metadata": {}, "outputs": [], "source": [ - "vec_db.add(embs[0])" + "emb_db.add(embs)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "117d3416-e79f-436f-a33e-ffb45b972b72", "metadata": {}, "outputs": [], @@ -144,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "3a6ef474-678c-4525-8dcb-ece67aa9c7ea", "metadata": {}, "outputs": [], @@ -154,70 +126,73 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "4c8a16ba-6025-4a6e-95c2-bbba7a9a5de5", "metadata": {}, - "outputs": [], - "source": [ - "s, i = vec_db.search(qe, 1)\n", + "outputs": [ + { + "data": { + "text/plain": [ + "(array([[122.61588, 127.4572 , 128.07301, 128.97739, 131.64783]],\n", + " dtype=float32),\n", + " array([[149, 47, 224, 255, 254]]))" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s, i = emb_db.search(qe, 5)\n", "s,i" ] }, { "cell_type": "code", - "execution_count": null, - "id": "2d5f4601-82b0-47b7-8c79-938435591b59", - "metadata": {}, - "outputs": [], - "source": [ - "embs[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e7a65b4d-7269-44bf-9b29-9ef0ee2b2052", - "metadata": {}, - "outputs": [], - "source": [ - "qe" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "016bae35-e22b-4e13-9c1d-6fcd09a8ca9f", + "execution_count": 12, + "id": "1c90dd20-c640-48b5-88c0-4ba93b60c5e6", "metadata": {}, "outputs": [], "source": [ - "import hashlib" + "docs_db = Documents()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "66f7fd80-a970-4ad9-b5c9-02ab2e39855b", + "execution_count": 14, + "id": "ed69d8bf-93f1-4353-a4c2-c4aacbe25420", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "document = [chunk.page_content for chunk in chunks]\n", - "text = str.encode(\"\".join(document))" + "docs_db.add_document(chunks)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "9d385588-cdd5-4f8c-beae-31685a9f4c63", + "execution_count": 16, + "id": "40ebc825-2e2c-4110-93ff-ae6ec3dc1322", "metadata": {}, "outputs": [], "source": [ - "type(hashlib.sha256(text).hexdigest())" + "docs_db.add_chunk(chunks[100].page_content)" ] }, { "cell_type": "code", "execution_count": null, - "id": "1c90dd20-c640-48b5-88c0-4ba93b60c5e6", + "id": "4352b32d-c2e7-4fbf-aa05-fc46baf7c9f8", "metadata": {}, "outputs": [], "source": [] |