summaryrefslogtreecommitdiff
path: root/notebooks
diff options
context:
space:
mode:
authorGustaf Rydholm <gustaf.rydholm@gmail.com>2024-04-05 02:04:09 +0200
committerGustaf Rydholm <gustaf.rydholm@gmail.com>2024-04-05 02:04:09 +0200
commit551f08f61f111342ca2b48c5757f20fc9ef74542 (patch)
tree25f9464cc6a3c3a674b15e3e4b707c05b45cd817 /notebooks
parentdce832ce1b332b9d13e679c1333e920a49297da2 (diff)
Update notebook
Diffstat (limited to 'notebooks')
-rw-r--r--notebooks/testing.ipynb179
1 files changed, 77 insertions, 102 deletions
diff --git a/notebooks/testing.ipynb b/notebooks/testing.ipynb
index 36f3c16..de5e76e 100644
--- a/notebooks/testing.ipynb
+++ b/notebooks/testing.ipynb
@@ -2,18 +2,30 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"id": "c1f56ae3-a056-4b31-bcab-27c2c97c00f1",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
- "from rag.parser.pdf import parser"
+ "from dotenv import load_dotenv\n",
+ "import numpy as np\n",
+ "\n",
+ "load_dotenv()\n",
+ "\n",
+ "from importlib.util import find_spec\n",
+ "if find_spec(\"rag\") is None:\n",
+ " import sys\n",
+ " sys.path.append('..')\n",
+ "from rag.parser.pdf import parser\n",
+ "from rag.db.embeddings import Embeddings\n",
+ "from rag.llm.encoder import Encoder\n",
+ "from rag.db.documents import Documents"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"id": "6b5cb12e-df7e-4532-b78b-216e11ed6161",
"metadata": {},
"outputs": [],
@@ -23,17 +35,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "8e378ed2-e346-4360-8a8e-265f0f1d173d",
- "metadata": {},
- "outputs": [],
- "source": [
- "path"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"id": "f1a57be8-21a2-48d3-b99f-d1bbf7b8780a",
"metadata": {},
"outputs": [],
@@ -43,48 +45,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "b848627e-1e30-492a-ada6-a899112cd515",
- "metadata": {},
- "outputs": [],
- "source": [
- "chunks[32]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5f01d39c-3310-4d6d-8b7b-f253975b5e37",
- "metadata": {},
- "outputs": [],
- "source": [
- "\n",
- "type(chunks[32])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "49bf6768-1350-42ee-8526-2e6bee2dd7bb",
- "metadata": {},
- "outputs": [],
- "source": [
- "from rag.db.vector import VectorStore"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "98516091-7bc5-43c2-9aa7-967e63b795f1",
- "metadata": {},
- "outputs": [],
- "source": [
- "from rag.llms.encoder import Encoder"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"id": "14408e20-3dec-40b4-9dda-3397beb0c453",
"metadata": {},
"outputs": [],
@@ -94,47 +55,58 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"id": "e94993c5-6b19-4bac-b7e8-c26a5200a626",
"metadata": {},
"outputs": [],
"source": [
- "embs = encoder.encode([chunks[33].page_content])"
+ "embs = encoder.encode_document(chunks)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"id": "b845bb31-0909-42cb-9957-9a8b3bb0b5c4",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1024,)"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"embs[0].shape"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"id": "aa279b1a-465e-4820-ab56-b25fc513c0a1",
"metadata": {},
"outputs": [],
"source": [
- "vec_db = VectorStore()"
+ "emb_db = Embeddings()"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"id": "1dc655de-2359-42ce-b705-76ec06c5f72f",
"metadata": {},
"outputs": [],
"source": [
- "vec_db.add(embs[0])"
+ "emb_db.add(embs)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "117d3416-e79f-436f-a33e-ffb45b972b72",
"metadata": {},
"outputs": [],
@@ -144,7 +116,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"id": "3a6ef474-678c-4525-8dcb-ece67aa9c7ea",
"metadata": {},
"outputs": [],
@@ -154,70 +126,73 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"id": "4c8a16ba-6025-4a6e-95c2-bbba7a9a5de5",
"metadata": {},
- "outputs": [],
- "source": [
- "s, i = vec_db.search(qe, 1)\n",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([[122.61588, 127.4572 , 128.07301, 128.97739, 131.64783]],\n",
+ " dtype=float32),\n",
+ " array([[149, 47, 224, 255, 254]]))"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "s, i = emb_db.search(qe, 5)\n",
"s,i"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "2d5f4601-82b0-47b7-8c79-938435591b59",
- "metadata": {},
- "outputs": [],
- "source": [
- "embs[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e7a65b4d-7269-44bf-9b29-9ef0ee2b2052",
- "metadata": {},
- "outputs": [],
- "source": [
- "qe"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "016bae35-e22b-4e13-9c1d-6fcd09a8ca9f",
+ "execution_count": 12,
+ "id": "1c90dd20-c640-48b5-88c0-4ba93b60c5e6",
"metadata": {},
"outputs": [],
"source": [
- "import hashlib"
+ "docs_db = Documents()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "66f7fd80-a970-4ad9-b5c9-02ab2e39855b",
+ "execution_count": 14,
+ "id": "ed69d8bf-93f1-4353-a4c2-c4aacbe25420",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "document = [chunk.page_content for chunk in chunks]\n",
- "text = str.encode(\"\".join(document))"
+ "docs_db.add_document(chunks)"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "9d385588-cdd5-4f8c-beae-31685a9f4c63",
+ "execution_count": 16,
+ "id": "40ebc825-2e2c-4110-93ff-ae6ec3dc1322",
"metadata": {},
"outputs": [],
"source": [
- "type(hashlib.sha256(text).hexdigest())"
+ "docs_db.add_chunk(chunks[100].page_content)"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "1c90dd20-c640-48b5-88c0-4ba93b60c5e6",
+ "id": "4352b32d-c2e7-4fbf-aa05-fc46baf7c9f8",
"metadata": {},
"outputs": [],
"source": []