summary | refs | log | tree | commit | diff
path: root/notebooks
diff options
context:
space:
mode:
author Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-05 00:42:45 +0200
committer Gustaf Rydholm <gustaf.rydholm@gmail.com> 2024-04-05 00:42:45 +0200
commit 901212a2fd9658e9b51a4e977c68a47ef7b840b7 (patch)
tree 9ce41398c0a43557e62119633a40367b200d5e06 /notebooks
parent 064cd26f4ffdb28353315a6ecc28bf47cf575071 (diff)
Add notebook
Diffstat (limited to 'notebooks')
-rw-r--r-- notebooks/testing.ipynb 247
1 files changed, 247 insertions, 0 deletions
diff --git a/notebooks/testing.ipynb b/notebooks/testing.ipynb
new file mode 100644
index 0000000..36f3c16
--- /dev/null
+++ b/notebooks/testing.ipynb
@@ -0,0 +1,247 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c1f56ae3-a056-4b31-bcab-27c2c97c00f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pathlib import Path\n",
+ "from rag.parser.pdf import parser"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6b5cb12e-df7e-4532-b78b-216e11ed6161",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "path = Path(\"/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8e378ed2-e346-4360-8a8e-265f0f1d173d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f1a57be8-21a2-48d3-b99f-d1bbf7b8780a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chunks = parser(path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b848627e-1e30-492a-ada6-a899112cd515",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chunks[32]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5f01d39c-3310-4d6d-8b7b-f253975b5e37",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "type(chunks[32])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "49bf6768-1350-42ee-8526-2e6bee2dd7bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from rag.db.vector import VectorStore"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "98516091-7bc5-43c2-9aa7-967e63b795f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from rag.llms.encoder import Encoder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "14408e20-3dec-40b4-9dda-3397beb0c453",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "encoder = Encoder()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e94993c5-6b19-4bac-b7e8-c26a5200a626",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "embs = encoder.encode([chunks[33].page_content])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b845bb31-0909-42cb-9957-9a8b3bb0b5c4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "embs[0].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa279b1a-465e-4820-ab56-b25fc513c0a1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "vec_db = VectorStore()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1dc655de-2359-42ce-b705-76ec06c5f72f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "vec_db.add(embs[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "117d3416-e79f-436f-a33e-ffb45b972b72",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "q = \"the variance of the portfolio\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3a6ef474-678c-4525-8dcb-ece67aa9c7ea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "qe = encoder.query(q)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4c8a16ba-6025-4a6e-95c2-bbba7a9a5de5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s, i = vec_db.search(qe, 1)\n",
+ "s,i"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2d5f4601-82b0-47b7-8c79-938435591b59",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "embs[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e7a65b4d-7269-44bf-9b29-9ef0ee2b2052",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "qe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "016bae35-e22b-4e13-9c1d-6fcd09a8ca9f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import hashlib"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "66f7fd80-a970-4ad9-b5c9-02ab2e39855b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "document = [chunk.page_content for chunk in chunks]\n",
+ "text = str.encode(\"\".join(document))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9d385588-cdd5-4f8c-beae-31685a9f4c63",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "type(hashlib.sha256(text).hexdigest())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1c90dd20-c640-48b5-88c0-4ba93b60c5e6",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}