From 901212a2fd9658e9b51a4e977c68a47ef7b840b7 Mon Sep 17 00:00:00 2001 From: Gustaf Rydholm Date: Fri, 5 Apr 2024 00:42:45 +0200 Subject: Add notebook --- notebooks/testing.ipynb | 247 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 notebooks/testing.ipynb (limited to 'notebooks') diff --git a/notebooks/testing.ipynb b/notebooks/testing.ipynb new file mode 100644 index 0000000..36f3c16 --- /dev/null +++ b/notebooks/testing.ipynb @@ -0,0 +1,247 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c1f56ae3-a056-4b31-bcab-27c2c97c00f1", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from rag.parser.pdf import parser" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b5cb12e-df7e-4532-b78b-216e11ed6161", + "metadata": {}, + "outputs": [], + "source": [ + "path = Path(\"/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e378ed2-e346-4360-8a8e-265f0f1d173d", + "metadata": {}, + "outputs": [], + "source": [ + "path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1a57be8-21a2-48d3-b99f-d1bbf7b8780a", + "metadata": {}, + "outputs": [], + "source": [ + "chunks = parser(path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b848627e-1e30-492a-ada6-a899112cd515", + "metadata": {}, + "outputs": [], + "source": [ + "chunks[32]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f01d39c-3310-4d6d-8b7b-f253975b5e37", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "type(chunks[32])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49bf6768-1350-42ee-8526-2e6bee2dd7bb", + "metadata": {}, + "outputs": [], + "source": [ + "from rag.db.vector import VectorStore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98516091-7bc5-43c2-9aa7-967e63b795f1", + "metadata": {}, + "outputs": [], + "source": [ + "from rag.llms.encoder import Encoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14408e20-3dec-40b4-9dda-3397beb0c453", + "metadata": {}, + "outputs": [], + "source": [ + "encoder = Encoder()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e94993c5-6b19-4bac-b7e8-c26a5200a626", + "metadata": {}, + "outputs": [], + "source": [ + "embs = encoder.encode([chunks[33].page_content])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b845bb31-0909-42cb-9957-9a8b3bb0b5c4", + "metadata": {}, + "outputs": [], + "source": [ + "embs[0].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa279b1a-465e-4820-ab56-b25fc513c0a1", + "metadata": {}, + "outputs": [], + "source": [ + "vec_db = VectorStore()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dc655de-2359-42ce-b705-76ec06c5f72f", + "metadata": {}, + "outputs": [], + "source": [ + "vec_db.add(embs[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "117d3416-e79f-436f-a33e-ffb45b972b72", + "metadata": {}, + "outputs": [], + "source": [ + "q = \"the variance of the portfolio\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a6ef474-678c-4525-8dcb-ece67aa9c7ea", + "metadata": {}, + "outputs": [], + "source": [ + "qe = encoder.query(q)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c8a16ba-6025-4a6e-95c2-bbba7a9a5de5", + "metadata": {}, + "outputs": [], + "source": [ + "s, i = vec_db.search(qe, 1)\n", + "s,i" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d5f4601-82b0-47b7-8c79-938435591b59", + "metadata": {}, + "outputs": [], + "source": [ + "embs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7a65b4d-7269-44bf-9b29-9ef0ee2b2052", + "metadata": {}, + "outputs": [], + "source": [ + "qe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "016bae35-e22b-4e13-9c1d-6fcd09a8ca9f", + "metadata": {}, + "outputs": [], + "source": [ + "import hashlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66f7fd80-a970-4ad9-b5c9-02ab2e39855b", + "metadata": {}, + "outputs": [], + "source": [ + "document = [chunk.page_content for chunk in chunks]\n", + "text = str.encode(\"\".join(document))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d385588-cdd5-4f8c-beae-31685a9f4c63", + "metadata": {}, + "outputs": [], + "source": [ + "type(hashlib.sha256(text).hexdigest())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c90dd20-c640-48b5-88c0-4ba93b60c5e6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- cgit v1.2.3-70-g09d2