{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-overview",
   "metadata": {},
   "source": [
    "# RAG pipeline scratch notebook\n",
    "\n",
    "Exercises the local `rag` package end to end:\n",
    "\n",
    "1. parse a PDF into chunks with `rag.parser.pdf.parser`,\n",
    "2. encode the chunks with `Encoder.encode_document`,\n",
    "3. add the embeddings to the `Embeddings` store and inspect the `knowledge-base` collection,\n",
    "4. encode a query and search the store,\n",
    "5. add the chunks to the `Documents` store.\n",
    "\n",
    "**Prerequisites**\n",
    "\n",
    "- A `.env` file (loaded with `load_dotenv()`) holding whatever settings `rag` needs.\n",
    "- A Qdrant instance reachable at `http://localhost:6333` for the direct-client cell at the end.\n",
    "- The input PDF; set `RAG_PDF_PATH` to point at it if it is not at the default location."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1f56ae3-a056-4b31-bcab-27c2c97c00f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from importlib.util import find_spec\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "from dotenv import load_dotenv\n",
    "from qdrant_client import QdrantClient\n",
    "\n",
    "# Load .env before importing `rag`, as the original cell did\n",
    "# (presumably `rag` reads settings from the environment -- TODO confirm).\n",
    "load_dotenv()\n",
    "\n",
    "# When `rag` is not importable, fall back to the parent directory of the\n",
    "# current working directory (the repository root when run from this folder).\n",
    "if find_spec(\"rag\") is None:\n",
    "    sys.path.append(\"..\")\n",
    "\n",
    "from rag.parser.pdf import parser\n",
    "from rag.db.embeddings import Embeddings\n",
    "from rag.llm.encoder import Encoder\n",
    "from rag.db.documents import Documents"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-config",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "\n",
    "Everything a reader might need to tune lives in the next cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b5cb12e-df7e-4532-b78b-216e11ed6161",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input PDF. Override with the RAG_PDF_PATH environment variable (or .env);\n",
    "# the default resolves under the current user's home directory instead of a\n",
    "# hard-coded absolute path.\n",
    "DEFAULT_PDF_PATH = (\n",
    "    Path.home()\n",
    "    / \"projects/library/quant/math\"\n",
    "    / \"a-signal-processing-perspective-on-financial-engineering.pdf\"\n",
    ")\n",
    "path = Path(os.environ.get(\"RAG_PDF_PATH\") or DEFAULT_PDF_PATH)\n",
    "\n",
    "# Fail fast with an actionable message before the parse/encode steps run.\n",
    "if not path.is_file():\n",
    "    raise FileNotFoundError(f\"PDF not found: {path} (set RAG_PDF_PATH to override)\")\n",
    "\n",
    "# Second argument passed to `emb_db.search`\n",
    "# (presumably the number of hits to return -- TODO confirm).\n",
    "TOP_K = 100\n",
    "\n",
    "# Endpoint used by the direct QdrantClient connection in the last section.\n",
    "QDRANT_URL = \"http://localhost:6333\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-parse-encode",
   "metadata": {},
   "source": [
    "## Parse and encode the document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f1a57be8-21a2-48d3-b99f-d1bbf7b8780a",
   "metadata": {},
   "outputs": [],
   "source": [
    "chunks = parser(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "14408e20-3dec-40b4-9dda-3397beb0c453",
   "metadata": {},
   "outputs": [],
   "source": [
    "encoder = Encoder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e94993c5-6b19-4bac-b7e8-c26a5200a626",
   "metadata": {},
   "outputs": [],
   "source": [
    "embs = encoder.encode_document(chunks)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-store-embeddings",
   "metadata": {},
   "source": [
    "## Store the embeddings\n",
    "\n",
    "In the recorded run the collection check below reports 485 vectors of size 1024."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "aa279b1a-465e-4820-ab56-b25fc513c0a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "emb_db = Embeddings()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "1dc655de-2359-42ce-b705-76ec06c5f72f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "485\n"
     ]
    }
   ],
   "source": [
    "emb_db.add(embs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "57173d80-9519-479e-9cd9-ba7ccdae7d6b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CollectionInfo(status=, optimizer_status=, vectors_count=485, indexed_vectors_count=0, points_count=485, segments_count=8, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1024, distance=, hnsw_config=None, quantization_config=None, on_disk=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None), payload_schema={})"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "emb_db.client.get_collection(collection_name=\"knowledge-base\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56c9df8a-cbf6-4051-8f4b-cb1eb89a536e",
   "metadata": {},
   "outputs": [],
   "source": [
    "embs[125]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-query",
   "metadata": {},
   "source": [
    "## Query the embeddings store\n",
    "\n",
    "Encode a free-text query with `encoder.query` and search the store with it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "117d3416-e79f-436f-a33e-ffb45b972b72",
   "metadata": {},
   "outputs": [],
   "source": [
    "q = \"non-parametric least-square\\nestimation and the parametric MLE under Gaussian assumption?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a6ef474-678c-4525-8dcb-ece67aa9c7ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "qe = encoder.query(q)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c8a16ba-6025-4a6e-95c2-bbba7a9a5de5",
   "metadata": {},
   "outputs": [],
   "source": [
    "hits = emb_db.search(qe, TOP_K)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3fa73421-6df0-4f7b-96da-23a394eb442e",
   "metadata": {},
   "outputs": [],
   "source": [
    "hits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21e0aab5-7f42-4fcc-9495-446968fc0c88",
   "metadata": {},
   "outputs": [],
   "source": [
    "emb_db.client.get_collection(collection_name=\"knowledge-base\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-documents",
   "metadata": {},
   "source": [
    "## Documents store\n",
    "\n",
    "Add the parsed chunks to the `Documents` store, then add the text of a single chunk via `add_chunk`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c90dd20-c640-48b5-88c0-4ba93b60c5e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs_db = Documents()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed69d8bf-93f1-4353-a4c2-c4aacbe25420",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs_db.add_document(chunks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40ebc825-2e2c-4110-93ff-ae6ec3dc1322",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs_db.add_chunk(chunks[100].page_content)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-direct-client",
   "metadata": {},
   "source": [
    "## Direct Qdrant client\n",
    "\n",
    "Opens a client against `QDRANT_URL` for ad-hoc inspection, independent of `emb_db.client`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4352b32d-c2e7-4fbf-aa05-fc46baf7c9f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "qdrant = QdrantClient(QDRANT_URL)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}