{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c1f56ae3-a056-4b31-bcab-27c2c97c00f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "# If the `rag` package is not importable (e.g. not installed), fall back to\n",
    "# the parent of the current working directory so the import below resolves.\n",
    "from importlib.util import find_spec\n",
    "if find_spec(\"rag\") is None:\n",
    "    import sys\n",
    "    sys.path.append('..')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "240df289-d9d6-424b-a5b9-e09dcfefd57e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from rag.retriever.retriever import Retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "01305de3-0d6d-46e7-a0f0-2e2a2f72d563",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2024-04-20 20:49:04.904\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.document\u001b[0m:\u001b[36m__configure\u001b[0m:\u001b[36m28\u001b[0m - \u001b[34m\u001b[1mCreating documents table if it does not exist...\u001b[0m\n",
      "\u001b[32m2024-04-20 20:49:04.922\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.vector\u001b[0m:\u001b[36m__configure\u001b[0m:\u001b[36m43\u001b[0m - \u001b[34m\u001b[1mCollection knowledge-base already exists!\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# Load variables from a local .env file first\n",
    "# (presumably read by Retriever for its connection settings - confirm).\n",
    "load_dotenv()\n",
    "ret = Retriever()\n",
    "vecdb = ret.vec_db\n",
    "\n",
    "# Name of the vector collection inspected by the cells below.\n",
    "COLLECTION_NAME = \"knowledge-base\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c83210e2-a5a3-46eb-bd77-aedc0d223190",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CountResult(count=17918)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of points currently stored in the collection.\n",
    "# NOTE(review): this cell used the undefined name `client`; assumes `ret.vec_db`\n",
    "# is the object that exposes the underlying DB client as `.client` - confirm.\n",
    "vecdb.client.count(COLLECTION_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4fcb1862-19a3-482b-b397-ce7be074c187",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DESTRUCTIVE: deletes the whole collection. Kept opt-in so that\n",
    "# Restart & Run All cannot wipe the knowledge base by accident.\n",
    "RESET_COLLECTION = False\n",
    "if RESET_COLLECTION:\n",
    "    vecdb.client.delete_collection(COLLECTION_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c86d538c-d613-4e4d-8dd2-3abcea2cfeef",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CollectionInfo(status=, optimizer_status=, vectors_count=17918, indexed_vectors_count=0, points_count=17918, segments_count=8, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1024, distance=, hnsw_config=None, quantization_config=None, on_disk=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None), payload_schema={})"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collection status and configuration (vector size, HNSW and optimizer settings).\n",
    "vecdb.client.get_collection(COLLECTION_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "c346ef16-4884-4c3a-b03c-9f789ca00212",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2024-04-20 20:49:52.237\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.retriever\u001b[0m:\u001b[36mretrieve\u001b[0m:\u001b[36m49\u001b[0m - \u001b[34m\u001b[1mFinding documents matching query: what is a convex function?\u001b[0m\n",
      "\u001b[32m2024-04-20 20:49:52.239\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.encoder\u001b[0m:\u001b[36mencode_query\u001b[0m:\u001b[36m41\u001b[0m - \u001b[34m\u001b[1mEncoding query: what is a convex function?\u001b[0m\n",
      "\u001b[32m2024-04-20 20:49:53.191\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m |
\u001b[36mrag.retriever.vector\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m58\u001b[0m - \u001b[34m\u001b[1mSearching for vectors...\u001b[0m\n", "\u001b[32m2024-04-20 20:49:53.198\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.vector\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m65\u001b[0m - \u001b[34m\u001b[1mGot 5 hits in the vector db with limit=5\u001b[0m\n" ] }, { "data": { "text/plain": [ "[Document(title='Haskell Design Patterns.pdf', text='N\\nnon-tail\\trecursion\\t/\\t\\nNon-tail\\trecursion'),\n", " Document(title='LinuxNotesForProfessionals.pdf', text='GoalKicker.com – Linux ® Notes for Professionals 44sudo systemctl disable sshd.service\\nDebian\\nsudo /etc/init.d/ssh stop\\nsudo systemctl disable sshd.service\\nArch Linux\\nsudo killall sshd\\nsudo systemctl disable sshd.service'),\n", " Document(title='Kubernetes_Deployment_Antipatterns_v1.1.pdf', text='Anti-pattern 12\\nNot using the Helm package manager\\n37'),\n", " Document(title='Docker_anti_patterns_vertical_3.pdf', text='Docker Anti-PatternsCONTINUOUS DEPLOYMENT / DELIVERY'),\n", " Document(title='Haskell Design Patterns.pdf', text='B\\nbind\\tchain\\nand\\tmonad\\t/\\t\\nMonads\\tand\\tthe\\tbind\\tchain')]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ret.retrieve(\"what is a convex function?\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "36d174df-aa21-4708-ad52-43bb8cfa80e5", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2024-04-20 20:50:18.285\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.retriever\u001b[0m:\u001b[36mretrieve\u001b[0m:\u001b[36m49\u001b[0m - \u001b[34m\u001b[1mFinding documents matching query: what is a hidden markov model?\u001b[0m\n", "\u001b[32m2024-04-20 20:50:18.286\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.encoder\u001b[0m:\u001b[36mencode_query\u001b[0m:\u001b[36m41\u001b[0m 
- \u001b[34m\u001b[1mEncoding query: what is a hidden markov model?\u001b[0m\n", "\u001b[32m2024-04-20 20:50:18.357\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.vector\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m58\u001b[0m - \u001b[34m\u001b[1mSearching for vectors...\u001b[0m\n", "\u001b[32m2024-04-20 20:50:18.364\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.vector\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m65\u001b[0m - \u001b[34m\u001b[1mGot 5 hits in the vector db with limit=5\u001b[0m\n" ] }, { "data": { "text/plain": [ "[Document(title='thinkocaml.pdf', text='44 Chapter 5. Recursive Functions'),\n", " Document(title='Haskell Design Patterns.pdf', text='N\\nnon-tail\\trecursion\\t/\\t\\nNon-tail\\trecursion'),\n", " Document(title='thinkdsp.pdf', text='Think DSP\\nDigital Signal Processing in Python\\nVersion 1.1.4\\nAllen B. Downey\\nGreen Tea Press\\nNeedham, Massachusetts'),\n", " Document(title='PFP-0.1.pdf', text='ii'),\n", " Document(title='Learning Bayesian Networks(Neapolitan, Richard).pdf', text='ii')]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ret.retrieve(\"what is a hidden markov model?\")" ] }, { "cell_type": "code", "execution_count": 10, "id": "091c5da0-3a9f-4100-8b2d-ee93a8cf3234", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2024-04-20 20:50:42.102\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.retriever\u001b[0m:\u001b[36mretrieve\u001b[0m:\u001b[36m49\u001b[0m - \u001b[34m\u001b[1mFinding documents matching query: what is the weather today?\u001b[0m\n", "\u001b[32m2024-04-20 20:50:42.104\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.encoder\u001b[0m:\u001b[36mencode_query\u001b[0m:\u001b[36m41\u001b[0m - \u001b[34m\u001b[1mEncoding query: what is the weather today?\u001b[0m\n", "\u001b[32m2024-04-20 20:50:42.175\u001b[0m | 
\u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.vector\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m58\u001b[0m - \u001b[34m\u001b[1mSearching for vectors...\u001b[0m\n", "\u001b[32m2024-04-20 20:50:42.181\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.vector\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m65\u001b[0m - \u001b[34m\u001b[1mGot 5 hits in the vector db with limit=5\u001b[0m\n" ] }, { "data": { "text/plain": [ "[Document(title='Haskell Design Patterns.pdf', text='N\\nnon-tail\\trecursion\\t/\\t\\nNon-tail\\trecursion'),\n", " Document(title='category-theory-for-programmers.pdf', text='Category Theory\\nfor Programmers\\nByBartosz Milewski\\ncompiled and edited by\\nIgal Tabachnik'),\n", " Document(title='CNotesForProfessionals.pdf', text='Section 22.10: Pointer to Pointer 141 .............................................................................................................................. \\nSection 22.11: void* pointers as arguments and return values to standard functions 141 ....................................... \\nSection 22.12: Same Asterisk, Di\\ue023erent Meanings 142 ................................................................................................. \\nChapter 23: Sequence points 144 .............................................................................................................................. \\nSection 23.1: Unsequenced expressions 144 .................................................................................................................. \\nSection 23.2: Sequenced expressions 144 ..................................................................................................................... \\nSection 23.3: Indeterminately sequenced expressions 145 ......................................................................................... 
\\nChapter 24: Function Pointers 146 ........................................................................................................................... \\nSection 24.1: Introduction 146 .......................................................................................................................................... \\nSection 24.2: Returning Function Pointers from a Function 146 ................................................................................. \\nSection 24.3: Best Practices 147 ..................................................................................................................................... \\nSection 24.4: Assigning a Function Pointer 149 ............................................................................................................. \\nSection 24.5: Mnemonic for writing function pointers 149 ........................................................................................... \\nSection 24.6: Basics 150 ................................................................................................................................................... \\nChapter 25: Function Parameters 152 .................................................................................................................... \\nSection 25.1: Parameters are passed by value 152 ...................................................................................................... \\nSection 25.2: Passing in Arrays to Functions 152 .......................................................................................................... \\nSection 25.3: Order of function parameter execution 153 ........................................................................................... \\nSection 25.4: Using pointer parameters to return multiple values 153 ...................................................................... 
\\nSection 25.5: Example of function returning struct containing values with error codes 154 ................................... \\nChapter 26: Pass 2D-arrays to functions 156 ..................................................................................................... \\nSection 26.1: Pass a 2D-array to a function 156 ........................................................................................................... \\nSection 26.2: Using flat arrays as 2D arrays 162 .......................................................................................................... \\nChapter 27: Error handling 163 .................................................................................................................................. \\nSection 27.1: errno 163 .....................................................................................................................................................'),\n", " Document(title='Beginning Haskell_ A Project-Based Approach.pdf', text='do Notation �������������������������������������������������������������������������������������������������������������������������������������������������������� 150\\nMonad Laws ������������������������������������������������������������������������������������������������������������������������������������������������������ 152'),\n", " Document(title='Learning Bayesian Networks(Neapolitan, Richard).pdf', text='viii CONTENTS')]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ret.retrieve(\"what is the weather today?\")" ] }, { "cell_type": "code", "execution_count": 11, "id": "8830235e-1cdc-46b1-9a0f-96d7df6fc183", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2024-04-20 20:51:23.336\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.retriever\u001b[0m:\u001b[36mretrieve\u001b[0m:\u001b[36m49\u001b[0m - \u001b[34m\u001b[1mFinding documents 
matching query: what is ocaml?\u001b[0m\n", "\u001b[32m2024-04-20 20:51:23.337\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.encoder\u001b[0m:\u001b[36mencode_query\u001b[0m:\u001b[36m41\u001b[0m - \u001b[34m\u001b[1mEncoding query: what is ocaml?\u001b[0m\n", "\u001b[32m2024-04-20 20:51:23.409\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.vector\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m58\u001b[0m - \u001b[34m\u001b[1mSearching for vectors...\u001b[0m\n", "\u001b[32m2024-04-20 20:51:23.416\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mrag.retriever.vector\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m65\u001b[0m - \u001b[34m\u001b[1mGot 5 hits in the vector db with limit=5\u001b[0m\n" ] }, { "data": { "text/plain": [ "[Document(title='Haskell Design Patterns.pdf', text='N\\nnon-tail\\trecursion\\t/\\t\\nNon-tail\\trecursion'),\n", " Document(title='category-theory-for-programmers.pdf', text='Category Theory\\nfor Programmers\\nByBartosz Milewski\\ncompiled and edited by\\nIgal Tabachnik'),\n", " Document(title='Docker_anti_patterns_vertical_3.pdf', text='Docker Anti-PatternsCONTINUOUS DEPLOYMENT / DELIVERY'),\n", " Document(title='thinkocaml.pdf', text='90 Chapter 12. 
Hashtables'),\n", " Document(title='Beginning Haskell_ A Project-Based Approach.pdf', text='do Notation �������������������������������������������������������������������������������������������������������������������������������������������������������� 150\\nMonad Laws ������������������������������������������������������������������������������������������������������������������������������������������������������ 152')]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ret.retrieve(\"what is ocaml?\")" ] }, { "cell_type": "code", "execution_count": null, "id": "d6d0c2d0-25a3-446b-a103-b8b56c82296c", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.8" } }, "nbformat": 4, "nbformat_minor": 5 }