From 8fda223c36db69d7fb5c029bd55a1321c9812a7a Mon Sep 17 00:00:00 2001
From: Gustaf Rydholm <gustaf.rydholm@gmail.com>
Date: Sat, 6 Apr 2024 01:22:50 +0200
Subject: Update notebook

---
 notebooks/testing.ipynb | 308 ++++++++++++++++++++++--------------------------
 1 file changed, 138 insertions(+), 170 deletions(-)

diff --git a/notebooks/testing.ipynb b/notebooks/testing.ipynb
index 4d64cf2..521421c 100644
--- a/notebooks/testing.ipynb
+++ b/notebooks/testing.ipynb
@@ -8,19 +8,12 @@
    "outputs": [],
    "source": [
     "from pathlib import Path\n",
-    "from dotenv import load_dotenv\n",
-    "import numpy as np\n",
-    "\n",
-    "load_dotenv()\n",
     "\n",
     "from importlib.util import find_spec\n",
     "if find_spec(\"rag\") is None:\n",
     "    import sys\n",
     "    sys.path.append('..')\n",
-    "from rag.parser.pdf import parser\n",
-    "from rag.db.embeddings import Embeddings\n",
-    "from rag.llm.encoder import Encoder\n",
-    "from rag.db.documents import Documents"
+    "from rag.rag import RAG"
    ]
   },
   {
@@ -36,205 +29,180 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "id": "f1a57be8-21a2-48d3-b99f-d1bbf7b8780a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chunks = parser(path)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "14408e20-3dec-40b4-9dda-3397beb0c453",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "encoder = Encoder()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "e94993c5-6b19-4bac-b7e8-c26a5200a626",
+   "id": "b8382795-9610-4b24-80b7-31397b2faf90",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-04-06 01:20:04.673\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mrag.db.document\u001b[0m:\u001b[36m__configure\u001b[0m:\u001b[36m26\u001b[0m - \u001b[34m\u001b[1mCreating documents table if it does not exist...\u001b[0m\n",
+      "\u001b[32m2024-04-06 01:20:04.688\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mrag.db.vector\u001b[0m:\u001b[36m__configure\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCollection knowledge-base already exists...\u001b[0m\n"
+     ]
+    }
+   ],
    "source": [
-    "embs = encoder.encode_document(chunks)"
+    "rag = RAG()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 6,
-   "id": "aa279b1a-465e-4820-ab56-b25fc513c0a1",
+   "id": "1c6b48d2-eb04-4a7c-8224-78aabfc7c887",
    "metadata": {},
    "outputs": [],
    "source": [
-    "emb_db = Embeddings()"
+    "query = \"What is a factor model?\""
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 7,
-   "id": "1dc655de-2359-42ce-b705-76ec06c5f72f",
+   "id": "a95c8250-00b2-4cbc-a9c6-a76d14ef2da5",
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-04-06 01:20:35.606\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mrag.llm.encoder\u001b[0m:\u001b[36mencode_query\u001b[0m:\u001b[36m33\u001b[0m - \u001b[34m\u001b[1mEncoding query: What is a factor model?\u001b[0m\n",
+      "\u001b[32m2024-04-06 01:20:36.146\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mrag.db.vector\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m51\u001b[0m - \u001b[34m\u001b[1mSearching for vectors...\u001b[0m\n",
+      "\u001b[32m2024-04-06 01:20:36.150\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mrag.rag\u001b[0m:\u001b[36m__context\u001b[0m:\u001b[36m33\u001b[0m - \u001b[34m\u001b[1mGot 5 hits in the vector db with limit=5\u001b[0m\n",
+      "\u001b[32m2024-04-06 01:20:36.151\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mrag.llm.generator\u001b[0m:\u001b[36mgenerate\u001b[0m:\u001b[36m32\u001b[0m - \u001b[34m\u001b[1mGenerating answer...\u001b[0m\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "485\n"
+      "metaprompt = \n",
+      "You are a quant researcher.\n",
+      "Answer the following question using the provided context.\n",
+      "If you can't find the answer, do not pretend you know it,but answer \"I don't know\".\n",
+      "\n",
+      "Question: What is a factor model?\n",
+      "\n",
+      "Context:\n",
+      "2.4. Factor Model 23\n",
+      "Comparing the i.i.d. model (2.12) with the general model (2.7)-\n",
+      "(2.11), obviously we can see it is the simplest special case with µ=φ0,\n",
+      "Π=0,p= 0,q= 0,Σw=A0AT\n",
+      "0,m= 0, ands= 0. And the\n",
+      "conditional mean and covariance matrix are both constant:\n",
+      "µt=µ, (2.13)\n",
+      "Σt=Σw. (2.14)\n",
+      "This i.i.d. model assumption may look simple, however, it is one\n",
+      "of the most fundamental assumptions for many important works. One\n",
+      "example is the Nobel prize-winning Markowitz portfolio theory [135,\n",
+      "136, 137, 138, 179] that will be covered in Chapter 5.\n",
+      "2.4 Factor Model\n",
+      "If we look at (2.12) carefully, we may think that the dimension of the\n",
+      "marketalwaysequalsthenumberofassets N.However,thismaynotbe\n",
+      "true in practice. In general, the market is composed of a large number\n",
+      "ofassets(i.e., Nislarge),butitisusuallyobservedthatitsdimensionis\n",
+      "relatively small, that is, the market is only driven by a limited number\n",
+      "of factors, say Kfactors with K≪N.\n",
+      "The general factor model is\n",
+      "rt=φ0+h(ft) +wt, (2.15)\n",
+      "26 Modeling of Financial Time Series\n",
+      "2.4.2 Hidden Factors\n",
+      "The assumption of a linear model of (2.15) with hidden factors is that\n",
+      "the factors are not explicit market variables but are functions of rtthat\n",
+      "summarize as much information as possible.\n",
+      "One method is to deﬁne the hidden factors as aﬃne transformations\n",
+      "ofrtas follows:\n",
+      "ft=d+ΥTrt, (2.23)\n",
+      "where d∈RKandΥ∈RN×Kare parameters to be estimated.\n",
+      "Then the hidden factor model can be expressed as follows:\n",
+      "rt=φ0+Π(d+ΥTrt) +wt, (2.24)\n",
+      "which is a speciﬁc case of (2.15) with h(ft) =Πft,ft∈RKbeing\n",
+      "the hidden variables deﬁned in (2.23); Π∈RN×Kbeing the factor\n",
+      "loading matrix; and wtfollows an i.i.d. distribution with zero mean\n",
+      "and a (possibly diagonal) constant covariance matrix Σw.\n",
+      "The model (2.24) can be further simpliﬁed as follows:\n",
+      "rt=m+ΠΥTrt+wt, (2.25)\n",
+      "where m=φ0+Πdis an newly deﬁned parameter.\n",
+      "The parameters m,Π, and Υcan be estimated by the following\n",
+      "nonlinear least-square (LS) regression:\n",
+      "minimize\n",
+      "m,Π,ΥErt−m−ΠΥTrt2\n",
+      "2. (2.26)\n",
+      "ofassets(i.e., Nislarge),butitisusuallyobservedthatitsdimensionis\n",
+      "relatively small, that is, the market is only driven by a limited number\n",
+      "of factors, say Kfactors with K≪N.\n",
+      "The general factor model is\n",
+      "rt=φ0+h(ft) +wt, (2.15)\n",
+      "whereφ0denotes a constant vector; ft∈RKwithK≪Nis a vector\n",
+      "of a few factors that are responsible for most of the randomness in\n",
+      "the market, the vector function h:RK↦→RNdenotes how the low\n",
+      "dimensional factors aﬀect the higher dimensional market; and a resid-\n",
+      "ual vector wtof (possibly independent) perturbations that has only a\n",
+      "marginal eﬀect. In general, the function his assumed to be linear.\n",
+      "This approach of modeling enjoys a wide popularity; refer to [42,\n",
+      "66, 67, 68, 69, 70, 118] for some typical references.\n",
+      "In the following, we consider two speciﬁc models of (2.15) with\n",
+      "either explicit or hidden factors.\n",
+      "24 Modeling of Financial Time Series\n",
+      "2.4.1 Explicit Factors\n",
+      "The explicit factor model is\n",
+      "rt=φ0+Πft+wt, (2.16)\n",
+      "which is a speciﬁc case of (2.15) with h(ft) =Πft,ft∈RKbeing\n",
+      "explicitly observable market variables, and Π∈RN×Kbeing the factor\n",
+      "loading matrix.\n",
+      "Some popular explicit factors include returns on the market port-\n",
+      "folio2, growth rate of the GDP, interest rate on short term Treasury\n",
+      "bills, inﬂation rate, unemployment, etc. [171].\n",
+      "Obviously, the factor model with explicit factors (2.16) is a special\n",
+      "case of the general model (2.7)-(2.11) with exogenous input being the\n",
+      "factors xt=ft,p= 0, andq= 0.\n",
+      "In general, it is assumed that ftfollows an i.i.d. distribution with\n",
+      "constant mean µfand constant covariance matrix Σf,wtfollows an\n",
+      "i.i.d. distribution with zero mean and (possibly diagonal) constant co-\n",
+      "variance matrix Σw, and ftandwtare uncorrelated. Then the con-\n",
+      "ditional mean and covariance matrix are both constant and can be\n",
+      "computed as follows:\n",
+      "µt=E[rt|Ft−1] =E[rt] =φ0+Πµf (2.17)\n",
+      "Σt=E[(rt−µt)(rt−µt)T|Ft−1],\n",
+      "the explicit and hidden factor models are as follows:\n",
+      "•The explicit factor model tends to explain the log-returns with a\n",
+      "smaller number of fundamental or macroeconomic variables and\n",
+      "thus it is easier to interpret. However, in general there is no sys-\n",
+      "tematic method to choose the right factors.\n",
+      "•The hidden factor model employs PCA to explore the structure\n",
+      "of the covariance matrix and locate a low-dimensional subspace\n",
+      "that captures most of the variation in the log-returns. It is a more\n",
+      "systematical approach and thus it may provide a better explana-\n",
+      "tory power. One drawback of the hidden factors compared with\n",
+      "the explicit factors is that they do not have explicit econometric\n",
+      "interpretations.\n",
+      "2.5 VARMA Model\n",
+      "The previous i.i.d. and factor models, while commonly employed, do\n",
+      "not incorporate any time-dependency in the model for rt. In other\n",
+      "words, the conditional mean and covariance matrix are constant and\n",
+      "\n",
+      "Answer:\n",
+      "\n"
      ]
-    }
-   ],
-   "source": [
-    "emb_db.add(embs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "57173d80-9519-479e-9cd9-ba7ccdae7d6b",
-   "metadata": {},
-   "outputs": [
+    },
     {
      "data": {
       "text/plain": [
-       "CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=485, indexed_vectors_count=0, points_count=485, segments_count=8, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1024, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None), payload_schema={})"
+       "'A factor model is a financial modeling approach that assumes the observed variables are driven by a few underlying factors. The general form of a factor model is rt = φ0 + h(ft) + wt, where rt denotes the observed variable (e.g., stock prices), φ0 represents a constant vector, ft represents the few underlying factors responsible for most of the randomness in the market, and wt is a residual vector representing perturbations with only marginal effects. The function h(ft) represents how these underlying factors affect the higher-dimensional market. Factor models can be further categorized into explicit factor models, where the underlying factors are explicitly observable market variables, and hidden factor models, which employ Principal Component Analysis (PCA) to explore the structure of the covariance matrix and locate a low-dimensional subspace that captures most of the variation in the observed variables.'"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "emb_db.client.get_collection(collection_name=\"knowledge-base\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "56c9df8a-cbf6-4051-8f4b-cb1eb89a536e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "embs[125]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "117d3416-e79f-436f-a33e-ffb45b972b72",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "q = \"non-parametric least-square\\nestimation and the parametric MLE under Gaussian assumption?\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b31f0362-6def-4e50-a31c-8b7e2995c62b",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3a6ef474-678c-4525-8dcb-ece67aa9c7ea",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "qe = encoder.query(q)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4c8a16ba-6025-4a6e-95c2-bbba7a9a5de5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "hits = emb_db.search(qe, 100)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3fa73421-6df0-4f7b-96da-23a394eb442e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "hits"
+    "rag.rag(query, \"quant researcher\", limit=5)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "21e0aab5-7f42-4fcc-9495-446968fc0c88",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "emb_db.client.get_collection(collection_name=\"knowledge-base\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1c90dd20-c640-48b5-88c0-4ba93b60c5e6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "docs_db = Documents()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ed69d8bf-93f1-4353-a4c2-c4aacbe25420",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "docs_db.add_document(chunks)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "40ebc825-2e2c-4110-93ff-ae6ec3dc1322",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "docs_db.add_chunk(chunks[100].page_content)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0a4c565e-aace-4ea5-9093-9266b466b06c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from qdrant_client import QdrantClient"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4352b32d-c2e7-4fbf-aa05-fc46baf7c9f8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
-    "qdrant = QdrantClient(\"http://localhost:6333\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b8382795-9610-4b24-80b7-31397b2faf90",
+   "id": "119c26b6-fe4a-4f7a-abcc-169bffac12dd",
    "metadata": {},
    "outputs": [],
    "source": []
-- 
cgit v1.2.3-70-g09d2