From d487ef8b04cc7f5ac1491f0638f902fe2abe5ac5 Mon Sep 17 00:00:00 2001 From: Gustaf Rydholm Date: Mon, 8 Apr 2024 22:28:47 +0200 Subject: Wip refactor --- notebooks/check-pdf.ipynb | 402 ++++++++++++++++++++++++++++++++++++++++++++ notebooks/testing.ipynb | 14 +- poetry.lock | 314 +++++++++++++++++++++++++++++++++- pyproject.toml | 1 + rag/cli.py | 2 +- rag/db/vector.py | 25 ++- rag/llm/cohere_generator.py | 29 ++++ rag/llm/encoder.py | 15 +- rag/llm/generator.py | 33 ---- rag/llm/ollama_generator.py | 76 +++++++++ rag/parser/pdf.py | 13 +- rag/rag.py | 34 ++-- rag/ui.py | 59 ++++--- 13 files changed, 925 insertions(+), 92 deletions(-) create mode 100644 notebooks/check-pdf.ipynb create mode 100644 rag/llm/cohere_generator.py delete mode 100644 rag/llm/generator.py create mode 100644 rag/llm/ollama_generator.py diff --git a/notebooks/check-pdf.ipynb b/notebooks/check-pdf.ipynb new file mode 100644 index 0000000..a98f8c1 --- /dev/null +++ b/notebooks/check-pdf.ipynb @@ -0,0 +1,402 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "fad5b45a-158a-4484-a34e-453f1b3316cc", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "from importlib.util import find_spec\n", + "if find_spec(\"rag\") is None:\n", + " import sys\n", + " sys.path.append('..')\n", + "from rag.parser.pdf import PDFParser" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6762495a-fc4e-4022-963e-e302e095b0cf", + "metadata": {}, + "outputs": [], + "source": [ + "p = PDFParser()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1eeef01f-497b-4069-8cde-fa018e99ce52", + "metadata": {}, + "outputs": [], + "source": [ + "path = Path(\"/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9ac72c90-fea7-4f96-a648-691a01e5b38b", + "metadata": {}, + "outputs": [], + "source": [ + "d = p.from_data(p.from_path(path))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "24d3edd0-8430-4f54-b03a-4dcce67d2cff", + "metadata": {}, + "outputs": [], + "source": [ + "d[1]\n", + "b = p.from_path(path)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0a2c5f13-17f9-4674-b524-4bba7e05754a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "langchain_core.document_loaders.blob_loaders.Blob" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(b)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8496ab61-dfa8-45be-bea0-4a31fc5476da", + "metadata": {}, + "outputs": [], + "source": [ + "Path(d[1].metadata[\"source\"]).name" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "00df538f-2d15-47c8-87dd-639582c41cbb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='A Signal Processing\\nPerspective on Financial\\nEngineering\\nYiyong Feng\\nDept. of Electronic and Computer Engineering\\nThe Hong Kong University of Science and Technology\\nClear Water Bay, Kowloon\\nHong Kong\\nyiyong@connect.ust.hk\\nDaniel P. Palomar\\nDept. 
of Electronic and Computer Engineering\\nThe Hong Kong University of Science and Technology\\nClear Water Bay, Kowloon\\nHong Kong\\npalomar@ust.hk\\nBoston — Delft', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 0}),\n", + " Document(page_content='Foundations and TrendsR⃝in Signal Processing\\nPublished, sold and distributed by:\\nnow Publishers Inc.\\nPO Box 1024\\nHanover, MA 02339\\nUnited States\\nTel. +1-781-985-4510\\nwww.nowpublishers.com\\nsales@nowpublishers.com\\nOutside North America:\\nnow Publishers Inc.\\nPO Box 179\\n2600 AD Delft\\nThe Netherlands\\nTel. +31-6-51115274\\nThe preferred citation for this publication is\\nY. Feng and D. P. Palomar. A Signal Processing Perspective on Financial\\nEngineering . Foundations and TrendsR⃝in Signal Processing, vol. 9, no. 1-2,\\npp. 1–231, 2015.\\nThis Foundations and TrendsR⃝issue was typeset in LATEX using a class file designed\\nby Neal Parikh. Printed on acid-free paper.\\nISBN: 978-1-68083-119-1\\nc⃝2016 Y. Feng and D. P. Palomar\\nAll rights reserved. No part of this publication may be reproduced, stored in a retrieval\\nsystem, or transmitted in any form or by any means, mechanical, photocopying, recording\\nor otherwise, without prior written permission of the publishers.\\nPhotocopying. In the USA: This journal is registered at the Copyright Clearance Cen-\\nter, Inc., 222 Rosewood Drive, Danvers, MA 01923. Authorization to photocopy items for\\ninternal or personal use, or the internal or personal use of specific clients, is granted by\\nnow Publishers Inc for users registered with the Copyright Clearance Center (CCC). The\\n‘services’ for users can be found on the internet at: www.copyright.com\\nFor those organizations that have been granted a photocopy license, a separate system\\nof payment has been arranged. Authorization does not extend to other kinds of copy-\\ning, such as that for general distribution, for advertising or promotional purposes, for\\ncreating new collective works, or for resale. In the rest of the world: Permission to pho-\\ntocopy must be obtained from the copyright owner. Please apply to now Publishers Inc.,\\nPO Box 1024, Hanover, MA 02339, USA; Tel. +1 781 871 0245; www.nowpublishers.com;\\nsales@nowpublishers.com\\nnow Publishers Inc. has an exclusive license to publish this material worldwide. Permission\\nto use this content must be obtained from the copyright license holder. Please apply to\\nnow Publishers, PO Box 179, 2600 AD Delft, The Netherlands, www.nowpublishers.com;\\ne-mail: sales@nowpublishers.com', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 1}),\n", + " Document(page_content='Foundations and TrendsR⃝in Signal Processing\\nVolume 9, Issue 1-2, 2015\\nEditorial Board\\nEditor-in-Chief\\nYonina Eldar\\nTechnion - Israel Institute of Technology\\nIsrael\\nEditors\\nRobert M. Gray\\nFounding Editor-in-Chief\\nStanford University\\nPao-Chi Chang\\nNCU, Taiwan\\nPamela Cosman\\nUC San Diego\\nMichelle Effros\\nCaltech\\nYariv Ephraim\\nGMU\\nAlfonso Farina\\nSelex ES\\nSadaoki Furui\\nTokyo Tech\\nGeorgios Giannakis\\nUniversity of Minnesota\\nVivek Goyal\\nBoston University\\nSinan Gunturk\\nCourant Institute\\nChristine Guillemot\\nINRIA\\nRobert W. 
Heath, Jr.\\nUT AustinSheila Hemami\\nNortheastern University\\nLina Karam\\nArizona State U\\nNick Kingsbury\\nUniversity of Cambridge\\nAlex Kot\\nNTU, Singapore\\nJelena Kovacevic\\nCMU\\nGeert Leus\\nTU Delft\\nJia Li\\nPenn State\\nHenrique Malvar\\nMicrosoft Research\\nB.S. Manjunath\\nUC Santa Barbara\\nUrbashi Mitra\\nUSC\\nBjörn Ottersten\\nKTH Stockholm\\nVincent Poor\\nPrinceton UniversityAnna Scaglione\\nUC Davis\\nMihaela van der Shaar\\nUCLA\\nNicholas D. Sidiropoulos\\nTU Crete\\nMichael Unser\\nEPFL\\nP. P. Vaidyanathan\\nCaltech\\nAmi Wiesel\\nHebrew U\\nMin Wu\\nUniversity of Maryland\\nJosiane Zerubia\\nINRIA', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 2}),\n", + " Document(page_content='Editorial Scope\\nTopics\\nFoundations and TrendsR⃝in Signal Processing publishes survey and\\ntutorial articles in the following topics:\\n•Adaptive signal processing\\n•Audio signal processing\\n•Biological and biomedical signal\\nprocessing\\n•Complexity in signal processing\\n•Digital signal processing\\n•Distributed and network signal\\nprocessing\\n•Image and video processing\\n•Linear and nonlinear filtering\\n•Multidimensional signal\\nprocessing\\n•Multimodal signal processing\\n•Multirate signal processing\\n•Multiresolution signal processing\\n•Nonlinear signal processing\\n•Randomized algorithms in signal\\nprocessing\\n•Sensor and multiple source signal\\nprocessing, source separation•Signal decompositions, subband\\nand transform methods, sparse\\nrepresentations\\n•Signal processing for\\ncommunications\\n•Signal processing for security and\\nforensic analysis, biometric signal\\nprocessing\\n•Signal quantization, sampling,\\nanalog-to-digital conversion,\\ncoding and compression\\n•Signal reconstruction,\\ndigital-to-analog conversion,\\nenhancement, decoding and\\ninverse problems\\n•Speech/audio/image/video\\ncompression\\n•Speech and spoken language\\nprocessing\\n•Statistical/machine learning\\n•Statistical signal processing\\nInformation for Librarians\\nFoundationsandTrendsR⃝inSignalProcessing,2015,Volume9,4issues.ISSN\\npaper version 1932-8346. ISSN online version 1932-8354. Also available as a\\ncombined paper and online subscription.', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 3}),\n", + " Document(page_content='Foundations and TrendsR⃝in Signal Processing\\nVol. 9, No. 1-2 (2015) 1–231\\nc⃝2016 Y. Feng and D. P. Palomar\\nDOI: 10.1561/2000000072\\nA Signal Processing Perspective on Financial\\nEngineering\\nYiyong Feng\\nDept. of Electronic and Computer Engineering\\nThe Hong Kong University of Science and Technology\\nClear Water Bay, Kowloon\\nHong Kong\\nyiyong@connect.ust.hk\\nDaniel P. Palomar\\nDept. of Electronic and Computer Engineering\\nThe Hong Kong University of Science and Technology\\nClear Water Bay, Kowloon\\nHong Kong\\npalomar@ust.hk', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 4}),\n", + " Document(page_content='Contents\\n1 Introduction 2\\n1.1 A Signal Processing Perspective on Financial Engineering . 5\\n1.2 Connections between Fin. Eng. and Signal Process. . . . . 9\\n1.3 Outline . . . . . . . . . . . . . . . . . . . . . . . . . . . . 12\\nI Financial Modeling & Order Execution 16\\n2 Modeling of Financial Time Series 17\\n2.1 Asset Returns . . . . . . . . . 
. . . . . . . . . . . . . . . 18\\n2.2 General Structure of a Model . . . . . . . . . . . . . . . . 21\\n2.3 I.I.D. Model . . . . . . . . . . . . . . . . . . . . . . . . . 22\\n2.4 Factor Model . . . . . . . . . . . . . . . . . . . . . . . . 23\\n2.5 VARMA Model . . . . . . . . . . . . . . . . . . . . . . . 27\\n2.6 VECM . . . . . . . . . . . . . . . . . . . . . . . . . . . . 31\\n2.7 Conditional Volatility Models . . . . . . . . . . . . . . . . 34\\n2.8 Summary of Different Models and Their Limitations . . . . 42\\n3 Modeling Fitting: Mean and Covariance Matrix Estimators 47\\n3.1 Fitting Process, Types of Estimators, and Main Focus . . . 47\\n3.2 Warm Up: Large Sample Regime . . . . . . . . . . . . . . 50\\n3.3 Small Sample Regime: Shrinkage Estimators . . . . . . . . 59\\nii', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 5}),\n", + " Document(page_content='iii\\n3.4 Heavy Tail Issue: Robust Estimators . . . . . . . . . . . . 70\\n3.5 Small Sample Regime & Heavy Tail Issue . . . . . . . . . 74\\n3.6 Summary of Different Estimators . . . . . . . . . . . . . . 83\\n4 Order Execution 85\\n4.1 Limit Order Book and Market Impact . . . . . . . . . . . 85\\n4.2 Price Model and Execution Cost . . . . . . . . . . . . . . 91\\n4.3 Minimizing Expected Execution Cost . . . . . . . . . . . . 94\\n4.4 Minimizing Mean-Variance Trade-off of Execution Cost . . 94\\n4.5 Minimizing CVaR of Execution Cost . . . . . . . . . . . . 95\\nII Portfolio Optimization (Risk-Return Trade-off) 101\\n5 Portfolio Optimization with Known Parameters 102\\n5.1 Markowitz Mean-Variance Portfolio Optimization . . . . . 103\\n5.2 Drawbacks of Markowitz Framework . . . . . . . . . . . . 111\\n5.3 Black-Litterman Model . . . . . . . . . . . . . . . . . . . 114\\n6 Robust Portfolio Optimization 120\\n6.1 Robust Mean-Variance Trade-off Portfolio Optimization . . 121\\n6.2 Robust Sharpe ratio Optimization . . . . . . . . . . . . . 128\\n6.3 Connections with Robust Beamforming . . . . . . . . . . . 131\\n7 Multi-Portfolio Optimization 135\\n7.1 From Single-Portfolio to Multi-Portfolio . . . . . . . . . . 136\\n7.2 Multi-Portfolio Problems . . . . . . . . . . . . . . . . . . 139\\n7.3 Efficient Solving Methods . . . . . . . . . . . . . . . . . . 142\\n8 Index Tracking 148\\n8.1 Different Index Tracking Methods . . . . . . . . . . . . . 149\\n8.2 Sparse Index Tracking: Two-Step Approach . . . . . . . . 151\\n8.3 Sparse Index Tracking: Joint Optimization Approach . . . 154\\n9 Risk Parity Portfolio Optimization 161\\n9.1 What is a Risk Parity Portfolio? . . . . . . . . . . . . . . . 162\\n9.2 Risk Parity Portfolio Formulations . . . . . . . . . . . . . 165', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 6}),\n", + " Document(page_content='iv\\n9.3 SCRIP: An Efficient Numerical Solving Approach . . . . . 169\\nIII Statistical Arbitrage (Mean-Reversion) 172\\n10 Statistical Arbitrage 173\\n10.1 Cointegration versus Correlation . . . . . . . . . . . . . . 174\\n10.2 Pairs Selection . . . . . . . . . . . . . . . . . . . . . . . . 181\\n10.3 Cointegration Test . . . . . . . . . . . . . . . . . . . . . . 184\\n10.4 Investing in Cointegrated Pairs . . . . . . . . . . . . . . . 192\\n10.5 From Pairs Trading to Statistical Arbitrage . . . . . . . . 
198\\n11 Conclusions 201\\nAppendices 203\\nA MATLAB Code of Example 3.1 204\\nB MATLAB Code of Figure 5.1 207\\nC MATLAB Code of Example 10.4 209\\nAbbreviations 211\\nNotation 213\\nReferences 216', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 7}),\n", + " Document(page_content='Abstract\\nFinancial engineering and electrical engineering are seemingly differ-\\nent areas that share strong underlying connections. Both areas rely\\non statistical analysis and modeling of systems; either modeling the\\nfinancial markets or modeling, say, wireless communication channels.\\nHaving a model of reality allows us to make predictions and to optimize\\nthe strategies. It is as important to optimize our investment strategies\\nin a financial market as it is to optimize the signal transmitted by an\\nantenna in a wireless link.\\nThis monograph provides a survey of financial engineering from a\\nsignal processing perspective, that is, it reviews financial modeling, the\\ndesign of quantitative investment strategies, and order execution with\\ncomparison to seemingly different problems in signal processing and\\ncommunication systems, such as signal modeling, filter/beamforming\\ndesign, network scheduling, and power allocation.\\nY. Feng and D. P. Palomar. A Signal Processing Perspective on Financial\\nEngineering . Foundations and TrendsR⃝in Signal Processing, vol. 9, no. 1-2,\\npp. 1–231, 2015.\\nDOI: 10.1561/2000000072.', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 8}),\n", + " Document(page_content='1\\nIntroduction\\nDespitethedifferentnaturesoffinancialengineeringandelectricalengi-\\nneering, both areas are intimately connected on a mathematical level.\\nThe foundations of financial engineering lie on the statistical analy-\\nsis of numerical time series and the modeling of the behavior of the\\nfinancial markets in order to perform predictions and systematically\\noptimize investment strategies. Similarly, the foundations of electrical\\nengineering, for instance, wireless communication systems, lie on statis-\\ntical signal processing and the modeling of communication channels in\\norder to perform predictions and systematically optimize transmission\\nstrategies. Both foundations are the same in disguise.\\nThisobservationimmediatelypromptsthequestionofwhetherboth\\nareas can benefit from each other. It is often the case in science that the\\nsame or very similar methodologies are developed and applied indepen-\\ndently in different areas. The purpose of this monograph is to explore\\nsuch connections and to capitalize on the existing mathematical tools\\ndeveloped in wireless communications and signal processing to solve\\nreal-life problems arising in the financial markets in an unprecedented\\nway.\\n2', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 9}),\n", + " Document(page_content='3\\nThus,thismonographisaboutinvestmentinfinancialassetstreated\\nas a signal processing and optimization problem. An investment is the\\ncurrent commitment of resources in the expectation of reaping future\\nbenefits. In financial markets, such resources usually take the form of\\nmoney and thus the investment is the present commitment of money\\nin order to reap (hopefully more) money later [27]. 
The carriers of\\nmoney in financial markets are usually referred to as financial assets.\\nThere are various classes of financial assets, namely, equity securities\\n(e.g., common stocks), exchange-traded funds (ETFs), market indexes,\\ncommodities, exchanges rates, fixed-income securities, derivatives (e.g.,\\noptions and futures), etc. A detailed description of each kind of asset\\nis well documented, e.g., [27, 103]. For different kinds of assets, the key\\nquantitiesofinterestarenotthesame;forexample,forequitysecurities\\nthe quantities of interest are the compounded returns or log-returns;\\nfor fixed-income securities they are the changes in yield to maturity;\\nand for options they are changes in the rolling at-the-money forward\\nimplied volatility [143].\\nRoughly speaking, there are three families of investment philoso-\\nphies: fundamental analysis, technical analysis, and quantitative analy-\\nsis. Fundamental analysis uses financial and economical measures, such\\nas earnings, dividend yields, expectations of future interest rates, and\\nmanagement, to determine the value of each share of the company’s\\nstocks and then recommends purchasing the stocks if the estimated\\nvalue exceeds the current stock price [88, 89]. Warren Buffett of Berk-\\nshire Hathaway is probably the most famous practitioner of fundamen-\\ntal analysis [91]. Technical analysis, also known as “charting,” is essen-\\ntially the search for patterns in one dimensional charts of the prices of a\\nstock.Inaway,itpretendstobeascientificanalysisofpatterns(similar\\nto machine learning) but generally implemented in an unscientific and\\nanecdotal way with a low predictive power, as detailed in [132]. Quanti-\\ntative analysis applies quantitative (namely scientific or mathematical)\\ntools to discover the predictive patterns from financial data [128]. To\\nput this in perspective with the previous approach, technical analysis\\nis to quantitative analysis what astrology is to astronomy. The pioneer\\nof the quantitative investment approach is Edward O. Thorp, who used', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 10}),\n", + " Document(page_content='4 Introduction\\nhis knowledge of probability and statistics in the stock markets and has\\nmade a significant fortune since the late 1960s [193]. Quantitative anal-\\nysis has become more and more widely used since advanced computer\\nscience technology has enabled practitioners to apply complex quan-\\ntitative techniques to reap many more rewards more efficiently and\\nmore frequently in practice [4]. In fact, one could even go further to\\nsay that algorithmic trading has been one of the main driving forces in\\nthe technological advancement of computers. Some institutional hedge\\nfund firms that rely on quantitative analysis include Renaissance Tech-\\nnologies, AQR Capital, Winton Capital Management, and D. E. Shaw\\n& Co., to name a few.\\nIn this monograph, we will focus on the quantitative analysis of eq-\\nuity securities since they are the simplest and easiest accessible assets.\\nAs we will discover, many quantitative techniques employed in signal\\nprocessing methods may be applicable in quantitative investment. 
Nev-\\nertheless, the discussion in this monograph can be easily extended to\\nsome other tradeable assets such as commodities, ETFs, and futures.\\nThus, to explore the multiple connections between quantitative in-\\nvestment in financial engineering and areas in signal processing and\\ncommunications, we will show how to capitalize on existing mathemat-\\nical tools and methodologies that have been developed and are widely\\napplied in the context of signal processing applications to solve prob-\\nlems in the field of portfolio optimization and investment management\\nin quantitative finance. In particular, we will explore financial engineer-\\ning in several respects: i) we will provide the fundamentals of market\\ndata modeling and asset return predictability, as well as outline state-\\nof-the-art methodologies for the estimation and forecasting of portfolio\\ndesign parameters in realistic, non-frictionless financial markets; ii) we\\nwill present the problem of optimal portfolio construction, elaborate\\non advanced optimization issues, and make the connections between\\nportfolio optimization and filter/beamforming design in signal process-\\ning; iii) we will reveal the theoretical mechanisms underlying the design\\nand evaluation of statistical arbitrage trading strategies from a signal\\nprocessing perspective based on multivariate data analysis and time\\nseries modeling; and iv) we will discuss the optimal order execution', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 11}),\n", + " Document(page_content='1.1. A Signal Processing Perspective on Financial Engineering 5\\nand compare it with network scheduling in sensor networks and power\\nallocation in communication systems.\\nWe hope this monograph can provide more straightforward and sys-\\ntematic access to financial engineering for researchers in signal process-\\ning and communication societies1so that they can understand prob-\\nlems in financial engineering more easily and may even apply signal\\nprocessing techniques to handle financial problems.\\nIn the following content of this introduction, we first introduce fi-\\nnancial engineering from a signal processing perspective and then make\\nconnectionsbetweenproblemsarisinginfinancialengineeringandthose\\narising in different areas of signal processing and communication sys-\\ntems. 
At the end, the outline of the monograph is detailed.\\n1.1 A Signal Processing Perspective on Financial Engineer-\\ning\\nFigure 1.1 summarizes the procedure of quantitative investment.\\nRoughly speaking and oversimplifying, there are three main steps\\n(shown in Figure 1.1):\\n•financial modeling: modeling a very noisy financial time series to\\ndecompose it into trend and noise components;\\n•portfolio design: designing quantitative investment strategies\\nbased on the estimated financial models to optimize some pre-\\nferred criterion; and\\n•order execution: properly executing the orders to establish or un-\\nwind positions of the designed portfolio in an optimal way.\\nIn the following, we will further elaborate the above three steps from\\na signal processing perspective.\\n1There have been some initiatives in Signal Processing journals on the financial\\nengineering topic, namely, the 2011 IEEE Signal Processing Magazine - Special Issue\\non Signal Processing for Financial Applications, the 2012 IEEE Journal of Selected\\nTopics in Sginal Processing - Special Issue on Signal Processing Methods in Finance\\nand Electronic Trading, and the 2016 IEEE Journal of Selected Topics in Signal\\nProcessing - Special Issue on Financial Signal Processing and Machine Learning for\\nElectronic Trading.', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 12}),\n", + " Document(page_content='6 Introduction\\nFinancial Modeling\\nPortfolio Optimization\\n(Risk-Return Trade-Off)\\nOrder ExecutionStatistical Arbitrage\\n(Mean-Reversion)Investment Strategies\\nFigure 1.1: Block diagram of quantitative investment in financial engineering.\\n1.1.1 Financial Modeling\\nFor equity securities, the log-prices (i.e., the logarithm of the prices)\\nand the compounded returns or log-returns (i.e., the differences of the\\nlog-prices) are the quantities of interest. From a signal processing per-\\nspective, a log-price sequence can be decomposed into two parts: trend\\nandnoisecomponents,whicharealsoreferredtoasmarketandidiosyn-\\ncratic components, respectively. The purpose of financial modeling or\\nsignal modeling is to decompose the trend components from the noisy\\nfinancial series. Then based on the constructed financial models, one\\ncan properly design some quantitative investment strategies for future\\nbenefits [196, 129, 143].\\nFor instance, a simple and popular financial model of the log-price\\nseries is the following random walk with drift:\\nyt=µ+yt−1+wt, (1.1)\\nwhereytis the log-price at discrete-time t,{wt}is a zero-mean white\\nnoise series, and the constant term µrepresents the time trend of the', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 13}),\n", + " Document(page_content='1.1. A Signal Processing Perspective on Financial Engineering 7\\nJan−10 Jan−11 Jan−12 Jan−13 Jan−14 Jan−15−0.3−0.2−0.100.10.20.30.40.50.60.7Log−priceS&P 500\\n \\nLog−price\\nTrend\\nAccum. 
noise\\nFigure 1.2: The decomposition of the log-price sequence of the S&P 500 Index into\\ntimetrendcomponent,andthecomponentwithouttimetrend(i.e.,theaccumulative\\nnoise).\\nlog-priceytsince E[yt−yt−1] =µ, which is usually referred to as drift.\\nBased on model (1.1), we can see the trend signal and noise com-\\nponents in the log-prices more clearly by rewriting ytas follows:\\nyt=µt+y0+t\\uf8fa\\ni=1wi, (1.2)\\nwhere the term µtdenotes the trend (e.g., uptrend if µ>0, downtrend\\nifµ < 0, or no trend if µ= 0), and the term∑t\\ni=1widenotes the\\naccumulative noise as time evolves.\\nFigure 1.2 shows the weekly log-prices of the S&P 500 index from\\n04-Jan-2010 to 04-Feb-2015 (the log-prices are shifted down so that\\nthe initial log-price is zero, i.e., y0= 0), where the estimated drift is\\nµ= 0.0022. Obviously, we observe two patterns: first, there exists a\\nsignificant uptrend since 2010 in the US market (see the dashed red', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 14}),\n", + " Document(page_content='8 Introduction\\nlineµt); and second, the accumulative noise in the log-prices is not\\nsteady and looks like a random walk (see the solid gray line for the\\naccumulative noise∑t\\ni=1wi=yt−µt).\\n1.1.2 Quantitative Investment\\nOnce the specific financial model is calibrated from the financial time\\nseries, the next question is how to utilize such a calibrated financial\\nmodel to invest. As mentioned before, one widely employed approach\\nis to apply quantitative techniques to design the investment strategies,\\ni.e., the quantitative investment [65, 128, 64, 143].\\nFigure 1.2 shows that there are two main components in a finan-\\ncial series: trend and noise. Correspondingly, there are two main types\\nof quantitative investment strategies based on the two components: a\\ntrend-based approach, termed risk-return trade-off investment; and a\\nnoise-based approach, termed mean-reversion investment.\\nThe trend-based risk-return trade-off investment tends to maximize\\nthe expected portfolio return while keeping the risk low; however, this\\nis easier said than done because of the sensitivity to the imperfect\\nestimation of the drift component and the covariance matrix of the\\nnoise component of multiple assets. In practice, one needs to consider\\nthe parameter estimation errors in the problem formulation to design\\nthe portfolio in a robust way. Traditionally, the variance of the portfolio\\nreturn is taken as a measure of risk, and the method is thus referred\\nto as “mean-variance portfolio optimization” in the financial literature\\n[135, 137, 138]. From the signal processing perspective, interestingly,\\nthe design of a mean-variance portfolio is mathematically identical to\\nthe design of a filter in signal processing or the design of beamforming\\nin wireless multi-antenna communication systems [123, 149, 213].\\nThe noise-based mean-reversion investment aims at seeking prof-\\nitability based on the noise component. 
For clarity of presentation, let\\nus use a simple example of only two stocks to illustrate the rough idea.\\nSuppose the log-price sequences of the two stocks are cointegrated (i.e.,\\nthey share the same stochastic drift), at some point in time if one stock\\nmoves up while the other moves down, then people can short-sell the\\nfirst overperforming stock and long/buy the second underperforming', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 15}),\n", + " Document(page_content='1.2. Connections between Fin. Eng. and Signal Process. 9\\nstock2, betting that the deviation between the two stocks will eventu-\\nally diminish. This idea can be generalized from only two stocks to a\\nlarger number of stocks to create more profitable opportunities. This\\ntype of quantitative investment is often referred to as “pairs trading”,\\nor more generally, “statistical arbitrage” in the literature [160, 203].\\n1.1.3 Order Execution\\nIdeally, after one has made a prediction and designed a portfolio, the\\nexecutionshouldbeaseamlesspartoftheprocess.However,inpractice,\\nthe process of executing the orders affects the original predictions in\\nthe wrong way, i.e., the achieved prices of the executed orders will\\nbe worse than what they should have been. This detrimental effect\\nis called market impact. Since it has been shown that smaller orders\\nhave a much smaller market impact, a natural idea to execute a large\\norder is to partition it into many small pieces and then execute them\\nsequentially [8, 18, 78, 146].\\nInterestingly, the order execution problem is close to many other\\nscheduling and optimization problems in signal processing and com-\\nmunication systems. From a dynamic control point of view, the order\\nexecutionproblemisquitesimilartosensorschedulingindynamicwire-\\nlesssensornetworks[180,181,208].Fromanoptimizationpointofview,\\ndistributing a large order into many smaller sized orders over a certain\\ntime window [8, 79] corresponds to allocating total power over differ-\\nent communication channels in broadcasting networks [198] or wireless\\nsensor networks [214].\\n1.2 Connections between Financial Engineering and Areas in\\nSignal Processing and Communication Systems\\nWe have already briefly introduced the main components of financial\\nengineering from a signal processing perspective. In the following we\\nmake several specific connections between financial engineering and\\nareas in signal processing and communication systems.\\n2In financial engineering, to “long” means simply to buy financial instruments,\\nto “short-sell” (or simply, to “short”) means to sell financial instruments that are\\nnot currently owned.', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 16}),\n", + " Document(page_content='10 Introduction\\nModeling. Oneofthemostpopularmodelsusedinfinancialengineer-\\ning is the autoregressive moving average (ARMA) model. It models the\\ncurrent observation (e.g., today’s return) as the weighted summation\\nof a linear combination of previous observations (e.g., several previous\\ndays’ returns) and a moving average of the current and several previ-\\nous noise components [196]. 
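To make the ARMA / pole-zero correspondence concrete, here is a small numpy sketch with made-up AR and MA coefficients (not fitted to any data): the roots of the denominator and numerator polynomials of the z-transform are the poles and zeros the text refers to.

# Minimal sketch linking an ARMA model to its rational (pole-zero) form.
# Illustrative coefficients; a real model would be fitted from return data.
import numpy as np

phi = [0.6, -0.2]      # AR part:  r_t = 0.6 r_{t-1} - 0.2 r_{t-2} + ...
theta = [0.4]          # MA part:  ... + w_t + 0.4 w_{t-1}

# Transfer function H(z) = Theta(z^-1) / Phi(z^-1), with
# Phi(z^-1) = 1 - phi_1 z^-1 - phi_2 z^-2 and Theta(z^-1) = 1 + theta_1 z^-1.
den = np.r_[1.0, -np.asarray(phi)]     # AR polynomial: its roots are the poles
num = np.r_[1.0, np.asarray(theta)]    # MA polynomial: its roots are the zeros

poles, zeros = np.roots(den), np.roots(num)
print("poles:", poles)                 # inside the unit circle -> stationary AR part
print("zeros:", zeros)                 # inside the unit circle -> invertible MA part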
Actually, this model is also widely used\\nin signal processing and it is referred to as a rational model because\\nitsz-transform is a rational function, or as a pole-zero model because\\nthe roots of the numerator polynomial of the z-transform are known as\\nzeros and the roots of the denominator polynomial of the z-transform\\nare known as poles [133].\\nRobust Covariance Matrix Estimation. After a specific model has\\nbeen selected, the next step is to estimate or calibrate its parameters\\nfrom the empirical data. In general, a critical parameter to be esti-\\nmatedisthecovariancematrixofthereturnsofmultiplestocks.Usually\\nthe empirical data contains noise and some robust estimation methods\\nare needed in practice. One popular idea in financial engineering is\\nto shrink the sample covariance matrix to the identity matrix as the\\nrobust covariance matrix estimator [120]. Interestingly, this is mathe-\\nmaticallythesameasthediagonalloadingmatrix(i.e.,theadditionofa\\nscaled identity matrix to the sample interference-plus-noise covariance\\nmatrix) derived more than thirty years ago for robust adaptive beam-\\nformingin signalprocessing andcommunication systems[1, 38,45]. For\\nlarge-dimensional data, the asymptotic performance of the covariance\\nmatrix estimators is important. The mathematical tool for the asymp-\\ntotic analysis is referred to as general asymptotics or large-dimensional\\ngeneral asymptotics in financial engineering [121, 122], or as random\\nmatrix theory (RMT) in information theory and communications [199].\\nPortfolioOptimizationvsFilter/BeamformingDesign. Onepopular\\nportfolio optimization problem is the minimum variance problem:\\nminimizewwTΣw\\nsubject to wT1= 1,(1.3)', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 17}),\n", + " Document(page_content='1.2. Connections between Fin. Eng. and Signal Process. 11\\nwhere w∈RNis the portfolio vector variable representing the nor-\\nmalized dollars invested in Nstocks, wT1= 1is the capital budget\\nconstraint, and Σ∈RN×Nis the (estimated in advance) positive defi-\\nnite covariance matrix of the stock returns.\\nThe above problem (1.3) is really mathematically identical to the\\nfilter/beamforming design problem in signal processing [149]:\\nminimizewwHRw\\nsubject to wHa= 1,(1.4)\\nwhere w∈CNis the complex beamforming vector variable denoting\\nthe weights of Narray observations and a∈CNandR∈CN×N(es-\\ntimated in advance) are the signal steering vector (also known as the\\ntransmission channel) and the positive definite interference-plus-noise\\ncovariance matrix, respectively. The similarity between problems (1.3)\\nand(1.4)showssomepotentialconnectionsbetweenportfoliooptimiza-\\ntion and filter/beamforming design, and we will explore more related\\nformulations in detail later in the monograph.\\nIndex Tracking vs Sparse Signal Recovery. Index tracing is a widely\\nused quantitative investment that aims at mimicking the market index\\nbut with much fewer stocks. 
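Problems (1.3) and (1.4) share the same closed-form solution, which a short numpy sketch can verify; the covariance matrices and steering vector below are arbitrary illustrative inputs, not market or array data.

# Minimal sketch showing that the minimum-variance portfolio (1.3) and the
# distortionless beamformer (1.4) have the same closed-form solution.
import numpy as np

rng = np.random.default_rng(1)
N = 5

# Portfolio side: positive definite return covariance Sigma, budget constraint 1^T w = 1.
A = rng.normal(size=(N, N))
Sigma = A @ A.T + N * np.eye(N)
ones = np.ones(N)
w_portfolio = np.linalg.solve(Sigma, ones)
w_portfolio /= ones @ w_portfolio            # w = Sigma^{-1} 1 / (1^T Sigma^{-1} 1)

# Beamforming side: interference-plus-noise covariance R, steering vector a, a^H w = 1.
B = rng.normal(size=(N, N)) + 1j * rng.normal(size=(N, N))
R = B @ B.conj().T + N * np.eye(N)
a = np.exp(1j * np.pi * 0.3 * np.arange(N))  # illustrative steering vector
w_beam = np.linalg.solve(R, a)
w_beam /= a.conj() @ w_beam                  # w = R^{-1} a / (a^H R^{-1} a)

print("budget constraint 1^T w        =", ones @ w_portfolio)
print("distortionless constraint a^H w =", a.conj() @ w_beam)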
That is, suppose that a benchmark index\\nis composed of Nstocks and let rb= [rb\\n1,...,rb\\nT]T∈RTandX=\\n[r1,...,rT]T∈RT×Ndenote the returns of the benchmark index and\\ntheNstocks in the past Tdays, respectively, index tracking intends\\nto find a sparse portfolio wto minimize the tracking error between the\\ntracking portfolio and benchmark index [106]:\\nminimizew1\\nT∥Xw−rb∥2\\n2+λ∥w∥0\\nsubject to 1Tw= 1,w≥0,(1.5)\\nwhereλ≥0is a predefined trade-off parameter.\\nMathematically speaking, the above problem (1.5) is identical to\\nthe sparse signal recovery problem [37] and compressive sensing [51] in\\nsignal processing:\\nminimizew1\\nT∥Φw−y∥2\\n2+λ∥w∥0 (1.6)', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 18}),\n", + " Document(page_content='12 Introduction\\nTable 1.1: Connections between financial engineering and signal processing.\\nFinancial Engineer-\\ningSignal Processing\\nModeling ARMA model [196]rational or pole-zero\\nmodel [133]\\nCovariance\\nMatrix\\nEstimationshrinkage sample co-\\nvariance matrix estima-\\ntor [120]diagonal loading in\\nbeamforming [1, 38, 45]\\nAsymptotic\\nAnalysis(large-dimensional)\\ngeneral asymptotics\\n[121, 122]random matrix theory\\n[199]\\nOptimizationportfolio optimization\\n[135, 137, 179, 213]filter/beamforming de-\\nsign [149, 213]\\nSparsity index tracking [106]sparse signal recovery\\n[37, 51]\\nwhereλ≥0is a predefined trade-off parameter, Φ∈RT×Nis a dic-\\ntionary matrix with T≪N,y∈RTis a measurement vector, and\\nw∈RNis a sparse signal to be recovered. Again, the similarity be-\\ntween the two problems (1.5) and (1.6) shows that the quantitative\\ntechniques dealing with sparsity may be useful for both index tracking\\nand sparse signal recovery.\\nTable 1.1 summarizes the above comparisons in a more compact\\nway and it is interesting to see so many similarities and connections\\nbetween financial engineering and signal processing.\\n1.3 Outline\\nThe abbreviations and notations used throughout the monograph are\\nprovided on pages 211 and 213, respectively.\\nFigure 1.3 shows the outline of the monograph and provides the\\nrecommended reading order for the reader’s convenience. The detailed\\norganization is as follows.', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 19}),\n", + " Document(page_content='1.3. Outline 13\\nPart I mainly focuses on financial modeling (Chapters 2 and 3) and\\norder execution (Chapter 4).\\nChapter 2 starts with some basic financial concepts and then in-\\ntroduces several models, such as the i.i.d. model, factor model, ARMA\\nmodel, autoregressive conditional heteroskedasticity (ARCH) model,\\ngeneralized ARCH (GARCH) model, and vector error correction model\\n(VECM), which will be used in the later chapters. Thus, this chapter\\nprovides a foundation for the following chapters in the monograph.\\nChapter 3 deals with the model parameter estimation issues. In\\nparticular, it focuses on the estimation of the mean vector and the co-\\nvariance matrix of the returns of multiple stocks. Usually, these two\\nparameters are not easy to estimate in practice, especially under two\\nscenarios: when the number of samples is small, and when there exists\\noutliers. 
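A minimal sketch of the shrinkage-toward-identity covariance estimator that the text equates with diagonal loading: the returns are simulated and the shrinkage weight rho is fixed by hand for illustration (in practice it would be chosen by a data-driven rule such as Ledoit-Wolf).

# Shrink a noisy sample covariance toward a scaled identity (diagonal loading).
import numpy as np

rng = np.random.default_rng(2)
N, T = 50, 60                                  # many assets, few samples (small-sample regime)
true_cov = np.diag(rng.uniform(0.5, 2.0, N))
returns = rng.multivariate_normal(np.zeros(N), true_cov, size=T)

S = np.cov(returns, rowvar=False)              # sample covariance, ill-conditioned when T ~ N
rho = 0.3                                      # illustrative shrinkage weight in [0, 1]
target = (np.trace(S) / N) * np.eye(N)         # scaled identity target
Sigma_shrunk = (1.0 - rho) * S + rho * target  # shrinkage estimator = loaded diagonal

print("condition number, sample :", np.linalg.cond(S))
print("condition number, shrunk :", np.linalg.cond(Sigma_shrunk))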
This chapter reviews the start-of-the-art robust estimation of\\nthe mean vector and the covariance matrix from both financial engi-\\nneering and signal processing.\\nChapter 4 formulates the order execution as optimization problems\\nand presents the efficient solving approaches.\\nOnce financial modeling and order execution have been introduced\\nin Part I, we move to the design of quantitative investment strate-\\ngies. As shown in Figure 1.1 there are two main types of investment\\nstrategies,namelyrisk-returntrade-offinvestmentstrategiesandmean-\\nreversion investment strategies, which are documented in Parts II and\\nIII, respectively.\\nPart II entitled “Portfolio Optimization” focuses on the risk-return\\ntrade-off investment. It contains Chapters 5-9 and is organized as fol-\\nlows.\\nChapter 5 reviews the most basic Markowitz mean-variance portfo-\\nlio framework, that is, the objective is to optimize a trade-off between\\nthe mean and the variance of the portfolio return. However, this frame-\\nwork is not practical due to two reasons: first, the optimized strategy\\nis extremely sensitive to the estimated mean vector and covariance\\nmatrix of the stock returns; and second, the variance is not an ap-\\npropriate risk measurement in financial engineering. To overcome the\\nsecond drawback, some more practical single side risk measurements,', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 20}),\n", + " Document(page_content='14 Introduction\\ne.g.,Value-at-Risk(VaR)andConditionalVaR(CVaR),areintroduced\\nas the alternatives to the variance.\\nChapter 6 presents the robust portfolio optimization to deal with\\nparameter estimation errors. The idea is to employ different uncer-\\ntainty sets to characterize different estimation errors and then derive\\nthe corresponding worst-case robust formulations.\\nChapter 7, different from previous Chapters 5 and 6 that consider\\neach portfolio individually, designs multiple portfolios corresponding to\\ndifferent clients jointly via a game theoretic approach by modeling a\\nfinancial market as a game and each portfolio as a player in the game.\\nThis approach is important in practice because multiple investment\\ndecisions may affect each other.\\nChapter 8 considers a passive quantitative investment method\\nnamed index tracking. It aims at designing a portfolio that mimics a\\npreferred benchmark index as closely as possible but with much fewer\\ninstruments.\\nChapter 9 considers a newly developed approach to the portfolio\\ndesign aiming at diversifying the risk, instead of diversifying the capital\\nas usually done, among the available assets, which is called a “risk\\nparity portfolio” in the literature.\\nPart III, containing Chapter 10, explores the mean-reversion in-\\nvestment that utilizes the noise component in the log-price sequences\\nof multiple assets.\\nChapter 10 introduces the idea of constructing a pair of two stocks\\nvia cointegration and optimizes the threshold for trading to achieve a\\npreferred criterion. Then it extends further from pairs trading based\\non only two stocks to statistical arbitrage for multiple stocks.\\nAfter covering the main content of the three parts, Chapter 11\\nconcludes the monograph.', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 21}),\n", + " Document(page_content='1.3. 
Outline 15\\nChapter 1:\\n“Introduction ”\\nChapter 2:\\n“Basic Models ”Chapter 3:\\n“Mean/Cov \\nEstimaton ”\\nChapter 4:\\n“Order \\nExecution ”\\nChapter 5:\\n“MV Portfolio ”\\nChapter 9:\\n“Risk Parity \\nPortfolio”Chapter 7:\\n“Multiple \\nPortfolio”\\nChapter 8:\\n“Index \\nTracking ”Chapter 10:\\n“Statistical \\nArbitrage ”\\nChapter 11:\\n“Conclusion ”Part I:\\nFinancial \\nModeling\\nPart II:\\nPortfolio\\nOptimization\\nRecommended reading orderPart III:\\nStatistical\\nArbitrageChapter 6:\\n“Robust Portfolio \\nOptimization ”\\nFigure 1.3: Outline of the monograph.', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 22}),\n", + " Document(page_content='Part I\\nFinancial Modeling &\\nOrder Execution', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 23}),\n", + " Document(page_content='2\\nModeling of Financial Time Series\\nModeling of financial time series provides the quantitative tools to ex-\\ntract useful (or predictable) information for future investments. There\\nare two main philosophies of modeling like then are in signal process-\\ning and control theory [98]: continuous-time and discrete-time systems.\\nContinuous-time modeling, using the Black-Scholes model, for exam-\\nple, involves stochastic calculus and concepts like the Brownian motion\\nthat are at the core of many fundamental results. For computational\\npurposes, however, discrete-time modeling is more convenient. In addi-\\ntion, practical investment strategies are usually naturally discretized,\\ni.e., daily or monthly investments.\\nTherefore, this chapter focuses on discrete-time modeling of finan-\\ncial time series, i.e., the interested time series quantities (mainly the\\nlog-returns) of some interested assets (say Nassets) given the past\\ninformation (i.e., the past log-returns of the Nassets).\\nThe detailed organization is as follows. Section 2.1 starts with some\\nbasic financial concepts, i.e., prices and returns. Then Section 2.2 in-\\ntroduces the general structure of modeling and Sections 2.3-2.7 explain\\nseveralspecificmodels,suchasthei.i.d.model,factormodel,vectorau-\\ntoregressive moving average (VARMA) model, vector error correction\\n17', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 24}),\n", + " Document(page_content='18 Modeling of Financial Time Series\\nmodel (VECM), autoregressive conditional heteroskedasticity (ARCH)\\nmodel, generalized ARCH (GARCH) model, and multivariate ARCH\\nand GARCH models, which will be used in the later chapters. At the\\nend, Section 2.8 summarizes all the models briefly.\\nThis chapter focuses on the models themselves but leaves the fitting\\nof the models with real data or parameter estimation to Chapter 3. All\\nthe models are introduced in their vector/multivariate cases.\\n2.1 Asset Returns\\nFor simplicity, let us focus on a single asset. Let ptbe the price of an\\nasset at (discrete) time index t.\\n2.1.1 Returns Based on Prices\\nSuppose the asset pays no dividends1, the simple return (a.k.a. linear\\nreturn or net return) over one interval from time t−1totis\\nRt≜pt−pt−1\\npt−1=pt\\npt−1−1. (2.1)\\nThe numerator pt−pt−1is the profit (or the loss in case of a negative\\nprofit)duringtheholdingperiodandthedenominator pt−1istheinitial\\ninvestment at time t−1. 
Thus the simple return can be regarded as\\nthe profit rate.\\nThen the quantity\\nRt+ 1 =pt\\npt−1(2.2)\\ndenotes the ratio between the end capital and the initial investment,\\nthus it is referred to as total return or gross return.\\nBased on the above definitions for only one investment period, the\\ngross return on the most recent kperiods is the product of the past k\\nsingle period gross returns\\n1 +Rt(k) =pt\\npt−k=pt\\npt−1×pt−1\\npt−2×···×pt−k+1\\npt−k\\n= (1 +Rt)×···× (1 +Rt−k+1),(2.3)\\n1If there exists dividend dtat timet, then the simple return in (2.1) can be\\nadjusted as Rt=pt−pt−1+dt\\npt−1.', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 25}),\n", + " Document(page_content='2.1. Asset Returns 19\\nand the corresponding net return is\\nRt(k) =pt\\npt−k−1. (2.4)\\n2.1.2 Returns Based on Log-prices\\nThe log-return (a.k.a. continuously compounded return) at time tis\\ndefined as follows:\\nrt≜log(1 +Rt) = logpt\\npt−1=yt−yt−1, (2.5)\\nwhereyt≜logptis the log-price and log denotes the natural logarithm.\\nSince the function f(x) = log(1 + x)has the first order Taylor\\napproximation f(x) = log(1 + x)≈xat point 0, we can see rt=\\nlog(1 +Rt)is approximately equal to the net return Rtin (2.1), i.e.,\\nrt≈Rt, especially when Rtis small around zero (which is the case for\\nthe usual intervals).\\nThe log-return on the most recent kperiods is\\nrt(k)≜log(1 +Rt(k)) = log[(1 + Rt)×···× (1 +Rt−k+1)]\\n= log(1 +Rt) + log(1 + Rt−1) +···+ log(1 +Rt−k+1)\\n=rt+rt−1+···+rt−k+1,(2.6)\\nwhich has a nice additive property over periods (recall that the linear\\nmulti-period net return Rt(k)in (2.4) does not have such a property).\\n2.1.3 Portfolio Returns\\nFor a portfolio composing of Nassets, let w∈RNbe a vector with wi\\ndenoting normalized capital invested into the i-th asset. Then the net\\nreturn of the portfolio over a single period tisRp\\nt=∑N\\ni=1wiRitwhere\\nRitis the net return of the i-th asset.\\nThe log-return of a portfolio, however, does not have the above ad-\\nditivity property. If the simple returns Ritare all small in magnitude,\\ntheycanbeapproximatedbythelog-returns ritandtheportfolionetre-\\nturn can be approximated as Rp\\nt=∑N\\ni=1wiRit≈∑N\\ni=1wirit. However,\\nwhen some Ritare significantly different from zero, using∑N\\ni=1wiritto\\napproximate∑N\\ni=1wiRitmay introduce some serious errors [144].', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 26}),\n", + " Document(page_content='20 Modeling of Financial Time Series\\nModels for Financial \\nTime SeriesAdditivity Over Periods Log-Returns\\nPortfolio OptimizationAdditivity Over Assets Simple Returns\\nFigure 2.1: Simple returns versus log-returns.\\n2.1.4 Comparisons: Simple Returns versus Log-returns\\nFigure 2.1 provides a summary of the comparisons between simple re-\\nturns and log-returns.\\nFirst, the simple returns have the advantage of additivity over as-\\nsets. Because of that, it is the simple returns that will be used in port-\\nfolio optimization later in Part II.\\nSecond, the log-returns have the advantage of additivity over assets\\nperiods. This makes the distribution of the log-returns in the future\\neasier to compute and predict.\\nThird, the statistical properties of the log-returns are relatively\\nmore tractable. 
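The return definitions in (2.1)-(2.6) translate directly into a few lines of numpy; the price series below is made up purely for illustration.

# Simple vs. log-returns on a toy price series, following (2.1)-(2.6).
import numpy as np

p = np.array([100.0, 101.5, 100.8, 103.2, 104.0])   # illustrative prices p_t

R = p[1:] / p[:-1] - 1.0          # simple (net) returns, eq. (2.1)
r = np.diff(np.log(p))            # log-returns r_t = y_t - y_{t-1}, eq. (2.5)

# Multi-period log-return is additive over periods, eq. (2.6):
print(np.isclose(np.log(p[-1] / p[0]), r.sum()))            # True

# Multi-period simple return compounds instead, eq. (2.3)-(2.4):
print(np.isclose(p[-1] / p[0] - 1.0, np.prod(1 + R) - 1))   # True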
For example, from (2.1) we can see that simple returns\\nare highly asymmetric because they are bounded below by −1and un-\\nbounded above. Instead, the log-returns are relatively more symmetric\\nand this makes the corresponding distributions easier to model.\\nIt is the additivity over periods and statistical simplicity that are\\nneeded for modeling purposes and thus we focus on the log-returns in\\nthis chapter. However, as shown in Figure 2.1, either simple returns or\\nlog-returns should be used depending on the investor’s specific goal.', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 27}),\n", + " Document(page_content='2.2. General Structure of a Model 21\\n2.2 General Structure of a Model\\nMost of the existing financial time series models aim at modeling the\\nlog-returns of Nassets jointly denoted by rt∈RN. In particular, they\\nmodel the log-returns at time tbased on the previous historical data\\ndenoted byFt−1. However, modeling an N-dimensional random vari-\\nable may be a daunting task not just because of the estimation aspect\\nbut also the storage issue. For this reason, most models simplify the\\ntask by modeling only the mean and covariance matrix.\\nConditional onFt−1, we can decompose rt∈RNas follows:\\nrt=µt+wt, (2.7)\\nwhereµtis the conditional mean\\nµt=E[rt|Ft−1] (2.8)\\nandwtis a white noise with zero mean and conditional covariance\\nΣt=E[(rt−µt)(rt−µt)T|Ft−1]. (2.9)\\nHere,µtandΣt(or equivalently Σ1/2\\nt) are the two main components\\nto be modeled, and they are usually referred to as conditional mean\\nand conditional covariance matrix (or more often conditional volatility\\nforΣ1/2\\nt), respectively, in the literature.\\nIn the literature, the underlying distribution wtis always assumed\\nto be Gaussian (or sometimes a more general elliptical distribution) for\\nmathematical simplicity even though reality does not fit the thin tails\\nof the Gaussian distribution [143].\\nIn the following, we first provide general models for both µtand\\nΣtand then explore several different types of specific models. Sections\\n2.3 and 2.4 model both conditional mean and covariance as constants,\\nSections 2.5 and 2.6 explore various models of the conditional mean\\nbut leave the conditional covariance matrix as a constant, and Section\\n2.7 focuses on modeling the conditional covariance matrix only. 
All the\\nspecific models can be regarded as special cases of the general models,\\nand we summarize them in Section 2.8.', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 28}),\n", + " Document(page_content='22 Modeling of Financial Time Series\\n2.2.1 General Model for Conditional Mean µt\\nFor most log-return series, the following model is enough to model the\\nconditional mean µt:\\nµt=φ0+Πxt+p\\uf8fa\\ni=1Φirt−i−q\\uf8fa\\nj=1Θjwt−j, (2.10)\\nwhereφ0∈RNdenotes a constant vector, xt∈RKdenotes a vector\\nof exogenous variables, Π∈RN×Kis a loading matrix, pandqare\\nnonnegative integers, Φi,Θj∈RN×Nare matrix parameters, and rt−i\\nandwt−jare past log-returns and temporally white noise.\\n2.2.2 General Model for Conditional Covariance Matrix Σt\\nFor a multivariate case, there exist many different models of the con-\\nditional covariance matrix Σt, and, in general, there does not exist a\\ngeneral model formulation that captures all the existing ones as special\\ncases, e.g., see [16, 182, 196, 129]. Nevertheless, for the consistency of\\npresentation, let us introduce the following model [62]:\\nΣt=A0AT\\n0+m\\uf8fa\\ni=1Ai(wt−iwT\\nt−i)AT\\ni+s\\uf8fa\\nj=1BjΣt−jBT\\nj,(2.11)\\nwheremandsare nonnegative integers and Ai,Bj∈RN×Nare pa-\\nrameters. This model ensures a positive definite matrix provided that\\nA0AT\\n0is positive definite. The above model is referred to as the Baba-\\nEngle-Kraft-Kroner (BEKK) model in the literature.\\nIn practice, most models simply assume a constant covariance ma-\\ntrixΣt=Σw, i.e., a special case of (2.11) with m= 0ands= 0.\\n2.3 I.I.D. Model\\nPerhaps the simplest model for rtis that it follows an i.i.d. distribution\\nwith fixed mean and covariance matrix, i.e.,\\nrt=µ+wt, (2.12)\\nwhere wt∈RNis a white noise series with zero mean and constant\\ncovariance matrix Σw.', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 29}),\n", + " Document(page_content='2.4. Factor Model 23\\nComparing the i.i.d. model (2.12) with the general model (2.7)-\\n(2.11), obviously we can see it is the simplest special case with µ=φ0,\\nΠ=0,p= 0,q= 0,Σw=A0AT\\n0,m= 0, ands= 0. And the\\nconditional mean and covariance matrix are both constant:\\nµt=µ, (2.13)\\nΣt=Σw. (2.14)\\nThis i.i.d. model assumption may look simple, however, it is one\\nof the most fundamental assumptions for many important works. One\\nexample is the Nobel prize-winning Markowitz portfolio theory [135,\\n136, 137, 138, 179] that will be covered in Chapter 5.\\n2.4 Factor Model\\nIf we look at (2.12) carefully, we may think that the dimension of the\\nmarketalwaysequalsthenumberofassets N.However,thismaynotbe\\ntrue in practice. In general, the market is composed of a large number\\nofassets(i.e., Nislarge),butitisusuallyobservedthatitsdimensionis\\nrelatively small, that is, the market is only driven by a limited number\\nof factors, say Kfactors with K≪N.\\nThe general factor model is\\nrt=φ0+h(ft) +wt, (2.15)\\nwhereφ0denotes a constant vector; ft∈RKwithK≪Nis a vector\\nof a few factors that are responsible for most of the randomness in\\nthe market, the vector function h:RK↦→RNdenotes how the low\\ndimensional factors affect the higher dimensional market; and a resid-\\nual vector wtof (possibly independent) perturbations that has only a\\nmarginal effect. 
In general, the function his assumed to be linear.\\nThis approach of modeling enjoys a wide popularity; refer to [42,\\n66, 67, 68, 69, 70, 118] for some typical references.\\nIn the following, we consider two specific models of (2.15) with\\neither explicit or hidden factors.', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 30}),\n", + " Document(page_content='24 Modeling of Financial Time Series\\n2.4.1 Explicit Factors\\nThe explicit factor model is\\nrt=φ0+Πft+wt, (2.16)\\nwhich is a specific case of (2.15) with h(ft) =Πft,ft∈RKbeing\\nexplicitly observable market variables, and Π∈RN×Kbeing the factor\\nloading matrix.\\nSome popular explicit factors include returns on the market port-\\nfolio2, growth rate of the GDP, interest rate on short term Treasury\\nbills, inflation rate, unemployment, etc. [171].\\nObviously, the factor model with explicit factors (2.16) is a special\\ncase of the general model (2.7)-(2.11) with exogenous input being the\\nfactors xt=ft,p= 0, andq= 0.\\nIn general, it is assumed that ftfollows an i.i.d. distribution with\\nconstant mean µfand constant covariance matrix Σf,wtfollows an\\ni.i.d. distribution with zero mean and (possibly diagonal) constant co-\\nvariance matrix Σw, and ftandwtare uncorrelated. Then the con-\\nditional mean and covariance matrix are both constant and can be\\ncomputed as follows:\\nµt=E[rt|Ft−1] =E[rt] =φ0+Πµf (2.17)\\nΣt=E[(rt−µt)(rt−µt)T|Ft−1],\\n=ΠΣfΠT+Σw. (2.18)\\nCapital Asset Pricing Model (CAPM)\\nOne of the most popular factor models is the CAPM with the returns\\non the market portfolio being the only factor [70]. The i-th stock return\\nat timetis\\nri,t−rf=βi(rM,t−rf) +wi,t, (2.19)\\nwhererfis the risk-free rate, rM,tis the return of the market portfolio,\\nandwi,tis a stock-specific white noise with zero mean and constant\\nvariance.\\n2The market portfolio is a portfolio consisting of all equities with the normalized\\nportfolio weights being proportional to the market values of the equities.', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 31}),\n", + " Document(page_content='2.4. Factor Model 25\\nTaking the expectation on both sides of (2.19) results in the so-\\ncalled CAPM:\\nE[ri,t]−rf=βi(E[rM,t]−rf). 
(2.20)\\nBased on (2.20)\\n•E[rM,t]−rfmeasures the difference between the expected market\\nreturn and risk-free rate, which is known as the market premium;\\n•E[ri,t]−rfmeasures the difference between the expected stock\\nreturn and risk-free rate, which is known as the risk premium;\\nand\\n•βiin general is given by\\nβi=Cov(ri,t,rM,t)\\nVar(rM,t)(2.21)\\nwhich measures how sensitive the risk premium is to the market\\npremium, that is, the risk premium equals the market premium\\ntimesβi.\\nNote that the conditional mean E[ri,t|Ft−1]is the same as the un-\\nconditional mean E[ri,t] =rf+βi(E[rM,t]−rf).\\nTaking the variance on both sides of (2.19) gives us the following\\nrelationship:\\nVar[ri,t] =β2\\niVar[rM,t] +Var[wi,t], (2.22)\\nwhich is decomposed into two parts:\\n•β2\\niVar[rM,t]measures the risk associated with the market and it\\nis referred to as systematic risk, and\\n•Var[wi,t]is specific to each stock and it is called nonsystematic\\nrisk.\\nAlso, the conditional variance Var[ri,t|Ft−1]equals the unconditional\\nvariance Var[ri,t].', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 32}),\n", + " Document(page_content='26 Modeling of Financial Time Series\\n2.4.2 Hidden Factors\\nThe assumption of a linear model of (2.15) with hidden factors is that\\nthe factors are not explicit market variables but are functions of rtthat\\nsummarize as much information as possible.\\nOne method is to define the hidden factors as affine transformations\\nofrtas follows:\\nft=d+ΥTrt, (2.23)\\nwhere d∈RKandΥ∈RN×Kare parameters to be estimated.\\nThen the hidden factor model can be expressed as follows:\\nrt=φ0+Π(d+ΥTrt) +wt, (2.24)\\nwhich is a specific case of (2.15) with h(ft) =Πft,ft∈RKbeing\\nthe hidden variables defined in (2.23); Π∈RN×Kbeing the factor\\nloading matrix; and wtfollows an i.i.d. distribution with zero mean\\nand a (possibly diagonal) constant covariance matrix Σw.\\nThe model (2.24) can be further simplified as follows:\\nrt=m+ΠΥTrt+wt, (2.25)\\nwhere m=φ0+Πdis an newly defined parameter.\\nThe parameters m,Π, and Υcan be estimated by the following\\nnonlinear least-square (LS) regression:\\nminimize\\nm,Π,ΥE\\ued79\\ued79\\ued79rt−m−ΠΥTrt\\ued79\\ued79\\ued792\\n2. (2.26)\\nRecall that Π,Υ∈RN×K, then ΠΥT∈RN×Nwith rank(ΠΥT)≤\\nK≪N, then intuitively problem (2.26) is projecting rtonto a lower\\nK-dimensional subspace with variations being captured as much as\\npossible. Indeed, this technique is usually referred to as principal com-\\nponent analysis (PCA) [109] in the literature, the optimal solution of\\nwhich can be stated in closed-form as follows [143]:\\nΠ=Υ=EK, (2.27)\\nm=(\\nI−EKET\\nK)\\nE[rt], (2.28)\\nwhere EK∈RN×Kwith thek-th column vector being the k-th largest\\neigenvector of the covariance matrix Cov[rt],k= 1,...,K., and it can\\nbe shown that the white noise wtis uncorrelated of the hidden factors.', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 33}),\n", + " Document(page_content='2.5. 
VARMA Model 27\\nThen combining (2.25), (2.27) and (2.28) together, we can find the\\nconditional mean and covariance matrix as follows:\\nµt=E[rt|Ft−1] =E[rt], (2.29)\\nΣt=E[(rt−µt)(rt−µt)T|Ft−1]\\n=EKΛKET\\nK+Σw, (2.30)\\nwhere ΛK= Diag([λ1,...,λK])is aK-by-Kdiagonal matrix with λk\\nbeing thek-th largest eigenvalue of Cov[rt], and we can see both the\\nconditional mean and covariance matrix are constant and independent\\nof time.\\n2.4.3 Comparisons: Explicit Factors versus Hidden Factors\\nBased on (2.17)-(2.18) or (2.29)-(2.30), we can see that the factor mod-\\nels,i.e.,(2.16)and(2.25),decomposetheconditionalcovariance Σtinto\\ntwo parts: low dimensional factors and marginal noise. The key is the\\nway to choose or construct the factors, and the comparisons between\\nthe explicit and hidden factor models are as follows:\\n•The explicit factor model tends to explain the log-returns with a\\nsmaller number of fundamental or macroeconomic variables and\\nthus it is easier to interpret. However, in general there is no sys-\\ntematic method to choose the right factors.\\n•The hidden factor model employs PCA to explore the structure\\nof the covariance matrix and locate a low-dimensional subspace\\nthat captures most of the variation in the log-returns. It is a more\\nsystematical approach and thus it may provide a better explana-\\ntory power. One drawback of the hidden factors compared with\\nthe explicit factors is that they do not have explicit econometric\\ninterpretations.\\n2.5 VARMA Model\\nThe previous i.i.d. and factor models, while commonly employed, do\\nnot incorporate any time-dependency in the model for rt. In other\\nwords, the conditional mean and covariance matrix are constant and', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 34}),\n", + " Document(page_content='28 Modeling of Financial Time Series\\npast information is not explicitly used (it can still be used implicitly\\nvia the estimation of the parameters).\\nThe VARMA model can incorporate the past information into the\\nmodel of conditional mean, although still not in the conditional covari-\\nance matrix.\\nStationarity is an important characteristic for time series analysis\\nwhich describes the time-invariant behavior of a time series. A mul-\\ntivariate time series rtis said to be weakly stationary if its first and\\nsecond moments are time-invariant. In general, a stationary time series\\nis much easier to model, estimate, and analyze.\\n2.5.1 VAR( 1) Model\\nLet us start with the vector autoregressive (VAR) model of order 1,\\ndenoted as VAR( 1), as follows:\\nrt=φ0+Φ1rt−1+wt, (2.31)\\nwhereφ0∈RNisaconstantvector, Φ1∈RN×Nisamatrixparameter,\\nandwtdenotes a serially uncorrelated noise series with zero mean\\nand constant covariance matrix Σw. We can see that the term Φ1rt−1\\nmodels the serial correlation of the time series rt.\\nAlso, compared with the general model (2.7)-(2.11), the VAR( 1)\\nmodel (2.31) is a special case with Π=0,p= 1,q=m=s= 0, and\\nΣt=Σw, and it is straightforward to obtain the conditional mean and\\ncovariance matrix based on (2.31) as follows:\\nµt=φ0+Φ1rt−1, (2.32)\\nΣt=Σw. 
(2.33)\\nObviously, the conditional covariance matrix Σtis constant.\\n2.5.2 VAR( p) Model\\nThep-th order autoregressive process, denoted as VAR( p), extends the\\nVAR( 1) model by including more previous observations into the model\\nas follows:\\nrt=φ0+p\\uf8fa\\ni=1Φirt−i+wt, (2.34)', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 35}),\n", + " Document(page_content='2.5. VARMA Model 29\\nwherepis a nonnegative integer, φ0∈RNis a constant vector, Φi∈\\nRN×Nare matrix parameters, and wtdenotes a serially uncorrelated\\nwhite noise series with zero mean and constant covariance matrix Σw.\\nClearly we can see that the time series rtis serially correlated via\\nthe term∑p\\ni=1Φirt−iwhich contains more previous observations than\\nthe AR( 1) model (2.31). Similar to (2.32) and (2.33), the conditional\\nmean and covariance matrix based on (2.34) are\\nµt=φ0+p\\uf8fa\\ni=1Φirt−i, (2.35)\\nΣt=Σw, (2.36)\\nwhere the conditional covariance matrix is constant.\\n2.5.3 VMA( q) Model\\nEven though the VAR model models the serial correlations, it imposes\\nsuch correlations with all the past observations. We can observe this\\neasily by substituting the VAR( 1) model (2.31) recursively and we have\\nthatrtis serially correlated to all the past observations r0,...,rt−1,\\nespecially when the eigenvalues of Ψ1are close to 1.\\nFor some realistic cases, the time series rtshould only have serial\\ncorrelation up to a small lag qsuch that rtis serially uncorrelated to\\nrt−ℓfor allℓ > q. Unfortunately, the VAR model does not have this\\nproperty.\\nA useful alternative to the VAR model is a vector moving average\\n(VMA) model. The VMA model of order q, denoted as VMA( q), is\\nrt=µ+wt−q\\uf8fa\\nj=1Θjwt−j, (2.37)\\nwhereqis a nonnegative integer, µ∈RNis a constant vector, Θj∈\\nRN×Nare matrix parameters, and wtdenotes a serially uncorrelated\\nwhite noise series with zero mean and constant covariance matrix Σw.\\nBased on (2.37), it is easy to check that rtis serially uncorrelated\\ntort−ℓfor allℓ > q. Also, the VMA( q) model (2.37) is a special case\\nof the general model (2.7)-(2.11) with Π=0andp=m=s= 0, and', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 36}),\n", + " Document(page_content='30 Modeling of Financial Time Series\\nwe have the conditional mean and covariance matrix as follows:\\nµt=µ−q\\uf8fa\\nj=1Θjwt−j, (2.38)\\nΣt=Σw, (2.39)\\nwhere the conditional covariance matrix is constant.\\n2.5.4 VARMA Model\\nSometimes, using simply a VAR model or a VMA model only is not\\nenough to fit the data and it is helpful to combine them together. The\\ncombination of VAR( p) and VMA( q), referred to as VARMA( p,q), is\\ngiven by\\nrt=φ0+p\\uf8fa\\ni=1Φirt−i+wt−q\\uf8fa\\nj=1Θjwt−j, (2.40)\\nwherepandqare nonnegative integers, φ0∈RNis a constant vector,\\nthe matrices Φi,Θj∈RN×Nare parameters, and wtis a white noise\\nseries with zero mean and constant covariance matrix Σw. Directly, the\\nconditional mean and covariance matrix based on (2.40) are\\nµt=φ0+p\\uf8fa\\ni=1Φirt−i−q\\uf8fa\\nj=1Θjwt−j, (2.41)\\nΣt=Σw, (2.42)\\nwhere the conditional covariance matrix is still constant.\\nRemark 2.1. 
The VARMA model is a powerful model of conditional\\nmean, however, it also has some drawbacks that need to be dealt with\\ncarefully.\\nThe identifiability issue, i.e., two VARMA( p,q) models with differ-\\nent coefficient matrices can be rewritten as the same VMA( ∞) model,\\nis one of the most important ones. This issue is important because the\\nlikelihood function of the VARMA( p,q) model may not be uniquely\\ndefined and thus the parameters cannot be estimated. To overcome\\nthis drawback, some model structural specifications are needed. There', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 37}),\n", + " Document(page_content='2.6. VECM 31\\nare two main approaches namely the Kronecker index, and the scalar\\ncomponent model in the literature [197].\\nAnother issue is that, for a causal and invertible VARMA model,\\nthe conditional maximum likelihood estimation may not result in a\\ncausal and invertible estimated VARMA model, especially when the\\nnumber of samples is small [129, 197]. The solving approach is to either\\naddmoreconstraintsintheconditionalmaximumlikelihoodestimation\\n[169] or switch to the unconditional maximum likelihood estimation\\n[197]. However, both of them require more intensive computation. ■\\n2.6 VECM\\nUntil now we have focused on modeling directly the log-return series\\nrtinstead of the log-price series yt(recall that rt= ∆yt=yt−yt−1).\\nThisisbecauseingeneralthelog-priceseries ytisnotweaklystationary\\n(think for example of Apple stock whose log-prices keep increasing) and\\nthus is not easy to model, while its difference series, i.e., the log-return\\nseries rt, is weakly stationary and is easier to model and analyze.\\nHowever, it turns out that differencing may destroy part of the\\nrelationship among the log-prices which may be invaluable for a proper\\nmodeling with forecast power. It is therefore also important to analyze\\nthe original (probably non-stationary) time series directly [129].\\nInterestingly, it turns out that in fact a (probably non-stationary)\\nVAR model may be enough. For example, one can always fit the log-\\nprice series ytwith a VAR model, say, the following VAR( p):\\nyt=φ0+Φ1yt−1+···+Φt−pyt−p+wt, (2.43)\\nwherepis a nonnegative integer, φ0∈RNis a constant vector, Φi∈\\nRN×Nare matrix parameters, and wtdenotes a serially uncorrelated\\nwhite noise series with zero mean and constant covariance matrix Σw.\\nHere (2.43) models the log-price series and ytis not necessarily\\nstationary. The standard results for a stationary VAR model may not\\nbe useful.\\nIntheliterature,atimeseriesiscalledintegratedoforder p,denoted\\nasI(p),ifthetimeseriesobtainedbydifferencingthetimeseries ptimes\\nis weakly stationary, while by differencing the time series p−1times is', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 38}),\n", + " Document(page_content='32 Modeling of Financial Time Series\\nnot weakly stationary [196, 129]. A multivariate time series is said to be\\ncointegrated if it has at least one linear combination being integrated of\\na lower order. To illustrate the concepts visually, we consider a slightly\\nmodified example from [196] with only two dimensions as follows.\\nExample 2.1. Suppose the log-price series ytfollows\\nyt=Φ1yt−1+wt, (2.44)\\nwhere Φ1=)\\n0.5−1\\n−0.25 0.5[\\n, and wtfollows an i.i.d. 
distribution with\\nzero mean and constant covariance matrix Σw. The model (2.44) (or\\nyt) is not stationary because the eigenvalues of Φ1are0and1(recall\\nfor stationarity the modulus of the eigenvalues need to be less than\\none).\\nTo check the integration order of yt, rewriting (2.44) as\\n)\\n1−0.5B B\\n0.25B 1−0.5B[\\nyt=wt, (2.45)\\nwhereBis the backshift operator, and premultiplying both sides of\\n(2.45) by)\\n1−0.5B−B\\n−0.25B1−0.5B[\\nyields\\n)\\n1−B 0\\n0 1−B[\\nyt=)\\n1−0.5B−B\\n−0.25B1−0.5B[\\nwt. (2.46)\\nSince the right hand side of (2.46) is stationary, so is the first order\\ndifference of yton the left hand side of (2.46). This implies that ytis\\nintegrated of order one, i.e., it is I(1).\\nTocheckwhether ytiscointegratedornot,wedefine L≜)\\n1−2\\n0.5 1[\\nand premultiply (2.44) by L, then we have\\nLyt=LΦ1L−1Lyt−1+Lwt, (2.47)\\nwhich can be rewritten more explicitly as\\n)\\ny1t−2y2t\\n0.5y1t+y2t[\\n=)\\n1 0\\n0 0[)\\ny1,t−1−2y2,t−1\\n0.5y1,t−1+y2,t−1[\\n+Lwt.(2.48)', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 39}),\n", + " Document(page_content='2.6. VECM 33\\nSince Lwtis always stationary, so is the linear combination 0.5y1t+\\ny2t, and thus ytis cointegrated. This derived cointegration result in\\nfact is very important and can be utilized to design very profitable\\nquantitative trading strategies (which will be shown later in Part III).\\nNow we can observe that if we difference the log-price series directly\\nand reach the model (2.46), we cannot obtain the cointegration result\\nthat 0.5y1t+y2tis stationary any more. Therefore, it is important to\\nstudy the log-price series ytdirectly as mentioned before. ■\\nThe above Example 2.1 shows a specific example of cointegration.\\nIn practice, a systematic way to find the cointegrated components (if\\nthey exist) is via a vector error correction model (VECM) [61].\\nLet us assume the log-price series ytis at most I(1), that is, at\\nleast its difference series rtor the log-return series is always weakly\\nstationary. Using the relation yt=yt−1+rt, the VAR(p) model (2.43)\\ncan always be rewritten as\\nrt=φ0+Πyt−1+˜Φ1rt−1+···+˜Φp−1rt−p+1+wt,(2.49)\\nwhere\\nΠ=−(I−Φ1−···− Φp) =−Φ(1) (2.50)\\n˜Φj=−p\\uf8fa\\ni=j+1Φi, j = 1,...,p−1. (2.51)\\nInterestingly, the above model (2.49) can also be regarded as a\\nspecial case of the general model (2.10) with the exogenous variables\\nbeing the previous log-prices, i.e., xt=yt−1. And the conditional mean\\nand covariance matrix are\\nµt=φ0+Πyt−1+p−1\\uf8fa\\ni=1˜Φirt−i, (2.52)\\nΣt=Σw, (2.53)\\nwhere the conditional covariance matrix is constant.\\nUnder the assumption that ytis at mostI(1), it is straightforward\\ntoconcludethattheterm Πyt−1intheabovemodel(2.49)isstationary,\\ntherefore, some linear combinations of ytmay be stationary. The term', metadata={'source': '/home/aktersnurra/projects/library/quant/math/a-signal-processing-perspective-on-financial-engineering.pdf', 'page': 40}),\n", + " Document(page_content='34 Modeling of Financial Time Series\\nΠyt−1is usually referred to as an error correction term and thus the\\nmodel is called a VECM. There are three interesting cases of Πyt−1:\\n1.rank(Π) = 0. This implies Π=0andytis not cointegrated since\\nthere is no linear combination of ytbeing stationary. Then the\\nVECM (2.49) reduces to a VAR( p−1) for the log-return time\\nseries rt.\\n2.rank(Π) =N. This implies Πis invertible. 
Then ytmust be\\nstationary already since rtandwtare both stationary and yt\\ncan be rewritten as a linear combination of rtandwtby left\\nmultiplying both sides of (2.49) by Πinverse. Thus, one can\\nstudy ytdirectly.\\n3.0=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] +[[package]] +name = "fastavro" +version = "1.9.4" +description = "Fast read/write of AVRO files" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fastavro-1.9.4-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:60cb38f07462a7fb4e4440ed0de67d3d400ae6b3d780f81327bebde9aa55faef"}, + {file = "fastavro-1.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:063d01d197fc929c20adc09ca9f0ca86d33ac25ee0963ce0b438244eee8315ae"}, + {file = "fastavro-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87a9053fcfbc895f2a16a4303af22077e3a8fdcf1cd5d6ed47ff2ef22cbba2f0"}, + {file = "fastavro-1.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:02bf1276b7326397314adf41b34a4890f6ffa59cf7e0eb20b9e4ab0a143a1598"}, + {file = "fastavro-1.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:56bed9eca435389a8861e6e2d631ec7f8f5dda5b23f93517ac710665bd34ca29"}, + {file = "fastavro-1.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:0cd2099c8c672b853e0b20c13e9b62a69d3fbf67ee7c59c7271ba5df1680310d"}, + {file = "fastavro-1.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:af8c6d8c43a02b5569c093fc5467469541ac408c79c36a5b0900d3dd0b3ba838"}, + {file = "fastavro-1.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4a138710bd61580324d23bc5e3df01f0b82aee0a76404d5dddae73d9e4c723f"}, + {file = "fastavro-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:903d97418120ca6b6a7f38a731166c1ccc2c4344ee5e0470d09eb1dc3687540a"}, + {file = "fastavro-1.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c443eeb99899d062dbf78c525e4614dd77e041a7688fa2710c224f4033f193ae"}, + {file = "fastavro-1.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ac26ab0774d1b2b7af6d8f4300ad20bbc4b5469e658a02931ad13ce23635152f"}, + {file = "fastavro-1.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:cf7247874c22be856ba7d1f46a0f6e0379a6025f1a48a7da640444cbac6f570b"}, + {file = "fastavro-1.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:68912f2020e1b3d70557260b27dd85fb49a4fc6bfab18d384926127452c1da4c"}, + {file = "fastavro-1.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6925ce137cdd78e109abdb0bc33aad55de6c9f2d2d3036b65453128f2f5f5b92"}, + {file = "fastavro-1.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b928cd294e36e35516d0deb9e104b45be922ba06940794260a4e5dbed6c192a"}, + {file = "fastavro-1.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:90c9838bc4c991ffff5dd9d88a0cc0030f938b3fdf038cdf6babde144b920246"}, + {file = "fastavro-1.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:eca6e54da571b06a3c5a72dbb7212073f56c92a6fbfbf847b91c347510f8a426"}, + {file = "fastavro-1.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a4b02839ac261100cefca2e2ad04cdfedc556cb66b5ec735e0db428e74b399de"}, + {file = "fastavro-1.9.4-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:4451ee9a305a73313a1558d471299f3130e4ecc10a88bf5742aa03fb37e042e6"}, + {file = "fastavro-1.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a8524fccfb379565568c045d29b2ebf71e1f2c0dd484aeda9fe784ef5febe1a8"}, + {file = "fastavro-1.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33d0a00a6e09baa20f6f038d7a2ddcb7eef0e7a9980e947a018300cb047091b8"}, + {file = "fastavro-1.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:23d7e5b29c9bf6f26e8be754b2c8b919838e506f78ef724de7d22881696712fc"}, + {file = "fastavro-1.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2e6ab3ee53944326460edf1125b2ad5be2fadd80f7211b13c45fa0c503b4cf8d"}, + {file = "fastavro-1.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:64d335ec2004204c501f8697c385d0a8f6b521ac82d5b30696f789ff5bc85f3c"}, + {file = "fastavro-1.9.4-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:7e05f44c493e89e73833bd3ff3790538726906d2856f59adc8103539f4a1b232"}, + {file = "fastavro-1.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:253c63993250bff4ee7b11fb46cf3a4622180a783bedc82a24c6fdcd1b10ca2a"}, + {file = "fastavro-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24d6942eb1db14640c2581e0ecd1bbe0afc8a83731fcd3064ae7f429d7880cb7"}, + {file = "fastavro-1.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d47bb66be6091cd48cfe026adcad11c8b11d7d815a2949a1e4ccf03df981ca65"}, + {file = "fastavro-1.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c293897f12f910e58a1024f9c77f565aa8e23b36aafda6ad8e7041accc57a57f"}, + {file = "fastavro-1.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:f05d2afcb10a92e2a9e580a3891f090589b3e567fdc5641f8a46a0b084f120c3"}, + {file = "fastavro-1.9.4.tar.gz", hash = "sha256:56b8363e360a1256c94562393dc7f8611f3baf2b3159f64fb2b9c6b87b14e876"}, +] + +[package.extras] +codecs = ["cramjam", "lz4", "zstandard"] +lz4 = ["lz4"] +snappy = ["cramjam"] +zstandard = ["zstandard"] + [[package]] name = "fastjsonschema" version = "2.19.1" @@ -686,6 +752,22 @@ files = [ [package.extras] devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"] +[[package]] +name = "filelock" +version = "3.13.3" +description = "A platform independent file lock." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "filelock-3.13.3-py3-none-any.whl", hash = "sha256:5ffa845303983e7a0b7ae17636509bc97997d58afeafa72fb141a17b152284cb"}, + {file = "filelock-3.13.3.tar.gz", hash = "sha256:a79895a25bbefdf55d1a2a0a80968f7dbb28edcd6d4234a0afb3f37ecde4b546"}, +] + +[package.extras] +docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"] +typing = ["typing-extensions (>=4.8)"] + [[package]] name = "fqdn" version = "1.5.1" @@ -783,6 +865,41 @@ files = [ {file = "frozenlist-1.4.1.tar.gz", hash = "sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b"}, ] +[[package]] +name = "fsspec" +version = "2024.3.1" +description = "File-system specification" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fsspec-2024.3.1-py3-none-any.whl", hash = "sha256:918d18d41bf73f0e2b261824baeb1b124bcf771767e3a26425cd7dec3332f512"}, + {file = "fsspec-2024.3.1.tar.gz", hash = "sha256:f39780e282d7d117ffb42bb96992f8a90795e4d0fb0f661a70ca39fe9c43ded9"}, +] + +[package.extras] +abfs = ["adlfs"] +adl = ["adlfs"] +arrow = ["pyarrow (>=1)"] +dask = ["dask", "distributed"] +devel = ["pytest", "pytest-cov"] +dropbox = ["dropbox", "dropboxdrivefs", "requests"] +full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] +fuse = ["fusepy"] +gcs = ["gcsfs"] +git = ["pygit2"] +github = ["requests"] +gs = ["gcsfs"] +gui = ["panel"] +hdfs = ["pyarrow (>=1)"] +http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)"] +libarchive = ["libarchive-c"] +oci = ["ocifs"] +s3 = ["s3fs"] +sftp = ["paramiko"] +smb = ["smbprotocol"] +ssh = ["paramiko"] +tqdm = ["tqdm"] + [[package]] name = "gitdb" version = "4.0.11" @@ -1103,6 +1220,40 @@ cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] +[[package]] +name = "huggingface-hub" +version = "0.22.2" +description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "huggingface_hub-0.22.2-py3-none-any.whl", hash = "sha256:3429e25f38ccb834d310804a3b711e7e4953db5a9e420cc147a5e194ca90fd17"}, + {file = "huggingface_hub-0.22.2.tar.gz", hash = "sha256:32e9a9a6843c92f253ff9ca16b9985def4d80a93fb357af5353f770ef74a81be"}, +] + +[package.dependencies] +filelock = "*" +fsspec = ">=2023.5.0" +packaging = ">=20.9" +pyyaml = ">=5.1" +requests = "*" +tqdm = ">=4.42.1" +typing-extensions = ">=3.7.4.3" + +[package.extras] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.3.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +cli = ["InquirerPy (==0.3.4)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", 
"ruff (>=0.3.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] +hf-transfer = ["hf-transfer (>=0.1.4)"] +inference = ["aiohttp", "minijinja (>=1.0)"] +quality = ["mypy (==1.5.1)", "ruff (>=0.3.0)"] +tensorflow = ["graphviz", "pydot", "tensorflow"] +tensorflow-testing = ["keras (<3.0)", "tensorflow"] +testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "minijinja (>=1.0)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] +torch = ["safetensors", "torch"] +typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"] + [[package]] name = "hyperframe" version = "6.0.1" @@ -3499,6 +3650,133 @@ webencodings = ">=0.4" doc = ["sphinx", "sphinx_rtd_theme"] test = ["flake8", "isort", "pytest"] +[[package]] +name = "tokenizers" +version = "0.15.2" +description = "" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tokenizers-0.15.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:52f6130c9cbf70544287575a985bf44ae1bda2da7e8c24e97716080593638012"}, + {file = "tokenizers-0.15.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:054c1cc9c6d68f7ffa4e810b3d5131e0ba511b6e4be34157aa08ee54c2f8d9ee"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a9b9b070fdad06e347563b88c278995735292ded1132f8657084989a4c84a6d5"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea621a7eef4b70e1f7a4e84dd989ae3f0eeb50fc8690254eacc08acb623e82f1"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cf7fd9a5141634fa3aa8d6b7be362e6ae1b4cda60da81388fa533e0b552c98fd"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44f2a832cd0825295f7179eaf173381dc45230f9227ec4b44378322d900447c9"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b9ec69247a23747669ec4b0ca10f8e3dfb3545d550258129bd62291aabe8605"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40b6a4c78da863ff26dbd5ad9a8ecc33d8a8d97b535172601cf00aee9d7ce9ce"}, + {file = "tokenizers-0.15.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5ab2a4d21dcf76af60e05af8063138849eb1d6553a0d059f6534357bce8ba364"}, + {file = "tokenizers-0.15.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a47acfac7e511f6bbfcf2d3fb8c26979c780a91e06fb5b9a43831b2c0153d024"}, + {file = "tokenizers-0.15.2-cp310-none-win32.whl", hash = "sha256:064ff87bb6acdbd693666de9a4b692add41308a2c0ec0770d6385737117215f2"}, + {file = "tokenizers-0.15.2-cp310-none-win_amd64.whl", hash = "sha256:3b919afe4df7eb6ac7cafd2bd14fb507d3f408db7a68c43117f579c984a73843"}, + {file = "tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:89cd1cb93e4b12ff39bb2d626ad77e35209de9309a71e4d3d4672667b4b256e7"}, + {file = "tokenizers-0.15.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cfed5c64e5be23d7ee0f0e98081a25c2a46b0b77ce99a4f0605b1ec43dd481fa"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = 
"sha256:a907d76dcfda37023ba203ab4ceeb21bc5683436ebefbd895a0841fd52f6f6f2"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20ea60479de6fc7b8ae756b4b097572372d7e4032e2521c1bbf3d90c90a99ff0"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:48e2b9335be2bc0171df9281385c2ed06a15f5cf121c44094338306ab7b33f2c"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:112a1dd436d2cc06e6ffdc0b06d55ac019a35a63afd26475205cb4b1bf0bfbff"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4620cca5c2817177ee8706f860364cc3a8845bc1e291aaf661fb899e5d1c45b0"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccd73a82751c523b3fc31ff8194702e4af4db21dc20e55b30ecc2079c5d43cb7"}, + {file = "tokenizers-0.15.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:107089f135b4ae7817affe6264f8c7a5c5b4fd9a90f9439ed495f54fcea56fb4"}, + {file = "tokenizers-0.15.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0ff110ecc57b7aa4a594396525a3451ad70988e517237fe91c540997c4e50e29"}, + {file = "tokenizers-0.15.2-cp311-none-win32.whl", hash = "sha256:6d76f00f5c32da36c61f41c58346a4fa7f0a61be02f4301fd30ad59834977cc3"}, + {file = "tokenizers-0.15.2-cp311-none-win_amd64.whl", hash = "sha256:cc90102ed17271cf0a1262babe5939e0134b3890345d11a19c3145184b706055"}, + {file = "tokenizers-0.15.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f86593c18d2e6248e72fb91c77d413a815153b8ea4e31f7cd443bdf28e467670"}, + {file = "tokenizers-0.15.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0774bccc6608eca23eb9d620196687c8b2360624619623cf4ba9dc9bd53e8b51"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d0222c5b7c9b26c0b4822a82f6a7011de0a9d3060e1da176f66274b70f846b98"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3835738be1de66624fff2f4f6f6684775da4e9c00bde053be7564cbf3545cc66"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0143e7d9dcd811855c1ce1ab9bf5d96d29bf5e528fd6c7824d0465741e8c10fd"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db35825f6d54215f6b6009a7ff3eedee0848c99a6271c870d2826fbbedf31a38"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f5e64b0389a2be47091d8cc53c87859783b837ea1a06edd9d8e04004df55a5c"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e0480c452217edd35eca56fafe2029fb4d368b7c0475f8dfa3c5c9c400a7456"}, + {file = "tokenizers-0.15.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a33ab881c8fe70474980577e033d0bc9a27b7ab8272896e500708b212995d834"}, + {file = "tokenizers-0.15.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a308a607ca9de2c64c1b9ba79ec9a403969715a1b8ba5f998a676826f1a7039d"}, + {file = "tokenizers-0.15.2-cp312-none-win32.whl", hash = "sha256:b8fcfa81bcb9447df582c5bc96a031e6df4da2a774b8080d4f02c0c16b42be0b"}, + {file = "tokenizers-0.15.2-cp312-none-win_amd64.whl", hash = "sha256:38d7ab43c6825abfc0b661d95f39c7f8af2449364f01d331f3b51c94dcff7221"}, + {file = "tokenizers-0.15.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = 
"sha256:38bfb0204ff3246ca4d5e726e8cc8403bfc931090151e6eede54d0e0cf162ef0"}, + {file = "tokenizers-0.15.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9c861d35e8286a53e06e9e28d030b5a05bcbf5ac9d7229e561e53c352a85b1fc"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:936bf3842db5b2048eaa53dade907b1160f318e7c90c74bfab86f1e47720bdd6"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:620beacc3373277700d0e27718aa8b25f7b383eb8001fba94ee00aeea1459d89"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2735ecbbf37e52db4ea970e539fd2d450d213517b77745114f92867f3fc246eb"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:473c83c5e2359bb81b0b6fde870b41b2764fcdd36d997485e07e72cc3a62264a"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:968fa1fb3c27398b28a4eca1cbd1e19355c4d3a6007f7398d48826bbe3a0f728"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:865c60ae6eaebdde7da66191ee9b7db52e542ed8ee9d2c653b6d190a9351b980"}, + {file = "tokenizers-0.15.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7c0d8b52664ab2d4a8d6686eb5effc68b78608a9008f086a122a7b2996befbab"}, + {file = "tokenizers-0.15.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:f33dfbdec3784093a9aebb3680d1f91336c56d86cc70ddf88708251da1fe9064"}, + {file = "tokenizers-0.15.2-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:d44ba80988ff9424e33e0a49445072ac7029d8c0e1601ad25a0ca5f41ed0c1d6"}, + {file = "tokenizers-0.15.2-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:dce74266919b892f82b1b86025a613956ea0ea62a4843d4c4237be2c5498ed3a"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0ef06b9707baeb98b316577acb04f4852239d856b93e9ec3a299622f6084e4be"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c73e2e74bbb07910da0d37c326869f34113137b23eadad3fc00856e6b3d9930c"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4eeb12daf02a59e29f578a865f55d87cd103ce62bd8a3a5874f8fdeaa82e336b"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ba9f6895af58487ca4f54e8a664a322f16c26bbb442effd01087eba391a719e"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ccec77aa7150e38eec6878a493bf8c263ff1fa8a62404e16c6203c64c1f16a26"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3f40604f5042ff210ba82743dda2b6aa3e55aa12df4e9f2378ee01a17e2855e"}, + {file = "tokenizers-0.15.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:5645938a42d78c4885086767c70923abad047163d809c16da75d6b290cb30bbe"}, + {file = "tokenizers-0.15.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:05a77cbfebe28a61ab5c3891f9939cc24798b63fa236d84e5f29f3a85a200c00"}, + {file = "tokenizers-0.15.2-cp37-none-win32.whl", hash = "sha256:361abdc068e8afe9c5b818769a48624687fb6aaed49636ee39bec4e95e1a215b"}, + {file = "tokenizers-0.15.2-cp37-none-win_amd64.whl", hash = "sha256:7ef789f83eb0f9baeb4d09a86cd639c0a5518528f9992f38b28e819df397eb06"}, + {file = "tokenizers-0.15.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = 
"sha256:4fe1f74a902bee74a3b25aff180fbfbf4f8b444ab37c4d496af7afd13a784ed2"}, + {file = "tokenizers-0.15.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4c4b89038a684f40a6b15d6b09f49650ac64d951ad0f2a3ea9169687bbf2a8ba"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d05a1b06f986d41aed5f2de464c003004b2df8aaf66f2b7628254bcbfb72a438"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:508711a108684111ec8af89d3a9e9e08755247eda27d0ba5e3c50e9da1600f6d"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:daa348f02d15160cb35439098ac96e3a53bacf35885072611cd9e5be7d333daa"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:494fdbe5932d3416de2a85fc2470b797e6f3226c12845cadf054dd906afd0442"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c2d60f5246f4da9373f75ff18d64c69cbf60c3bca597290cea01059c336d2470"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93268e788825f52de4c7bdcb6ebc1fcd4a5442c02e730faa9b6b08f23ead0e24"}, + {file = "tokenizers-0.15.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6fc7083ab404019fc9acafe78662c192673c1e696bd598d16dc005bd663a5cf9"}, + {file = "tokenizers-0.15.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:41e39b41e5531d6b2122a77532dbea60e171ef87a3820b5a3888daa847df4153"}, + {file = "tokenizers-0.15.2-cp38-none-win32.whl", hash = "sha256:06cd0487b1cbfabefb2cc52fbd6b1f8d4c37799bd6c6e1641281adaa6b2504a7"}, + {file = "tokenizers-0.15.2-cp38-none-win_amd64.whl", hash = "sha256:5179c271aa5de9c71712e31cb5a79e436ecd0d7532a408fa42a8dbfa4bc23fd9"}, + {file = "tokenizers-0.15.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:82f8652a74cc107052328b87ea8b34291c0f55b96d8fb261b3880216a9f9e48e"}, + {file = "tokenizers-0.15.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:02458bee6f5f3139f1ebbb6d042b283af712c0981f5bc50edf771d6b762d5e4f"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c9a09cd26cca2e1c349f91aa665309ddb48d71636370749414fbf67bc83c5343"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:158be8ea8554e5ed69acc1ce3fbb23a06060bd4bbb09029431ad6b9a466a7121"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1ddba9a2b0c8c81633eca0bb2e1aa5b3a15362b1277f1ae64176d0f6eba78ab1"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3ef5dd1d39797044642dbe53eb2bc56435308432e9c7907728da74c69ee2adca"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:454c203164e07a860dbeb3b1f4a733be52b0edbb4dd2e5bd75023ffa8b49403a"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cf6b7f1d4dc59af960e6ffdc4faffe6460bbfa8dce27a58bf75755ffdb2526d"}, + {file = "tokenizers-0.15.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2ef09bbc16519f6c25d0c7fc0c6a33a6f62923e263c9d7cca4e58b8c61572afb"}, + {file = "tokenizers-0.15.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c9a2ebdd2ad4ec7a68e7615086e633857c85e2f18025bd05d2a4399e6c5f7169"}, + {file = "tokenizers-0.15.2-cp39-none-win32.whl", hash = 
"sha256:918fbb0eab96fe08e72a8c2b5461e9cce95585d82a58688e7f01c2bd546c79d0"}, + {file = "tokenizers-0.15.2-cp39-none-win_amd64.whl", hash = "sha256:524e60da0135e106b254bd71f0659be9f89d83f006ea9093ce4d1fab498c6d0d"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6a9b648a58281c4672212fab04e60648fde574877d0139cd4b4f93fe28ca8944"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:7c7d18b733be6bbca8a55084027f7be428c947ddf871c500ee603e375013ffba"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:13ca3611de8d9ddfbc4dc39ef54ab1d2d4aaa114ac8727dfdc6a6ec4be017378"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:237d1bf3361cf2e6463e6c140628e6406766e8b27274f5fcc62c747ae3c6f094"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67a0fe1e49e60c664915e9fb6b0cb19bac082ab1f309188230e4b2920230edb3"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4e022fe65e99230b8fd89ebdfea138c24421f91c1a4f4781a8f5016fd5cdfb4d"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d857be2df69763362ac699f8b251a8cd3fac9d21893de129bc788f8baaef2693"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:708bb3e4283177236309e698da5fcd0879ce8fd37457d7c266d16b550bcbbd18"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:64c35e09e9899b72a76e762f9854e8750213f67567787d45f37ce06daf57ca78"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1257f4394be0d3b00de8c9e840ca5601d0a4a8438361ce9c2b05c7d25f6057b"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02272fe48280e0293a04245ca5d919b2c94a48b408b55e858feae9618138aeda"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:dc3ad9ebc76eabe8b1d7c04d38be884b8f9d60c0cdc09b0aa4e3bcf746de0388"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:32e16bdeffa7c4f46bf2152172ca511808b952701d13e7c18833c0b73cb5c23f"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:fb16ba563d59003028b678d2361a27f7e4ae0ab29c7a80690efa20d829c81fdb"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:2277c36d2d6cdb7876c274547921a42425b6810d38354327dd65a8009acf870c"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:1cf75d32e8d250781940d07f7eece253f2fe9ecdb1dc7ba6e3833fa17b82fcbc"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1b3b31884dc8e9b21508bb76da80ebf7308fdb947a17affce815665d5c4d028"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b10122d8d8e30afb43bb1fe21a3619f62c3e2574bff2699cf8af8b0b6c5dc4a3"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d88b96ff0fe8e91f6ef01ba50b0d71db5017fa4e3b1d99681cec89a85faf7bf7"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:37aaec5a52e959892870a7c47cef80c53797c0db9149d458460f4f31e2fb250e"}, + {file = 
"tokenizers-0.15.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e2ea752f2b0fe96eb6e2f3adbbf4d72aaa1272079b0dfa1145507bd6a5d537e6"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:4b19a808d8799fda23504a5cd31d2f58e6f52f140380082b352f877017d6342b"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:64c86e5e068ac8b19204419ed8ca90f9d25db20578f5881e337d203b314f4104"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de19c4dc503c612847edf833c82e9f73cd79926a384af9d801dcf93f110cea4e"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ea09acd2fe3324174063d61ad620dec3bcf042b495515f27f638270a7d466e8b"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cf27fd43472e07b57cf420eee1e814549203d56de00b5af8659cb99885472f1f"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:7ca22bd897537a0080521445d91a58886c8c04084a6a19e6c78c586e0cfa92a5"}, + {file = "tokenizers-0.15.2.tar.gz", hash = "sha256:e6e9c6e019dd5484be5beafc775ae6c925f4c69a3487040ed09b45e13df2cb91"}, +] + +[package.dependencies] +huggingface_hub = ">=0.16.4,<1.0" + +[package.extras] +dev = ["tokenizers[testing]"] +docs = ["setuptools_rust", "sphinx", "sphinx_rtd_theme"] +testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] + [[package]] name = "toml" version = "0.10.2" @@ -3541,6 +3819,26 @@ files = [ {file = "tornado-6.4.tar.gz", hash = "sha256:72291fa6e6bc84e626589f1c29d90a5a6d593ef5ae68052ee2ef000dfd273dee"}, ] +[[package]] +name = "tqdm" +version = "4.66.2" +description = "Fast, Extensible Progress Meter" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.66.2-py3-none-any.whl", hash = "sha256:1ee4f8a893eb9bef51c6e35730cebf234d5d0b6bd112b0271e10ed7c24a02bd9"}, + {file = "tqdm-4.66.2.tar.gz", hash = "sha256:6cd52cdf0fef0e0f543299cfc96fec90d7b8a7e88745f411ec33eb44d5ed3531"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "traitlets" version = "5.14.2" @@ -3567,6 +3865,20 @@ files = [ {file = "types_python_dateutil-2.9.0.20240316-py3-none-any.whl", hash = "sha256:6b8cb66d960771ce5ff974e9dd45e38facb81718cc1e208b10b1baccbfdbee3b"}, ] +[[package]] +name = "types-requests" +version = "2.31.0.20240406" +description = "Typing stubs for requests" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-requests-2.31.0.20240406.tar.gz", hash = "sha256:4428df33c5503945c74b3f42e82b181e86ec7b724620419a2966e2de604ce1a1"}, + {file = "types_requests-2.31.0.20240406-py3-none-any.whl", hash = "sha256:6216cdac377c6b9a040ac1c0404f7284bd13199c0e1bb235f4324627e8898cf5"}, +] + +[package.dependencies] +urllib3 = ">=2" + [[package]] name = "typing-extensions" version = "4.10.0" @@ -3849,4 +4161,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "8be3bd74ef26bad03e2d8bcf1ef2f277270082c6ab48274504e6f29392c78914" +content-hash = "44ec0d87aaee06d7909d51fceddf741a2f6d971b3de9309089b3d154abfe518d" diff --git a/pyproject.toml b/pyproject.toml index daec64e..abfc2be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 
@@ python-dotenv = "^1.0.1" qdrant-client = "^1.8.2" loguru = "^0.7.2" streamlit = "^1.33.0" +cohere = "^5.2.3" [build-system] diff --git a/rag/cli.py b/rag/cli.py index 5ea1a47..c470db3 100644 --- a/rag/cli.py +++ b/rag/cli.py @@ -22,7 +22,7 @@ if __name__ == "__main__": if query: result = rag.retrive(query) print("Answer: \n") - print(result.answer) + print(result.answer + "\n") case _: print("Invalid option!") diff --git a/rag/db/vector.py b/rag/db/vector.py index bbbbf32..fd2b2c2 100644 --- a/rag/db/vector.py +++ b/rag/db/vector.py @@ -5,7 +5,7 @@ from typing import Dict, List from loguru import logger as log from qdrant_client import QdrantClient from qdrant_client.http.models import StrictFloat -from qdrant_client.models import Distance, PointStruct, ScoredPoint, VectorParams +from qdrant_client.models import Distance, PointStruct, VectorParams @dataclass @@ -15,11 +15,18 @@ class Point: payload: Dict[str, str] +@dataclass +class Document: + title: str + text: str + + class VectorDB: - def __init__(self): + def __init__(self, score_threshold: float = 0.6): self.dim = int(os.environ["EMBEDDING_DIM"]) self.collection_name = os.environ["QDRANT_COLLECTION_NAME"] self.client = QdrantClient(url=os.environ["QDRANT_URL"]) + self.score_threshold = score_threshold self.__configure() def __configure(self): @@ -47,12 +54,20 @@ class VectorDB: max_retries=3, ) - def search(self, query: List[float], limit: int = 4) -> List[ScoredPoint]: + def search(self, query: List[float], limit: int = 5) -> List[Document]: log.debug("Searching for vectors...") hits = self.client.search( collection_name=self.collection_name, query_vector=query, limit=limit, - score_threshold=0.6, + score_threshold=self.score_threshold, + ) + log.debug(f"Got {len(hits)} hits in the vector db with limit={limit}") + return list( + map( + lambda h: Document( + title=h.payload.get("source", ""), text=h.payload["text"] + ), + hits, + ) ) - return hits diff --git a/rag/llm/cohere_generator.py b/rag/llm/cohere_generator.py new file mode 100644 index 0000000..a6feacd --- /dev/null +++ b/rag/llm/cohere_generator.py @@ -0,0 +1,29 @@ +import os +from typing import Any, Generator +import cohere + +from dataclasses import asdict +try: + from rag.llm.ollama_generator import Prompt +except ModuleNotFoundError: + from llm.ollama_generator import Prompt +from loguru import logger as log + + +class CohereGenerator: + def __init__(self) -> None: + self.client = cohere.Client(os.environ["COHERE_API_KEY"]) + + def generate(self, prompt: Prompt) -> Generator[Any, Any, Any]: + log.debug("Generating answer from cohere") + for event in self.client.chat_stream( + message=prompt.query, + documents=[asdict(d) for d in prompt.documents], + prompt_truncation="AUTO", + ): + if event.event_type == "text-generation": + yield event.text + elif event.event_type == "citation-generation": + yield event.citations + elif event.event_type == "stream-end": + yield event.finish_reason diff --git a/rag/llm/encoder.py b/rag/llm/encoder.py index 95f3c6a..a59b1b4 100644 --- a/rag/llm/encoder.py +++ b/rag/llm/encoder.py @@ -1,5 +1,6 @@ import os -from typing import Iterator, List +from pathlib import Path +from typing import List, Dict from uuid import uuid4 import ollama @@ -13,6 +14,7 @@ try: except ModuleNotFoundError: from db.vector import Point + class Encoder: def __init__(self) -> None: self.model = os.environ["ENCODER_MODEL"] @@ -21,13 +23,20 @@ class Encoder: def __encode(self, prompt: str) -> List[StrictFloat]: return list(ollama.embeddings(model=self.model, 
prompt=prompt)["embedding"]) - def encode_document(self, chunks: Iterator[Document]) -> List[Point]: + def __get_source(self, metadata: Dict[str, str]) -> str: + source = metadata["source"] + return Path(source).name + + def encode_document(self, chunks: List[Document]) -> List[Point]: log.debug("Encoding document...") return [ Point( id=uuid4().hex, vector=self.__encode(chunk.page_content), - payload={"text": chunk.page_content}, + payload={ + "text": chunk.page_content, + "source": self.__get_source(chunk.metadata), + }, ) for chunk in chunks ] diff --git a/rag/llm/generator.py b/rag/llm/generator.py deleted file mode 100644 index 8c7702f..0000000 --- a/rag/llm/generator.py +++ /dev/null @@ -1,33 +0,0 @@ -import os -from dataclasses import dataclass - -import ollama -from loguru import logger as log - - -@dataclass -class Prompt: - query: str - context: str - - -class Generator: - def __init__(self) -> None: - self.model = os.environ["GENERATOR_MODEL"] - - def __metaprompt(self, prompt: Prompt) -> str: - metaprompt = ( - "Answer the following question using the provided context.\n" - "If you can't find the answer, do not pretend you know it," - 'but answer "I don\'t know".\n\n' - f"Question: {prompt.query.strip()}\n\n" - "Context:\n" - f"{prompt.context.strip()}\n\n" - "Answer:\n" - ) - return metaprompt - - def generate(self, prompt: Prompt) -> str: - log.debug("Generating answer...") - metaprompt = self.__metaprompt(prompt) - return ollama.generate(model=self.model, prompt=metaprompt) diff --git a/rag/llm/ollama_generator.py b/rag/llm/ollama_generator.py new file mode 100644 index 0000000..dd17f8d --- /dev/null +++ b/rag/llm/ollama_generator.py @@ -0,0 +1,76 @@ +import os +from dataclasses import dataclass +from typing import Any, Generator, List + +import ollama +from loguru import logger as log + +try: + from rag.db.vector import Document +except ModuleNotFoundError: + from db.vector import Document + + +@dataclass +class Prompt: + query: str + documents: List[Document] + + +SYSTEM_PROMPT = ( + "# System Preamble" + "## Basic Rules" + "When you answer the user's requests, you cite your sources in your answers, according to those instructions." + "Answer the following question using the provided context.\n" + "## Style Guide" + "Unless the user asks for a different style of answer, you should answer " + "in full sentences, using proper grammar and spelling." +) + + +class OllamaGenerator: + def __init__(self) -> None: + self.model = os.environ["GENERATOR_MODEL"] + + def __context(self, documents: List[Document]) -> str: + results = [ + f"Document: {i}\ntitle: {doc.title}\n{doc.text}" + for i, doc in enumerate(documents) + ] + return "\n".join(results) + + def __metaprompt(self, prompt: Prompt) -> str: + # Include sources + metaprompt = ( + f'Question: "{prompt.query.strip()}"\n\n' + "Context:\n" + "\n" + f"{self.__context(prompt.documents)}\n\n" + "\n" + "Carefully perform the following instructions, in order, starting each " + "with a new line.\n" + "Firstly, Decide which of the retrieved documents are relevant to the " + "user's last input by writing 'Relevant Documents:' followed by " + "comma-separated list of document numbers.\n If none are relevant, you " + "should instead write 'None'.\n" + "Secondly, Decide which of the retrieved documents contain facts that " + "should be cited in a good answer to the user's last input by writing " + "'Cited Documents:' followed a comma-separated list of document numbers. 
" + "If you dont want to cite any of them, you should instead write 'None'.\n" + "Thirdly, Write 'Answer:' followed by a response to the user's last input " + "in high quality natural english. Use the retrieved documents to help you. " + "Do not insert any citations or grounding markup.\n" + "Finally, Write 'Grounded answer:' followed by a response to the user's " + "last input in high quality natural english. Use the symbols and " + " to indicate when a fact comes from a document in the search " + "result, e.g my fact for a fact from document 0." + ) + return metaprompt + + def generate(self, prompt: Prompt) -> Generator[Any, Any, Any]: + log.debug("Generating answer...") + metaprompt = self.__metaprompt(prompt) + for chunk in ollama.generate( + model=self.model, prompt=metaprompt, system=SYSTEM_PROMPT, stream=True + ): + yield chunk diff --git a/rag/parser/pdf.py b/rag/parser/pdf.py index cbd86a3..ca9b72d 100644 --- a/rag/parser/pdf.py +++ b/rag/parser/pdf.py @@ -1,6 +1,6 @@ import os from pathlib import Path -from typing import Iterator +from typing import Iterator, List, Optional from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_core.documents import Document @@ -15,15 +15,20 @@ class PDFParser: self.parser = PyPDFParser(password=None, extract_images=False) def from_data(self, blob: Blob) -> Iterator[Document]: - yield from self.parser.parse(blob) + return self.parser.parse(blob) def from_path(self, path: Path) -> Iterator[Document]: return Blob.from_path(path) - def chunk(self, content: Iterator[Document]): + def chunk( + self, document: Iterator[Document], source: Optional[str] = None + ) -> List[Document]: splitter = RecursiveCharacterTextSplitter( chunk_size=int(os.environ["CHUNK_SIZE"]), chunk_overlap=int(os.environ["CHUNK_OVERLAP"]), ) - chunks = splitter.split_documents(content) + chunks = splitter.split_documents(document) + if source is not None: + for c in chunks: + c.metadata["source"] = source return chunks diff --git a/rag/rag.py b/rag/rag.py index cd4537e..93f9fd7 100644 --- a/rag/rag.py +++ b/rag/rag.py @@ -5,20 +5,22 @@ from typing import List from dotenv import load_dotenv from loguru import logger as log -from qdrant_client.models import StrictFloat + try: - from rag.db.vector import VectorDB + from rag.db.vector import VectorDB, Document from rag.db.document import DocumentDB from rag.llm.encoder import Encoder - from rag.llm.generator import Generator, Prompt + from rag.llm.ollama_generator import OllamaGenerator, Prompt + from rag.llm.cohere_generator import CohereGenerator from rag.parser.pdf import PDFParser except ModuleNotFoundError: - from db.vector import VectorDB + from db.vector import VectorDB, Document from db.document import DocumentDB from llm.encoder import Encoder - from llm.generator import Generator, Prompt + from llm.ollama_generator import OllamaGenerator, Prompt + from llm.cohere_generator import CohereGenerator from parser.pdf import PDFParser @@ -34,7 +36,7 @@ class RAG: # FIXME: load this somewhere else? 
load_dotenv() self.pdf_parser = PDFParser() - self.generator = Generator() + self.generator = CohereGenerator() self.encoder = Encoder() self.vector_db = VectorDB() self.doc_db = DocumentDB() @@ -43,23 +45,19 @@ class RAG: blob = self.pdf_parser.from_path(path) self.add_pdf_from_blob(blob) - def add_pdf_from_blob(self, blob: BytesIO): + def add_pdf_from_blob(self, blob: BytesIO, source: str): if self.doc_db.add(blob): log.debug("Adding pdf to vector database...") - chunks = self.pdf_parser.from_data(blob) + document = self.pdf_parser.from_data(blob) + chunks = self.pdf_parser.chunk(document, source) points = self.encoder.encode_document(chunks) self.vector_db.add(points) else: log.debug("Document already exists!") - def __context(self, query_emb: List[StrictFloat], limit: int) -> str: - hits = self.vector_db.search(query_emb, limit) - log.debug(f"Got {len(hits)} hits in the vector db with limit={limit}") - return [h.payload["text"] for h in hits] - - def retrive(self, query: str, limit: int = 5) -> Response: + def search(self, query: str, limit: int = 5) -> List[Document]: query_emb = self.encoder.encode_query(query) - context = self.__context(query_emb, limit) - prompt = Prompt(query, "\n".join(context)) - answer = self.generator.generate(prompt)["response"] - return Response(query, context, answer) + return self.vector_db.search(query_emb, limit) + + def retrieve(self, prompt: Prompt): + yield from self.generator.generate(prompt) diff --git a/rag/ui.py b/rag/ui.py index 37c50dd..84dbbeb 100644 --- a/rag/ui.py +++ b/rag/ui.py @@ -4,8 +4,10 @@ from langchain_community.document_loaders.blob_loaders import Blob try: from rag.rag import RAG + from rag.llm.ollama_generator import Prompt except ModuleNotFoundError: from rag import RAG + from llm.ollama_generator import Prompt rag = RAG() @@ -16,9 +18,15 @@ def upload_pdfs(): type="pdf", accept_multiple_files=True, ) - for file in files: - blob = Blob.from_data(file.read()) - rag.add_pdf_from_blob(blob) + + if not files: + return + + with st.spinner("Indexing documents..."): + for file in files: + source = file.name + blob = Blob.from_data(file.read()) + rag.add_pdf_from_blob(blob, source) if __name__ == "__main__": @@ -26,30 +34,41 @@ if __name__ == "__main__": st.header("RAG-UI") upload_pdfs() - query = st.text_area( - "query", - key="query", - height=100, - placeholder="Enter query here", - help="", - label_visibility="collapsed", - disabled=False, - ) + + with st.form(key="query"): + query = st.text_area( + "query", + key="query", + height=100, + placeholder="Enter query here", + help="", + label_visibility="collapsed", + disabled=False, + ) + submit = st.form_submit_button("Generate") (b,) = st.columns(1) (result_column, context_column) = st.columns(2) - if b.button("Generate", disabled=False, type="primary", use_container_width=True): + if submit: + if not query: + st.stop() + query = ss.get("query", "") - with st.spinner("Generating answer..."): - response = rag.retrieve(query) + with st.spinner("Searching for documents..."): + documents = rag.search(query) - with result_column: - st.markdown("### Answer") - st.markdown(response.answer) + prompt = Prompt(query, documents) with context_column: st.markdown("### Context") - for c in response.context: - st.markdown(c) + for i, doc in enumerate(documents): + st.markdown(f"### Document {i}") + st.markdown(f"**Title: {doc.title}**") + st.markdown(doc.text) st.markdown("---") + + with result_column: + st.markdown("### Answer") + st.write_stream(rag.retrieve(prompt)) + -- cgit v1.2.3-70-g09d2
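
Usage sketch of the refactored API, mirroring the flow in rag/ui.py. This is a minimal example under stated assumptions, not code from the patch: it assumes the .env read by RAG() provides the variables used by the package (QDRANT_URL, QDRANT_COLLECTION_NAME, EMBEDDING_DIM, ENCODER_MODEL, GENERATOR_MODEL, COHERE_API_KEY, CHUNK_SIZE, CHUNK_OVERLAP) and that the backing services (Qdrant, Ollama for embeddings, the document DB) are running; the PDF path and query below are placeholders.

from pathlib import Path

from rag.rag import RAG
from rag.llm.ollama_generator import Prompt

rag = RAG()

# Index a PDF: parse it into a Blob, then chunk, encode, and store it with
# the file name recorded as the "source" payload (used as the hit title).
path = Path("example.pdf")  # placeholder path
blob = rag.pdf_parser.from_path(path)
rag.add_pdf_from_blob(blob, source=path.name)

# search() embeds the query and returns Document(title, text) hits above
# the vector db's score threshold.
query = "What is a factor model?"  # placeholder query
documents = rag.search(query, limit=5)

# retrieve() streams whatever the configured generator yields: text chunks
# (plus citations and a finish reason) for CohereGenerator, response dicts
# for OllamaGenerator.
for chunk in rag.retrieve(Prompt(query, documents)):
    print(chunk, end="", flush=True)

Splitting search() from retrieve() is what lets the Streamlit UI show the retrieved context immediately and render the answer incrementally via st.write_stream.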