summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGustaf Rydholm <gustaf.rydholm@gmail.com>2024-04-13 13:21:44 +0200
committerGustaf Rydholm <gustaf.rydholm@gmail.com>2024-04-13 13:21:44 +0200
commita96b270805eba2b6d8c40d1fa2ee0d35c73cba0e (patch)
treefd2a663eda2b4ccb021adcd7fb5b4e6d89180f95
parent4968ed48ed1adb267b910b28fdda0db115ba1b19 (diff)
Add upload script
-rw-r--r--README.md14
-rw-r--r--poetry.lock4
-rw-r--r--pyproject.toml2
-rw-r--r--rag/upload.py31
4 files changed, 45 insertions, 6 deletions
diff --git a/README.md b/README.md
index c478830..3fd5569 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
# Retrieval Augmented Generation
-RAG with ollama (and optionally cohere) and qdrant. This is basically a glorified
-ctrl+f.
+RAG with ollama (and optionally cohere) and qdrant. This is basically a glorified
+(bloated) `ctrl+f`.
## Usage
@@ -65,7 +65,9 @@ Get an API from their website.
#### 6.1 Prerequisites
-##### 6.2 Python Environment
+Both databases need to be running, as well as ollama.
+
+##### 6.1.1 Python Environment
Activate the poetry shell:
@@ -91,7 +93,11 @@ streamlit run rag/ui.py
#### 6.5 Upload Multiple Documents
-tbc
+To upload an entire directory (and subdirectories) of pdfs, simply run:
+
+```sh
+python rag/upload.py -d <PATH-TO-DIR-WITH-PDFS>
+```
### Notes
diff --git a/poetry.lock b/poetry.lock
index 8a2b3a2..856fae7 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -388,7 +388,7 @@ files = [
[[package]]
name = "cffi"
version = "1.16.0"
-description = "Foreign Function AbstractGenerator for Python calling C code."
+description = "Foreign Function Interface for Python calling C code."
optional = false
python-versions = ">=3.8"
files = [
@@ -4161,4 +4161,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
-content-hash = "44ec0d87aaee06d7909d51fceddf741a2f6d971b3de9309089b3d154abfe518d"
+content-hash = "28d2833cd03e06191ab01107031fa972a6955c76385eac763141406ff92df2c5"
diff --git a/pyproject.toml b/pyproject.toml
index abfc2be..38986f4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,8 @@ qdrant-client = "^1.8.2"
loguru = "^0.7.2"
streamlit = "^1.33.0"
cohere = "^5.2.3"
+tqdm = "^4.66.2"
+click = "^8.1.7"
[build-system]
diff --git a/rag/upload.py b/rag/upload.py
new file mode 100644
index 0000000..3c5a100
--- /dev/null
+++ b/rag/upload.py
@@ -0,0 +1,31 @@
+from pathlib import Path
+
+import click
+from dotenv import load_dotenv
+from loguru import logger as log
+from tqdm import tqdm
+
+from rag.retriever.retriever import Retriever
+
+# Drop loguru's existing handlers and install a single sink that emits every
+# message through tqdm.write, so log output goes via tqdm instead of being
+# written straight to the stream while a progress bar is being drawn.
+# end="" avoids appending another newline (loguru messages presumably already
+# end with one -- confirm against the configured format).
+log.remove()
+log.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
+
+
+@click.command()
+@click.option(
+    "-d",
+    "--directory",
+    help="The full path to the root directory containing pdfs to upload",
+    # Reject regular files up front: the path is globbed as a directory below.
+    type=click.Path(exists=True, file_okay=False),
+    # Without this, omitting -d passes None into Path() and raises TypeError
+    # instead of producing a usage error.
+    required=True,
+)
+def main(directory: str):
+    """Upload every pdf found in DIRECTORY and its subdirectories.
+
+    Each matching file is handed to Retriever.add_pdf, one at a time.
+    """
+    log.info(f"Uploading pdfs found in directory {directory}...")
+    retriever = Retriever()
+    # Materialize the recursive glob so tqdm knows the total and can render a
+    # real progress bar; sorting makes the upload order deterministic.
+    pdfs = sorted(Path(directory).glob("**/*.pdf"))
+    for path in tqdm(pdfs):
+        retriever.add_pdf(path=path)
+
+if __name__ == "__main__":
+ # Script entry point: call load_dotenv() first so that anything it loads
+ # is in place before the click command runs.
+ load_dotenv()
+ main()