From a96b270805eba2b6d8c40d1fa2ee0d35c73cba0e Mon Sep 17 00:00:00 2001 From: Gustaf Rydholm Date: Sat, 13 Apr 2024 13:21:44 +0200 Subject: Add upload script --- README.md | 14 ++++++++++---- poetry.lock | 4 ++-- pyproject.toml | 2 ++ rag/upload.py | 31 +++++++++++++++++++++++++++++++ 4 files changed, 45 insertions(+), 6 deletions(-) create mode 100644 rag/upload.py diff --git a/README.md b/README.md index c478830..3fd5569 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Retrieval Augmented Generation -RAG with ollama (and optionally cohere) and qdrant. This is basically a glorified -ctrl+f. +RAG with ollama (and optionally cohere) and qdrant. This is basically a glorified +(bloated) `ctrl+f`. ## Usage @@ -65,7 +65,9 @@ Get an API from their website. #### 6.1 Prerequisites -##### 6.2 Python Environment +Both databases need to be running, as well as ollama. + +##### 6.1.1 Python Environment Activate the poetry shell: @@ -91,7 +93,11 @@ streamlit run rag/ui.py #### 6.5 Upload Multiple Documents -tbc +To upload an entire directory (and subdirectories) of pdfs, simply run: + +```sh +python rag/upload.py -d /path/to/pdfs +``` ### Notes diff --git a/poetry.lock b/poetry.lock index 8a2b3a2..856fae7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -388,7 +388,7 @@ files = [ [[package]] name = "cffi" version = "1.16.0" -description = "Foreign Function AbstractGenerator for Python calling C code." +description = "Foreign Function Interface for Python calling C code." 
"""Upload every PDF below a directory through the project's ``Retriever``.

This is the content of ``rag/upload.py``.  Apart from the project-local
``Retriever`` it depends on ``click`` (option parsing) and ``tqdm`` (progress
bar), both of which are declared as dependencies in ``pyproject.toml``.

Usage::

    python rag/upload.py -d /path/to/pdfs
"""

from pathlib import Path

import click
from dotenv import load_dotenv
from loguru import logger as log
from tqdm import tqdm

from rag.retriever.retriever import Retriever

# Replace loguru's default sink with one that prints through ``tqdm.write`` so
# log lines emitted during the upload loop do not break the progress bar.
# ``end=""`` is passed so tqdm does not append a line ending of its own
# (presumably because loguru already terminates each message with a newline).
log.remove()
log.add(lambda msg: tqdm.write(msg, end=""), colorize=True)


@click.command()
@click.option(
    "-d",
    "--directory",
    help="The full path to the root directory containing pdfs to upload",
    # The option names a root *directory*, so reject regular files up front.
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
    # Without a value ``Path(directory)`` would fail with a bare TypeError;
    # let click report a proper usage error instead.
    required=True,
)
def main(directory: str) -> None:
    """Add every ``*.pdf`` found under ``directory`` to the retriever.

    The directory is searched recursively (``**/*.pdf``) and each match is
    passed to ``Retriever.add_pdf``.

    Args:
        directory: Path of the root directory to search.  click has already
            verified that it exists and is a directory.

    Raises:
        Anything raised by ``Retriever()`` or ``Retriever.add_pdf`` propagates
        unchanged and stops the run; remaining files are not attempted.
    """
    log.info(f"Uploading pdfs found in directory {directory}...")
    retriever = Retriever()

    # Materialise the matches instead of handing tqdm a generator: with a
    # known length tqdm can show a real progress bar, and sorting makes the
    # upload order deterministic.
    pdfs = sorted(Path(directory).glob("**/*.pdf"))
    if not pdfs:
        log.warning(f"No pdfs found in directory {directory}")

    for path in tqdm(pdfs, unit="pdf"):
        retriever.add_pdf(path=path)


if __name__ == "__main__":
    # Load variables from a .env file into the environment before the command
    # (and therefore the Retriever) is created.
    load_dotenv()
    main()