summary | refs | log | tree | commit | diff
path: root/rag/parser
diff options
context:
space:
mode:
Diffstat (limited to 'rag/parser')
-rw-r--r-- rag/parser/__init__.py 0
-rw-r--r-- rag/parser/pdf.py 19
2 files changed, 19 insertions, 0 deletions
diff --git a/rag/parser/__init__.py b/rag/parser/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/rag/parser/__init__.py
diff --git a/rag/parser/pdf.py b/rag/parser/pdf.py
new file mode 100644
index 0000000..cb63fc4
--- /dev/null
+++ b/rag/parser/pdf.py
@@ -0,0 +1,19 @@
+from pathlib import Path
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+CHUNK_SIZE = 1024  # passed as chunk_size to the text splitter in parser()
+CHUNK_OVERLAP = 256  # passed as chunk_overlap to the same splitter (units presumably characters; confirm)
+
+
+def parser(filepath: Path):
+    """Load the PDF at ``filepath`` and return it split into overlapping chunks."""
+    # str(): some PyPDFLoader releases reportedly accept only string paths (confirm).
+    pages = PyPDFLoader(str(filepath)).load()
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
+    )
+    return splitter.split_documents(pages)
+
+
+# TODO: add parser for bytearray