diff options
author | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2024-04-05 00:25:42 +0200 |
---|---|---|
committer | Gustaf Rydholm <gustaf.rydholm@gmail.com> | 2024-04-05 00:25:42 +0200 |
commit | aaf4705804af1e73617f0b460826776be8039f8e (patch) | |
tree | a51211e7fd8a377f3d1ef9351f6ee936635ccdda /rag/parser/pdf.py | |
parent | 69bf312562c5f48402dfe4710e92b343cd258f86 (diff) |
Add pdf parser
Diffstat (limited to 'rag/parser/pdf.py')
-rw-r--r-- | rag/parser/pdf.py | 19 |
1 files changed, 19 insertions, 0 deletions
"""Load a PDF from disk and split its contents into overlapping chunks."""

import logging
from pathlib import Path

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

logger = logging.getLogger(__name__)

# Values handed to RecursiveCharacterTextSplitter as ``chunk_size`` and
# ``chunk_overlap`` respectively.
CHUNK_SIZE = 1024
CHUNK_OVERLAP = 256


def parser(filepath: Path) -> list:
    """Load the PDF at *filepath* and return it split into chunks.

    The file is read with ``PyPDFLoader`` and the resulting documents are
    split by a ``RecursiveCharacterTextSplitter`` configured with
    ``CHUNK_SIZE`` and ``CHUNK_OVERLAP``.

    Args:
        filepath: Location of the PDF file. Converted to ``str`` before it
            is given to the loader, so a plain string path works as well.

    Returns:
        The list produced by ``splitter.split_documents`` for the loaded
        documents.
    """
    # Pass a plain string: PyPDFLoader releases that type ``file_path`` as
    # ``str`` perform string operations on it and fail on a ``Path`` object.
    documents = PyPDFLoader(str(filepath)).load()
    # Log a short summary instead of printing the full document contents.
    logger.debug("Loaded %d document(s) from %s", len(documents), filepath)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    return splitter.split_documents(documents)


# TODO: add parser for bytearray