# rag/retriever/encoder.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67

from dataclasses import dataclass
import hashlib
import os
from pathlib import Path
from typing import Dict, List, Union

import ollama
from langchain_core.documents import Document
from loguru import logger as log
from qdrant_client.http.models import StrictFloat
from tqdm import tqdm

from .vector import Documents, Point

@dataclass
class Query:
    query: str


# Union of the input kinds accepted as the argument of Encoder.encode.
Input = Query | Documents


class Encoder:
    def __init__(self) -> None:
        self.model = os.environ["ENCODER_MODEL"]
        self.preamble = (
            "Represent this sentence for searching relevant passages: "
            if "mxbai-embed-large" in model_name
            else ""
        )

    def __get_source(self, metadata: Dict[str, str]) -> str:
        source = metadata["source"]
        return Path(source).name

    def __encode(self, prompt: str) -> List[StrictFloat]:
        return list(ollama.embeddings(model=self.model, prompt=prompt)["embedding"])

    # TODO: move this to vec db and just return the embeddings
    # TODO: use late chunking here
    def __encode_document(self, chunks: List[Document]) -> List[Point]:
        log.debug("Encoding document...")
        return [
            Point(
                id=hashlib.sha256(
                    chunk.page_content.encode(encoding="utf-8")
                ).hexdigest(),
                vector=list(self.__encode(chunk.page_content)),
                payload={
                    "text": chunk.page_content,
                    "source": self.__get_source(chunk.metadata),
                },
            )
            for chunk in tqdm(chunks)
        ]

    def __encode_query(self, query: str) -> List[StrictFloat]:
        log.debug(f"Encoding query: {query}")
        query = self.preamble + query
        return self.__encode(query)

    def encode(self, x: Input) -> Union[List[StrictFloat], List[Point]]:
        match x:
            case Query(query):
                return self.__encode_query(query)
            case Documents(documents):
                return self.__encode_document(documents)