Source code for scapi.database.index.search

from collections import defaultdict
from typing import Any, ClassVar, NamedTuple, TypeAlias

from . import parsing, tokenize


# Inverted index: N-gram -> IDs of the entities whose texts produced it.
Index: TypeAlias = dict[str, set[str]]
# Set of indexed N-grams (the keys of the inverted index).
Keys: TypeAlias = set[str]
# Number of distinct N-grams recorded per entity ID.
Counts: TypeAlias = dict[str, int]
# Payload of a single entity as a string-keyed mapping.
Entity: TypeAlias = dict[str, Any]
# Entity storage: entity ID -> entity payload.
Entities: TypeAlias = dict[str, Entity]



[docs]
class Lookup(NamedTuple):
    """A single search hit: the matched entity together with its relevance."""

    id: str
    """Identifier of the matched entity."""

    data: dict[str, Any]
    """Stored payload of the matched entity."""

    score: float = 0.0
    """How closely the entity matches the query, from `0.0` to `1.0`."""




[docs]
class SearchIndex:
    """In-memory search index for entity lookup with N-gram tokenization.

    `build` tokenizes the texts of every parsed entity into N-grams and stores
    an inverted index (N-gram -> entity IDs). `search` tokenizes the query the
    same way and ranks entities by how many N-grams they share with it.
    """

    SCORE_ROUND: ClassVar[int] = 2
    """Number of decimals for score rounding."""

    JACCARD_WEIGHT: ClassVar[float] = 0.8
    """Weight factor for Jaccard similarity in scoring."""

    def __init__(self):
        self._index: Index = {}
        """Inverted index mapping N-grams to entity IDs."""

        self._keys: Keys = set()
        """Indexed N-grams (the keys of the inverted index)."""

        self._entities: Entities = {}
        """Entity storage (ID -> JSON{})."""

        self._counts: Counts = {}
        """Number of distinct N-grams per entity ID."""

    def build(self, path: str, data: Any):
        """
        Build index from structured data at given path.

        State from a previous build is replaced, not merged.

        Args:
            path: Key used to select the parser via `parsing.get`.
            data: Structured input passed to the selected parser, which yields
                `(entity_id, entity, texts)` triples.
        """

        index: Index = defaultdict(set)
        entity_ngrams: dict[str, set[str]] = defaultdict(set)
        entities: Entities = {}

        parser = parsing.get(path)

        for entity_id, entity, texts in parser(data):
            # a repeated entity id keeps its last payload; its N-grams accumulate
            entities[entity_id] = entity

            # tokenize & indexing, store mapping I[ngram] -> {entity_id}
            for text in texts:
                for ngram in tokenize.ngramize(text):
                    index[ngram].add(entity_id)
                    entity_ngrams[entity_id].add(ngram)

        # publish plain containers so later lookups cannot create entries
        self._index = dict(index)
        self._keys = set(self._index)
        self._entities = entities
        self._counts = {entity_id: len(ngrams) for entity_id, ngrams in entity_ngrams.items()}

    def get(self, entity_id: str) -> Entity | None:
        """Retrieve entity data by ID, or `None` if the ID is unknown."""

        return self._entities.get(entity_id)

    def search(self, query: str, threshold: float) -> list[Lookup]:
        """
        Search entities with similarity scoring.

        Args:
            query: Search text.
            threshold: Minimum similarity score (`0.0`-`1.0`), compared
                against the rounded score.

        Returns:
            List of matched entities sorted by descending score; entities with
            equal scores are ordered by ID so the result is reproducible.
        """

        if not query:
            return []

        # tokenize user search query
        ngrams = tokenize.ngramize(query)

        if not ngrams:
            return []

        # count how many N-grams from the query (Q) hit each indexed entity (I)
        # hits: {entity_id: count of matching N-grams}
        hits: dict[str, int] = defaultdict(int)

        # soft matching (OR): any shared N-gram makes an entity a candidate;
        # `ngrams` has to be a set here because it is intersected with the keys
        for ngram in ngrams & self._keys:
            for entity_id in self._index[ngram]:
                hits[entity_id] += 1

        # query ngrams count
        q_num_ngrams = len(ngrams)

        # scoring & filtering hits
        results: list[Lookup] = []

        for entity_id, count in hits.items():
            score = self._score(count, q_num_ngrams, self._counts.get(entity_id, 0))

            # filter by threshold
            if score >= threshold:
                item = self._entities.get(entity_id, {})
                results.append(Lookup(id=entity_id, data=item, score=score))

        # Best score first. Ties are frequent after rounding and would otherwise
        # follow set iteration order, which changes between processes because
        # of string hash randomization; break them by entity ID.
        results.sort(key=lambda r: (-r.score, r.id))
        return results

    def _score(self, shared: int, query_size: int, entity_size: int) -> float:
        """
        Return the rounded similarity for one query/entity pair.

        Args:
            shared: Number of N-grams common to query and entity, |Q ∩ I|.
            query_size: Number of query N-grams, |Q|.
            entity_size: Number of entity N-grams, |I|.

        Returns:
            max(|Q ∩ I| / min(|Q|, |I|), w × |Q ∩ I| / (|Q| + |I| − |Q ∩ I|))
            rounded to `SCORE_ROUND` decimals, where w is `JACCARD_WEIGHT`.
        """

        min_size = min(query_size, entity_size)
        union_size = query_size + entity_size - shared

        # an empty side contributes 0.0 instead of dividing by zero
        overlap = shared / min_size if min_size > 0 else 0.0
        jaccard = shared / union_size if union_size > 0 else 0.0

        return round(max(overlap, jaccard * self.JACCARD_WEIGHT), self.SCORE_ROUND)

    def __repr__(self):
        return f"{self.__class__.__name__}(entities={len(self._entities)}, index={len(self._index)})"