Source code for scapi.database.index.tokenize

import re
import unicodedata


# Matches runs of characters that are neither word characters (\w: letters,
# digits, underscore) nor in the Cyrillic block U+0400-U+04FF. normalize()
# replaces every such run with a single space, which strips punctuation.
_PATTERN_NORMALIZE = re.compile(r"[^\w\u0400-\u04FF]+", re.UNICODE)
# Matches runs of whitespace; normalize() collapses each run to one space.
_PATTERN_SPACE = re.compile(r"\s+")


def normalize(text: str) -> str:
    """Normalize text: unicode normalization, lowercasing, punctuation removal.

    Steps, in order:

    1. Apply NFKC normalization so compatibility forms (full-width letters,
       ligatures, symbols such as the trademark sign) become their plain
       equivalents.
    2. Lowercase the result.
    3. Replace every run of characters matched by ``_PATTERN_NORMALIZE``
       with a single space.
    4. Collapse whitespace runs to one space and strip both ends.

    Args:
        text: Arbitrary input text.

    Returns:
        The normalized text: lowercased words separated by single spaces, or
        an empty string when nothing but separators remains.
    """
    # Normalize BEFORE lowercasing: NFKC can expand compatibility characters
    # that have no lowercase mapping of their own into uppercase letters
    # (e.g. U+2122 -> "TM", U+210C -> "H"). Lowercasing first would let those
    # capitals leak into the result.
    text = unicodedata.normalize("NFKC", text).lower()
    text = _PATTERN_NORMALIZE.sub(" ", text)
    return _PATTERN_SPACE.sub(" ", text).strip()
def words(text: str) -> set[str]:
    """Tokenize text into a set of normalized words.

    Args:
        text: Arbitrary input text; it is passed through ``normalize`` first.

    Returns:
        The distinct whitespace-separated tokens of the normalized text. The
        set is empty when the normalized text is empty.
    """
    # split() with no separator returns [] for an empty string, whereas
    # split(" ") returns [""] and would put an empty-string token in the set.
    return set(normalize(text).split())
def ngrams(token: str, n: int = 3) -> set[str]:
    """Build the set of character n-grams of a token, padded with ``#`` edges.

    The token is wrapped as ``#token#`` and every window of ``n`` consecutive
    characters of that padded string is collected. A token shorter than ``n``
    is additionally included verbatim.

    Args:
        token: The token to split; an empty token yields an empty set.
        n: Window size of each n-gram.

    Returns:
        The set of n-grams produced for ``token``.
    """
    if not token:
        return set()

    padded = f"#{token}#"
    window_count = len(padded) - n + 1
    # range() of a non-positive count is empty, so a padded string shorter
    # than n contributes no windows at all.
    grams = {padded[start : start + n] for start in range(window_count)}

    if len(token) < n:
        grams.add(token)
    return grams
def ngramize(text: str) -> set[str]:
    """Turn text into the combined n-gram set of its words, for fuzzy matching.

    Args:
        text: Arbitrary input text; it is tokenized with ``words``.

    Returns:
        The union of ``ngrams(word)`` over every word of ``text``.
    """
    per_word_grams = map(ngrams, words(text))
    # set().union() with no arguments is an empty set, so text that yields no
    # words produces an empty result.
    return set().union(*per_word_grams)