Source code for curses_fzf.scoring

from __future__ import annotations

import re
from typing import Optional, List, Tuple, Set

RE_WORD = re.compile(r"\S+")


[docs] class ScoringResult(): """ ScoringResult encapsules information and toolage to help with your own scoring functions. The only important field is :attr:`~ScoringResult.score`, :class:`~curses_fzf.FuzzyFinder` will take it into account when it decides which items to show and in which order. If it is ``0`` the element will not be displayed in :attr:`FuzzyFinder.filtered` list. Positive values will sort the items from high to low :attr:`~ScoringResult.score`. The second field to notice is :attr:`~ScoringResult.matches`, which is a list of tuples containing the starting index of all matches inside the :attr:`~ScoringResult.candidate` string along with the matched substring. If set, this information will be used by :class:`~curses_fzf.FuzzyFinder` to highlight the matched substrings in the :attr:`FuzzyFinder.filtered` list of query results. The intended way to set those fields is :meth:`~ScoringResult.add_match`. It adds the given score to :attr:`ScoringResult.score` and the given position and match as a tuple to :attr:`ScoringResult.matches`. However feel free to directly manipulate these fields as your scoring logic requires. Especially setting :attr:`~ScoringResult.score` to ``0`` is a common way to filter an item out of the results. Args: query (int): This is the :attr:`FuzzyFinder.query` string entered by the user. candidate (int): This is a single item's :meth:`~FuzzyFinder.display` representation from the :attr:`FuzzyFinder.all_items` list. """ SEPARATORS: Set[str] = set(" \t/\\_-.:;|,()[]{}<>\"'`") """ A set of characters that commonly indicate "word boundaries" in generic text and paths. Those are chosen similarly to fzf's original heuristics. """ def __init__(self, query: str, candidate: str) -> None: self.query: str = query """ The original query string parameter as given to the constructor. This is the :attr:`FuzzyFinder.query` string entered by the user. """ self.query_lower: str = query.lower() """ The :attr:`~ScoringResult.query` string converted to lowercase. """ self.query_words_with_index: List[Tuple[str, int]] = [ (m.group(), m.start()) for m in RE_WORD.finditer(self.query_lower)] """ The :attr:`~ScoringResult.query_lower` string split on whitespaces. Each element is a tuple with the word and its starting index in the original :attr:`~ScoringResult.query` string. """ self.candidate: str = candidate """ The original candidate string parameter as given to the constructor. This is a single item's :meth:`~FuzzyFinder.display` representation from the :attr:`FuzzyFinder.all_items` list. """ self.candidate_lower: str = candidate.lower() """ The :attr:`~ScoringResult.candidate` string converted to lowercase. """ self.candidate_words_with_index: List[Tuple[str, int]] = [ (m.group(), m.start()) for m in RE_WORD.finditer(self.candidate_lower)] """ The :attr:`~ScoringResult.candidate_lower` string split on whitespaces. Each element is a tuple with the word and its starting index in the original :attr:`~ScoringResult.candidate` string. """ self.score: int = 0 """ The resulting fuzzy score. This is the field that :class:`~curses_fzf.FuzzyFinder` will take into account when it decides which items to show. A score of ``0`` will filter the item out of the resulting :attr:`FuzzyFinder.filtered` list. Remaining items are sorted from high to low score. The :meth:`~ScoringResult.add_match` method is meant to add to score, but feel free to directly manipulate this field as your scoring logic requires. """ self.matches: List[Tuple[int, str]] = [] """ A list of tuples representing each substring match of the :attr:`~ScoringResult.query` in the :attr:`~ScoringResult.candidate` string. The first tuple entry is the starting index of the match in the original :attr:`~ScoringResult.candidate` string. The second entry is the matched substring. The :meth:`~ScoringResult.add_match` method is meant to add to matches, but feel free to directly manipulate this field as your scoring logic requires. """ self._already_matched_words: Set[int] = set() """ A set of indices of candidate words that have already been matched to a query word. """ def __int__(self) -> int: return int(self.score) def __lt__(self, other: object) -> bool: if isinstance(other, ScoringResult): return self.score < other.score try: return self.score < int(other) # type: ignore[arg-type] except Exception: return NotImplemented def __gt__(self, other: object) -> bool: if isinstance(other, ScoringResult): return self.score > other.score try: return self.score > int(other) # type: ignore[arg-type] except Exception: return NotImplemented def __eq__(self, other: object) -> bool: if isinstance(other, ScoringResult): return self.score == other.score try: return self.score == int(other) # type: ignore[arg-type] except Exception: return NotImplemented
[docs] def add_match(self, position: int, match: str, score: int) -> None: """ Add position and match as a tuple to :attr:`ScoringResult.matches`. The given score is added to :attr:`ScoringResult.score`. This is the intended way to add to the :attr:`~ScoringResult.score` and keep track of the :attr:`~ScoringResult.matches`, but feel free to directly manipulate these fields as your scoring logic requires. Especially setting :attr:`~ScoringResult.score` to ``0`` is a common way to filter an item out of the results. Args: position (int): The starting index of the match in the original :attr:`~ScoringResult.candidate` string. match (str): The matched substring, starting at the given position. score (int): This value is added to :attr:`ScoringResult.score`. """ self.score += score self.matches.append((position, match))
[docs] def find_best_word_match(self, word: str) -> Optional[Tuple[str, int, int, int]]: """ Find the best match for the given query word among the :attr:`~ScoringResult.candidate_words_with_index`. The best match is the one that is closest to a full word (highest percentage of the word matched). If multiple matches have the same percentage, the one with the match closer to the word's beginning is preferred. If a match is found, the index of the matched word in the :attr:`~ScoringResult.candidate` is added to :attr:`ScoringResult._already_matched_words` to prevent it from being matched again to another query word. Args: word (str): The query word (an item from :attr:`~ScoringResult.query_words_with_index`) for which to find the best match among :attr:`~ScoringResult.candidate_words_with_index`. Returns: Optional[Tuple[str, int, int, int]]: A tuple containing: - the matched word from :attr:`~ScoringResult.candidate_words_with_index` - its position inside :attr:`~ScoringResult.candidate` string - the position of the match inside the matched word - the percentage of the :attr:`~ScoringResult.candidate` word the :attr:`~ScoringResult.query` word actually matches Returns ``None`` if no match was found at all. """ best_match = None length_matched_word = -1 match_position_in_word = -1 for c_tuple in self.candidate_words_with_index: # don't consider the same word twice for different query words if c_tuple[1] in self._already_matched_words: continue pos = c_tuple[0].find(word) if pos != -1: len_found_word = len(c_tuple[0]) # the best match is the one that is closest to a full word if len_found_word < length_matched_word or \ length_matched_word == -1: length_matched_word = len_found_word match_position_in_word = pos best_match = c_tuple # if 2 findings have the same length prefer the one with the # match closer to the word's beginning elif len_found_word == length_matched_word and \ pos < match_position_in_word: match_position_in_word = pos best_match = c_tuple if best_match is None: return None # remember the word's index in candidate self._already_matched_words.add(best_match[1]) return (best_match[0], best_match[1], match_position_in_word, int(100 * len(word) / length_matched_word))
[docs] def is_boundary(self, position: int) -> bool: """ Check if the given position in the :attr:`~ScoringResult.candidate` string is a boundary. A boundary is defined as: - the beginning of the string - a position after a separator character (space, tab, slash, dash, dot, etc.) - a camelCase transition (lowercase followed by uppercase) - an alpha<->digit transition Those criteria are chosen similarly to fzf's original heuristics. Args: position (int): The index in the :attr:`~ScoringResult.candidate` string to check. Returns: bool: ``True`` if the position is a boundary, ``False`` otherwise. """ if position <= 0: return True prev = self.candidate[position - 1] cur = self.candidate[position] if prev in self.SEPARATORS: return True if prev.islower() and cur.isupper(): return True if prev.isalpha() and cur.isdigit(): return True if prev.isdigit() and cur.isalpha(): return True return False
[docs] def check_query_empty(self) -> bool: """ Check if the :attr:`~ScoringResult.query` string is empty. If it is empty, the score is set to ``100`` to keep all items in the original order. Returns: bool: ``True`` if the :attr:`~ScoringResult.query` string is empty, ``False`` otherwise. """ if not self.query: self.score = 100 return True return False
[docs] def merge_positions_to_substrings(self, positions: List[int]) -> List[Tuple[int, str]]: """ Convert matched character positions to (start, substring) tuples for :meth:`~ScoringResult.add_match` or :attr:`~ScoringResult.matches`. Example: positions [3,4,5, 10,11] -> [(3, candidate[3:6]), (10, candidate[10:12])] """ if not positions: return [] result = [] start = prev = positions[0] for position in positions[1:]: if position == prev + 1: prev = position continue result.append((start, self.candidate[start:prev + 1])) start = prev = position result.append((start, self.candidate[start:prev + 1])) return result
[docs] def greedy_match_positions(self) -> List[int]: """ Find the positions of each single query characters in the candidate string using a greedy forward scan. This is a common first step in many fuzzy matching algorithms and can be used to quickly filter out candidates that don't match at all or to get a baseline score for more complex scoring functions. Returns: List[int]: A list of positions where the query characters match in the candidate. """ positions: List[int] = [] start = 0 for query_char in self.query_lower: pos = self.candidate_lower.find(query_char, start) if pos == -1: return [] positions.append(pos) start = pos + 1 return positions
[docs] def scoring_full_words(query: str, candidate: str) -> ScoringResult: """ The :attr:`~curses_fzf.ScoringResult.query` and the :attr:`~curses_fzf.ScoringResult.candidate` string both get lowercased and split on whitespaces (see :attr:`~curses_fzf.ScoringResult.query_words_with_index` & :attr:`~curses_fzf.ScoringResult.candidate_words_with_index`). Each query word is supposed to find a unique match among the words of the candidate. The closer a match is to a full word the higher the score. An additional bonus is given, if the match is at the beginning of a word. The query words may appear in the candidate in any order, however if the original order is found a small bonus will be granted. """ sr = ScoringResult(query, candidate) if sr.check_query_empty(): return sr for q_word, q_word_index in sr.query_words_with_index: best_match = sr.find_best_word_match(q_word) # all query words need to find a match to keep the candidate if best_match is None: sr.score = 0 return sr match_position_in_candidate = best_match[1] + best_match[2] # score is the word match percentage multiplied by a bonus if the # match starts at the word's beginning score = best_match[3] if best_match[2] == 0: score *= 1.5 sr.add_match(match_position_in_candidate, q_word, int(score)) # small bonus if all matches are in the exact order of the query if all(sr.matches[i][0] < sr.matches[i+1][0] for i in range(len(sr.matches) - 1)): sr.score = int(sr.score * 1.2) # normalize the score by the number of matches (= number of query words) sr.score = int(sr.score / len(sr.matches)) return sr
[docs] def scoring_fzf(query: str, candidate: str) -> ScoringResult: # noqa: C901 """ A fzf-like fuzzy scoring. This is the default scoring function used by :class:`~curses_fzf.FuzzyFinder` if no other scoring function is provided. The :attr:`~ScoringResult.query` characters are matched as a subsequence against the :attr:`~ScoringResult.candidate` (characters must appear in order). There are bonuses for consecutive matches, matches on boundaries and matches early in the candidate. """ sr = ScoringResult(query, candidate) if sr.check_query_empty(): return sr # this algorithm may miss some matches if a longer subsequence is found # before a shorter one that would allow to match the rest of the query later # on, so we calculate a greedy match as a fallback greedy_matches = sr.merge_positions_to_substrings( sr.greedy_match_positions()) if not greedy_matches: sr.score = 0 return sr MATCH_BASE_SCORE = 100 # starting score if query matches (once) BOUNDARY_MATCH_WEIGHT = 8.0 # bonus factor for matches on boundaries EARLY_MATCH_WEIGHT = 5.0 # bonus factor for early matches WORD_COVERAGE_WEIGHT = 10.0 # bonus factor for word coverage # find the longest matching subsequences of the query in the candidate in # original order, every match gets a fixed base score to beginn with query = sr.query_lower start = 0 sr.score = MATCH_BASE_SCORE while query: for i in range(len(query), 0, -1): query_sequence = query[:i] pos = sr.candidate_lower.find(query_sequence, start) if pos != -1: sr.add_match(pos, sr.candidate[pos:pos+i], 0) query = query[i:] start = pos + i break if i == 1: # no match found, but since we didn't return before, there # actually is a match, so we use the greedy match as a fallback sr.matches = greedy_matches query = "" # add some bonus points total_len = len(candidate) for start_pos, match_str in sr.matches: match_len = len(match_str) # bonus on word boundary if sr.is_boundary(start_pos): sr.score += int(BOUNDARY_MATCH_WEIGHT * match_len) # bonus for early matches early_factor = (total_len - start_pos) / total_len # 0.x .. 1 sr.score += int(EARLY_MATCH_WEIGHT * match_len * early_factor) # bonus for word coverage for word, word_start in sr.candidate_words_with_index: word_end = word_start + len(word) if word_start <= start_pos < word_end: coverage = match_len / len(word) # 0.x .. 1 sr.score += int(WORD_COVERAGE_WEIGHT * match_len * coverage) break # fewer match groups should be ranked higher sr.score //= len(sr.matches) return sr