Refactored SemHash, moved more functions to utils

Pringled · Pringled · commit 34c1f9cfcac0 · 2026-01-02T19:53:38.000+01:00
diff --git a/semhash/datamodels.py b/semhash/datamodels.py
@@ -5,14 +5,11 @@
 from collections.abc import Hashable, Sequence
 from dataclasses import dataclass, field
 from functools import cached_property
-from typing import Any, Generic, TypeAlias, TypeVar
+from typing import Any, Generic
 
 from frozendict import frozendict
 
-from semhash.utils import to_frozendict
-
-Record = TypeVar("Record", str, dict[str, Any])
-DuplicateList: TypeAlias = list[tuple[Record, float]]
+from semhash.utils import DuplicateList, Record, to_frozendict
 
 
 @dataclass
diff --git a/semhash/semhash.py b/semhash/semhash.py
@@ -14,7 +14,14 @@
 from semhash.datamodels import DeduplicationResult, DuplicateRecord, FilterResult, Record
 from semhash.index import Index
 from semhash.records import add_scores_to_records, map_deduplication_result_to_strings
-from semhash.utils import Encoder, compute_candidate_limit, to_frozendict
+from semhash.utils import (
+    Encoder,
+    compute_candidate_limit,
+    featurize,
+    prepare_records,
+    remove_exact_duplicates,
+    to_frozendict,
+)
 
 
 class SemHash(Generic[Record]):
@@ -33,95 +40,6 @@ def __init__(self, index: Index, model: Encoder, columns: Sequence[str], was_str
         self._was_string = was_string
         self._ranking_cache: FilterResult | None = None
 
-    @staticmethod
-    def _featurize(
-        records: Sequence[dict[str, str]],
-        columns: Sequence[str],
-        model: Encoder,
-    ) -> np.ndarray:
-        """
-        Featurize a list of records using the model.
-
-        :param records: A list of records.
-        :param columns: Columns to featurize.
-        :param model: An Encoder model.
-        :return: The embeddings of the records.
-        """
-        # Extract the embeddings for each column across all records
-        embeddings_per_col = []
-        for col in columns:
-            col_texts = [r[col] for r in records]
-            col_emb = model.encode(col_texts)
-            embeddings_per_col.append(np.asarray(col_emb))
-
-        return np.concatenate(embeddings_per_col, axis=1)
-
-    @classmethod
-    def _remove_exact_duplicates(
-        cls,
-        records: Sequence[dict[str, str]],
-        columns: Sequence[str],
-        reference_records: list[list[dict[str, str]]] | None = None,
-    ) -> tuple[list[dict[str, str]], list[tuple[dict[str, str], list[dict[str, str]]]]]:
-        """
-        Remove exact duplicates based on the unpacked string representation of each record.
-
-        If reference_records is None, the function will only check for duplicates within the records list.
-
-        :param records: A list of records to check for exact duplicates.
-        :param columns: Columns to unpack.
-        :param reference_records: A list of records to compare against. These are already unpacked
-        :return: A list of deduplicated records and a list of duplicates.
-        """
-        deduplicated = []
-        duplicates = []
-
-        column_set = set(columns)
-        # Build a seen set from reference_records if provided
-        seen: defaultdict[frozendict[str, str], list[dict[str, str]]] = defaultdict(list)
-        if reference_records is not None:
-            for record_set in reference_records:
-                key = to_frozendict(record_set[0], column_set)
-                seen[key] = list(record_set)
-        in_one_set = reference_records is None
-
-        for record in records:
-            frozen_record = frozendict({k: v for k, v in record.items() if k in column_set})
-            if duplicated_records := seen.get(frozen_record):
-                duplicates.append((record, duplicated_records))
-            else:
-                deduplicated.append(record)
-                # Only add current documents to seen if no reference set is used
-                if in_one_set:
-                    seen[frozen_record].append(record)
-
-        return deduplicated, duplicates
-
-    @staticmethod
-    def _prepare_records(
-        records: Sequence[Record], columns: Sequence[str] | None
-    ) -> tuple[list[dict[str, str]], Sequence[str], bool]:
-        """
-        Validate and prepare records for processing.
-
-        :param records: A list of records (strings or dictionaries).
-        :param columns: Columns to use if records are dictionaries.
-        :return: Tuple of (dict_records, columns, was_string).
-        :raises ValueError: If columns are not provided for dictionary records.
-        """
-        if columns is None and isinstance(records[0], dict):
-            raise ValueError("Columns must be specified when passing dictionaries.")
-
-        if isinstance(records[0], str):
-            columns = ["text"]
-            dict_records: list[dict[str, str]] = [{"text": str(record)} for record in records]
-            was_string = True
-        else:
-            dict_records = list(records)
-            was_string = False
-
-        return dict_records, columns, was_string
-
     @classmethod
     def from_embeddings(
         cls,
@@ -152,10 +70,10 @@ def from_embeddings(
             raise ValueError(f"Number of embeddings ({len(embeddings)}) must match number of records ({len(records)})")
 
         # Prepare and validate records
-        dict_records, columns, was_string = cls._prepare_records(records, columns)
+        dict_records, columns, was_string = prepare_records(records, columns)
 
         # Remove exact duplicates
-        deduplicated_records, exact_duplicates = cls._remove_exact_duplicates(dict_records, columns)
+        deduplicated_records, exact_duplicates = remove_exact_duplicates(dict_records, columns)
 
         # Build items list. Each item is a list of exact duplicates
         items: list[list[dict[str, str]]] = [[record] for record in deduplicated_records]
@@ -208,14 +126,14 @@ def from_records(
         :return: A SemHash instance with a fitted vicinity index.
         """
         # Prepare and validate records
-        dict_records, columns, was_string = cls._prepare_records(records, columns)
+        dict_records, columns, was_string = prepare_records(records, columns)
 
         # If no model is provided, load the default model
         if model is None:
             model = StaticModel.from_pretrained("minishlab/potion-base-8M")
 
         # Remove exact duplicates
-        deduplicated_records, duplicates = cls._remove_exact_duplicates(dict_records, columns)
+        deduplicated_records, duplicates = remove_exact_duplicates(dict_records, columns)
 
         col_set = set(columns)
         duplicate_map = defaultdict(list)
@@ -231,7 +149,7 @@ def from_records(
             items.append(i)
 
         # Create embeddings for deduplicated records only
-        embeddings = cls._featurize(deduplicated_records, columns, model)
+        embeddings = featurize(deduplicated_records, columns, model)
 
         # Build the Vicinity index
         backend = ann_backend if use_ann else Backend.BASIC
@@ -263,7 +181,7 @@ def deduplicate(
         dict_records = self._validate_if_strings(records)
 
         # Remove exact duplicates before embedding
-        dict_records, exact_duplicates = self._remove_exact_duplicates(
+        dict_records, exact_duplicates = remove_exact_duplicates(
             records=dict_records, columns=self.columns, reference_records=self.index.items
         )
         duplicate_records = []
@@ -279,7 +197,7 @@ def deduplicate(
             )
 
         # Compute embeddings for the new records
-        embeddings = self._featurize(records=dict_records, columns=self.columns, model=self.model)
+        embeddings = featurize(records=dict_records, columns=self.columns, model=self.model)
         # Query the fitted index
         results = self.index.query_threshold(embeddings, threshold=threshold)
 
@@ -536,7 +454,7 @@ def _rank_by_average_similarity(
         :return: A FilterResult containing the ranking (records sorted and their average similarity scores).
         """
         dict_records = self._validate_if_strings(records)
-        embeddings = self._featurize(records=dict_records, columns=self.columns, model=self.model)
+        embeddings = featurize(records=dict_records, columns=self.columns, model=self.model)
         results = self.index.query_top_k(embeddings, k=100, vectors_are_in_index=False)
 
         # Compute the average similarity for each record.
@@ -600,7 +518,7 @@ def _diversify(
         if not candidates:
             return FilterResult(selected=[], filtered=[], scores_selected=[], scores_filtered=[])
 
-        embeddings = self._featurize(records=candidates, columns=self.columns, model=self.model)
+        embeddings = featurize(records=candidates, columns=self.columns, model=self.model)
         result = diversify(
             embeddings=embeddings,
             scores=np.array(relevance),
diff --git a/semhash/utils.py b/semhash/utils.py
@@ -1,9 +1,14 @@
+from collections import defaultdict
 from collections.abc import Sequence
-from typing import Any, Protocol
+from typing import Any, Protocol, TypeAlias, TypeVar
 
 import numpy as np
 from frozendict import frozendict
 
+# Shared type definitions: a record is a str or a dict[str, Any]; DuplicateList pairs records with floats
+Record = TypeVar("Record", str, dict[str, Any])
+DuplicateList: TypeAlias = list[tuple[Record, float]]
+
 
 class Encoder(Protocol):
     """An encoder protocol for SemHash."""
@@ -54,3 +59,91 @@ def compute_candidate_limit(
     # 4) enforce upper bound (and never exceed the dataset)
     limit = min(limit, max_candidates, total)
     return limit
+
+
+def featurize(
+    records: Sequence[dict[str, str]],
+    columns: Sequence[str],
+    model: Encoder,
+) -> np.ndarray:
+    """
+    Embed each column of the records with the model and concatenate the results along axis 1.
+
+    :param records: A list of records; each record is indexed with every name in columns.
+    :param columns: Columns to featurize (at least one); model.encode is called once per column, in order.
+    :param model: An Encoder model.
+    :return: The per-column embeddings, joined with np.concatenate along axis 1.
+    """
+    # Encode one column at a time (all records at once); the per-column arrays are joined below
+    embeddings_per_col = []
+    for col in columns:
+        col_texts = [r[col] for r in records]
+        col_emb = model.encode(col_texts)
+        embeddings_per_col.append(np.asarray(col_emb))
+
+    return np.concatenate(embeddings_per_col, axis=1)
+
+
+def remove_exact_duplicates(
+    records: Sequence[dict[str, str]],
+    columns: Sequence[str],
+    reference_records: list[list[dict[str, str]]] | None = None,
+) -> tuple[list[dict[str, str]], list[tuple[dict[str, str], list[dict[str, str]]]]]:
+    """
+    Remove records whose values in the given columns exactly match an already seen record.
+
+    If reference_records is None, duplicates are detected within records; otherwise only against reference_records.
+
+    :param records: A list of records to check for exact duplicates.
+    :param columns: Columns that form the comparison key; other keys of a record are ignored.
+    :param reference_records: Groups of records to compare against; each group's key is built from its first record.
+    :return: The kept records, and a (record, matched records) tuple for every record that was dropped.
+    """
+    deduplicated = []
+    duplicates = []
+
+    column_set = set(columns)
+    # Map each key to the records already seen with it; pre-filled from reference_records if provided
+    seen: defaultdict[frozendict[str, str], list[dict[str, str]]] = defaultdict(list)
+    if reference_records is not None:
+        for record_set in reference_records:
+            key = to_frozendict(record_set[0], column_set)
+            seen[key] = list(record_set)
+    in_one_set = reference_records is None
+
+    for record in records:
+        frozen_record = frozendict({k: v for k, v in record.items() if k in column_set})
+        if duplicated_records := seen.get(frozen_record):
+            duplicates.append((record, duplicated_records))
+        else:
+            deduplicated.append(record)
+            # Only track kept records when deduplicating within a single set (no reference set is used)
+            if in_one_set:
+                seen[frozen_record].append(record)
+
+    return deduplicated, duplicates
+
+
+def prepare_records(
+    records: Sequence[Record], columns: Sequence[str] | None
+) -> tuple[list[dict[str, str]], Sequence[str], bool]:
+    """
+    Validate records and convert them to a list of dictionaries; the record type is decided by records[0].
+
+    :param records: A non-empty list of records (strings or dictionaries); records[0] is read unconditionally.
+    :param columns: Columns to use if records are dictionaries; replaced by ["text"] for string records.
+    :return: Tuple of (dict_records, columns, was_string); string records are wrapped as {"text": record}.
+    :raises ValueError: If columns are not provided for dictionary records.
+    """
+    if columns is None and isinstance(records[0], dict):
+        raise ValueError("Columns must be specified when passing dictionaries.")
+
+    if isinstance(records[0], str):
+        columns = ["text"]
+        dict_records: list[dict[str, str]] = [{"text": str(record)} for record in records]
+        was_string = True
+    else:
+        dict_records = list(records)
+        was_string = False
+
+    return dict_records, columns, was_string
diff --git a/tests/test_semhash.py b/tests/test_semhash.py
@@ -189,22 +189,24 @@ def test_self_filter_outliers(use_ann: bool, model: Encoder, train_texts: list[s
 def test__diversify(monkeypatch: pytest.MonkeyPatch) -> None:
     """Test the _diversify method."""
     # Create a dummy SemHash instance
-    semhash = SemHash(index=None, model=None, columns=["text"], was_string=True)  # type: ignore
+    from semhash import semhash as semhash_module
+
+    semhash_instance = SemHash(index=None, model=None, columns=["text"], was_string=True)  # type: ignore
     # Prepare a fake ranking with three records
     records = ["a", "b", "c"]
     scores = [3.0, 2.0, 1.0]
     ranking = FilterResult(selected=records, filtered=[], scores_selected=scores, scores_filtered=[])
     # Create dummy embeddings for the records
     embeddings = np.array([[1.0, 0.0], [0.5, 0.5], [0.0, 1.0]])
     # Monkeypatch featurize to return the dummy embeddings
-    monkeypatch.setattr(semhash, "_featurize", lambda records, columns, model: embeddings)
+    monkeypatch.setattr(semhash_module, "featurize", lambda records, columns, model: embeddings)
 
     # Test diversity=0.0: pure relevance, should pick top 2 by score
-    result_rel = semhash._diversify(ranking, candidate_limit=3, selection_size=2, diversity=0.0)
+    result_rel = semhash_instance._diversify(ranking, candidate_limit=3, selection_size=2, diversity=0.0)
     assert result_rel.selected == ["a", "b"]
 
     # Test diversity=1.0: pure diversity, should first pick 'a', then pick most dissimilar: 'c'
-    result_div = semhash._diversify(ranking, candidate_limit=3, selection_size=2, diversity=1.0)
+    result_div = semhash_instance._diversify(ranking, candidate_limit=3, selection_size=2, diversity=1.0)
     assert result_div.selected == ["a", "c"]