Refactored SemHash, moved more functions to utils

Pringled · Pringled · commit 5ed061d4d3b6 · 2026-01-03T11:35:06.000+01:00
diff --git a/semhash/semhash.py b/semhash/semhash.py
@@ -40,6 +40,66 @@ def __init__(self, index: Index, model: Encoder, columns: Sequence[str], was_str
         self._was_string = was_string
         self._ranking_cache: FilterResult | None = None
 
+    @classmethod
+    def from_records(
+        cls,
+        records: Sequence[Record],
+        columns: Sequence[str] | None = None,
+        use_ann: bool = True,
+        model: Encoder | None = None,
+        ann_backend: Backend | str = Backend.USEARCH,
+        **kwargs: Any,
+    ) -> SemHash:
+        """
+        Initialize a SemHash instance from records.
+
+        This removes exact duplicates, featurizes the records, and fits a vicinity index.
+
+        :param records: A list of records (strings or dictionaries).
+        :param columns: Columns to featurize if records are dictionaries.
+        :param use_ann: Whether to use approximate nearest neighbors (True) or basic search (False). Default is True.
+        :param model: (Optional) An Encoder model. If None, the default model is used (minishlab/potion-base-8M).
+        :param ann_backend: (Optional) The ANN backend to use if use_ann is True. Defaults to Backend.USEARCH.
+        :param **kwargs: Any additional keyword arguments to pass to the Vicinity index.
+        :return: A SemHash instance with a fitted vicinity index.
+        """
+        # Prepare and validate records
+        dict_records, columns, was_string = prepare_records(records, columns)
+
+        # If no model is provided, load the default model
+        if model is None:
+            model = StaticModel.from_pretrained("minishlab/potion-base-8M")
+
+        # Remove exact duplicates
+        deduplicated_records, duplicates = remove_exact_duplicates(dict_records, columns)
+
+        col_set = set(columns)
+        duplicate_map = defaultdict(list)
+        for x, _ in duplicates:
+            frozen_record = to_frozendict(x, col_set)
+            duplicate_map[frozen_record].append(x)
+
+        items: list[list[dict[str, str]]] = []
+        for record in deduplicated_records:
+            i = [record]
+            frozen_record = to_frozendict(record, col_set)
+            i.extend(duplicate_map[frozen_record])
+            items.append(i)
+
+        # Create embeddings for deduplicated records only
+        embeddings = featurize(deduplicated_records, columns, model)
+
+        # Build the Vicinity index
+        backend = ann_backend if use_ann else Backend.BASIC
+        index = Index.from_vectors_and_items(
+            vectors=embeddings,
+            items=items,
+            backend_type=backend,
+            **kwargs,
+        )
+
+        return cls(index=index, columns=columns, model=model, was_string=was_string)
+
     @classmethod
     def from_embeddings(
         cls,
@@ -54,6 +114,8 @@ def from_embeddings(
         """
         Initialize a SemHash instance from pre-computed embeddings.
 
+        This removes exact duplicates and fits a vicinity index using the provided embeddings.
+
         :param embeddings: Pre-computed embeddings as a numpy array of shape (n_records, embedding_dim).
         :param records: A list of records (strings or dictionaries) corresponding to the embeddings.
         :param model: The Encoder model used for creating the embeddings.
@@ -102,66 +164,6 @@ def from_embeddings(
 
         return cls(index=index, model=model, columns=columns, was_string=was_string)
 
-    @classmethod
-    def from_records(
-        cls,
-        records: Sequence[Record],
-        columns: Sequence[str] | None = None,
-        use_ann: bool = True,
-        model: Encoder | None = None,
-        ann_backend: Backend | str = Backend.USEARCH,
-        **kwargs: Any,
-    ) -> SemHash:
-        """
-        Initialize a SemHash instance from records.
-
-        This removes exact duplicates, featurizes the records, and fits a vicinity index.
-
-        :param records: A list of records (strings or dictionaries).
-        :param columns: Columns to featurize if records are dictionaries.
-        :param use_ann: Whether to use approximate nearest neighbors (True) or basic search (False). Default is True.
-        :param model: (Optional) An Encoder model. If None, the default model is used (minishlab/potion-base-8M).
-        :param ann_backend: (Optional) The ANN backend to use if use_ann is True. Defaults to Backend.USEARCH.
-        :param **kwargs: Any additional keyword arguments to pass to the Vicinity index.
-        :return: A SemHash instance with a fitted vicinity index.
-        """
-        # Prepare and validate records
-        dict_records, columns, was_string = prepare_records(records, columns)
-
-        # If no model is provided, load the default model
-        if model is None:
-            model = StaticModel.from_pretrained("minishlab/potion-base-8M")
-
-        # Remove exact duplicates
-        deduplicated_records, duplicates = remove_exact_duplicates(dict_records, columns)
-
-        col_set = set(columns)
-        duplicate_map = defaultdict(list)
-        for x, _ in duplicates:
-            frozen_record = to_frozendict(x, col_set)
-            duplicate_map[frozen_record].append(x)
-
-        items: list[list[dict[str, str]]] = []
-        for record in deduplicated_records:
-            i = [record]
-            frozen_record = to_frozendict(record, col_set)
-            i.extend(duplicate_map[frozen_record])
-            items.append(i)
-
-        # Create embeddings for deduplicated records only
-        embeddings = featurize(deduplicated_records, columns, model)
-
-        # Build the Vicinity index
-        backend = ann_backend if use_ann else Backend.BASIC
-        index = Index.from_vectors_and_items(
-            vectors=embeddings,
-            items=items,
-            backend_type=backend,
-            **kwargs,
-        )
-
-        return cls(index=index, columns=columns, model=model, was_string=was_string)
-
     def deduplicate(
         self,
         records: Sequence[Record],