feat: faster loading if model already cached

stephantul · stephantul · commit e666b4b39cf7 · 2025-09-09T10:54:47.000+02:00
diff --git a/model2vec/distill/inference.py b/model2vec/distill/inference.py
@@ -46,7 +46,7 @@ def create_embeddings(
     :param pad_token_id: The pad token id. Used to pad sequences.
     :return: The output embeddings.
     """
-    model = model.to(device)
+    model = model.to(device)  # type: ignore
 
     out_weights: np.ndarray
     intermediate_weights: list[np.ndarray] = []
@@ -98,7 +98,7 @@ def _encode_mean_using_model(model: PreTrainedModel, encodings: dict[str, torch.
     """
     encodings = {k: v.to(model.device) for k, v in encodings.items()}
     encoded: BaseModelOutputWithPoolingAndCrossAttentions = model(**encodings)
-    out: torch.Tensor = encoded.last_hidden_state.cpu()
+    out: torch.Tensor = encoded.last_hidden_state.cpu()  # type: ignore  # typing is wrong.
     # NOTE: If the dtype is bfloat 16, we convert to float32,
     # because numpy does not suport bfloat16
     # See here: https://github.com/numpy/numpy/issues/19808
diff --git a/model2vec/hf_utils.py b/model2vec/hf_utils.py
@@ -9,6 +9,7 @@
 import numpy as np
 import safetensors
 from huggingface_hub import ModelCard, ModelCardData
+from huggingface_hub.constants import HF_HUB_CACHE
 from safetensors.numpy import save_file
 from tokenizers import Tokenizer
 
@@ -99,6 +100,7 @@ def load_pretrained(
     subfolder: str | None = None,
     token: str | None = None,
     from_sentence_transformers: bool = False,
+    skip_metadata: bool = False,
 ) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]:
     """
     Loads a pretrained model from a folder.
@@ -109,6 +111,7 @@ def load_pretrained(
     :param subfolder: The subfolder to load from.
     :param token: The huggingface token to use.
     :param from_sentence_transformers: Whether to load the model from a sentence transformers model.
+    :param skip_metadata: Whether to skip loading metadata. This is useful if you don't need the metadata.
     :raises: FileNotFoundError if the folder exists, but the file does not exist locally.
     :return: The embeddings, tokenizer, config, and metadata.
 
@@ -122,7 +125,12 @@ def load_pretrained(
         tokenizer_file = "tokenizer.json"
         config_name = "config.json"
 
-    folder_or_repo_path = Path(folder_or_repo_path)
+    if cached_folder := _get_latest_model_path(str(folder_or_repo_path)):
+        logger.info(f"Found cached model at {cached_folder}, loading from cache.")
+        folder_or_repo_path = cached_folder
+    else:
+        logger.info(f"No cached model found for {folder_or_repo_path}, loading from local or hub.")
+        folder_or_repo_path = Path(folder_or_repo_path)
 
     local_folder = folder_or_repo_path / subfolder if subfolder else folder_or_repo_path
 
@@ -139,9 +147,7 @@ def load_pretrained(
         if not tokenizer_path.exists():
             raise FileNotFoundError(f"Tokenizer file does not exist in {local_folder}")
 
-        # README is optional, so this is a bit finicky.
         readme_path = local_folder / "README.md"
-        metadata = _get_metadata_from_readme(readme_path)
 
     else:
         logger.info("Folder does not exist locally, attempting to use huggingface hub.")
@@ -150,18 +156,11 @@ def load_pretrained(
                 folder_or_repo_path.as_posix(), model_file, token=token, subfolder=subfolder
             )
         )
-
-        try:
-            readme_path = Path(
-                huggingface_hub.hf_hub_download(
-                    folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder
-                )
+        readme_path = Path(
+            huggingface_hub.hf_hub_download(
+                folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder
             )
-            metadata = _get_metadata_from_readme(Path(readme_path))
-        except Exception as e:
-            # NOTE: we don't want to raise an error here, since the README is optional.
-            logger.info(f"No README found in the model folder: {e} No model card loaded.")
-            metadata = {}
+        )
 
         config_path = Path(
             huggingface_hub.hf_hub_download(
@@ -175,10 +174,13 @@ def load_pretrained(
         )
 
     opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
-    if from_sentence_transformers:
-        embeddings = opened_tensor_file.get_tensor("embedding.weight")
+    embedding_key = "embedding.weight" if from_sentence_transformers else "embeddings"
+    embeddings = opened_tensor_file.get_tensor(embedding_key)
+
+    if not skip_metadata and readme_path.exists():
+        metadata = _get_metadata_from_readme(readme_path)
     else:
-        embeddings = opened_tensor_file.get_tensor("embeddings")
+        metadata = {}
 
     tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
     config = json.load(open(config_path))
@@ -223,3 +225,28 @@ def push_folder_to_hub(
     huggingface_hub.upload_folder(repo_id=repo_id, folder_path=folder_path, token=token, path_in_repo=subfolder)
 
     logger.info(f"Pushed model to {repo_id}")
+
+
+def _get_latest_model_path(model_id: str) -> Path | None:
+    """
+    Gets the most recently modified snapshot folder for a model from the hugging face hub cache.
+
+    Returns None if the cache holds no snapshot folder for `model_id`, so the caller must load it another way.
+    """
+    # HF_HUB_CACHE is the hub cache root imported from huggingface_hub.constants.
+    cache_dir = Path(HF_HUB_CACHE)
+    # This is specific to how HF stores the files: "org/name" becomes "models--org--name".
+    normalized = model_id.replace("/", "--")
+    repo_dir = cache_dir / f"models--{normalized}" / "snapshots"
+
+    if not repo_dir.exists():
+        return None
+
+    # Every sub-directory of "snapshots" is a candidate; plain files are ignored.
+    snapshots = [p for p in repo_dir.iterdir() if p.is_dir()]
+    if not snapshots:
+        return None
+
+    # "Latest" means the largest st_mtime. NOTE(review): presumably this matches the newest revision - confirm.
+    latest_snapshot = max(snapshots, key=lambda p: p.stat().st_mtime)
+    return latest_snapshot
diff --git a/model2vec/model.py b/model2vec/model.py
@@ -13,7 +13,7 @@
 from tqdm import tqdm
 
 from model2vec.quantization import DType, quantize_and_reduce_dim
-from model2vec.utils import ProgressParallel, load_local_model
+from model2vec.utils import ProgressParallel
 
 PathLike = Union[Path, str]
 
@@ -156,6 +156,7 @@ def from_pretrained(
         subfolder: str | None = None,
         quantize_to: str | DType | None = None,
         dimensionality: int | None = None,
+        skip_metadata: bool = False,
     ) -> StaticModel:
         """
         Load a StaticModel from a local path or huggingface hub path.
@@ -171,6 +172,8 @@ def from_pretrained(
         :param dimensionality: The dimensionality of the model. If this is None, use the dimensionality of the model.
             This is useful if you want to load a model with a lower dimensionality.
             Note that this only applies if you have trained your model using mrl or PCA.
+        :param skip_metadata: Whether to skip loading metadata. This is useful if you don't need the metadata.
+            Loading metadata can be slow for models with many results in their README.md.
         :return: A StaticModel.
         """
         from model2vec.hf_utils import load_pretrained
@@ -180,6 +183,7 @@ def from_pretrained(
             token=token,
             from_sentence_transformers=False,
             subfolder=subfolder,
+            skip_metadata=skip_metadata,
         )
 
         embeddings = quantize_and_reduce_dim(
@@ -205,6 +209,7 @@ def from_sentence_transformers(
         normalize: bool | None = None,
         quantize_to: str | DType | None = None,
         dimensionality: int | None = None,
+        skip_metadata: bool = False,
     ) -> StaticModel:
         """
         Load a StaticModel trained with sentence transformers from a local path or huggingface hub path.
@@ -219,6 +224,8 @@ def from_sentence_transformers(
         :param dimensionality: The dimensionality of the model. If this is None, use the dimensionality of the model.
             This is useful if you want to load a model with a lower dimensionality.
             Note that this only applies if you have trained your model using mrl or PCA.
+        :param skip_metadata: Whether to skip loading metadata. This is useful if you don't need the metadata.
+            Loading metadata can be slow for models with many results in their README.md.
         :return: A StaticModel.
         """
         from model2vec.hf_utils import load_pretrained
@@ -228,6 +235,7 @@ def from_sentence_transformers(
             token=token,
             from_sentence_transformers=True,
             subfolder=None,
+            skip_metadata=skip_metadata,
         )
 
         embeddings = quantize_and_reduce_dim(
@@ -447,28 +455,3 @@ def push_to_hub(
         with TemporaryDirectory() as temp_dir:
             self.save_pretrained(temp_dir, model_name=repo_id)
             push_folder_to_hub(Path(temp_dir), subfolder=subfolder, repo_id=repo_id, private=private, token=token)
-
-    @classmethod
-    def load_local(cls: type[StaticModel], path: PathLike) -> StaticModel:
-        """
-        Loads a model from a local path.
-
-        You should only use this code path if you are concerned with start-up time.
-        Loading via the `from_pretrained` method is safer, and auto-downloads, but
-        also means we import a whole bunch of huggingface code that we don't need.
-
-        Additionally, huggingface will check the most recent version of the model,
-        which can be slow.
-
-        :param path: The path to load the model from. The path is a directory saved by the
-            `save_pretrained` method.
-        :return: A StaticModel
-        :raises: ValueError if the path is not a directory.
-        """
-        path = Path(path)
-        if not path.is_dir():
-            raise ValueError(f"Path {path} is not a directory.")
-
-        embeddings, tokenizer, config = load_local_model(path)
-
-        return StaticModel(embeddings, tokenizer, config)
diff --git a/model2vec/utils.py b/model2vec/utils.py
@@ -102,27 +102,3 @@ def setup_logging() -> None:
         datefmt="%Y-%m-%d %H:%M:%S",
         handlers=[RichHandler(rich_tracebacks=True)],
     )
-
-
-def load_local_model(folder: Path) -> tuple[np.ndarray, Tokenizer, dict[str, str]]:
-    """Load a local model."""
-    embeddings_path = folder / "model.safetensors"
-    tokenizer_path = folder / "tokenizer.json"
-    config_path = folder / "config.json"
-
-    opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
-    embeddings = opened_tensor_file.get_tensor("embeddings")
-
-    if config_path.exists():
-        config = json.load(open(config_path))
-    else:
-        config = {}
-
-    tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
-
-    if len(tokenizer.get_vocab()) != len(embeddings):
-        logger.warning(
-            f"Number of tokens does not match number of embeddings: `{len(tokenizer.get_vocab())}` vs `{len(embeddings)}`"
-        )
-
-    return embeddings, tokenizer, config
diff --git a/tests/test_model.py b/tests/test_model.py
@@ -118,9 +118,9 @@ def test_encode_as_tokens_empty(
     encoded = model.encode_as_sequence("")
     assert np.array_equal(encoded, np.zeros(shape=(0, 2), dtype=model.embedding.dtype))
 
-    encoded = model.encode_as_sequence(["", ""])
+    encoded_list = model.encode_as_sequence(["", ""])
     out = [np.zeros(shape=(0, 2), dtype=model.embedding.dtype) for _ in range(2)]
-    assert [np.array_equal(x, y) for x, y in zip(encoded, out)]
+    assert [np.array_equal(x, y) for x, y in zip(encoded_list, out)]
 
 
 def test_encode_empty_sentence(
@@ -273,23 +273,3 @@ def test_dim(mock_vectors: np.ndarray, mock_tokenizer: Tokenizer, mock_config: d
     model = StaticModel(mock_vectors, mock_tokenizer, mock_config)
     assert model.dim == 2
     assert model.dim == model.embedding.shape[1]
-
-
-def test_local_load_from_model(mock_tokenizer: Tokenizer) -> None:
-    """Test local load from a model."""
-    x = np.ones((mock_tokenizer.get_vocab_size(), 2))
-    with TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        safetensors.numpy.save_file({"embeddings": x}, Path(tempdir) / "model.safetensors")
-        mock_tokenizer.save(str(Path(tempdir) / "tokenizer.json"))
-
-        model = StaticModel.load_local(tempdir_path)
-        assert model.embedding.shape == x.shape
-        assert model.tokenizer.to_str() == mock_tokenizer.to_str()
-        assert model.config == {"normalize": False}
-
-
-def test_local_load_from_model_no_folder() -> None:
-    """Test local load from a model with no folder."""
-    with pytest.raises(ValueError):
-        StaticModel.load_local("woahbuddy_relax_this_is_just_a_test")
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -14,7 +14,7 @@
 
 from model2vec.distill.utils import select_optimal_device
 from model2vec.hf_utils import _get_metadata_from_readme
-from model2vec.utils import get_package_extras, importable, load_local_model
+from model2vec.utils import get_package_extras, importable
 
 
 def test__get_metadata_from_readme_not_exists() -> None:
@@ -78,44 +78,3 @@ def test_get_package_extras() -> None:
 def test_get_package_extras_empty() -> None:
     """Test package extras with an empty package."""
     assert not list(get_package_extras("tqdm", ""))
-
-
-@pytest.mark.parametrize(
-    "config, expected",
-    [
-        ({"dog": "cat"}, {"dog": "cat"}),
-        ({}, {}),
-        (None, {}),
-    ],
-)
-def test_local_load(mock_tokenizer: Tokenizer, config: dict[str, Any], expected: dict[str, Any]) -> None:
-    """Test local loading."""
-    x = np.ones((mock_tokenizer.get_vocab_size(), 2))
-
-    with TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        safetensors.numpy.save_file({"embeddings": x}, Path(tempdir) / "model.safetensors")
-        mock_tokenizer.save(str(Path(tempdir) / "tokenizer.json"))
-        if config is not None:
-            json.dump(config, open(tempdir_path / "config.json", "w"))
-        arr, tokenizer, config = load_local_model(tempdir_path)
-        assert config == expected
-        assert tokenizer.to_str() == mock_tokenizer.to_str()
-        assert arr.shape == x.shape
-
-
-def test_local_load_mismatch(mock_tokenizer: Tokenizer, caplog: pytest.LogCaptureFixture) -> None:
-    """Test local loading."""
-    x = np.ones((10, 2))
-
-    with TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        safetensors.numpy.save_file({"embeddings": x}, Path(tempdir) / "model.safetensors")
-        mock_tokenizer.save(str(Path(tempdir) / "tokenizer.json"))
-
-        load_local_model(tempdir_path)
-        expected = (
-            f"Number of tokens does not match number of embeddings: `{len(mock_tokenizer.get_vocab())}` vs `{len(x)}`"
-        )
-        assert len(caplog.records) == 1
-        assert caplog.records[0].message == expected