@@ -30,7 +30,7 @@ def __init__(
3030 base_model_name : str | None = None ,
3131 language : list [str ] | None = None ,
3232 weights : np .ndarray | None = None ,
33- token_mapping : list [ int ] | None = None ,
33+ token_mapping : np . ndarray | None = None ,
3434 ) -> None :
3535 """
3636 Initialize the StaticModel.
@@ -63,7 +63,7 @@ def __init__(
6363 self .weights = weights
6464 # token_mapping is annotated as np.ndarray | None and is stored as given; no conversion happens here.
6565 # NOTE(review): a plain list is no longer coerced with np.asarray - confirm every caller passes an array or None.
66- self .token_mapping : np .ndarray | None = None if token_mapping is None else np . asarray ( token_mapping )
66+ self .token_mapping : np .ndarray | None = token_mapping
6767
6868 self .tokenizer = tokenizer
6969 self .unk_token_id : int | None
@@ -121,9 +121,6 @@ def save_pretrained(self, path: PathLike, model_name: str | None = None, subfold
121121 """
122122 from model2vec .hf_utils import save_pretrained
123123
124- if self .token_mapping is not None :
125- self .config ["token_mapping" ] = self .token_mapping .tolist ()
126-
127124 save_pretrained (
128125 folder_path = Path (path ),
129126 embeddings = self .embedding ,
@@ -134,6 +131,7 @@ def save_pretrained(self, path: PathLike, model_name: str | None = None, subfold
134131 model_name = model_name ,
135132 subfolder = subfolder ,
136133 weights = self .weights ,
134+ mapping = self .token_mapping ,
137135 )
138136
139137 def tokenize (self , sentences : Sequence [str ], max_length : int | None = None ) -> list [list [int ]]:
@@ -490,11 +488,10 @@ def load_local(cls: type[StaticModel], path: PathLike) -> StaticModel:
490488 if not path .is_dir ():
491489 raise ValueError (f"Path { path } is not a directory." )
492490
493- embeddings , tokenizer , config , weights = load_local_model (path )
494- token_mapping = cast (list [int ], config .pop ("token_mapping" , None ))
491+ embeddings , tokenizer , config , weights , mapping = load_local_model (path )
495492
496493 return StaticModel (
497- vectors = embeddings , tokenizer = tokenizer , config = config , weights = weights , token_mapping = token_mapping
494+ vectors = embeddings , tokenizer = tokenizer , config = config , weights = weights , token_mapping = mapping
498495 )
499496
500497
@@ -517,7 +514,7 @@ def quantize_model(
517514 """
518515 from model2vec .quantization import quantize_and_reduce_dim
519516
520- token_mapping : list [ int ] | None
517+ token_mapping : np . ndarray | None
521518 weights : np .ndarray | None
522519 if vocabulary_quantization is not None :
523520 from model2vec .vocabulary_quantization import quantize_vocabulary
@@ -530,7 +527,7 @@ def quantize_model(
530527 )
531528 else :
532529 embeddings = model .embedding
533- token_mapping = cast ( list [ int ], model .token_mapping . tolist ()) if model . token_mapping is not None else None
530+ token_mapping = model .token_mapping
534531 weights = model .weights
535532 if quantize_to is not None or dimensionality is not None :
536533 embeddings = quantize_and_reduce_dim (
@@ -568,20 +565,18 @@ def _loading_helper(
568565 if from_sentence_transformers and subfolder is not None :
569566 raise ValueError ("Subfolder is not supported for sentence transformers models." )
570567
571- embeddings , tokenizer , config , metadata , weights = load_pretrained (
568+ embeddings , tokenizer , config , metadata , weights , mapping = load_pretrained (
572569 folder_or_repo_path = path ,
573570 token = token ,
574571 from_sentence_transformers = from_sentence_transformers ,
575572 subfolder = subfolder ,
576573 )
577574
578- token_mapping = config .pop ("token_mapping" , None )
579-
580575 model = cls (
581576 vectors = embeddings ,
582577 tokenizer = tokenizer ,
583578 weights = weights ,
584- token_mapping = token_mapping ,
579+ token_mapping = mapping ,
585580 config = config ,
586581 normalize = normalize ,
587582 base_model_name = metadata .get ("base_model" ),
0 commit comments