Add network embedding batching (#276)

fcharlaix-opendsi · web-flow · commit cbca6e824572 · 2026-03-05T10:55:42.000+05:30
When using the Mistral embedding API (through [LiteLLM](https://www.litellm.ai/) for OpenAI API compatibility) with Context Chat, I got many `Too many tokens overall, split into more batches` errors. This is due to Mistral's lower token limit per API request, [~16000](langchain-ai/langchain#20523), compared to OpenAI's [300000](https://github.com/langchain-ai/langchain/blob/18230f625f79aba25cbf9fb5500ab504cbb8f0bc/libs/partners/openai/langchain_openai/embeddings/base.py#L22). The fix is to implement the same pattern as the [LangChain OpenAI integration](https://github.com/langchain-ai/langchain/blob/18230f625f79aba25cbf9fb5500ab504cbb8f0bc/libs/partners/openai/langchain_openai/embeddings/base.py#L598): batching the API requests. A better solution would be to allow using LangChain’s built-in provider class, but that refactor is too big for my first PR x) Signed-off-by: Florian Charlaix <fcharlaix@open-dsi.fr>
diff --git a/config.cpu.yaml b/config.cpu.yaml
@@ -23,6 +23,7 @@ embedding:
   base_url: http://localhost:5000/v1
   workers: 1
   request_timeout: 1800 # in seconds
+  # batch_size: 100 # max texts per embedding API request, 0 = no batching
   # only for external embedding service
   # remote_service: true
   # model_name: text-embedding-3-small
diff --git a/config.gpu.yaml b/config.gpu.yaml
@@ -23,6 +23,7 @@ embedding:
   base_url: http://localhost:5000/v1
   workers: 1
   request_timeout: 1800 # in seconds
+  # batch_size: 100 # max texts per embedding API request, 0 = no batching
   # only for external embedding service
   # remote_service: true
   # model_name: text-embedding-3-small
diff --git a/context_chat_backend/config_parser.py b/context_chat_backend/config_parser.py
@@ -78,6 +78,7 @@ def get_config(file_path: str) -> TConfig:
 				remote_service=True,
 				workers=0,
 				request_timeout=embedding.get('request_timeout', 1800) if embedding else 1800,
+				batch_size=int(os.getenv('CC_EM_BATCH_SIZE', 100)),
 			)
 		except Exception as e:
 			raise AssertionError(
diff --git a/context_chat_backend/network_em.py b/context_chat_backend/network_em.py
@@ -117,7 +117,15 @@ def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float]
 		return [d['embedding'] for d in resp['data']]  # pyright: ignore[reportReturnType]
 
 	def embed_documents(self, texts: list[str]) -> list[list[float]]:
-		return self._get_embedding(texts)  # pyright: ignore[reportReturnType]
+		batch_size = self.app_config.embedding.batch_size
+		if batch_size <= 0 or len(texts) <= batch_size:
+			return self._get_embedding(texts)  # pyright: ignore[reportReturnType]
+
+		results: list[list[float]] = []
+		for i in range(0, len(texts), batch_size):
+			batch_embeddings = self._get_embedding(texts[i:i + batch_size])
+			results.extend(batch_embeddings)  # pyright: ignore[reportArgumentType]
+		return results
 
 	def embed_query(self, text: str) -> list[float]:
 		return self._get_embedding(text)  # pyright: ignore[reportReturnType]
diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py
@@ -31,6 +31,7 @@ class TEmbeddingConfig(BaseModel):
 	model_name: str | None = DEFAULT_EM_MODEL_ALIAS
 	auth: TEmbeddingAuthApiKey | TEmbeddingAuthBasic | None = None
 	remote_service: bool = False
+	batch_size: int = 100  # max texts per embedding API request, 0 = no batching
 	llama: dict = dict()  # noqa: C408
 
 

Original file line number	Diff line number	Diff line change
`@@ -78,6 +78,7 @@ def get_config(file_path: str) -> TConfig:`
`78`	`78`	`remote_service=True,`
`79`	`79`	`workers=0,`
`80`	`80`	`request_timeout=embedding.get('request_timeout', 1800) if embedding else 1800,`
	`81`	`+ batch_size=int(os.getenv('CC_EM_BATCH_SIZE', 100)),`
`81`	`82`	`)`
`82`	`83`	`except Exception as e:`
`83`	`84`	`raise AssertionError(`