11import logging
22import re # Import re for parsing model name
3- from typing import Any , Dict , List , Optional , Tuple , Union
3+ from typing import Any , AsyncGenerator , Dict , List , Optional , Tuple , Union
44
55import litellm
66
@@ -425,15 +425,107 @@ async def _handle_standard_litellm(
425425 finish_reason = response .choices [0 ].finish_reason ,
426426 )
427427
428- async def complete (self , request : CompletionRequest ) -> CompletionResponse :
async def _handle_streaming_litellm(
    self,
    user_content: str,
    image_urls: List[str],
    request: CompletionRequest,
    history_messages: List[Dict[str, str]],
) -> AsyncGenerator[str, None]:
    """Stream a completion from LiteLLM, yielding text deltas as they arrive.

    Args:
        user_content: Text of the new user turn.
        image_urls: Image URLs (full data URIs) to attach to the user turn.
            Only the first five are sent.
        request: Supplies ``max_tokens`` and ``temperature`` for the call.
        history_messages: Earlier chat messages, inserted between the system
            prompt and the new user turn.

    Yields:
        Every non-empty ``delta.content`` string of the streamed response.
    """
    logger.debug("Using LiteLLM streaming for model: %s", self.model_config["model_name"])

    # LiteLLM expects list-style content: one text part followed by image parts.
    # Slicing already clamps to the list length, so no explicit min() is needed.
    max_images = 5
    content_list = [{"type": "text", "text": user_content}]
    content_list.extend(
        {"type": "image_url", "image_url": {"url": img_url}}
        for img_url in image_urls[:max_images]
    )

    user_message = {"role": "user", "content": content_list}
    litellm_messages = [get_system_message()] + history_messages + [user_message]

    model_params = {
        "model": self.model_config["model_name"],
        "messages": litellm_messages,
        "max_tokens": request.max_tokens,
        "temperature": request.temperature,
        "num_retries": 3,
    }
    # Every other entry of the model configuration is forwarded to LiteLLM and
    # may override the defaults above.
    model_params.update(
        {key: value for key, value in self.model_config.items() if key != "model_name"}
    )
    # Applied last on purpose: a "stream" entry in the model configuration must
    # not be able to turn streaming off, because the loop below iterates the
    # response as an async stream.
    model_params["stream"] = True

    # Log parameter names only: the values contain inline image data and may
    # contain credentials copied from the model configuration.
    logger.debug(
        "Calling LiteLLM streaming with params: %s (%d messages)",
        sorted(model_params),
        len(litellm_messages),
    )
    response = await litellm.acompletion(**model_params)

    # Forward only chunks that actually carry text.
    async for chunk in response:
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        if delta and delta.content:
            yield delta.content
472+
async def _handle_streaming_ollama(
    self,
    user_content: str,
    ollama_image_data: List[str],
    request: CompletionRequest,
    history_messages: List[Dict[str, str]],
) -> AsyncGenerator[str, None]:
    """Stream a completion from the direct Ollama client, yielding text pieces.

    Args:
        user_content: Text of the new user turn.
        ollama_image_data: Image payloads attached to the user turn under the
            ``images`` key; all of them are sent when the list is non-empty.
        request: Supplies ``temperature`` and ``max_tokens`` for the call.
        history_messages: Earlier chat messages, inserted between the system
            prompt and the new user turn.

    Yields:
        Every non-empty ``message.content`` string of the streamed response.

    Raises:
        Exception: Any error raised while calling Ollama or consuming the
            stream is logged and re-raised unchanged.
    """
    logger.debug("Using direct Ollama streaming for model: %s", self.ollama_base_model_name)
    client = ollama.AsyncClient(host=self.ollama_api_base)

    # Ollama takes plain-string content; images ride along on the user message.
    system_message = {"role": "system", "content": get_system_message()["content"]}
    user_message_data = {"role": "user", "content": user_content}
    if ollama_image_data:
        user_message_data["images"] = ollama_image_data

    ollama_messages = [system_message] + history_messages + [user_message_data]

    # num_predict = -1 tells Ollama not to cap the number of generated tokens;
    # it is used when the request does not set max_tokens.
    num_predict = request.max_tokens if request.max_tokens is not None else -1
    options = {
        "temperature": request.temperature,
        "num_predict": num_predict,
    }

    try:
        response = await client.chat(
            model=self.ollama_base_model_name,
            messages=ollama_messages,
            options=options,
            stream=True,
        )

        # Forward only chunks that actually carry text.
        async for chunk in response:
            if chunk.get("message", {}).get("content"):
                yield chunk["message"]["content"]

    except Exception as e:
        # logger.exception keeps the traceback that logger.error would drop.
        logger.exception("Error during direct Ollama streaming call: %s", e)
        raise
518+
519+ async def complete (self , request : CompletionRequest ) -> Union [CompletionResponse , AsyncGenerator [str , None ]]:
429520 """
430521 Generate completion using LiteLLM or direct Ollama client if configured.
431522
432523 Args:
433524 request: CompletionRequest object containing query, context, and parameters
434525
435526 Returns:
436- CompletionResponse object with the generated text and usage statistics
527+ CompletionResponse object with the generated text and usage statistics or
528+ AsyncGenerator for streaming responses
437529 """
438530 # Process context chunks and handle images
439531 context_text , image_urls , ollama_image_data = process_context_chunks (request .context_chunks , self .is_ollama )
@@ -446,6 +538,18 @@ async def complete(self, request: CompletionRequest) -> CompletionResponse:
446538 # Check if structured output is requested
447539 structured_output = request .schema is not None
448540
541+ # Streaming is not supported with structured output
542+ if request .stream_response and structured_output :
543+ logger .warning ("Streaming is not supported with structured output. Falling back to non-streaming." )
544+ request .stream_response = False
545+
546+ # If streaming is requested and no structured output
547+ if request .stream_response and not structured_output :
548+ if self .is_ollama :
549+ return self ._handle_streaming_ollama (user_content , ollama_image_data , request , history_messages )
550+ else :
551+ return self ._handle_streaming_litellm (user_content , image_urls , request , history_messages )
552+
449553 # If structured output is requested, use instructor to handle it
450554 if structured_output :
451555 # Get dynamic model from schema
0 commit comments