morphik-org
diff --git a/core/api.py b/core/api.py
Lines changed: 85 additions & 21 deletions
diff --git a/core/completion/litellm_completion.py b/core/completion/litellm_completion.py
Lines changed: 107 additions & 3 deletions
diff --git a/core/models/completion.py b/core/models/completion.py
Lines changed: 2 additions & 1 deletion
diff --git a/core/models/request.py b/core/models/request.py
Lines changed: 4 additions & 0 deletions
diff --git a/core/services/document_service.py b/core/services/document_service.py
Lines changed: 36 additions & 17 deletions
@@ -10,6 +10,7 @@
 import tomli
 from fastapi import Depends, FastAPI, Form, Header, HTTPException, Query, UploadFile
 from fastapi.middleware.cors import CORSMiddleware  # Import CORSMiddleware
+from fastapi.responses import StreamingResponse
 from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
 from starlette.middleware.sessions import SessionMiddleware
 
@@ -480,7 +481,7 @@ async def query_completion(
 
         # Main query processing
         perf.start_phase("document_service_query")
-        response = await document_service.query(
+        result = await document_service.query(
             request.query,
             auth,
             request.filters,
@@ -499,30 +500,93 @@ async def query_completion(
             request.schema,
             history,
             perf,  # Pass performance tracker
+            request.stream_response,
         )
 
-        # Chat history storage
-        perf.start_phase("chat_history_storage")
-        if history_key:
-            history.append(
-                {
-                    "role": "assistant",
-                    "content": response.completion,
-                    "timestamp": datetime.now(UTC).isoformat(),
-                }
-            )
-            await redis.set(history_key, json.dumps(history))
-            await document_service.db.upsert_chat_history(
-                request.chat_id,
-                auth.user_id,
-                auth.app_id,
-                history,
-            )
+        # Handle streaming vs non-streaming responses
+        if request.stream_response:
+            # For streaming responses, unpack the tuple
+            response_stream, sources = result
+
+            async def generate_stream():
+                """Relay completion chunks to the client as Server-Sent Events.
+
+                Yields one ``data: {"content": ...}`` event per chunk read from
+                ``response_stream``, followed by a final ``data: {"done": true,
+                "sources": [...]}`` event. After that event the assembled assistant
+                message is appended to the chat history (only when ``history_key``
+                is set) and the performance summary is logged.
+
+                NOTE(review): assumes ``response_stream`` is an async iterator of
+                ``str``. When a schema is supplied, ``complete()`` switches streaming
+                off and returns a ``CompletionResponse`` while ``query()`` still
+                returns a ``(response, sources)`` tuple, so the ``async for`` below
+                would appear to fail in that case -- confirm and guard upstream.
+                """
+                # Collect chunks in a list and join once; repeated ``str +=`` is
+                # quadratic in the number of chunks.
+                collected_chunks = []
+                first_token_time = None
+
+                async for chunk in response_stream:
+                    # Track time to first token
+                    if first_token_time is None:
+                        first_token_time = time.time()
+                        # NOTE(review): the label says "completion start", but the
+                        # value is measured from perf.start_time -- confirm origin.
+                        completion_start_to_first_token = first_token_time - perf.start_time
+                        perf.add_suboperation("completion_start_to_first_token", completion_start_to_first_token)
+                        logger.info(f"Completion start to first token: {completion_start_to_first_token:.2f}s")
+
+                    collected_chunks.append(chunk)
+                    yield f"data: {json.dumps({'content': chunk})}\n\n"
+
+                full_content = "".join(collected_chunks)
+
+                # Convert sources to the format expected by frontend
+                sources_info = [
+                    {"document_id": source.document_id, "chunk_number": source.chunk_number, "score": source.score}
+                    for source in sources
+                ]
+
+                # Send completion signal with sources
+                yield f"data: {json.dumps({'done': True, 'sources': sources_info})}\n\n"
+
+                # Handle chat history after streaming is complete. This runs after
+                # the final event has been yielded, so it only executes if the
+                # consumer keeps iterating the generator to exhaustion.
+                if history_key:
+                    history.append(
+                        {
+                            "role": "assistant",
+                            "content": full_content,
+                            "timestamp": datetime.now(UTC).isoformat(),
+                        }
+                    )
+                    await redis.set(history_key, json.dumps(history))
+                    await document_service.db.upsert_chat_history(
+                        request.chat_id,
+                        auth.user_id,
+                        auth.app_id,
+                        history,
+                    )
+
+                # Log consolidated performance summary for streaming; duration is 0
+                # when the stream produced no chunks at all.
+                streaming_time = time.time() - first_token_time if first_token_time else 0
+                perf.add_suboperation("streaming_duration", streaming_time)
+                perf.log_summary(f"Generated streaming completion with {len(sources)} sources")
+
+            headers = {
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+                "Access-Control-Allow-Origin": "*",
+                "Access-Control-Allow-Headers": "*",
+            }
+            return StreamingResponse(generate_stream(), media_type="text/event-stream", headers=headers)
+        else:
+            # For non-streaming responses, result is just the CompletionResponse
+            response = result
+
+            # Chat history storage for non-streaming responses
+            perf.start_phase("chat_history_storage")
+            if history_key:
+                history.append(
+                    {
+                        "role": "assistant",
+                        "content": response.completion,
+                        "timestamp": datetime.now(UTC).isoformat(),
+                    }
+                )
+                await redis.set(history_key, json.dumps(history))
+                await document_service.db.upsert_chat_history(
+                    request.chat_id,
+                    auth.user_id,
+                    auth.app_id,
+                    history,
+                )
 
-        # Log consolidated performance summary
-        perf.log_summary(f"Generated completion with {len(response.sources) if response.sources else 0} sources")
+            # Log consolidated performance summary
+            perf.log_summary(f"Generated completion with {len(response.sources) if response.sources else 0} sources")
 
-        return response
+            return response
     except ValueError as e:
         validate_prompt_overrides_with_http_exception(operation_type="query", error=e)
     except PermissionError as e:
 
@@ -1,6 +1,6 @@
 import logging
 import re  # Import re for parsing model name
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
 
 import litellm
 
@@ -425,15 +425,107 @@ async def _handle_standard_litellm(
             finish_reason=response.choices[0].finish_reason,
         )
 
-    async def complete(self, request: CompletionRequest) -> CompletionResponse:
+    async def _handle_streaming_litellm(
+        self,
+        user_content: str,
+        image_urls: List[str],
+        request: CompletionRequest,
+        history_messages: List[Dict[str, str]],
+    ) -> AsyncGenerator[str, None]:
+        """Stream a completion from LiteLLM, yielding text deltas as they arrive.
+
+        Args:
+            user_content: Text of the user turn.
+            image_urls: Image URLs / data URIs; at most the first five are attached.
+            request: Supplies ``max_tokens`` and ``temperature``.
+            history_messages: Prior messages, placed between the system prompt
+                and the user turn.
+
+        Yields:
+            Each non-empty ``delta.content`` string of the streamed response.
+
+        Raises:
+            Exception: Any error raised by the LiteLLM call or while iterating
+                the stream is logged and re-raised unchanged.
+        """
+        model_name = self.model_config["model_name"]
+        logger.debug(f"Using LiteLLM streaming for model: {model_name}")
+
+        # LiteLLM uses list content format: one text part plus up to five images.
+        content_list = [{"type": "text", "text": user_content}]
+        for img_url in (image_urls or [])[:5]:
+            content_list.append({"type": "image_url", "image_url": {"url": img_url}})
+
+        user_message = {"role": "user", "content": content_list}
+        litellm_messages = [get_system_message()] + history_messages + [user_message]
+
+        # Base parameters first; every other model_config entry is layered on top
+        # and may therefore override these defaults.
+        model_params = {
+            "model": model_name,
+            "messages": litellm_messages,
+            "max_tokens": request.max_tokens,
+            "temperature": request.temperature,
+            "stream": True,  # Enable streaming
+            "num_retries": 3,
+        }
+        model_params.update({key: value for key, value in self.model_config.items() if key != "model_name"})
+
+        # Log parameter names only: the values hold the full message payload
+        # (including image data URIs) and whatever model_config carries, which
+        # may include provider credentials.
+        logger.debug(
+            f"Calling LiteLLM streaming with {len(litellm_messages)} messages, "
+            f"param keys: {sorted(model_params)}"
+        )
+
+        try:
+            response = await litellm.acompletion(**model_params)
+
+            # Stream the response chunks, skipping chunks without text content
+            async for chunk in response:
+                if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
+                    yield chunk.choices[0].delta.content
+
+        except Exception as e:
+            # Same log-and-re-raise handling as the Ollama streaming path.
+            logger.error(f"Error during LiteLLM streaming call: {e}")
+            raise
+
+    async def _handle_streaming_ollama(
+        self,
+        user_content: str,
+        ollama_image_data: List[str],
+        request: CompletionRequest,
+        history_messages: List[Dict[str, str]],
+    ) -> AsyncGenerator[str, None]:
+        """Stream a chat completion through the direct Ollama async client.
+
+        Args:
+            user_content: Text of the user turn.
+            ollama_image_data: Image payloads attached to the user turn under
+                ``images`` when the list is non-empty.
+            request: Supplies ``temperature`` and ``max_tokens``.
+            history_messages: Prior messages, placed between the system prompt
+                and the user turn.
+
+        Yields:
+            The ``message.content`` of every streamed chunk that has content.
+
+        Raises:
+            Exception: Any failure during the call or iteration is logged and
+                re-raised unchanged.
+        """
+        logger.debug(f"Using direct Ollama streaming for model: {self.ollama_base_model_name}")
+        client = ollama.AsyncClient(host=self.ollama_api_base)
+
+        # Conversation layout: system prompt, prior history, then the new user turn.
+        system_turn = {"role": "system", "content": get_system_message()["content"]}
+        user_turn = {"role": "user", "content": user_content}
+        if ollama_image_data:
+            user_turn["images"] = ollama_image_data
+        conversation = [system_turn] + history_messages + [user_turn]
+
+        # -1 is sent when the request carries no max_tokens.
+        # NOTE(review): Ollama appears to treat -1 as "no limit" rather than a
+        # model default -- confirm this is the intended fallback.
+        token_limit = -1 if request.max_tokens is None else request.max_tokens
+        generation_options = {"temperature": request.temperature, "num_predict": token_limit}
+
+        try:
+            stream = await client.chat(
+                model=self.ollama_base_model_name,
+                messages=conversation,
+                options=generation_options,
+                stream=True,
+            )
+
+            async for part in stream:
+                # Skip chunks that carry no message or an empty content string.
+                if not part.get("message", {}).get("content"):
+                    continue
+                yield part["message"]["content"]
+
+        except Exception as e:
+            logger.error(f"Error during direct Ollama streaming call: {e}")
+            raise
+
+    async def complete(self, request: CompletionRequest) -> Union[CompletionResponse, AsyncGenerator[str, None]]:
         """
         Generate completion using LiteLLM or direct Ollama client if configured.
 
         Args:
             request: CompletionRequest object containing query, context, and parameters
 
         Returns:
-            CompletionResponse object with the generated text and usage statistics
+            CompletionResponse object with the generated text and usage statistics or
+            AsyncGenerator for streaming responses
         """
         # Process context chunks and handle images
         context_text, image_urls, ollama_image_data = process_context_chunks(request.context_chunks, self.is_ollama)
@@ -446,6 +538,18 @@ async def complete(self, request: CompletionRequest) -> CompletionResponse:
         # Check if structured output is requested
         structured_output = request.schema is not None
 
+        # Streaming is not supported with structured output
+        if request.stream_response and structured_output:
+            logger.warning("Streaming is not supported with structured output. Falling back to non-streaming.")
+            request.stream_response = False
+
+        # If streaming is requested and no structured output
+        if request.stream_response and not structured_output:
+            if self.is_ollama:
+                return self._handle_streaming_ollama(user_content, ollama_image_data, request, history_messages)
+            else:
+                return self._handle_streaming_litellm(user_content, image_urls, request, history_messages)
+
         # If structured output is requested, use instructor to handle it
         if structured_output:
             # Get dynamic model from schema
 
@@ -32,9 +32,10 @@ class CompletionRequest(BaseModel):
     query: str
     context_chunks: List[str]
     max_tokens: Optional[int] = 1000
-    temperature: Optional[float] = 0.7
+    temperature: Optional[float] = 0.3
     prompt_template: Optional[str] = None
     folder_name: Optional[str] = None
     end_user_id: Optional[str] = None
     schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None
     chat_history: Optional[List[ChatMessage]] = None
+    stream_response: Optional[bool] = False
@@ -44,6 +44,10 @@ class CompletionQueryRequest(RetrieveRequest):
         None,
         description="Optional chat session ID for persisting conversation history",
     )
+    stream_response: Optional[bool] = Field(
+        False,
+        description="Whether to stream the response back in chunks",
+    )
 
 
 class IngestTextRequest(BaseModel):
 
@@ -8,7 +8,7 @@
 import uuid
 from datetime import UTC, datetime
 from io import BytesIO
-from typing import Any, Dict, List, Optional, Type, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Type, Union
 
 import arq
 import filetype
@@ -588,7 +588,8 @@ async def query(
         schema: Optional[Union[Type[BaseModel], Dict[str, Any]]] = None,
         chat_history: Optional[List[ChatMessage]] = None,
         perf_tracker: Optional[Any] = None,  # Performance tracker from API layer
-    ) -> CompletionResponse:
+        stream_response: Optional[bool] = False,
+    ) -> Union[CompletionResponse, tuple[AsyncGenerator[str, None], List[ChunkSource]]]:
         """Generate completion using relevant chunks as context.
 
         When graph_name is provided, the query will leverage the knowledge graph
@@ -717,28 +718,46 @@ async def query(
             prompt_template=custom_prompt_template,
             schema=schema,
             chat_history=chat_history,
+            stream_response=stream_response,
         )
 
         response = await self.completion_model.complete(request)
 
         if not perf_tracker:
             phase_times["completion_generation"] = time.time() - completion_start
 
-        # Add sources information at the document service level
-        response.sources = sources
-
-        # Log performance summary only for standalone calls
-        if local_perf:
-            total_time = time.time() - query_start_time
-            logger.info("=== DocumentService.query Performance Summary ===")
-            logger.info(f"Total query time: {total_time:.2f}s")
-            for phase, duration in sorted(phase_times.items(), key=lambda x: x[1], reverse=True):
-                percentage = (duration / total_time) * 100 if total_time > 0 else 0
-                logger.info(f"  - {phase}: {duration:.2f}s ({percentage:.1f}%)")
-            logger.info(f"Generated completion with {len(sources)} sources")
-            logger.info("================================================")
-
-        return response
+        # Handle streaming vs non-streaming responses
+        if stream_response:
+            # For streaming responses, return the async generator and sources separately
+
+            # Log performance summary for streaming calls
+            if local_perf:
+                total_time = time.time() - query_start_time
+                logger.info("=== DocumentService.query Performance Summary (Streaming) ===")
+                logger.info(f"Total setup time: {total_time:.2f}s")
+                for phase, duration in sorted(phase_times.items(), key=lambda x: x[1], reverse=True):
+                    percentage = (duration / total_time) * 100 if total_time > 0 else 0
+                    logger.info(f"  - {phase}: {duration:.2f}s ({percentage:.1f}%)")
+                logger.info(f"Starting streaming with {len(sources)} sources")
+                logger.info("=" * 59)
+
+            return response, sources
+        else:
+            # Add sources information at the document service level for non-streaming
+            response.sources = sources
+
+            # Log performance summary only for standalone calls
+            if local_perf:
+                total_time = time.time() - query_start_time
+                logger.info("=== DocumentService.query Performance Summary ===")
+                logger.info(f"Total query time: {total_time:.2f}s")
+                for phase, duration in sorted(phase_times.items(), key=lambda x: x[1], reverse=True):
+                    percentage = (duration / total_time) * 100 if total_time > 0 else 0
+                    logger.info(f"  - {phase}: {duration:.2f}s ({percentage:.1f}%)")
+                logger.info(f"Generated completion with {len(sources)} sources")
+                logger.info("================================================")
+
+            return response
 
     async def ingest_text(
         self,
Original file line number	Diff line number	Diff line change
`@@ -44,6 +44,10 @@ class CompletionQueryRequest(RetrieveRequest):`
`44`	`44`	`None,`
`45`	`45`	`description="Optional chat session ID for persisting conversation history",`
`46`	`46`	`)`
	`47`	`+ stream_response: Optional[bool] = Field(`
	`48`	`+ False,`
	`49`	`+ description="Whether to stream the response back in chunks",`
	`50`	`+ )`
`47`	`51`
`48`	`52`
`49`	`53`	`class IngestTextRequest(BaseModel):`