peteonrails · IgorWarzocha · Jan 10, 2026 · Jan 10, 2026
@@ -50,6 +50,7 @@ hound = "3"  # WAV file reading/writing
 
 # HTTP client for remote transcription
 ureq = { version = "2", features = ["json"] }
+base64 = "0.22"
 
 # Audio playback (for feedback sounds)
 rodio = { version = "0.19", default-features = false, features = ["wav"] }

@@ -0,0 +1,56 @@
+#!/bin/bash
+set -euo pipefail
+# Smoke test: POST a WAV file to an OpenAI-style chat endpoint; each setting below can be overridden via env.
+ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+AUDIO_FILE="${AUDIO_FILE:-$ROOT_DIR/harvard.wav}"
+ENDPOINT="${ENDPOINT:-http://localhost:8000}"  # one trailing slash is stripped before use
+MODEL="${MODEL:-gemini-2.5-flash}"
+SYSTEM_PROMPT="${SYSTEM_PROMPT:-Translate whatever I say into pirate-speech. Return ONLY the translated text. No preamble, no explanations, no labels, no quotes, no extra lines.}"
+API_KEY="${API_KEY:-${VOXTYPE_WHISPER_API_KEY:-}}"  # falls back to VOXTYPE_WHISPER_API_KEY, else empty
+# Validate inputs up front so no request is sent with a missing file or key.
+if [[ ! -f "$AUDIO_FILE" ]]; then
+    echo "Audio file not found: $AUDIO_FILE" >&2
+    exit 1
+fi
+
+if [[ -z "$API_KEY" ]]; then
+    echo "Missing API key. Set API_KEY or VOXTYPE_WHISPER_API_KEY." >&2
+    exit 1
+fi
+# The heredoc is quoted (no shell expansion), so settings reach Python via the environment.
+export AUDIO_FILE MODEL SYSTEM_PROMPT
+# python3 (the f-string below needs Python 3) prints the JSON; curl -sS POSTs it from stdin and still shows errors.
+python3 - <<'PY' | curl -sS "${ENDPOINT%/}/v1/chat/completions" \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer ${API_KEY}" \
+    -d @-
+import base64
+import json
+import os
+
+audio_file = os.environ["AUDIO_FILE"]
+model = os.environ["MODEL"]
+system_prompt = os.environ["SYSTEM_PROMPT"]
+
+with open(audio_file, "rb") as fh:
+    audio_b64 = base64.b64encode(fh.read()).decode("utf-8")
+
+payload = {
+    "model": model,
+    "messages": [
+        {"role": "system", "content": system_prompt},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Process this audio."},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:audio/wav;base64,{audio_b64}"},
+                },
+            ],
+        },
+    ],
+}
+
+print(json.dumps(payload))
+PY
@@ -395,10 +395,10 @@ fn load_icon_theme(theme: &str) -> ResolvedIcons {
         },
         "omarchy" => ResolvedIcons {
             // Material Design icons matching Omarchy waybar config
-            idle: "\u{ec12}".to_string(),     // nf-md-microphone_outline
+            idle: "\u{ec12}".to_string(), // nf-md-microphone_outline
             recording: "\u{f036c}".to_string(), // nf-md-microphone
             transcribing: "\u{f051f}".to_string(), // nf-md-timer_sand
-            stopped: "\u{ec12}".to_string(),  // nf-md-microphone_outline
+            stopped: "\u{ec12}".to_string(), // nf-md-microphone_outline
         },
         "minimal" => ResolvedIcons {
             idle: "○".to_string(),
@@ -501,6 +501,17 @@ pub enum WhisperBackend {
     Remote,
 }
 
+/// Remote transcription mode; (de)serialized as snake_case (`transcription` or `chat`)
+#[derive(Debug, Clone, Copy, Deserialize, Serialize, PartialEq, Eq, Default)]
+#[serde(rename_all = "snake_case")]
+pub enum RemoteTranscriptionMode {
+    /// Standard Whisper transcription via /v1/audio/transcriptions (the default mode)
+    #[default]
+    Transcription,
+    /// Multimodal chat completion via /v1/chat/completions
+    Chat,
+}
+
 /// Whisper speech-to-text configuration
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct WhisperConfig {
@@ -527,7 +538,6 @@ pub struct WhisperConfig {
     pub on_demand_loading: bool,
 
     // --- Remote backend settings ---
-
     /// Remote server endpoint URL (e.g., "http://192.168.1.100:8080")
     /// Required when backend = "remote"
     #[serde(default)]
@@ -544,6 +554,18 @@ pub struct WhisperConfig {
     /// Timeout for remote requests in seconds (default: 30)
     #[serde(default)]
     pub remote_timeout_secs: Option<u64>,
+
+    /// Remote transcription mode (transcription or chat)
+    #[serde(default)]
+    pub remote_mode: RemoteTranscriptionMode,
+
+    /// Optional system prompt for chat-mode requests
+    #[serde(default)]
+    pub remote_system_prompt: Option<String>,
+
+    /// For chat mode: send the audio as a `data:audio/wav;base64,...` URI in an `image_url` part
+    #[serde(default)]
+    pub remote_use_data_uri: bool,
 }
 
 /// Text processing configuration
@@ -687,6 +709,9 @@ impl Default for Config {
                 remote_model: None,
                 remote_api_key: None,
                 remote_timeout_secs: None,
+                remote_mode: RemoteTranscriptionMode::default(),
+                remote_system_prompt: None,
+                remote_use_data_uri: false,
             },
             output: OutputConfig {
                 mode: OutputMode::Type,