Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ python -m kame.server --help
## Runtime Notes

- `kame.server_oracle` requires `OPENAI_API_KEY`.
- If ASR is enabled, set `GOOGLE_APPLICATION_CREDENTIALS` to a valid Google Cloud service account credential file.
- ASR is enabled by default and uses Google Cloud Speech-to-Text. Set
`GOOGLE_APPLICATION_CREDENTIALS` to a valid Google Cloud credential JSON file
before starting the server.
- The current oracle-guided server path is configured for English dialogue and ASR (`en-US`).
- If `--static` is omitted, the browser UI assets are fetched automatically at startup.
- `kame.server_oracle` sends conversation text to OpenAI Chat Completions.
Expand Down Expand Up @@ -97,6 +99,7 @@ uv init --bare --python 3.12
uv add "kame-model @ git+https://github.com/SakanaAI/kame.git@1a69ee29dbd201d400f841459d87871154881047"

export OPENAI_API_KEY=...
export GOOGLE_APPLICATION_CREDENTIALS=/path/to/google-cloud-credentials.json

uv run python -m kame.server_oracle \
--hf-repo SakanaAI/kame \
Expand All @@ -116,8 +119,14 @@ Notes:
- Python `>=3.10` is supported; the command above uses Python 3.12 because it is
the version used for verification.
- `OPENAI_API_KEY` is required by `kame.server_oracle`.
- ASR uses Google Cloud Speech-to-Text when `GOOGLE_APPLICATION_CREDENTIALS` is
set. Without it, ASR initialization is skipped.
- ASR is enabled by default and requires Google Cloud Speech-to-Text. Before
running the server, set up a Google Cloud project for
[Speech-to-Text](https://cloud.google.com/speech-to-text/docs/setup) and
configure
[Application Default Credentials](https://cloud.google.com/docs/authentication/set-up-adc-on-premises)
with `GOOGLE_APPLICATION_CREDENTIALS`.
- For local smoke tests without Google Speech-to-Text, pass `--no-enable-asr`.
This skips ASR and does not exercise the full oracle-guided spoken-dialogue path.
- `--config-path`, `--moshi-weight`, `--mimi-weight`, and `--tokenizer` are not
needed for the public Hugging Face checkpoint in the usual case.
- `config.json` in the Hugging Face repo resolves the model weights, Mimi
Expand Down
73 changes: 49 additions & 24 deletions src/kame/server_oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ def __init__(self, sample_rate=24000):

# Google Speech
self.asr_enabled = False
self.init_error: str | None = None
self.speech_client = None
self.config = None
self.streaming_config = None
Expand All @@ -346,31 +347,36 @@ def register_callbacks(self, on_partial, on_final):

def _initialize_speech_client(self):
try:
if "GOOGLE_APPLICATION_CREDENTIALS" in os.environ:
self.speech_client = speech.SpeechClient()
language_code = ASR_LANGUAGE_CODE
self.config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=self.target_sample_rate,
language_code=language_code,
enable_automatic_punctuation=False,
enable_word_time_offsets=False,
enable_word_confidence=False,
use_enhanced=True,
metadata=speech.RecognitionMetadata(
interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
),
)
self.streaming_config = speech.StreamingRecognitionConfig(
config=self.config,
interim_results=True,
single_utterance=False,
)
self.asr_enabled = True
log("info", f"Async ASR processor initialized (language: {language_code})")
if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
self.init_error = "GOOGLE_APPLICATION_CREDENTIALS environment variable is not set."
return

self.speech_client = speech.SpeechClient()
language_code = ASR_LANGUAGE_CODE
self.config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=self.target_sample_rate,
language_code=language_code,
enable_automatic_punctuation=False,
enable_word_time_offsets=False,
enable_word_confidence=False,
use_enhanced=True,
metadata=speech.RecognitionMetadata(
interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
),
)
self.streaming_config = speech.StreamingRecognitionConfig(
config=self.config,
interim_results=True,
single_utterance=False,
)
self.asr_enabled = True
self.init_error = None
log("info", f"Async ASR processor initialized (language: {language_code})")
except Exception as e:
self.init_error = str(e)
log("warning", f"ASR initialization failed: {e}")

async def start(self):
Expand Down Expand Up @@ -572,6 +578,24 @@ def _process_responses(self, responses):
pass


def _require_initialized_asr(enable_asr: bool, asr_processor: Optional[AsyncASRProcessor]) -> None:
    """Fail fast when ASR is requested but its backend never initialized.

    A no-op when ASR is disabled, or when the processor exists and reports
    itself enabled. Otherwise raises RuntimeError carrying the recorded
    initialization error so the operator sees the root cause at startup.
    """
    if not enable_asr:
        return
    if asr_processor is not None and asr_processor.asr_enabled:
        return
    # Surface the concrete failure captured during initialization when we
    # have one; fall back to a generic placeholder otherwise.
    reason = (
        asr_processor.init_error
        if asr_processor is not None and asr_processor.init_error
        else "unknown error"
    )
    raise RuntimeError(
        "ASR is enabled but Google Speech-to-Text could not be initialized. "
        f"{reason} "
        "Set GOOGLE_APPLICATION_CREDENTIALS to a valid Google Cloud service account credential file "
        "or rerun with --no-enable-asr."
    )


@dataclass
class ServerState:
model_type: str
Expand Down Expand Up @@ -626,6 +650,7 @@ def __init__(

# ASR processor
self.asr_processor = AsyncASRProcessor(sample_rate=int(self.mimi.sample_rate)) if enable_asr else None
_require_initialized_asr(enable_asr, self.asr_processor)

# LLM stream manager (uses oracle_queue; never touches lm_gen directly)
self.llm_stream_manager = LLMStreamManager(
Expand Down
Loading