Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ python -m kame.server --help
## Runtime Notes

- `kame.server_oracle` requires `OPENAI_API_KEY`.
- If ASR is enabled, set `GOOGLE_APPLICATION_CREDENTIALS` to a valid Google Cloud service account credential file.
- ASR is enabled by default and uses Google Cloud Speech-to-Text. Set
`GOOGLE_APPLICATION_CREDENTIALS` to a valid Google Cloud credential JSON file
before starting the server.
- The current oracle-guided server path is configured for English dialogue and ASR (`en-US`).
- If `--static` is omitted, the browser UI assets are fetched automatically at startup.
- `kame.server_oracle` sends conversation text to OpenAI Chat Completions.
Expand Down Expand Up @@ -97,6 +99,7 @@ uv init --bare --python 3.12
uv add "kame-model @ git+https://github.com/SakanaAI/kame.git@1a69ee29dbd201d400f841459d87871154881047"

export OPENAI_API_KEY=...
export GOOGLE_APPLICATION_CREDENTIALS=/path/to/google-cloud-credentials.json

uv run python -m kame.server_oracle \
--hf-repo SakanaAI/kame \
Expand All @@ -116,8 +119,14 @@ Notes:
- Python `>=3.10` is supported; the command above uses Python 3.12 because it is
the version used for verification.
- `OPENAI_API_KEY` is required by `kame.server_oracle`.
- ASR uses Google Cloud Speech-to-Text when `GOOGLE_APPLICATION_CREDENTIALS` is
set. Without it, ASR initialization is skipped.
- ASR is enabled by default and requires Google Cloud Speech-to-Text. Before
running the server, set up a Google Cloud project for
[Speech-to-Text](https://cloud.google.com/speech-to-text/docs/setup) and
configure
[Application Default Credentials](https://cloud.google.com/docs/authentication/set-up-adc-on-premises)
with `GOOGLE_APPLICATION_CREDENTIALS`.
- For local smoke tests without Google Speech-to-Text, pass `--no-enable-asr`.
This skips ASR and does not exercise the full oracle-guided spoken-dialogue path.
- `--config-path`, `--moshi-weight`, `--mimi-weight`, and `--tokenizer` are not
needed for the public Hugging Face checkpoint in the usual case.
- `config.json` in the Hugging Face repo resolves the model weights, Mimi
Expand Down
73 changes: 49 additions & 24 deletions src/kame/server_oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ def __init__(self, sample_rate=24000):

# Google Speech
self.asr_enabled = False
self.init_error: str | None = None
self.speech_client = None
self.config = None
self.streaming_config = None
Expand All @@ -346,31 +347,36 @@ def register_callbacks(self, on_partial, on_final):

def _initialize_speech_client(self):
try:
if "GOOGLE_APPLICATION_CREDENTIALS" in os.environ:
self.speech_client = speech.SpeechClient()
language_code = ASR_LANGUAGE_CODE
self.config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=self.target_sample_rate,
language_code=language_code,
enable_automatic_punctuation=False,
enable_word_time_offsets=False,
enable_word_confidence=False,
use_enhanced=True,
metadata=speech.RecognitionMetadata(
interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
),
)
self.streaming_config = speech.StreamingRecognitionConfig(
config=self.config,
interim_results=True,
single_utterance=False,
)
self.asr_enabled = True
log("info", f"Async ASR processor initialized (language: {language_code})")
if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
self.init_error = "GOOGLE_APPLICATION_CREDENTIALS environment variable is not set."
return

self.speech_client = speech.SpeechClient()
language_code = ASR_LANGUAGE_CODE
self.config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=self.target_sample_rate,
language_code=language_code,
enable_automatic_punctuation=False,
enable_word_time_offsets=False,
enable_word_confidence=False,
use_enhanced=True,
metadata=speech.RecognitionMetadata(
interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
),
)
self.streaming_config = speech.StreamingRecognitionConfig(
config=self.config,
interim_results=True,
single_utterance=False,
)
self.asr_enabled = True
self.init_error = None
log("info", f"Async ASR processor initialized (language: {language_code})")
except Exception as e:
self.init_error = str(e)
log("warning", f"ASR initialization failed: {e}")

async def start(self):
Expand Down Expand Up @@ -572,6 +578,24 @@ def _process_responses(self, responses):
pass


def _require_initialized_asr(enable_asr: bool, asr_processor: Optional[AsyncASRProcessor]) -> None:
    """Fail fast when ASR is requested but its backend never initialized.

    A no-op when ASR is disabled, or when the processor exists and reports
    itself enabled. Otherwise raises RuntimeError carrying the recorded
    initialization error so the operator sees the root cause at startup.
    """
    if not enable_asr:
        return
    if asr_processor is not None and asr_processor.asr_enabled:
        return
    # Surface the concrete failure captured during initialization when we
    # have one; fall back to a generic placeholder otherwise.
    reason = (
        asr_processor.init_error
        if asr_processor is not None and asr_processor.init_error
        else "unknown error"
    )
    raise RuntimeError(
        "ASR is enabled but Google Speech-to-Text could not be initialized. "
        f"{reason} "
        "Set GOOGLE_APPLICATION_CREDENTIALS to a valid Google Cloud service account credential file "
        "or rerun with --no-enable-asr."
    )


@dataclass
class ServerState:
model_type: str
Expand Down Expand Up @@ -626,6 +650,7 @@ def __init__(

# ASR processor
self.asr_processor = AsyncASRProcessor(sample_rate=int(self.mimi.sample_rate)) if enable_asr else None
_require_initialized_asr(enable_asr, self.asr_processor)

# LLM stream manager (uses oracle_queue; never touches lm_gen directly)
self.llm_stream_manager = LLMStreamManager(
Expand Down
Loading