Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions bootstrap.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
# Bootstrap the S2T environment: install faster-whisper into the sandbox
# virtualenv when it is absent, then instantiate the tiny model once so its
# files are already cached before the first real transcription.
set -euo pipefail

VENV="$HOME/env_sandbox"
PIP="$VENV/bin/pip"
PYTHON="$VENV/bin/python3"

echo "[s2t] Checking faster-whisper..."
# `pip show` exits non-zero when the package is not installed.
if "$PIP" show faster-whisper > /dev/null 2>&1; then
    echo "[s2t] faster-whisper already installed."
else
    echo "[s2t] Installing faster-whisper..."
    "$PIP" install faster-whisper
fi

echo "[s2t] Pre-warming Whisper tiny model cache..."
# Quoted heredoc delimiter: the Python source is passed through verbatim,
# with no shell expansion.
"$PYTHON" - <<'PY'
from faster_whisper import WhisperModel
WhisperModel("tiny", device="cpu", compute_type="int8")
print("[s2t] Model cached and ready.")
PY
63 changes: 63 additions & 0 deletions daemon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""S2T warm daemon — holds Whisper tiny model in RAM, serves transcription over HTTP on 127.0.0.1:7979."""

import json
import sys
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path

from faster_whisper import WhisperModel

PORT = 7979
model = None


def load_model():
    """Instantiate the Whisper ``tiny`` model and publish it module-wide.

    The model is created for CPU inference with int8 compute and stored in
    the module-level ``model`` global, where the request handler reads it.
    Progress is printed with ``flush=True`` so the lines appear immediately
    even when stdout is redirected.
    """
    global model

    print("Loading Whisper tiny model...", flush=True)
    whisper_options = {"device": "cpu", "compute_type": "int8"}
    model = WhisperModel("tiny", **whisper_options)

    ready_message = f"Model ready. Listening on 127.0.0.1:{PORT}"
    print(ready_message, flush=True)


class Handler(BaseHTTPRequestHandler):
    """HTTP handler exposing ``GET /health`` and ``POST /transcribe``.

    ``POST /transcribe`` expects a JSON object ``{"path": "<audio file>"}``
    and answers ``{"text": "<transcript>"}``. Every failure is reported as a
    JSON ``{"error": ...}`` body with a 4xx/5xx status instead of an
    unhandled exception that would drop the connection without a response.
    """

    def log_message(self, format, *args):
        """Suppress the base class's per-request logging."""

    def do_GET(self):
        """Serve the liveness probe; any other path is a 404."""
        if self.path == "/health":
            self._respond(200, {"status": "ok"})
        else:
            self._respond(404, {"error": "not found"})

    def do_POST(self):
        """Transcribe the audio file named in the JSON request body."""
        if self.path != "/transcribe":
            self._respond(404, {"error": "not found"})
            return

        try:
            length = int(self.headers.get("Content-Length", 0))
            if length < 0:
                # read(-1) would block until the client closes the socket.
                raise ValueError("negative Content-Length")
            payload = json.loads(self.rfile.read(length))
        except ValueError:
            # Covers a non-numeric Content-Length as well as
            # json.JSONDecodeError / UnicodeDecodeError (ValueError subclasses).
            self._respond(400, {"error": "invalid JSON body"})
            return

        audio_path = payload.get("path") if isinstance(payload, dict) else None
        if not isinstance(audio_path, str) or not audio_path:
            # Path("") normalises to "." which exists, so an absent path must
            # be rejected explicitly rather than by the file check below.
            self._respond(
                400,
                {"error": "request body must be a JSON object with a string 'path'"},
            )
            return

        if not Path(audio_path).is_file():
            self._respond(400, {"error": f"file not found: {audio_path}"})
            return

        try:
            segments, _ = model.transcribe(
                audio_path, language="en", condition_on_previous_text=False
            )
            # Consume the segments inside the try block so that errors raised
            # while iterating them are reported as well.
            text = " ".join(seg.text.strip() for seg in segments)
        except Exception as exc:  # request boundary: report, never crash
            self._respond(500, {"error": f"transcription failed: {exc}"})
            return

        self._respond(200, {"text": text})

    def _respond(self, code, data):
        """Send ``data`` as a JSON response with HTTP status ``code``."""
        body = json.dumps(data).encode()
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)


if __name__ == "__main__":
    # Load the model before binding the server so the first request never
    # waits for model initialisation.
    load_model()
    server = HTTPServer(("127.0.0.1", PORT), Handler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("Daemon stopped.")
    finally:
        # Release the listening socket on every exit path.
        server.server_close()
38 changes: 38 additions & 0 deletions expand_phrases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
"""
Reads transcribed text, applies phrase expansions from phrases.json,
writes result back to the same file.
"""
import sys
import json
import re
from pathlib import Path

def expand(text: str, phrases: dict) -> str:
    """Return *text* with every known phrase replaced by its expansion.

    Matching is case-insensitive. Phrases are applied longest first so that a
    more specific phrase wins over a shorter one it contains. Keys that are
    empty or start with ``_`` (comment/section markers) are ignored.

    A phrase only matches as a whole: where it begins or ends with a word
    character, that edge must not touch another word character, so ``"the"``
    does not fire inside ``"weather"``. Phrases whose edge is punctuation or a
    symbol (``"..."``, ``"♪"``) still match anywhere.

    Args:
        text: The transcribed text.
        phrases: Mapping of spoken phrase to replacement text.

    Returns:
        The expanded text with leading and trailing whitespace removed.
    """
    # Sort longest phrases first so more specific matches win over partial ones
    for phrase, expansion in sorted(phrases.items(), key=lambda item: -len(item[0])):
        if not phrase or phrase.startswith("_"):
            continue
        # Guard only the edges that are word characters; a lookaround next to
        # punctuation would wrongly block matches such as "wait...".
        prefix = r"(?<!\w)" if re.match(r"\w", phrase[0]) else ""
        suffix = r"(?!\w)" if re.match(r"\w", phrase[-1]) else ""
        pattern = re.compile(prefix + re.escape(phrase) + suffix, re.IGNORECASE)
        # A callable replacement is inserted literally; a plain string would be
        # parsed as a template and mangle (or reject) backslashes.
        text = pattern.sub(lambda _match, replacement=expansion: replacement, text)
    return text.strip()

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: expand_phrases.py <transcription_file>", file=sys.stderr)
        sys.exit(1)

    transcript_path = Path(sys.argv[1])
    phrases_path = Path(__file__).parent / "phrases.json"

    # No phrase table next to this script: leave the transcript untouched.
    if not phrases_path.exists():
        sys.exit(0)

    # Read and write as UTF-8 explicitly: the phrase table contains non-ASCII
    # keys, and the locale default encoding is not guaranteed to handle them.
    with open(phrases_path, encoding="utf-8") as f:
        phrases = json.load(f)

    try:
        text = transcript_path.read_text(encoding="utf-8").strip()
    except OSError as exc:
        print(f"[phrases] cannot read {transcript_path}: {exc}", file=sys.stderr)
        sys.exit(1)

    expanded = expand(text, phrases)

    transcript_path.write_text(expanded + "\n", encoding="utf-8")
    print(f"[phrases] '{text}' -> '{expanded}'")
37 changes: 37 additions & 0 deletions phrases.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"_comment": "Map spoken phrases to expansions. Case-insensitive. Variants = common mishearings.",

"_section_commands": "--- ALE / project commands ---",
"spin up intellimass": "spin up intellimass",
"spin up intel amass": "spin up intellimass",
"spin up intellimess": "spin up intellimass",
"spin up intel mess": "spin up intellimass",
"spin up intel mass": "spin up intellimass",
"spin up ale": ".spin up ale",
"spin down": "spin down",
"Reload whisper phrases": "cd ~/s2t && python3 -c 'import json; d=json.load(open(\"phrases.json\")); print(len([k for k in d if not k.startswith(\"_\")]), \"phrases loaded\")' && cd -",
"It's been up in Hellimaz.": "spin up intellimass",
"This beautiful mass.": "spin up intellimass",
"Spin up and tell em ass": "spin up intellimass",
"Spin up and tell a mess.": "spin up intellimass",
"Spin up until I miss": "spin up intellimass",
"Spin up HC video.": "spin up hcvideo",
"Spin up HC video": "spin up hcvideo",
"Spin up headwaters.": "spin up head-waters",
" It's been up headwaters.": "spin up head-waters",
"Spin up headwaters": "spin up head-waters",


"_section_prefixes": "--- Common prompt prefixes ---",
"ok robot": "ok robot",
"direct": "--direct",

"_section_noise": "--- Suppress known hallucination garbage (set to empty string) ---",
"thanks for watching": "",
"thank you for watching": "",
"thank you.": "",
"you": "",
"the": "",
"♪": "",
"...": ""
}
83 changes: 70 additions & 13 deletions stop_and_process_recording.sh
Original file line number Diff line number Diff line change
@@ -1,30 +1,87 @@
#!/bin/bash

# Paste the clipboard into the focused window with the right key chord.
#
# S2T_PASTE_MODE selects the chord: "terminal" sends ctrl+shift+v, "default"
# sends ctrl+v, and "auto" (used when the variable is unset or empty) inspects
# the active window and picks one of the two. Any other value raises a
# notification and falls back to ctrl+v.
# In auto mode the decision is appended to S2T_PASTE_LOG
# (default /tmp/s2t-paste.log); logging failures are ignored.
paste_transcription() {
    local paste_mode="${S2T_PASTE_MODE:-auto}"
    local log_file="${S2T_PASTE_LOG:-/tmp/s2t-paste.log}"
    local window_id="" window_class="" window_name="" window_pid=""
    local window_command="" window_signature=""
    # Case-insensitive substrings that mark a window as a terminal or a
    # terminal-hosted tool.
    local terminal_pattern='terminal|console|xterm|rxvt|urxvt|konsole|kitty|alacritty|wezterm|ghostty|foot|tilix|terminator|qterminal|lxterminal|mate-terminal|xfce4-terminal|gnome-terminal|ptyxis|st-256color|blackbox|warp|tabby|rio|contour|codex|claude|opencode|aider|agent|gpt-'

    if [[ "$paste_mode" == "auto" ]]; then
        # Every probe is best-effort: a failed lookup leaves its field empty.
        window_id=$(xdotool getactivewindow 2>/dev/null || true)
        if [[ -n "$window_id" ]]; then
            window_class=$(xdotool getwindowclassname "$window_id" 2>/dev/null || true)
            window_name=$(xdotool getwindowname "$window_id" 2>/dev/null || true)
            window_pid=$(xdotool getwindowpid "$window_id" 2>/dev/null || true)
            if [[ -n "$window_pid" ]]; then
                window_command=$(ps -p "$window_pid" -o comm= -o args= 2>/dev/null | head -n 1 || true)
            fi
        fi

        window_signature=$(printf '%s\n%s\n%s\n' "$window_class" "$window_name" "$window_command")

        # Terminal chord when the signature names a terminal, and also when
        # nothing at all could be learned about the window.
        if printf '%s\n' "$window_signature" | grep -Eiq "$terminal_pattern" \
            || [[ -z "${window_signature//[[:space:]]/}" ]]; then
            paste_mode="terminal"
        else
            paste_mode="default"
        fi

        printf '%s paste_mode=%s window_id=%s class=%q name=%q pid=%s command=%q\n' \
            "$(date -Is)" "$paste_mode" "$window_id" "$window_class" "$window_name" "$window_pid" "$window_command" \
            >> "$log_file" 2>/dev/null || true
    fi

    if [[ "$paste_mode" == "terminal" ]]; then
        xdotool key --clearmodifiers ctrl+shift+v
        return
    fi

    if [[ "$paste_mode" != "default" ]]; then
        notify-send "S2T paste mode error" "Unknown S2T_PASTE_MODE: $paste_mode"
    fi
    xdotool key --clearmodifiers ctrl+v
}

# Stop recording
kill $(cat "$HOME/s2t/tmp/recording_pid")

# Transcribe audio
AUDIO_FILE="$HOME/s2t/tmp/recording.wav"
TEXT_FILE="$HOME/s2t/tmp/recording.txt"
DAEMON_URL="http://127.0.0.1:7979"

# Try warm daemon first (sub-second). Fall back to direct faster-whisper+tiny (~2s)
# when the daemon is down or when its transcription request fails, so a daemon
# error never produces an empty transcript.
transcribed=false
if curl -sf --max-time 0.5 "$DAEMON_URL/health" > /dev/null 2>&1; then
    # The pipeline's status is the JSON parser's: it is non-zero when curl
    # failed (empty input) or the reply carried no "text" field.
    if TEXT=$(curl -sf --max-time 15 -X POST "$DAEMON_URL/transcribe" \
            -H "Content-Type: application/json" \
            -d "{\"path\":\"$AUDIO_FILE\"}" \
        | python3 -c "import sys,json; print(json.load(sys.stdin)['text'])" 2>/dev/null); then
        printf '%s\n' "$TEXT" > "$TEXT_FILE"
        transcribed=true
    fi
fi
if [[ "$transcribed" != "true" ]]; then
    "$HOME/env_sandbox/bin/python3" "$HOME/s2t/transcribe.py" "$AUDIO_FILE" "$TEXT_FILE"
fi

# Apply phrase expansions (phrases.json)
"$HOME/env_sandbox/bin/python3" "$HOME/s2t/expand_phrases.py" "$TEXT_FILE"

# Copy transcription to clipboard
xclip -selection clipboard < "$TEXT_FILE"

# Notify
notify-send "Transcription Complete" "$(cat "$TEXT_FILE")"

# Ensure the clipboard has time to update
sleep 0.1

# Simulate the paste action (exactly once, with the chord chosen for the
# focused window)
paste_transcription

# Clean up
rm -rf "$HOME/s2t/tmp/"



95 changes: 95 additions & 0 deletions tests/test_s2t.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env bash
# S2T test suite — run from any directory.
# Tests: direct transcription, phrase expansion, daemon health,
# daemon transcription, fallback path timing, spin project registration.
set -euo pipefail

# Locations of the S2T checkout, its virtualenv and the warm daemon.
S2T="$HOME/s2t"
VENV="$HOME/env_sandbox"
DAEMON_URL="http://127.0.0.1:7979"

# Result counters, updated by the green/red reporters.
PASS=0
FAIL=0

# Scratch directory for generated audio and transcripts; removed on any exit.
TMP=$(mktemp -d)
cleanup() {
    rm -rf "$TMP"
}
trap cleanup EXIT

# Report a passing check in green and count it.
green() {
    echo -e "\033[0;32m[PASS]\033[0m $*"
    PASS=$(( PASS + 1 ))
}

# Report a failing check in red and count it.
red() {
    echo -e "\033[0;31m[FAIL]\033[0m $*"
    FAIL=$(( FAIL + 1 ))
}

# Generate a short (1 second) 1 kHz sine-tone WAV for testing
make_wav() {
    # $1: path of the WAV file to create; an existing file is overwritten (-y).
    local wav_path="$1"
    ffmpeg -f lavfi -i "sine=frequency=1000:duration=1" "$wav_path" -y -loglevel quiet
}

echo "=== S2T Test Suite ==="
echo ""

# Every command under test runs inside an `if` condition: with `set -e` a bare
# failing command would abort the whole suite before the failure is recorded.

# --- Test 1: direct transcription via transcribe.py ---
echo "[1] Direct transcription (transcribe.py)..."
make_wav "$TMP/t1.wav"
if "$VENV/bin/python3" "$S2T/transcribe.py" "$TMP/t1.wav" "$TMP/t1.txt" 2>/dev/null \
    && [[ -f "$TMP/t1.txt" ]]; then
    green "transcribe.py produced output file"
else
    red "transcribe.py did not produce output file"
fi

# --- Test 2: phrase expansion ---
echo "[2] Phrase expansion..."
echo "test phrase" > "$TMP/expand.txt"
# The exit status is what is being tested; the input file exists regardless.
if "$VENV/bin/python3" "$S2T/expand_phrases.py" "$TMP/expand.txt" 2>/dev/null \
    && [[ -f "$TMP/expand.txt" ]]; then
    green "expand_phrases.py ran without error"
else
    red "expand_phrases.py failed"
fi

# --- Test 3: daemon health check ---
echo "[3] Daemon health check..."
if curl -sf --max-time 1 "$DAEMON_URL/health" > /dev/null 2>&1; then
    green "Daemon is running and healthy"
    DAEMON_UP=true
else
    echo " [info] Daemon not running — skipping daemon transcription test"
    echo " (Run 'spin up voice' to start the daemon)"
    DAEMON_UP=false
fi

# --- Test 4: daemon transcription (only if daemon is up) ---
if [[ "$DAEMON_UP" == "true" ]]; then
    echo "[4] Daemon transcription..."
    make_wav "$TMP/t4.wav"
    # A failed request leaves RESPONSE empty, which the JSON check rejects.
    RESPONSE=$(curl -sf --max-time 15 -X POST "$DAEMON_URL/transcribe" \
        -H "Content-Type: application/json" \
        -d "{\"path\":\"$TMP/t4.wav\"}" 2>/dev/null) || RESPONSE=""
    if echo "$RESPONSE" | python3 -c "import sys,json; d=json.load(sys.stdin); assert 'text' in d" 2>/dev/null; then
        green "Daemon returned transcription response"
    else
        red "Daemon transcription response malformed: $RESPONSE"
    fi
fi

# --- Test 5: fallback path timing (daemon down simulation) ---
echo "[5] Fallback transcription timing..."
make_wav "$TMP/t5.wav"
START=$(date +%s%N)
if "$VENV/bin/python3" "$S2T/transcribe.py" "$TMP/t5.wav" "$TMP/t5.txt" 2>/dev/null; then
    FALLBACK_OK=true
else
    FALLBACK_OK=false
fi
END=$(date +%s%N)
ELAPSED=$(( (END - START) / 1000000 ))
if [[ "$FALLBACK_OK" != "true" ]]; then
    red "Fallback transcription failed after ${ELAPSED}ms"
elif [[ $ELAPSED -lt 10000 ]]; then
    green "Fallback completed in ${ELAPSED}ms (under 10s)"
else
    red "Fallback took ${ELAPSED}ms — suspiciously slow"
fi

# --- Test 6: spin project registered ---
echo "[6] Spin project registered..."
# Override SPIN_BIN to run the suite where spin is installed elsewhere.
SPIN_BIN="${SPIN_BIN:-/home/pbrown/BROWN-FAMILY-SPORTS/Software/spin/bin/spin}"
# Capture the output first: `grep -q` at the end of a pipeline can make an
# upstream stage die of SIGPIPE, which `pipefail` would report as a failure.
if SPIN_OUTPUT=$("$SPIN_BIN" status voice 2>&1) \
    && [[ "$(sed 's/\x1b\[[0-9;]*m//g' <<< "$SPIN_OUTPUT")" == *voice* ]]; then
    green "spin recognizes 'voice' project"
else
    red "spin does not recognize 'voice' project"
fi

echo ""
echo "=== Results: $PASS passed, $FAIL failed ==="
[[ $FAIL -eq 0 ]]
13 changes: 13 additions & 0 deletions transcribe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env python3
import sys
from faster_whisper import WhisperModel

def main(argv):
    """Transcribe one audio file to a text file with the Whisper tiny model.

    Args:
        argv: Command-line vector: ``[program, audio_path, out_path]``.
            Additional trailing arguments are ignored.

    Returns:
        Process exit status: 0 on success, 1 when arguments are missing.
    """
    if len(argv) < 3:
        print("Usage: transcribe.py <audio_file> <output_text_file>", file=sys.stderr)
        return 1

    audio_path, out_path = argv[1], argv[2]

    model = WhisperModel("tiny", device="cpu", compute_type="int8")
    segments, _ = model.transcribe(
        audio_path, language="en", condition_on_previous_text=False
    )

    text = " ".join(seg.text.strip() for seg in segments)
    # Explicit UTF-8 so the output does not depend on the locale encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(text)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))