Add parakeet to the CUDA benchmark CI (pytorch#17295)

pytorchbot · web-flow · commit efe4f0cce36c · 2026-02-08T20:38:39.000-08:00
Pull Request resolved: pytorch#17182 Differential Revision: [D92208958](https://our.internmc.facebook.com/intern/diff/D92208958/)
diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml
@@ -61,7 +61,7 @@ jobs:
         shell: bash
         env:
           # All available models and quantizations
-          ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it'
+          ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it,nvidia/parakeet-tdt'
           ALL_QUANTIZATIONS: 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only'
           NUM_RUNS: ${{ inputs.num_runs || '50' }}
           RUN_ALL_MODELS: ${{ inputs.run_all_models || 'false' }}
@@ -234,6 +234,12 @@ jobs:
         if [ -f "${RUNNER_ARTIFACT_DIR}/output.wav" ]; then
           cp "${RUNNER_ARTIFACT_DIR}/output.wav" model_artifacts/
         fi
+        if [ -f "${RUNNER_ARTIFACT_DIR}/tokenizer.model" ]; then
+          cp "${RUNNER_ARTIFACT_DIR}/tokenizer.model" model_artifacts/
+        fi
+        if [ -f "${RUNNER_ARTIFACT_DIR}/test_audio.wav" ]; then
+          cp "${RUNNER_ARTIFACT_DIR}/test_audio.wav" model_artifacts/
+        fi
         # Copy tokenizer files
         for file in tokenizer.json tokenizer_config.json special_tokens_map.json; do
           if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then
@@ -286,6 +292,13 @@ jobs:
             RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --image_path $IMAGE --temperature 0"
             MODEL_NAME="gemma3_${{ matrix.quant }}"
             ;;
+          nvidia/parakeet-tdt)
+            RUNNER="cmake-out/examples/models/parakeet/parakeet_runner"
+            AUDIO="model_artifacts/test_audio.wav"
+            TOKENIZER="model_artifacts/tokenizer.model"
+            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --audio_path $AUDIO --tokenizer_path $TOKENIZER"
+            MODEL_NAME="parakeet_${{ matrix.quant }}"
+            ;;
           *)
             echo "Error: Unsupported model '${{ matrix.model }}'"
             exit 1
diff --git a/examples/models/parakeet/main.cpp b/examples/models/parakeet/main.cpp
@@ -26,6 +26,7 @@
 #include "types.h"
 
 #include <executorch/extension/llm/runner/llm_runner_helper.h>
+#include <executorch/extension/llm/runner/stats.h>
 #include <executorch/extension/llm/runner/util.h>
 #include <executorch/extension/llm/runner/wav_loader.h>
 #include <executorch/extension/llm/tokenizers/third-party/llama.cpp-unicode/include/unicode.h>
@@ -334,6 +335,10 @@ std::vector<Token> greedy_decode_executorch(
 int main(int argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
+  // Initialize stats for benchmarking
+  ::executorch::extension::llm::Stats stats;
+  stats.model_load_start_ms = ::executorch::extension::llm::time_in_ms();
+
   TimestampOutputMode timestamp_mode;
   try {
     timestamp_mode = parse_timestamp_output_mode(FLAGS_timestamps);
@@ -362,6 +367,8 @@ int main(int argc, char** argv) {
     ET_LOG(Error, "Failed to load model.");
     return 1;
   }
+  stats.model_load_end_ms = ::executorch::extension::llm::time_in_ms();
+  stats.inference_start_ms = ::executorch::extension::llm::time_in_ms();
 
   // Load audio
   ET_LOG(Info, "Loading audio from: %s", FLAGS_audio_path.c_str());
@@ -412,6 +419,10 @@ int main(int argc, char** argv) {
     ET_LOG(Error, "Encoder forward failed.");
     return 1;
   }
+  stats.prompt_eval_end_ms = ::executorch::extension::llm::time_in_ms();
+  stats.first_token_ms =
+      stats.prompt_eval_end_ms; // For ASR, first token is at end of encoding
+
   auto& enc_outputs = enc_result.get();
   auto f_proj = enc_outputs[0].toTensor(); // [B, T, joint_hidden]
   int64_t encoded_len = enc_outputs[1].toTensor().const_data_ptr<int64_t>()[0];
@@ -488,6 +499,15 @@ int main(int argc, char** argv) {
       decoded_tokens, *tokenizer);
   std::cout << "Transcribed text: " << text << std::endl;
 
+  // Record inference end time and token counts
+  stats.inference_end_ms = ::executorch::extension::llm::time_in_ms();
+  stats.num_prompt_tokens =
+      encoded_len; // Use encoder output length as "prompt" tokens
+  stats.num_generated_tokens = static_cast<int64_t>(decoded_tokens.size());
+
+  // Print PyTorchObserver stats for benchmarking
+  ::executorch::extension::llm::print_report(stats);
+
 #ifdef ET_BUILD_METAL
   executorch::backends::metal::print_metal_backend_stats();
 #endif // ET_BUILD_METAL