Skip to content

Commit efe4f0c

Browse files
authored
add parakeet into cuda benchmark ci (pytorch#17295)
Pull Request resolved: pytorch#17182 Differential Revision: [D92208958](https://our.internmc.facebook.com/intern/diff/D92208958/)
1 parent ba2516c commit efe4f0c

2 files changed

Lines changed: 34 additions & 1 deletion

File tree

.github/workflows/cuda-perf.yml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ jobs:
6161
shell: bash
6262
env:
6363
# All available models and quantizations
64-
ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it'
64+
ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it,nvidia/parakeet-tdt'
6565
ALL_QUANTIZATIONS: 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only'
6666
NUM_RUNS: ${{ inputs.num_runs || '50' }}
6767
RUN_ALL_MODELS: ${{ inputs.run_all_models || 'false' }}
@@ -234,6 +234,12 @@ jobs:
234234
if [ -f "${RUNNER_ARTIFACT_DIR}/output.wav" ]; then
235235
cp "${RUNNER_ARTIFACT_DIR}/output.wav" model_artifacts/
236236
fi
237+
if [ -f "${RUNNER_ARTIFACT_DIR}/tokenizer.model" ]; then
238+
cp "${RUNNER_ARTIFACT_DIR}/tokenizer.model" model_artifacts/
239+
fi
240+
if [ -f "${RUNNER_ARTIFACT_DIR}/test_audio.wav" ]; then
241+
cp "${RUNNER_ARTIFACT_DIR}/test_audio.wav" model_artifacts/
242+
fi
237243
# Copy tokenizer files
238244
for file in tokenizer.json tokenizer_config.json special_tokens_map.json; do
239245
if [ -f "${RUNNER_ARTIFACT_DIR}/$file" ]; then
@@ -286,6 +292,13 @@ jobs:
286292
RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path model_artifacts/ --image_path $IMAGE --temperature 0"
287293
MODEL_NAME="gemma3_${{ matrix.quant }}"
288294
;;
295+
nvidia/parakeet-tdt)
296+
RUNNER="cmake-out/examples/models/parakeet/parakeet_runner"
297+
AUDIO="model_artifacts/test_audio.wav"
298+
TOKENIZER="model_artifacts/tokenizer.model"
299+
RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --audio_path $AUDIO --tokenizer_path $TOKENIZER"
300+
MODEL_NAME="parakeet_${{ matrix.quant }}"
301+
;;
289302
*)
290303
echo "Error: Unsupported model '${{ matrix.model }}'"
291304
exit 1

examples/models/parakeet/main.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "types.h"
2727

2828
#include <executorch/extension/llm/runner/llm_runner_helper.h>
29+
#include <executorch/extension/llm/runner/stats.h>
2930
#include <executorch/extension/llm/runner/util.h>
3031
#include <executorch/extension/llm/runner/wav_loader.h>
3132
#include <executorch/extension/llm/tokenizers/third-party/llama.cpp-unicode/include/unicode.h>
@@ -334,6 +335,10 @@ std::vector<Token> greedy_decode_executorch(
334335
int main(int argc, char** argv) {
335336
gflags::ParseCommandLineFlags(&argc, &argv, true);
336337

338+
// Initialize stats for benchmarking
339+
::executorch::extension::llm::Stats stats;
340+
stats.model_load_start_ms = ::executorch::extension::llm::time_in_ms();
341+
337342
TimestampOutputMode timestamp_mode;
338343
try {
339344
timestamp_mode = parse_timestamp_output_mode(FLAGS_timestamps);
@@ -362,6 +367,8 @@ int main(int argc, char** argv) {
362367
ET_LOG(Error, "Failed to load model.");
363368
return 1;
364369
}
370+
stats.model_load_end_ms = ::executorch::extension::llm::time_in_ms();
371+
stats.inference_start_ms = ::executorch::extension::llm::time_in_ms();
365372

366373
// Load audio
367374
ET_LOG(Info, "Loading audio from: %s", FLAGS_audio_path.c_str());
@@ -412,6 +419,10 @@ int main(int argc, char** argv) {
412419
ET_LOG(Error, "Encoder forward failed.");
413420
return 1;
414421
}
422+
stats.prompt_eval_end_ms = ::executorch::extension::llm::time_in_ms();
423+
stats.first_token_ms =
424+
stats.prompt_eval_end_ms; // For ASR, first token is at end of encoding
425+
415426
auto& enc_outputs = enc_result.get();
416427
auto f_proj = enc_outputs[0].toTensor(); // [B, T, joint_hidden]
417428
int64_t encoded_len = enc_outputs[1].toTensor().const_data_ptr<int64_t>()[0];
@@ -488,6 +499,15 @@ int main(int argc, char** argv) {
488499
decoded_tokens, *tokenizer);
489500
std::cout << "Transcribed text: " << text << std::endl;
490501

502+
// Record inference end time and token counts
503+
stats.inference_end_ms = ::executorch::extension::llm::time_in_ms();
504+
stats.num_prompt_tokens =
505+
encoded_len; // Use encoder output length as "prompt" tokens
506+
stats.num_generated_tokens = static_cast<int64_t>(decoded_tokens.size());
507+
508+
// Print PyTorchObserver stats for benchmarking
509+
::executorch::extension::llm::print_report(stats);
510+
491511
#ifdef ET_BUILD_METAL
492512
executorch::backends::metal::print_metal_backend_stats();
493513
#endif // ET_BUILD_METAL

0 commit comments

Comments
 (0)