@@ -195,9 +195,17 @@ case "$HF_MODEL" in
195195 PREPROCESSOR_FEATURE_SIZE=" "
196196 PREPROCESSOR_OUTPUT=" "
197197 ;;
198+ SocialLocalMobile/gemma-4-31B-it-HQQ-INT4)
199+ MODEL_NAME=" gemma4_31b"
200+ TASK=" "
201+ MAX_SEQ_LEN=" "
202+ EXTRA_PIP=" "
203+ PREPROCESSOR_FEATURE_SIZE=" "
204+ PREPROCESSOR_OUTPUT=" "
205+ ;;
198206 * )
199207 echo " Error: Unsupported model '$HF_MODEL '"
200- echo " Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
208+ echo " Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4, SocialLocalMobile/gemma-4-31B-it-HQQ-INT4 "
201209 exit 1
202210 ;;
203211esac
@@ -459,6 +467,50 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
459467 exit 0
460468fi
461469
# Succeeds (status 0) when the text in $1 contains the literal string $2.
# Done in-shell rather than via `echo ... | grep -q` so that an early grep
# exit cannot turn into a SIGPIPE failure when `pipefail` is enabled.
_gemma4_output_contains() {
  case "$1" in
    *"$2"*) return 0 ;;
    *) return 1 ;;
  esac
}

# Gemma 4 31B uses a prequantized checkpoint and custom export script.
# This branch handles the whole export itself and exits, so none of the
# generic export logic that follows it runs for this model.
if [ "$MODEL_NAME" = "gemma4_31b" ]; then
  pip install safetensors huggingface_hub gguf

  # Download prequantized model outside OUTPUT_DIR to avoid uploading on failure
  LOCAL_MODEL_DIR=$(mktemp -d) || exit 1
  INDUCTOR_CACHE=$(mktemp -d) || exit 1
  # Remove both scratch directories on every exit path.
  trap 'rm -rf -- "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE"' EXIT

  # Pass the repo id and target directory as arguments instead of splicing
  # them into the Python source, so no quoting inside the values can break it.
  python -c 'import sys; from huggingface_hub import snapshot_download; snapshot_download(sys.argv[1], local_dir=sys.argv[2])' \
    "$HF_MODEL" "$LOCAL_MODEL_DIR"

  # Sanity check: run inference on the prequantized model
  echo "::group::Inference sanity check"
  # Capture the exit status explicitly so the output is still printed (and the
  # log group closed) when inference itself fails.
  INFERENCE_STATUS=0
  INFERENCE_OUTPUT=$(python -m executorch.examples.models.gemma4_31b.inference \
    --prequantized "$LOCAL_MODEL_DIR" \
    --prompt "What is the capital of France?" \
    --max-new-tokens 32 \
    --temperature 0 \
    --no-compile 2>&1) || INFERENCE_STATUS=$?
  printf '%s\n' "$INFERENCE_OUTPUT"
  echo "::endgroup::"
  if [ "$INFERENCE_STATUS" -ne 0 ]; then
    echo "ERROR: Inference sanity check failed — inference exited with status $INFERENCE_STATUS"
    exit 1
  fi
  if ! _gemma4_output_contains "$INFERENCE_OUTPUT" "Paris"; then
    echo "ERROR: Inference sanity check failed — expected 'Paris' in output"
    exit 1
  fi

  # Copy tokenizer for the runner
  cp "$LOCAL_MODEL_DIR/tokenizer.json" "${OUTPUT_DIR}/tokenizer.json"

  # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
  echo "::group::Export"
  TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
    python -m executorch.examples.models.gemma4_31b.export \
    --prequantized "$LOCAL_MODEL_DIR" \
    --output-dir "${OUTPUT_DIR}"
  echo "::endgroup::"

  # Fail explicitly if an expected artifact is missing, rather than relying on
  # `set -e` being active for a bare `test -f`.
  for artifact in model.pte aoti_cuda_blob.ptd; do
    if [ ! -f "${OUTPUT_DIR}/${artifact}" ]; then
      echo "ERROR: Expected export artifact '${OUTPUT_DIR}/${artifact}' was not produced"
      exit 1
    fi
  done
  ls -al "${OUTPUT_DIR}"

  exit 0
fi

513+
462514MAX_SEQ_LEN_ARG=" "
463515if [ -n " $MAX_SEQ_LEN " ]; then
464516 MAX_SEQ_LEN_ARG=" --max_seq_len $MAX_SEQ_LEN "
0 commit comments