nextcloud · marcelklehr · May 11, 2026 · Apr 26, 2026 · May 11, 2026 · May 11, 2026
diff --git a/appinfo/info.xml b/appinfo/info.xml
@@ -41,5 +41,32 @@ Learn more about the Nextcloud Ethical AI Rating [in our blog](https://nextcloud
 			<image>nextcloud/stt_whisper2</image>
 			<image-tag>2.4.2</image-tag>
 		</docker-install>
+		<environment-variables>
+			<variable>
+				<name>STT_WHISPER2_VAD_FILTER</name>
+				<display-name>Enable Voice Activity Detection filter</display-name>
+				<description>Set to 1 to only transcribe audio parts that pass the voice activity filter. This helps reduce hallucinations on long audio files. Default: 1</description>
+			</variable>
+			<variable>
+				<name>STT_WHISPER2_VAD_THRESHOLD</name>
+				<display-name>Voice Activity Detection filter threshold</display-name>
+				<description>The threshold for the VAD filter. Must be a float between 0 and 1. Default: 0.5</description>
+			</variable>
+			<variable>
+				<name>STT_WHISPER2_VAD_MIN_SPEECH_MS</name>
+				<display-name>Voice Activity Detection filter minimum speech milliseconds</display-name>
+				<description>The minimum time of speech that activates the VAD filter, as an integer number of milliseconds. Default: 0</description>
+			</variable>
+			<variable>
+				<name>STT_WHISPER2_VAD_MIN_SILENCE_MS</name>
+				<display-name>Voice Activity Detection filter minimum silence milliseconds</display-name>
+				<description>The minimum time of silence that deactivates the VAD filter, as an integer number of milliseconds. Default: 2000</description>
+			</variable>
+			<variable>
+				<name>STT_WHISPER2_VAD_SPEECH_PAD_MS</name>
+				<display-name>Voice Activity Detection filter speech pad milliseconds</display-name>
+				<description>The time that speech chunks will be padded with silence by the VAD filter, as an integer number of milliseconds. Default: 400</description>
+			</variable>
+		</environment-variables>
 	</external-app>
 </info>
diff --git a/lib/main.py b/lib/main.py
@@ -43,6 +43,20 @@
 LOGGER = logging.getLogger(os.environ["APP_ID"])
 LOGGER.setLevel(logging.DEBUG)
 
+# --- VAD configuration (Silero, bundled with faster-whisper) ---
+# Enabled by default; any STT_WHISPER2_VAD_FILTER value other than "1" (e.g. "0") disables it.
+# Tunables map 1:1 onto faster_whisper.vad.VadOptions; parsed at import time so bad values fail fast.
+VAD_FILTER = os.environ.get("STT_WHISPER2_VAD_FILTER", "1") == "1"
+try:
+    VAD_PARAMETERS = {
+        "threshold": float(os.environ.get("STT_WHISPER2_VAD_THRESHOLD", "0.5")),
+        "min_speech_duration_ms": int(os.environ.get("STT_WHISPER2_VAD_MIN_SPEECH_MS", "0")),
+        "min_silence_duration_ms": int(os.environ.get("STT_WHISPER2_VAD_MIN_SILENCE_MS", "2000")),
+        "speech_pad_ms": int(os.environ.get("STT_WHISPER2_VAD_SPEECH_PAD_MS", "400")),
+    }
+except ValueError as e:  # float()/int() on a str only raise ValueError; keep the cause chained
+    raise Exception('Failed to parse VAD settings. All values must be valid numbers') from e
+
 
 def load_models():
     models = {}
@@ -144,7 +158,11 @@ def background_thread_task():
             LOGGER.info("generating transcription")
             time_start = perf_counter()
             file_name = get_file(nc, task["id"], task.get("input").get('input'))
-            segments, info = model.transcribe(file_name)
+            segments, info = model.transcribe(
+                file_name,
+                vad_filter=VAD_FILTER,
+                vad_parameters=VAD_PARAMETERS if VAD_FILTER else None,
+            )
             transcript = ''
             for segment in segments:
                 transcript += segment.text