Skip to content

Segfault on CUDA encoder with low qp value #1419

@NicolasHug

Description

@NicolasHug

There's a segfault in the CUDA encoder(s) when passing a low qp value. The bug isn't in TorchCodec: it can be reproduced with a standalone C repro, and the FFmpeg CLI can be made to segfault too. I haven't verified whether the segfault also happens with FFmpeg or CUDA versions other than the ones I have on my laptop.

Copy-pasting Claude's report below, which I verified. I'm opening this for future reference, but I doubt there's much we can do in TC, except maybe emit a warning?


nvenc segfault with low QP + high-entropy CUDA frames

Summary

Encoding CUDA frames with h264_nvenc using low qp values (roughly < 15)
segfaults when the frame data has high entropy (e.g. random pixel values). This
is a bug in the nvenc driver / FFmpeg nvenc wrapper, not in TorchCodec — we
reproduced it with a minimal C program that uses only the FFmpeg API.

How it surfaces in TorchCodec

# TorchCodec-level reproducer: encode random CUDA frames through
# StreamingEncoder with qp=1.
import torch
from torchcodec.encoders._multi_stream_encoder import StreamingEncoder

# 10 frames of uniformly random uint8 values in [0, 255], allocated on the GPU.
# Random noise is the high-entropy input that triggers the crash.
# NOTE(review): layout is presumably (N, C, H, W), matching height=256/width=256 below.
frames = torch.randint(0, 256, (10, 3, 256, 256), dtype=torch.uint8, device="cuda")

encoder = StreamingEncoder()
# extra_options carries the low qp value under test.
video_stream = encoder.add_video(
    height=256, width=256, frame_rate=30, device="cuda",
    extra_options={"qp": 1},
)
with encoder.open("out.mp4"):
    video_stream.write(frames)  # segfault during close() / flush

The same crash happens with VideoEncoder:

# Same reproducer through the VideoEncoder API.
# NOTE: this snippet relies on `import torch` from the previous snippet.
from torchcodec.encoders import VideoEncoder

# 10 random uint8 frames on the GPU (high-entropy input).
frames = torch.randint(0, 256, (10, 3, 256, 256), dtype=torch.uint8, device="cuda")
encoder = VideoEncoder(frames, frame_rate=30)
# The low qp value is passed through extra_options.
encoder.to_file("out.mp4", extra_options={"qp": 1})  # segfault

What we know

Scenario Result
qp=1, CUDA, random data Segfault
qp=20, CUDA, random data OK
qp=1, CPU, random data OK
qp=1, CUDA, low-entropy data (e.g. solid color) OK
crf=1, CUDA, random data OK
  • The crash happens inside avcodec_send_frame(NULL) (the flush/EOS signal) or
    during avcodec_receive_packet when draining buffered frames.
  • All frame writes succeed; the crash occurs when nvenc tries to finalize the
    buffered high-entropy frames at low QP.
  • The threshold is data-dependent. With truly random NV12 data, even qp=20
    can crash. With natural video content, the threshold is lower (around
    qp < 15).

Minimal C reproducer (no TorchCodec)

This program uses only the FFmpeg C API and CUDA runtime. It reproduces the
same segfault, confirming the bug is in nvenc/FFmpeg, not in TorchCodec.

Build:

gcc repro_nvenc_segfault.c -o repro_nvenc_segfault \
    $(pkg-config --cflags --libs libavcodec libavformat libavutil) \
    -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcudart

Run (make sure the FFmpeg libraries with nvenc support are on LD_LIBRARY_PATH):

./repro_nvenc_segfault
# Expected: segfault during "Flushing..." phase
// repro_nvenc_segfault.c
// Encodes 10 random CUDA NV12 frames with h264_nvenc at qp=1.
// Segfaults during flush on affected driver/FFmpeg versions.

#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/hwcontext.h>
#include <libavutil/hwcontext_cuda.h>
#include <libavutil/opt.h>
#include <libavutil/imgutils.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main() {
    int ret;

    // 1. Create HW device context
    AVBufferRef *hw_device_ctx = NULL;
    ret = av_hwdevice_ctx_create(&hw_device_ctx, AV_HWDEVICE_TYPE_CUDA,
                                  "0", NULL, 0);
    if (ret < 0) { fprintf(stderr, "Failed to create HW device ctx\n"); return 1; }

    // 2. Create HW frames context (CUDA, NV12, 256x256)
    AVBufferRef *hw_frames_ref = av_hwframe_ctx_alloc(hw_device_ctx);
    if (!hw_frames_ref) { fprintf(stderr, "Failed to alloc HW frames ctx\n"); return 1; }
    AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext *)hw_frames_ref->data;
    hw_frames_ctx->format    = AV_PIX_FMT_CUDA;
    hw_frames_ctx->sw_format = AV_PIX_FMT_NV12;
    hw_frames_ctx->width     = 256;
    hw_frames_ctx->height    = 256;
    ret = av_hwframe_ctx_init(hw_frames_ref);
    if (ret < 0) { fprintf(stderr, "Failed to init HW frames ctx\n"); return 1; }

    // 3. Set up h264_nvenc codec context
    const AVCodec *codec = avcodec_find_encoder_by_name("h264_nvenc");
    if (!codec) { fprintf(stderr, "h264_nvenc not found\n"); return 1; }
    AVCodecContext *enc_ctx = avcodec_alloc_context3(codec);
    enc_ctx->width          = 256;
    enc_ctx->height         = 256;
    enc_ctx->pix_fmt        = AV_PIX_FMT_CUDA;
    enc_ctx->sw_pix_fmt     = AV_PIX_FMT_NV12;
    enc_ctx->framerate      = (AVRational){30, 1};
    enc_ctx->time_base      = (AVRational){1, 30};
    enc_ctx->flags         |= AV_CODEC_FLAG_GLOBAL_HEADER;
    enc_ctx->hw_device_ctx  = av_buffer_ref(hw_device_ctx);
    enc_ctx->hw_frames_ctx  = av_buffer_ref(hw_frames_ref);

    // 4. Open codec with qp=1
    AVDictionary *opts = NULL;
    av_dict_set(&opts, "qp", "1", 0);
    ret = avcodec_open2(enc_ctx, codec, &opts);
    av_dict_free(&opts);
    if (ret < 0) { fprintf(stderr, "avcodec_open2 failed: %d\n", ret); return 1; }

    // 5. Create output (mp4 muxer)
    AVFormatContext *fmt_ctx = NULL;
    avformat_alloc_output_context2(&fmt_ctx, NULL, NULL, "repro_output.mp4");
    AVStream *stream   = avformat_new_stream(fmt_ctx, NULL);
    stream->time_base  = enc_ctx->time_base;
    avcodec_parameters_from_context(stream->codecpar, enc_ctx);
    avio_open(&fmt_ctx->pb, "repro_output.mp4", AVIO_FLAG_WRITE);
    avformat_write_header(fmt_ctx, NULL);

    // 6. Encode 10 frames filled with random data
    AVPacket *pkt = av_packet_alloc();
    for (int i = 0; i < 10; i++) {
        AVFrame *frame = av_frame_alloc();
        frame->format = AV_PIX_FMT_CUDA;
        frame->width  = 256;
        frame->height = 256;
        frame->pts    = i;
        av_hwframe_get_buffer(hw_frames_ref, frame, 0);

        size_t y_size  = frame->linesize[0] * 256;
        size_t uv_size = frame->linesize[0] * 256 / 2;
        void *host_buf = malloc(y_size + uv_size);
        for (size_t j = 0; j < y_size + uv_size; j++)
            ((unsigned char *)host_buf)[j] = rand() % 256;
        cudaMemcpy(frame->data[0], host_buf, y_size, cudaMemcpyHostToDevice);
        cudaMemcpy(frame->data[1], (char *)host_buf + y_size, uv_size,
                   cudaMemcpyHostToDevice);
        free(host_buf);

        fprintf(stderr, "Sending frame %d\n", i);
        avcodec_send_frame(enc_ctx, frame);
        while (avcodec_receive_packet(enc_ctx, pkt) == 0) {
            av_packet_rescale_ts(pkt, enc_ctx->time_base, stream->time_base);
            pkt->stream_index = stream->index;
            av_interleaved_write_frame(fmt_ctx, pkt);
        }
        av_frame_free(&frame);
    }

    // 7. Flush — this is where the segfault happens
    fprintf(stderr, "Flushing...\n");
    avcodec_send_frame(enc_ctx, NULL);
    while (1) {
        ret = avcodec_receive_packet(enc_ctx, pkt);
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) break;
        if (ret < 0) break;
        av_packet_rescale_ts(pkt, enc_ctx->time_base, stream->time_base);
        pkt->stream_index = stream->index;
        av_interleaved_write_frame(fmt_ctx, pkt);
    }

    av_write_trailer(fmt_ctx);
    avio_closep(&fmt_ctx->pb);
    avformat_free_context(fmt_ctx);
    av_packet_free(&pkt);
    avcodec_free_context(&enc_ctx);
    av_buffer_unref(&hw_frames_ref);
    av_buffer_unref(&hw_device_ctx);
    fprintf(stderr, "Done!\n");
    return 0;
}

Reproducing with ffmpeg CLI

The crash also reproduces with the ffmpeg CLI when feeding truly random data
through the CUDA hw_frames path:

# Generate 10 random NV12 frames of 256x256. One NV12 frame is 256*256 luma
# bytes plus 256*128 interleaved chroma bytes = 256*384 bytes, hence the bs value.
dd if=/dev/urandom of=/tmp/random_nv12.raw bs=$((256*384)) count=10

# This segfaults (exit code 139); frames are uploaded with hwupload_cuda
# before reaching h264_nvenc:
ffmpeg -y -f rawvideo -pix_fmt nv12 -s 256x256 -r 30 \
    -i /tmp/random_nv12.raw \
    -vf "hwupload_cuda" -c:v h264_nvenc -qp 1 /tmp/out.mp4

# Software path does NOT segfault (gets a clean error instead); the only
# difference from the command above is the missing hwupload_cuda filter:
ffmpeg -y -f rawvideo -pix_fmt nv12 -s 256x256 -r 30 \
    -i /tmp/random_nv12.raw \
    -c:v h264_nvenc -qp 1 /tmp/out.mp4

# Low-entropy test pattern does NOT segfault, even through hwupload_cuda at qp 1:
ffmpeg -y -f lavfi -i testsrc=duration=0.33:size=256x256:rate=30 \
    -vf "hwupload_cuda" -c:v h264_nvenc -qp 1 /tmp/out.mp4

Earlier tests with testsrc didn't crash because test patterns are
low-entropy. The crash only manifests with high-entropy frame data (random
noise, complex textures) that produces very large encoded output at low QP.

Environment

  • FFmpeg 8.0 (libavcodec 62.11.100)
  • NVIDIA driver 580.159.03
  • GPU: NVIDIA GeForce RTX 4080 Laptop GPU
  • CUDA: system CUDA runtime

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions