Skip to content

Commit eb85a1b

Browse files
author
pytorchbot
committed
2026-05-31 nightly release (04f1300)
1 parent f84c4c7 commit eb85a1b

19 files changed

Lines changed: 804 additions & 1215 deletions

.github/workflows/linux_cuda_wheel.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ jobs:
9797
python-version: ${{ matrix.python-version }}
9898
# We install conda packages at the start because otherwise conda may have conflicts with dependencies.
9999
# Note: xorg-libxau was added to fix a problem with ffmpeg 4. We should consider removing it.
100-
default-packages: "nvidia/label/cuda-${{ matrix.cuda-version }}.0::libnpp nvidia::cuda-nvrtc=${{ matrix.cuda-version }} nvidia::cuda-toolkit=${{ matrix.cuda-version }} nvidia::cuda-cudart=${{ matrix.cuda-version }} nvidia::cuda-driver-dev=${{ matrix.cuda-version }} conda-forge::ffmpeg=${{ matrix.ffmpeg-version-for-tests }} conda-forge::xorg-libxau"
100+
default-packages: "nvidia::cuda-nvrtc=${{ matrix.cuda-version }} nvidia::cuda-toolkit=${{ matrix.cuda-version }} nvidia::cuda-cudart=${{ matrix.cuda-version }} nvidia::cuda-driver-dev=${{ matrix.cuda-version }} conda-forge::ffmpeg=${{ matrix.ffmpeg-version-for-tests }} conda-forge::xorg-libxau"
101101
- name: Check env, set LD_LIBRARY_PATH
102102
run: |
103103
${CONDA_RUN} env
@@ -224,7 +224,7 @@ jobs:
224224
with:
225225
python-version: ${{ env.PYTHON_VERSION }}
226226
# We install conda packages at the start because otherwise conda may have conflicts with dependencies.
227-
default-packages: "nvidia/label/cuda-${{ env.CUDA_VERSION }}.0::libnpp nvidia::cuda-nvrtc=${{ env.CUDA_VERSION }} nvidia::cuda-toolkit=${{ env.CUDA_VERSION }} nvidia::cuda-cudart=${{ env.CUDA_VERSION }} nvidia::cuda-driver-dev=${{ env.CUDA_VERSION }} conda-forge::ffmpeg=${{ env.FFMPEG_VERSION }}"
227+
default-packages: "nvidia::cuda-nvrtc=${{ env.CUDA_VERSION }} nvidia::cuda-toolkit=${{ env.CUDA_VERSION }} nvidia::cuda-cudart=${{ env.CUDA_VERSION }} nvidia::cuda-driver-dev=${{ env.CUDA_VERSION }} conda-forge::ffmpeg=${{ env.FFMPEG_VERSION }}"
228228
- name: Check env, set LD_LIBRARY_PATH
229229
run: |
230230
${CONDA_RUN} env

.github/workflows/windows_cuda_wheel.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,6 @@ jobs:
107107
- name: Install CUDA and FFmpeg conda packages
108108
run: |
109109
conda install -y \
110-
"nvidia/label/cuda-${{ matrix.cuda-version }}.0::libnpp" \
111110
"nvidia::cuda-nvrtc=${{ matrix.cuda-version }}" \
112111
"nvidia::cuda-cudart=${{ matrix.cuda-version }}" \
113112
"conda-forge::ffmpeg=${{ matrix.ffmpeg-version-for-tests }}"

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,8 @@ Make sure you have a GPU with NVDEC hardware that can decode the format you
128128
want. Refer to Nvidia's GPU support matrix
129129
[here](https://developer.nvidia.com/video-encode-and-decode-gpu-support-matrix-new).
130130

131-
You will need the `libnpp` and `libnvrtc` CUDA libraries, which are usually
132-
part of the CUDA Toolkit.
131+
You will need the `libnvrtc` CUDA library, which is usually part of the CUDA
132+
Toolkit.
133133

134134
To select a specific CUDA Toolkit version, use `--index-url`. Make sure to
135135
install the corresponding PyTorch version as well (refer to the

src/torchcodec/_core/BetaCudaDeviceInterface.cpp

Lines changed: 18 additions & 194 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,8 @@
1717
#include "Logging.h"
1818
#include "NVDECCache.h"
1919

20-
#include "NPPRuntimeLoader.h"
2120
#include "NVCUVIDRuntimeLoader.h"
22-
#include "P016ToRGB16.h"
21+
#include "color_conversion.h"
2322
#include "nvcuvid_include/cuviddec.h"
2423
#include "nvcuvid_include/nvcuvid.h"
2524

@@ -252,8 +251,8 @@ std::optional<cudaVideoSurfaceFormat> getNVDECSurfaceFormat(
252251

253252
// P016 is typically not supported on 8-bit SDR content. In such cases, we
254253
// try to fall back to NV12 if supported:
255-
// NVDEC will decode to NV12, NPP will do NV12 -> RGB producing uint8, and
256-
// maybePermuteAndConvertToFloat32 will cast uint8 -> float32.
254+
// NVDEC will decode to NV12, our kernel will do NV12 -> RGB producing
255+
// uint8, and maybePermuteAndConvertToFloat32 will cast uint8 -> float32.
257256
// For HDR content, NV12 would lose precision, so we fall back to CPU instead.
258257
if (preferredFormat == cudaVideoSurfaceFormat_P016 && bitDepthMinus8 == 0 &&
259258
((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) {
@@ -269,156 +268,8 @@ void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) {
269268
cudaFree(opaque);
270269
}
271270

272-
static void computeP016ColorMatrix(
273-
AVColorSpace colorspace,
274-
AVColorRange colorRange,
275-
int bitDepth,
276-
float outMatrix[3][4]) {
277-
float kr, kg, kb;
278-
switch (colorspace) {
279-
case AVCOL_SPC_BT709:
280-
kr = 0.2126f;
281-
kg = 0.7152f;
282-
kb = 0.0722f;
283-
break;
284-
case AVCOL_SPC_BT2020_NCL:
285-
case AVCOL_SPC_BT2020_CL:
286-
kr = 0.2627f;
287-
kg = 0.6780f;
288-
kb = 0.0593f;
289-
break;
290-
default:
291-
// BT.601
292-
kr = 0.299f;
293-
kg = 0.587f;
294-
kb = 0.114f;
295-
break;
296-
}
297-
298-
float vScale = 2.0f * (1.0f - kr);
299-
float uScale = 2.0f * (1.0f - kb);
300-
float guCoeff = -(2.0f * kb * (1.0f - kb)) / kg;
301-
float gvCoeff = -(2.0f * kr * (1.0f - kr)) / kg;
302-
303-
float maxVal = static_cast<float>((1 << bitDepth) - 1);
304-
float outScale = 65535.0f;
305-
306-
bool isFullRange = (colorRange == AVCOL_RANGE_JPEG);
307-
308-
if (isFullRange) {
309-
float yScale = outScale / maxVal;
310-
float uvCenter = static_cast<float>(1 << (bitDepth - 1));
311-
312-
outMatrix[0][0] = yScale;
313-
outMatrix[0][1] = 0.0f;
314-
outMatrix[0][2] = vScale * outScale / maxVal;
315-
outMatrix[0][3] = -vScale * uvCenter * outScale / maxVal;
316-
317-
outMatrix[1][0] = yScale;
318-
outMatrix[1][1] = guCoeff * outScale / maxVal;
319-
outMatrix[1][2] = gvCoeff * outScale / maxVal;
320-
outMatrix[1][3] = -(guCoeff + gvCoeff) * uvCenter * outScale / maxVal;
321-
322-
outMatrix[2][0] = yScale;
323-
outMatrix[2][1] = uScale * outScale / maxVal;
324-
outMatrix[2][2] = 0.0f;
325-
outMatrix[2][3] = -uScale * uvCenter * outScale / maxVal;
326-
} else {
327-
float s = static_cast<float>(1 << (bitDepth - 8));
328-
float yOff = 16.0f * s;
329-
float yRange = 219.0f * s;
330-
float uvOff = 128.0f * s;
331-
float uvRange = 224.0f * s;
332-
333-
float yCoeff = outScale / yRange;
334-
float uvCoeff_u = outScale / uvRange;
335-
float uvCoeff_v = outScale / uvRange;
336-
337-
outMatrix[0][0] = yCoeff;
338-
outMatrix[0][1] = 0.0f;
339-
outMatrix[0][2] = vScale * uvCoeff_v;
340-
outMatrix[0][3] = -yCoeff * yOff - vScale * uvCoeff_v * uvOff;
341-
342-
outMatrix[1][0] = yCoeff;
343-
outMatrix[1][1] = guCoeff * uvCoeff_u;
344-
outMatrix[1][2] = gvCoeff * uvCoeff_v;
345-
outMatrix[1][3] = -yCoeff * yOff - guCoeff * uvCoeff_u * uvOff -
346-
gvCoeff * uvCoeff_v * uvOff;
347-
348-
outMatrix[2][0] = yCoeff;
349-
outMatrix[2][1] = uScale * uvCoeff_u;
350-
outMatrix[2][2] = 0.0f;
351-
outMatrix[2][3] = -yCoeff * yOff - uScale * uvCoeff_u * uvOff;
352-
}
353-
}
354271
} // namespace
355272

356-
static torch::stable::Tensor convertP016FrameToRGB16(
357-
UniqueAVFrame& avFrame,
358-
const StableDevice& device,
359-
cudaStream_t nvdecStream,
360-
std::optional<torch::stable::Tensor> preAllocatedOutputTensor,
361-
const FrameDims& outputDims,
362-
int bitDepth,
363-
const float colorMatrix[3][4],
364-
bool colorMatrixChanged) {
365-
// avFrame dimensions may be odd (NVDEC display area for VP9 etc.). P016
366-
// color conversion requires even dimensions, so we round up to even for the
367-
// kernel, then crop to outputDims.
368-
int frameHeight = avFrame->height;
369-
int frameWidth = avFrame->width;
370-
int height = roundUpToEven(frameHeight);
371-
int width = roundUpToEven(frameWidth);
372-
373-
int outHeight = outputDims.height;
374-
int outWidth = outputDims.width;
375-
bool needsCrop = (outHeight != height) || (outWidth != width);
376-
377-
torch::stable::Tensor dst;
378-
if (needsCrop) {
379-
dst = allocateEmptyHWCTensor(
380-
FrameDims(height, width), device, OutputDtype::FLOAT32);
381-
} else if (preAllocatedOutputTensor.has_value()) {
382-
dst = preAllocatedOutputTensor.value();
383-
} else {
384-
dst = allocateEmptyHWCTensor(
385-
FrameDims(outHeight, outWidth), device, OutputDtype::FLOAT32);
386-
}
387-
388-
cudaStream_t stream = getCurrentCudaStream(device.index());
389-
syncStreams(/*runningStream=*/nvdecStream, /*waitingStream=*/stream);
390-
391-
launchP016ToRGB16Kernel(
392-
reinterpret_cast<const uint16_t*>(avFrame->data[0]),
393-
reinterpret_cast<const uint16_t*>(avFrame->data[1]),
394-
dst.mutable_data_ptr<uint16_t>(),
395-
width,
396-
height,
397-
avFrame->linesize[0],
398-
avFrame->linesize[1],
399-
validateInt64ToInt(dst.stride(0) * 2, "dst.stride(0)*2"),
400-
bitDepth,
401-
colorMatrix,
402-
colorMatrixChanged,
403-
stream);
404-
405-
if (needsCrop) {
406-
if (outHeight != height) {
407-
dst = torch::stable::narrow(dst, /*dim=*/0, /*start=*/0, outHeight);
408-
}
409-
if (outWidth != width) {
410-
dst = torch::stable::narrow(dst, /*dim=*/1, /*start=*/0, outWidth);
411-
dst = torch::stable::contiguous(dst);
412-
}
413-
if (preAllocatedOutputTensor.has_value()) {
414-
torch::stable::copy_(preAllocatedOutputTensor.value(), dst);
415-
return preAllocatedOutputTensor.value();
416-
}
417-
return dst;
418-
}
419-
return dst;
420-
}
421-
422273
BetaCudaDeviceInterface::BetaCudaDeviceInterface(const StableDevice& device)
423274
: DeviceInterface(device) {
424275
STD_TORCH_CHECK(g_cuda_nvdec, "NvdecCudaDeviceInterface was not registered!");
@@ -427,16 +278,6 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const StableDevice& device)
427278

428279
initializeCudaContextWithPytorch(device_);
429280

430-
// Note: we could consider *not* erroring when NPP is unavailable, and just
431-
// fallback to the CPU for the color-conversion. This would be similar to what
432-
// we do when NVCUVID is not available (we fallback to the CPU for the
433-
// decoding step).
434-
STD_TORCH_CHECK(
435-
loadNPPLibrary(),
436-
"Failed to load NPP library. NPP is required for CUDA color conversion.");
437-
438-
nppCtx_ = getNppStreamContext(device_);
439-
440281
nvcuvidAvailable_ = loadNVCUVIDLibrary();
441282
}
442283

@@ -537,8 +378,6 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
537378
cuvidDestroyVideoParser(videoParser_);
538379
videoParser_ = nullptr;
539380
}
540-
541-
returnNppStreamContextToCache(device_, std::move(nppCtx_));
542381
}
543382

544383
void BetaCudaDeviceInterface::initialize(
@@ -789,8 +628,8 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
789628
procParams.progressive_frame = dispInfo.progressive_frame;
790629
procParams.top_field_first = dispInfo.top_field_first;
791630
procParams.unpaired_field = dispInfo.repeat_first_field < 0;
792-
// We set the NVDEC stream to the current stream. It will be waited upon by
793-
// the NPP stream before any color conversion.
631+
// We set the NVDEC stream to the current stream. It will be waited upon
632+
// by the color conversion stream before any color conversion.
794633
// Re types: we get a cudaStream_t from PyTorch but it's interchangeable with
795634
// CUstream
796635
procParams.output_stream =
@@ -1138,37 +977,22 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
1138977

1139978
auto convertFrame = [&](std::optional<torch::stable::Tensor> preAlloc)
1140979
-> torch::stable::Tensor {
1141-
if (gpuFrame->format == AV_PIX_FMT_P016LE) {
1142-
int bitDepth = cpuFallback_
980+
bool isP016 = (gpuFrame->format == AV_PIX_FMT_P016LE);
981+
int bitDepth = 8;
982+
if (isP016) {
983+
bitDepth = cpuFallback_
1143984
? codecContext_->bits_per_raw_sample
1144985
: static_cast<int>(videoFormat_.bit_depth_luma_minus8) + 8;
1145-
AVColorSpace colorspace = gpuFrame->colorspace;
1146-
AVColorRange colorRange = gpuFrame->color_range;
1147-
bool colorMatrixChanged = false;
1148-
if (!cachedColorMatrix_.valid ||
1149-
cachedColorMatrix_.colorspace != colorspace ||
1150-
cachedColorMatrix_.colorRange != colorRange ||
1151-
cachedColorMatrix_.bitDepth != bitDepth) {
1152-
computeP016ColorMatrix(
1153-
colorspace, colorRange, bitDepth, cachedColorMatrix_.matrix);
1154-
cachedColorMatrix_.colorspace = colorspace;
1155-
cachedColorMatrix_.colorRange = colorRange;
1156-
cachedColorMatrix_.bitDepth = bitDepth;
1157-
cachedColorMatrix_.valid = true;
1158-
colorMatrixChanged = true;
1159-
}
1160-
return convertP016FrameToRGB16(
1161-
gpuFrame,
1162-
device_,
1163-
nvdecStream,
1164-
preAlloc,
1165-
originalDims,
1166-
bitDepth,
1167-
cachedColorMatrix_.matrix,
1168-
colorMatrixChanged);
1169986
}
1170-
return convertNV12FrameToRGB(
1171-
gpuFrame, device_, nppCtx_, nvdecStream, preAlloc, originalDims);
987+
return convertYUVFrameToRGB(
988+
gpuFrame,
989+
device_,
990+
nvdecStream,
991+
preAlloc,
992+
originalDims,
993+
isP016,
994+
bitDepth,
995+
cachedColorMatrix_);
1172996
};
1173997

1174998
if (rotation_ == Rotation::NONE) {

src/torchcodec/_core/BetaCudaDeviceInterface.h

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "FFMPEGCommon.h"
2222
#include "NVDECCache.h"
2323
#include "Transform.h"
24+
#include "color_conversion.h"
2425

2526
#include <memory>
2627
#include <mutex>
@@ -108,9 +109,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {
108109

109110
UniqueAVBSFContext bitstreamFilter_;
110111

111-
// NPP context for color conversion
112-
UniqueNppContext nppCtx_;
113-
114112
std::unique_ptr<DeviceInterface> cpuFallback_;
115113
bool nvcuvidAvailable_ = false;
116114
UniqueSwsContext swsContext_;
@@ -120,15 +118,7 @@ class BetaCudaDeviceInterface : public DeviceInterface {
120118
OutputDtype outputDtype_ = OutputDtype::UINT8;
121119
cudaVideoSurfaceFormat surfaceFormat_ = cudaVideoSurfaceFormat_NV12;
122120

123-
struct CachedP016ColorMatrix {
124-
AVColorSpace colorspace = AVCOL_SPC_UNSPECIFIED;
125-
AVColorRange colorRange = AVCOL_RANGE_UNSPECIFIED;
126-
int bitDepth = 0;
127-
float matrix[3][4] = {};
128-
bool valid = false;
129-
};
130-
131-
CachedP016ColorMatrix cachedColorMatrix_;
121+
CachedColorMatrix cachedColorMatrix_;
132122
};
133123

134124
} // namespace facebook::torchcodec

src/torchcodec/_core/CMakeLists.txt

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -147,17 +147,14 @@ function(make_torchcodec_libraries
147147

148148
if(ENABLE_CUDA)
149149
enable_language(CUDA)
150-
list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp NPPRuntimeLoader.cpp P016ToRGB16.cu)
150+
list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp color_conversion.cpp color_conversion.cu)
151151
endif()
152152

153153
set(core_library_dependencies
154154
${ffmpeg_target}
155155
${TORCH_LIBRARIES}
156156
)
157157

158-
# Note: NPP (nppi, nppicc) is NOT linked here. It is loaded at runtime
159-
# via NPPRuntimeLoader.cpp, following the same pattern as NVCUVID.
160-
161158
make_torchcodec_sublibrary(
162159
"${core_library_name}"
163160
SHARED

0 commit comments

Comments
 (0)