meta-pytorch
diff --git a/‎.github/workflows/linux_cuda_wheel.yaml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/linux_cuda_wheel.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/windows_cuda_wheel.yaml‎
Lines changed: 0 additions & 1 deletion b/‎.github/workflows/windows_cuda_wheel.yaml‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 2 additions & 2 deletions b/‎README.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/torchcodec/_core/BetaCudaDeviceInterface.cpp‎
Lines changed: 18 additions & 194 deletions b/‎src/torchcodec/_core/BetaCudaDeviceInterface.cpp‎
Lines changed: 18 additions & 194 deletions
diff --git a/‎src/torchcodec/_core/BetaCudaDeviceInterface.h‎
Lines changed: 2 additions & 12 deletions b/‎src/torchcodec/_core/BetaCudaDeviceInterface.h‎
Lines changed: 2 additions & 12 deletions
diff --git a/‎src/torchcodec/_core/CMakeLists.txt‎
Lines changed: 1 addition & 4 deletions b/‎src/torchcodec/_core/CMakeLists.txt‎
Lines changed: 1 addition & 4 deletions
@@ -97,7 +97,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
           # We install conda packages at the start because otherwise conda may have conflicts with dependencies.
           # Note: xorg-libxau was added to fix a problem with ffmpeg 4. We should consider removing it.
-          default-packages: "nvidia/label/cuda-${{ matrix.cuda-version }}.0::libnpp nvidia::cuda-nvrtc=${{ matrix.cuda-version }} nvidia::cuda-toolkit=${{ matrix.cuda-version }} nvidia::cuda-cudart=${{ matrix.cuda-version }} nvidia::cuda-driver-dev=${{ matrix.cuda-version }} conda-forge::ffmpeg=${{ matrix.ffmpeg-version-for-tests }} conda-forge::xorg-libxau"
+          default-packages: "nvidia::cuda-nvrtc=${{ matrix.cuda-version }} nvidia::cuda-toolkit=${{ matrix.cuda-version }} nvidia::cuda-cudart=${{ matrix.cuda-version }} nvidia::cuda-driver-dev=${{ matrix.cuda-version }} conda-forge::ffmpeg=${{ matrix.ffmpeg-version-for-tests }} conda-forge::xorg-libxau"
       - name: Check env, set LD_LIBRARY_PATH
         run: |
           ${CONDA_RUN} env
@@ -224,7 +224,7 @@ jobs:
         with:
           python-version: ${{ env.PYTHON_VERSION }}
           # We install conda packages at the start because otherwise conda may have conflicts with dependencies.
-          default-packages: "nvidia/label/cuda-${{ env.CUDA_VERSION }}.0::libnpp nvidia::cuda-nvrtc=${{ env.CUDA_VERSION }} nvidia::cuda-toolkit=${{ env.CUDA_VERSION }} nvidia::cuda-cudart=${{ env.CUDA_VERSION }} nvidia::cuda-driver-dev=${{ env.CUDA_VERSION }} conda-forge::ffmpeg=${{ env.FFMPEG_VERSION }}"
+          default-packages: "nvidia::cuda-nvrtc=${{ env.CUDA_VERSION }} nvidia::cuda-toolkit=${{ env.CUDA_VERSION }} nvidia::cuda-cudart=${{ env.CUDA_VERSION }} nvidia::cuda-driver-dev=${{ env.CUDA_VERSION }} conda-forge::ffmpeg=${{ env.FFMPEG_VERSION }}"
       - name: Check env, set LD_LIBRARY_PATH
         run: |
           ${CONDA_RUN} env
 
@@ -107,7 +107,6 @@ jobs:
       - name: Install CUDA and FFmpeg conda packages
         run: |
           conda install -y \
-            "nvidia/label/cuda-${{ matrix.cuda-version }}.0::libnpp" \
             "nvidia::cuda-nvrtc=${{ matrix.cuda-version }}" \
             "nvidia::cuda-cudart=${{ matrix.cuda-version }}" \
             "conda-forge::ffmpeg=${{ matrix.ffmpeg-version-for-tests }}"
 
@@ -128,8 +128,8 @@ Make sure you have a GPU with NVDEC hardware that can decode the format you
 want. Refer to Nvidia's GPU support matrix
 [here](https://developer.nvidia.com/video-encode-and-decode-gpu-support-matrix-new).
 
-You will need the `libnpp` and `libnvrtc` CUDA libraries, which are usually
-part of the CUDA Toolkit.
+You will need the `libnvrtc` CUDA library, which is usually part of the CUDA
+Toolkit.
 
 To select a specific CUDA Toolkit version, use `--index-url`. Make sure to
 install the corresponding PyTorch version as well (refer to the
 
@@ -17,9 +17,8 @@
 #include "Logging.h"
 #include "NVDECCache.h"
 
-#include "NPPRuntimeLoader.h"
 #include "NVCUVIDRuntimeLoader.h"
-#include "P016ToRGB16.h"
+#include "color_conversion.h"
 #include "nvcuvid_include/cuviddec.h"
 #include "nvcuvid_include/nvcuvid.h"
 
@@ -252,8 +251,8 @@ std::optional<cudaVideoSurfaceFormat> getNVDECSurfaceFormat(
 
   // P016 is typically not supported on 8-bit SDR content. In such cases, we
   // try to fall back to NV12 if supported:
-  // NVDEC will decode to NV12, NPP will do NV12 -> RGB producing uint8, and
-  // maybePermuteAndConvertToFloat32 will cast uint8 -> float32.
+  // NVDEC will decode to NV12, our kernel will do NV12 -> RGB producing
+  // uint8, and maybePermuteAndConvertToFloat32 will cast uint8 -> float32.
   // For HDR content, NV12 would lose precision, so we fall back to CPU instead.
   if (preferredFormat == cudaVideoSurfaceFormat_P016 && bitDepthMinus8 == 0 &&
       ((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1)) {
@@ -269,156 +268,8 @@ void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) {
   cudaFree(opaque);
 }
 
-static void computeP016ColorMatrix(
-    AVColorSpace colorspace,
-    AVColorRange colorRange,
-    int bitDepth,
-    float outMatrix[3][4]) {
-  float kr, kg, kb;
-  switch (colorspace) {
-    case AVCOL_SPC_BT709:
-      kr = 0.2126f;
-      kg = 0.7152f;
-      kb = 0.0722f;
-      break;
-    case AVCOL_SPC_BT2020_NCL:
-    case AVCOL_SPC_BT2020_CL:
-      kr = 0.2627f;
-      kg = 0.6780f;
-      kb = 0.0593f;
-      break;
-    default:
-      // BT.601
-      kr = 0.299f;
-      kg = 0.587f;
-      kb = 0.114f;
-      break;
-  }
-
-  float vScale = 2.0f * (1.0f - kr);
-  float uScale = 2.0f * (1.0f - kb);
-  float guCoeff = -(2.0f * kb * (1.0f - kb)) / kg;
-  float gvCoeff = -(2.0f * kr * (1.0f - kr)) / kg;
-
-  float maxVal = static_cast<float>((1 << bitDepth) - 1);
-  float outScale = 65535.0f;
-
-  bool isFullRange = (colorRange == AVCOL_RANGE_JPEG);
-
-  if (isFullRange) {
-    float yScale = outScale / maxVal;
-    float uvCenter = static_cast<float>(1 << (bitDepth - 1));
-
-    outMatrix[0][0] = yScale;
-    outMatrix[0][1] = 0.0f;
-    outMatrix[0][2] = vScale * outScale / maxVal;
-    outMatrix[0][3] = -vScale * uvCenter * outScale / maxVal;
-
-    outMatrix[1][0] = yScale;
-    outMatrix[1][1] = guCoeff * outScale / maxVal;
-    outMatrix[1][2] = gvCoeff * outScale / maxVal;
-    outMatrix[1][3] = -(guCoeff + gvCoeff) * uvCenter * outScale / maxVal;
-
-    outMatrix[2][0] = yScale;
-    outMatrix[2][1] = uScale * outScale / maxVal;
-    outMatrix[2][2] = 0.0f;
-    outMatrix[2][3] = -uScale * uvCenter * outScale / maxVal;
-  } else {
-    float s = static_cast<float>(1 << (bitDepth - 8));
-    float yOff = 16.0f * s;
-    float yRange = 219.0f * s;
-    float uvOff = 128.0f * s;
-    float uvRange = 224.0f * s;
-
-    float yCoeff = outScale / yRange;
-    float uvCoeff_u = outScale / uvRange;
-    float uvCoeff_v = outScale / uvRange;
-
-    outMatrix[0][0] = yCoeff;
-    outMatrix[0][1] = 0.0f;
-    outMatrix[0][2] = vScale * uvCoeff_v;
-    outMatrix[0][3] = -yCoeff * yOff - vScale * uvCoeff_v * uvOff;
-
-    outMatrix[1][0] = yCoeff;
-    outMatrix[1][1] = guCoeff * uvCoeff_u;
-    outMatrix[1][2] = gvCoeff * uvCoeff_v;
-    outMatrix[1][3] = -yCoeff * yOff - guCoeff * uvCoeff_u * uvOff -
-        gvCoeff * uvCoeff_v * uvOff;
-
-    outMatrix[2][0] = yCoeff;
-    outMatrix[2][1] = uScale * uvCoeff_u;
-    outMatrix[2][2] = 0.0f;
-    outMatrix[2][3] = -yCoeff * yOff - uScale * uvCoeff_u * uvOff;
-  }
-}
 } // namespace
 
-static torch::stable::Tensor convertP016FrameToRGB16(
-    UniqueAVFrame& avFrame,
-    const StableDevice& device,
-    cudaStream_t nvdecStream,
-    std::optional<torch::stable::Tensor> preAllocatedOutputTensor,
-    const FrameDims& outputDims,
-    int bitDepth,
-    const float colorMatrix[3][4],
-    bool colorMatrixChanged) {
-  // avFrame dimensions may be odd (NVDEC display area for VP9 etc.). P016
-  // color conversion requires even dimensions, so we round up to even for the
-  // kernel, then crop to outputDims.
-  int frameHeight = avFrame->height;
-  int frameWidth = avFrame->width;
-  int height = roundUpToEven(frameHeight);
-  int width = roundUpToEven(frameWidth);
-
-  int outHeight = outputDims.height;
-  int outWidth = outputDims.width;
-  bool needsCrop = (outHeight != height) || (outWidth != width);
-
-  torch::stable::Tensor dst;
-  if (needsCrop) {
-    dst = allocateEmptyHWCTensor(
-        FrameDims(height, width), device, OutputDtype::FLOAT32);
-  } else if (preAllocatedOutputTensor.has_value()) {
-    dst = preAllocatedOutputTensor.value();
-  } else {
-    dst = allocateEmptyHWCTensor(
-        FrameDims(outHeight, outWidth), device, OutputDtype::FLOAT32);
-  }
-
-  cudaStream_t stream = getCurrentCudaStream(device.index());
-  syncStreams(/*runningStream=*/nvdecStream, /*waitingStream=*/stream);
-
-  launchP016ToRGB16Kernel(
-      reinterpret_cast<const uint16_t*>(avFrame->data[0]),
-      reinterpret_cast<const uint16_t*>(avFrame->data[1]),
-      dst.mutable_data_ptr<uint16_t>(),
-      width,
-      height,
-      avFrame->linesize[0],
-      avFrame->linesize[1],
-      validateInt64ToInt(dst.stride(0) * 2, "dst.stride(0)*2"),
-      bitDepth,
-      colorMatrix,
-      colorMatrixChanged,
-      stream);
-
-  if (needsCrop) {
-    if (outHeight != height) {
-      dst = torch::stable::narrow(dst, /*dim=*/0, /*start=*/0, outHeight);
-    }
-    if (outWidth != width) {
-      dst = torch::stable::narrow(dst, /*dim=*/1, /*start=*/0, outWidth);
-      dst = torch::stable::contiguous(dst);
-    }
-    if (preAllocatedOutputTensor.has_value()) {
-      torch::stable::copy_(preAllocatedOutputTensor.value(), dst);
-      return preAllocatedOutputTensor.value();
-    }
-    return dst;
-  }
-  return dst;
-}
-
 BetaCudaDeviceInterface::BetaCudaDeviceInterface(const StableDevice& device)
     : DeviceInterface(device) {
   STD_TORCH_CHECK(g_cuda_nvdec, "NvdecCudaDeviceInterface was not registered!");
@@ -427,16 +278,6 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const StableDevice& device)
 
   initializeCudaContextWithPytorch(device_);
 
-  // Note: we could consider *not* erroring when NPP is unavailable, and just
-  // fallback to the CPU for the color-conversion. This would be similar to what
-  // we do when NVCUVID is not available (we fallback to the CPU for the
-  // decoding step).
-  STD_TORCH_CHECK(
-      loadNPPLibrary(),
-      "Failed to load NPP library. NPP is required for CUDA color conversion.");
-
-  nppCtx_ = getNppStreamContext(device_);
-
   nvcuvidAvailable_ = loadNVCUVIDLibrary();
 }
 
@@ -537,8 +378,6 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
     cuvidDestroyVideoParser(videoParser_);
     videoParser_ = nullptr;
   }
-
-  returnNppStreamContextToCache(device_, std::move(nppCtx_));
 }
 
 void BetaCudaDeviceInterface::initialize(
@@ -789,8 +628,8 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
   procParams.progressive_frame = dispInfo.progressive_frame;
   procParams.top_field_first = dispInfo.top_field_first;
   procParams.unpaired_field = dispInfo.repeat_first_field < 0;
-  // We set the NVDEC stream to the current stream. It will be waited upon by
-  // the NPP stream before any color conversion.
+  // We set the NVDEC stream to the current stream. It will be waited upon
+  // by the color conversion stream before any color conversion.
   // Re types: we get a cudaStream_t from PyTorch but it's interchangeable with
   // CUstream
   procParams.output_stream =
@@ -1138,37 +977,22 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
 
   auto convertFrame = [&](std::optional<torch::stable::Tensor> preAlloc)
       -> torch::stable::Tensor {
-    if (gpuFrame->format == AV_PIX_FMT_P016LE) {
-      int bitDepth = cpuFallback_
+    bool isP016 = (gpuFrame->format == AV_PIX_FMT_P016LE);
+    int bitDepth = 8;
+    if (isP016) {
+      bitDepth = cpuFallback_
           ? codecContext_->bits_per_raw_sample
           : static_cast<int>(videoFormat_.bit_depth_luma_minus8) + 8;
-      AVColorSpace colorspace = gpuFrame->colorspace;
-      AVColorRange colorRange = gpuFrame->color_range;
-      bool colorMatrixChanged = false;
-      if (!cachedColorMatrix_.valid ||
-          cachedColorMatrix_.colorspace != colorspace ||
-          cachedColorMatrix_.colorRange != colorRange ||
-          cachedColorMatrix_.bitDepth != bitDepth) {
-        computeP016ColorMatrix(
-            colorspace, colorRange, bitDepth, cachedColorMatrix_.matrix);
-        cachedColorMatrix_.colorspace = colorspace;
-        cachedColorMatrix_.colorRange = colorRange;
-        cachedColorMatrix_.bitDepth = bitDepth;
-        cachedColorMatrix_.valid = true;
-        colorMatrixChanged = true;
-      }
-      return convertP016FrameToRGB16(
-          gpuFrame,
-          device_,
-          nvdecStream,
-          preAlloc,
-          originalDims,
-          bitDepth,
-          cachedColorMatrix_.matrix,
-          colorMatrixChanged);
     }
-    return convertNV12FrameToRGB(
-        gpuFrame, device_, nppCtx_, nvdecStream, preAlloc, originalDims);
+    return convertYUVFrameToRGB(
+        gpuFrame,
+        device_,
+        nvdecStream,
+        preAlloc,
+        originalDims,
+        isP016,
+        bitDepth,
+        cachedColorMatrix_);
   };
 
   if (rotation_ == Rotation::NONE) {
 
@@ -21,6 +21,7 @@
 #include "FFMPEGCommon.h"
 #include "NVDECCache.h"
 #include "Transform.h"
+#include "color_conversion.h"
 
 #include <memory>
 #include <mutex>
@@ -108,9 +109,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {
 
   UniqueAVBSFContext bitstreamFilter_;
 
-  // NPP context for color conversion
-  UniqueNppContext nppCtx_;
-
   std::unique_ptr<DeviceInterface> cpuFallback_;
   bool nvcuvidAvailable_ = false;
   UniqueSwsContext swsContext_;
@@ -120,15 +118,7 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   OutputDtype outputDtype_ = OutputDtype::UINT8;
   cudaVideoSurfaceFormat surfaceFormat_ = cudaVideoSurfaceFormat_NV12;
 
-  struct CachedP016ColorMatrix {
-    AVColorSpace colorspace = AVCOL_SPC_UNSPECIFIED;
-    AVColorRange colorRange = AVCOL_RANGE_UNSPECIFIED;
-    int bitDepth = 0;
-    float matrix[3][4] = {};
-    bool valid = false;
-  };
-
-  CachedP016ColorMatrix cachedColorMatrix_;
+  CachedColorMatrix cachedColorMatrix_;
 };
 
 } // namespace facebook::torchcodec
 
@@ -147,17 +147,14 @@ function(make_torchcodec_libraries
 
     if(ENABLE_CUDA)
         enable_language(CUDA)
-	    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp NPPRuntimeLoader.cpp P016ToRGB16.cu)
+	    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp color_conversion.cpp color_conversion.cu)
     endif()
 
     set(core_library_dependencies
         ${ffmpeg_target}
         ${TORCH_LIBRARIES}
     )
 
-    # Note: NPP (nppi, nppicc) is NOT linked here. It is loaded at runtime
-    # via NPPRuntimeLoader.cpp, following the same pattern as NVCUVID.
-
     make_torchcodec_sublibrary(
         "${core_library_name}"
         SHARED