1717#include " Logging.h"
1818#include " NVDECCache.h"
1919
20- #include " NPPRuntimeLoader.h"
2120#include " NVCUVIDRuntimeLoader.h"
22- #include " P016ToRGB16 .h"
21+ #include " color_conversion .h"
2322#include " nvcuvid_include/cuviddec.h"
2423#include " nvcuvid_include/nvcuvid.h"
2524
@@ -252,8 +251,8 @@ std::optional<cudaVideoSurfaceFormat> getNVDECSurfaceFormat(
252251
253252 // P016 is typically not supported on 8-bit SDR content. In such cases, we
254253 // try to fall back to NV12 if supported:
255- // NVDEC will decode to NV12, NPP will do NV12 -> RGB producing uint8, and
256- // maybePermuteAndConvertToFloat32 will cast uint8 -> float32.
254+ // NVDEC will decode to NV12, our kernel will do NV12 -> RGB producing
255+ // uint8, and maybePermuteAndConvertToFloat32 will cast uint8 -> float32.
257256 // For HDR content, NV12 would lose precision, so we fall back to CPU instead.
258257 if (preferredFormat == cudaVideoSurfaceFormat_P016 && bitDepthMinus8 == 0 &&
259258 ((caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1 )) {
@@ -269,156 +268,8 @@ void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) {
269268 cudaFree (opaque);
270269}
271270
272- static void computeP016ColorMatrix (
273- AVColorSpace colorspace,
274- AVColorRange colorRange,
275- int bitDepth,
276- float outMatrix[3 ][4 ]) {
277- float kr, kg, kb;
278- switch (colorspace) {
279- case AVCOL_SPC_BT709 :
280- kr = 0 .2126f ;
281- kg = 0 .7152f ;
282- kb = 0 .0722f ;
283- break ;
284- case AVCOL_SPC_BT2020_NCL :
285- case AVCOL_SPC_BT2020_CL :
286- kr = 0 .2627f ;
287- kg = 0 .6780f ;
288- kb = 0 .0593f ;
289- break ;
290- default :
291- // BT.601
292- kr = 0 .299f ;
293- kg = 0 .587f ;
294- kb = 0 .114f ;
295- break ;
296- }
297-
298- float vScale = 2 .0f * (1 .0f - kr);
299- float uScale = 2 .0f * (1 .0f - kb);
300- float guCoeff = -(2 .0f * kb * (1 .0f - kb)) / kg;
301- float gvCoeff = -(2 .0f * kr * (1 .0f - kr)) / kg;
302-
303- float maxVal = static_cast <float >((1 << bitDepth) - 1 );
304- float outScale = 65535 .0f ;
305-
306- bool isFullRange = (colorRange == AVCOL_RANGE_JPEG );
307-
308- if (isFullRange) {
309- float yScale = outScale / maxVal;
310- float uvCenter = static_cast <float >(1 << (bitDepth - 1 ));
311-
312- outMatrix[0 ][0 ] = yScale;
313- outMatrix[0 ][1 ] = 0 .0f ;
314- outMatrix[0 ][2 ] = vScale * outScale / maxVal;
315- outMatrix[0 ][3 ] = -vScale * uvCenter * outScale / maxVal;
316-
317- outMatrix[1 ][0 ] = yScale;
318- outMatrix[1 ][1 ] = guCoeff * outScale / maxVal;
319- outMatrix[1 ][2 ] = gvCoeff * outScale / maxVal;
320- outMatrix[1 ][3 ] = -(guCoeff + gvCoeff) * uvCenter * outScale / maxVal;
321-
322- outMatrix[2 ][0 ] = yScale;
323- outMatrix[2 ][1 ] = uScale * outScale / maxVal;
324- outMatrix[2 ][2 ] = 0 .0f ;
325- outMatrix[2 ][3 ] = -uScale * uvCenter * outScale / maxVal;
326- } else {
327- float s = static_cast <float >(1 << (bitDepth - 8 ));
328- float yOff = 16 .0f * s;
329- float yRange = 219 .0f * s;
330- float uvOff = 128 .0f * s;
331- float uvRange = 224 .0f * s;
332-
333- float yCoeff = outScale / yRange;
334- float uvCoeff_u = outScale / uvRange;
335- float uvCoeff_v = outScale / uvRange;
336-
337- outMatrix[0 ][0 ] = yCoeff;
338- outMatrix[0 ][1 ] = 0 .0f ;
339- outMatrix[0 ][2 ] = vScale * uvCoeff_v;
340- outMatrix[0 ][3 ] = -yCoeff * yOff - vScale * uvCoeff_v * uvOff;
341-
342- outMatrix[1 ][0 ] = yCoeff;
343- outMatrix[1 ][1 ] = guCoeff * uvCoeff_u;
344- outMatrix[1 ][2 ] = gvCoeff * uvCoeff_v;
345- outMatrix[1 ][3 ] = -yCoeff * yOff - guCoeff * uvCoeff_u * uvOff -
346- gvCoeff * uvCoeff_v * uvOff;
347-
348- outMatrix[2 ][0 ] = yCoeff;
349- outMatrix[2 ][1 ] = uScale * uvCoeff_u;
350- outMatrix[2 ][2 ] = 0 .0f ;
351- outMatrix[2 ][3 ] = -yCoeff * yOff - uScale * uvCoeff_u * uvOff;
352- }
353- }
354271} // namespace
355272
356- static torch::stable::Tensor convertP016FrameToRGB16 (
357- UniqueAVFrame& avFrame,
358- const StableDevice& device,
359- cudaStream_t nvdecStream,
360- std::optional<torch::stable::Tensor> preAllocatedOutputTensor,
361- const FrameDims& outputDims,
362- int bitDepth,
363- const float colorMatrix[3 ][4 ],
364- bool colorMatrixChanged) {
365- // avFrame dimensions may be odd (NVDEC display area for VP9 etc.). P016
366- // color conversion requires even dimensions, so we round up to even for the
367- // kernel, then crop to outputDims.
368- int frameHeight = avFrame->height ;
369- int frameWidth = avFrame->width ;
370- int height = roundUpToEven (frameHeight);
371- int width = roundUpToEven (frameWidth);
372-
373- int outHeight = outputDims.height ;
374- int outWidth = outputDims.width ;
375- bool needsCrop = (outHeight != height) || (outWidth != width);
376-
377- torch::stable::Tensor dst;
378- if (needsCrop) {
379- dst = allocateEmptyHWCTensor (
380- FrameDims (height, width), device, OutputDtype::FLOAT32 );
381- } else if (preAllocatedOutputTensor.has_value ()) {
382- dst = preAllocatedOutputTensor.value ();
383- } else {
384- dst = allocateEmptyHWCTensor (
385- FrameDims (outHeight, outWidth), device, OutputDtype::FLOAT32 );
386- }
387-
388- cudaStream_t stream = getCurrentCudaStream (device.index ());
389- syncStreams (/* runningStream=*/ nvdecStream, /* waitingStream=*/ stream);
390-
391- launchP016ToRGB16Kernel (
392- reinterpret_cast <const uint16_t *>(avFrame->data [0 ]),
393- reinterpret_cast <const uint16_t *>(avFrame->data [1 ]),
394- dst.mutable_data_ptr <uint16_t >(),
395- width,
396- height,
397- avFrame->linesize [0 ],
398- avFrame->linesize [1 ],
399- validateInt64ToInt (dst.stride (0 ) * 2 , " dst.stride(0)*2" ),
400- bitDepth,
401- colorMatrix,
402- colorMatrixChanged,
403- stream);
404-
405- if (needsCrop) {
406- if (outHeight != height) {
407- dst = torch::stable::narrow (dst, /* dim=*/ 0 , /* start=*/ 0 , outHeight);
408- }
409- if (outWidth != width) {
410- dst = torch::stable::narrow (dst, /* dim=*/ 1 , /* start=*/ 0 , outWidth);
411- dst = torch::stable::contiguous (dst);
412- }
413- if (preAllocatedOutputTensor.has_value ()) {
414- torch::stable::copy_ (preAllocatedOutputTensor.value (), dst);
415- return preAllocatedOutputTensor.value ();
416- }
417- return dst;
418- }
419- return dst;
420- }
421-
422273BetaCudaDeviceInterface::BetaCudaDeviceInterface (const StableDevice& device)
423274 : DeviceInterface(device) {
424275 STD_TORCH_CHECK (g_cuda_nvdec, " NvdecCudaDeviceInterface was not registered!" );
@@ -427,16 +278,6 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const StableDevice& device)
427278
428279 initializeCudaContextWithPytorch (device_);
429280
430- // Note: we could consider *not* erroring when NPP is unavailable, and just
431- // fallback to the CPU for the color-conversion. This would be similar to what
432- // we do when NVCUVID is not available (we fallback to the CPU for the
433- // decoding step).
434- STD_TORCH_CHECK (
435- loadNPPLibrary (),
436- " Failed to load NPP library. NPP is required for CUDA color conversion." );
437-
438- nppCtx_ = getNppStreamContext (device_);
439-
440281 nvcuvidAvailable_ = loadNVCUVIDLibrary ();
441282}
442283
@@ -537,8 +378,6 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
537378 cuvidDestroyVideoParser (videoParser_);
538379 videoParser_ = nullptr ;
539380 }
540-
541- returnNppStreamContextToCache (device_, std::move (nppCtx_));
542381}
543382
544383void BetaCudaDeviceInterface::initialize (
@@ -789,8 +628,8 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
789628 procParams.progressive_frame = dispInfo.progressive_frame ;
790629 procParams.top_field_first = dispInfo.top_field_first ;
791630 procParams.unpaired_field = dispInfo.repeat_first_field < 0 ;
792- // We set the NVDEC stream to the current stream. It will be waited upon by
793- // the NPP stream before any color conversion.
631+ // We set the NVDEC stream to the current stream. It will be waited upon
632+ // by the color conversion stream before any color conversion.
794633 // Re types: we get a cudaStream_t from PyTorch but it's interchangeable with
795634 // CUstream
796635 procParams.output_stream =
@@ -1138,37 +977,22 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
1138977
1139978 auto convertFrame = [&](std::optional<torch::stable::Tensor> preAlloc)
1140979 -> torch::stable::Tensor {
1141- if (gpuFrame->format == AV_PIX_FMT_P016LE ) {
1142- int bitDepth = cpuFallback_
980+ bool isP016 = (gpuFrame->format == AV_PIX_FMT_P016LE );
981+ int bitDepth = 8 ;
982+ if (isP016) {
983+ bitDepth = cpuFallback_
1143984 ? codecContext_->bits_per_raw_sample
1144985 : static_cast <int >(videoFormat_.bit_depth_luma_minus8 ) + 8 ;
1145- AVColorSpace colorspace = gpuFrame->colorspace ;
1146- AVColorRange colorRange = gpuFrame->color_range ;
1147- bool colorMatrixChanged = false ;
1148- if (!cachedColorMatrix_.valid ||
1149- cachedColorMatrix_.colorspace != colorspace ||
1150- cachedColorMatrix_.colorRange != colorRange ||
1151- cachedColorMatrix_.bitDepth != bitDepth) {
1152- computeP016ColorMatrix (
1153- colorspace, colorRange, bitDepth, cachedColorMatrix_.matrix );
1154- cachedColorMatrix_.colorspace = colorspace;
1155- cachedColorMatrix_.colorRange = colorRange;
1156- cachedColorMatrix_.bitDepth = bitDepth;
1157- cachedColorMatrix_.valid = true ;
1158- colorMatrixChanged = true ;
1159- }
1160- return convertP016FrameToRGB16 (
1161- gpuFrame,
1162- device_,
1163- nvdecStream,
1164- preAlloc,
1165- originalDims,
1166- bitDepth,
1167- cachedColorMatrix_.matrix ,
1168- colorMatrixChanged);
1169986 }
1170- return convertNV12FrameToRGB (
1171- gpuFrame, device_, nppCtx_, nvdecStream, preAlloc, originalDims);
987+ return convertYUVFrameToRGB (
988+ gpuFrame,
989+ device_,
990+ nvdecStream,
991+ preAlloc,
992+ originalDims,
993+ isP016,
994+ bitDepth,
995+ cachedColorMatrix_);
1172996 };
1173997
1174998 if (rotation_ == Rotation::NONE ) {
0 commit comments