@@ -28,6 +28,7 @@ namespace facebook::torchcodec {
2828namespace xpu {
2929
3030const char * USE_SYCL_KERNELS = std::getenv(" USE_SYCL_KERNELS" );
31+ const char * FORCE_CPU_FALLBACK = std::getenv(" FORCE_CPU_FALLBACK" );
3132
3233static bool g_xpu = registerDeviceInterface(
3334 DeviceInterfaceKey (StableDeviceType::XPU ),
@@ -65,6 +66,13 @@ inline bool use_sycl_color_conversion_kernel() {
6566#endif
6667}
6768
69+ inline bool force_cpu_fallback () {
70+ if (!FORCE_CPU_FALLBACK ) {
71+ return false ;
72+ }
73+ return to_bool (FORCE_CPU_FALLBACK );
74+ }
75+
6876bool has_fp64 (const StableDevice& device) {
6977 int deviceIndex = getDeviceIndex (device);
7078 sycl::device syclDevice = c10::xpu::get_raw_device (deviceIndex);
@@ -127,36 +135,6 @@ torch::stable::Tensor allocateEmptyHWCTensor(
127135 device);
128136}
129137
130- // Self-contained SW NV12/YUV->RGB24 conversion for the CPU fallback path.
131- // Uses libswscale directly rather than delegating to CpuDeviceInterface: the
132- // relevant torchcodec symbols (CpuDeviceInterface ctor, createDeviceInterface)
133- // are not exported from the installed libtorchcodec_core6.so, so delegation is
134- // not linkable against the shipped wheel.
135- void convertSWFrameToRGB_sws (
136- AVFrame* avFrame,
137- torch::stable::Tensor& dstRGB_CPU) {
138- const int width = avFrame->width ;
139- const int height = avFrame->height ;
140- auto srcFormat = static_cast <AVPixelFormat>(avFrame->format );
141-
142- SwsContext* sws = sws_getContext (width, height, srcFormat, width, height,
143- AV_PIX_FMT_RGB24 , SWS_BILINEAR , nullptr , nullptr , nullptr );
144- TORCH_CHECK (
145- sws != nullptr , " sws_getContext failed for " , av_get_pix_fmt_name (srcFormat),
146- " -> RGB24 at " , width, " x" ,
147- height);
148-
149- uint8_t * dstData[4 ] = {static_cast <uint8_t *>(dstRGB_CPU.mutable_data_ptr ()),
150- nullptr , nullptr , nullptr };
151- int dstLinesize[4 ] = {width * 3 , 0 , 0 , 0 };
152-
153- int scaled = sws_scale (sws, avFrame->data , avFrame->linesize , 0 ,
154- height, dstData, dstLinesize);
155- sws_freeContext (sws);
156- TORCH_CHECK (
157- scaled == height, " sws_scale produced " , scaled, " lines, expected " , height);
158- }
159-
160138} // namespace xpu
161139
162140int getDeviceIndex (const StableDevice& device) {
@@ -184,11 +162,13 @@ XpuDeviceInterface::XpuDeviceInterface(const StableDevice& device)
184162 {1 }, kStableUInt8 , std::nullopt , StableDevice (device));
185163
186164 auto arch = xpu::getArchitecture (device);
187- // Checking for devices which don't have HW media engines so we can skip
188- // initialization of VAAPI context.
189- if (arch != sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc &&
165+ if (!xpu::force_cpu_fallback ()) {
166+ // Checking for devices which don't have HW media engines so we can skip
167+ // initialization of VAAPI context.
168+ if (arch != sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc &&
190169 arch != sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc_vg) {
191170 ctx_ = xpu::getVaapiContext (device_);
171+ }
192172 }
193173
194174 if (xpu::use_sycl_color_conversion_kernel ()) {
@@ -216,6 +196,13 @@ void XpuDeviceInterface::initialize(
216196 TORCH_CHECK (avStream != nullptr , " avStream is null" );
217197 codecContext_ = codecContext;
218198 timeBase_ = avStream->time_base ;
199+
200+ cpuInterface_ = createDeviceInterface (kStableCPU );
201+ STD_TORCH_CHECK (
202+ cpuInterface_ != nullptr , " Failed to create CPU device interface" );
203+ cpuInterface_->initialize (avStream, avFormatCtx, codecContext);
204+ cpuInterface_->initializeVideo (
205+ VideoStreamOptions (), {}, /* resizedOutputDims=*/ std::nullopt );
219206}
220207
221208void XpuDeviceInterface::initializeVideo (
@@ -369,25 +356,17 @@ void XpuDeviceInterface::convertAVFrameToFrameOutput(
369356 // general or on this particular device. In this case we have a frame on the
370357 // CPU. We send the frame back to the XPU device when we're done.
371358
372- // Self-contained SW->RGB conversion via libswscale. We do not delegate to
373- // CpuDeviceInterface because its constructor and the createDeviceInterface
374- // factory are not exported from the installed libtorchcodec_core wheel.
375- auto frameDims = FrameDims (avFrame->height , avFrame->width );
376- torch::stable::Tensor cpuRGB = torch::stable::empty (
377- {frameDims.height , frameDims.width , 3 },
378- kStableUInt8 ,
379- std::nullopt ,
380- StableDevice (kStableCPU ));
381- xpu::convertSWFrameToRGB_sws (avFrame.get (), cpuRGB);
359+ FrameOutput cpuFrameOutput;
360+ cpuInterface_->convertAVFrameToFrameOutput (avFrame, cpuFrameOutput);
382361
383362 // Finally, we need to send the frame back to the GPU. Note that the
384363 // pre-allocated tensor is on the GPU, so we can't send that to the CPU
385364 // device interface. We copy it over here.
386365 if (preAllocatedOutputTensor.has_value ()) {
387- torch::stable::copy_ (preAllocatedOutputTensor.value (), cpuRGB );
366+ torch::stable::copy_ (preAllocatedOutputTensor.value (), cpuFrameOutput. data );
388367 frameOutput.data = preAllocatedOutputTensor.value ();
389368 } else {
390- frameOutput.data = torch::stable::to (cpuRGB , device_);
369+ frameOutput.data = torch::stable::to (cpuFrameOutput. data , device_);
391370 }
392371 return ;
393372 }
0 commit comments