Skip to content

Commit c7cccb0

Browse files
committed
torchcodec-xpu: delegate CPU fallback to CpuDeviceInterface
Changes: * Delegate CPU fallback to CpuDeviceInterface * Introduce `FORCE_CPU_FALLBACK` to easy verify the fallback Fixes: #44 Requires: meta-pytorch/torchcodec#1346 Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
1 parent e22139d commit c7cccb0

3 files changed

Lines changed: 30 additions & 45 deletions

File tree

packages/torchcodec-xpu/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ The following environment variables can be used to customize the behavior of Int
2424

2525
* `USE_SYCL_KERNELS = on|off` (default: `off`) - use SYCL kernels for augmentation such as color space conversion instead of VAAPI interface. If SYCL kernels are requested but can not be used due to hardware limitations, then fallback to VAAPI will be attempted.
2626

27+
* `FORCE_CPU_FALLBACL=on|off` (default: `off`) - force CPU fallback.
28+
2729
## Known limitations
2830

2931
* [Intel® Data Center GPU Max Series][PVC] (Ponte Vecchio, PVC) GPUs are not supported due to missing hardware media engines

packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp

Lines changed: 24 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ namespace facebook::torchcodec {
2828
namespace xpu {
2929

3030
const char* USE_SYCL_KERNELS = std::getenv("USE_SYCL_KERNELS");
31+
const char* FORCE_CPU_FALLBACK = std::getenv("FORCE_CPU_FALLBACK");
3132

3233
static bool g_xpu = registerDeviceInterface(
3334
DeviceInterfaceKey(StableDeviceType::XPU),
@@ -65,6 +66,13 @@ inline bool use_sycl_color_conversion_kernel() {
6566
#endif
6667
}
6768

69+
inline bool force_cpu_fallback() {
70+
if (!FORCE_CPU_FALLBACK) {
71+
return false;
72+
}
73+
return to_bool(FORCE_CPU_FALLBACK);
74+
}
75+
6876
bool has_fp64(const StableDevice& device) {
6977
int deviceIndex = getDeviceIndex(device);
7078
sycl::device syclDevice = c10::xpu::get_raw_device(deviceIndex);
@@ -127,36 +135,6 @@ torch::stable::Tensor allocateEmptyHWCTensor(
127135
device);
128136
}
129137

130-
// Self-contained SW NV12/YUV->RGB24 conversion for the CPU fallback path.
131-
// Uses libswscale directly rather than delegating to CpuDeviceInterface: the
132-
// relevant torchcodec symbols (CpuDeviceInterface ctor, createDeviceInterface)
133-
// are not exported from the installed libtorchcodec_core6.so, so delegation is
134-
// not linkable against the shipped wheel.
135-
void convertSWFrameToRGB_sws(
136-
AVFrame* avFrame,
137-
torch::stable::Tensor& dstRGB_CPU) {
138-
const int width = avFrame->width;
139-
const int height = avFrame->height;
140-
auto srcFormat = static_cast<AVPixelFormat>(avFrame->format);
141-
142-
SwsContext* sws = sws_getContext(width, height, srcFormat, width, height,
143-
AV_PIX_FMT_RGB24, SWS_BILINEAR, nullptr, nullptr, nullptr);
144-
TORCH_CHECK(
145-
sws != nullptr, "sws_getContext failed for ", av_get_pix_fmt_name(srcFormat),
146-
" -> RGB24 at ", width, "x",
147-
height);
148-
149-
uint8_t* dstData[4] = {static_cast<uint8_t*>(dstRGB_CPU.mutable_data_ptr()),
150-
nullptr, nullptr, nullptr};
151-
int dstLinesize[4] = {width * 3, 0, 0, 0};
152-
153-
int scaled = sws_scale(sws, avFrame->data, avFrame->linesize, 0,
154-
height, dstData, dstLinesize);
155-
sws_freeContext(sws);
156-
TORCH_CHECK(
157-
scaled == height, "sws_scale produced ", scaled, " lines, expected ", height);
158-
}
159-
160138
} // namespace xpu
161139

162140
int getDeviceIndex(const StableDevice& device) {
@@ -184,11 +162,13 @@ XpuDeviceInterface::XpuDeviceInterface(const StableDevice& device)
184162
{1}, kStableUInt8, std::nullopt, StableDevice(device));
185163

186164
auto arch = xpu::getArchitecture(device);
187-
// Checking for devices which don't have HW media engines so we can skip
188-
// initialization of VAAPI context.
189-
if (arch != sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc &&
165+
if (!xpu::force_cpu_fallback()) {
166+
// Checking for devices which don't have HW media engines so we can skip
167+
// initialization of VAAPI context.
168+
if (arch != sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc &&
190169
arch != sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc_vg) {
191170
ctx_ = xpu::getVaapiContext(device_);
171+
}
192172
}
193173

194174
if (xpu::use_sycl_color_conversion_kernel()) {
@@ -216,6 +196,13 @@ void XpuDeviceInterface::initialize(
216196
TORCH_CHECK(avStream != nullptr, "avStream is null");
217197
codecContext_ = codecContext;
218198
timeBase_ = avStream->time_base;
199+
200+
cpuInterface_ = createDeviceInterface(kStableCPU);
201+
STD_TORCH_CHECK(
202+
cpuInterface_ != nullptr, "Failed to create CPU device interface");
203+
cpuInterface_->initialize(avStream, avFormatCtx, codecContext);
204+
cpuInterface_->initializeVideo(
205+
VideoStreamOptions(), {}, /*resizedOutputDims=*/std::nullopt);
219206
}
220207

221208
void XpuDeviceInterface::initializeVideo(
@@ -369,25 +356,17 @@ void XpuDeviceInterface::convertAVFrameToFrameOutput(
369356
// general or on this particular device. In this case we have a frame on the
370357
// CPU. We send the frame back to the XPU device when we're done.
371358

372-
// Self-contained SW->RGB conversion via libswscale. We do not delegate to
373-
// CpuDeviceInterface because its constructor and the createDeviceInterface
374-
// factory are not exported from the installed libtorchcodec_core wheel.
375-
auto frameDims = FrameDims(avFrame->height, avFrame->width);
376-
torch::stable::Tensor cpuRGB = torch::stable::empty(
377-
{frameDims.height, frameDims.width, 3},
378-
kStableUInt8,
379-
std::nullopt,
380-
StableDevice(kStableCPU));
381-
xpu::convertSWFrameToRGB_sws(avFrame.get(), cpuRGB);
359+
FrameOutput cpuFrameOutput;
360+
cpuInterface_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);
382361

383362
// Finally, we need to send the frame back to the GPU. Note that the
384363
// pre-allocated tensor is on the GPU, so we can't send that to the CPU
385364
// device interface. We copy it over here.
386365
if (preAllocatedOutputTensor.has_value()) {
387-
torch::stable::copy_(preAllocatedOutputTensor.value(), cpuRGB);
366+
torch::stable::copy_(preAllocatedOutputTensor.value(), cpuFrameOutput.data);
388367
frameOutput.data = preAllocatedOutputTensor.value();
389368
} else {
390-
frameOutput.data = torch::stable::to(cpuRGB, device_);
369+
frameOutput.data = torch::stable::to(cpuFrameOutput.data, device_);
391370
}
392371
return;
393372
}

packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ class XpuDeviceInterface : public DeviceInterface {
3939
std::nullopt) override;
4040

4141
private:
42+
// We sometimes encounter frames that cannot be decoded on the XPU device.
43+
// Rather than erroring out, we decode them on the CPU.
44+
std::unique_ptr<DeviceInterface> cpuInterface_;
45+
4246
VideoStreamOptions videoStreamOptions_;
4347
AVRational timeBase_;
4448
bool has_fp64_;

0 commit comments

Comments
 (0)