Describe the Bug:
Illegal memory access (CUDA ERR 700) when trying to use onedpl sort in a sycl program built with adaptivecpp (for nvidia gpu; i have not tested it on different vendors).
To be clear, oneDPL works fine when built with dpcpp.
So, I am not sure if this is a compiler issue, or some special handling on oneDPL's side. Just wanted to have it documented somewhere.
To Reproduce:
The steps to reproduce the behavior:
File src/sort_repro.cpp:
#include <sycl/sycl.hpp>
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/iterator>
#include <iostream>
#include <numeric>
#include <vector>
#include <cstdlib>
static sycl::queue& get_queue()
{
static sycl::queue q({sycl::property::queue::in_order()});
return q;
}
static auto& get_policy()
{
static auto p = oneapi::dpl::execution::make_device_policy(get_queue());
return p;
}
#ifdef USE_USM
// USM path: sycl::malloc_shared, pass raw pointers to oneDPL
static bool run_sort(std::size_t n)
{
auto& q = get_queue();
double* data = sycl::malloc_shared<double>(n, q);
if (!data) {
std::cerr << " sycl::malloc_shared failed for n=" << n << "\n";
return false;
}
// Fill descending: n-1, n-2, ..., 1, 0
for (std::size_t i = 0; i < n; ++i)
data[i] = static_cast<double>(n - 1 - i);
oneapi::dpl::sort(get_policy(), data, data + n);
q.wait();
// Verify
bool ok = true;
for (std::size_t i = 0; i < n && i < 10; ++i) {
if (data[i] != static_cast<double>(i)) {
ok = false;
break;
}
}
sycl::free(data, q);
return ok;
}
#else
// NO_USM path: sycl::buffer over host vector, oneDPL buffer iterators
static bool run_sort(std::size_t n)
{
std::vector<double> host(n);
// Fill descending
for (std::size_t i = 0; i < n; ++i)
host[i] = static_cast<double>(n - 1 - i);
{
sycl::buffer<double, 1> buf(host.data(), sycl::range<1>(n));
oneapi::dpl::sort(get_policy(),
oneapi::dpl::begin(buf),
oneapi::dpl::end(buf));
} // buffer destructs -> writeback to host
// Verify
for (std::size_t i = 0; i < n && i < 10; ++i) {
if (host[i] != static_cast<double>(i))
return false;
}
return true;
}
#endif
int main()
{
auto& q = get_queue();
std::cout << "Device: " << q.get_device().get_info<sycl::info::device::name>() << "\n";
#ifdef USE_USM
std::cout << "Mode: USM (sycl::malloc_shared)\n";
#else
std::cout << "Mode: NO_USM (sycl::buffer)\n";
#endif
std::cout << "\n";
// Sweep sizes: 4, 16, 64, ..., 4M
// On acpp 25.10 + oneDPL, expect crash around 1M (NO_USM) or
// immediate kernel-load failure (USM).
const std::size_t sizes[] = {
4, 16, 64, 256, 1024, 4096, 16384, 65536,
262144, 1048576, 4194304
};
for (std::size_t n : sizes) {
std::cout << " sort(n=" << n << ") ... " << std::flush;
bool ok = run_sort(n);
if (ok)
std::cout << "OK\n";
else
std::cout << "FAILED (verification)\n";
}
std::cout << "\nAll sizes completed without crash.\n";
return 0;
}
File CMakeLists.txt:
cmake_minimum_required(VERSION 3.22)
# Pin compiler to acpp BEFORE project().
set(CMAKE_CXX_COMPILER acpp CACHE STRING "" FORCE)
# Target: "generic" JITs at runtime to whichever GPU ACPP_VISIBILITY_MASK picks.
# Override with -DACPP_TARGETS=cuda:sm_70 (V100S) or cuda:sm_89 (RTX 2000 Ada)
# for AOT compilation if desired.
set(ACPP_TARGETS "generic" CACHE STRING
"Value for --acpp-targets= (generic | cuda:sm_XX)")
set(_acpp_flags "--acpp-targets=${ACPP_TARGETS}")
set(CMAKE_CXX_FLAGS_INIT "${_acpp_flags}")
set(CMAKE_EXE_LINKER_FLAGS_INIT "${_acpp_flags}")
project(acpp_onedpl_sort_crash LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE)
endif ()
# Conda-only paths (avoid system libpfm / stale headers)
list(APPEND CMAKE_IGNORE_PATH
/usr/include /usr/local/include
/usr/lib /usr/lib64 /usr/local/lib /usr/local/lib64)
find_package(oneDPL REQUIRED)
find_package(TBB REQUIRED)
# NO_USM variant: sycl::buffer path (crashes with CUDA:700 at size ~1M)
add_executable(sort_repro_nousm src/sort_repro.cpp)
target_compile_options(sort_repro_nousm PRIVATE -O3)
target_link_libraries(sort_repro_nousm PRIVATE oneDPL TBB::tbb)
add_executable(sort_repro_usm src/sort_repro.cpp)
target_compile_definitions(sort_repro_usm PRIVATE USE_USM)
target_compile_options(sort_repro_usm PRIVATE -O3)
target_link_libraries(sort_repro_usm PRIVATE oneDPL TBB::tbb)
message(STATUS "acpp_onedpl_sort_crash: ACPP_TARGETS = ${ACPP_TARGETS}")
message(STATUS " Run with:")
message(STATUS " ACPP_VISIBILITY_MASK=cuda ./sort_repro_nousm")
message(STATUS " ACPP_VISIBILITY_MASK=cuda ./sort_repro_usm")
message(STATUS " Expected: crash at size ~1M (nousm) or immediate (usm)")
message(STATUS " Tested GPUs: V100S (sm_70), RTX 2000 Ada (sm_89)")
The
The following information might be useful:
- CMake command: see above.
- oneDPL version: bundled with oneAPI Basekit 2026.3.0
- Compiler version: AdaptiveCpp 25.10.0
- OS: Ubuntu 22 and Archlinux
- Device and driver version:
- Ubuntu 22 with V100S: Driver 555.42.02 (CUDA 12.5); NVHPC 2025_257 12.9
- Archlinux with RTX2000Ada: Driver 595.71.05 (CUDA 13.2) ; NVHPC 2025_257 12.9
Expected Behavior:
Building a SYCL application that uses oneDPL's parallel sort (USM or Buffer) with AdaptiveCpp results in illegal memory access.
Additional Context:
AdaptiveCpp's own parallel STL works fine without a crash.
#include <algorithm>
#include <execution>
#include <functional>
#include <iostream>
#include <vector>
static bool run_sort(std::size_t n)
{
std::vector<double> v(n);
for (std::size_t i = 0; i < n; ++i)
v[i] = static_cast<double>(n - 1 - i);
// Explicit comparator to bypass the no-comparator template bug
std::sort(std::execution::par_unseq, v.begin(), v.end(), std::less<>{});
for (std::size_t i = 0; i < n && i < 10; ++i) {
if (v[i] != static_cast<double>(i))
return false;
}
return true;
}
int main()
{
std::cout << "Mode: stdpar (std::sort + std::execution::par_unseq)\n\n";
const std::size_t sizes[] = {
4, 16, 64, 256, 1024, 4096, 16384, 65536,
262144, 1048576, 4194304
};
for (std::size_t n : sizes) {
std::cout << " sort(n=" << n << ") ... " << std::flush;
bool ok = run_sort(n);
if (ok)
std::cout << "OK\n";
else
std::cout << "FAILED (verification)\n";
}
std::cout << "\nAll sizes completed without crash.\n";
return 0;
}
Describe the Bug:
Illegal memory access (CUDA ERR 700) when trying to use onedpl sort in a sycl program built with adaptivecpp (for nvidia gpu; i have not tested it on different vendors).
To be clear, oneDPL works fine when built with dpcpp.
So, I am not sure if this is a compiler issue, or some special handling on oneDPL's side. Just wanted to have it documented somewhere.
To Reproduce:
The steps to reproduce the behavior:
File
src/sort_repro.cpp:File
CMakeLists.txt:The
The following information might be useful:
Expected Behavior:
Building a SYCL application that uses oneDPL's parallel sort (USM or Buffer) with AdaptiveCpp results in illegal memory access.
Additional Context:
AdaptiveCpp's own parallel STL works fine without a crash.