Skip to content

cudaErrorIllegalAddress for ::sort() (both USM and Buffer) when built with ACPP #2679

@salehjg

Description

@salehjg

Describe the Bug:
Illegal memory access (CUDA ERR 700) when trying to use onedpl sort in a sycl program built with adaptivecpp (for nvidia gpu; i have not tested it on different vendors).
To be clear, oneDPL works fine when built with dpcpp.
So, I am not sure if this is a compiler issue, or some special handling on oneDPL's side. Just wanted to have it documented somewhere.

To Reproduce:
The steps to reproduce the behavior:

File src/sort_repro.cpp:

#include <sycl/sycl.hpp>
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/iterator>
#include <iostream>
#include <numeric>
#include <vector>
#include <cstdlib>

static sycl::queue& get_queue()
{
	static sycl::queue q({sycl::property::queue::in_order()});
	return q;
}

static auto& get_policy()
{
	static auto p = oneapi::dpl::execution::make_device_policy(get_queue());
	return p;
}

#ifdef USE_USM
// USM path: sycl::malloc_shared, pass raw pointers to oneDPL
static bool run_sort(std::size_t n)
{
	auto& q = get_queue();
	double* data = sycl::malloc_shared<double>(n, q);
	if (!data) {
		std::cerr << "  sycl::malloc_shared failed for n=" << n << "\n";
		return false;
	}

	// Fill descending: n-1, n-2, ..., 1, 0
	for (std::size_t i = 0; i < n; ++i)
		data[i] = static_cast<double>(n - 1 - i);

	oneapi::dpl::sort(get_policy(), data, data + n);
	q.wait();

	// Verify
	bool ok = true;
	for (std::size_t i = 0; i < n && i < 10; ++i) {
		if (data[i] != static_cast<double>(i)) {
			ok = false;
			break;
		}
	}

	sycl::free(data, q);
	return ok;
}
#else
// NO_USM path: sycl::buffer over host vector, oneDPL buffer iterators
static bool run_sort(std::size_t n)
{
	std::vector<double> host(n);
	// Fill descending
	for (std::size_t i = 0; i < n; ++i)
		host[i] = static_cast<double>(n - 1 - i);

	{
		sycl::buffer<double, 1> buf(host.data(), sycl::range<1>(n));
		oneapi::dpl::sort(get_policy(),
		                  oneapi::dpl::begin(buf),
		                  oneapi::dpl::end(buf));
	} // buffer destructs -> writeback to host

	// Verify
	for (std::size_t i = 0; i < n && i < 10; ++i) {
		if (host[i] != static_cast<double>(i))
			return false;
	}
	return true;
}
#endif

int main()
{
	auto& q = get_queue();
	std::cout << "Device: " << q.get_device().get_info<sycl::info::device::name>() << "\n";
#ifdef USE_USM
	std::cout << "Mode: USM (sycl::malloc_shared)\n";
#else
	std::cout << "Mode: NO_USM (sycl::buffer)\n";
#endif
	std::cout << "\n";

	// Sweep sizes: 4, 16, 64, ..., 4M
	// On acpp 25.10 + oneDPL, expect crash around 1M (NO_USM) or
	// immediate kernel-load failure (USM).
	const std::size_t sizes[] = {
		4, 16, 64, 256, 1024, 4096, 16384, 65536,
		262144, 1048576, 4194304
	};

	for (std::size_t n : sizes) {
		std::cout << "  sort(n=" << n << ") ... " << std::flush;
		bool ok = run_sort(n);
		if (ok)
			std::cout << "OK\n";
		else
			std::cout << "FAILED (verification)\n";
	}

	std::cout << "\nAll sizes completed without crash.\n";
	return 0;
}

File CMakeLists.txt:

cmake_minimum_required(VERSION 3.22)

# Pin compiler to acpp BEFORE project().
set(CMAKE_CXX_COMPILER acpp CACHE STRING "" FORCE)

# Target: "generic" JITs at runtime to whichever GPU ACPP_VISIBILITY_MASK picks.
# Override with -DACPP_TARGETS=cuda:sm_70 (V100S) or cuda:sm_89 (RTX 2000 Ada)
# for AOT compilation if desired.
set(ACPP_TARGETS "generic" CACHE STRING
        "Value for --acpp-targets= (generic | cuda:sm_XX)")

set(_acpp_flags "--acpp-targets=${ACPP_TARGETS}")
set(CMAKE_CXX_FLAGS_INIT        "${_acpp_flags}")
set(CMAKE_EXE_LINKER_FLAGS_INIT "${_acpp_flags}")

project(acpp_onedpl_sort_crash LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

if (NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE)
endif ()

# Conda-only paths (avoid system libpfm / stale headers)
list(APPEND CMAKE_IGNORE_PATH
        /usr/include /usr/local/include
        /usr/lib /usr/lib64 /usr/local/lib /usr/local/lib64)

find_package(oneDPL REQUIRED)
find_package(TBB REQUIRED)

# NO_USM variant: sycl::buffer path (crashes with CUDA:700 at size ~1M)
add_executable(sort_repro_nousm src/sort_repro.cpp)
target_compile_options(sort_repro_nousm PRIVATE -O3)
target_link_libraries(sort_repro_nousm PRIVATE oneDPL TBB::tbb)

add_executable(sort_repro_usm src/sort_repro.cpp)
target_compile_definitions(sort_repro_usm PRIVATE USE_USM)
target_compile_options(sort_repro_usm PRIVATE -O3)
target_link_libraries(sort_repro_usm PRIVATE oneDPL TBB::tbb)

message(STATUS "acpp_onedpl_sort_crash: ACPP_TARGETS = ${ACPP_TARGETS}")
message(STATUS "  Run with:")
message(STATUS "    ACPP_VISIBILITY_MASK=cuda ./sort_repro_nousm")
message(STATUS "    ACPP_VISIBILITY_MASK=cuda ./sort_repro_usm")
message(STATUS "  Expected: crash at size ~1M (nousm) or immediate (usm)")
message(STATUS "  Tested GPUs: V100S (sm_70), RTX 2000 Ada (sm_89)")

The

The following information might be useful:

  • CMake command: see above.
  • oneDPL version: bundled with oneAPI Basekit 2026.3.0
  • Compiler version: AdaptiveCpp 25.10.0
  • OS: Ubuntu 22 and Archlinux
  • Device and driver version:
    • Ubuntu 22 with V100S: Driver 555.42.02 (CUDA 12.5); NVHPC 2025_257 12.9
    • Archlinux with RTX2000Ada: Driver 595.71.05 (CUDA 13.2) ; NVHPC 2025_257 12.9

Expected Behavior:
Building a SYCL application that uses oneDPL's parallel sort (USM or Buffer) with AdaptiveCpp results in illegal memory access.

Additional Context:
AdaptiveCpp's own parallel STL works fine without a crash.

#include <algorithm>
#include <execution>
#include <functional>
#include <iostream>
#include <vector>

static bool run_sort(std::size_t n)
{
	std::vector<double> v(n);
	for (std::size_t i = 0; i < n; ++i)
		v[i] = static_cast<double>(n - 1 - i);

	// Explicit comparator to bypass the no-comparator template bug
	std::sort(std::execution::par_unseq, v.begin(), v.end(), std::less<>{});

	for (std::size_t i = 0; i < n && i < 10; ++i) {
		if (v[i] != static_cast<double>(i))
			return false;
	}
	return true;
}

int main()
{
	std::cout << "Mode: stdpar (std::sort + std::execution::par_unseq)\n\n";

	const std::size_t sizes[] = {
		4, 16, 64, 256, 1024, 4096, 16384, 65536,
		262144, 1048576, 4194304
	};

	for (std::size_t n : sizes) {
		std::cout << "  sort(n=" << n << ") ... " << std::flush;
		bool ok = run_sort(n);
		if (ok)
			std::cout << "OK\n";
		else
			std::cout << "FAILED (verification)\n";
	}

	std::cout << "\nAll sizes completed without crash.\n";
	return 0;
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions