Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG_AMD.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,16 @@ Full documentation for ROCgdb is available at

### Added

- Add support for local memory out-of-address-range reporting for
architectures that support it. Local memory out-of-address-range
reporting is controlled with the following commands:
- set amdgpu lds-out-of-addr-exception
- show amdgpu lds-out-of-addr-exception

## ROCgdb-X for ROCm-next

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these new added lines are duplicate and should be removed.

## ROCgdb-X for ROCm-next

### Added

The new entries should also go at the end of the "Added" list.


### Added

- GDB now determines the name of AMD GPU threads based on the name of
their kernel function.
- Support for the HIP language and its built-in variables:
Expand Down
133 changes: 125 additions & 8 deletions gdb/amd-dbgapi-target.c
Original file line number Diff line number Diff line change
Expand Up @@ -208,11 +208,15 @@ struct amd_dbgapi_inferior_info
{
explicit amd_dbgapi_inferior_info
(inferior *inf, bool precise_memory_requested = false,
bool precise_alu_exceptions_requested = false)
bool precise_alu_exceptions_requested = false,
bool local_memory_out_of_addr_range_exception_requested = false)

: inf (inf)
{
precise_memory.requested = precise_memory_requested;
precise_alu_exceptions.requested = precise_alu_exceptions_requested;
local_memory_out_of_addr_range_exception.requested =
local_memory_out_of_addr_range_exception_requested;
}

/* Backlink to inferior. */
Expand Down Expand Up @@ -252,6 +256,10 @@ struct amd_dbgapi_inferior_info
enablement. */
struct dbgapi_feature_request precise_alu_exceptions;

/* Track the status of local memory out-of-address-range exception reporting
request and enablement. */
struct dbgapi_feature_request local_memory_out_of_addr_range_exception;

gdb::unordered_map<decltype (amd_dbgapi_breakpoint_id_t::handle),
struct breakpoint *>
breakpoint_map;
Expand Down Expand Up @@ -2495,6 +2503,33 @@ set_process_alu_exceptions_precision (amd_dbgapi_inferior_info &info)
get_status_string (status));
}

/* Set the process' local memory out-of-address-range exception reporting mode.

Warn if the requested mode is not supported by at least one of the agents
in the process. */

static void
set_process_local_memory_out_of_addr_range_exception (amd_dbgapi_inferior_info &info)
{
auto mode = (info.local_memory_out_of_addr_range_exception.requested
? AMD_DBGAPI_GROUP_SEGMENT_EXCEPTIONS_OUT_OF_ADDR_RANGE
: AMD_DBGAPI_GROUP_SEGMENT_EXCEPTIONS_NONE);

amd_dbgapi_status_t status
= amd_dbgapi_set_group_segment_out_of_addr_range_exception
(info.process_id, mode);

if (status == AMD_DBGAPI_STATUS_SUCCESS)
info.local_memory_out_of_addr_range_exception.enabled =
info.local_memory_out_of_addr_range_exception.requested;
else if (status == AMD_DBGAPI_STATUS_ERROR_NOT_SUPPORTED)
warning (_("AMDGPU local memory out-of-address-range exception reporting could not "

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could not be enabled -> is not supported.

"be enabled."));
else if (status != AMD_DBGAPI_STATUS_SUCCESS)
error (_("amd_dbgapi_set_local_memory_out_of_addr_range_exception failed (%s)"),
get_status_string (status));
}

/* Handle extra initialisation after we have attached to a AMDGPU corefile. */

static void
Expand Down Expand Up @@ -2617,6 +2652,7 @@ attach_amd_dbgapi (inferior *inf)

set_process_memory_precision (info);
set_process_alu_exceptions_precision (info);
set_process_local_memory_out_of_addr_range_exception (info);

/* If GDB is attaching to a process that has the runtime loaded, there will
already be a "runtime loaded" event available. Consume it and push the
Expand Down Expand Up @@ -2669,7 +2705,8 @@ detach_amd_dbgapi (inferior *inf)
/* Reset the amd_dbgapi_inferior_info, except for precise_memory_mode and
precise_alu_exceptions. */
info = amd_dbgapi_inferior_info (inf, info.precise_memory.requested,
info.precise_alu_exceptions.requested);
info.precise_alu_exceptions.requested,
info.local_memory_out_of_addr_range_exception.requested);

maybe_reset_amd_dbgapi ();
}
Expand Down Expand Up @@ -3074,12 +3111,14 @@ amd_dbgapi_target_inferior_cloned (inferior *original_inferior,
= get_amd_dbgapi_inferior_info (new_inferior);

/* At this point, the process is not started. Therefore it is sufficient to
copy the precise memory request, it will be applied when the process
starts. */
copy the precise memory and local memory request, it will be applied when
the process starts. */

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We now have two things, so some words must be adjusted for plural:

request => requests
it will be => they will be

gdb_assert (new_info.process_id == AMD_DBGAPI_PROCESS_NONE);
new_info.precise_memory.requested = orig_info.precise_memory.requested;
new_info.precise_alu_exceptions.requested
= orig_info.precise_alu_exceptions.requested;
new_info.local_memory_out_of_addr_range_exception.requested
= orig_info.local_memory_out_of_addr_range_exception.requested;
}

/* inferior_execd observer. */
Expand All @@ -3092,14 +3131,17 @@ amd_dbgapi_inferior_execd (inferior *exec_inf, inferior *follow_inf)
the new process image. */
detach_amd_dbgapi (exec_inf);

/* If using "follow-exec-mode new", carry over the precise-memory setting
to the new inferior (otherwise, FOLLOW_INF and ORIG_INF point to the same
inferior, so this is a no-op). */
/* If using "follow-exec-mode new", carry over the precise-memory and local-memory
settings to the new inferior (otherwise, FOLLOW_INF and ORIG_INF point to
the same inferior, so this is a no-op). */
get_amd_dbgapi_inferior_info (follow_inf).precise_memory.requested
= get_amd_dbgapi_inferior_info (exec_inf).precise_memory.requested;
get_amd_dbgapi_inferior_info (follow_inf).precise_alu_exceptions.requested
= get_amd_dbgapi_inferior_info (exec_inf)
.precise_alu_exceptions.requested;
get_amd_dbgapi_inferior_info (follow_inf).local_memory_out_of_addr_range_exception
.requested = get_amd_dbgapi_inferior_info (exec_inf)
.local_memory_out_of_addr_range_exception.requested;

attach_amd_dbgapi (follow_inf);
}
Expand All @@ -3112,7 +3154,7 @@ amd_dbgapi_inferior_forked (inferior *parent_inf, inferior *child_inf,
{
if (child_inf != nullptr)
{
/* Copy precise-memory requested value from parent to child. */
/* Copy precise-memory and local-memory requested values from parent to child. */
const amd_dbgapi_inferior_info &parent_info
= get_amd_dbgapi_inferior_info (parent_inf);
amd_dbgapi_inferior_info &child_info
Expand All @@ -3121,6 +3163,8 @@ amd_dbgapi_inferior_forked (inferior *parent_inf, inferior *child_inf,
= parent_info.precise_memory.requested;
child_info.precise_alu_exceptions.requested
= parent_info.precise_alu_exceptions.requested;
child_info.local_memory_out_of_addr_range_exception.requested
= parent_info.local_memory_out_of_addr_range_exception.requested;

if (fork_kind != TARGET_WAITKIND_VFORKED)
{
Expand Down Expand Up @@ -3350,6 +3394,13 @@ may not be accurate. See \"show amdgpu precise-alu-exceptions\".\n"));
gdb_printf (_("\
Warning: precise memory violation signal reporting is not enabled, reported\n\
location may not be accurate. See \"show amdgpu precise-memory\".\n"));
}
if (!info.local_memory_out_of_addr_range_exception.enabled)
{
gdb_printf (_("\
Warning: local memory out-of-address-range exception reporting is not enabled.\n\
See \"show amdgpu lds-addr-range-exception\".\n"));

}
return;
default:
Expand Down Expand Up @@ -3693,6 +3744,58 @@ get_effective_precise_alu_exception_mode ()
return info.precise_alu_exceptions.enabled;
}

/* Callback for "show amdgpu lds-addr-range-exception". */

static void
show_lds_addr_range_excp_mode (struct ui_file *file, int from_tty,
struct cmd_list_element *c, const char *value)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The alignment of the text is off.

I'm going to suppress other GNU coding standard violations at this point. Please find and fix them.

{
amd_dbgapi_inferior_info &info
= get_amd_dbgapi_inferior_info (current_inferior ());

gdb_printf (file,
_("AMDGPU local memory out-of-address-range exception reporting is %s "
"(currently %s).\n"),
info.local_memory_out_of_addr_range_exception.requested ? "on" : "off",
info.local_memory_out_of_addr_range_exception.enabled ?
"enabled" : "disabled");
}

/* Callback for "set amdgpu lds-addr-range-exception". */

static void
set_lds_addr_range_excp_mode (bool value)
{
amd_dbgapi_inferior_info &info
= get_amd_dbgapi_inferior_info (current_inferior ());

info.local_memory_out_of_addr_range_exception.requested = value;

if (info.process_id != AMD_DBGAPI_PROCESS_NONE)
set_process_local_memory_out_of_addr_range_exception (info);
}

/* Get the local memory out-of-address-range exception reporting requested mode. */

static bool
get_lds_addr_range_excp_mode ()
{
amd_dbgapi_inferior_info &info
= get_amd_dbgapi_inferior_info (current_inferior ());
return info.local_memory_out_of_addr_range_exception.requested;
}

/* Get the local memory out-of-address-range exception reporting effective mode. */

static bool
get_effective_local_memory_out_of_addr_range_exception_mode ()
{
amd_dbgapi_inferior_info &info
= get_amd_dbgapi_inferior_info (current_inferior ());
return info.local_memory_out_of_addr_range_exception.enabled;
}


static const char *
get_dbgapi_library_file_path ()
{
Expand Down Expand Up @@ -5035,6 +5138,20 @@ running. If off (default), precise ALU exceptions reporting is disabled."),
cmds.show->var->set_effective_value_getter<bool>
(get_effective_precise_alu_exception_mode);

cmds = add_setshow_boolean_cmd ("lds-addr-range-exception", no_class,
_("Set lds-addr-range-exception mode."),
_("Show lds-addr-range-exception mode."), _("\
If on, local memory out-of-address-range exception reporting is enabled if/when\n\

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looks like you copy/pasted this from the code block above, nonetheless I think "if/when" is not adding any value. a mere "when" would suffice.

the inferior is running. If off (default), local memory out-of-address-range\n\
exception reporting is disabled."),
set_lds_addr_range_excp_mode,
get_lds_addr_range_excp_mode,
show_lds_addr_range_excp_mode,
&set_amdgpu_list, &show_amdgpu_list);

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This setting / feature must be documented in the GDB manual.


cmds.show->var->set_effective_value_getter<bool>
(get_effective_local_memory_out_of_addr_range_exception_mode);

add_cmd ("version", no_set_class, show_dbgapi_version,
_("Show the ROCdbgapi library version and build information."),
&show_amdgpu_list);
Expand Down
18 changes: 9 additions & 9 deletions gdb/configure
Original file line number Diff line number Diff line change
Expand Up @@ -25347,19 +25347,19 @@ if test "$gdb_require_amd_dbgapi" = true \
# version of the library.

pkg_failed=no
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for amd-dbgapi >= 0.80.0" >&5
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for amd-dbgapi >= 0.81.0" >&5
$as_echo_n "checking for amd-dbgapi >= 0.80.0... " >&6; }

if test -n "$AMD_DBGAPI_CFLAGS"; then
pkg_cv_AMD_DBGAPI_CFLAGS="$AMD_DBGAPI_CFLAGS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
{ { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"amd-dbgapi >= 0.80.0\""; } >&5
($PKG_CONFIG --exists --print-errors "amd-dbgapi >= 0.80.0") 2>&5
{ { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"amd-dbgapi >= 0.81.0\""; } >&5
($PKG_CONFIG --exists --print-errors "amd-dbgapi >= 0.81.0") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
pkg_cv_AMD_DBGAPI_CFLAGS=`$PKG_CONFIG --cflags "amd-dbgapi >= 0.80.0" 2>/dev/null`
pkg_cv_AMD_DBGAPI_CFLAGS=`$PKG_CONFIG --cflags "amd-dbgapi >= 0.81.0" 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
Expand All @@ -25371,12 +25371,12 @@ if test -n "$AMD_DBGAPI_LIBS"; then
pkg_cv_AMD_DBGAPI_LIBS="$AMD_DBGAPI_LIBS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
{ { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"amd-dbgapi >= 0.80.0\""; } >&5
($PKG_CONFIG --exists --print-errors "amd-dbgapi >= 0.80.0") 2>&5
{ { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"amd-dbgapi >= 0.81.0\""; } >&5
($PKG_CONFIG --exists --print-errors "amd-dbgapi >= 0.81.0") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
pkg_cv_AMD_DBGAPI_LIBS=`$PKG_CONFIG --libs "amd-dbgapi >= 0.80.0" 2>/dev/null`
pkg_cv_AMD_DBGAPI_LIBS=`$PKG_CONFIG --libs "amd-dbgapi >= 0.81.0" 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
Expand Down Expand Up @@ -25421,9 +25421,9 @@ else
_pkg_short_errors_supported=no
fi
if test $_pkg_short_errors_supported = yes; then
AMD_DBGAPI_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "amd-dbgapi >= 0.80.0" 2>&1`
AMD_DBGAPI_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "amd-dbgapi >= 0.81.0" 2>&1`
else
AMD_DBGAPI_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "amd-dbgapi >= 0.80.0" 2>&1`
AMD_DBGAPI_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "amd-dbgapi >= 0.81.0" 2>&1`
fi
# Put the nasty error message in config.log where it belongs
echo "$AMD_DBGAPI_PKG_ERRORS" >&5
Expand Down
2 changes: 1 addition & 1 deletion gdb/configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ if test "$gdb_require_amd_dbgapi" = true \
# stability until amd-dbgapi hits 1.0, but for convenience, still check for
# greater or equal that version. It can be handy when testing with a newer
# version of the library.
PKG_CHECK_MODULES([AMD_DBGAPI], [amd-dbgapi >= 0.80.0],
PKG_CHECK_MODULES([AMD_DBGAPI], [amd-dbgapi >= 0.81.0],
[has_amd_dbgapi=yes], [has_amd_dbgapi=no])

if test "$has_amd_dbgapi" = "yes"; then
Expand Down
2 changes: 2 additions & 0 deletions gdb/testsuite/gdb.rocm/branch-fault.exp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ proc do_test { } {
"Thread $any received signal SIGSEGV, Segmentation fault\\." \
"Warning: precise memory violation signal reporting is not enabled, reported" \
"location may not be accurate. See \"show amdgpu precise-memory\"\\." \
"Warning: local memory out-of-address-range exception reporting is not enabled\\." \
"See \"show amdgpu lds-addr-range-exception\"\\." \
"\\\[Switching to thread $::decimal, lane 0 \\(AMDGPU Lane $any/0 \\(0,0,0\\)\\\[0,0,0\\\]\\)\\\]" \
"0x0000000000000100 in \\?\\? \\(\\)"] \
"continue to bad branch fault"
Expand Down
68 changes: 68 additions & 0 deletions gdb/testsuite/gdb.rocm/lds-addr-range-exception.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/* Copyright (C) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2026


This file is part of GDB.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */

#include <hip/hip_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include "rocm-test-utils.h"

__global__ void
kern (size_t dyn_alloc, size_t lds_size)
{
extern __shared__ char arr[];

/* Access up to dyn_alloc should always work. */
for (size_t i = 0; i < dyn_alloc; i++)
arr[i] = i;

__syncthreads ();

for (size_t i = 0; i < dyn_alloc; i++)
assert (arr[i] == i);

/* Use the last four bytes of the LDS */
size_t idx = lds_size - 4;
if (threadIdx.x == 0)
{
assert (idx >= dyn_alloc);
arr[idx] = 8;
}
__syncthreads ();

/* This is expected to fail.
One could run this kernel once and expect to fail at the assert, then run
it again with the LDS reporting on, and check you receive the memviol. */
if (threadIdx.x == 0)
assert (arr[idx] == 8);
__syncthreads ();
}

int
main (int argc, char* argv[])
{
hipDeviceProp_t props;
int deviceId;
CHECK (hipGetDevice (&deviceId));
CHECK (hipGetDeviceProperties(&props, deviceId));

constexpr size_t shared_mem_alloc = 64;
size_t lds_size = props.sharedMemPerBlock;
kern<<<1, 128, shared_mem_alloc, 0>>> (shared_mem_alloc, lds_size);
CHECK (hipDeviceSynchronize ());
return 0;
}
Loading