Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,13 @@ jobs:
xpmem_version: master
sos_config: --with-xpmem=${XPMEM_INSTALL_DIR} --enable-error-checking
--enable-remote-virtual-addressing --enable-pmi-simple
--enable-hard-polling
libfabric_version: v2.1.x
- config_name: XPMEM shared atomics
xpmem_version: master
sos_config: --with-xpmem=${XPMEM_INSTALL_DIR} --enable-shr-atomics
--enable-error-checking --enable-pmi-simple
--enable-hard-polling
libfabric_version: v2.1.x
- config_name: RVA, thread completion
sos_config: --enable-error-checking --enable-remote-virtual-addressing
Expand Down Expand Up @@ -517,7 +519,7 @@ jobs:
sos_config: [--enable-pmi-simple --disable-fortran,
--with-cma --enable-error-checking --enable-profiling
--enable-pmi-simple --disable-fortran --with-hwloc=no,
--with-xpmem --enable-error-checking --enable-pmi-simple --with-hwloc=no]
--with-xpmem --enable-error-checking --enable-pmi-simple --with-hwloc=no --enable-hard-polling]
steps:
- name: Checking OS version
run: |
Expand Down Expand Up @@ -620,7 +622,7 @@ jobs:
matrix:
include:
- config_name: XPMEM with Shared Atomics
sos_config: --with-xpmem --enable-shr-atomics --enable-error-checking --enable-pmi-simple
sos_config: --with-xpmem --enable-shr-atomics --enable-error-checking --enable-pmi-simple --enable-hard-polling
portals4_version: master
xpmem_version: master

Expand Down Expand Up @@ -736,7 +738,7 @@ jobs:
include:
- config_name: transport_none
xpmem_version: master
sos_config: [--with-xpmem --enable-shr-atomics --enable-error-checking --enable-pmi-simple]
sos_config: [--with-xpmem --enable-shr-atomics --enable-error-checking --enable-pmi-simple --enable-hard-polling]

steps:
- name: Checking OS version
Expand Down
1 change: 0 additions & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,6 @@ AM_CONDITIONAL([USE_CMA], [test "$transport_cma" = "yes"])

AS_IF([test "$transport_xpmem" = "yes" -o "$transport_cma" = "yes"],
[AC_DEFINE([USE_ON_NODE_COMMS], [1], [Define if any on-node comm transport is available])
AC_DEFINE([ENABLE_HARD_POLLING], [1], [Enable hard polling])
])

if test "$enable_shr_atomics" = "yes"; then
Expand Down
4 changes: 4 additions & 0 deletions src/transport_ofi.c
Original file line number Diff line number Diff line change
Expand Up @@ -1731,6 +1731,7 @@ static int shmem_transport_ofi_ctx_init(shmem_transport_ctx_t *ctx, int id)
cntr_put_attr.events = FI_CNTR_EVENTS_COMP;
cntr_get_attr.events = FI_CNTR_EVENTS_COMP;

#if 0
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you mean to leave this dead code?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, see comment below.

/* Set FI_WAIT based on the put and get polling limits defined above */
if (shmem_transport_ofi_put_poll_limit < 0) {
cntr_put_attr.wait_obj = FI_WAIT_NONE;
Expand All @@ -1742,6 +1743,9 @@ static int shmem_transport_ofi_ctx_init(shmem_transport_ctx_t *ctx, int id)
} else {
cntr_get_attr.wait_obj = FI_WAIT_UNSPEC;
}
#endif
cntr_put_attr.wait_obj = FI_WAIT_UNSPEC;
cntr_get_attr.wait_obj = FI_WAIT_UNSPEC;

/* Allow provider to choose CQ size, since we are using FI_RM_ENABLED.
* Context format is used to return bounce buffer pointers in the event
Expand Down
47 changes: 3 additions & 44 deletions src/transport_ofi.h
Original file line number Diff line number Diff line change
Expand Up @@ -504,28 +504,8 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx)
* reverse order: first the fid_cntr event counter, then the put issued
* counter. We'll want to preserve this property in the future.
*/
uint64_t success, fail, cnt, cnt_new;
long poll_count = 0;
while (poll_count < shmem_transport_ofi_put_poll_limit ||
shmem_transport_ofi_put_poll_limit < 0) {
success = fi_cntr_read(ctx->put_cntr);
fail = fi_cntr_readerr(ctx->put_cntr);
cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr);

shmem_transport_probe();

if (success < cnt && fail == 0) {
SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx);
SPINLOCK_BODY();
SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx);
} else if (fail) {
RAISE_ERROR_MSG("Operations completed in error (%" PRIu64 ")\n", fail);
} else {
SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx);
return;
}
poll_count++;
}
uint64_t cnt, cnt_new;

cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr);
do {
cnt = cnt_new;
Expand Down Expand Up @@ -965,31 +945,10 @@ void shmem_transport_get_wait(shmem_transport_ctx_t* ctx)
* reverse order: first the fid_cntr event counter, then the get issued
* counter. We'll want to preserve this property in the future.
*/
uint64_t success, fail, cnt, cnt_new;
long poll_count = 0;
uint64_t cnt, cnt_new;

SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx);

while (poll_count < shmem_transport_ofi_get_poll_limit ||
shmem_transport_ofi_get_poll_limit < 0) {
success = fi_cntr_read(ctx->get_cntr);
fail = fi_cntr_readerr(ctx->get_cntr);
cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr);

shmem_transport_probe();

if (success < cnt && fail == 0) {
SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx);
SPINLOCK_BODY();
SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx);
} else if (fail) {
RAISE_ERROR_MSG("Operations completed in error (%" PRIu64 ")\n", fail);
} else {
SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx);
return;
}
poll_count++;
}
cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr);
do {
cnt = cnt_new;
Expand Down