Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion scripts/exrrfs_prep_ic.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ else
mem_list=("000") # if deterministic
fi

if [[ "${RESILIENT_ENSEMBLE:-"FALSE"}" == "TRUE" ]]; then
mem_list=("${ENS_INDEX}")
fi


for index in "${mem_list[@]}"; do # loop through all the members
# Determine path
if (( 10#${index} == 0 )); then
Expand Down Expand Up @@ -187,7 +192,11 @@ done # done for all the members
# parallel run the serial tasks
#
${cpreq} "${EXECrrfs}"/rank_run.x .
${MPI_RUN_CMD} ./rank_run.x "${DATA}/script_prep_ic_*.sh"
if [[ "${RESILIENT_ENSEMBLE:-"FALSE"}" == "TRUE" ]]; then
${MPI_RUN_CMD} ./rank_run.x "${DATA}/script_prep_ic_${pid}.sh"

@guoqing-noaa guoqing-noaa Apr 24, 2026

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is only to run one script?
If so, we don't need to do ${MPI_RUN_CMD} ./rank_run.x .
We can run ${DATA}/script_prep_ic_${pid}.sh directly

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't run it directly unless we update its execute permissions, since the scripts this script creates only have r/w enabled. rank_run.x works around that and seems harmless to me.

else
${MPI_RUN_CMD} ./rank_run.x "${DATA}/script_prep_ic_*.sh"
fi

# Check for errors
export err=$?
Expand Down
8 changes: 7 additions & 1 deletion workflow/rocoto_funcs/fcst.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
# begin of fcst --------------------------------------------------------


def fcst(xmlFile, expdir, do_ensemble=False, dcEnsGrpInfo=None, do_spinup=False):
def fcst(xmlFile, expdir, do_ensemble=False, dcEnsGrpInfo=None, do_spinup=False, resilient=False):
meta_id = 'fcst'
if resilient:
meta_id = 'fcst_resilient'
dep_xml = ""
if do_spinup:
cycledefs = 'spinup'
Expand Down Expand Up @@ -69,6 +71,8 @@ def fcst(xmlFile, expdir, do_ensemble=False, dcEnsGrpInfo=None, do_spinup=False)
ens_indices = dcEnsGrpInfo["ens_indices"]
dep_xml = dcEnsGrpInfo["dep_xml"]
group_name = dcEnsGrpInfo["group_name"]
if resilient:
group_name = 'resilient_' + group_name
metatask = True
task_id = f'{meta_id}_m#ens_index#'
dcTaskEnv['ENS_INDEX'] = "#ens_index#"
Expand Down Expand Up @@ -169,6 +173,8 @@ def fcst(xmlFile, expdir, do_ensemble=False, dcEnsGrpInfo=None, do_spinup=False)
prep_ic_dep = f'<taskdep task="prep_ic"/>'
if do_spinup:
prep_ic_dep = f'<taskdep task="prep_ic_spinup"/>'
if resilient:
prep_ic_dep = f'<taskdep task="resilient_prep_ic{ensindexstr}"/>'
prep_lbc_dep = f'\n <taskdep task="prep_lbc{ensindexstr}" cycle_offset="0:00:00"/>'
if "global" in os.getenv("MESH_NAME"):
prep_lbc_dep = ''
Expand Down
38 changes: 34 additions & 4 deletions workflow/rocoto_funcs/prep_ic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
import textwrap
from rocoto_funcs.base import xml_task, get_cascade_env

# begin of fcst --------------------------------------------------------
# begin of prep_ic --------------------------------------------------------


def prep_ic(xmlFile, expdir, do_ensemble=False, spinup_mode=0):
def prep_ic(xmlFile, expdir, do_ensemble=False, spinup_mode=0, resilient=False):
# spinup_mode:
# 0 = no parallel spinup cycles in the experiment
# 1 = a spinup cycle
Expand Down Expand Up @@ -52,6 +52,28 @@ def prep_ic(xmlFile, expdir, do_ensemble=False, spinup_mode=0):

if "global" in os.getenv("MESH_NAME"):
dcTaskEnv['cpreq'] = "ln -snf"
if not (do_ensemble and resilient):
metatask = False
meta_id = ""
meta_bgn = ""
meta_end = ""
ensindexstr = ""
else:
meta_id = 'resilient_prep_ic'

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line 62: suggest changing meta_id to prep_ic_resilient so that it can inherit any resource settings configured for prep_ic, such as NODE_PREP_IC, etc.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need the nodes from NODE_PREP_IC since PREP_IC for ensemble runs several things simultaneously (1:ppn=40 while the rerun just needs 1:ppn=1, which it gets automatically in the current definition). However, I added a line to the exp.ens_conus* sample setup files to set the walltime for the rerun task to 10 minutes like the main PREP_IC requests.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@AmandaBack-NOAA
Thanks for mentioning that the resilient runs use different NODE settings. In this situation, we can define NODE_PREP_IC_RESILIENT explicitly in config_resources/config.base.

We have more resource definitions, such as WALLTIME, ACCOUNT, QUEUE, PARTITION, RESERVATION, NATIVE, etc., although they take default values most of the time.

But there are situations in which prep_ic may configure some of the resources differently from the defaults.
Using a consistent cascading naming scheme (i.e., prep_ic_resilient) lets this task inherit changes made to prep_ic. Thanks!

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't want it to inherit RESERVATION and it shouldn't use a START_TIME from PREP_IC either. Probably best to evaluate these settings on a case-by-case basis instead of making a broad rule of inheriting.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@AmandaBack-NOAA Thanks for the discussion. If possible, we would like to be consistent unless there are unresolvable technical challenges. Most users will automatically classify prep_ic and its resilient counterpart into the same category.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The version of the PR originally submitted used the default resources for RESILIENT_PREP_IC and specified resources for RESILIENT_FCST. The specified resources for RESILIENT_FCST were removed at your request @guoqing-noaa but as you've since pointed out there could be some issues arising from that.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For realtime runs, we need more tests to make sure a rerun of ensemble forecasts can be done in time without a reservation.

@AmandaBack-NOAA AmandaBack-NOAA Apr 27, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's an interesting opinion. Right now, the procedure as delineated by @hu5970 is that someone should notice that real-time runs have stopped, should find the member whose forecast failed, manually replace that member's GETKF analysis with the background, and then reboot the forecast task. This PR automates that procedure.

@guoqing-noaa guoqing-noaa Apr 27, 2026

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For realtime runs, we need more tests to make sure a rerun of ensemble forecasts can be done in time without a reservation.

I mean, the fcst task may stay in the queue for a long time if there is no reservation, and hence may not resolve the issue in time.
Manual rebooting uses the same reservation.

I may have misunderstood. Do you want to use reservations or not for the fcst rerun?
Also, if this PR is to address the realtime issue, we expect to modify the file exp/rt_ursa/exp.rrfsv2x_ens.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah. In that case it's right for the backup fcst task to inherit the reservation. Thanks!

metatask = True
task_id = f'{meta_id}_m#ens_index#'
dcTaskEnv['ENS_INDEX'] = "#ens_index#"
dcTaskEnv['RESILIENT_ENSEMBLE'] = "TRUE"
meta_bgn = ""
meta_end = ""
ens_size = int(os.getenv('ENS_SIZE', '2'))
ens_indices = ''.join(f'{i:03d} ' for i in range(1, int(ens_size) + 1)).strip()
meta_bgn = f'''
<metatask name="{meta_id}">
<var name="ens_index">{ens_indices}</var>'''
meta_end = f'\
</metatask>\n'
ensindexstr = "_m#ens_index#"
dcTaskEnv['KEEPDATA'] = get_cascade_env(f"KEEPDATA_{task_id}".upper()).upper()
# dependencies
coldhrs = coldhrs.split(' ')
Expand Down Expand Up @@ -192,6 +214,14 @@ def prep_ic(xmlFile, expdir, do_ensemble=False, spinup_mode=0):
</or>
</and>
</dependency>'''

# if this is a re-run because fcst died, that is the lone dependency.
if resilient:
dependencies = f'''
<dependency>
<taskdep state="Dead" task="fcst{ensindexstr}"/>
</dependency>'''
#
xml_task(xmlFile, expdir, task_id, cycledefs, dcTaskEnv, dependencies, command_id="PREP_IC")
# end of fcst --------------------------------------------------------
xml_task(xmlFile, expdir, task_id, cycledefs, dcTaskEnv, dependencies,
metatask, meta_id, meta_bgn, meta_end, "PREP_IC")
# end of prep_ic --------------------------------------------------------
3 changes: 3 additions & 0 deletions workflow/rocoto_funcs/setup_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,9 @@ def setup_xml(HOMErrfs, expdir):
listEnsGrpInfo = smart_ens_groups('fcst')
for dcEnsGrpInfo in listEnsGrpInfo["group_list"]:
fcst(xmlFile, expdir, do_ensemble=True, dcEnsGrpInfo=dcEnsGrpInfo)
if os.getenv("RESILIENT_ENSEMBLE", "FALSE").upper() == "TRUE":
prep_ic(xmlFile, expdir, do_ensemble=True, resilient=True)
fcst(xmlFile, expdir, do_ensemble=True, dcEnsGrpInfo=dcEnsGrpInfo, resilient=True)
if os.getenv('DO_CYC', 'FALSE').upper() == "TRUE":
save_for_next(xmlFile, expdir, do_ensemble=True)
if os.getenv("DO_POST", "TRUE").upper() == "TRUE":
Expand Down
10 changes: 9 additions & 1 deletion workflow/rocoto_funcs/upp.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,15 @@ def upp(xmlFile, expdir, index, dcGrpInfo, do_ensemble=False, do_ensmean_post=Fa
starttime = get_cascade_env(f"STARTTIME_{meta_id}".upper())
timedep = f'\n <timedep><cyclestr offset="{starttime}">@Y@m@d@H@M00</cyclestr></timedep>'
#
dependencies = f'''
if do_ensmean_post:
dependencies = f'''
<dependency>
<and>{timedep}
<taskdep task="mpassit{ensindexstr}_g{index:02d}"/>
</and>
</dependency>'''
else:
dependencies = f'''
<dependency>
<and>{timedep}
<taskdep task="mpassit_g{index:02d}{ensindexstr}"/>
Expand Down
2 changes: 1 addition & 1 deletion workflow/sideload/launch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ case ${task_id} in
module load "rrfs/${MACHINE}.${COMPILER}"
module load wgrib2
;;
prep_ic)
prep_ic|resilient_prep_ic)
module purge
module load "rrfs/${MACHINE}.${COMPILER}"
module load nco
Expand Down
Loading