-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsetup
More file actions
executable file
·354 lines (296 loc) · 11 KB
/
Copy pathsetup
File metadata and controls
executable file
·354 lines (296 loc) · 11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
#!/usr/bin/env bash
# ======================
# Check IPSL-AID installation
# ======================
# Abort before doing any work when the `ipsl-aid` command cannot be resolved by
# the shell running this setup script. The diagnostics are sent to stderr so
# they stay visible (and separate) when stdout is piped or captured.
if ! command -v ipsl-aid > /dev/null 2>&1; then
  {
    echo "ERROR: ipsl-aid command not found. Please install the package first using: uv pip install -e ."
    echo " or activate the virtual environment if already installed using: source .venv/bin/activate"
  } >&2
  exit 1
fi
# ======================
# 1. Unified Debug Flag
# ======================
# debug=true  -> SLURM debug job + Python --debug true
# debug=false -> full job + Python --debug false
# Always spell the value in lowercase (true / false), never True / False:
# the script compares it against the literal string "true".
debug=false

# Timestamps taken once, when setup runs.
DATE="$(date '+%Y%m%d')"         # YYYYMMDD
TODAY="$(date '+%Y%m%d_%H%M%S')" # YYYYMMDD_HHMMSS, used in the tag and run prefix
# ======================
# 2. Parameters
# ======================

# --- Run selection ---
# Values this script tests for explicitly: "train_regional",
# "inference_regional", "resume_train" and "inference".
# "train" takes none of those branches.
run_type='train'
# region and region_size are only written to the job script for the
# "*_regional" run types.
region='Europe'
region_size=('288' '720')

# --- Checkpointing ---
save_model=true
save_checkpoint_name='difusion_model'
load_checkpoint_name='difusion_model'
save_per_samples=10000

# --- Year ranges (training, then test) ---
year_start=2015
year_end=2019
year_start_test=2020
year_end_test=2020

# --- Optimisation / data loading ---
batch_size=36 # alternatives noted by the author: 20, 40; use maximum 18 for 1 GPU
num_epochs=8
learning_rate=0.0001
num_workers=16

# --- Input data locations ---
datadir='/leonardo_work/EUHPC_D27_095/kkingston/AI-Downscaling/data/data_FOURxDaily'
# One VARIABLE=DIRECTORY entry per variable, forwarded as --per_var_datadir.
per_var_datadir=(
  'VAR_2T=/leonardo_work/EUHPC_D27_095/kkingston/AI-Downscaling/data/data_FOURxDaily'
  'VAR_10U=/leonardo_work/EUHPC_D27_095/kkingston/AI-Downscaling/data/data_FOURxDaily'
  'VAR_10V=/leonardo_work/EUHPC_D27_095/kkingston/AI-Downscaling/data/data_FOURxDaily'
  'VAR_TP=/leonardo_work/EUHPC_D27_095/kkingston/AI-Downscaling/data/data_FOURxDaily_tp'
  'VAR_D2M=/leonardo_work/EUHPC_D27_095/kkingston/AI-Downscaling/data/data_FOURxDaily_d2m_sstk'
  'VAR_ST=/leonardo_work/EUHPC_D27_095/kkingston/AI-Downscaling/data/data_FOURxDaily_st'
)

# --- Variables, constants, normalisation ---
time_normalization='cos_sin'
varnames_list=(
  'VAR_2T'
  'VAR_10U'
  'VAR_10V'
  'VAR_TP'
  'VAR_D2M'
  'VAR_ST'
)
constant_varnames_list=('z' 'lsm')
constant_varnames_file='ERA5_const_sfc_variables.nc'
# VARIABLE=METHOD entries, forwarded as --normalization_types.
normalization_types=(
  'VAR_2T=standard'
  'VAR_10U=standard'
  'VAR_10V=standard'
  'VAR_TP=log1p_standard'
  'VAR_D2M=standard'
  'VAR_ST=standard'
)
# Forwarded as --units_list; presumably one unit per varnames_list entry, in
# the same order -- confirm against the ipsl-aid CLI.
units_list=('K' 'm/s' 'm/s' 'm/h' 'K' 'K')
dynamic_covariates=() # empty -> --dynamic_covariates is not emitted
dynamic_covariates_dir='../data_covariates/'

# --- Batching and remaining knobs, forwarded unchanged to ipsl-aid ---
sbatch=12
tbatch=1800
batch_size_lat=145
batch_size_lon=361
epsilon=0.02
beta=1.0
margin=8

# --- Model loading / precision ---
pretrained_path='' # empty -> --pretrained_path is not emitted
model_name=''      # placeholder; rebuilt from the configuration further down
dtype='fp32'
arch='adm'     # options are "ddpmpp", "ncsnpp", and "adm"
precond='unet' # "edm" for edm (default model), "unet" for unet

# Channel counts follow the preconditioner:
#   unet      : in_channels = number of variables + constants, no cond_channels
#   otherwise : in_channels = number of variables only,
#               cond_channels = number of variables + constants
case "$precond" in
  unet)
    in_channels=10   # number of variables + constants (lat + lon + z + lsm)
    cond_channels='' # no cond_channels for unet
    ;;
  *) # diffusion-based arch
    in_channels=6    # number of variables only
    cond_channels=10 # number of variables + lat + lon + z + lsm
    ;;
esac
out_channels=6 # number of variables

inference_type='direct' # "direct" for unet, "sampler" for diffusion
compute_crps=false
# CRPS is only kept together with sampler inference; with any other
# inference type a warning is printed and the flag is forced back to false.
if [[ "$inference_type" != "sampler" ]]; then
  if [[ "$compute_crps" == true ]]; then
    echo "WARNING: CRPS requires sampler inference. Disabling compute_crps."
    compute_crps=false
  fi
fi

# EDM Sampler Parameters
num_steps=10
sigma_min=0.002
sigma_max=80.0
rho=7
s_churn=40
solver='heun'

# Filter switch: forwarded as --apply_filter and reflected in the output
# folder name as "filter" / "nofilter".
apply_filter=false
# ======================
# 3. SLURM Configuration
# ======================
SLURM_ACCOUNT='EUHPC_D27_095'    # written as #SBATCH --account
SLURM_PARTITION='boost_usr_prod' # written as #SBATCH --partition
# NOTE(review): SLURM_QOS is not referenced anywhere else in the visible
# script; only the debug branch writes a --qos line (boost_qos_dbg).
# Confirm whether leaving it unused is intended.
SLURM_QOS='boost_qos_bprod'
SLURM_GRES=4             # written as #SBATCH --gpus-per-node
SLURM_NODES=1            # written as #SBATCH --nodes
SLURM_NTASKS_PER_NODE=1  # written as #SBATCH --ntasks-per-node
SLURM_CPUS_PER_TASK=16   # written as #SBATCH --cpus-per-task
# ======================
# Generate Tag and Output Paths
# ======================

# Print all arguments joined by underscores (no trailing newline).
join_with_underscores() {
  local IFS=_
  printf '%s' "$*"
}

# Every "." is replaced by "d" so numeric values can be embedded in names
# (e.g. 0.02 -> 0d02).
eps_str=${epsilon//./d}
lr_str=${learning_rate//./d}
sigma_min_str=${sigma_min//./d}
sigma_max_str=${sigma_max//./d}
rho_str=${rho//./d}

# "debug" or "prod", depending on the unified debug flag.
if [[ "$debug" == true ]]; then
  mode_str="debug"
else
  mode_str="prod"
fi

# "filter" or "nofilter", depending on apply_filter.
if [[ "$apply_filter" == true ]]; then
  filter_str="filter"
else
  filter_str="nofilter"
fi

# BASE model name parts (configuration only)
base_parts=(
  "y${year_start}_${year_end}"
  "norm_${time_normalization}"
  "lat${batch_size_lat}"
  "lon${batch_size_lon}"
  "vars${#varnames_list[@]}"
  "dt${dtype}"
  "arch${arch}"
  "pre${precond}"
  "in${in_channels}"
)
# cond_channels only appears in the name for the edm preconditioner.
if [[ "$precond" == "edm" ]]; then
  base_parts+=("cond${cond_channels}")
fi
base_parts+=("out${out_channels}" "${filter_str}")

# TRAINING model name parts (training-specific)
train_parts=(
  "ep${num_epochs}"
  "sb${sbatch}"
  "tb${tbatch}"
  "eps${eps_str}"
  "mrg${margin}"
  "inf_${inference_type}"
)
# Sampler settings are part of the name only when the sampler is used.
if [[ "$inference_type" == "sampler" ]]; then
  train_parts+=(
    "steps${num_steps}"
    "sigmin${sigma_min_str}"
    "sigmax${sigma_max_str}"
    "rho${rho_str}"
    "s_churn${s_churn}"
    "solver_${solver}"
  )
fi
# Mark runs that start from an existing checkpoint.
case "$run_type" in
  resume_train | inference)
    train_parts+=("resume")
    ;;
esac

# Full model name: mode + base parts + training parts, underscore-joined.
model_parts=("${mode_str}" "${base_parts[@]}" "${train_parts[@]}")
model_name=$(join_with_underscores "${model_parts[@]}")
tag="${model_name}_bs${batch_size}_lr${lr_str}_${TODAY}"

# Hierarchical output layout:
#   main folder = mode + base configuration
#   sub folder  = training parameters + batch size + learning rate
#   prefix      = simple run identifier (timestamp)
main_folder="${mode_str}_$(join_with_underscores "${base_parts[@]}")"
train_parts_str=$(join_with_underscores "${train_parts[@]}")
sub_folder="train_${train_parts_str}_bs${batch_size}_lr${lr_str}"
prefix="run_${TODAY}"
# ======================
# Generate SBATCH Script Only (removed run script)
# ======================
# Absolute directory of this setup script; every generated path is anchored
# here. CDPATH is cleared for the cd so it cannot redirect (or echo) the
# lookup, and a failure aborts instead of leaving PROJECT_ROOT empty.
PROJECT_ROOT="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)" || {
  echo "ERROR: cannot resolve the directory containing $0" >&2
  exit 1
}
sbatch_script="$PROJECT_ROOT/slurm/sbatch_diffusion_${main_folder}_${sub_folder}.sh"
# Create both output directories if they don't exist:
#   slurm/    receives the generated sbatch script
#   slurm_io/ receives the job's .out / .err files
mkdir -p -- "$PROJECT_ROOT/slurm" "$PROJECT_ROOT/slurm_io" || {
  echo "ERROR: cannot create $PROJECT_ROOT/slurm and $PROJECT_ROOT/slurm_io" >&2
  exit 1
}
# ======================
# Generate SBATCH Script
# ======================
# The whole job script is produced by one command group and written through a
# single quoted redirection. That keeps a PROJECT_ROOT containing whitespace
# working and lets a failed write abort setup instead of being reported as
# "Setup Complete" afterwards.
#
# Escaping inside the unquoted here-documents and double-quoted echo strings:
#   $var  -> expanded now, so the value is baked into the generated script
#   \$var -> written as $var, expanded only when the job runs
#   \\    -> a single trailing backslash (continues the ipsl-aid command line)
{
  cat <<EOF
#!/bin/bash
#SBATCH --job-name=diffusion_${tag}
#SBATCH --account=$SLURM_ACCOUNT
#SBATCH --partition=$SLURM_PARTITION
EOF

  # SLURM debug selection: debug runs get the debug QOS and a 30 minute limit,
  # everything else a 24 hour limit and no explicit QOS.
  if [[ "$debug" == true ]]; then
    echo "#SBATCH --qos=boost_qos_dbg"
    echo "#SBATCH --time=00:30:00"
  else
    echo "#SBATCH --time=24:00:00"
  fi

  cat <<EOF
#SBATCH --nodes=$SLURM_NODES
#SBATCH --gpus-per-node=$SLURM_GRES
#SBATCH --mem-per-gpu=64G
#SBATCH --ntasks-per-node=$SLURM_NTASKS_PER_NODE
#SBATCH --cpus-per-task=$SLURM_CPUS_PER_TASK
#SBATCH --error=$PROJECT_ROOT/slurm_io/diffusion_${tag}_%j.err
#SBATCH --output=$PROJECT_ROOT/slurm_io/diffusion_${tag}_%j.out
ulimit -s unlimited
module purge
# Activate virtual environment
source "$PROJECT_ROOT/.venv/bin/activate"
export PYTHONUNBUFFERED=1
echo "===== Job Infos ====="
echo "Debug mode: $debug"
echo "Apply filter: $apply_filter"
echo "Inference type: $inference_type"
echo "Node list: \${SLURM_NODELIST}"
echo "Job ID: \${SLURM_JOB_ID}"
echo "Current dir: \$(pwd)"
echo "Output main folder: $main_folder"
echo "Output sub folder: $sub_folder"
echo "====================="
# Use ipsl-aid command directly
ipsl-aid \\
--debug $debug \\
--main_folder "$main_folder" \\
--sub_folder "$sub_folder" \\
--prefix "$prefix" \\
--run_type "$run_type" \\
--save_model $save_model \\
--save_checkpoint_name "$save_checkpoint_name" \\
--load_checkpoint_name "$load_checkpoint_name" \\
--save_per_samples "$save_per_samples" \\
--inference_type "$inference_type" \\
--arch "$arch" \\
--precond "$precond" \\
--in_channels "$in_channels" \\
--out_channels "$out_channels" \\
--apply_filter $apply_filter \\
EOF

  # Add cond_channels only for EDM
  if [[ "$precond" == "edm" ]]; then
    echo " --cond_channels \"$cond_channels\" \\"
  fi

  # Add optional parameters (skipped when empty)
  if [[ -n "$pretrained_path" ]]; then
    echo " --pretrained_path \"$pretrained_path\" \\"
  fi
  if [[ -n "$model_name" ]]; then
    echo " --model_name \"$model_name\" \\"
  fi

  # Regional only
  if [[ "$run_type" == "train_regional" || "$run_type" == "inference_regional" ]]; then
    if [[ -n "$region" ]]; then
      echo " --region \"$region\" \\"
    fi
    if ((${#region_size[@]} > 0)); then
      echo " --region_size ${region_size[*]} \\"
    fi
  fi

  # Add arrays: each non-empty array becomes one space-separated option line.
  # "${arr[*]}" joins the elements with a single space (default IFS).
  if ((${#varnames_list[@]} > 0)); then
    echo " --varnames_list ${varnames_list[*]} \\"
  fi
  if ((${#normalization_types[@]} > 0)); then
    echo " --normalization_types ${normalization_types[*]} \\"
  fi
  if ((${#constant_varnames_list[@]} > 0)); then
    echo " --constant_varnames_list ${constant_varnames_list[*]} \\"
  fi
  echo " --constant_varnames_file \"$constant_varnames_file\" \\"
  if ((${#units_list[@]} > 0)); then
    echo " --units_list ${units_list[*]} \\"
  fi
  if ((${#dynamic_covariates[@]} > 0)); then
    echo " --dynamic_covariates ${dynamic_covariates[*]} \\"
  fi

  # Remaining options; the last one carries no trailing backslash and so ends
  # the ipsl-aid command. The job then exits with ipsl-aid's status.
  cat <<EOF
--year_start "$year_start" \\
--year_end "$year_end" \\
--year_start_test "$year_start_test" \\
--year_end_test "$year_end_test" \\
--batch_size "$batch_size" \\
--num_epochs "$num_epochs" \\
--learning_rate "$learning_rate" \\
--num_workers "$num_workers" \\
--datadir "$datadir" \\
--per_var_datadir ${per_var_datadir[@]} \\
--time_normalization "$time_normalization" \\
--tbatch "$tbatch" \\
--sbatch "$sbatch" \\
--batch_size_lat "$batch_size_lat" \\
--batch_size_lon "$batch_size_lon" \\
--epsilon "$epsilon" \\
--beta "$beta" \\
--margin "$margin" \\
--dynamic_covariates_dir "$dynamic_covariates_dir" \\
--dtype "$dtype" \\
--num_steps "$num_steps" \\
--sigma_min "$sigma_min" \\
--sigma_max "$sigma_max" \\
--rho "$rho" \\
--s_churn "$s_churn" \\
--solver "$solver" \\
--compute_crps "$compute_crps"
exit \$?
EOF
} > "$sbatch_script" || {
  echo "ERROR: could not write SBATCH script: $sbatch_script" >&2
  exit 1
}
chmod +x -- "$sbatch_script" || {
  echo "ERROR: could not make SBATCH script executable: $sbatch_script" >&2
  exit 1
}
# ======================
# Final Output
# ======================
# Human-readable summary of what was configured and where the job script was
# written. Each printf argument becomes one output line.
printf '%s\n' \
  "=== Setup Complete ===" \
  "Debug mode: $debug" \
  "Apply filter: $apply_filter" \
  "Inference type: $inference_type" \
  "Run type: $run_type" \
  "Save model: $save_model" \
  "Load checkpoint: $load_checkpoint_name"

# Runs that start from an existing checkpoint get one extra line.
case "$run_type" in
  resume_train | inference)
    printf '%s\n' "Checkpoint file: $load_checkpoint_name"
    ;;
esac

printf '%s\n' \
  "" \
  "Model name parts: ${model_parts[*]}" \
  "Model name: $model_name" \
  "Tag: $tag" \
  "" \
  "Main folder: $main_folder" \
  "Sub folder: $sub_folder" \
  "Prefix: $prefix" \
  "" \
  "SBATCH script: $sbatch_script" \
  "" \
  "To submit the job:" \
  " sbatch $sbatch_script" \
  "" \
  "To test locally (small run):" \
  " source .venv/bin/activate" \
  " ipsl-aid --debug true --run_type train --num_epochs 1 --batch_size 2"