#!/bin/bash
# NOTE: bash is required. Later parts of this script use arrays, the
# `function` keyword, `source` and `echo -e`, none of which are guaranteed
# by a plain POSIX /bin/sh.

# Directory holding this script, and the script's own file name.
# "$0" is quoted so that an install path containing spaces still works.
DIRNAME=$(dirname "$0")
SCRIPT=$(basename "$0")

# Figure out which cluster we are on: the host name with the digits that
# follow its letters removed.
CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g')
# Look for the binary in the cluster specific build directory
BINDIR="${DIRNAME}/../build/${CLUSTER}.llnl.gov/model_zoo"

# Initialize variables to default values. The option letter that overrides
# each value on the command line is given in the trailing comment.
TRAINING_SAMPLES=-1      # -t: number of training samples
VALIDATION_SAMPLES=-1    # -v: number of validation samples
EPOCHS=20                # -e: number of epochs

NETWORK="1000"           # -n: network topology

PARIO=0                  # -i: parallel I/O limit
BLOCK_SIZE=256           # -x: lib Elemental block size
MODE="false"             # -m: mode
MB_SIZE=200              # -b: mini-batch size
LR=0.01                  # -r: initial learning rate
ACT=3                    # -a: activation type
LRM=1                    # -q: learning rate method
TEST_W_TRAIN_DATA=0      # -c: test / validate with the training data
LR_DECAY=0.5             # -j: learning rate decay

RUN="srun"               # launcher; -d replaces it with "totalview srun -a"

ROOT_DATASET_DIR="/l/ssd"              # -f: path to the datasets
DATASET_DIR="datasets/ILSVRC2012"      # appended to ROOT_DATASET_DIR
OUTPUT_DIR="/l/ssd/lbann/outputs"      # -o: output directory
PARAM_DIR="/l/ssd/lbann/models"        # -p: input parameter directory
SAVE_MODEL=false         # -s: whether the model is saved
LOAD_MODEL=false         # -l: whether the model is loaded
CKPT=10                  # -k: checkpoint after every N epochs

# need this in an mxterm
export SLURM_NNODES=$SLURM_JOB_NUM_NODES

TASKS_PER_NODE=1         # -z: tasks per node
NNODES=${SLURM_NNODES}   # -y: number of nodes allowed in the allocation

# Per-cluster settings: Lustre scratch location, optional hyperthreading
# flag for the launcher, and the core count used for --num-cores.
case "${CLUSTER}" in
  catalyst)
    LUSTRE_FILEPATH="/p/lscratchf/brainusr"
    ENABLE_HT=
    CORES_PER_NODE=48
    ;;
  sierra)
    LUSTRE_FILEPATH="/p/lscratche/brainusr"
    # Hyperthreaded alternative, kept for reference:
    #ENABLE_HT=--enable-hyperthreads
    #CORES_PER_NODE=24
    ENABLE_HT=
    CORES_PER_NODE=12
    ;;
  *)
    # Any other host falls back to these values.
    LUSTRE_FILEPATH="/p/lscratche/brainusr"
    ENABLE_HT=
    CORES_PER_NODE=12
    ;;
esac

USE_LUSTRE_DIRECT=1   # -u: use the Lustre filesystem directly
SHUFFLE_TRAINING=1    # -w: order N or pick N training samples

# Terminal attribute sequences used to decorate the help text.
NORM=$(tput sgr0)
BOLD=$(tput bold)
REV=$(tput smso)

# HELP: print the usage summary with the current default of every switch,
# then terminate the script with exit status 1.
# Reads: SCRIPT, the NORM/BOLD/REV terminal sequences, and the option
# variables whose values are shown as defaults.
HELP() {
  echo -e "\nHelp documentation for ${BOLD}${SCRIPT}.${NORM}\n"
  echo -e "${REV}Basic usage:${NORM} ${BOLD}$SCRIPT -t <training set size> -e <epochs> -v <validation set size>${NORM}\n"
  # One line per switch; the here-document expands the variables in place.
  cat <<EOF
Command line switches are optional. The following switches are recognized.
${REV}-a${NORM} <val> --Sets the ${BOLD}activation type${NORM}. Default is ${BOLD}${ACT}${NORM}.
${REV}-b${NORM} <val> --Sets the ${BOLD}mini-batch size${NORM}. Default is ${BOLD}${MB_SIZE}${NORM}.
${REV}-c${NORM} --(CHEAT) Test / validate with the ${BOLD}training data${NORM}. Default is ${BOLD}${TEST_W_TRAIN_DATA}${NORM}.
${REV}-d${NORM} --Sets the ${BOLD}debug mode${NORM}.
${REV}-e${NORM} <val> --Sets the ${BOLD}number of epochs${NORM}. Default is ${BOLD}${EPOCHS}${NORM}.
${REV}-f${NORM} <val> --Path to the ${BOLD}datasets${NORM}. Default is ${BOLD}${ROOT_DATASET_DIR}${NORM}.
${REV}-i${NORM} <val> --Sets the ${BOLD}parallel I/O limit${NORM}. Default is ${BOLD}${PARIO}${NORM}.
${REV}-j${NORM} <val> --Sets the ${BOLD}learning rate decay${NORM}. Default is ${BOLD}${LR_DECAY}${NORM}.
${REV}-k${NORM} <val> --Checkpoint after every ${BOLD}N${NORM} epochs. Default is ${BOLD}${CKPT}${NORM}.
${REV}-l${NORM} <val> --Determines if the model is ${BOLD}loaded${NORM}. Default is ${BOLD}${LOAD_MODEL}${NORM}.
${REV}-m${NORM} <val> --Sets the ${BOLD}mode${NORM}. Default is ${BOLD}${MODE}${NORM}.
${REV}-n${NORM} <val> --Sets the ${BOLD}network topology${NORM}. Default is ${BOLD}${NETWORK}${NORM}.
${REV}-o${NORM} <val> --Sets the ${BOLD}output directory${NORM}. Default is ${BOLD}${OUTPUT_DIR}${NORM}.
${REV}-p${NORM} <val> --Sets the ${BOLD}input parameter directory${NORM}. Default is ${BOLD}${PARAM_DIR}${NORM}.
${REV}-q${NORM} <val> --Sets the ${BOLD}learning rate method${NORM}. Default is ${BOLD}${LRM}${NORM}.
${REV}-r${NORM} <val> --Sets the ${BOLD}inital learning rate${NORM}. Default is ${BOLD}${LR}${NORM}.
${REV}-s${NORM} <val> --Determines if the model is ${BOLD}saved${NORM}. Default is ${BOLD}${SAVE_MODEL}${NORM}.
${REV}-t${NORM} <val> --Sets the number of ${BOLD}training samples${NORM}. Default is ${BOLD}${TRAINING_SAMPLES}${NORM}.
${REV}-u${NORM} --Use the ${BOLD}Lustre filesystem${NORM} directly. Default is ${BOLD}${USE_LUSTRE_DIRECT}${NORM}.
${REV}-v${NORM} <val> --Sets the number of ${BOLD}validation samples${NORM}. Default is ${BOLD}${VALIDATION_SAMPLES}${NORM}.
${REV}-w${NORM} <val> -- ${BOLD}Order N${NORM} or ${BOLD}Pick N${NORM} training samples. Default is ${BOLD}${SHUFFLE_TRAINING}${NORM}.
${REV}-x${NORM} <val> --Sets the ${BOLD}lib Elemental block size${NORM}. Default is ${BOLD}${BLOCK_SIZE}${NORM}.
${REV}-y${NORM} <val> --Sets the ${BOLD}number of nodes allowed in the allocation${NORM}. Default is ${BOLD}${SLURM_NNODES}${NORM}.
${REV}-z${NORM} <val> --Sets the ${BOLD}tasks per node${NORM}. Default is ${BOLD}${TASKS_PER_NODE}${NORM}.
EOF
  echo -e "${REV}-h${NORM} --Displays this help message. No further functions are performed.\n"
  exit 1
}

# Parse the command-line switches. The leading ':' in the option string puts
# getopts in silent mode, so unknown switches and missing arguments are
# reported by the '\?' and ':' arms below instead of by getopts itself.
while getopts ":a:b:cde:f:hi:j:k:l:m:n:o:p:q:r:s:t:uv:w:x:y:z:" opt; do
  case $opt in
    a) ACT=$OPTARG ;;
    b) MB_SIZE=$OPTARG ;;
    c) TEST_W_TRAIN_DATA=1 ;;
    d) RUN="totalview srun -a" ;;      # run the job under the debugger
    e) EPOCHS=$OPTARG ;;
    f) ROOT_DATASET_DIR=$OPTARG ;;
    h) HELP
       exit 1 ;;
    i) PARIO=$OPTARG ;;
    j) LR_DECAY=$OPTARG ;;
    k) CKPT=$OPTARG ;;
    l) LOAD_MODEL=$OPTARG ;;
    m) MODE=$OPTARG ;;
    n) NETWORK=$OPTARG ;;
    o) OUTPUT_DIR=$OPTARG ;;
    p) PARAM_DIR=$OPTARG ;;
    q) LRM=$OPTARG ;;
    r) LR=$OPTARG ;;
    s) SAVE_MODEL=$OPTARG ;;
    t) TRAINING_SAMPLES=$OPTARG ;;
    u) USE_LUSTRE_DIRECT=1 ;;
    v) VALIDATION_SAMPLES=$OPTARG ;;
    w) SHUFFLE_TRAINING=$OPTARG ;;
    x) BLOCK_SIZE=$OPTARG ;;
    y) NNODES=$OPTARG ;;
    z) TASKS_PER_NODE=$OPTARG ;;
    \?) echo "Invalid option: -$OPTARG" >&2
        exit 1 ;;
    :) echo "Option -$OPTARG requires an argument." >&2
       exit 1 ;;
  esac
done

# Drop the parsed switches; anything left in "$@" is a positional argument.
shift $(( OPTIND - 1 ))
# now do something with $@

# Once all of the options are parsed, you can setup the environment.
# Alternative environments, kept for reference:
#source ${DIRNAME}/setup_brain_lbann_env.sh -m mvapich2 -v 0.86
#source ${DIRNAME}/setup_brain_lbann_env.sh -m debug_mvapich2 -v 0.86
#source ${DIRNAME}/setup_brain_lbann_env.sh -m openmpi -v 0.86
#source ${DIRNAME}/setup_brain_lbann_env.sh -m debug_openmpi -v 0.86
# The path is quoted so a script directory containing spaces still resolves.
source "${DIRNAME}/setup_brain_lbann_env.sh" -m mvapich2 -v El_0.86/v86-6ec56a

# TASKS is the task count used for the file broadcast: nodes * CPUs per node,
# capped at 384. The ':-0' defaults keep the arithmetic well-formed when the
# SLURM variables are not set; the bare ${VAR} form expanded to "$(( * ))",
# which is a syntax error.
TASKS=$(( ${SLURM_NNODES:-0} * ${SLURM_CPUS_ON_NODE:-0} ))
if [ "${TASKS}" -gt 384 ]; then
  TASKS=384
fi

# LBANN_TASKS is the total task count handed to the launcher.
if [ -z "${NNODES}" ]; then
  echo "Warning: node count is not set (no SLURM allocation and no -y option)." >&2
fi
LBANN_TASKS=$(( ${NNODES:-0} * ${TASKS_PER_NODE:-0} ))

# Make the file broadcast tool (file_bcast_par13) reachable through PATH.
export PATH=/collab/usr/global/tools/stat/file_bcast/${SYS_TYPE}/fbcast:${PATH}

# Choose where the dataset is read from. With USE_LUSTRE_DIRECT equal to 1 the
# Lustre path is used as the dataset root. Otherwise the tarballs are
# broadcast into ROOT_DATASET_DIR and unpacked there, and the parameter and
# output directories are created if missing.
if [ ${USE_LUSTRE_DIRECT} -eq 1 ]; then

  ROOT_DATASET_DIR=${LUSTRE_FILEPATH}

else

  # 1) Broadcast every tarball that is not already present in the root.
  FILES=(labels.tar resized_256x256/train.tar resized_256x256/val.tar resized_256x256/test.tar)
  for tarball in "${FILES[@]}"; do
    FILE=${tarball##*/}   # file name without its leading directories
    [ ! -e ${ROOT_DATASET_DIR}/${FILE} ] || continue
    # CMD="pdcp /p/lscratchf/brainusr/datasets/ILSVRC2012/${tarball} /l/ssd/"
    CMD="srun -n${TASKS} -N${SLURM_NNODES} file_bcast_par13 1MB ${LUSTRE_FILEPATH}/${DATASET_DIR}/${tarball} ${ROOT_DATASET_DIR}/${FILE}"
    echo "${CMD}"
    ${CMD}
  done

  # 2) Create the directory that receives the unpacked image sets.
  if [ ! -d ${ROOT_DATASET_DIR}/${DATASET_DIR}/resized_256x256 ]; then
    CMD="pdsh mkdir -p ${ROOT_DATASET_DIR}/${DATASET_DIR}/resized_256x256"
    echo "${CMD}"
    ${CMD}
  fi

  # 3) Unpack the labels next to the image directory.
  FILES=(labels)
  for tarball in "${FILES[@]}"; do
    [ ! -e ${ROOT_DATASET_DIR}/${DATASET_DIR}/${tarball} ] || continue
    CMD="pdsh /usr/bin/time tar xf ${ROOT_DATASET_DIR}/${tarball}.tar -C ${ROOT_DATASET_DIR}/${DATASET_DIR}/"
    echo "${CMD}"
    ${CMD}
  done

  # 4) Unpack the train / val / test image sets.
  FILES=(train val test)
  for tarball in "${FILES[@]}"; do
    [ ! -e ${ROOT_DATASET_DIR}/${DATASET_DIR}/resized_256x256/${tarball} ] || continue
    CMD="pdsh /usr/bin/time tar xf ${ROOT_DATASET_DIR}/${tarball}.tar -C ${ROOT_DATASET_DIR}/${DATASET_DIR}/resized_256x256/"
    echo "${CMD}"
    ${CMD}
  done

  # 5) Make sure the parameter and output directories exist.
  if [ ! -d ${PARAM_DIR} ]; then
    CMD="mkdir -p ${PARAM_DIR}"
    echo ${CMD}
    ${CMD}
  fi

  if [ ! -d ${OUTPUT_DIR} ]; then
    CMD="mkdir -p ${OUTPUT_DIR}"
    echo ${CMD}
    ${CMD}
  fi

fi

#NNODES=4
#LBANN_TASKS=4
# Build the launch command as an argv array rather than one flat string, so
# that values containing whitespace or glob characters (for example an output
# directory given with -o) reach the program as single, unmodified arguments.
# RUN and ENABLE_HT are expanded unquoted on purpose: RUN may hold several
# words ("totalview srun -a") and ENABLE_HT may be empty, in which case it
# must contribute no argument at all.
# shellcheck disable=SC2206
LAUNCH_ARGV=(
  ${RUN}
  "-N${NNODES}" "-n${LBANN_TASKS}"
  ${ENABLE_HT}
  "--ntasks-per-node=${TASKS_PER_NODE}"
  --distribution=block
  --drop-caches=pagecache
  "${BINDIR}/lbann_multi_alexnet"
  --hostname "${CLUSTER}"
  --num-nodes "${NNODES}"
  --num-cores "$((${NNODES}*${CORES_PER_NODE}))"
  --tasks-per-node "${TASKS_PER_NODE}"
  --par-IO "${PARIO}"
  --dataset "${ROOT_DATASET_DIR}/${DATASET_DIR}/"
  --max-validation-samples "${VALIDATION_SAMPLES}"
  --profiling true
  --max-training-samples "${TRAINING_SAMPLES}"
  --block-size "${BLOCK_SIZE}"
  --output "${OUTPUT_DIR}"
  --mode "${MODE}"
  --num-epochs "${EPOCHS}"
  --params "${PARAM_DIR}"
  --save-model "${SAVE_MODEL}"
  --load-model "${LOAD_MODEL}"
  --mb-size "${MB_SIZE}"
  --learning-rate "${LR}"
  --activation-type "${ACT}"
  --network "${NETWORK}"
  --learning-rate-method "${LRM}"
  --test-with-train-data "${TEST_W_TRAIN_DATA}"
  --checkpoint "${CKPT}"
  --lr-decay-rate "${LR_DECAY}"
  --random-training-samples "${SHUFFLE_TRAINING}"
  --num-classes 10
  --z-score 1
  --procs-per-model 4
)
# Keep CMD populated with the space-joined command and log it before running.
CMD="${LAUNCH_ARGV[*]}"
echo "${CMD}"
# Last command of the script: its exit status becomes the script's status.
"${LAUNCH_ARGV[@]}"
0 commit comments