Skip to content

Commit f5878cf

Browse files
committed
Merge branch 'release-v0.93'
Release Notes: v0.93 This release contains a major refactoring / overhaul of the code base. Key highlights include: - Moving layer design into smaller simpler layers that have a single compute behavior per layer. Specifically, linear combination of the inputs, non-linear activations, and regularizers now exist as their own layers. - Layers now have a template parameter that specifies the data layout for the distributed matrices. - Prototext interface for specifying neural network models and data readers is nearly fully functional. - Code now adheres to internal coding style as outlined in README_coding_style.txt - Dead-code has been eliminated and layer file hierarchy has been cleaned up.
2 parents 16b504b + 59bc798 commit f5878cf

283 files changed

Lines changed: 24712 additions & 25389 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ language: cpp
88
# Blacklist
99
branches:
1010
except:
11+
- release-v0.93
1112
- develop
1213
- gh-pages
1314

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ endif()
4343
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
4444

4545
# Initialize C++ flags
46-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -std=c++11")
46+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -std=c++11 -g -Wall -Wextra -Wno-unused-parameter -Wnon-virtual-dtor -Wshadow")
4747

4848
# Disable all optimization in debug for better viewing under debuggers (cmake already adds -g)
4949
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -DLBANN_DEBUG")

cmake/Elemental.cmake

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ else()
6262
if(ELEMENTAL_USE_CUBLAS)
6363
set(EL_CUBLAS_FLAGS "-DEL_USE_CUBLAS -I${CUDA_INCLUDE_DIRS} -I${CUB_SOURCE_DIR}")
6464
#set(EL_CUBLAS_FLAGS "${CMAKE_CXX_FLAGS} ${EL_CUBLAS_FLAGS}")
65-
set(EL_CUBLAS_LINK "${CMAKE_EXE_LINKER_FLAGS} -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 -lcublas -lcudart")
65+
set(EL_CUBLAS_EXE_LINK "${CMAKE_EXE_LINKER_FLAGS} -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 -lcublas -lcudart")
66+
set(EL_CUBLAS_SHARED_LINK "${CMAKE_SHARED_LINKER_FLAGS} -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 -lcublas -lcudart")
6667
endif()
6768

6869
# patch file
@@ -83,7 +84,7 @@ else()
8384
GIT_REPOSITORY ${ELEMENTAL_URL}
8485
GIT_TAG ${ELEMENTAL_TAG}
8586
#--Update/Patch step----------
86-
PATCH_COMMAND patch -d ${ELEMENTAL_SOURCE_DIR} -p 1 < ${PROJECT_SOURCE_DIR}/external/Elemental/elemental_cublas.patch
87+
PATCH_COMMAND patch -N -s -d ${ELEMENTAL_SOURCE_DIR} -p 1 < ${PROJECT_SOURCE_DIR}/external/Elemental/elemental_cublas.patch
8788
#--Configure step-------------
8889
SOURCE_DIR ${ELEMENTAL_SOURCE_DIR}
8990
BINARY_DIR ${ELEMENTAL_BINARY_DIR}
@@ -115,8 +116,11 @@ else()
115116
-D CMAKE_INSTALL_RPATH_USE_LINK_PATH=${CMAKE_INSTALL_RPATH_USE_LINK_PATH}
116117
-D CMAKE_INSTALL_RPATH=${CMAKE_INSTALL_RPATH}
117118
-D CMAKE_MACOSX_RPATH=${CMAKE_MACOSX_RPATH}
118-
-D CMAKE_CXX_FLAGS=${EL_CUBLAS_FLAGS}
119-
-D CMAKE_EXE_LINKER_FLAGS=${EL_CUBLAS_LINK}
119+
-D CMAKE_C_FLAGS=${CMAKE_C_FLAGS}
120+
-D CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} ${EL_CUBLAS_FLAGS}
121+
-D CMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}
122+
-D CMAKE_EXE_LINKER_FLAGS=${EL_CUBLAS_EXE_LINK}
123+
-D CMAKE_SHARED_LINKER_FLAGS=${EL_CUBLAS_SHARED_LINK}
120124
-D PATCH_DIR=${PATCH_DIR}
121125
)
122126

cmake/OpenCV.cmake

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,9 @@ else()
219219
-D CMAKE_C_COMPILER=${CMAKE_C_COMPILER}
220220
-D CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
221221
-D CMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}
222+
-D CMAKE_C_FLAGS=${CMAKE_C_FLAGS}
223+
-D CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
224+
-D CMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}
222225
-D CMAKE_SKIP_BUILD_RPATH=${CMAKE_SKIP_BUILD_RPATH}
223226
-D CMAKE_BUILD_WITH_INSTALL_RPATH=${CMAKE_BUILD_WITH_INSTALL_RPATH}
224227
-D CMAKE_INSTALL_RPATH_USE_LINK_PATH=${CMAKE_INSTALL_RPATH_USE_LINK_PATH}
Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
#!/bin/bash
# NOTE: this script uses bash-only constructs further down (arrays,
# `function`, `source`, `echo -e`), so it must be run by bash rather than
# by whatever shell /bin/sh happens to be.

# Directory this script was invoked from; "$0" is quoted so that a path
# containing whitespace is passed to dirname as a single argument.
DIRNAME=$(dirname "$0")
# Script name, used in the help output.
SCRIPT=$(basename "$0")

# Figure out which cluster we are on: take the hostname and drop the digits
# that follow each run of letters.
CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g')
# Look for the binary in the cluster specific build directory
BINDIR="${DIRNAME}/../build/${CLUSTER}.llnl.gov/model_zoo"
11+
12+
#Initialize variables to default values.
13+
TRAINING_SAMPLES=-1
14+
VALIDATION_SAMPLES=-1
15+
EPOCHS=20
16+
17+
NETWORK="1000"
18+
19+
PARIO=0
20+
BLOCK_SIZE=256
21+
MODE="false"
22+
MB_SIZE=200
23+
LR=0.01
24+
ACT=3
25+
LRM=1
26+
TEST_W_TRAIN_DATA=0
27+
LR_DECAY=0.5
28+
29+
RUN="srun"
30+
31+
ROOT_DATASET_DIR="/l/ssd"
32+
DATASET_DIR="datasets/ILSVRC2012"
33+
OUTPUT_DIR="/l/ssd/lbann/outputs"
34+
PARAM_DIR="/l/ssd/lbann/models"
35+
SAVE_MODEL=false
36+
LOAD_MODEL=false
37+
CKPT=10
38+
39+
# need this in an mxterm
40+
export SLURM_NNODES=$SLURM_JOB_NUM_NODES
41+
42+
TASKS_PER_NODE=1
43+
NNODES=${SLURM_NNODES}
44+
45+
if [ "${CLUSTER}" = "catalyst" ]; then
46+
LUSTRE_FILEPATH="/p/lscratchf/brainusr"
47+
ENABLE_HT=
48+
CORES_PER_NODE=48
49+
elif [ "${CLUSTER}" = "sierra" ]; then
50+
LUSTRE_FILEPATH="/p/lscratche/brainusr"
51+
#ENABLE_HT=--enable-hyperthreads
52+
#CORES_PER_NODE=24
53+
ENABLE_HT=
54+
CORES_PER_NODE=12
55+
else
56+
LUSTRE_FILEPATH="/p/lscratche/brainusr"
57+
ENABLE_HT=
58+
CORES_PER_NODE=12
59+
fi
60+
61+
USE_LUSTRE_DIRECT=1
62+
SHUFFLE_TRAINING=1
63+
64+
#Set fonts for Help.
65+
NORM=`tput sgr0`
66+
BOLD=`tput bold`
67+
REV=`tput smso`
68+
69+
#Help function
70+
# HELP - print the usage summary for this script and terminate.
#
# Reads the current values of the option variables (ACT, MB_SIZE, EPOCHS, ...)
# so that each switch is shown together with the value currently assigned to
# it, and uses the NORM/BOLD/REV terminal sequences for highlighting.
# Takes no arguments. Never returns: it always ends the script with status 1.
#
# The definition uses the portable `name() { ...; }` form, and printf replaces
# `echo -e`, because `echo -e` prints a literal "-e" in shells whose echo does
# not implement that option.
HELP() {
  printf '\n%s\n\n' "Help documentation for ${BOLD}${SCRIPT}.${NORM}"
  printf '%s\n\n' "${REV}Basic usage:${NORM} ${BOLD}$SCRIPT -t <training set size> -e <epochs> -v <validation set size>${NORM}"
  echo "Command line switches are optional. The following switches are recognized."
  echo "${REV}-a${NORM} <val> --Sets the ${BOLD}activation type${NORM}. Default is ${BOLD}${ACT}${NORM}."
  echo "${REV}-b${NORM} <val> --Sets the ${BOLD}mini-batch size${NORM}. Default is ${BOLD}${MB_SIZE}${NORM}."
  echo "${REV}-c${NORM} --(CHEAT) Test / validate with the ${BOLD}training data${NORM}. Default is ${BOLD}${TEST_W_TRAIN_DATA}${NORM}."
  echo "${REV}-d${NORM} --Sets the ${BOLD}debug mode${NORM}."
  echo "${REV}-e${NORM} <val> --Sets the ${BOLD}number of epochs${NORM}. Default is ${BOLD}${EPOCHS}${NORM}."
  echo "${REV}-f${NORM} <val> --Path to the ${BOLD}datasets${NORM}. Default is ${BOLD}${ROOT_DATASET_DIR}${NORM}."
  echo "${REV}-i${NORM} <val> --Sets the ${BOLD}parallel I/O limit${NORM}. Default is ${BOLD}${PARIO}${NORM}."
  echo "${REV}-j${NORM} <val> --Sets the ${BOLD}learning rate decay${NORM}. Default is ${BOLD}${LR_DECAY}${NORM}."
  echo "${REV}-k${NORM} <val> --Checkpoint after every ${BOLD}N${NORM} epochs. Default is ${BOLD}${CKPT}${NORM}."
  echo "${REV}-l${NORM} <val> --Determines if the model is ${BOLD}loaded${NORM}. Default is ${BOLD}${LOAD_MODEL}${NORM}."
  echo "${REV}-m${NORM} <val> --Sets the ${BOLD}mode${NORM}. Default is ${BOLD}${MODE}${NORM}."
  echo "${REV}-n${NORM} <val> --Sets the ${BOLD}network topology${NORM}. Default is ${BOLD}${NETWORK}${NORM}."
  echo "${REV}-o${NORM} <val> --Sets the ${BOLD}output directory${NORM}. Default is ${BOLD}${OUTPUT_DIR}${NORM}."
  echo "${REV}-p${NORM} <val> --Sets the ${BOLD}input parameter directory${NORM}. Default is ${BOLD}${PARAM_DIR}${NORM}."
  echo "${REV}-q${NORM} <val> --Sets the ${BOLD}learning rate method${NORM}. Default is ${BOLD}${LRM}${NORM}."
  # Help text typo fixed here: "inital" -> "initial".
  echo "${REV}-r${NORM} <val> --Sets the ${BOLD}initial learning rate${NORM}. Default is ${BOLD}${LR}${NORM}."
  echo "${REV}-s${NORM} <val> --Determines if the model is ${BOLD}saved${NORM}. Default is ${BOLD}${SAVE_MODEL}${NORM}."
  echo "${REV}-t${NORM} <val> --Sets the number of ${BOLD}training samples${NORM}. Default is ${BOLD}${TRAINING_SAMPLES}${NORM}."
  echo "${REV}-u${NORM} --Use the ${BOLD}Lustre filesystem${NORM} directly. Default is ${BOLD}${USE_LUSTRE_DIRECT}${NORM}."
  echo "${REV}-v${NORM} <val> --Sets the number of ${BOLD}validation samples${NORM}. Default is ${BOLD}${VALIDATION_SAMPLES}${NORM}."
  echo "${REV}-w${NORM} <val> -- ${BOLD}Order N${NORM} or ${BOLD}Pick N${NORM} training samples. Default is ${BOLD}${SHUFFLE_TRAINING}${NORM}."
  echo "${REV}-x${NORM} <val> --Sets the ${BOLD}lib Elemental block size${NORM}. Default is ${BOLD}${BLOCK_SIZE}${NORM}."
  echo "${REV}-y${NORM} <val> --Sets the ${BOLD}number of nodes allowed in the allocation${NORM}. Default is ${BOLD}${SLURM_NNODES}${NORM}."
  echo "${REV}-z${NORM} <val> --Sets the ${BOLD}tasks per node${NORM}. Default is ${BOLD}${TASKS_PER_NODE}${NORM}."
  printf '%s\n\n' "${REV}-h${NORM} --Displays this help message. No further functions are performed."
  # Exit status is kept at 1, as in the original.
  exit 1
}
101+
102+
# Parse the command-line switches, overriding the defaults assigned above.
# The leading ':' in the option string puts getopts into silent mode, so
# unknown switches and missing arguments are reported by the '\?' and ':'
# arms below instead of by getopts itself.
while getopts ":a:b:cde:f:hi:j:k:l:m:n:o:p:q:r:s:t:uv:w:x:y:z:" opt; do
  case "${opt}" in
    a) ACT=${OPTARG} ;;
    b) MB_SIZE=${OPTARG} ;;
    c) TEST_W_TRAIN_DATA=1 ;;
    d) RUN="totalview srun -a" ;;   # launch under the debugger
    e) EPOCHS=${OPTARG} ;;
    f) ROOT_DATASET_DIR=${OPTARG} ;;
    h)
      HELP
      exit 1
      ;;
    i) PARIO=${OPTARG} ;;
    j) LR_DECAY=${OPTARG} ;;
    k) CKPT=${OPTARG} ;;
    l) LOAD_MODEL=${OPTARG} ;;
    m) MODE=${OPTARG} ;;
    n) NETWORK=${OPTARG} ;;
    o) OUTPUT_DIR=${OPTARG} ;;
    p) PARAM_DIR=${OPTARG} ;;
    q) LRM=${OPTARG} ;;
    r) LR=${OPTARG} ;;
    s) SAVE_MODEL=${OPTARG} ;;
    t) TRAINING_SAMPLES=${OPTARG} ;;
    u) USE_LUSTRE_DIRECT=1 ;;
    v) VALIDATION_SAMPLES=${OPTARG} ;;
    w) SHUFFLE_TRAINING=${OPTARG} ;;
    x) BLOCK_SIZE=${OPTARG} ;;
    y) NNODES=${OPTARG} ;;
    z) TASKS_PER_NODE=${OPTARG} ;;
    \?)
      # Switch not listed in the option string.
      echo "Invalid option: -$OPTARG" >&2
      exit 1
      ;;
    :)
      # Switch that takes a value was given without one.
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
  esac
done
190+
191+
shift $((OPTIND-1))
192+
# now do something with $@
193+
194+
# Once all of the options are parsed, you can setup the environment
195+
#source ${DIRNAME}/setup_brain_lbann_env.sh -m mvapich2 -v 0.86
196+
#source ${DIRNAME}/setup_brain_lbann_env.sh -m debug_mvapich2 -v 0.86
197+
#source ${DIRNAME}/setup_brain_lbann_env.sh -m openmpi -v 0.86
198+
#source ${DIRNAME}/setup_brain_lbann_env.sh -m debug_openmpi -v 0.86
199+
source ${DIRNAME}/setup_brain_lbann_env.sh -m mvapich2 -v El_0.86/v86-6ec56a
200+
201+
TASKS=$((${SLURM_NNODES} * ${SLURM_CPUS_ON_NODE}))
202+
if [ ${TASKS} -gt 384 ]; then
203+
TASKS=384
204+
fi
205+
LBANN_TASKS=$((${NNODES} * ${TASKS_PER_NODE}))
206+
207+
export PATH=/collab/usr/global/tools/stat/file_bcast/${SYS_TYPE}/fbcast:${PATH}
208+
209+
if [ ${USE_LUSTRE_DIRECT} -eq 1 ]; then
210+
211+
ROOT_DATASET_DIR=${LUSTRE_FILEPATH}
212+
213+
else
214+
215+
FILES=(labels.tar resized_256x256/train.tar resized_256x256/val.tar resized_256x256/test.tar)
216+
for tarball in "${FILES[@]}"
217+
do
218+
FILE=`basename $tarball`
219+
if [ ! -e ${ROOT_DATASET_DIR}/${FILE} ]; then
220+
# CMD="pdcp /p/lscratchf/brainusr/datasets/ILSVRC2012/${tarball} /l/ssd/"
221+
CMD="srun -n${TASKS} -N${SLURM_NNODES} file_bcast_par13 1MB ${LUSTRE_FILEPATH}/${DATASET_DIR}/${tarball} ${ROOT_DATASET_DIR}/${FILE}"
222+
echo "${CMD}"
223+
${CMD}
224+
fi
225+
done
226+
227+
if [ ! -d ${ROOT_DATASET_DIR}/${DATASET_DIR}/resized_256x256 ]; then
228+
CMD="pdsh mkdir -p ${ROOT_DATASET_DIR}/${DATASET_DIR}/resized_256x256"
229+
echo "${CMD}"
230+
${CMD}
231+
fi
232+
233+
FILES=(labels)
234+
for tarball in "${FILES[@]}"
235+
do
236+
if [ ! -e ${ROOT_DATASET_DIR}/${DATASET_DIR}/${tarball} ]; then
237+
CMD="pdsh /usr/bin/time tar xf ${ROOT_DATASET_DIR}/${tarball}.tar -C ${ROOT_DATASET_DIR}/${DATASET_DIR}/"
238+
echo "${CMD}"
239+
${CMD}
240+
fi
241+
done
242+
243+
FILES=(train val test)
244+
for tarball in "${FILES[@]}"
245+
do
246+
if [ ! -e ${ROOT_DATASET_DIR}/${DATASET_DIR}/resized_256x256/${tarball} ]; then
247+
CMD="pdsh /usr/bin/time tar xf ${ROOT_DATASET_DIR}/${tarball}.tar -C ${ROOT_DATASET_DIR}/${DATASET_DIR}/resized_256x256/"
248+
echo "${CMD}"
249+
${CMD}
250+
fi
251+
done
252+
253+
if [ ! -d ${PARAM_DIR} ]; then
254+
CMD="mkdir -p ${PARAM_DIR}"
255+
echo ${CMD}
256+
${CMD}
257+
fi
258+
259+
if [ ! -d ${OUTPUT_DIR} ]; then
260+
CMD="mkdir -p ${OUTPUT_DIR}"
261+
echo ${CMD}
262+
${CMD}
263+
fi
264+
265+
fi
266+
267+
#NNODES=4
268+
#LBANN_TASKS=4
269+
CMD="${RUN} -N${NNODES} -n${LBANN_TASKS} ${ENABLE_HT} --ntasks-per-node=${TASKS_PER_NODE} --distribution=block --drop-caches=pagecache ${BINDIR}/lbann_multi_alexnet --hostname ${CLUSTER} --num-nodes ${NNODES} --num-cores $((${NNODES}*${CORES_PER_NODE})) --tasks-per-node ${TASKS_PER_NODE} --par-IO ${PARIO} --dataset ${ROOT_DATASET_DIR}/${DATASET_DIR}/ --max-validation-samples ${VALIDATION_SAMPLES} --profiling true --max-training-samples ${TRAINING_SAMPLES} --block-size ${BLOCK_SIZE} --output ${OUTPUT_DIR} --mode ${MODE} --num-epochs ${EPOCHS} --params ${PARAM_DIR} --save-model ${SAVE_MODEL} --load-model ${LOAD_MODEL} --mb-size ${MB_SIZE} --learning-rate ${LR} --activation-type ${ACT} --network ${NETWORK} --learning-rate-method ${LRM} --test-with-train-data ${TEST_W_TRAIN_DATA} --checkpoint ${CKPT} --lr-decay-rate ${LR_DECAY} --random-training-samples ${SHUFFLE_TRAINING} --num-classes 10 --z-score 1 --procs-per-model 4"
270+
echo ${CMD}
271+
${CMD}

0 commit comments

Comments
 (0)