mlcommons · ShriyaRishab · Aug 8, 2025 · Apr 14, 2025 · Apr 25, 2025 · May 5, 2025
@@ -0,0 +1 @@
+mlcube/workspace/
@@ -0,0 +1,33 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Base image is parameterized so a locally cached or newer NeMo container can be
+# substituted at build time with --build-arg NEMO_BASE_IMAGE=...
+ARG NEMO_BASE_IMAGE=nvcr.io/nvidia/nemo:24.12-rc0
+FROM ${NEMO_BASE_IMAGE} AS nemo-base-image
+
+# Python dependencies: replace the transformers build shipped in the base image
+# with the pinned version, then add the remaining pinned packages. Everything is
+# done in one layer and without the pip download cache to keep the image small.
+RUN pip uninstall -y transformers \
+    && pip install --no-cache-dir \
+        blobfile==3.0.0 \
+        prettytable==3.12.0 \
+        transformers==4.47.1 \
+    && pip install --no-cache-dir git+https://github.com/mlcommons/logging.git@4.1.0-rc3
+
+# OS packages required by the rclone installer. The package index is refreshed
+# and removed in the same layer so no stale apt lists are baked into the image.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install rclone. The installer is downloaded to a file first (instead of being
+# piped straight into bash) so that an HTTP or network failure aborts the build
+# rather than silently running an empty or partial script.
+RUN curl -fsSL https://rclone.org/install.sh -o /tmp/rclone-install.sh \
+    && bash /tmp/rclone-install.sh \
+    && rm -f /tmp/rclone-install.sh
+
+# setup workspace
+WORKDIR /workspace/llama31
+COPY . .
+
+# Fixes the validation dataset order
+RUN patch --directory=/opt/megatron-lm -p1 < mcore.patch
@@ -0,0 +1,95 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# SSH: username that connects to the remote cluster
+export USER="root"
+# SSH: remote cluster URL
+export HOST="$(hostname)"
+# Slurm: account for job submission 
+export ACCOUNT="default"
+# Slurm: partition for job submission
+export PARTITION="gpu"
+# Slurm: job time limit, defaults to 4 hours
+# export TIME="04:00:00"
+# Slurm: --nodes arguments, default to use 288 nodes
+export NNODES=1
+# Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node
+export GPUS_PER_NODE=1
+# Slurm: max job retries for transient job failures, defaults to retry 3 times
+export MAX_RETRIES=3
+
+# Folder mapping:
+# Output directory that holds logs, any path that you like. 
+export JOB_DIR=""
+# Image / container path, either local cache file or remote URL
+export IMAGE="nvcr.io/nvidia/nemo:24.12-rc0"
+# Dataset: C4 dataset location that contains the dataset after preprocessing
+# This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
+export PREPROCESSED_PATH=""
+# Dataset: Numpy index working directory, contains shuffled dataset
+# This path must be able to hold >400GB data
+export TMP_NPY_INDEX="/temp_npy_index"
+# Dataset: Tokenizer path
+# This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
+export TOKENIZER_PATH=""
+
+# Model: checkpoint and tokenizer path
+#     This is the checkpoint that we want to start with.
+#     Each checkpoint should be a folder containing two sub-folders: context and weights.
+#     Pass the path of that folder (the one containing context and weights) here.
+export MODEL_CKPT=""
+# Model: Continual checkpoint directory to write to and resume from
+#     This is the directory that holds all intermediate checkpoints.
+#     Once a run is complete and checkpoint saving is enabled,
+#     a checkpoint should be written in this folder
+#     with the name `checkpoint-par-x-y-steps`.
+#     Inside that directory there should be a `checkpoint` directory that holds context and weights,
+#     which is the "actual checkpoint".
+#     Note that this path must be able to hold at least 5.2TB of data, since each checkpoint is 5.2TB.
+export CONTINUAL_CKPT="/continual_ckpt"
+# Model: Whether to restore from the MODEL_CKPT path. If 0, nothing is restored.
+export USE_CKPT=0
+# Model: Whether we are resuming from a NeMo-formatted HuggingFace checkpoint (weights only).
+#     If set to 1, the checkpoint resuming code will not try to load the optimizer states.
+export FROM_HF=1
+# Model: Whether to save a checkpoint. Must be 1 if NPAR > 1. If 1, a checkpoint is saved at the end.
+export SAVE_CKPT=0
+
+
+# Training Configs:
+# Model: size, to choose from 8b, 70b, 405b
+export SIZE="8b"
+# Dataloader: Global batch size
+export GBS=16
+# Dataloader: Micro batch size
+export MBS=1
+# Dataloader: Maximum number of batches to run, optional
+#     If an empty string is provided (""), training continues until the time limit.
+#     If a checkpoint is to be saved, this value must be set.
+export MAX_STEPS="100"
+
+# Experiment: starting steps
+#     This is the starting "offset" step from the checkpoint.
+#     For instance, if you are resuming from a checkpoint folder `checkpoint-par-0-20-steps/checkpoint`,
+#     meaning the model was trained for 20 steps to generate the checkpoint,
+#     then the value 20 is needed here.
+export START_STEPS="0"
+# Experiment manager: Number of experiments to launch
+export NEXP=1
+# Experiment manager: how many consecutive jobs we want for each experiment
+export NPAR=1
+# Experiment manager: seeds for the launched experiments, space-delimited, such as "1234 1235 1236"
+#     The training script discards any excess seeds and generates seeds if fewer than NEXP are given.
+#     To preserve randomness, the general recommendation is to leave this unset so seeds are generated randomly each time.
+#     NOTE(review): a fixed seed is set here despite that recommendation - presumably for a reproducible demo; confirm.
+export SEEDS="42"
@@ -0,0 +1 @@
+workspace/*
@@ -0,0 +1,72 @@
+# MLCube for LLaMA 3.1
+
+MLCube™ GitHub [repository](https://github.com/mlcommons/mlcube). MLCube™ [wiki](https://mlcommons.github.io/mlcube/).
+
+## Project setup
+
+Docker must be installed on the host machine before you begin.
+
+```bash
+# Create Python environment and install MLCube Docker runner 
+virtualenv -p python3 ./env && source ./env/bin/activate && pip install mlcube-docker
+# Fetch the implementation from GitHub
+git clone https://github.com/mlcommons/training && cd ./training
+git fetch origin pull/792/head:feature/mlcube_llama3.1 && git checkout feature/mlcube_llama3.1
+cd ./large_language_model_pretraining/nemo/mlcube
+```
+
+Inside the `mlcube` directory, run the following command to list the implemented tasks.
+
+```shell
+mlcube describe
+```
+
+### Extra requirements
+
+#### NVIDIA Driver
+
+The base Docker image requires the host machine to have the NVIDIA driver version [560.35.03](https://www.nvidia.com/en-us/drivers/details/231063/) installed.
+
+#### Rclone
+
+Install Rclone on your system by following [these instructions](https://rclone.org/install/).
+
+MLCommons hosts the model for download exclusively by MLCommons Members. You must first agree to the [confidentiality notice](https://sites.google.com/view/mlcommons-llama3-1). If you cannot access the form, follow these [instructions](https://github.com/mlcommons/training/tree/master/large_language_model_pretraining/nemo#checkpoint-download).
+
+After submitting the form, you will receive an email granting access to a Drive folder that contains a file called `Llama 3.1 CLI Download Instructions`. Follow the instructions in that file up to and including the step `3. Authenticate Rclone with Google Drive`.
+
+Once this step is complete, the Rclone configuration file will contain the data needed to download the dataset and models. To find where this file is located, run:
+
+```bash
+rclone config file
+```
+
+ **Default:** `~/.config/rclone/rclone.conf`
+
+Finally, copy that file into the `workspace` folder located next to this README. The copy must be named `rclone.conf`.
+
+### MLCube tasks
+
+* Demo tasks:
+
+Download demo dataset.
+
+```shell
+mlcube run --task=download_demo -Pdocker.build_strategy=always
+```
+
+Train demo.
+
+```shell
+mlcube run --task=demo -Pdocker.build_strategy=always
+```
+
+### Execute the complete pipeline
+
+You can execute the complete pipeline with one single command.
+
+* Demo pipeline:
+
+```shell
+mlcube run --task=download_demo,demo -Pdocker.build_strategy=always
+```
@@ -0,0 +1,36 @@
+# MLCube manifest for the Llama 3.1 pretraining demo: declares the container to
+# build and run, plus the tasks (entrypoint script with its inputs and outputs).
+name: Llama 3.1
+description: Large Language Model pretraining - Llama 3.1
+authors:
+  - { name: "MLCommons Best Practices Working Group" }
+
+platform:
+  # Number of accelerators declared for this cube (presumably GPUs - confirm).
+  accelerator_count: 1
+
+docker:
+  # Image name.
+  image: dfjbtest/llama_3.1:0.0.1
+  # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
+  # "../" selects the parent of the directory that holds this file.
+  build_context: "../"
+  # Docker file name within docker build context, default is `Dockerfile`.
+  build_file: "Dockerfile_mlcube"
+  # GPU arguments
+  # Exposes all GPUs, shares the host network and IPC namespaces, and bind-mounts
+  # the invoking user's ~/.ssh into /root/.ssh inside the container.
+  # NOTE(review): the SSH mount exposes host keys to the container - confirm it is required.
+  gpu_args: "--gpus=all --shm-size=1g --network=host --ipc=host -v ~/.ssh:/root/.ssh"
+
+tasks:
+  # Downloads the demo dataset and model; takes the Rclone configuration file as input.
+  download_demo:
+    entrypoint: ./utils/download_demo.sh -a
+    parameters:
+      inputs:
+        # Rclone configuration file; the README asks for it to be placed in the workspace folder.
+        rclone_config: rclone.conf
+      outputs:
+        data_dir: demo_data/
+        model_dir: demo_model/
+  # Runs the demo training; its inputs live under the directories written by download_demo.
+  demo:
+    entrypoint: ./run_demo.sh -a
+    parameters:
+      inputs:
+        preprocessed_path: demo_data/mixtral_8x22b_preprocessed
+        tokenizer_path: demo_data/tokenizer
+        model_ckpt: demo_model/
+      outputs:
+        # Output directory for the demo run (presumably logs and results - confirm).
+        job_dir: demo_result/
@@ -372,7 +372,7 @@ def get_parser() -> argparse.ArgumentParser:
     tp = pretrain.trainer.strategy.tensor_model_parallel_size
     pp = pretrain.trainer.strategy.pipeline_model_parallel_size
     cp = pretrain.trainer.strategy.context_parallel_size
-    dp = (pretrain.trainer.num_nodes * pretrain.trainer.devices) // (tp * pp * cp)
+    dp = ((pretrain.trainer.num_nodes * pretrain.trainer.devices) // (tp * pp * cp)) or 1
     mini_batch_size = (args.gbs // dp)
     grad_accumulation_steps = mini_batch_size // args.mbs