Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions large_language_model_pretraining/nemo/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Exclude the MLCube workspace directory from the Docker build context.
mlcube/workspace/
33 changes: 33 additions & 0 deletions large_language_model_pretraining/nemo/Dockerfile_mlcube
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Base image; override at build time with --build-arg NEMO_BASE_IMAGE=<image>.
ARG NEMO_BASE_IMAGE=nvcr.io/nvidia/nemo:24.12-rc0
FROM ${NEMO_BASE_IMAGE} AS nemo-base-image

# Python dependencies.
# The transformers package that comes with the base image is removed first so
# that the pinned release below replaces it. All pip work happens in a single
# layer, and --no-cache-dir keeps pip's download cache out of the image.
RUN pip uninstall transformers -y \
    && pip install --no-cache-dir \
        blobfile==3.0.0 \
        prettytable==3.12.0 \
        transformers==4.47.1 \
    && pip install --no-cache-dir git+https://github.com/mlcommons/logging.git@4.1.0-rc3

# OS tools needed by the rclone installer. ca-certificates is listed explicitly
# because --no-install-recommends would otherwise not pull it in with curl.
# The apt package lists are removed in the same layer that created them.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# Install rclone. The installer is downloaded to a file first instead of being
# piped into bash: with a pipe and no `pipefail`, a failed download would still
# let the RUN step succeed and produce an image without rclone. `curl -f` makes
# HTTP errors fail the build as well.
RUN curl -fsSL https://rclone.org/install.sh -o /tmp/rclone-install.sh \
    && bash /tmp/rclone-install.sh \
    && rm -f /tmp/rclone-install.sh

# setup workspace
WORKDIR /workspace/llama31
COPY . .

# Fixes the validation dataset order
RUN patch --directory=/opt/megatron-lm -p1 < mcore.patch
95 changes: 95 additions & 0 deletions large_language_model_pretraining/nemo/demo_config.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# SSH: username used to connect to the remote cluster
export USER="root"
# SSH: remote cluster URL; the demo uses the hostname of the machine running this script.
# The value is assigned before it is exported so that a failing `hostname`
# is not hidden behind the exit status of `export` (ShellCheck SC2155).
HOST="$(hostname)"
export HOST
# Slurm: account for job submission
export ACCOUNT="default"
# Slurm: partition for job submission
export PARTITION="gpu"
# Slurm: job time limit, defaults to 4 hours
# export TIME="04:00:00"
# Slurm: --nodes argument (documented default: 288 nodes); the demo runs on a single node
export NNODES=1
# Slurm: --gpus_per_node and --ntasks_per_node argument (documented default: 8 GPUs per node);
# the demo uses a single GPU
export GPUS_PER_NODE=1
# Slurm: max job retries for transient job failures, defaults to retry 3 times
export MAX_RETRIES=3

# ---------------------------------------------------------------------------
# Folder mapping
# ---------------------------------------------------------------------------
# JOB_DIR: output directory for logs; any path may be used.
JOB_DIR=""
# IMAGE: container image, given as a local cache file or a remote URL.
IMAGE="nvcr.io/nvidia/nemo:24.12-rc0"
# PREPROCESSED_PATH: location of the preprocessed C4 dataset.
# Same as PREPROCESSED_PATH in the dataset download part of README section 3.
PREPROCESSED_PATH=""
# TMP_NPY_INDEX: working directory for the Numpy index (shuffled dataset).
# Needs room for more than 400GB of data.
TMP_NPY_INDEX="/temp_npy_index"
# TOKENIZER_PATH: tokenizer location.
# Same as TOKENIZER_PATH in the tokenizer download part of README section 3.
TOKENIZER_PATH=""
export JOB_DIR IMAGE PREPROCESSED_PATH TMP_NPY_INDEX TOKENIZER_PATH

# ---------------------------------------------------------------------------
# Model checkpoints
# ---------------------------------------------------------------------------
# MODEL_CKPT: checkpoint to start from.
# A checkpoint is a folder with two sub-folders, `context` and `weights`;
# pass the path of the folder that contains both.
MODEL_CKPT=""
# CONTINUAL_CKPT: directory where intermediate checkpoints are written and resumed from.
# When a run completes with checkpoint saving enabled, a checkpoint named
# `checkpoint-par-x-y-steps` appears here. Inside it, a `checkpoint` directory
# holds context and weights -- that directory is the "actual checkpoint".
# This path needs at least 5.2TB of space, since each checkpoint is 5.2TB.
CONTINUAL_CKPT="/continual_ckpt"
# USE_CKPT: whether to restore from MODEL_CKPT. 0 means no restore.
USE_CKPT=0
# FROM_HF: whether the checkpoint being resumed is a NeMo-formatted HuggingFace
# checkpoint (weights only). With 1, the resume code does not try to load
# optimizer states.
FROM_HF=1
# SAVE_CKPT: whether to save a checkpoint at the end of the run.
# Must be 1 if NPAR > 1.
SAVE_CKPT=0
export MODEL_CKPT CONTINUAL_CKPT USE_CKPT FROM_HF SAVE_CKPT


# ---------------------------------------------------------------------------
# Training configuration
# ---------------------------------------------------------------------------
# SIZE: model size, one of 8b, 70b, 405b.
SIZE="8b"
# GBS: dataloader global batch size.
GBS=16
# MBS: dataloader micro batch size.
MBS=1
# MAX_STEPS: optional cap on the number of batches to run.
# With an empty string (""), training continues until the time limit.
# This value must be set if a checkpoint is to be saved.
MAX_STEPS="100"
export SIZE GBS MBS MAX_STEPS

# ---------------------------------------------------------------------------
# Experiment manager
# ---------------------------------------------------------------------------
# START_STEPS: starting "offset" step taken from the checkpoint.
# Example: when resuming from `checkpoint-par-0-20-steps/checkpoint`, the model
# was trained for 20 steps to produce that checkpoint, so 20 is needed here.
START_STEPS="0"
# NEXP: number of experiments to launch.
NEXP=1
# NPAR: number of consecutive jobs for each experiment.
NPAR=1
# SEEDS: seeds handed to the launched experiments, space-delimited,
# such as "1234 1235 1236".
# The training script discards excess seeds and generates seeds when fewer
# than NEXP are given. To preserve randomness, the recommendation is to leave
# this unset so that seeds are randomly generated each time.
SEEDS="42"
export START_STEPS NEXP NPAR SEEDS
1 change: 1 addition & 0 deletions large_language_model_pretraining/nemo/mlcube/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Ignore everything inside the MLCube workspace directory.
workspace/*
72 changes: 72 additions & 0 deletions large_language_model_pretraining/nemo/mlcube/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# MLCube for LLaMA 3.1

MLCube™ GitHub [repository](https://github.com/mlcommons/mlcube). MLCube™ [wiki](https://mlcommons.github.io/mlcube/).

## Project setup

An important requirement is that you must have Docker installed.

```bash
# Create Python environment and install MLCube Docker runner
virtualenv -p python3 ./env && source ./env/bin/activate && pip install mlcube-docker
# Fetch the implementation from GitHub
git clone https://github.com/mlcommons/training && cd ./training
git fetch origin pull/792/head:feature/mlcube_llama3.1 && git checkout feature/mlcube_llama3.1
cd ./large_language_model_pretraining/nemo/mlcube
```

Inside the `mlcube` directory, run the following command to list the implemented tasks.

```shell
mlcube describe
```

### Extra requirements

#### Nvidia Driver

The base Docker image requires the host machine to have the NVIDIA driver version [560.35.03](https://www.nvidia.com/en-us/drivers/details/231063/) installed.

#### Rclone

Install Rclone on your system by following [these instructions](https://rclone.org/install/).

MLCommons hosts the model for download exclusively by MLCommons Members. You must first agree to the [confidentiality notice](https://sites.google.com/view/mlcommons-llama3-1). If you cannot access the form, follow these [instructions](https://github.com/mlcommons/training/tree/master/large_language_model_pretraining/nemo#checkpoint-download).

After you submit the form, you will receive an email with access to the Drive folder containing a file called `Llama 3.1 CLI Download Instructions`. Follow the instructions in that file up to the step `3. Authenticate Rclone with Google Drive`.

After completing this step, the Rclone configuration file will contain the data needed to download the dataset and models. To find out where this file is located, run:

```bash
rclone config file
```

**Default:** `~/.config/rclone/rclone.conf`

Finally, copy that file into the `workspace` folder located in the same directory as this README; the copy must be named `rclone.conf`.

### MLCube tasks

* Demo tasks:

Download demo dataset.

```shell
mlcube run --task=download_demo -Pdocker.build_strategy=always
```

Train demo.

```shell
mlcube run --task=demo -Pdocker.build_strategy=always
```

### Execute the complete pipeline

You can execute the complete pipeline with one single command.

* Demo pipeline:

```shell
mlcube run --task=download_demo,demo -Pdocker.build_strategy=always
```
36 changes: 36 additions & 0 deletions large_language_model_pretraining/nemo/mlcube/mlcube.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# MLCube definition for the Llama 3.1 pretraining demo.
name: "Llama 3.1"
description: "Large Language Model pretraining - Llama 3.1"
authors:
  - name: "MLCommons Best Practices Working Group"

platform:
  accelerator_count: 1

docker:
  # Name (and tag) of the Docker image.
  image: "dfjbtest/llama_3.1:0.0.1"
  # Build context for Docker, given relative to $MLCUBE_ROOT (default: `build`).
  build_context: "../"
  # Name of the Dockerfile inside the build context (default: `Dockerfile`).
  build_file: "Dockerfile_mlcube"
  # GPU arguments.
  gpu_args: "--gpus=all --shm-size=1g --network=host --ipc=host -v ~/.ssh:/root/.ssh"

tasks:
  # Task that runs the demo download script.
  download_demo:
    entrypoint: "./utils/download_demo.sh -a"
    parameters:
      inputs:
        rclone_config: "rclone.conf"
      outputs:
        data_dir: "demo_data/"
        model_dir: "demo_model/"
  # Task that runs the demo script on the downloaded data and model.
  demo:
    entrypoint: "./run_demo.sh -a"
    parameters:
      inputs:
        preprocessed_path: "demo_data/mixtral_8x22b_preprocessed"
        tokenizer_path: "demo_data/tokenizer"
        model_ckpt: "demo_model/"
      outputs:
        job_dir: "demo_result/"
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ def get_parser() -> argparse.ArgumentParser:
tp = pretrain.trainer.strategy.tensor_model_parallel_size
pp = pretrain.trainer.strategy.pipeline_model_parallel_size
cp = pretrain.trainer.strategy.context_parallel_size
dp = (pretrain.trainer.num_nodes * pretrain.trainer.devices) // (tp * pp * cp)
dp = ((pretrain.trainer.num_nodes * pretrain.trainer.devices) // (tp * pp * cp)) or 1
mini_batch_size = (args.gbs // dp)
grad_accumulation_steps = mini_batch_size // args.mbs

Expand Down
Loading