Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 120 additions & 15 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.5)
cmake_minimum_required(VERSION 3.10)
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

if (POLICY CMP0074)
Expand Down Expand Up @@ -35,13 +35,17 @@ option(USE_FBGEMM "Use FBGEMM" OFF)
option(USE_MKL "Compile with MKL support" ON)
option(USE_MPI "Use MPI library" OFF)
option(USE_NCCL "Use NCCL library" ON)
option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
option(USE_LOCAL_NCCL "Use the local NCCL library from git submodule (about version 2.8.3). Might need for legacy build." OFF)
option(USE_SENTENCEPIECE "Use SentencePiece" ON)
option(USE_LOCAL_SENTENCEPIECE "Use the local SentencePiece from git submodule (about 1.9.4, with bugs)" OFF)
option(USE_TCMALLOC "Use TCMALLOC if available" ON)
option(USE_STATIC_LIBS "Link statically against non-system libs" OFF)
option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF)
option(DETERMINISTIC "Try to make training results as deterministic as possible (e.g. for testing)" OFF)
option(PYMARIAN "Build Pymarian package which is based on pybind11" OFF)
option(USE_OPENSSL "Use OpenSSL library" OFF)
# Note: if you can't find cublasLt, turn this parameter OFF, and optionally modify this CMake config to look for it at a specific directory.
option(DISABLE_CUBLASLT_DEFAULT_PATH_LOOKUP "Disable using automatic discovery of cublasLt library (for historic reason)" ON)

# fbgemm and sentencepiece are both defined with "non-local" installation targets (the source projects don't define them,
# so we define them in src\3rd_party\CMakeLists.txt), but that isn't supported until CMake 3.12. Prior to CMake 3.12,
Expand Down Expand Up @@ -339,9 +343,38 @@ endif()
# SentencePiece support.
# Defines USE_SENTENCEPIECE for both the host compiler and nvcc, then adds the
# two SentencePiece libraries to EXT_LIBS exactly once:
#  - USE_LOCAL_SENTENCEPIECE=ON : link the names `sentencepiece` and
#    `sentencepiece_train` (presumably targets built from the git submodule;
#    they are defined outside this section -- verify).
#  - USE_LOCAL_SENTENCEPIECE=OFF: locate installed libraries via find_library()
#    and fail the configuration if either one is missing.
if(USE_SENTENCEPIECE)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_SENTENCEPIECE")
  LIST(APPEND CUDA_NVCC_FLAGS -DUSE_SENTENCEPIECE; )

  if(USE_LOCAL_SENTENCEPIECE)
    set(EXT_LIBS ${EXT_LIBS} sentencepiece sentencepiece_train)
  else()
    # For static builds, temporarily restrict the library suffixes so that
    # find_library() prefers static archives; the original list is restored below.
    # NOTE(review): the static path was reported as untested by the author
    # (no static CUDA libs were available); it is unclear where the protobuf
    # symbols needed by a static SentencePiece would come from.
    if(USE_STATIC_LIBS)
      set(_sentencepiece_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
      if(WIN32)
        # NOTE(review): untested on Windows.
        list(INSERT CMAKE_FIND_LIBRARY_SUFFIXES 0 .lib .a)
      else()
        set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
      endif()
    endif()

    find_library(LIB_sentencepiece NAMES sentencepiece)
    find_library(LIB_sentencepiece_train NAMES sentencepiece_train)
    if(NOT LIB_sentencepiece OR NOT LIB_sentencepiece_train)
      message(FATAL_ERROR "Could not find sentencepiece (did you want to specify -DUSE_LOCAL_SENTENCEPIECE=ON?)")
    endif()
    message(STATUS "Sentencepiece found: ${LIB_sentencepiece}, ${LIB_sentencepiece_train}")
    set(EXT_LIBS ${EXT_LIBS} ${LIB_sentencepiece} ${LIB_sentencepiece_train})

    if(USE_STATIC_LIBS)
      # Restore the suffix list so later find_library() calls are unaffected.
      set(CMAKE_FIND_LIBRARY_SUFFIXES ${_sentencepiece_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
    endif()
  endif()
endif()

if(USE_ONNX)
Expand Down Expand Up @@ -383,13 +416,26 @@ if(USE_STATIC_LIBS)
endif()
endif()

# NOTE(deprecation): the FindCUDA is deprecated, see "cmake --help-policy CMP0146"
#
# But for now we keep it, just need to manually add cuBLASLt.
#
find_package(CUDA "9.0") # TODO: only enable FP16-related options for compute_70 and higher.
if(CUDA_FOUND)
# CUDA >= 10.0 requires CMake >= 3.12.2
if((CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0") AND (CMAKE_VERSION VERSION_LESS "3.12.2"))
message(WARNING "On some Unix systems CUDA 10.0+ requires CMake 3.12.2+; you use CMake ${CMAKE_VERSION}")
endif()

# cuBLASLt, at least on nix, is packaged together with cuBLAS.
# Not sure when this change was introduced, or whether this is how it is generally done.
#if(CUDA_VERSION VERSION_EQUAL "12.8" OR CUDA_VERSION VERSION_GREATER "12.8")
# message(STATUS "Before adding cublasLt: CUDA_CUBLAS_LIBRARIES=${CUDA_CUBLAS_LIBRARIES}")
# find_library(CUBLASLT NAMES cublasLt)
# set(CUDA_CUBLAS_LIBRARIES ${CUDA_CUBLAS_LIBRARIES} ${CUBLASLT})
# message(STATUS "Updated CUDA_CUBLAS_LIBRARIES libs to ${CUDA_CUBLAS_LIBRARIES}")
#endif()

# We want to compile as many targets as possible but different CUDA versions support different targets.
# Let's instead enable options based on what cuda version we have.
if((CUDA_VERSION VERSION_EQUAL "9.0" OR CUDA_VERSION VERSION_GREATER "9.0") AND CUDA_VERSION VERSION_LESS "11.0")
Expand Down Expand Up @@ -424,6 +470,30 @@ if(CUDA_FOUND)
option(COMPILE_AMPERE_RTX "Compile GPU version with SM86 support" ON)
LIST(APPEND COMPUTE -Wno-deprecated-gpu-targets)
endif()
# Architecture options for CUDA 12.6 and newer.
# option() only sets a default when the cache entry does not exist yet, so any
# option already declared earlier (or passed with -D) keeps its value here.
if(CUDA_VERSION VERSION_EQUAL "12.6" OR CUDA_VERSION VERSION_GREATER "12.6")
  option(COMPILE_KEPLER "Compile GPU version with SM35 support" OFF) # deprecated for CUDA 11
  option(COMPILE_MAXWELL "Compile GPU version with SM50 support" OFF) # deprecated for CUDA 11
  option(COMPILE_PASCAL "Compile GPU version with SM60 support" ON)
  option(COMPILE_VOLTA "Compile GPU version with SM70 support" ON)
  option(COMPILE_TURING "Compile GPU version with SM75 support" ON)
  option(COMPILE_AMPERE "Compile GPU version with SM80 support" ON)
  option(COMPILE_AMPERE_RTX "Compile GPU version with SM86 support" ON)
  LIST(APPEND COMPUTE -Wno-deprecated-gpu-targets)

  # Blackwell targets (compute_100 / compute_120) are only understood by nvcc
  # from CUDA 12.8 on, so their options must not default to ON for CUDA 12.6.
  # Nested here because every 12.8+ toolkit also satisfies the 12.6 check above,
  # which has already declared the older options and appended the warning flag.
  if(CUDA_VERSION VERSION_EQUAL "12.8" OR CUDA_VERSION VERSION_GREATER "12.8")
    option(COMPILE_BLACKWELL_100 "Compile GPU version with SM100 support" ON)
    option(COMPILE_BLACKWELL_120 "Compile GPU version with SM120 support" ON)
  endif()
endif()


message(STATUS "CUDA_VERSION=${CUDA_VERSION}; CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}")
if(COMPILE_KEPLER)
Expand Down Expand Up @@ -460,6 +530,18 @@ if(CUDA_FOUND)
LIST(APPEND COMPUTE -gencode=arch=compute_86,code=sm_86; -gencode=arch=compute_86,code=compute_86) # Ampere RTX GPUs
endif(COMPILE_AMPERE_RTX)
endif()
# Blackwell code generation flags.
# Both SM100 and SM120 need a CUDA 12.8+ nvcc; older toolkits reject
# compute_100 / compute_120 as unsupported architectures, so the flags are only
# appended to COMPUTE when the toolkit is new enough AND the option is enabled.
if(CUDA_VERSION VERSION_EQUAL "12.8" OR CUDA_VERSION VERSION_GREATER "12.8")
  if(COMPILE_BLACKWELL_100)
    message(STATUS "Compiling code for Blackwell (CUDA 12.8+, SM100) GPUs")
    LIST(APPEND COMPUTE -gencode=arch=compute_100,code=sm_100; -gencode=arch=compute_100,code=compute_100)
  endif()
  if(COMPILE_BLACKWELL_120)
    message(STATUS "Compiling code for Blackwell (CUDA 12.8+, SM120) GPUs")
    LIST(APPEND COMPUTE -gencode=arch=compute_120,code=sm_120; -gencode=arch=compute_120,code=compute_120)
  endif()
endif()

if(USE_STATIC_LIBS)
set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusparse_LIBRARY})
Expand All @@ -473,31 +555,41 @@ if(CUDA_FOUND)
elseif(NOT WIN32)
message(FATAL_ERROR "cuLIBOS library not found")
endif()
# CUDA 10.1 introduces cublasLt library that is required on static build
if ((CUDA_VERSION VERSION_EQUAL "10.1" OR CUDA_VERSION VERSION_GREATER "10.1"))
find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
# CUDA 10.1 introduces cublasLt library that is required on static build.
if (CUDA_VERSION VERSION_EQUAL "10.1" OR CUDA_VERSION VERSION_GREATER "10.1")
if (DISABLE_CUBLASLT_DEFAULT_PATH_LOOKUP)
find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 NO_DEFAULT_PATH)
else(DISABLE_CUBLASLT_DEFAULT_PATH_LOOKUP)
# NOTE: didn't test this path, not sure if/how cublasLt is statically bundled separately.
find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
endif()
if(NOT CUDA_cublasLt_LIBRARY)
message(FATAL_ERROR "cuBLASLt library not found")
message(FATAL_ERROR "cuBLASLt library not found (consider passing -DDISABLE_CUBLASLT_DEFAULT_PATH_LOOKUP=OFF). -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}")
endif()
set(EXT_LIBS ${EXT_LIBS} ${CUDA_cublasLt_LIBRARY})
set(CUDA_LIBS ${CUDA_LIBS} ${CUDA_cublasLt_LIBRARY})
endif()
message(STATUS "Found CUDA libraries: ${CUDA_LIBS}")
else(USE_STATIC_LIBS)
set(CUDA_LIBS ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
# We actually only need cublasLt here after cuda 11. Marian will work fine without it pre cuda 11. We want to force CMake to use the cublas
# version that ships with CUDA 11 so we force the search to occur inside of the cuda toolkit directory.
set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
set(CUDA_LIBS ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0"))
find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 NO_DEFAULT_PATH)
if (CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0")
if (DISABLE_CUBLASLT_DEFAULT_PATH_LOOKUP)
find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 NO_DEFAULT_PATH)
else(DISABLE_CUBLASLT_DEFAULT_PATH_LOOKUP)
# Not sure why the forcing was necessary at 11, but at least on a current setup, where the library might not be in the CUDA_TOOLKIT_ROOT_DIR,
# the NO_DEFAULT_PATH would prevent picking it up. For example, nix-flake based build will supply all the build-inputs (lib dependencies)
# through the CMAKE_LIBRARY_PATH and CMAKE_INCLUDE_PATH env variables.
find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
endif()
if(NOT CUDA_cublasLt_LIBRARY)
message(FATAL_ERROR "cuBLASLt library not found. -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}")
message(FATAL_ERROR "cuBLASLt library not found (consider passing -DDISABLE_CUBLASLT_DEFAULT_PATH_LOOKUP=OFF). -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}")
endif()
set(EXT_LIBS ${EXT_LIBS} ${CUDA_cublasLt_LIBRARY})
set(CUDA_LIBS ${CUDA_LIBS} ${CUDA_cublasLt_LIBRARY})
endif()
set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
message(STATUS "Found CUDA libraries: ${CUDA_LIBS}")
endif(USE_STATIC_LIBS)

if(USE_CUDNN)
Expand All @@ -518,8 +610,19 @@ if(CUDA_FOUND)
endif()

# NCCL support.
# Adds exactly one NCCL library to the link line and defines USE_NCCL for both
# the host compiler and nvcc:
#  - USE_LOCAL_NCCL=ON : declare the imported static target `nccl` (its location
#    is presumably set where the bundled submodule is built -- verify) and link it.
#  - USE_LOCAL_NCCL=OFF: locate an installed NCCL via find_library() and fail
#    the configuration if it cannot be found.
if(USE_NCCL)
  if(USE_LOCAL_NCCL)
    # Declared only in this branch: defining the same imported target twice is
    # a CMake error, and the system-library branch must not link the bundled name.
    add_library(nccl STATIC IMPORTED)
    message(STATUS "Using local bundled nccl library")
    set(EXT_LIBS ${EXT_LIBS} nccl)
  else()
    find_library(CUDA_nccl_LIBRARY NAMES nccl)
    if(NOT CUDA_nccl_LIBRARY)
      message(FATAL_ERROR "CUDA NCCL library not found")
    endif()
    message(STATUS "Found CUDA NCCL library: ${CUDA_nccl_LIBRARY}")
    set(CUDA_LIBS ${CUDA_LIBS} ${CUDA_nccl_LIBRARY})
    set(EXT_LIBS ${EXT_LIBS} ${CUDA_nccl_LIBRARY})
  endif()
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_NCCL")
  LIST(APPEND CUDA_NVCC_FLAGS -DUSE_NCCL; )
endif()
Expand All @@ -528,6 +631,8 @@ if(CUDA_FOUND)
set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
endif()

message(STATUS "Found CUDA libraries: ${CUDA_LIBS}")

else(CUDA_FOUND)
message("
Cannot find suitable CUDA libraries. Specify the path explicitly with
Expand Down
4 changes: 4 additions & 0 deletions cmake/FindMKL.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ set(INTEL_ROOT ${INTEL_ROOT_DEFAULT} CACHE PATH "Folder contains intel libs")
find_path(MKL_ROOT include/mkl.h
PATHS $ENV{MKLROOT} ${INTEL_ROOT}/mkl ${INTEL_ROOT}/oneapi/mkl/latest
DOC "Folder contains MKL")
message(STATUS "MKL_ROOT ${MKL_ROOT}")

find_path(MKL_INCLUDE_DIR NAMES mkl.h HINTS ${MKL_ROOT}/include /usr/include/mkl)
message(STATUS "MKL_INCLUDE_DIR ${MKL_INCLUDE_DIR}")

find_library(MKL_INTERFACE_LIBRARY
NAMES ${INT_LIB}
Expand All @@ -73,6 +75,8 @@ find_library(MKL_INTERFACE_LIBRARY
${MKL_ROOT}/lib/intel64_win
${INTEL_ROOT}/mkl/lib/intel64
${INTEL_ROOT}/oneapi/mkl/latest/lib)
message(STATUS "INT_LIB ${INT_LIB}")
message(STATUS "MKL_INTERFACE_LIBRARY ${MKL_INTERFACE_LIBRARY}")

find_library(MKL_SEQUENTIAL_LAYER_LIBRARY
NAMES ${SEQ_LIB}
Expand Down
67 changes: 67 additions & 0 deletions doc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,73 @@ Then set up a Python environment and install modules:

Documentation building should also work on Windows, but it has not been tested.

### Building using Nix

On existing Linux systems, install the [nix package
manager](https://nixos.org/download/). Then in the checkout directory where
`flake.nix` and `flake.lock` are found, execute

```
nix \
--extra-experimental-features nix-command \
--extra-experimental-features flakes \
develop

```

which would drop you in a development shell, with all the library dependencies
and environment variables prepared. It might need to compile some of those
dependencies first, so you might want to drink a tea in the meantime.

The build is mostly deterministic, thanks to the `flake.lock` mechanism that
pins the inputs and build environment, which helps to avoid surprises.

#### Running nix-built binaries with the system CUDA driver.

A bit of context first; jump to the last paragraph for the TL;DR. For CUDA, there
are two sets of libraries: the CUDA runtime and the CUDA driver. The CUDA
runtime (somewhat ill-named; "userland" might be a better term) consists of the
helper libraries, for example cublas or cusparse (`libcu...so.x`). The CUDA
driver, on the other hand, is mostly `libcuda.so.1` (and in some cases
`libnvidia-ptxjitcompiler.so.1`).

The choice of CUDA runtime (or userland) version depends on the
application (here, Marian), and the CUDA runtime level it wants to code
against (in case of Marian, at least CUDA 9).

But the CUDA driver library is tightly coupled to your kernel and GPU, and is
not something that a given application would know upfront. So a CUDA
application would either link to a stub driver (inspectable via ldd) and expect
a run-time override (for example via `LD_PRELOAD` or `LD_LIBRARY_PATH`), or
not even register a dynamic dependency upfront, but rather try to load it
dynamically after program startup (latter is what we would observe with the
nix-built marian).

Now, nix will build binaries that are linked against very specific shared
libraries (in the nix store) to avoid dependency hell. The built marian
executables indeed link against the CUDA libraries sourced via nixpkgs.

But the nix build has no way of knowing which CUDA driver library is the right
one to link against, so it doesn't link against any. This results in an obscure
crash if you run the built executables as they are.

So, to run [Nix CUDA on non-NixOS
systems](https://danieldk.eu/Software/Nix/Nix-CUDA-on-non-NixOS-systems),
the most straightforward solution is to symlink the CUDA driver libraries into
a separate directory, and pass that directory in `LD_LIBRARY_PATH`. (Note,
passing the whole of `/usr/lib64` or similar is not recommended, since it would
override the specific nix-linked libraries too).

```
mkdir /run/nvidia-libs
ln -s /usr/lib64/libcuda.so.1 /run/nvidia-libs/
LD_LIBRARY_PATH=/run/nvidia-libs marian ...

```

(If a crash still occurs, prefix the command with `LD_DEBUG=libs` or run it
under `strace` to see which library was searched for, and where, before the
crash. With the method above, no crash is expected.)

## Generation

Expand Down
61 changes: 61 additions & 0 deletions flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading