13 changes: 13 additions & 0 deletions .claude/settings.local.json
@@ -0,0 +1,13 @@
{
"permissions": {
"allow": [
"Bash(/Users/florian/miniconda3/envs/tmol/bin/python3.13 -m pytest tmol/tests/pack/test_pack_rotamers.py -k \"mps and not gradcheck\" -v)",
"Bash(python -m pytest tmol/tests/test_mps.py --no-header)",
"Bash(echo \"---EXIT: $?\")",
"Bash(conda activate:*)",
"Bash(python -m pytest tmol/tests/ -k mps --no-header -q --ignore=tmol/tests/score/common/test_uaid_util.py)",
"Bash(/Users/florian/miniconda3/envs/tmol/bin/python -m pytest tmol/tests/ -k mps --no-header -q --ignore=tmol/tests/score/common/test_uaid_util.py)",
"Bash(/Users/florian/miniconda3/envs/tmol/bin/python -c ':*)"
]
}
}
Empty file added .cmake/api/v1/query/cache-v2
Empty file.
16 changes: 16 additions & 0 deletions .gitignore
@@ -1,5 +1,21 @@
.torch_extensions

# scikit-build-core / CMake in-source build artifacts
.cmake/api/v1/reply/
.ninja_deps
.ninja_log
.skbuild-info.json
CMakeCache.txt
CMakeFiles/
CMakeInit.txt
Makefile
build.ninja
cmake_install.cmake
metal_air/

# Output PDB files from pack_rotamers runs
pack_rotamers_*.pdb

# Conda environment
.conda
# Rendered environment definitions
439 changes: 314 additions & 125 deletions CMakeLists.txt

Large diffs are not rendered by default.

82 changes: 71 additions & 11 deletions DEVELOPMENT.md
@@ -15,25 +15,40 @@ This document covers building, testing, and contributing to tmol.

## Local Setup

**NVIDIA GPU (Linux):**
```bash
git clone https://github.com/uw-ipd/tmol.git && cd tmol
pip install -e ".[dev]" # builds C++/CUDA extensions via CMake
```

Requirements: Python 3.10+, PyTorch 2.5+, `nvcc` (CUDA toolkit), C++17 compiler, CMake 3.18+.

**Apple Silicon Mac (macOS):**

> [!IMPORTANT]
> MPS support lives in the **[fnachon/tmol](https://github.com/fnachon/tmol)** fork.
> Clone that repository for Apple Silicon development.

```bash
git clone https://github.com/fnachon/tmol.git && cd tmol
pip install -e ".[dev,mps]" # builds C++/Metal extensions via CMake
```
Requirements: Python 3.10+, PyTorch 2.5+, macOS 13+, Xcode Command Line Tools (`xcode-select --install`), CMake 3.18+. No `nvcc` needed.

## Building Extensions

tmol ships custom C++/CUDA kernels that are compiled via CMake (using scikit-build-core as the build backend). `pip install -e .` handles compilation automatically.
tmol ships custom C++/CUDA/Metal kernels compiled via CMake (using scikit-build-core as the build backend). `pip install -e .` handles compilation automatically.

```bash
# Full build (production extensions only)
# Full build — NVIDIA GPU
pip install -e .

# Full build — Apple Silicon (MPS backend)
pip install -e ".[mps]"

# Build with test extensions
pip install -e . -Ccmake.define.TMOL_BUILD_TESTS=ON

# Target specific GPU architectures (default: "80;86;89;90")
# Target specific CUDA GPU architectures (default: "80;86;89;90")
pip install -e . -Ccmake.define.CMAKE_CUDA_ARCHITECTURES="80;90"

# Control parallelism
@@ -44,14 +59,32 @@ CMake build options:

| Variable | Default | Description |
|----------|---------|-------------|
| `CMAKE_CUDA_ARCHITECTURES` | `80;86;89;90` | GPU compute capabilities to compile for |
| `CMAKE_CUDA_ARCHITECTURES` | `80;86;89;90` | CUDA GPU compute capabilities to compile for |
| `TMOL_BUILD_TESTS` | `OFF` | Build test-only C++/CUDA extensions |
| `TMOL_BUILD_MPS` | auto-detected | Build MPS (Metal) backend; auto-enabled when `xcrun` and Metal SDK are found |
| `TMOL_NVCC_THREADS` | `4` | Threads per nvcc invocation |
| `TMOL_FORCE_CXX11_ABI` | `FALSE` | Force C++11 ABI (for NGC container compat) |
| `TORCH_CUDA_ARCH_LIST` | `8.0 8.6 8.9 9.0 10.0+PTX` | GPU architectures to compile for |
| `MAX_JOBS` | auto | Max parallel compilation jobs |
| `NVCC_THREADS` | `4` | Threads per nvcc invocation |

### MPS / Metal build notes

The MPS backend is enabled automatically on macOS when `xcrun` and the Metal SDK are present (they ship with Xcode Command Line Tools). The build compiles:

- Objective-C++ (`.mm`) bridge files that call Metal API
- Metal Shading Language kernels (`.metal` → `.air` → `tmol_primitives.metallib`) for GPU-accelerated scan, reduce, and segmented scan primitives
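The two-stage shader pipeline can be sketched as a command builder. The command lines below are illustrative only; the real invocations are generated by CMake and may carry additional flags (optimization level, language standard):

```python
# Sketch of the .metal -> .air -> .metallib pipeline described above.
# Flags and paths are illustrative; the actual build is driven by CMake.
def metal_compile_cmds(sources, out="tmol_primitives.metallib"):
    """Return the xcrun command lines that compile Metal sources to a metallib."""
    airs = [src.replace(".metal", ".air") for src in sources]
    # Stage 1: compile each .metal source to an .air intermediate
    compile_cmds = [
        ["xcrun", "-sdk", "macosx", "metal", "-c", src, "-o", air]
        for src, air in zip(sources, airs)
    ]
    # Stage 2: link all .air files into a single metallib
    link_cmd = ["xcrun", "-sdk", "macosx", "metallib", *airs, "-o", out]
    return compile_cmds + [link_cmd]

for cmd in metal_compile_cmds(["scan.metal", "reduce.metal"]):
    print(" ".join(cmd))
```

Each command list can be passed to `subprocess.run(cmd, check=True)` on a machine with Xcode Command Line Tools installed.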

To explicitly enable or disable the MPS backend:

```bash
# Force-enable (will fail if Metal SDK is absent)
pip install -e . -Ccmake.define.TMOL_BUILD_MPS=ON

# Force-disable (CPU-only build on macOS)
pip install -e . -Ccmake.define.TMOL_BUILD_MPS=OFF
```

## Extension Loading: AOT vs JIT

tmol's C++/CUDA kernels can be loaded in two ways:
@@ -101,6 +134,10 @@ JIT mode requires `nvcc` and CUDA headers. You can either:
pip install .[cuda]
```

### MPS / Metal and JIT mode

The MPS backend does not use JIT compilation — Metal shaders are always compiled ahead-of-time at build time via `xcrun metal`. Setting `TMOL_USE_JIT=1` on macOS still compiles the C++/Objective-C++ bridge code via `torch.utils.cpp_extension`, but the `.metallib` binary is loaded from disk. No additional environment variables are needed for MPS.
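The resulting behavior can be summarized as a small decision table (the helper and key names below are hypothetical, for illustration only; tmol's real logic lives in its extension loader):

```python
def extension_load_plan(is_macos: bool, use_jit: bool) -> dict:
    """Illustrative summary of the loading behavior described above.
    Names are hypothetical, not part of the tmol API."""
    if is_macos:
        return {
            # the .metallib is always compiled at build time and loaded from disk
            "metal_shaders": "aot",
            # TMOL_USE_JIT only affects the C++/Objective-C++ bridge code
            "cpp_bridge": "jit" if use_jit else "aot",
        }
    return {"cpp_cuda": "jit" if use_jit else "aot"}

print(extension_load_plan(is_macos=True, use_jit=True))
```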

## Running Tests

```bash
@@ -110,8 +147,11 @@
pytest tmol/tests/ -v
# Specific test file
pytest tmol/tests/score/test_score_function.py -v

# Only CPU tests (skip cuda-parametrized tests)
pytest tmol/tests/ -v -k "not cuda"
# Only CPU tests (skip cuda- and mps-parametrized tests)
pytest tmol/tests/ -v -k "not cuda and not mps"

# Only MPS tests (Apple Silicon)
pytest tmol/tests/test_mps.py -v

# With coverage
pytest tmol/tests/ --cov=./tmol --junitxml=results.xml
@@ -120,14 +160,31 @@
pytest --benchmark-enable --benchmark-only --benchmark-max-time=.1
```

### MPS test suite

> [!NOTE]
> MPS tests require the [fnachon/tmol](https://github.com/fnachon/tmol) fork — the upstream repository does not include MPS patches.

`tmol/tests/test_mps.py` contains a five-layer smoke test for the Apple Silicon backend:

| Layer | What it checks |
|-------|---------------|
| 1 — Tensor plumbing | MPS availability, creation, matmul, autograd |
| 2 — Primitives | cumsum, reduce, elementwise ops via PyTorch wrappers |
| 3 — Dispatch macro | Pose stack construction on MPS (exercises compiled ops) |
| 4 — Forward pass | CartBonded, Elec, LJLK, HBond, full beta2016 score function |
| 5 — CPU consistency | MPS scores and gradients match CPU within float32 tolerance |

All tests are automatically skipped on non-Apple-Silicon machines via the `@requires_mps` mark.

### Testing a specific release

```bash
# Install a release wheel from GitHub
# CUDA/Linux: install a release wheel from the upstream GitHub
pip install https://github.com/uw-ipd/tmol/releases/download/v0.1.1/tmol-0.1.1+cu126torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl

# Or install a specific branch/tag from source
pip install git+https://github.com/uw-ipd/tmol.git@v0.1.1
# MPS/macOS: install a specific branch/tag from the MPS fork
pip install git+https://github.com/fnachon/tmol.git@master

# Run tests against it
pytest --pyargs tmol.tests -v
@@ -162,6 +219,9 @@ tmol uses GitHub Actions for all CI:
| `build_wheel.yml` | Push to `kdidi/precompiled_extensions` | Builds wheels across a PyTorch/CUDA/ABI matrix. Saves as artifacts (no upload). |
| `publish.yml` | Manual (`workflow_dispatch`) | Builds wheels + sdist, uploads sdist to TestPyPI, uploads wheels to a GitHub Release. |

> [!NOTE]
> MPS tests (`tmol/tests/test_mps.py`) are not yet part of the automated CI pipeline, which runs on Linux GPU runners. Run them locally on an Apple Silicon Mac with `pytest tmol/tests/test_mps.py -v`.

### CI architecture

```
87 changes: 76 additions & 11 deletions README.md
@@ -1,6 +1,8 @@
# Tmol

`tmol` (TensorMol) is a GPU-accelerated reimplementation of the Rosetta molecular modeling energy function (`beta_nov2016_cart`) in PyTorch with custom C++/CUDA kernels. It computes energies and derivatives for protein structures and supports gradient-based minimization, enabling ML models to incorporate biophysical scoring during training or to refine predicted structures with Rosetta's experimentally validated energy function.
`tmol` (TensorMol) is a GPU-accelerated reimplementation of the Rosetta molecular modeling energy function (`beta_nov2016_cart`) in PyTorch with custom C++/CUDA/Metal kernels. It computes energies and derivatives for protein structures and supports gradient-based minimization, enabling ML models to incorporate biophysical scoring during training or to refine predicted structures with Rosetta's experimentally validated energy function.

tmol runs on **NVIDIA GPUs** (CUDA), **Apple Silicon Macs** (MPS / Metal), and **CPU**.

Full documentation: [tmol Wiki](https://github.com/uw-ipd/tmol/wiki/DevHome)

@@ -14,9 +16,61 @@ Full documentation: [tmol Wiki](https://github.com/uw-ipd/tmol/wiki/DevHome)

## Installation

### Pre-built wheels (recommended)
### Apple Silicon / MPS (macOS)

tmol runs natively on Apple Silicon (M1/M2/M3/M4) via PyTorch's Metal Performance Shaders (MPS) backend. No CUDA toolkit or NVIDIA GPU is needed.

> [!IMPORTANT]
> MPS support is maintained in the **[fnachon/tmol](https://github.com/fnachon/tmol)** fork.
> The upstream [uw-ipd/tmol](https://github.com/uw-ipd/tmol) repository targets NVIDIA GPUs (CUDA/Linux).
> Use `https://github.com/fnachon/tmol` for Apple Silicon.

**Requirements:**
- macOS 13.0 (Ventura) or later
- Apple Silicon Mac (M-series)
- PyTorch ≥ 2.0 with MPS support (`torch.backends.mps.is_available()` returns `True`)
- Xcode Command Line Tools (`xcode-select --install`)
- Python 3.10+

Pre-built wheels ship with **ahead-of-time (AOT) compiled** C++/CUDA extensions -- no `nvcc` or CUDA toolkit needed at install time.
**Install from source (MPS):**

```bash
# Install PyTorch with MPS support (ships in the standard macOS wheel)
pip install torch

# Clone the MPS-enabled fork
git clone https://github.com/fnachon/tmol.git && cd tmol
pip install -e ".[dev,mps]"
```

**Verify MPS is working:**

```python
import torch
print(torch.backends.mps.is_available()) # must be True

import tmol
pose_stack = tmol.pose_stack_from_pdb("1ubq.pdb", device=torch.device("mps"))
sfxn = tmol.beta2016_score_function(torch.device("mps"))
scorer = sfxn.render_whole_pose_scoring_module(pose_stack)
print(scorer(pose_stack.coords))
```

> [!NOTE]
> The MPS backend uses Apple's unified memory architecture — CPU and GPU share the same physical RAM — so there is no host↔device copy overhead. All energy terms, gradients, and minimization work identically to CUDA.

> [!TIP]
> Run the MPS smoke tests to confirm everything is wired up:
> ```bash
> pytest tmol/tests/test_mps.py -v
> ```

---

### Pre-built wheels (Linux / NVIDIA GPU only)

Pre-built wheels ship with **ahead-of-time (AOT) compiled** C++/CUDA extensions — no `nvcc` or CUDA toolkit needed at install time.
MPS users should install [from source](#from-source) using the [fnachon/tmol](https://github.com/fnachon/tmol) fork.

Wheels are available for Linux x86_64. Pick the one matching your **PyTorch version** and **CXX11 ABI**:

@@ -86,35 +140,46 @@ pip install https://github.com/uw-ipd/tmol/releases/download/RELEASE_TAG/WHEEL_F
pip install tmol --find-links https://github.com/uw-ipd/tmol/releases/download/RELEASE_TAG/
```

### From PyPI (source distribution)
### From PyPI (source distribution, NVIDIA GPU)

The source distribution on PyPI compiles C++/CUDA extensions during installation.
This requires `nvcc` (CUDA toolkit) and a C++17-capable compiler.
The source distribution on PyPI compiles C++ extensions during installation. This targets NVIDIA GPUs; for Apple Silicon use the [fnachon/tmol](https://github.com/fnachon/tmol) fork directly.

```bash
pip install tmol # requires nvcc for kernel compilation
# NVIDIA GPU (requires nvcc / CUDA toolkit)
pip install tmol # requires nvcc for CUDA kernel compilation
pip install tmol[dev] # includes development tools (black, flake8, pytest, etc.)
pip install tmol[cuda] # also installs pip-distributed nvcc + CCCL headers
```

### From source

```bash
# NVIDIA GPU (upstream repository)
git clone https://github.com/uw-ipd/tmol.git && cd tmol
pip install -e ".[dev]" # builds C++/CUDA extensions via CMake

# Apple Silicon — use the MPS fork
git clone https://github.com/fnachon/tmol.git && cd tmol
pip install -e ".[dev,mps]" # builds C++/Metal extensions via CMake
```

## Usage

### Quick start

```python
import torch
import tmol

# Pick your device: "cpu", "cuda", or "mps" (Apple Silicon)
device = torch.device("mps" if torch.backends.mps.is_available() else
"cuda" if torch.cuda.is_available() else "cpu")

# Load a structure
pose_stack = tmol.pose_stack_from_pdb("1ubq.pdb")
pose_stack = tmol.pose_stack_from_pdb("1ubq.pdb", device=device)

# Score it
sfxn = tmol.beta2016_score_function(pose_stack.device)
sfxn = tmol.beta2016_score_function(device)
scorer = sfxn.render_whole_pose_scoring_module(pose_stack)
print(scorer(pose_stack.coords))
```
@@ -168,7 +233,7 @@ xyz = tmol.pose_stack_to_rosettafold2(...)
```

> [!NOTE]
> Tested on Ubuntu 20.04. Other platforms should work but are not yet verified.
> Tested on Ubuntu 20.04 (CUDA) and macOS 14+ (MPS). Other platforms should work but are not yet verified.

> [!WARNING]
> Call `torch.set_grad_enabled(True)` before using the tmol minimizer, since RF2 disables gradients during inference by default.
9 changes: 8 additions & 1 deletion pyproject.toml
@@ -18,6 +18,7 @@ classifiers = [
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: Implementation :: CPython",
"Topic :: Scientific/Engineering :: Bio-Informatics",
]
@@ -87,9 +88,15 @@ cuda = [
# CCCL headers (nv/target, cub/, thrust/) needed by nvcc
"nvidia-cuda-cccl>=13,<14",
]
# MPS (Apple Metal) backend — macOS + Apple Silicon only.
# No pip dependencies: Metal SDK ships with macOS 13+ / Xcode 14+.
# Requires: macOS >= 13.0, Apple Silicon or AMD GPU with Metal 3 support,
# PyTorch >= 2.0 (MPS backend), Xcode Command Line Tools.
mps = []

[project.urls]
repository = "https://github.com/uw-ipd/tmol"
repository = "https://github.com/fnachon/tmol"
"Upstream (CUDA/Linux)" = "https://github.com/uw-ipd/tmol"

# Build settings ---------------------------------------------------------------
[build-system]
28 changes: 21 additions & 7 deletions tmol/_cpp_lib.py
@@ -52,13 +52,17 @@ def __str__(self) -> str:


def _find_extension_library() -> str | None:
"""Locate the _C shared library in tmol's package directory."""
try:
spec = importlib.util.find_spec("tmol._C")
if spec is not None and spec.origin is not None:
return spec.origin
except (ModuleNotFoundError, ValueError):
pass
"""Locate the _C shared library path without loading it."""
import glob
import os

# Search by filename pattern — avoids triggering importlib to load the .so
# (which would register TORCH_LIBRARY ops and cause a double-registration
# crash if load_library is later called for the same file).
package_dir = os.path.dirname(__file__)
matches = glob.glob(os.path.join(package_dir, "_C.*.so"))
if matches:
return matches[0]
return None


@@ -73,6 +77,16 @@ def _ensure_loaded() -> None:
if _loaded:
return

# Check whether the .so is already in the process (loaded by Python's own
# import machinery, e.g. via an editable-install .pth that puts the package
# directory on sys.path and a subsequent `import tmol._C`).
# In that case all TORCH_LIBRARY ops are already registered; calling
# load_library again causes a "Key already registered" C++ abort.
if "tmol._C" in __import__("sys").modules:
_loaded = True
logger.debug("tmol._C already in sys.modules; skipping load_library")
return

lib_path = _find_extension_library()
if lib_path is None:
raise TmolExtensionNotBuiltError()