Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
9b25914
Merge branch 'main' of github.com:mosaicml/streaming
knighton Dec 25, 2023
7867b10
Move epoch_size arg.
knighton Dec 26, 2023
afe835a
Move allow_unsafe_types arg.
knighton Dec 26, 2023
010c613
Fix usage.
knighton Dec 26, 2023
2f4875b
Propagate allow_unsafe_types as a normal Stream argument.
knighton Dec 26, 2023
333605e
Explicitly list the kwargs for Stream.apply_defaults().
knighton Dec 26, 2023
2d8a905
Tweak docstrings.
knighton Dec 26, 2023
d298479
Complete rewrite of local dir collision detection using regular files.
knighton Dec 26, 2023
eff80e6
Add psutil.
knighton Dec 26, 2023
8fb9dca
Fix.
knighton Dec 26, 2023
223f3ca
Fix.
knighton Dec 26, 2023
664cbc1
Fix.
knighton Dec 26, 2023
02416d7
Fix.
knighton Dec 26, 2023
b0b3b56
Fix.
knighton Dec 26, 2023
23505c2
Fix.
knighton Dec 26, 2023
67b936f
Fix.
knighton Dec 26, 2023
fa14130
Fix.
knighton Dec 26, 2023
c54887b
Fix.
knighton Dec 26, 2023
c214589
Fix.
knighton Dec 26, 2023
c0c82bd
Remove dist from StreamingDataset init.
knighton Dec 26, 2023
105ee16
Sleep first out of race paranoia.
knighton Dec 26, 2023
afafcd8
Fix.
knighton Dec 26, 2023
1012b21
Organize world, job/, and shmem/ into streaming/base/coord/.
knighton Dec 30, 2023
0eb3327
Fix.
knighton Dec 30, 2023
ea7c7ad
Fix.
knighton Dec 30, 2023
5267bbb
Fix.
knighton Dec 30, 2023
e8ad400
Merge branch 'main' into james/nodist2
knighton Dec 30, 2023
6581886
Merge branch 'james/nodist2' of github.com:mosaicml/streaming into ja…
knighton Dec 30, 2023
74646a2
Keep around the lazily initialized FileLock.
knighton Dec 30, 2023
6b3783f
What if it's the filelock?
knighton Dec 30, 2023
94d4136
Handle case where a process dies while holding a soft file lock.
knighton Dec 30, 2023
2d481af
Merge branch 'james/nodist2' of github.com:mosaicml/streaming into ja…
knighton Dec 30, 2023
f38e99f
Fix.
knighton Dec 30, 2023
9743cd6
Rewrite the homebrew soft file lock.
knighton Dec 30, 2023
b327858
Docstrings.
knighton Dec 30, 2023
0e47545
Switch all filelock.FileLock to streaming.base.coord.file.SoftFileLock.
knighton Dec 30, 2023
0658ab9
Fix.
knighton Dec 30, 2023
1d90d42
Rewrite StreamingDataset/SimulationDataset args handling to be rigorous.
knighton Dec 30, 2023
da9693f
MMap-based cross-process Array, Barrier, Buffer, Number.
knighton Dec 30, 2023
3cd5fc7
First attempt at replacing all SD shmem -> mmap.
knighton Dec 30, 2023
8b99425
Complete rewrite of all the mmap stuff
knighton Jan 2, 2024
a302c46
Merge branch 'main' into james/nodist2
knighton Jan 2, 2024
f60d2b8
Tweak.
knighton Jan 2, 2024
d88e7a2
Merge branch 'james/nodist2' of github.com:mosaicml/streaming into ja…
knighton Jan 2, 2024
85d4810
Fix.
knighton Jan 2, 2024
7d69d66
Fix.
knighton Jan 2, 2024
43e5a72
Fix.
knighton Jan 2, 2024
4fa5114
Fix.
knighton Jan 2, 2024
372e5eb
Fix.
knighton Jan 2, 2024
fd37096
Fix.
knighton Jan 2, 2024
f85bb6b
Add lock usage to wait_for_existence, wait_for_removal.
knighton Jan 3, 2024
e660c96
Generalize waiting, etc.
knighton Jan 3, 2024
61a5d42
Fix.
knighton Jan 3, 2024
507744e
Stop doing pytest in ten parts, as the file handle issue is now fixed…
knighton Jan 3, 2024
0e6d717
Fix (docstring).
knighton Jan 3, 2024
986bef8
Split file format functionality out of base.py into file.py
knighton Jan 3, 2024
82993ec
SD init barrier: cruft flag file -> proper MemMapBarrier.
knighton Jan 5, 2024
34efcf7
Misc.
knighton Jan 20, 2024
44df36e
Merge branch 'main' of github.com:mosaicml/streaming
knighton Jan 20, 2024
dbdd58e
Merge branch 'main' into james/nodist2
knighton Jan 20, 2024
15808f3
Update the new files' copyright years.
knighton Jan 20, 2024
32c3e5e
Benchmark separately.
knighton Jan 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 1 addition & 10 deletions .github/workflows/pytest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,4 @@ jobs:
id: tests
run: |
set -ex
pytest --splits 10 --group 1 --cov-fail-under=10
pytest --splits 10 --group 2 --cov-fail-under=10
pytest --splits 10 --group 3 --cov-fail-under=10
pytest --splits 10 --group 4 --cov-fail-under=10
pytest --splits 10 --group 5 --cov-fail-under=10
pytest --splits 10 --group 6 --cov-fail-under=10
pytest --splits 10 --group 7 --cov-fail-under=10
pytest --splits 10 --group 8 --cov-fail-under=10
pytest --splits 10 --group 9 --cov-fail-under=10
pytest --splits 10 --group 10 --cov-fail-under=10
pytest --cov-fail-under 50
3 changes: 1 addition & 2 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,14 +365,13 @@ def _modules_to_rst() -> List[types.ModuleType]:
document_modules: List[types.Module] = [
streaming,
streaming.base.compression,
streaming.base.coord,
streaming.base.format,
streaming.base.hashing,
streaming.base.partition,
streaming.base.shared,
streaming.base.shuffle,
streaming.base.storage,
streaming.base.util,
streaming.base.world,
]
exclude_modules: List[types.Module] = [streaming.base, streaming._version]
for name in streaming.__dict__:
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
'azure-storage-blob>=12.0.0,<13',
'azure-storage-file-datalake>=12.11.0,<13',
'azure-identity>=1.13.0',
'psutil>=5.9.4',
]

extra_deps = {}
Expand Down
204 changes: 85 additions & 119 deletions simulation/core/sim_dataset.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion simulation/core/sim_world.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

"""Contains info about the nodes, ranks, and workers of the run for simulation purposes."""

from streaming.base.world import World
from streaming.base.coord.world import World


class SimulationWorld(World):
Expand Down
30 changes: 24 additions & 6 deletions simulation/core/yaml_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,11 +197,29 @@ def create_simulation_dataset(nodes: int, devices: int, workers: int, global_bat
sampling_granularity = train_dataset.get('sampling_granularity', 1)
batching_method = train_dataset.get('batching_method', 'random')

dataset = SimulationDataset(nodes, devices, workers, streams, remote, local, split,
download_retry, download_timeout, validate_hash, keep_zip,
epoch_size, predownload, cache_limit, partition_algo,
num_canonical_nodes, batch_size, shuffle, shuffle_algo,
shuffle_seed, shuffle_block_size, sampling_method,
sampling_granularity, batching_method)
dataset = SimulationDataset(nodes=nodes,
devices=devices,
workers=workers,
streams=streams,
remote=remote,
local=local,
split=split,
download_retry=download_retry,
download_timeout=download_timeout,
validate_hash=validate_hash,
keep_zip=keep_zip,
epoch_size=epoch_size,
predownload=predownload,
cache_limit=cache_limit,
partition_algo=partition_algo,
num_canonical_nodes=num_canonical_nodes,
batch_size=batch_size,
shuffle=shuffle,
shuffle_algo=shuffle_algo,
shuffle_seed=shuffle_seed,
shuffle_block_size=shuffle_block_size,
sampling_method=sampling_method,
sampling_granularity=sampling_granularity,
batching_method=batching_method)

return dataset
2 changes: 1 addition & 1 deletion streaming/base/batching/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from streaming.base.batching.per_stream import generate_work_per_stream_batching
from streaming.base.batching.random import generate_work_random_batching
from streaming.base.batching.stratified import generate_work_stratified_batching
from streaming.base.world import World
from streaming.base.coord.world import World

if TYPE_CHECKING:
from streaming.base.dataset import StreamingDataset
Expand Down
5 changes: 1 addition & 4 deletions streaming/base/batching/per_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
import numpy as np
from numpy.typing import NDArray

from streaming.base.coord.world import World
from streaming.base.partition import get_partitions
from streaming.base.shuffle import get_shuffle
from streaming.base.world import World

if TYPE_CHECKING:
from streaming.base.dataset import StreamingDataset
Expand Down Expand Up @@ -63,9 +63,6 @@ def generate_work_per_stream_batching(dataset: StreamingDataset, world: World, e
# same as the ratio of the stream's samples to overall samples.
# This ensures that the overall training shuffle block size is still approximately
# equal to what is set by the user, and allows for reasoning about cache_limit as well.
if not isinstance(dataset.shuffle_block_size, int):
raise TypeError(f'Dataset `shuffle_block_size` must be an integer. ' +
f'Got {type(dataset.shuffle_block_size)} instead.')
shuffle_block_portion = int(dataset.shuffle_block_size * stream.proportion)
stream_shuffle = get_shuffle(dataset.shuffle_algo, shuffle_units,
dataset.num_canonical_nodes, dataset.shuffle_seed, epoch,
Expand Down
5 changes: 1 addition & 4 deletions streaming/base/batching/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
import numpy as np
from numpy.typing import NDArray

from streaming.base.coord.world import World
from streaming.base.partition import get_partitions
from streaming.base.shuffle import get_shuffle
from streaming.base.world import World

if TYPE_CHECKING:
from streaming.base.dataset import StreamingDataset
Expand Down Expand Up @@ -58,9 +58,6 @@ def generate_work_random_batching(dataset: StreamingDataset, world: World, epoch

# If we need to shuffle, shuffle in a node-aware and *underlying* shard-aware way.
if dataset.shuffle:
if not isinstance(dataset.shuffle_block_size, int):
raise TypeError(f'Dataset `shuffle_block_size` must be an integer. ' +
f'Got {type(dataset.shuffle_block_size)} instead.')
shuffle = get_shuffle(dataset.shuffle_algo, shuffle_units, dataset.num_canonical_nodes,
dataset.shuffle_seed, epoch, dataset.shuffle_block_size)
big_ids = np.where(big_ids != -1, shuffle[big_ids], -1)
Expand Down
5 changes: 1 addition & 4 deletions streaming/base/batching/stratified.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
import numpy as np
from numpy.typing import NDArray

from streaming.base.coord.world import World
from streaming.base.partition import get_partitions
from streaming.base.shuffle import get_shuffle
from streaming.base.world import World

if TYPE_CHECKING:
from streaming.base.dataset import StreamingDataset
Expand Down Expand Up @@ -75,9 +75,6 @@ def generate_work_stratified_batching(dataset: StreamingDataset, world: World, e
# same as the ratio of the stream's samples to overall samples.
# This ensures that the overall training shuffle block size is still approximately
# equal to what is set by the user, and allows for reasoning about cache_limit as well.
if not isinstance(dataset.shuffle_block_size, int):
raise TypeError(f'Dataset `shuffle_block_size` must be an integer. ' +
f'Got {type(dataset.shuffle_block_size)} instead.')
shuffle_block_portion = int(dataset.shuffle_block_size * stream.proportion)
stream_shuffle = get_shuffle(dataset.shuffle_algo, shuffle_units,
dataset.num_canonical_nodes, dataset.shuffle_seed, epoch,
Expand Down
14 changes: 14 additions & 0 deletions streaming/base/coord/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright 2022-2024 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Coordination among ranks and workers."""

from streaming.base.coord.job import JobDirectory, JobRegistry
from streaming.base.coord.shmem import (SharedArray, SharedBarrier, SharedMemory, SharedScalar,
get_shm_prefix)
from streaming.base.coord.world import World

__all__ = [
'JobDirectory', 'JobRegistry', 'SharedArray', 'SharedBarrier', 'SharedMemory',
'get_shm_prefix', 'SharedScalar', 'World'
]
9 changes: 9 additions & 0 deletions streaming/base/coord/file/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright 2022-2024 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Coordinating processes using files."""

from streaming.base.coord.file.lock import SoftFileLock
from streaming.base.coord.file.waiting import create_file, wait_for_creation, wait_for_deletion

__all__ = ['create_file', 'wait_for_creation', 'wait_for_deletion', 'SoftFileLock']
184 changes: 184 additions & 0 deletions streaming/base/coord/file/lock.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
# Copyright 2022-2024 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Soft file locking via file open mode 'x'."""

import os
from types import TracebackType
from typing import Optional, Type, Union

from typing_extensions import Self

from streaming.base.coord.process import get_live_processes
from streaming.base.coord.waiting import wait

__all__ = ['SoftFileLock']


class SoftFileLock:
"""Soft file locking via file open mode 'x'.

Args:
filename (str): Path to lock.
timeout (float, optional): How long to wait in seconds before raising an exception.
Set to ``None`` to never time out. Defaults to ``30``.
tick (float): Check interval in seconds. Defaults to ``0.007``.
"""

def __init__(
self,
filename: str,
timeout: Optional[float] = 30,
tick: float = 0.007,
) -> None:
if not filename:
raise ValueError('Path to file lock is empty.')

if timeout is not None:
if timeout <= 0:
raise ValueError(
f'Timeout must be positive float seconds, but got: {timeout} sec.')

if tick <= 0:
raise ValueError(f'Tick must be positive float seconds, but got: {tick} sec.')

self.filename = filename
self.timeout = timeout
self.tick = tick

self._normalize(filename)

@classmethod
def _write(cls, filename: str, pid: int) -> None:
"""Write the locking process's pid.

Args:
filename (str): Path to lock.
"""
with open(filename, 'x') as file:
file.write(str(pid))

@classmethod
def _read(cls, filename: str) -> int:
"""Read the locking process's pid.

Args:
filename (str): Path to lock.
"""
with open(filename, 'r') as file:
return int(file.read())

@classmethod
def _normalize(cls, filename: str) -> None:
"""Ensure parent dirs exist and lock files held by dead processes do not exist.

Args:
filename (str): Path to lock.
"""
# Ensure the file's parent directory exists so we can write it in one shot.
dirname = os.path.dirname(filename)
if dirname:
os.makedirs(dirname, exist_ok=True)

# If no file, we don't need to do anything.
if not os.path.exists(filename):
return

# If we fail to open the file and parse the pid, bail out while deleting it.
try:
pid = cls._read(filename)
except:
os.remove(filename)
return

# If the pid is not among the living, delete the file.
if pid not in get_live_processes():
os.remove(filename)

@classmethod
def _get_timeout(
cls,
init_timeout: Optional[float],
timeout: Optional[Union[str, float]] = 'auto',
) -> Optional[float]:
"""Determine the timeout for a given acquire().

Args:
init_timeout (float, optional): Default timeout provided to init.
timeout (str | float, optional): Override timeout for just this method call.

Returns:
float, optional: Normalized timeout as positive float seconds or ``None`` to disable.
"""
if timeout is None:
# No timeout.
ret = timeout
elif isinstance(timeout, float):
# Override timeout.
if timeout <= 0:
raise ValueError(
f'Timeout must be positive float seconds, but got: {timeout} sec.')
ret = timeout
elif timeout == 'auto':
# Default timeout.
ret = init_timeout
else:
raise ValueError(f'Timeout must either be positive float seconds, ``None`` to ' +
f'disable timing out, or ``auto`` to use the default passed to ' +
f'init, but got: {timeout}.')
return ret

def acquire(
self,
timeout: Optional[Union[str, float]] = 'auto',
) -> None:
"""Acquire this lock.

Args:
timeout (str | float, optional): Override timeout for just this method call.
"""

def stop() -> bool:
try:
with open(self.filename, 'x') as out:
text = str(os.getpid())
out.write(text)
return True
except:
return False

norm_timeout = self._get_timeout(self.timeout, timeout)
wait(stop, norm_timeout, self.tick)

def release(self) -> None:
"""Release this lock."""
if os.path.isfile(self.filename):
os.remove(self.filename)
elif os.path.exists(self.filename):
raise ValueError(f'Path exists, but is not a file: {self.filename}.')
else:
raise ValueError(f'Path does not exist: {self.filename}.')

def __enter__(self) -> Self:
"""Enter context manager.

Returns:
Self: This lock.
"""
self.acquire()
return self

def __exit__(
self,
err_type: Optional[Type[BaseException]] = None,
err: Optional[BaseException] = None,
trace: Optional[TracebackType] = None,
) -> None:
"""Exit context manager.

Args:
err_type (Type[BaseException], optional): Exc type.
err (BaseException, optional): Exc.
trace (TracebackType, optional): Traceback.
"""
self.release()
Loading