meta-pytorch · zhengchenyu · Nov 18, 2025 · Nov 18, 2025
diff --git a/torchft/data.py b/torchft/data.py
@@ -14,10 +14,118 @@
 dataloader frequently to avoid duplicate batches.
 """
 
-from typing import Optional
+import math
+from collections.abc import Iterator
+from typing import Optional, TypeVar
 
+import torch
 import torch.distributed as dist
 from torch.utils import data
+from torch.utils.data.dataset import Dataset
+from torch.utils.data.sampler import Sampler
+
+_T_co = TypeVar("_T_co", covariant=True)
+
+
+class SkipDistributedSampler(Sampler[_T_co]):
+    """
+    Distributed sampler that skips a leading number of samples.
+
+    The index list of the whole dataset is built first (deterministically
+    shuffled from ``seed + epoch`` when ``shuffle`` is set), the first
+    ``skip_samples`` entries of that list are discarded, and the remainder is
+    sharded across replicas by taking every ``num_replicas``-th index starting
+    at ``rank``.
+
+    Args:
+        dataset: Dataset to sample from; only ``len(dataset)`` is used.
+        num_replicas: Number of replicas the indices are sharded across.
+            Defaults to ``dist.get_world_size()``.
+        rank: Rank of this replica within ``num_replicas``. Defaults to
+            ``dist.get_rank()``.
+        shuffle: If ``True``, shuffle the indices using ``seed + epoch``.
+        seed: Base seed used for shuffling.
+        drop_last: If ``True``, drop the tail of the remaining indices so they
+            split evenly across replicas. If ``False``, pad by repeating
+            indices from the start of the remaining list.
+        skip_samples: Number of leading indices (in the global order) to skip.
+            Must be in the interval ``[0, len(dataset)]``.
+
+    Raises:
+        RuntimeError: If ``num_replicas`` or ``rank`` is not given and the
+            distributed package is not available.
+        ValueError: If ``rank`` or ``skip_samples`` is out of range.
+    """
+
+    def __init__(
+        self,
+        dataset: Dataset,
+        num_replicas: Optional[int] = None,
+        rank: Optional[int] = None,
+        shuffle: bool = True,
+        seed: int = 0,
+        drop_last: bool = False,
+        skip_samples: int = 0,
+    ) -> None:
+        if num_replicas is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            num_replicas = dist.get_world_size()
+        if rank is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            rank = dist.get_rank()
+        if rank >= num_replicas or rank < 0:
+            raise ValueError(
+                f"Invalid rank {rank}, rank should be in the interval [0, {num_replicas - 1}]"
+            )
+        dataset_len = len(dataset)  # type: ignore[arg-type]
+        # A negative skip would slice from the tail and a skip past the end
+        # would produce a negative sample count, so reject both up front.
+        if skip_samples < 0 or skip_samples > dataset_len:
+            raise ValueError(
+                f"Invalid skip_samples {skip_samples}, skip_samples should be in the interval [0, {dataset_len}]"
+            )
+        self.dataset = dataset
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        self.drop_last = drop_last
+        self.skip_samples = skip_samples
+        # The per-replica sample count must be derived from what is left
+        # *after* skipping, not from the full dataset length; otherwise
+        # total_size can exceed the number of indices actually available.
+        remaining = dataset_len - self.skip_samples
+        if self.drop_last:
+            # Round down: every rank receives the same amount of data and the
+            # uneven tail is dropped.
+            self.num_samples = remaining // self.num_replicas
+        else:
+            # Round up: the shortfall is padded with repeated indices in __iter__.
+            self.num_samples = (
+                remaining + self.num_replicas - 1
+            ) // self.num_replicas
+        self.total_size = self.num_samples * self.num_replicas
+        self.shuffle = shuffle
+        self.seed = seed
+
+    def __iter__(self) -> Iterator[_T_co]:
+        """Return an iterator over the dataset indices assigned to this rank."""
+        if self.shuffle:
+            # deterministically shuffle based on epoch and seed
+            g = torch.Generator()
+            g.manual_seed(self.seed + self.epoch)
+            indices = torch.randperm(len(self.dataset), generator=g).tolist()  # type: ignore[arg-type]
+        else:
+            indices = list(range(len(self.dataset)))  # type: ignore[arg-type]
+
+        if not self.drop_last:
+            indices = indices[self.skip_samples :]
+            # add extra samples to make it evenly divisible; the remaining
+            # indices are repeated as often as needed when the padding is
+            # larger than the remaining list itself
+            padding_size = self.total_size - len(indices)
+            if padding_size > 0:
+                repeats = math.ceil(padding_size / len(indices))
+                indices += (indices * repeats)[:padding_size]
+        else:
+            # remove tail of data to make it evenly divisible.
+            indices = indices[self.skip_samples : self.skip_samples + self.total_size]
+        if len(indices) != self.total_size:
+            raise AssertionError(
+                f"Number of indices ({len(indices)}) does not match total_size ({self.total_size})"
+            )
+
+        # subsample
+        indices = indices[self.rank : self.total_size : self.num_replicas]
+        if len(indices) != self.num_samples:
+            raise AssertionError(
+                f"Number of subsampled indices ({len(indices)}) does not match num_samples ({self.num_samples})"
+            )
+
+        # pyrefly: ignore  # bad-return
+        return iter(indices)
+
+    def __len__(self) -> int:
+        """Return the number of indices this rank yields per iteration."""
+        return self.num_samples
+
+    def set_epoch(self, epoch: int) -> None:
+        r"""
+        Set the epoch for this sampler.
+
+        When :attr:`shuffle=True`, the epoch is added to the seed, so each
+        epoch gets a different random ordering. Otherwise, the next iteration
+        of this sampler will yield the same ordering.
+
+        Args:
+            epoch (int): Epoch number.
+        """
+        self.epoch = epoch
 
 
 # pyre-fixme[24]: expected generic parameter

diff --git a/torchft/data_test.py b/torchft/data_test.py
@@ -8,7 +8,7 @@
 
 from torch.utils.data import Dataset
 
-from torchft.data import DistributedSampler
+from torchft.data import DistributedSampler, SkipDistributedSampler
 
 
 class DummyDataset(Dataset):
@@ -37,3 +37,84 @@ def test_distributed_sampler(self) -> None:
 
         sampler_iter = iter(sampler)
         self.assertEqual(next(sampler_iter), 500)
+
+    def test_skip_distributed_sampler(self) -> None:
+        """Check the indices yielded by SkipDistributedSampler without shuffling.
+
+        Covers three cases: no samples skipped, some samples skipped, and a
+        padding size that is larger than the number of remaining indices.
+        """
+        dataset_length = 100
+        dataset = DummyDataset(dataset_length)
+
+        # Case 1: sample is not skipped
+        for drop_last in [True, False]:
+            num_replicas = 7
+            for rank in range(num_replicas):
+                sampler = SkipDistributedSampler(
+                    dataset=dataset,
+                    num_replicas=num_replicas,
+                    rank=rank,
+                    shuffle=False,
+                    drop_last=drop_last,
+                )
+                cur = rank
+                for idx in sampler:
+                    # Padded indices wrap around to the start of the dataset.
+                    self.assertEqual(
+                        idx, (cur % dataset_length), f"idx={idx}, cur={cur}"
+                    )
+                    cur += num_replicas
+                # If drop_last is True, read ceil((100-7)/7)*7=98 samples totally.
+                # If drop_last is False, read ceil(100/7)*7=105 samples totally.
+                if drop_last:
+                    self.assertEqual(cur, 98 + rank, f"rank={rank}, cur={cur}")
+                else:
+                    self.assertEqual(cur, 105 + rank, f"rank={rank}, cur={cur}")
+
+        # Case 2: sample is skipped
+        for drop_last in [True, False]:
+            num_replicas = 7
+            skip_samples = 10
+            for rank in range(num_replicas):
+                sampler = SkipDistributedSampler(
+                    dataset=dataset,
+                    num_replicas=num_replicas,
+                    rank=rank,
+                    shuffle=False,
+                    drop_last=drop_last,
+                    skip_samples=skip_samples,
+                )
+                cur = rank
+                for idx in sampler:
+                    # Padded indices wrap around to the first non-skipped
+                    # index (skip_samples), not to index 0.
+                    expected = (
+                        ((cur + skip_samples) % dataset_length + skip_samples)
+                        if (cur + skip_samples) >= dataset_length
+                        else (cur + skip_samples)
+                    )
+                    self.assertEqual(idx, expected, f"idx={idx}, expected={expected}")
+                    cur += num_replicas
+                # If drop_last is True, read ceil((100-10-7)/7)*7=84 samples totally.
+                # If drop_last is False, read ceil((100-10)/7)*7=91 samples totally.
+                if drop_last:
+                    self.assertEqual(cur, 84 + rank, f"rank={rank}, cur={cur}")
+                else:
+                    self.assertEqual(cur, 91 + rank, f"rank={rank}, cur={cur}")
+
+        # Case 3: drop last is False and padding size is larger than number of indices
+        # If skip_samples is 90, and num_replicas is 31, then the indices are [90, 91, ..., 99].
+        # It means only 10 samples are left, so padding size is 21 which is larger than 10.
+        num_replicas = 31
+        skip_samples = 90
+        expected = list(range(90, 100))
+        expected = (expected * 4)[:31]
+        for rank in range(num_replicas):
+            sampler = SkipDistributedSampler(
+                dataset=dataset,
+                num_replicas=num_replicas,
+                rank=rank,
+                shuffle=False,
+                drop_last=False,
+                skip_samples=skip_samples,
+            )
+            cnt = 0
+            for idx in sampler:
+                self.assertEqual(
+                    idx, expected[rank], f"idx={idx}, rank={rank}, expected={expected}"
+                )
+                cnt += 1
+            # Each rank must yield exactly one index; assertTrue(cnt, 1) would
+            # only check truthiness and treat 1 as the failure message.
+            self.assertEqual(cnt, 1)