Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 212 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-2.3.0-foss-2024a.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
# EasyBuild easyconfig for PyTorch 2.3.0 with the foss/2024a toolchain.
name = 'PyTorch'
version = '2.3.0'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2024a'}

# GITHUB_RELEASE is not defined in this file (presumably a constant provided by EasyBuild - confirm).
# The tarball name is built from the %(namelower)s and %(version)s templates and corresponds to the
# 'pytorch-v2.3.0.tar.gz' key used in the checksums list.
source_urls = [GITHUB_RELEASE]
sources = ['%(namelower)s-v%(version)s.tar.gz']
# Patch files for this build. Each file name has a matching entry, keyed by that name, in `checksums`.
# The version in a file name is presumably the PyTorch release the patch was first written for, which is
# why patches named after older releases appear here as well (assumption - confirm).
patches = [
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.12.1_add-hypothesis-suppression.patch',
'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
'PyTorch-1.12.1_skip-test_round_robin.patch',
'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch',
'PyTorch-1.13.1_fix-protobuf-dependency.patch',
'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch',
'PyTorch-1.13.1_skip-failing-singular-grad-test.patch',
'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
'PyTorch-2.0.1_avoid-test_quantization-failures.patch',
'PyTorch-2.0.1_fix-skip-decorators.patch',
'PyTorch-2.0.1_fix-vsx-loadu.patch',
'PyTorch-2.0.1_skip-failing-gradtest.patch',
'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch',
'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch',
'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch',
'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch',
'PyTorch-2.1.0_remove-test-requiring-online-access.patch',
'PyTorch-2.1.0_skip-diff-test-on-ppc.patch',
'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch',
'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch',
'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch',
'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch',
'PyTorch-2.3.0_avoid_caffe2_test_cpp_jit.patch',
'PyTorch-2.3.0_disable_DataType_dependent_test_if_tensorboard_is_not_available.patch',
'PyTorch-2.3.0_disable_test_linear_package_if_no_half_types_are_available.patch',
'PyTorch-2.3.0_disable_tests_which_need_network_download.patch',
'PyTorch-2.3.0_disable-gcc12-warning.patch',
'PyTorch-2.3.0_fix-cpuinfo-bug-with-smt.patch',
'PyTorch-2.3.0_increase-tolerance-test_jit-test_freeze_conv_relu_fusion.patch',
'PyTorch-2.3.0_fix_missing_masked_load_for_int_type.patch',
'PyTorch-2.3.0_remove-fsspec-test.patch',
# PyTest 8.2+ instantiates test classes with "runTest", which raised an AttributeError in the
# distributed test base classes.
'PyTorch-2.3.0_fix-compat-with-pytest-8.2.patch',
'PyTorch-2.3.0_fix-mkldnn-avx512-f32-bias.patch',
# PyTest 7 no longer accepts `pytest.warns(None)`; the affected tests use warnings-as-errors instead.
'PyTorch-2.3.0_fix-pytest-7-compat.patch',
# Segmentation fault on Python 3.12, e.g. in distributed/test_store.py::FileStoreTest::test_compare_set.
'PyTorch-2.3.0_fix-segfault-in-filestore-on-python-3.12.patch',
'PyTorch-2.3.0_fix-test_extension_backend-without-vectorization.patch',
# Fixes "TypeError: get_state_dict() missing 1 required positional argument: 'optimizers'".
'PyTorch-2.3.0_fix-test_fine_tuning.patch',
'PyTorch-2.3.0_no-cuda-stubs-rpath.patch',
'PyTorch-2.3.0_skip-test-logaddexp-complex-for-scipy-1.13.patch',
'PyTorch-2.3.0_skip_test_sdpa_nn_functional_scaled_dot_product_attention_cpu.patch',
'PyTorch-2.3.0_skip_test_var_mean_differentiable.patch',
'PyTorch-2.3.0_skip-test_init_from_local_shards.patch',
]
# One single-entry dict per file, mapping the file name to its 64-hex-digit digest (presumably SHA256).
# Entries are listed in the same order as `sources` followed by `patches`.
# Any change to a patch file's content requires updating its digest here.
checksums = [
{'pytorch-v2.3.0.tar.gz': '69579513b26261bbab32e13b7efc99ad287fcf3103087f2d4fdf1adacd25316f'},
{'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
{'PyTorch-1.12.1_add-hypothesis-suppression.patch':
'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
{'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch':
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'},
{'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
{'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
{'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch':
'5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'},
{'PyTorch-1.13.1_fix-protobuf-dependency.patch':
'8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'},
{'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch':
'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'},
{'PyTorch-1.13.1_skip-failing-singular-grad-test.patch':
'72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'},
{'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
'481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
{'PyTorch-2.0.1_avoid-test_quantization-failures.patch':
'02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'},
{'PyTorch-2.0.1_fix-skip-decorators.patch': '2039012cef45446065e1a2097839fe20bb29fe3c1dcc926c3695ebf29832e920'},
{'PyTorch-2.0.1_fix-vsx-loadu.patch': 'a0ffa61da2d47c6acd09aaf6d4791e527d8919a6f4f1aa7ed38454cdcadb1f72'},
{'PyTorch-2.0.1_skip-failing-gradtest.patch': '8030bdec6ba49b057ab232d19a7f1a5e542e47e2ec340653a246ec9ed59f8bc1'},
{'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch':
'7047862abc1abaff62954da59700f36d4f39fcf83167a638183b1b7f8fec78ae'},
{'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch':
'166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'},
{'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch':
'3793b4b878be1abe7791efcbd534774b87862cfe7dc4774ca8729b6cabb39e7e'},
{'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch':
'aef38adf1210d0c5455e91d7c7a9d9e5caad3ae568301e0ba9fc204309438e7b'},
{'PyTorch-2.1.0_remove-test-requiring-online-access.patch':
'35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'},
{'PyTorch-2.1.0_skip-diff-test-on-ppc.patch': '394157dbe565ffcbc1821cd63d05930957412156cc01e949ef3d3524176a1dda'},
{'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch':
'6298daf9ddaa8542850eee9ea005f28594ab65b1f87af43d8aeca1579a8c4354'},
{'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch':
'5229ca88a71db7667a90ddc0b809b2c817698bd6e9c5aaabd73d3173cf9b99fe'},
{'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch':
'7ace835af60c58d9e0754a34c19d4b9a0c3a531f19e5d0eba8e2e49206eaa7eb'},
{'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch':
'fb96eefabf394617bbb3fbd3a7a7c1aa5991b3836edc2e5d2a30e708bfe49ba1'},
{'PyTorch-2.3.0_avoid_caffe2_test_cpp_jit.patch':
'041adcd91d994b8c2ab57d227f081cd57e572c157117b37171e1eb8eb576f8fc'},
{'PyTorch-2.3.0_disable_DataType_dependent_test_if_tensorboard_is_not_available.patch':
'0dcbdfde6752c3ff54c5376f521b4a742167669feb7f0f1d4e1d4d55f72b664f'},
{'PyTorch-2.3.0_disable_test_linear_package_if_no_half_types_are_available.patch':
'23416f2d9d5226695ec3fbea0671e3650c655c19deefd3f0f8ddab5afa50f485'},
{'PyTorch-2.3.0_disable_tests_which_need_network_download.patch':
'b7fd1a5135dfd4098cdc054182f7bf84a23ac98462a00477712182b5442da855'},
{'PyTorch-2.3.0_disable-gcc12-warning.patch': 'a8a624e1a2a5f4c82610173e50bd0f853e49bd5621b432f5aac689f9f6eb1514'},
{'PyTorch-2.3.0_fix-cpuinfo-bug-with-smt.patch':
'29fb95d1dba070133b513de050febd328ed36905a73f1ca135dc633f16beafa4'},
{'PyTorch-2.3.0_increase-tolerance-test_jit-test_freeze_conv_relu_fusion.patch':
'6f8eba5b546129ea975cda1a8a7098ca3245ad2b040a31a98807ee6d69cad0d4'},
{'PyTorch-2.3.0_fix_missing_masked_load_for_int_type.patch':
'aa6ff764f3f7bf84372a8a257fe1b4ae6dc4b9744ad35f0f9015f2696c62a41e'},
{'PyTorch-2.3.0_remove-fsspec-test.patch': '09be192401013cd8cd66add9d6565ac3e879e004d77e61145f826b768267ff61'},
{'PyTorch-2.3.0_fix-compat-with-pytest-8.2.patch':
'f249169f12b8603285b321cf6ae6e9062c09ab01fad2aa50c6c4601d5719126a'},
{'PyTorch-2.3.0_fix-mkldnn-avx512-f32-bias.patch':
'ee07d21c3ac7aeb0bd0e39507b18a417b9125284a529102929c4b5c6727c2976'},
{'PyTorch-2.3.0_fix-pytest-7-compat.patch': 'a34c5e718f4b06d915504fc1ab49eaf168674ec6e498e6e040a882c2b31632ee'},
{'PyTorch-2.3.0_fix-segfault-in-filestore-on-python-3.12.patch':
'e9f213f52a255e082cda04cc98c328079f6731c587c0e6be44fbdeea857ef3a9'},
{'PyTorch-2.3.0_fix-test_extension_backend-without-vectorization.patch':
'36aa2d5ba175be17f4e996f4fb2d544fe477d4a0bd0644cd59a85063779afc8e'},
{'PyTorch-2.3.0_fix-test_fine_tuning.patch': 'daa24801f3b2b5f76b639a14fba9a6ad84fe99ebed53401e217d02f94cfe48bf'},
{'PyTorch-2.3.0_no-cuda-stubs-rpath.patch': '7ba26824b5def7379cff02ae821a080698e6affea0da45bc846e9ecb89939cb1'},
{'PyTorch-2.3.0_skip-test-logaddexp-complex-for-scipy-1.13.patch':
'8149418df7a1180f29a6a6127bfe8cd33bcbdb1bff6e85911e1da749365080ab'},
{'PyTorch-2.3.0_skip_test_sdpa_nn_functional_scaled_dot_product_attention_cpu.patch':
'7955f2655db3da18606574fdcbc5990be24098f49ad1db5e86ea756ea1cc506f'},
{'PyTorch-2.3.0_skip_test_var_mean_differentiable.patch':
'9703fd0f1fca8916f6d79d83e9a7efe8e3f717362a5fdaa8f5d9da90d0c75018'},
{'PyTorch-2.3.0_skip-test_init_from_local_shards.patch':
'90ed9c2870f57ee6dc032d00873a37e2217a2b92a13035ded1c25ad5306455f2'},
]

# Requirement on a package installed at OS level rather than built as a module.
# OS_PKG_IBVERBS_DEV is not defined in this file (presumably an EasyBuild-provided constant naming the
# InfiniBand verbs development package - confirm).
osdependencies = [OS_PKG_IBVERBS_DEV]

# Build-time dependencies as (name, version) pairs.
builddependencies = [
    ('CMake', '3.29.3'),
    ('hypothesis', '6.103.1'),
    # The remaining entries are required by the test suite
    ('pytest-flakefinder', '1.1.0'),
    ('pytest-rerunfailures', '15.0'),
    ('pytest-shard', '0.1.2'),
    ('tlparse', '0.3.37'),
    ('optree', '0.14.1'),
    ('unittest-xml-reporting', '3.1.0'),
]

# Runtime dependencies as (name, version) pairs; the order is kept as-is.
dependencies = [
    # Ninja is needed for JIT compilation of C++ extensions
    ('Ninja', '1.12.1'),
    ('Python', '3.12.3'),
    ('Python-bundle-PyPI', '2024.06'),
    ('protobuf', '28.0'),
    ('protobuf-python', '5.28.0'),
    ('pybind11', '2.12.0'),
    ('SciPy-bundle', '2024.05'),
    ('PyYAML', '6.0.2'),
    ('MPFR', '4.2.1'),
    ('GMP', '6.3.0'),
    ('numactl', '2.0.18'),
    ('FFmpeg', '7.0.2'),
    ('Pillow', '10.4.0'),
    ('expecttest', '0.2.1'),
    ('networkx', '3.4.2'),
    # sympy 1.13 has breaking changes that make e.g. test_dynamic_shapes fail, hence 1.12
    ('sympy', '1.12'),
    ('Z3', '4.13.0'),
]

# Perform the (long) compilation in the build step.
buildcmd = '%(python)s setup.py build'

# Tests that are not run. The '' key presumably means "all architectures"
# (assumption - confirm against the PyTorch easyblock).
excluded_tests = {
    '': [
        # Appears to run for too long, at least on NVIDIA Ampere.
        'distributed/test_distributed_spawn',
        # Broken with CUDA 11.6/11.7, see https://github.com/pytorch/pytorch/issues/75375
        'distributions/test_constraints',
        # xdoctest is not available
        'doctests',
        # Fails on broadwell,
        # see https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
        'test_native_mha',
        # Fails intermittently on various systems,
        # see https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
        'distributed/rpc/test_tensorpipe_agent',
        # Expected to fail in the upstream CI only (it merely checks for a "CI" environment
        # variable), so it does not fail here.
        'test_ci_sanity_check_fail',
        # Fails consistently and is disabled upstream,
        # see https://github.com/pytorch/pytorch/issues/100152
        # and https://github.com/pytorch/pytorch/pull/124712
        'test_cpp_extensions_open_device_registration',
        # Not yet working with Python 3.12+
        'distributed/_composable/test_replicate_with_compiler',
        'distributed/_tensor/experimental/test_tp_transform',
        'distributed/_tensor/test_dtensor_compile',
    ],
}

# Options handed to run_test.py, one flag per entry; %(excluded_tests)s is left as a template.
local_test_opts = ' '.join([
    '--continue-through-error',
    '--pipe-logs',
    '--verbose',
    '%(excluded_tests)s',
])
runtest = (
    'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py '
    + local_test_opts
)

# test_quantization in particular has a few corner cases triggered by random input values which
# cannot easily be avoided, see https://github.com/pytorch/pytorch/issues/107030
# The tests "usually" succeed, so a small number of failures is tolerated.
max_failed_tests = 6

tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'ai'
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
PyTest 8.2+ instantiates the test class with "runTest" causing an error:
> AttributeError: 'FSDPTest' object has no attribute 'runTest'. Did you mean: 'run_test'?

Fix using backport of https://github.com/pytorch/pytorch/issues/127517
author: Alexander Grund (TU Dresden)

diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index 48c8e4dabbb..b1910b5122c 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -549,8 +549,14 @@ class MultiProcessTestCase(TestCase):
# or run the underlying test function.
def __init__(self, method_name: str = "runTest") -> None:
super().__init__(method_name)
- fn = getattr(self, method_name)
- setattr(self, method_name, self.join_or_run(fn))
+ try:
+ fn = getattr(self, method_name)
+ setattr(self, method_name, self.join_or_run(fn))
+ except AttributeError as e:
+ if method_name != 'runTest':
+ # we allow instantiation with no explicit method name
+ # but not an *incorrect* or missing method name
+ raise ValueError(f"no such test method in {self.__class__}: {method_name}") from e

def setUp(self) -> None:
super().setUp()
@@ -964,8 +970,14 @@ class MultiThreadedTestCase(TestCase):

def __init__(self, method_name: str = "runTest") -> None:
super().__init__(method_name)
- test_fn = getattr(self, method_name, None)
- setattr(self, method_name, self.join_or_run(test_fn))
+ try:
+ fn = getattr(self, method_name)
+ setattr(self, method_name, self.join_or_run(fn))
+ except AttributeError as e:
+ if method_name != 'runTest':
+ # we allow instantiation with no explicit method name
+ # but not an *incorrect* or missing method name
+ raise ValueError(f"no such test method in {self.__class__}: {method_name}") from e

def perThreadSetUp(self):
# super().setUp() # TestCase.setUp() calls torch.manual_seed()
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
PyTest 7 removed the ability to pass `None` to `pytest.warns()`, and expects no arguments instead.
This now leads to this error:
> TypeError: exceptions must be derived from Warning, not <class 'NoneType'>

Update to use native warnings-as-errors handling.
See https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html#additional-use-cases-of-warnings-in-tests
Author: Alexander Grund (TU Dresden)
--- a/test/distributed/pipeline/sync/test_pipe.py
+++ b/test/distributed/pipeline/sync/test_pipe.py
@@ -9,6 +9,7 @@
from collections import OrderedDict
from copy import deepcopy
import time
+import warnings

import pytest
import random
@@ -87,23 +88,21 @@ def test_batch_size_indivisible(setup_rpc):
model = nn.Sequential(nn.Linear(1, 1))
model = Pipe(model, chunks=4)

- with pytest.warns(None) as record:
+ with warnings.catch_warnings():
+ # Indivisible batch size is legal.
+ warnings.simplefilter("error")
model(torch.rand(7, 1))

- # Indivisible batch size is legal.
- assert not record
-

def test_batch_size_small(setup_rpc):
model = nn.Sequential(nn.Linear(1, 1))
model = Pipe(model, chunks=4)

- with pytest.warns(None) as record:
+ with warnings.catch_warnings():
+ # Batch size smaller than chunks is legal.
+ warnings.simplefilter("error")
model(torch.rand(2, 1))

- # Batch size smaller than chunks is legal.
- assert not record
-

def test_checkpoint_mode(setup_rpc):
def count_grad_fn(grad_fn, name, visited=None):
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
Fix a segmentation fault on Python 3.12 happening e.g. in distributed/test_store.py::FileStoreTest::test_compare_set

Backport of https://github.com/pytorch/pytorch/pull/128212

Author: Alexander Grund (TU Dresden)

diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 8056008579c..1d165dcbdfb 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -997,16 +997,18 @@ Example::
const std::string& key,
const std::string& expected_value,
const std::string& desired_value) -> py::bytes {
- std::vector<uint8_t> expectedValue_(
- expected_value.begin(), expected_value.end());
- std::vector<uint8_t> desiredValue_(
- desired_value.begin(), desired_value.end());
- auto value =
- store.compareSet(key, expectedValue_, desiredValue_);
+ auto value = [&]() {
+ py::gil_scoped_release guard;
+ std::vector<uint8_t> expectedValue_(
+ expected_value.begin(), expected_value.end());
+ std::vector<uint8_t> desiredValue_(
+ desired_value.begin(), desired_value.end());
+ return store.compareSet(
+ key, expectedValue_, desiredValue_);
+ }();
return py::bytes(
reinterpret_cast<char*>(value.data()), value.size());
},
- py::call_guard<py::gil_scoped_release>(),
R"(
Inserts the key-value pair into the store based on the supplied ``key`` and
performs comparison between ``expected_value`` and ``desired_value`` before inserting. ``desired_value``
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
Fixes
> TypeError: get_state_dict() missing 1 required positional argument: 'optimizers'

From 61d30b6e8acbd3cfb087761defa74f19f9be96bb Mon Sep 17 00:00:00 2001
From: cdzhan <[email protected]>
Date: Mon, 24 Jun 2024 20:02:08 +0800
Subject: [PATCH] [easy][DCP] Fix test_fine_tuning.py for get/set_state_dict
API changes

---
test/distributed/checkpoint/e2e/test_fine_tuning.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/distributed/checkpoint/e2e/test_fine_tuning.py b/test/distributed/checkpoint/e2e/test_fine_tuning.py
index a93f242187709c..fd21524882c839 100644
--- a/test/distributed/checkpoint/e2e/test_fine_tuning.py
+++ b/test/distributed/checkpoint/e2e/test_fine_tuning.py
@@ -9,7 +9,9 @@
import torch.nn as nn
from torch.distributed._tensor import init_device_mesh
from torch.distributed.checkpoint.state_dict import (
+ get_model_state_dict,
get_state_dict,
+ set_model_state_dict,
set_state_dict,
StateDictOptions,
)
@@ -120,7 +122,7 @@ def finetune(self, pretrain_dir: str, finetune_dir: str) -> None:
# Simulate that the fine tuning restart after 3 iterations
for i in range(2):
# Load pretrain submodules checkpoint
- pretrain_state_dict, _ = get_state_dict(
+ pretrain_state_dict = get_model_state_dict(
model,
submodules={model.pretrain},
options=StateDictOptions(keep_submodule_prefixes=False),
@@ -129,7 +131,7 @@ def finetune(self, pretrain_dir: str, finetune_dir: str) -> None:
{"model": pretrain_state_dict},
storage_reader=dist_cp.FileSystemReader(pretrain_dir),
)
- set_state_dict(
+ set_model_state_dict(
model,
model_state_dict={model.pretrain: pretrain_state_dict},
options=StateDictOptions(strict=False),
Loading
Loading