Merged

21 commits
d770adf
adding easyconfigs: PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb
casparvl Mar 17, 2022
ebd2b1f
Fixed style. Removed unneeded comment.
casparvl Mar 18, 2022
94ca2ec
Added missing patch
casparvl Mar 18, 2022
e8b2fd7
Added more missing patches
casparvl Mar 18, 2022
64b99c9
Updated checksums
casparvl Mar 18, 2022
1d49f5c
Updated checksums, a second time...
casparvl Mar 18, 2022
ccb8d3e
Updated checksums, now setting the PyTorch-1.11.0 checksum to 'None'.…
casparvl Mar 18, 2022
d53f303
Added patch for failing test_sharded_optim test
casparvl Mar 22, 2022
93112d6
Added patch for increasing test tolerances to make them pass with GPU…
casparvl Apr 7, 2022
eb9c457
Fixed too long line
casparvl Apr 7, 2022
b4e0d4e
Updated patch for TF32 test tolerances to also cover the test_jit_fus…
casparvl Apr 12, 2022
7bbff35
Excluded test model dump since it succeeds interactively
casparvl Apr 14, 2022
0e657b2
Increase timeout in c10d_gloo tests
casparvl Apr 19, 2022
1a40278
Fixed too long line
casparvl Apr 19, 2022
0638e00
Patch was invalid, since I mixed up the original and patched dirs whe…
casparvl Apr 20, 2022
9df8bbc
Disabled tests that are known to fail, see https://github.com/pytorch…
casparvl Apr 29, 2022
166c066
Fixed patch as it wasn't being applied properly
casparvl Apr 29, 2022
06119fe
Forget the at in the decorator...
casparvl Apr 29, 2022
8ae2674
Added descriptions to patches
casparvl May 2, 2022
a3ad8e8
Skipping two more tests, as suggested by @Micket
casparvl Jun 20, 2022
2d31528
Allow for 10 tests to fail using the new EasyBlock https://github.com…
Jul 7, 2022
PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb
@@ -0,0 +1,122 @@
name = 'PyTorch'
version = '1.11.0'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2021a'}

sources = [{
'filename': '%(name)s-%(version)s.tar.gz',
'git_config': {
'url': 'https://github.com/pytorch',
'repo_name': 'pytorch',
'tag': 'v%(version)s',
'recursive': True,
},
}]
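# note: the tarball above is created locally from the recursive clone (needed to include
# PyTorch's git submodules), so its contents, and hence its checksum, are not reproducible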
patches = [
'PyTorch-1.7.0_avoid-nan-in-test-torch.patch',
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch',
'PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch',
'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch',
'PyTorch-1.10.0_skip_cmake_rpath.patch',
'PyTorch-1.11.0_increase-distributed-test-timeout.patch',
'PyTorch-1.11.0_skip_failing_ops_tests.patch',
'PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch',
'PyTorch-1.11.0_fix_sharded_imports.patch',
'PyTorch-1.11.0_increase_test_tolerances_TF32.patch',
'PyTorch-1.11.0_increase_c10d_gloo_timeout.patch',
'PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch',
]
checksums = [
None, # can't add a proper SHA256 checksum, because the source tarball is created locally after the recursive 'git clone'
'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
'622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch
'89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6', # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch
# PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch
'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea',
# PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
'313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', # PyTorch-1.10.0_skip_cmake_rpath.patch
# PyTorch-1.11.0_increase-distributed-test-timeout.patch
'087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f',
'8eaca92d64fcadb0552d28e9c7ea5c4bc669d2fe33004e45a3519ce8d0d136a2', # PyTorch-1.11.0_skip_failing_ops_tests.patch
'21fc678febcdfbb9dabd72235be23cd392044e9a954f6580d15b530e1f69dcc1', # PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch
'9a04f4285b800dad8a00c3014af0a9713d40d5dd35d10931c7c0da4e89c558e9', # PyTorch-1.11.0_fix_sharded_imports.patch
# PyTorch-1.11.0_increase_test_tolerances_TF32.patch
'26e179a4f6f57e49209092612ae5f5cd8c03fd2ca84566ba0244eabefc3736ba',
# PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
'20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953',
# PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch
'e7bfe120a8b3fe2b40dac6839852a5fbab3cb3429fbe44a0fc3a1800adaaee51',
]

osdependencies = [OS_PKG_IBVERBS_DEV]

builddependencies = [
('CMake', '3.20.1'),
('hypothesis', '6.13.1'),
]

dependencies = [
('CUDA', '11.3.1', '', True),
('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions
('Python', '3.9.5'),
('protobuf', '3.17.3'),
('protobuf-python', '3.17.3'),
('pybind11', '2.6.2'),
('SciPy-bundle', '2021.05'),
('typing-extensions', '3.10.0.0'),
('PyYAML', '5.4.1'),
('MPFR', '4.1.0'),
('GMP', '6.2.1'),
('numactl', '2.0.14'),
('FFmpeg', '4.3.2'),
('Pillow', '8.2.0'),
('cuDNN', '8.2.1.32', '-CUDA-%(cudaver)s', True),
('magma', '2.6.1', '-CUDA-%(cudaver)s'),
('NCCL', '2.10.3', '-CUDA-%(cudaver)s'),
('expecttest', '0.1.3'),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['3.5', '3.7', '5.2', '6.0', '6.1', '7.0', '7.2', '7.5', '8.0', '8.6']

custom_opts = ["USE_CUPTI_SO=1"]

excluded_tests = {
'': [
# Bad tests: https://github.com/pytorch/pytorch/issues/60260
'distributed/elastic/utils/distributed_test',
'distributed/elastic/multiprocessing/api_test',
# These tests fail on A10s at the very least; they hang regardless of how long the timeout is set.
# Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
# 'distributed/test_distributed_fork',
'distributed/test_distributed_spawn',
# Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
'test_optim',
# Tests from this suite time out often. The process group backend is deprecated anyway
# 'distributed/rpc/test_process_group_agent',
# This test fails consistently when run as part of the test suite, but succeeds when run interactively
'test_model_dump',
# These tests appear flaky, possibly related to the number of GPUs used
'distributed/fsdp/test_fsdp_memory',
'distributed/fsdp/test_fsdp_overlap',
]
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'
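# note: %(python)s and %(excluded_tests)s in 'runtest' are template values that the PyTorch
# easyblock resolves at test time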

# The readelf sanity check command can be taken out once the TestRPATH test from
# https://github.com/pytorch/pytorch/pull/68912 is merged, since it will then be checked as part of the PyTorch test suite
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
sanity_check_commands = [
"readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]
tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'devel'
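
The style check script referenced in 'tests' is not included in this diff. As an illustrative sketch only, assuming it exercises JIT compilation of a C++ extension (the reason Ninja is listed as a dependency above), such a check could look like the following; the real PyTorch-check-cpp-extension.py may differ:

import torch
# JIT-compile C++ sources at runtime; this requires Ninja and a working compiler toolchain
from torch.utils.cpp_extension import load_inline

cpp_source = "torch::Tensor add_one(torch::Tensor x) { return x + 1; }"

# build a tiny extension on the fly and auto-generate a Python binding for 'add_one'
module = load_inline(name="check_cpp_extension", cpp_sources=cpp_source, functions=["add_one"])

assert torch.equal(module.add_one(torch.zeros(3)), torch.ones(3))
print("C++ extension check passed")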
PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch
@@ -0,0 +1,50 @@
# Author: Caspar van Leeuwen
# Company: SURF
# We've seen that these tests fail for version 1.11.0, see https://github.com/pytorch/pytorch/issues/76107
# These failures probably point to underlying issues, but the PR that fixes them touches a ton of files,
# making it near-impossible to cherry-pick without causing other issues. Moreover, PyTorch devs have
# pointed out that nvfuser is not enabled by default in 1.11.0, so the chances of anyone hitting
# these issues are very small.
# We simply disable the tests and accept that this functionality is broken in PyTorch v1.11.0.
diff -Nru pytorch_orig/test/test_jit_cuda_fuser.py pytorch/test/test_jit_cuda_fuser.py
--- pytorch_orig/test/test_jit_cuda_fuser.py 2022-04-29 14:54:30.771378000 +0200
+++ pytorch/test/test_jit_cuda_fuser.py 2022-04-29 14:05:54.067297000 +0200
@@ -1313,6 +1313,12 @@
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
@unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
+ # Disable test, since it fails and nvfuser wasn't enabled by default in 1.11
+ # Thus, even if this points to an underlying issue, it should be extremely rare that
+ # anyone hits it.
+ # See https://github.com/pytorch/pytorch/issues/76107
+ # and https://github.com/easybuilders/easybuild-easyconfigs/pull/15137
+ @unittest.skip("Skipping test that is known to fail, see PT #76107")
def test_native_layer_norm_bfloat(self):
dims = 4
rnds = 3
@@ -2828,6 +2834,12 @@
@unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
+ # Disable test, since it fails and nvfuser wasn't enabled by default in 1.11
+ # Thus, even if this points to an underlying issue, it should be extremely rare that
+ # anyone hits it.
+ # See https://github.com/pytorch/pytorch/issues/76107
+ # and https://github.com/easybuilders/easybuild-easyconfigs/pull/15137
+ @unittest.skip("Skipping test that is known to fail, see PT #76107")
def test_batch_norm_half(self):
with torch.backends.cudnn.flags(enabled=True):
setups = [
@@ -2843,6 +2855,12 @@
@unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
+ # Disable test, since it fails and nvfuser wasn't enabled by default in 1.11
+ # Thus, even if this points to an underlying issue, it should be extremely rare that
+ # anyone hits it.
+ # See https://github.com/pytorch/pytorch/issues/76107
+ # and https://github.com/easybuilders/easybuild-easyconfigs/pull/15137
+ @unittest.skip("Skipping test that is known to fail, see PT #76107")
def test_batch_norm_impl_index_correctness(self):
with torch.backends.cudnn.flags(enabled=True):
batch = [2, 7, 16]
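
A side note on this patch: the commit "Forget the at in the decorator..." above points at an easy pitfall. Without the leading '@', unittest.skip(...) is a bare expression that builds and discards a decorator, so the test still runs. A minimal sketch:

import unittest

class Demo(unittest.TestCase):
    # unittest.skip("reason")  # without the '@' this would be a no-op; the test would still run
    @unittest.skip("Skipping test that is known to fail, see PT #76107")
    def test_known_failure(self):
        self.fail("never reached while the decorator is applied")

if __name__ == "__main__":
    unittest.main()  # reports the test as skipped ('s')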
PyTorch-1.11.0_fix_sharded_imports.patch
@@ -0,0 +1,44 @@
# Fixes a "NameError: name 'sharded_tensor' is not defined" error
# for the test_named_params_with_sharded_tensor test
# See https://github.com/pytorch/pytorch/pull/73309
From 012d490ed76d8af8538d310a508b0e09a91b7632 Mon Sep 17 00:00:00 2001
From: wanchaol <[email protected]>
Date: Wed, 23 Feb 2022 12:10:39 -0800
Subject: [PATCH] [shard] fix some imports in tests

This fix some imports in sharded optimizer tests

Differential Revision: [D34427252](https://our.internmc.facebook.com/intern/diff/D34427252/)

[ghstack-poisoned]
---
.../_shard/sharded_optim/test_sharded_optim.py | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/test/distributed/_shard/sharded_optim/test_sharded_optim.py b/test/distributed/_shard/sharded_optim/test_sharded_optim.py
index 085c928985eb..d3f1468aea3c 100644
--- a/test/distributed/_shard/sharded_optim/test_sharded_optim.py
+++ b/test/distributed/_shard/sharded_optim/test_sharded_optim.py
@@ -2,7 +2,10 @@

import torch
import torch.optim as optim
-import torch.distributed._shard.sharded_tensor
+from torch.distributed._shard import (
+ sharded_tensor,
+ shard_parameter
+)

from copy import deepcopy
from torch.distributed._shard.sharding_spec import (
@@ -77,8 +80,8 @@ def shard_parameter(self):
],
)

- sharded_tensor.shard_parameter(self.linear1, "weight", rowwise_sharding_spec)
- sharded_tensor.shard_parameter(self.linear2, "weight", colwise_sharding_spec)
+ shard_parameter(self.linear1, "weight", rowwise_sharding_spec)
+ shard_parameter(self.linear2, "weight", colwise_sharding_spec)

def forward(self, inp):
return self.linear2(self.gelu(self.linear1(inp)))
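
The NameError this patch fixes follows from how Python binds dotted imports: 'import torch.distributed._shard.sharded_tensor' binds only the top-level name 'torch', so the bare name 'sharded_tensor' used later in the test is undefined. The 'from ... import' form binds the names directly. A generic sketch of the difference:

import os.path          # binds only the top-level name 'os'
os.path.join("a", "b")  # OK: reached via the full dotted path
# path.join("a", "b")   # NameError: name 'path' is not defined

from os import path     # binds the bare name 'path' directly
path.join("a", "b")     # OK: this is the form the patch switches to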