Merged

21 commits
d770adf
adding easyconfigs: PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb
casparvl Mar 17, 2022
ebd2b1f
Fixed style. Removed unneeded comment.
casparvl Mar 18, 2022
94ca2ec
Added missing patch
casparvl Mar 18, 2022
e8b2fd7
Added more missing patches
casparvl Mar 18, 2022
64b99c9
Updated checksums
casparvl Mar 18, 2022
1d49f5c
Updated checksums, a second time...
casparvl Mar 18, 2022
ccb8d3e
Updated checksums, now setting the PyTorch-1.11.0 checksum to 'None'.…
casparvl Mar 18, 2022
d53f303
Added patch for failing test_sharded_optim test
casparvl Mar 22, 2022
93112d6
Added patch for increasing test tolerances to make them pass with GPU…
casparvl Apr 7, 2022
eb9c457
Fixed too long line
casparvl Apr 7, 2022
b4e0d4e
Updated patch for TF32 test tolerances to also cover the test_jit_fus…
casparvl Apr 12, 2022
7bbff35
Excluded test model dump since it succeeds interactively
casparvl Apr 14, 2022
0e657b2
Increase timeout in c10d_gloo tests
casparvl Apr 19, 2022
1a40278
Fixed too long line
casparvl Apr 19, 2022
0638e00
Patch was invalid, since I mixed up the original and patched dirs whe…
casparvl Apr 20, 2022
9df8bbc
Disabled tests that are known to fail, see https://github.com/pytorch…
casparvl Apr 29, 2022
166c066
Fixed patch as it wasn't being applied properly
casparvl Apr 29, 2022
06119fe
Forget the at in the decorator...
casparvl Apr 29, 2022
8ae2674
Added descriptions to patches
casparvl May 2, 2022
a3ad8e8
Skipping two more tests, as suggested by @Micket
casparvl Jun 20, 2022
2d31528
Allow for 10 tests to fail using the new EasyBlock https://github.com…
Jul 7, 2022
PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb
@@ -0,0 +1,122 @@
name = 'PyTorch'
version = '1.11.0'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2021a'}

sources = [{
'filename': '%(name)s-%(version)s.tar.gz',
'git_config': {
'url': 'https://github.com/pytorch',
'repo_name': 'pytorch',
'tag': 'v%(version)s',
'recursive': True,
},
}]
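# note: the tarball above is created locally from the recursive clone (needed to include
# PyTorch's git submodules), so its contents, and hence its checksum, are not reproducible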
patches = [
'PyTorch-1.7.0_avoid-nan-in-test-torch.patch',
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch',
'PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch',
'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch',
'PyTorch-1.10.0_skip_cmake_rpath.patch',
'PyTorch-1.11.0_increase-distributed-test-timeout.patch',
'PyTorch-1.11.0_skip_failing_ops_tests.patch',
'PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch',
'PyTorch-1.11.0_fix_sharded_imports.patch',
'PyTorch-1.11.0_increase_test_tolerances_TF32.patch',
'PyTorch-1.11.0_increase_c10d_gloo_timeout.patch',
'PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch',
]
checksums = [
None, # can't add a proper SHA256 checksum, because the source tarball is created locally after the recursive 'git clone'
'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
'622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch
'89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6', # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch
# PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch
'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea',
# PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
'313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', # PyTorch-1.10.0_skip_cmake_rpath.patch
# PyTorch-1.11.0_increase-distributed-test-timeout.patch
'087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f',
'8eaca92d64fcadb0552d28e9c7ea5c4bc669d2fe33004e45a3519ce8d0d136a2', # PyTorch-1.11.0_skip_failing_ops_tests.patch
'21fc678febcdfbb9dabd72235be23cd392044e9a954f6580d15b530e1f69dcc1', # PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch
'9a04f4285b800dad8a00c3014af0a9713d40d5dd35d10931c7c0da4e89c558e9', # PyTorch-1.11.0_fix_sharded_imports.patch
# PyTorch-1.11.0_increase_test_tolerances_TF32.patch
'26e179a4f6f57e49209092612ae5f5cd8c03fd2ca84566ba0244eabefc3736ba',
# PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
'20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953',
# PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch
'e7bfe120a8b3fe2b40dac6839852a5fbab3cb3429fbe44a0fc3a1800adaaee51',
]

osdependencies = [OS_PKG_IBVERBS_DEV]

builddependencies = [
('CMake', '3.20.1'),
('hypothesis', '6.13.1'),
]

dependencies = [
('CUDA', '11.3.1', '', True),
('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions
('Python', '3.9.5'),
('protobuf', '3.17.3'),
('protobuf-python', '3.17.3'),
('pybind11', '2.6.2'),
('SciPy-bundle', '2021.05'),
('typing-extensions', '3.10.0.0'),
('PyYAML', '5.4.1'),
('MPFR', '4.1.0'),
('GMP', '6.2.1'),
('numactl', '2.0.14'),
('FFmpeg', '4.3.2'),
('Pillow', '8.2.0'),
('cuDNN', '8.2.1.32', '-CUDA-%(cudaver)s', True),
('magma', '2.6.1', '-CUDA-%(cudaver)s'),
('NCCL', '2.10.3', '-CUDA-%(cudaver)s'),
('expecttest', '0.1.3'),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['3.5', '3.7', '5.2', '6.0', '6.1', '7.0', '7.2', '7.5', '8.0', '8.6']

custom_opts = ["USE_CUPTI_SO=1"]

excluded_tests = {
'': [
# Bad tests: https://github.com/pytorch/pytorch/issues/60260
'distributed/elastic/utils/distributed_test',
'distributed/elastic/multiprocessing/api_test',
# These tests fail on A10s at the very least; they hang regardless of how long the timeout is set.
# Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
# 'distributed/test_distributed_fork',
'distributed/test_distributed_spawn',
# Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
'test_optim',
# Tests from this suite time out often. The process group backend is deprecated anyway
# 'distributed/rpc/test_process_group_agent',
# This test fails consistently when run as part of the test suite, but succeeds when run interactively
'test_model_dump',
# These tests appear flaky, possibly related to the number of GPUs used
'distributed/fsdp/test_fsdp_memory',
'distributed/fsdp/test_fsdp_overlap',
]
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'
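# note: %(python)s and %(excluded_tests)s in 'runtest' are template values that the PyTorch
# easyblock resolves at test time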

# The readelf sanity check command can be taken out once the TestRPATH test from
# https://github.com/pytorch/pytorch/pull/68912 is merged, since it will then be checked as part of the PyTorch test suite
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
sanity_check_commands = [
"readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]
tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'devel'
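
The style check script referenced in 'tests' is not included in this diff. As an illustrative sketch only, assuming it exercises JIT compilation of a C++ extension (the reason Ninja is listed as a dependency above), such a check could look like the following; the real PyTorch-check-cpp-extension.py may differ:

import torch
# JIT-compile C++ sources at runtime; this requires Ninja and a working compiler toolchain
from torch.utils.cpp_extension import load_inline

cpp_source = "torch::Tensor add_one(torch::Tensor x) { return x + 1; }"

# build a tiny extension on the fly and auto-generate a Python binding for 'add_one'
module = load_inline(name="check_cpp_extension", cpp_sources=cpp_source, functions=["add_one"])

assert torch.equal(module.add_one(torch.zeros(3)), torch.ones(3))
print("C++ extension check passed")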
PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch
@@ -0,0 +1,50 @@
# Author: Caspar van Leeuwen
# Company: SURF
# We've seen that these tests fail for version 1.11.0, see https://github.com/pytorch/pytorch/issues/76107
# These failures probably point to underlying issues, but the PR that fixes them touches a ton of files,
# making it near-impossible to cherry-pick without causing other issues. Moreover, PyTorch devs have
# pointed out that nvfuser is not enabled by default in 1.11.0, so the chances of anyone hitting
# these issues are very small.
# We simply disable the tests and accept that this functionality is broken in PyTorch v1.11.0.
diff -Nru pytorch_orig/test/test_jit_cuda_fuser.py pytorch/test/test_jit_cuda_fuser.py
--- pytorch_orig/test/test_jit_cuda_fuser.py 2022-04-29 14:54:30.771378000 +0200
+++ pytorch/test/test_jit_cuda_fuser.py 2022-04-29 14:05:54.067297000 +0200
@@ -1313,6 +1313,12 @@
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
@unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
+ # Disable test, since it fails and nvfuser wasn't enabled by default in 1.11
+ # Thus, even if this points to an underlying issue, it should be extremely rare that
+ # anyone hits it.
+ # See https://github.com/pytorch/pytorch/issues/76107
+ # and https://github.com/easybuilders/easybuild-easyconfigs/pull/15137
+ @unittest.skip("Skipping test that is known to fail, see PT #76107")
def test_native_layer_norm_bfloat(self):
dims = 4
rnds = 3
@@ -2828,6 +2834,12 @@
@unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
+ # Disable test, since it fails and nvfuser wasn't enabled by default in 1.11
+ # Thus, even if this points to an underlying issue, it should be extremely rare that
+ # anyone hits it.
+ # See https://github.com/pytorch/pytorch/issues/76107
+ # and https://github.com/easybuilders/easybuild-easyconfigs/pull/15137
+ @unittest.skip("Skipping test that is known to fail, see PT #76107")
def test_batch_norm_half(self):
with torch.backends.cudnn.flags(enabled=True):
setups = [
@@ -2843,6 +2855,12 @@
@unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
+ # Disable test, since it fails and nvfuser wasn't enabled by default in 1.11
+ # Thus, even if this points to an underlying issue, it should be extremely rare that
+ # anyone hits it.
+ # See https://github.com/pytorch/pytorch/issues/76107
+ # and https://github.com/easybuilders/easybuild-easyconfigs/pull/15137
+ @unittest.skip("Skipping test that is known to fail, see PT #76107")
def test_batch_norm_impl_index_correctness(self):
with torch.backends.cudnn.flags(enabled=True):
batch = [2, 7, 16]
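
A side note on this patch: the commit "Forget the at in the decorator..." above points at an easy pitfall. Without the leading '@', unittest.skip(...) is a bare expression that builds and discards a decorator, so the test still runs. A minimal sketch:

import unittest

class Demo(unittest.TestCase):
    # unittest.skip("reason")  # without the '@' this would be a no-op; the test would still run
    @unittest.skip("Skipping test that is known to fail, see PT #76107")
    def test_known_failure(self):
        self.fail("never reached while the decorator is applied")

if __name__ == "__main__":
    unittest.main()  # reports the test as skipped ('s')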
PyTorch-1.11.0_fix_sharded_imports.patch
@@ -0,0 +1,44 @@
# Fixes a "NameError: name 'sharded_tensor' is not defined" error
# for the test_named_params_with_sharded_tensor test
# See https://github.com/pytorch/pytorch/pull/73309
From 012d490ed76d8af8538d310a508b0e09a91b7632 Mon Sep 17 00:00:00 2001
From: wanchaol <[email protected]>
Date: Wed, 23 Feb 2022 12:10:39 -0800
Subject: [PATCH] [shard] fix some imports in tests

This fix some imports in sharded optimizer tests

Differential Revision: [D34427252](https://our.internmc.facebook.com/intern/diff/D34427252/)

[ghstack-poisoned]
---
.../_shard/sharded_optim/test_sharded_optim.py | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/test/distributed/_shard/sharded_optim/test_sharded_optim.py b/test/distributed/_shard/sharded_optim/test_sharded_optim.py
index 085c928985eb..d3f1468aea3c 100644
--- a/test/distributed/_shard/sharded_optim/test_sharded_optim.py
+++ b/test/distributed/_shard/sharded_optim/test_sharded_optim.py
@@ -2,7 +2,10 @@

import torch
import torch.optim as optim
-import torch.distributed._shard.sharded_tensor
+from torch.distributed._shard import (
+ sharded_tensor,
+ shard_parameter
+)

from copy import deepcopy
from torch.distributed._shard.sharding_spec import (
@@ -77,8 +80,8 @@ def shard_parameter(self):
],
)

- sharded_tensor.shard_parameter(self.linear1, "weight", rowwise_sharding_spec)
- sharded_tensor.shard_parameter(self.linear2, "weight", colwise_sharding_spec)
+ shard_parameter(self.linear1, "weight", rowwise_sharding_spec)
+ shard_parameter(self.linear2, "weight", colwise_sharding_spec)

def forward(self, inp):
return self.linear2(self.gelu(self.linear1(inp)))
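
The NameError this patch fixes follows from how Python binds dotted imports: 'import torch.distributed._shard.sharded_tensor' binds only the top-level name 'torch', so the bare name 'sharded_tensor' used later in the test is undefined. The 'from ... import' form binds the names directly. A generic sketch of the difference:

import os.path          # binds only the top-level name 'os'
os.path.join("a", "b")  # OK: reached via the full dotted path
# path.join("a", "b")   # NameError: name 'path' is not defined

from os import path     # binds the bare name 'path' directly
path.join("a", "b")     # OK: this is the form the patch switches to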