| 
 | 1 | +name: Run HVD-specific unit tests on GPUs  | 
 | 2 | +on:  | 
 | 3 | +  push:  | 
 | 4 | +    paths:  | 
 | 5 | +      - "ignite/**"  | 
 | 6 | +      - "tests/ignite/**"  | 
 | 7 | +      - "tests/run_gpu_tests.sh"  | 
 | 8 | +      - "tests/run_code_style.sh"  | 
 | 9 | +      - "examples/**.py"  | 
 | 10 | +      - "requirements-dev.txt"  | 
 | 11 | +      - ".github/workflows/gpu-hvd-tests.yml"  | 
 | 12 | +  workflow_dispatch:  | 
 | 13 | + | 
 | 14 | +concurrency:  | 
 | 15 | +  # <workflow_name>-<branch_name>-<true || commit_sha (if branch is protected)>  | 
 | 16 | +  group: gpu-hvd-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}  | 
 | 17 | +  cancel-in-progress: true  | 
 | 18 | + | 
 | 19 | +# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml  | 
 | 20 | + | 
 | 21 | +jobs:  | 
 | 22 | +  gpu-hvd-tests:  | 
 | 23 | +    strategy:  | 
 | 24 | +      matrix:  | 
 | 25 | +        pytorch-channel: [pytorch, ]  | 
 | 26 | +      fail-fast: false  | 
 | 27 | +    env:  | 
 | 28 | +      DOCKER_IMAGE: "pytorch/conda-builder:cuda11.8"  | 
 | 29 | +      REPOSITORY: ${{ github.repository }}  | 
 | 30 | +      PR_NUMBER: ${{ github.event.pull_request.number }}  | 
 | 31 | +    runs-on: linux.8xlarge.nvidia.gpu  | 
 | 32 | +    timeout-minutes: 60  | 
 | 33 | + | 
 | 34 | +    steps:  | 
 | 35 | +      - name: Clean workspace  | 
 | 36 | +        run: |  | 
 | 37 | +          echo "::group::Cleanup debug output"  | 
 | 38 | +          sudo rm -rfv "${GITHUB_WORKSPACE}"  | 
 | 39 | +          mkdir -p "${GITHUB_WORKSPACE}"  | 
 | 40 | +          echo "::endgroup::"  | 
 | 41 | +
  | 
 | 42 | +      - name: Checkout repository (pytorch/test-infra)  | 
 | 43 | +        uses: actions/checkout@v3  | 
 | 44 | +        with:  | 
 | 45 | +          # Check out pytorch/test-infra to get the shared setup-linux, pull-docker-image and teardown-linux actions used below  | 
 | 46 | +          repository: pytorch/test-infra  | 
 | 47 | +          path: test-infra  | 
 | 48 | + | 
 | 49 | +      - name: Setup Linux  | 
 | 50 | +        uses: ./test-infra/.github/actions/setup-linux  | 
 | 51 | + | 
 | 52 | +      - name: Pull docker image  | 
 | 53 | +        uses: ./test-infra/.github/actions/pull-docker-image  | 
 | 54 | +        with:  | 
 | 55 | +          docker-image: ${{ env.DOCKER_IMAGE }}  | 
 | 56 | + | 
 | 57 | +      - name: Checkout repository (${{ github.repository }})  | 
 | 58 | +        uses: actions/checkout@v3  | 
 | 59 | +        with:  | 
 | 60 | +          # Support the use case where we need to checkout someone's fork  | 
 | 61 | +          repository: ${{ github.repository }}  | 
 | 62 | +          ref: ${{ github.ref }}  | 
 | 63 | +          path: ${{ github.repository }}  | 
 | 64 | +          fetch-depth: 1  | 
 | 65 | + | 
 | 66 | +      - name: Start Pytorch container  | 
 | 67 | +        working-directory: ${{ github.repository }}  | 
 | 68 | +        run: |  | 
 | 69 | +          docker run --name pthd --gpus=all --rm \  | 
 | 70 | +            --cap-add=SYS_PTRACE \  | 
 | 71 | +            --detach \  | 
 | 72 | +            --ipc=host \  | 
 | 73 | +            --security-opt seccomp=unconfined \  | 
 | 74 | +            --shm-size=2g \  | 
 | 75 | +            --tty \  | 
 | 76 | +            --ulimit stack=10485760:83886080 \  | 
 | 77 | +            -v $PWD:/work \  | 
 | 78 | +            -w /work \  | 
 | 79 | +            ${DOCKER_IMAGE}  | 
 | 80 | +
  | 
 | 81 | +          script=$(cat << EOF  | 
 | 82 | +
  | 
 | 83 | +            set -xe  | 
 | 84 | +
  | 
 | 85 | +            nvidia-smi  | 
 | 86 | +            ls -alh  | 
 | 87 | +
  | 
 | 88 | +            conda --version  | 
 | 89 | +            python --version  | 
 | 90 | +
  | 
 | 91 | +          EOF  | 
 | 92 | +          )  | 
 | 93 | +          docker exec -t pthd /bin/bash -c "${script}"  | 
 | 94 | +
  | 
 | 95 | +      - name: Install PyTorch and dependencies  | 
 | 96 | +        continue-on-error: false  | 
 | 97 | +        run: |  | 
 | 98 | +
  | 
 | 99 | +          script=$(cat << EOF  | 
 | 100 | +
  | 
 | 101 | +          set -xe  | 
 | 102 | +
  | 
 | 103 | +          # Install PyTorch  | 
 | 104 | +          if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then  | 
 | 105 | +            pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu118  | 
 | 106 | +          else  | 
 | 107 | +            pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118  | 
 | 108 | +          fi  | 
 | 109 | +
  | 
 | 110 | +          python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"  | 
 | 111 | +          pip list  | 
 | 112 | +
  | 
 | 113 | +          # Install dependencies  | 
 | 114 | +          pip install -r requirements-dev.txt  | 
 | 115 | +          pip install -e .  | 
 | 116 | +
  | 
 | 117 | +          EOF  | 
 | 118 | +          )  | 
 | 119 | +
  | 
 | 120 | +          docker exec -t pthd /bin/bash -c "${script}"  | 
 | 121 | +
  | 
 | 122 | +      - name: Install Horovod with NCCL GPU ops  | 
 | 123 | +        run: |  | 
 | 124 | +          script=$(cat << EOF  | 
 | 125 | +
  | 
 | 126 | +          set -xe  | 
 | 127 | +
  | 
 | 128 | +          HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]  | 
 | 129 | +          horovodrun --check-build  | 
 | 130 | +          pip list  | 
 | 131 | +
  | 
 | 132 | +          EOF  | 
 | 133 | +          )  | 
 | 134 | +
  | 
 | 135 | +          docker exec -t pthd /bin/bash -c "${script}"  | 
 | 136 | +
  | 
 | 137 | +      - name: Run GPU and CPU Unit HVD Tests  | 
 | 138 | +        run: |  | 
 | 139 | +
  | 
 | 140 | +          script=$(cat << EOF  | 
 | 141 | +
  | 
 | 142 | +          set -xe  | 
 | 143 | +
  | 
 | 144 | +          bash tests/run_gpu_tests.sh 2 hvd  | 
 | 145 | +          CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd  | 
 | 146 | +
  | 
 | 147 | +          EOF  | 
 | 148 | +          )  | 
 | 149 | +
  | 
 | 150 | +          docker exec -t pthd /bin/bash -c "${script}"  | 
 | 151 | +
  | 
 | 152 | +      - name: Upload coverage to Codecov  | 
 | 153 | +        uses: codecov/codecov-action@v3  | 
 | 154 | +        with:  | 
 | 155 | +          file: ${{ github.repository }}/coverage.xml  | 
 | 156 | +          flags: gpu-2  | 
 | 157 | +          fail_ci_if_error: false  | 
 | 158 | + | 
 | 159 | +      - name: Run examples in container  | 
 | 160 | +        continue-on-error: false  | 
 | 161 | +        run: |  | 
 | 162 | +          script=$(cat << EOF  | 
 | 163 | +
  | 
 | 164 | +          set -xe  | 
 | 165 | +
  | 
 | 166 | +          # Install additional example dependencies  | 
 | 167 | +          pip install fire  | 
 | 168 | +
  | 
 | 169 | +          # Check training on CIFAR10, run with horovod backend using horovodrun  | 
 | 170 | +          # initial run: stop at iteration 500, checkpoint every 200 iterations  | 
 | 171 | +          CI=1 horovodrun -np 2 python -u examples/cifar10/main.py run --backend=horovod --checkpoint_every=200 --stop_iteration=500  | 
 | 172 | +          # resume from the iteration 400 checkpoint  | 
 | 173 | +          CI=1 horovodrun -np 2 python examples/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt  | 
 | 174 | +
  | 
 | 175 | +          # Check training on CIFAR10 using spawn  | 
 | 176 | +          # initial run: stop at iteration 500, checkpoint every 200 iterations  | 
 | 177 | +          CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500  | 
 | 178 | +          # resume from the iteration 400 checkpoint  | 
 | 179 | +          CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt  | 
 | 180 | +
  | 
 | 181 | +          EOF  | 
 | 182 | +          )  | 
 | 183 | +
  | 
 | 184 | +          docker exec -t pthd /bin/bash -c "${script}"  | 
 | 185 | +
  | 
 | 186 | +      - name: Teardown Linux  | 
 | 187 | +        if: ${{ always() }}  | 
 | 188 | +        uses: ./test-infra/.github/actions/teardown-linux  | 
0 commit comments