Skip to content

Commit 57693cc

Browse files
committed
Update Mahti outputs
1 parent 30da599 commit 57693cc

File tree

8 files changed

+120
-105
lines changed

8 files changed

+120
-105
lines changed

openmp/01-axpy/solution/README.md

Lines changed: 3 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -71,117 +71,15 @@ Note for instructors: use `run*.sh` scripts to generate all the output files.
7171
export NVCOMPILER_ACC_NOTIFY=$((0x1 | 0x2))
7272
srun -p gputest --nodes=1 --ntasks-per-node=1 --cpus-per-task=4 --gres=gpu:a100:1 -t 0:10:00 ./axpy.x
7373

74-
Output from C version:
75-
76-
upload CUDA data file=.../axpy.c function=main line=27 device=0 threadid=1 variable=y[:] bytes=819200
77-
upload CUDA data file=.../axpy.c function=main line=27 device=0 threadid=1 variable=x[:] bytes=819200
78-
launch CUDA kernel file=.../axpy.c function=main line=27 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_main_F1L27_2 grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
79-
download CUDA data file=.../axpy.c function=main line=33 device=0 threadid=1 variable=x[:] bytes=819200
80-
download CUDA data file=.../axpy.c function=main line=33 device=0 threadid=1 variable=y[:] bytes=819200
81-
Using N = 102400
82-
Input:
83-
a = 3.0000
84-
x = 0.0000 0.0000 0.0000 0.0000 ... 1.0000 1.0000 1.0000 1.0000
85-
y = 0.0000 0.0010 0.0020 0.0029 ... 99.9971 99.9980 99.9990 100.0000
86-
Output:
87-
y = 0.0000 0.0010 0.0020 0.0030 ... 102.9970 102.9980 102.9990 103.0000
88-
89-
Output from Fortran version:
90-
91-
upload CUDA data file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=descriptor bytes=128
92-
upload CUDA data file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=y(:) bytes=819200
93-
upload CUDA data file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=descriptor bytes=128
94-
upload CUDA data file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=x(:) bytes=819200
95-
launch CUDA kernel file=.../axpy.F90 function=axpy line=31 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_MAIN__F1L31_2_ grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
96-
download CUDA data file=.../axpy.F90 function=axpy line=35 device=0 threadid=1 variable=x(:) bytes=819200
97-
download CUDA data file=.../axpy.F90 function=axpy line=35 device=0 threadid=1 variable=y(:) bytes=819200
98-
Using N = 102400
99-
Input:
100-
a = 3.0000
101-
x = 0.0000 0.0000 0.0000 0.0000 ... 1.0000 1.0000 1.0000 1.0000
102-
y = 0.0000 0.0010 0.0020 0.0029 ... 99.9971 99.9980 99.9990 100.0000
103-
Output:
104-
y = 0.0000 0.0010 0.0020 0.0030 ... 102.9970 102.9980 102.9990 103.0000
74+
Outputs are in `axpy_{c,f}_nv_debug.out` for C and Fortran, respectively.
10575

10676
We can read the memory transfers and kernel execution with 800 blocks and 128 threads per block from this output.
10777

108-
Now `NVCOMPILER_ACC_NOTIFY=$((0x1F))`.
109-
Output from C version:
110-
111-
Enter enter data construct file=.../axpy.c function=main line=27 device=0 threadid=1
112-
create CUDA data bytes=819200 file=.../axpy.c function=main line=27 device=0 threadid=1
113-
alloc CUDA data devaddr=0x7fff0b2fa000 bytes=819200 file=.../axpy.c function=main line=27 device=0 threadid=1
114-
upload CUDA data file=.../axpy.c function=main line=27 device=0 threadid=1 variable=y[:] bytes=819200
115-
create CUDA data bytes=819200 file=.../axpy.c function=main line=27 device=0 threadid=1
116-
alloc CUDA data devaddr=0x7fff0b800000 bytes=819200 file=.../axpy.c function=main line=27 device=0 threadid=1
117-
upload CUDA data file=.../axpy.c function=main line=27 device=0 threadid=1 variable=x[:] bytes=819200
118-
Leave enter data .../axpy.c main:27 device=0 threadid=1
119-
launch CUDA kernel file=.../axpy.c function=main line=27 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_main_F1L27_2 grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
120-
Enter exit data construct file=.../axpy.c function=main line=27 device=0 threadid=1
121-
download CUDA data file=.../axpy.c function=main line=33 device=0 threadid=1 variable=x[:] bytes=819200
122-
delete CUDA data devaddr=0x7fff0b800000 bytes=819200 file=.../axpy.c function=main line=33 device=0 threadid=1
123-
download CUDA data file=.../axpy.c function=main line=33 device=0 threadid=1 variable=y[:] bytes=819200
124-
delete CUDA data devaddr=0x7fff0b2fa000 bytes=819200 file=.../axpy.c function=main line=33 device=0 threadid=1
125-
Implicit wait file=.../axpy.c function=main line=33 device=0 threadid=1 queue=acc_async_sync
126-
Leave exit data .../axpy.c main:33 device=0 threadid=1
127-
Using N = 102400
128-
Input:
129-
a = 3.0000
130-
x = 0.0000 0.0000 0.0000 0.0000 ... 1.0000 1.0000 1.0000 1.0000
131-
y = 0.0000 0.0010 0.0020 0.0029 ... 99.9971 99.9980 99.9990 100.0000
132-
Output:
133-
y = 0.0000 0.0010 0.0020 0.0030 ... 102.9970 102.9980 102.9990 103.0000
134-
135-
Output from Fortran version:
136-
137-
Enter enter data construct file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
138-
create CUDA data bytes=819200 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
139-
alloc CUDA data devaddr=0x7fff0b2fa000 bytes=819200 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
140-
create CUDA data bytes=128 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
141-
alloc CUDA data devaddr=0x7fff0b3c2000 bytes=512 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
142-
upload CUDA data file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=descriptor bytes=128
143-
upload CUDA data file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=y(:) bytes=819200
144-
create CUDA data bytes=819200 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
145-
alloc CUDA data devaddr=0x7fff0b800000 bytes=819200 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
146-
create CUDA data bytes=128 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
147-
alloc CUDA data devaddr=0x7fff0b3c2200 bytes=512 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
148-
upload CUDA data file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=descriptor bytes=128
149-
upload CUDA data file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=x(:) bytes=819200
150-
Leave enter data .../axpy.F90 axpy:31 device=0 threadid=1
151-
launch CUDA kernel file=.../axpy.F90 function=axpy line=31 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_MAIN__F1L31_2_ grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
152-
Enter exit data construct file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
153-
download CUDA data file=.../axpy.F90 function=axpy line=35 device=0 threadid=1 variable=x(:) bytes=819200
154-
delete CUDA data devaddr=0x7fff0b800000 bytes=819200 file=.../axpy.F90 function=axpy line=35 device=0 threadid=1
155-
delete CUDA data devaddr=0x7fff0b3c2200 bytes=512 file=.../axpy.F90 function=axpy line=35 device=0 threadid=1
156-
download CUDA data file=.../axpy.F90 function=axpy line=35 device=0 threadid=1 variable=y(:) bytes=819200
157-
delete CUDA data devaddr=0x7fff0b2fa000 bytes=819200 file=.../axpy.F90 function=axpy line=35 device=0 threadid=1
158-
delete CUDA data devaddr=0x7fff0b3c2000 bytes=512 file=.../axpy.F90 function=axpy line=35 device=0 threadid=1
159-
Implicit wait file=.../axpy.F90 function=axpy line=35 device=0 threadid=1 queue=acc_async_sync
160-
Leave exit data .../axpy.F90 axpy:35 device=0 threadid=1
161-
Using N = 102400
162-
Input:
163-
a = 3.0000
164-
x = 0.0000 0.0000 0.0000 0.0000 ... 1.0000 1.0000 1.0000 1.0000
165-
y = 0.0000 0.0010 0.0020 0.0029 ... 99.9971 99.9980 99.9990 100.0000
166-
Output:
167-
y = 0.0000 0.0010 0.0020 0.0030 ... 102.9970 102.9980 102.9990 103.0000
78+
With `NVCOMPILER_ACC_NOTIFY=$((0x1F))`, the outputs are in `axpy_{c,f}_nv_debug_max.out`.
16879

16980
Compiling with diagnostics:
17081

17182
nvc -O3 -mp=gpu -gpu=cc80 -Minfo=mp axpy.c -o axpy.x
17283
nvfortran -O3 -mp=gpu -gpu=cc80 -Minfo=mp helper_functions.F90 axpy.F90 -o axpy.x
17384

174-
Output for C:
175-
176-
main:
177-
27, #omp target teams distribute parallel for
178-
27, Generating "nvkernel_main_F1L27_2" GPU kernel
179-
31, Loop parallelized across teams and threads(128), schedule(static)
180-
27, Generating implicit map(tofrom:y[:],x[:])
181-
182-
Output for Fortran:
183-
184-
axpy:
185-
31, !$omp target teams distribute parallel do
186-
31, Generating "nvkernel_MAIN__F1L31_2" GPU kernel
187-
31, Generating implicit map(tofrom:y(:),x(:))
85+
Outputs are in `axpy_{c,f}_nv_info.out`.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
upload CUDA data file=axpy.c function=main line=33 device=0 threadid=1 variable=y[:] bytes=819200
2+
upload CUDA data file=axpy.c function=main line=33 device=0 threadid=1 variable=x[:] bytes=819200
3+
launch CUDA kernel file=axpy.c function=main line=33 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_main_F1L33_2 grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
4+
download CUDA data file=axpy.c function=main line=39 device=0 threadid=1 variable=x[:] bytes=819200
5+
download CUDA data file=axpy.c function=main line=39 device=0 threadid=1 variable=y[:] bytes=819200
6+
Using N = 102400
7+
Input:
8+
a = 3.0000
9+
x = 0.0000 0.0000 0.0000 0.0000 ... 1.0000 1.0000 1.0000 1.0000
10+
y = 0.0000 0.0010 0.0020 0.0029 ... 99.9971 99.9980 99.9990 100.0000
11+
Output:
12+
y = 0.0000 0.0010 0.0020 0.0030 ... 102.9970 102.9980 102.9990 103.0000
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
Enter enter data construct file=axpy.c function=main line=33 device=0 threadid=1
2+
create CUDA data bytes=819200 file=axpy.c function=main line=33 device=0 threadid=1
3+
alloc CUDA data devaddr=0x7fff0b2fa000 bytes=819200 file=axpy.c function=main line=33 device=0 threadid=1
4+
upload CUDA data file=axpy.c function=main line=33 device=0 threadid=1 variable=y[:] bytes=819200
5+
create CUDA data bytes=819200 file=axpy.c function=main line=33 device=0 threadid=1
6+
alloc CUDA data devaddr=0x7fff0b800000 bytes=819200 file=axpy.c function=main line=33 device=0 threadid=1
7+
upload CUDA data file=axpy.c function=main line=33 device=0 threadid=1 variable=x[:] bytes=819200
8+
Leave enter data axpy.c main:33 device=0 threadid=1
9+
launch CUDA kernel file=axpy.c function=main line=33 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_main_F1L33_2 grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
10+
Enter exit data construct file=axpy.c function=main line=33 device=0 threadid=1
11+
download CUDA data file=axpy.c function=main line=39 device=0 threadid=1 variable=x[:] bytes=819200
12+
delete CUDA data devaddr=0x7fff0b800000 bytes=819200 file=axpy.c function=main line=39 device=0 threadid=1
13+
download CUDA data file=axpy.c function=main line=39 device=0 threadid=1 variable=y[:] bytes=819200
14+
delete CUDA data devaddr=0x7fff0b2fa000 bytes=819200 file=axpy.c function=main line=39 device=0 threadid=1
15+
Implicit wait file=axpy.c function=main line=39 device=0 threadid=1 queue=acc_async_sync
16+
Leave exit data axpy.c main:39 device=0 threadid=1
17+
Using N = 102400
18+
Input:
19+
a = 3.0000
20+
x = 0.0000 0.0000 0.0000 0.0000 ... 1.0000 1.0000 1.0000 1.0000
21+
y = 0.0000 0.0010 0.0020 0.0029 ... 99.9971 99.9980 99.9990 100.0000
22+
Output:
23+
y = 0.0000 0.0010 0.0020 0.0030 ... 102.9970 102.9980 102.9990 103.0000
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
main:
2+
33, #omp target teams distribute parallel for
3+
33, Generating "nvkernel_main_F1L33_2" GPU kernel
4+
37, Loop parallelized across teams and threads(128), schedule(static)
5+
33, Generating implicit map(tofrom:y[:],x[:])
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
upload CUDA data file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=descriptor bytes=128
2+
upload CUDA data file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=y(:) bytes=819200
3+
upload CUDA data file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=descriptor bytes=128
4+
upload CUDA data file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=x(:) bytes=819200
5+
launch CUDA kernel file=axpy.F90 function=axpy line=35 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_MAIN__F1L35_2_ grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
6+
download CUDA data file=axpy.F90 function=axpy line=39 device=0 threadid=1 variable=x(:) bytes=819200
7+
download CUDA data file=axpy.F90 function=axpy line=39 device=0 threadid=1 variable=y(:) bytes=819200
8+
Using N = 102400
9+
Input:
10+
a = 3.0000
11+
x = 0.0000 0.0000 0.0000 0.0000 ... 1.0000 1.0000 1.0000 1.0000
12+
y = 0.0000 0.0010 0.0020 0.0029 ... 99.9971 99.9980 99.9990 100.0000
13+
Output:
14+
y = 0.0000 0.0010 0.0020 0.0030 ... 102.9970 102.9980 102.9990 103.0000
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
Enter enter data construct file=axpy.F90 function=axpy line=35 device=0 threadid=1
2+
create CUDA data bytes=819200 file=axpy.F90 function=axpy line=35 device=0 threadid=1
3+
alloc CUDA data devaddr=0x7fff0b2fa000 bytes=819200 file=axpy.F90 function=axpy line=35 device=0 threadid=1
4+
create CUDA data bytes=128 file=axpy.F90 function=axpy line=35 device=0 threadid=1
5+
alloc CUDA data devaddr=0x7fff0b3c2000 bytes=512 file=axpy.F90 function=axpy line=35 device=0 threadid=1
6+
upload CUDA data file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=descriptor bytes=128
7+
upload CUDA data file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=y(:) bytes=819200
8+
create CUDA data bytes=819200 file=axpy.F90 function=axpy line=35 device=0 threadid=1
9+
alloc CUDA data devaddr=0x7fff0b800000 bytes=819200 file=axpy.F90 function=axpy line=35 device=0 threadid=1
10+
create CUDA data bytes=128 file=axpy.F90 function=axpy line=35 device=0 threadid=1
11+
alloc CUDA data devaddr=0x7fff0b3c2200 bytes=512 file=axpy.F90 function=axpy line=35 device=0 threadid=1
12+
upload CUDA data file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=descriptor bytes=128
13+
upload CUDA data file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=x(:) bytes=819200
14+
Leave enter data axpy.F90 axpy:35 device=0 threadid=1
15+
launch CUDA kernel file=axpy.F90 function=axpy line=35 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_MAIN__F1L35_2_ grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
16+
Enter exit data construct file=axpy.F90 function=axpy line=35 device=0 threadid=1
17+
download CUDA data file=axpy.F90 function=axpy line=39 device=0 threadid=1 variable=x(:) bytes=819200
18+
delete CUDA data devaddr=0x7fff0b800000 bytes=819200 file=axpy.F90 function=axpy line=39 device=0 threadid=1
19+
delete CUDA data devaddr=0x7fff0b3c2200 bytes=512 file=axpy.F90 function=axpy line=39 device=0 threadid=1
20+
download CUDA data file=axpy.F90 function=axpy line=39 device=0 threadid=1 variable=y(:) bytes=819200
21+
delete CUDA data devaddr=0x7fff0b2fa000 bytes=819200 file=axpy.F90 function=axpy line=39 device=0 threadid=1
22+
delete CUDA data devaddr=0x7fff0b3c2000 bytes=512 file=axpy.F90 function=axpy line=39 device=0 threadid=1
23+
Implicit wait file=axpy.F90 function=axpy line=39 device=0 threadid=1 queue=acc_async_sync
24+
Leave exit data axpy.F90 axpy:39 device=0 threadid=1
25+
Using N = 102400
26+
Input:
27+
a = 3.0000
28+
x = 0.0000 0.0000 0.0000 0.0000 ... 1.0000 1.0000 1.0000 1.0000
29+
y = 0.0000 0.0010 0.0020 0.0029 ... 99.9971 99.9980 99.9990 100.0000
30+
Output:
31+
y = 0.0000 0.0010 0.0020 0.0030 ... 102.9970 102.9980 102.9990 103.0000
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
helper_functions.F90:
2+
axpy.F90:
3+
axpy:
4+
35, !$omp target teams distribute parallel do
5+
35, Generating "nvkernel_MAIN__F1L35_2" GPU kernel
6+
35, Generating implicit map(tofrom:y(:),x(:))
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
#
# Regenerates the reference output files for the axpy OpenMP offload solution.
# For both the C and the Fortran version it stores:
#   axpy_{c,f}_nv_info.out       compiler diagnostics (-Minfo=mp)
#   axpy_{c,f}_nv_debug.out      run with NVCOMPILER_ACC_NOTIFY=$((0x1 | 0x2))
#   axpy_{c,f}_nv_debug_max.out  run with NVCOMPILER_ACC_NOTIFY=$((0x1F))
# All paths are relative, so the job has to run in the directory that holds
# axpy.c, axpy.F90 and helper_functions.F90.
#
#SBATCH --job-name=test
#SBATCH --partition=gputest
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=4
#SBATCH --gres=gpu:a100:1
#SBATCH --time=00:10:00

ml purge
ml use /appl/opt/nvhpc/modulefiles
ml nvhpc-hpcx-cuda12/25.1

# Stop at the first failing step. Both compilers write the same axpy.x, so
# carrying on after a failed build would run a stale binary and store its
# trace under the wrong file name. pipefail makes a failing srun count even
# though its output is piped through a filter.
# Enabled only after the module setup; -u is left off because the module
# tooling is outside this script's control.
set -eo pipefail

export OMP_TARGET_OFFLOAD=MANDATORY

#######################################
# Copies stdin to stdout with every occurrence of "<dir>/" removed.
# The directory is matched as a literal string, not as a pattern, so any
# character may appear in it.
# Arguments: $1 - directory prefix to remove (without trailing slash)
# Outputs:   filtered text on stdout
#######################################
strip_dir() {
  local dir=$1 line
  # The "|| [[ -n ... ]]" part keeps a last line that lacks a newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line=${line//"${dir}/"/}
    printf '%s\n' "$line"
  done
}

#######################################
# Runs ./axpy.x through srun with the given NVCOMPILER_ACC_NOTIFY value and
# stores stdout+stderr in a file, with the absolute directory of the source
# file stripped so that the stored trace does not depend on where the
# repository is checked out.
# Arguments: $1 - value for NVCOMPILER_ACC_NOTIFY
#            $2 - source file whose directory is stripped from the output
#            $3 - output file
#######################################
trace_run() {
  local notify=$1 src=$2 out=$3 src_dir
  # Declared separately from the assignment so that a failure is not masked.
  src_dir=$(dirname -- "$(readlink -f -- "$src")")
  NVCOMPILER_ACC_NOTIFY=$notify srun ./axpy.x 2>&1 \
    | strip_dir "$src_dir" > "$out"
}

# nvc
nvc -O3 -mp=gpu -gpu=cc80 -Minfo=mp axpy.c -o axpy.x &> axpy_c_nv_info.out

trace_run "$((0x1 | 0x2))" axpy.c axpy_c_nv_debug.out
trace_run "$((0x1F))" axpy.c axpy_c_nv_debug_max.out

# nvfortran
nvfortran -O3 -mp=gpu -gpu=cc80 -Minfo=mp helper_functions.F90 axpy.F90 -o axpy.x &> axpy_f_nv_info.out

trace_run "$((0x1 | 0x2))" axpy.F90 axpy_f_nv_debug.out
trace_run "$((0x1F))" axpy.F90 axpy_f_nv_debug_max.out

0 commit comments

Comments
 (0)