Update Mahti outputs

trossi · trossi · commit 57693ccac0d7 · 2025-10-30T09:51:06.000+02:00
diff --git a/openmp/01-axpy/solution/README.md b/openmp/01-axpy/solution/README.md
@@ -71,117 +71,15 @@ Note for instructors: use `run*.sh` scripts to generate all the output files.
        export NVCOMPILER_ACC_NOTIFY=$((0x1 | 0x2))
        srun -p gputest --nodes=1 --ntasks-per-node=1 --cpus-per-task=4 --gres=gpu:a100:1 -t 0:10:00 ./axpy.x
 
-   Output from C version:
-
-       upload CUDA data  file=.../axpy.c function=main line=27 device=0 threadid=1 variable=y[:] bytes=819200
-       upload CUDA data  file=.../axpy.c function=main line=27 device=0 threadid=1 variable=x[:] bytes=819200
-       launch CUDA kernel file=.../axpy.c function=main line=27 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_main_F1L27_2 grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
-       download CUDA data  file=.../axpy.c function=main line=33 device=0 threadid=1 variable=x[:] bytes=819200
-       download CUDA data  file=.../axpy.c function=main line=33 device=0 threadid=1 variable=y[:] bytes=819200
-       Using N = 102400
-       Input:
-       a =   3.0000
-       x =   0.0000   0.0000   0.0000   0.0000 ...   1.0000   1.0000   1.0000   1.0000
-       y =   0.0000   0.0010   0.0020   0.0029 ...  99.9971  99.9980  99.9990 100.0000
-       Output:
-       y =   0.0000   0.0010   0.0020   0.0030 ... 102.9970 102.9980 102.9990 103.0000
-
-   Output from Fortran version:
-
-       upload CUDA data  file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=descriptor bytes=128
-       upload CUDA data  file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=y(:) bytes=819200
-       upload CUDA data  file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=descriptor bytes=128
-       upload CUDA data  file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=x(:) bytes=819200
-       launch CUDA kernel file=.../axpy.F90 function=axpy line=31 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_MAIN__F1L31_2_ grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
-       download CUDA data  file=.../axpy.F90 function=axpy line=35 device=0 threadid=1 variable=x(:) bytes=819200
-       download CUDA data  file=.../axpy.F90 function=axpy line=35 device=0 threadid=1 variable=y(:) bytes=819200
-       Using N = 102400
-       Input:
-       a =   3.0000
-       x =   0.0000   0.0000   0.0000   0.0000 ...   1.0000   1.0000   1.0000   1.0000
-       y =   0.0000   0.0010   0.0020   0.0029 ...  99.9971  99.9980  99.9990 100.0000
-       Output:
-       y =   0.0000   0.0010   0.0020   0.0030 ... 102.9970 102.9980 102.9990 103.0000
+   Outputs are in `axpy_{c,f}_nv_debug.out` for C and Fortran, respectively.
 
    We can read the memory transfers and kernel execution with 800 blocks and 128 threads per block from this output.
 
-   Now `NVCOMPILER_ACC_NOTIFY=$((0x1F))`.
-   Output from C version:
-
-       Enter enter data construct file=.../axpy.c function=main line=27 device=0 threadid=1
-       create CUDA data  bytes=819200 file=.../axpy.c function=main line=27 device=0 threadid=1
-       alloc  CUDA data  devaddr=0x7fff0b2fa000 bytes=819200 file=.../axpy.c function=main line=27 device=0 threadid=1
-       upload CUDA data  file=.../axpy.c function=main line=27 device=0 threadid=1 variable=y[:] bytes=819200
-       create CUDA data  bytes=819200 file=.../axpy.c function=main line=27 device=0 threadid=1
-       alloc  CUDA data  devaddr=0x7fff0b800000 bytes=819200 file=.../axpy.c function=main line=27 device=0 threadid=1
-       upload CUDA data  file=.../axpy.c function=main line=27 device=0 threadid=1 variable=x[:] bytes=819200
-       Leave enter data .../axpy.c main:27 device=0 threadid=1
-       launch CUDA kernel file=.../axpy.c function=main line=27 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_main_F1L27_2 grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
-       Enter exit data construct file=.../axpy.c function=main line=27 device=0 threadid=1
-       download CUDA data  file=.../axpy.c function=main line=33 device=0 threadid=1 variable=x[:] bytes=819200
-       delete CUDA data  devaddr=0x7fff0b800000 bytes=819200 file=.../axpy.c function=main line=33 device=0 threadid=1
-       download CUDA data  file=.../axpy.c function=main line=33 device=0 threadid=1 variable=y[:] bytes=819200
-       delete CUDA data  devaddr=0x7fff0b2fa000 bytes=819200 file=.../axpy.c function=main line=33 device=0 threadid=1
-       Implicit wait  file=.../axpy.c function=main line=33 device=0 threadid=1 queue=acc_async_sync
-       Leave exit data .../axpy.c main:33 device=0 threadid=1
-       Using N = 102400
-       Input:
-       a =   3.0000
-       x =   0.0000   0.0000   0.0000   0.0000 ...   1.0000   1.0000   1.0000   1.0000
-       y =   0.0000   0.0010   0.0020   0.0029 ...  99.9971  99.9980  99.9990 100.0000
-       Output:
-       y =   0.0000   0.0010   0.0020   0.0030 ... 102.9970 102.9980 102.9990 103.0000
-
-   Output from Fortran version:
-
-       Enter enter data construct file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
-       create CUDA data  bytes=819200 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
-       alloc  CUDA data  devaddr=0x7fff0b2fa000 bytes=819200 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
-       create CUDA data  bytes=128 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
-       alloc  CUDA data  devaddr=0x7fff0b3c2000 bytes=512 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
-       upload CUDA data  file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=descriptor bytes=128
-       upload CUDA data  file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=y(:) bytes=819200
-       create CUDA data  bytes=819200 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
-       alloc  CUDA data  devaddr=0x7fff0b800000 bytes=819200 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
-       create CUDA data  bytes=128 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
-       alloc  CUDA data  devaddr=0x7fff0b3c2200 bytes=512 file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
-       upload CUDA data  file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=descriptor bytes=128
-       upload CUDA data  file=.../axpy.F90 function=axpy line=31 device=0 threadid=1 variable=x(:) bytes=819200
-       Leave enter data .../axpy.F90 axpy:31 device=0 threadid=1
-       launch CUDA kernel file=.../axpy.F90 function=axpy line=31 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_MAIN__F1L31_2_ grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
-       Enter exit data construct file=.../axpy.F90 function=axpy line=31 device=0 threadid=1
-       download CUDA data  file=.../axpy.F90 function=axpy line=35 device=0 threadid=1 variable=x(:) bytes=819200
-       delete CUDA data  devaddr=0x7fff0b800000 bytes=819200 file=.../axpy.F90 function=axpy line=35 device=0 threadid=1
-       delete CUDA data  devaddr=0x7fff0b3c2200 bytes=512 file=.../axpy.F90 function=axpy line=35 device=0 threadid=1
-       download CUDA data  file=.../axpy.F90 function=axpy line=35 device=0 threadid=1 variable=y(:) bytes=819200
-       delete CUDA data  devaddr=0x7fff0b2fa000 bytes=819200 file=.../axpy.F90 function=axpy line=35 device=0 threadid=1
-       delete CUDA data  devaddr=0x7fff0b3c2000 bytes=512 file=.../axpy.F90 function=axpy line=35 device=0 threadid=1
-       Implicit wait  file=.../axpy.F90 function=axpy line=35 device=0 threadid=1 queue=acc_async_sync
-       Leave exit data .../axpy.F90 axpy:35 device=0 threadid=1
-       Using N = 102400
-       Input:
-       a =   3.0000
-       x =   0.0000   0.0000   0.0000   0.0000 ...   1.0000   1.0000   1.0000   1.0000
-       y =   0.0000   0.0010   0.0020   0.0029 ...  99.9971  99.9980  99.9990 100.0000
-       Output:
-       y =   0.0000   0.0010   0.0020   0.0030 ... 102.9970 102.9980 102.9990 103.0000
+   For `NVCOMPILER_ACC_NOTIFY=$((0x1F))`, the outputs are in `axpy_{c,f}_nv_debug_max.out`.
 
    Compiling with diagnostics:
 
        nvc -O3 -mp=gpu -gpu=cc80 -Minfo=mp axpy.c -o axpy.x
        nvfortran -O3 -mp=gpu -gpu=cc80 -Minfo=mp helper_functions.F90 axpy.F90 -o axpy.x
 
-   Output for C:
-
-       main:
-            27, #omp target teams distribute parallel for
-                27, Generating "nvkernel_main_F1L27_2" GPU kernel
-                31, Loop parallelized across teams and threads(128), schedule(static)
-            27, Generating implicit map(tofrom:y[:],x[:])
-
-   Output for Fortran:
-
-       axpy:
-            31, !$omp target teams distribute parallel do
-                31, Generating "nvkernel_MAIN__F1L31_2" GPU kernel
-            31, Generating implicit map(tofrom:y(:),x(:))
+   Outputs are in `axpy_{c,f}_nv_info.out`.
diff --git a/openmp/01-axpy/solution/axpy_c_nv_debug.out b/openmp/01-axpy/solution/axpy_c_nv_debug.out
@@ -0,0 +1,12 @@
+upload CUDA data  file=axpy.c function=main line=33 device=0 threadid=1 variable=y[:] bytes=819200
+upload CUDA data  file=axpy.c function=main line=33 device=0 threadid=1 variable=x[:] bytes=819200
+launch CUDA kernel file=axpy.c function=main line=33 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_main_F1L33_2 grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
+download CUDA data  file=axpy.c function=main line=39 device=0 threadid=1 variable=x[:] bytes=819200
+download CUDA data  file=axpy.c function=main line=39 device=0 threadid=1 variable=y[:] bytes=819200
+Using N = 102400
+Input:
+a =   3.0000
+x =   0.0000   0.0000   0.0000   0.0000 ...   1.0000   1.0000   1.0000   1.0000
+y =   0.0000   0.0010   0.0020   0.0029 ...  99.9971  99.9980  99.9990 100.0000
+Output:
+y =   0.0000   0.0010   0.0020   0.0030 ... 102.9970 102.9980 102.9990 103.0000
diff --git a/openmp/01-axpy/solution/axpy_c_nv_debug_max.out b/openmp/01-axpy/solution/axpy_c_nv_debug_max.out
@@ -0,0 +1,23 @@
+Enter enter data construct file=axpy.c function=main line=33 device=0 threadid=1
+create CUDA data  bytes=819200 file=axpy.c function=main line=33 device=0 threadid=1
+alloc  CUDA data  devaddr=0x7fff0b2fa000 bytes=819200 file=axpy.c function=main line=33 device=0 threadid=1
+upload CUDA data  file=axpy.c function=main line=33 device=0 threadid=1 variable=y[:] bytes=819200
+create CUDA data  bytes=819200 file=axpy.c function=main line=33 device=0 threadid=1
+alloc  CUDA data  devaddr=0x7fff0b800000 bytes=819200 file=axpy.c function=main line=33 device=0 threadid=1
+upload CUDA data  file=axpy.c function=main line=33 device=0 threadid=1 variable=x[:] bytes=819200
+Leave enter data axpy.c main:33 device=0 threadid=1
+launch CUDA kernel file=axpy.c function=main line=33 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_main_F1L33_2 grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
+Enter exit data construct file=axpy.c function=main line=33 device=0 threadid=1
+download CUDA data  file=axpy.c function=main line=39 device=0 threadid=1 variable=x[:] bytes=819200
+delete CUDA data  devaddr=0x7fff0b800000 bytes=819200 file=axpy.c function=main line=39 device=0 threadid=1
+download CUDA data  file=axpy.c function=main line=39 device=0 threadid=1 variable=y[:] bytes=819200
+delete CUDA data  devaddr=0x7fff0b2fa000 bytes=819200 file=axpy.c function=main line=39 device=0 threadid=1
+Implicit wait  file=axpy.c function=main line=39 device=0 threadid=1 queue=acc_async_sync
+Leave exit data axpy.c main:39 device=0 threadid=1
+Using N = 102400
+Input:
+a =   3.0000
+x =   0.0000   0.0000   0.0000   0.0000 ...   1.0000   1.0000   1.0000   1.0000
+y =   0.0000   0.0010   0.0020   0.0029 ...  99.9971  99.9980  99.9990 100.0000
+Output:
+y =   0.0000   0.0010   0.0020   0.0030 ... 102.9970 102.9980 102.9990 103.0000
diff --git a/openmp/01-axpy/solution/axpy_c_nv_info.out b/openmp/01-axpy/solution/axpy_c_nv_info.out
@@ -0,0 +1,5 @@
+main:
+     33, #omp target teams distribute parallel for
+         33, Generating "nvkernel_main_F1L33_2" GPU kernel
+         37, Loop parallelized across teams and threads(128), schedule(static)
+     33, Generating implicit map(tofrom:y[:],x[:]) 
diff --git a/openmp/01-axpy/solution/axpy_f_nv_debug.out b/openmp/01-axpy/solution/axpy_f_nv_debug.out
@@ -0,0 +1,14 @@
+upload CUDA data  file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=descriptor bytes=128
+upload CUDA data  file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=y(:) bytes=819200
+upload CUDA data  file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=descriptor bytes=128
+upload CUDA data  file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=x(:) bytes=819200
+launch CUDA kernel file=axpy.F90 function=axpy line=35 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_MAIN__F1L35_2_ grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
+download CUDA data  file=axpy.F90 function=axpy line=39 device=0 threadid=1 variable=x(:) bytes=819200
+download CUDA data  file=axpy.F90 function=axpy line=39 device=0 threadid=1 variable=y(:) bytes=819200
+Using N = 102400
+Input:
+a =   3.0000
+x =   0.0000   0.0000   0.0000   0.0000 ...   1.0000   1.0000   1.0000   1.0000
+y =   0.0000   0.0010   0.0020   0.0029 ...  99.9971  99.9980  99.9990 100.0000
+Output:
+y =   0.0000   0.0010   0.0020   0.0030 ... 102.9970 102.9980 102.9990 103.0000
diff --git a/openmp/01-axpy/solution/axpy_f_nv_debug_max.out b/openmp/01-axpy/solution/axpy_f_nv_debug_max.out
@@ -0,0 +1,31 @@
+Enter enter data construct file=axpy.F90 function=axpy line=35 device=0 threadid=1
+create CUDA data  bytes=819200 file=axpy.F90 function=axpy line=35 device=0 threadid=1
+alloc  CUDA data  devaddr=0x7fff0b2fa000 bytes=819200 file=axpy.F90 function=axpy line=35 device=0 threadid=1
+create CUDA data  bytes=128 file=axpy.F90 function=axpy line=35 device=0 threadid=1
+alloc  CUDA data  devaddr=0x7fff0b3c2000 bytes=512 file=axpy.F90 function=axpy line=35 device=0 threadid=1
+upload CUDA data  file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=descriptor bytes=128
+upload CUDA data  file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=y(:) bytes=819200
+create CUDA data  bytes=819200 file=axpy.F90 function=axpy line=35 device=0 threadid=1
+alloc  CUDA data  devaddr=0x7fff0b800000 bytes=819200 file=axpy.F90 function=axpy line=35 device=0 threadid=1
+create CUDA data  bytes=128 file=axpy.F90 function=axpy line=35 device=0 threadid=1
+alloc  CUDA data  devaddr=0x7fff0b3c2200 bytes=512 file=axpy.F90 function=axpy line=35 device=0 threadid=1
+upload CUDA data  file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=descriptor bytes=128
+upload CUDA data  file=axpy.F90 function=axpy line=35 device=0 threadid=1 variable=x(:) bytes=819200
+Leave enter data axpy.F90 axpy:35 device=0 threadid=1
+launch CUDA kernel file=axpy.F90 function=axpy line=35 device=0 host-threadid=0 num_teams=0 thread_limit=0 kernelname=nvkernel_MAIN__F1L35_2_ grid=<<<800,1,1>>> block=<<<128,1,1>>> shmem=0b
+Enter exit data construct file=axpy.F90 function=axpy line=35 device=0 threadid=1
+download CUDA data  file=axpy.F90 function=axpy line=39 device=0 threadid=1 variable=x(:) bytes=819200
+delete CUDA data  devaddr=0x7fff0b800000 bytes=819200 file=axpy.F90 function=axpy line=39 device=0 threadid=1
+delete CUDA data  devaddr=0x7fff0b3c2200 bytes=512 file=axpy.F90 function=axpy line=39 device=0 threadid=1
+download CUDA data  file=axpy.F90 function=axpy line=39 device=0 threadid=1 variable=y(:) bytes=819200
+delete CUDA data  devaddr=0x7fff0b2fa000 bytes=819200 file=axpy.F90 function=axpy line=39 device=0 threadid=1
+delete CUDA data  devaddr=0x7fff0b3c2000 bytes=512 file=axpy.F90 function=axpy line=39 device=0 threadid=1
+Implicit wait  file=axpy.F90 function=axpy line=39 device=0 threadid=1 queue=acc_async_sync
+Leave exit data axpy.F90 axpy:39 device=0 threadid=1
+Using N = 102400
+Input:
+a =   3.0000
+x =   0.0000   0.0000   0.0000   0.0000 ...   1.0000   1.0000   1.0000   1.0000
+y =   0.0000   0.0010   0.0020   0.0029 ...  99.9971  99.9980  99.9990 100.0000
+Output:
+y =   0.0000   0.0010   0.0020   0.0030 ... 102.9970 102.9980 102.9990 103.0000
diff --git a/openmp/01-axpy/solution/axpy_f_nv_info.out b/openmp/01-axpy/solution/axpy_f_nv_info.out
@@ -0,0 +1,6 @@
+helper_functions.F90:
+axpy.F90:
+axpy:
+     35, !$omp target teams distribute parallel do
+         35, Generating "nvkernel_MAIN__F1L35_2" GPU kernel
+     35, Generating implicit map(tofrom:y(:),x(:)) 
diff --git a/openmp/01-axpy/solution/run_mahti.sh b/openmp/01-axpy/solution/run_mahti.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#SBATCH --job-name=test
+#SBATCH --partition=gputest
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=4
+#SBATCH --gres=gpu:a100:1
+#SBATCH --time=00:10:00
+
+ml purge
+ml use /appl/opt/nvhpc/modulefiles
+ml nvhpc-hpcx-cuda12/25.1
+
+export OMP_TARGET_OFFLOAD=MANDATORY
+
+# nvc
+nvc -O3 -mp=gpu -gpu=cc80 -Minfo=mp axpy.c -o axpy.x &> axpy_c_nv_info.out
+
+NVCOMPILER_ACC_NOTIFY=$((0x1 | 0x2))  srun ./axpy.x 2>&1 | sed "s|$(dirname $(readlink -f axpy.c))/||g" > axpy_c_nv_debug.out
+NVCOMPILER_ACC_NOTIFY=$((0x1F))       srun ./axpy.x 2>&1 | sed "s|$(dirname $(readlink -f axpy.c))/||g" > axpy_c_nv_debug_max.out
+
+# nvfortran
+nvfortran -O3 -mp=gpu -gpu=cc80 -Minfo=mp helper_functions.F90 axpy.F90 -o axpy.x &> axpy_f_nv_info.out
+
+NVCOMPILER_ACC_NOTIFY=$((0x1 | 0x2))  srun ./axpy.x 2>&1 | sed "s|$(dirname $(readlink -f axpy.F90))/||g" > axpy_f_nv_debug.out
+NVCOMPILER_ACC_NOTIFY=$((0x1F))       srun ./axpy.x 2>&1 | sed "s|$(dirname $(readlink -f axpy.F90))/||g" > axpy_f_nv_debug_max.out