# Invocation command line:
# /home/HPC2021v1.1.7/bin/harness/runhpc --reportable --configfile nv_example.cfg --tune base,peak --pmodel ACC --define model=acc --define THREADS=1 --ranks 1 --size ref --iterations 3 --nopower --runmode speed --tune base:peak --size ref tiny
# output_root was not used for this run
############################################################################
######################################################################
# Example configuration file for the NVIDIA HPC SDK Compilers
#
# Before using this config file, copy it to a new config (such as nvhpc.cfg) and edit as needed
#
# Defines: "model" => "mpi", "acc", "accmc", "omp", "tgt", "tgtgpu"  default "mpi"
#          "label" => ext base label, default "nv"
#
# MPI-only Command:
# runhpc -c nvhpc --reportable -T base --define model=mpi --ranks=40 tiny
#
# OpenACC offload to GPU Command:
# runhpc -c nvhpc --reportable -T base --define model=acc --ranks=4  tiny
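#   Note: this file sets the UCX variables (UCX_MEMTYPE_CACHE, UCX_TLS) unconditionally,
#   so "--define ucx" is not consulted here when using OpenMPI 4 with UCX support.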
#
# OpenACC offload to Multicore CPU Command:
# runhpc -c nvhpc --reportable -T base --define model=accmc --ranks=4  tiny
#
# OpenMP Command:
# runhpc -c nvhpc --reportable -T base --define model=omp --ranks=1 --threads=40 tiny
#
# OpenMP Target Offload to Host Command:
# runhpc -c nvhpc --reportable -T base --define model=tgt --ranks=1 --threads=40 tiny
#
# OpenMP Target Offload to GPU Command:
# runhpc -c nvhpc --reportable -T base --define model=tgtgpu --ranks=4  tiny
#
#######################################################################

# Fall back to the label "nv" when no "--define label=..." was given.
%ifndef %{label}
%   define label nv
%endif

# Fall back to the "mpi" model when no "--define model=..." was given,
# and select the matching pmodel.
%ifndef %{model}
%   define model mpi
pmodel = MPI
%endif

teeout = yes

# Display the Internal Timer info
# (no timer-related setting is present in this copy of the config)

# Number of parallel make jobs used when building; adjust for your system
makeflags = -j 40

# Flags description file for the NVIDIA compilers
flagsurl000 = http://www.spec.org/hpc2021/flags/nv2021_flags_v1.0.3.2022-08-24.xml

# Tester Information
license_num = 28
test_sponsor = Lenovo Global Technology
tester = Lenovo Global Technology

######################################################
# SUT Section
######################################################
#include: Example_SUT.inc
#  ----- Begin inclusion of 'Example_SUT.inc'
############################################################################
######################################################
# Example configuration information for a
# system under test (SUT) Section
######################################################
# General SUT info: vendor, marketed system name, availability dates and preparer
system_vendor = Lenovo Global Technology
system_name = ThinkSystem SR675 V3 (AMD EPYC 9654, Nvidia H100-PCIe-80G)
hw_avail = Oct-2023
sw_avail = Oct-2023
prepared_by = Lenovo Global Technology

# Accelerator driver version installed on the compute node
node_compute_sw_accel_driver = 535.54.03

# Computation node info
# [Node_Description: Hardware]
node_compute_syslbl = ThinkSystem SR675 V3
node_compute_order = 1
node_compute_count = 1
node_compute_purpose = compute
node_compute_hw_vendor = Lenovo Global Technology
# Model must match system_name / node_compute_syslbl (SR675 V3, not SR655 V3)
node_compute_hw_model = ThinkSystem SR675 V3
node_compute_hw_cpu_name = AMD EPYC 9654
# Orderable CPU counts; must include the 2 chips reported in hw_nchips
node_compute_hw_ncpuorder = 1,2 chips
node_compute_hw_nchips = 2
# 2 chips x 96 cores per chip
node_compute_hw_ncores = 192
node_compute_hw_ncoresperchip = 96
node_compute_hw_nthreadspercore = 1
# AMD part: describe the boost clock without the Intel "Turbo Boost" trademark
node_compute_hw_cpu_char = Max. Boost Clock up to 3.7 GHz
node_compute_hw_cpu_mhz = 2400
node_compute_hw_pcache = 32 KB I + 32 KB D on chip per core
node_compute_hw_scache = 1 MB I+D on chip per core
node_compute_hw_tcache   = 384 MB I+D on chip per chip
node_compute_hw_ocache = None
node_compute_hw_memory = 768 GB (24 x 32 GB 2Rx8 PC5-4800B-R)
node_compute_hw_disk = 1x ThinkSystem 2.5" 5300 480GB SSD
node_compute_hw_other = None

#[Node_Description: Accelerator]
# Product name kept consistent with system_name ("Nvidia H100-PCIe-80G");
# the H100 is not sold under the "Tesla" brand.
node_compute_hw_accel_model = NVIDIA H100 PCIe 80GB
node_compute_hw_accel_count = 8
node_compute_hw_accel_vendor= Nvidia Corporation
node_compute_hw_accel_type  = GPU
node_compute_hw_accel_connect = PCIe Gen5 x16
node_compute_hw_accel_ecc    = Yes
node_compute_hw_accel_desc   = NVIDIA H100 PCIe 80GB

#[Node_Description: Interconnect adapter]
node_compute_hw_adapter_fs_model = Mellanox ConnectX-7 NDR
node_compute_hw_adapter_fs_interconnect = Nvidia Mellanox ConnectX-7 NDR
node_compute_hw_adapter_fs_count = 1
node_compute_hw_adapter_fs_ports_used = 1
node_compute_hw_adapter_fs_slot_type = PCI-Express 5.0 x8
node_compute_hw_adapter_fs_data_rate = 400 Gb/s
node_compute_hw_adapter_fs_driver = 5.9-0.5.5
node_compute_hw_adapter_fs_firmware = 28.33.0508

#[Node_Description: Software]
# The operating system description is split across two numbered lines
node_compute_sw_os000 = Red Hat Enterprise Linux Server release 8.6,
node_compute_sw_os001 = Kernel 4.18.0-372.9.1.el8.x86_64
node_compute_sw_state = Multi-user, run level 3
node_compute_sw_localfile = xfs
node_compute_sw_sharedfile = XFS
node_compute_sw_other = None

#[Fileserver]

#[Interconnect]

#######################################################################
# End of SUT section
# If this config file were to be applied to several SUTs, edits would
# be needed only ABOVE this point.
######################################################################
# ---- End inclusion of '/home/HPC2021v1.1.7/config/Example_SUT.inc'

#[Software]
system_class = Homogeneous Cluster
sw_compiler = Nvidia HPC SDK 23.5
sw_mpi_library = Open MPI 4.0.5
sw_mpi_other = None
# "None" matches the other empty fields in this file (was the placeholder "--")
sw_other = None

#[General notes]

#######################################################################
# End of SUT section
######################################################################

######################################################################
# Header section of the config file.  These settings must come before
# the first "section marker" (for example the "default:" line below).
#
# label = how the binaries you generate will be identified;
#         built here from the label and model macros
# tune  = "base", "peak" or "all"
label = %{label}_%{model}
tune = base
output_format = text
use_submit_for_speed = 1

# 'strict_rundir_verify=0' permits direct source code modifications, at the
# cost of no longer being able to create reportable results.  It may be
# useful for academic and research purposes; uncomment the line to enable it.
# strict_rundir_verify = 0

# Compiler settings: each compiler command is listed next to the option
# used to make it report its version.
default:
CC = mpicc
CC_VERSION_OPTION = -V
CXX = mpicxx
CXX_VERSION_OPTION = -V
FC = mpif90
FC_VERSION_OPTION = -V

# UCX environment for CUDA Aware MPI with an OpenMPI built with UCX support.
# Without these settings, LBM is known to hang when using OpenACC and
# OpenMP Target to GPUs.  They are set unconditionally in this file.
preENV_UCX_MEMTYPE_CACHE = n
preENV_UCX_TLS = self,shm,cuda_copy

# MPI options and binding environment, dependent upon the model being run.
# Adjust to match your system; alternatives to "core" are "numa" and "none".
MPIRUN_OPTS = --bind-to core

# SPH_EXA is known to hang when using multiple nodes with some versions of UCX;
# to work around this, enable the following setting:
#MPIRUN_OPTS += --mca topo basic

# Command used to launch each benchmark under mpirun with $ranks MPI ranks.
%ifdef %{bindomp}
# "bindomp" is defined.  The bindomp.pl wrapper line is kept below but is
# commented out, so this branch currently launches the benchmark directly.
# The benchmark command must be passed as $command (with the leading "$");
# a bare "command" is handed to mpirun as a literal word.
submit = mpirun --allow-run-as-root ${MPIRUN_OPTS} -np $ranks $command
# To use the example bindomp.pl script, replace the line above with:
#submit = mpirun --allow-run-as-root ${MPIRUN_OPTS} -np $ranks specperl $[top]/config/scripts/bindomp.pl $command
%else
submit = mpirun --allow-run-as-root ${MPIRUN_OPTS} -np $ranks $command
%endif

#######################################################################
# Optimization
#
# SPEC baseline rules require every use of a given compiler to pass the
# same flags in the same order.  See the SPEChpc Run Rules for details:
#      http://www.spec.org/hpc2021/Docs/runrules.html
#
# OPTIMIZE    = flags applicable to all compilers
# FOPTIMIZE   = flags applicable to the Fortran compiler
# COPTIMIZE   = flags applicable to the C compiler
# CXXOPTIMIZE = flags applicable to the C++ compiler
#
# Consult your compiler manual for the flags available for your compiler.
#
# Base flags shared by every programming model
default=base=default:
CXXPORTABILITY = --c++17
OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast

# OpenACC (GPU) flags
# Active only when the "model" macro equals 'acc'; selects pmodel ACC and
# appends the OpenACC GPU options to the base OPTIMIZE flags.
%if %{model} eq 'acc'
pmodel=ACC
OPTIMIZE += -acc=gpu -Minfo=accel -DSPEC_ACCEL_AWARE_MPI

# Extra portability define applied only to the four LBM benchmarks
505.lbm_t,605.lbm_s,705.lbm_m,805.lbm_l:
PORTABILITY += -DSPEC_OPENACC_NO_SELF

%endif

# OpenACC (Multicore CPU) flags
# Active only when the "model" macro equals 'accmc'; selects pmodel ACC and
# appends the OpenACC multicore options to the base OPTIMIZE flags.
%if %{model} eq 'accmc'
pmodel=ACC
OPTIMIZE += -acc=multicore -mp -Minfo=accel

# Extra portability define applied only to the four LBM benchmarks
505.lbm_t,605.lbm_s,705.lbm_m,805.lbm_l:
PORTABILITY += -DSPEC_OPENACC_NO_SELF

# Extra portability define applied only to 521.miniswp_t
521.miniswp_t:
PORTABILITY+= -DSPEC_USE_HOST_THREADS=1

%endif

# OpenMP Threaded (CPU) flags
# Active only when the "model" macro equals 'omp'; selects pmodel OMP and
# appends the OpenMP options to the base OPTIMIZE flags.
%if %{model} eq 'omp'
pmodel=OMP
OPTIMIZE += -mp -Minfo=mp
%endif

# OpenMP Targeting host flags
# Active only when the "model" macro equals 'tgt'; selects pmodel TGT and
# appends the OpenMP options to the base OPTIMIZE flags.
%if %{model} eq 'tgt'
pmodel=TGT
OPTIMIZE += -mp -Minfo=mp

# Note that while NVHPC added support for OpenMP
# array reduction in v22.2, a compiler issue
# prevents its use.  This may be used in future
# versions of the compiler, in which case remove
# -DSPEC_NO_VAR_ARRAY_REDUCE
513.soma_t,613.soma_s:
PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE -DSPEC_USE_HOST_THREADS=1

# Extra portability define applied only to 521.miniswp_t
521.miniswp_t:
PORTABILITY+=-DSPEC_USE_HOST_THREADS=1

%endif

# OpenMP Targeting GPU flags
# Active only when the "model" macro equals 'tgtgpu'; selects pmodel TGT and
# appends the OpenMP GPU options to the base OPTIMIZE flags.
%if %{model} eq 'tgtgpu'
pmodel=TGT
OPTIMIZE += -mp=gpu -Minfo=mp

# Note that while NVHPC added support for OpenMP
# array reduction in v22.2, a compiler issue
# prevents its use.  This may be used in future
# versions of the compiler, in which case comment
# out the following two lines
513.soma_t,613.soma_s:
PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE

%endif

# Peak tuning.
# basepeak=0 is the default for peak, so peak may differ from base.
# Benchmarks with their own peak flags and rank count are listed below;
# the others set basepeak=1 so that their peak uses the same flags as base.
#
# NOTE(review): the peak OPTIMIZE lines hard-code -acc=gpu, so they presumably
# only make sense together with "--define model=acc" -- confirm before using
# this file with another model.

# %{RANKS} is used by the per-benchmark "ranks" settings below but is not
# defined anywhere else in this file.  Give it a default so the macro always
# expands; pass "--define RANKS=<n>" to runhpc to choose another value.
%ifndef %{RANKS}
%   define RANKS 1
%endif

default=peak=default:
basepeak=0

505.lbm_t=peak=default:
ranks   = %{RANKS}
OPTIMIZE       = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -DSPEC_ACCEL_AWARE_MPI

513.soma_t=peak=default:
basepeak=1

518.tealeaf_t=peak=default:
ranks   = %{RANKS}
OPTIMIZE       = -w -fast -acc=gpu -Msafeptr -DSPEC_ACCEL_AWARE_MPI

519.clvleaf_t=peak=default:
basepeak=1

521.miniswp_t=peak=default:
basepeak=1

528.pot3d_t=peak=default:
basepeak=1

532.sph_exa_t=peak=default:
ranks   = %{RANKS}
OPTIMIZE       = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

534.hpgmgfv_t=peak=default:
ranks   = %{RANKS}
OPTIMIZE       = -w -fast -acc=gpu -static-nvidia -DSPEC_ACCEL_AWARE_MPI

535.weather_t=peak=default:
ranks   = %{RANKS}
OPTIMIZE       = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI


# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default: