# Invocation command line: # /home/HPC2021v1.1.7/bin/harness/runhpc --reportable --configfile nv_example.cfg --tune base,peak --pmodel ACC --define model=acc --define THREADS=1 --ranks 1 --size ref --iterations 3 --nopower --runmode speed --tune base:peak --size ref tiny # output_root was not used for this run ############################################################################ ###################################################################### # Example configuration file for the NVIDIA HPC SDK Compilers # # Before using this config file, copy it to a new config (such as nvhpc.cfg) and edit as needed # # Defines: "model" => "mpi", "acc", "accmc", "omp", "tgt", "tgtgpu" default "mpi" # "label" => ext base label, default "nv" # # MPI-only Command: # runhpc -c nvhpc --reportable -T base --define model=mpi --ranks=40 tiny # # OpenACC offload to GPU Command: # runhpc -c nvhpc --reportable -T base --define model=acc --ranks=4 tiny # Add "--define ucx" if using OpenMPI 4 with UCX support. 
#
# OpenACC offload to Multicore CPU Command:
# runhpc -c nvhpc --reportable -T base --define model=accmc --ranks=4 tiny
#
# OpenMP Command:
# runhpc -c nvhpc --reportable -T base --define model=omp --ranks=1 --threads=40 tiny
#
# OpenMP Target Offload to Host Command:
# runhpc -c nvhpc --reportable -T base --define model=tgt --ranks=1 --threads=40 tiny
#
# OpenMP Target Offload to GPU Command:
# runhpc -c nvhpc --reportable -T base --define model=tgtgpu --ranks=4 tiny
#
#######################################################################

# Defaults for the preprocessor macros that may be passed with --define.
%ifndef %{label}         # IF label is not set use nv
%   define label nv
%endif

%ifndef %{model}         # IF model is not set use mpi
%   define model mpi
pmodel = MPI
%endif

teeout = yes
# Display the Internal Timer info
# Adjust the number of make jobs to use here
makeflags=-j 40

flagsurl000=http://www.spec.org/hpc2021/flags/nv2021_flags_v1.0.3.2022-08-24.xml

# Tester Information
license_num = 28
test_sponsor = Lenovo Global Technology
tester = Lenovo Global Technology

######################################################
# SUT Section
######################################################
#include: Example_SUT.inc
# ----- Begin inclusion of 'Example_SUT.inc'
############################################################################
######################################################
# Example configuration information for a
# system under test (SUT) Section
######################################################

# General SUT info
system_vendor = Lenovo Global Technology
system_name = ThinkSystem SR675 V3 (AMD EPYC 9654, Nvidia H100-PCIe-80G)
node_compute_sw_accel_driver = 535.54.03
hw_avail = Oct-2023
sw_avail = Oct-2023
prepared_by = Lenovo Global Technology

# Computation node info
# [Node_Description: Hardware]
node_compute_syslbl = ThinkSystem SR675 V3
node_compute_order = 1
node_compute_count = 1
node_compute_purpose = compute
node_compute_hw_vendor = Lenovo Global Technology
# Model must match system_name / node_compute_syslbl (SR675 V3).
node_compute_hw_model = ThinkSystem SR675 V3
node_compute_hw_cpu_name = AMD EPYC 9654
# Orderable CPU configurations; the tested node has 2 chips installed.
# NOTE(review): assumes the platform is orderable with 1 or 2 processors - confirm.
node_compute_hw_ncpuorder = 1,2 chips
node_compute_hw_nchips = 2
node_compute_hw_ncores = 192
node_compute_hw_ncoresperchip = 96
node_compute_hw_nthreadspercore = 1
# AMD part: use AMD's boost terminology rather than Intel Turbo Boost.
node_compute_hw_cpu_char = Max Boost Clock up to 3.7 GHz
node_compute_hw_cpu_mhz = 2400
node_compute_hw_pcache = 32 KB I + 32 KB D on chip per core
node_compute_hw_scache = 1 MB I+D on chip per core
node_compute_hw_tcache = 384 MB I+D on chip per chip
node_compute_hw_ocache = None
node_compute_hw_memory = 768 GB (24 x 32 GB 2Rx8 PC5-4800B-R)
node_compute_hw_disk = 1x ThinkSystem 2.5" 5300 480GB SSD
node_compute_hw_other = None

#[Node_Description: Accelerator]
node_compute_hw_accel_model = Tesla H100 PCIe 80GB
node_compute_hw_accel_count = 8
node_compute_hw_accel_vendor = Nvidia Corporation
node_compute_hw_accel_type = GPU
node_compute_hw_accel_connect = PCIe Gen5 x16
node_compute_hw_accel_ecc = Yes
node_compute_hw_accel_desc = Nvidia Tesla H100 PCIe 80GB

# Interconnect adapter (hardware)
node_compute_hw_adapter_fs_model = Mellanox ConnectX-7 NDR
node_compute_hw_adapter_fs_count = 1
node_compute_hw_adapter_fs_slot_type = PCI-Express 5.0 x8
node_compute_hw_adapter_fs_data_rate = 400 Gb/s
node_compute_hw_adapter_fs_ports_used = 1
node_compute_hw_adapter_fs_interconnect = Nvidia Mellanox ConnectX-7 NDR
node_compute_hw_adapter_fs_driver = 5.9-0.5.5
node_compute_hw_adapter_fs_firmware = 28.33.0508

#[Node_Description: Software]
# os000/os001 are consecutive parts of the same OS field.
node_compute_sw_os000 = Red Hat Enterprise Linux Server release 8.6,
node_compute_sw_os001 = Kernel 4.18.0-372.9.1.el8.x86_64
node_compute_sw_localfile = xfs
node_compute_sw_sharedfile = XFS
node_compute_sw_state = Multi-user, run level 3
node_compute_sw_other = None

#[Fileserver]

#[Interconnect]

#######################################################################
# End of SUT section
# If this config file were to be applied to several SUTs, edits would
# be needed only ABOVE this point.
######################################################################
# ---- End inclusion of '/home/HPC2021v1.1.7/config/Example_SUT.inc'

#[Software]
system_class = Homogeneous Cluster
sw_compiler = Nvidia HPC SDK 23.5
sw_mpi_library = Open MPI 4.0.5
sw_mpi_other = None
sw_other = --

#[General notes]

#######################################################################
# End of SUT section
######################################################################

######################################################################
# The header section of the config file.  Must appear
# before any instances of "section markers" (see below)
#
# ext = how the binaries you generated will be identified
# tune = specify "base" or "peak" or "all"
label = %{label}_%{model}
tune = base
output_format = text
use_submit_for_speed = 1

# Setting 'strict_rundir_verify=0' will allow direct source code modifications
# but will disable the ability to create reportable results.
# May be useful for academic and research purposes
# strict_rundir_verify = 0

# Compiler Settings
default:
CC = mpicc
CXX = mpicxx
FC = mpif90
# Compiler Version Flags
CC_VERSION_OPTION = -V
CXX_VERSION_OPTION = -V
FC_VERSION_OPTION = -V

# if using OpenMPI with UCX support, these settings are needed with use of CUDA Aware MPI
# without these flags, LBM is known to hang when using OpenACC and OpenMP Target to GPUs
preENV_UCX_MEMTYPE_CACHE=n
preENV_UCX_TLS=self,shm,cuda_copy

# MPI options and binding environment, dependent upon Model being run
# Adjust to match your system
MPIRUN_OPTS = --bind-to core #numa #none
# Note that SPH_EXA is known to hang when using multiple nodes with some versions of UCX,
# to work around, add the following setting:
#MPIRUN_OPTS += --mca topo basic

# The bindomp.pl wrapper line is kept commented out, so both branches launch
# the benchmark directly; the submit line must end in $command (the benchmark
# command substituted by the harness), not the literal word "command".
%ifdef %{bindomp}  # use the example bindomp.pl script
submit = mpirun --allow-run-as-root ${MPIRUN_OPTS} -np $ranks $command
#submit = mpirun --allow-run-as-root ${MPIRUN_OPTS} -np $ranks specperl $[top]/config/scripts/bindomp.pl $command
%else
submit = mpirun --allow-run-as-root ${MPIRUN_OPTS} -np $ranks $command
%endif

#######################################################################
# Optimization
#
# Note that SPEC baseline rules require that all uses of a given compiler
# use the same flags in the same order. See the SPEChpc Run Rules
# for more details
#      http://www.spec.org/hpc2021/Docs/runrules.html
#
# OPTIMIZE    = flags applicable to all compilers
# FOPTIMIZE   = flags applicable to the Fortran compiler
# COPTIMIZE   = flags applicable to the C compiler
# CXXOPTIMIZE = flags applicable to the C++ compiler
#
# See your compiler manual for information on the flags available
# for your compiler

# Compiler flags applied to all models
default=base=default:
OPTIMIZE = -w -Mfprelaxed -Mnouniform -Mstack_arrays -fast
CXXPORTABILITY = --c++17

# OpenACC (GPU) flags
%if %{model} eq 'acc'
pmodel=ACC
OPTIMIZE += -acc=gpu -Minfo=accel -DSPEC_ACCEL_AWARE_MPI
505.lbm_t,605.lbm_s,705.lbm_m,805.lbm_l:
PORTABILITY += -DSPEC_OPENACC_NO_SELF
%endif

# OpenACC (Multicore CPU) flags
%if %{model} eq 'accmc'
pmodel=ACC
OPTIMIZE += -acc=multicore -mp -Minfo=accel
505.lbm_t,605.lbm_s,705.lbm_m,805.lbm_l:
PORTABILITY += -DSPEC_OPENACC_NO_SELF
521.miniswp_t:
PORTABILITY+= -DSPEC_USE_HOST_THREADS=1
%endif

# OpenMP Threaded (CPU) flags
%if %{model} eq 'omp'
pmodel=OMP
OPTIMIZE += -mp -Minfo=mp
%endif

# OpenMP Targeting host flags
%if %{model} eq 'tgt'
pmodel=TGT
OPTIMIZE += -mp -Minfo=mp
# Note that while NVHPC added support for OpenMP
# array reduction in v22.2, a compiler issue
# prevents its use.  This may be used in future
# versions of the compiler, in which case remove
# -DSPEC_NO_VAR_ARRAY_REDUCE
513.soma_t,613.soma_s:
PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE -DSPEC_USE_HOST_THREADS=1
521.miniswp_t:
PORTABILITY+=-DSPEC_USE_HOST_THREADS=1
%endif

# OpenMP Targeting GPU flags
%if %{model} eq 'tgtgpu'
pmodel=TGT
OPTIMIZE += -mp=gpu -Minfo=mp
# Note that while NVHPC added support for OpenMP
# array reduction in v22.2, a compiler issue
# prevents its use.  This may be used in future
# versions of the compiler, in which case comment
# out the following two lines
513.soma_t,613.soma_s:
PORTABILITY+=-DSPEC_NO_VAR_ARRAY_REDUCE
%endif

# Peak tuning: basepeak=0 here means peak is built and run separately from
# base. Benchmarks that set basepeak=1 below reuse their base binaries and
# results; the others override ranks and OPTIMIZE for peak.
# NOTE(review): %{RANKS} is a preprocessor macro; the invocation line recorded
# at the top of this file does not show a --define RANKS=<n> - confirm it is
# supplied when peak is run.
default=peak=default:
basepeak=0

505.lbm_t=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -DSPEC_ACCEL_AWARE_MPI

513.soma_t=peak=default:
basepeak=1

518.tealeaf_t=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -Msafeptr -DSPEC_ACCEL_AWARE_MPI

519.clvleaf_t=peak=default:
basepeak=1

521.miniswp_t=peak=default:
basepeak=1

528.pot3d_t=peak=default:
basepeak=1

532.sph_exa_t=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

534.hpgmgfv_t=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -static-nvidia -DSPEC_ACCEL_AWARE_MPI

535.weather_t=peak=default:
ranks = %{RANKS}
OPTIMIZE = -w -fast -acc=gpu -O3 -Mfprelaxed -Mnouniform -Mstack_arrays -static-nvidia -DSPEC_ACCEL_AWARE_MPI

# The following section was added automatically, and contains settings that
# did not appear in the original configuration file, but were added to the
# raw file after the run.
default: