#!/bin/bash
#
# opnReqTest - driver for the operation requirement tests.
#
# For one test name (-n) it looks up the matching build options in rt.conf,
# compiles the required executables (std / bit / dbg), runs the requested
# test cases (std, thr, mpi, dcp, rst, bit, dbg, fhz), and finally summarizes
# the result in logs/OpnReqTests_<name>_<machine>.log.
#
# Run with -h for the list of options.
#
# Helper functions such as ecflow_stop, ecflow_run, ecflow_create_compile_task
# and ecflow_create_run_task are not defined in this file; they are expected
# to come from the sourced helper scripts (rt_utils.sh etc.).
set -eu

# Duplicate stderr onto fd 3. fd 3 is not used in this file
# (presumably it is for the sourced helpers - TODO confirm).
exec 3>&2
SECONDS=0

hostname

# Set to true once this process has created the lock directory, so that
# cleanup() never removes a lock owned by another running instance.
lock_acquired=false
# Guards cleanup() against running its body twice (signal trap + EXIT trap).
cleanup_done=false

#######################################
# Print a message and terminate with status 1.
# Arguments: message words
# Outputs:   blank line on stdout, message on stderr
#######################################
error() {
  echo
  echo "$@" 1>&2
  exit 1
}

#######################################
# Print the command line help.
# Globals: program (read)
#######################################
usage() {
  #set +x
  echo
  echo "Usage: $program -n TEST_NAME -a ACCOUNT [ -c TEST_CASE ] [-b] [-d] [-s] [-e] [-k] [-h] [-x] [-z]"
  echo
  echo " -a specify HPC ACCOUNT to use for batch job"
  echo
  echo " -n specify TEST_NAME"
  echo
  echo " -c specify TEST_CASE"
  echo " defaults to all test-cases: thr,mpi,dcp,rst,bit,dbg,fhz"
  echo " comma-separated list of any combination of std,thr,mpi,dcp,rst,bit,dbg,fhz"
  echo
  echo " -b test reproducibility for bit; compare against baseline"
  echo " -d test reproducibility for dbg; compare against baseline"
  echo " -s test reproducibility for std; compare against baseline"
  echo " -e use ecFlow workflow manager"
  echo " -k keep run directory"
  echo " -h display this help and exit"
  echo " -x skip compile"
  echo " -z skip run"
  echo
  echo " Examples"
  echo
  echo " 'opnReqTest -n fv3_control' tests all for fv3_control"
  echo " 'opnReqTest -n fv3_control -c rst' tests restart for fv3_control"
  echo " 'opnReqTest -n fv3_gfdlmp -c mpi,thr' tests mpi and threading for fv3_gfdlmp"
  echo " 'opnReqTest -n fv3_gfdlmp -c bit' tests bit for fv3_gfdlmp"
  echo " 'opnReqTest -n fv3_satmedmf -c dcp,dbg,rst' tests decomposition, debug, restart for fv3_satmedmf"
  echo
  #set -x
}

#######################################
# Print the help text and exit.
# Arguments: $1 - exit status
#######################################
usage_and_exit() {
  usage
  exit "$1"
}

#######################################
# Release resources and exit. Called from every trap.
# The lock directory is removed only if this process created it, and the
# body runs at most once even though the EXIT trap fires again after a
# signal/ERR trap has already called cleanup.
# Globals:   lockdir, lock_acquired, cleanup_done, ECFLOW (read)
# Arguments: $1 - exit status to terminate with (defaults to 0)
#######################################
cleanup() {
  local status=${1:-0}

  if [[ ${cleanup_done:-false} == false ]]; then
    cleanup_done=true
    if [[ ${lock_acquired:-false} == true && -n ${lockdir:-} ]]; then
      rm -rf "${lockdir}"
    fi
    if [[ ${ECFLOW:-false} == true ]]; then
      ecflow_stop
    fi
  fi

  exit "${status}"
}

#######################################
# Print the elapsed wall-clock time (hours are reported modulo one day)
# to stdout and append it to the test log.
# Globals: SECONDS, opnreqtest_log (read)
#######################################
report_elapsed_time() {
  local hrs mins secs elapsed_time
  hrs=$(( SECONDS % 86400 / 3600 ))
  mins=$(( SECONDS % 3600 / 60 ))
  secs=$(( SECONDS % 60 ))
  elapsed_time=$(printf '%02dh:%02dm:%02ds\n' "$hrs" "$mins" "$secs")
  echo "Elapsed time: ${elapsed_time}. Have a nice day!" | tee -a "${opnreqtest_log}"
}

#######################################
# Scan rt.conf for the RUN entry whose 2nd '|' field equals TEST_NAME and
# remember the 4th '|' field of the most recent COMPILE line before it.
# Globals: TEST_NAME (read); compile_opt, model_found, base_opt (written)
# Side effects: writes a copy of rt.conf to rt.temp
#######################################
find_build() {
  local line tmp_test_name

  # Initialise so that a RUN line appearing before any COMPILE line does not
  # trip 'set -u'.
  compile_opt=''

  cat rt.conf >rt.temp

  while read -r line || [[ -n "$line" ]]; do
    # strip leading whitespace, then skip blank lines and comments
    line="${line#"${line%%[![:space:]]*}"}"
    [[ ${#line} == 0 ]] && continue
    [[ $line == \#* ]] && continue

    if [[ $line =~ COMPILE ]]; then
      compile_opt=$(echo $line | cut -d'|' -f4 | sed -e 's/^ *//' -e 's/ *$//')
    elif [[ $line =~ RUN ]]; then
      tmp_test_name=$(echo $line | cut -d'|' -f2 | sed -e 's/^ *//' -e 's/ *$//')
      # quoted right-hand side: literal comparison, not a glob match
      if [[ $TEST_NAME == "$tmp_test_name" && -n $compile_opt ]]; then
        model_found=true
        base_opt=$compile_opt
        break
      fi
    fi
  done <'rt.temp'
}

#######################################
# Compile every case listed in compile_case (std, bit, dbg).
# For each case a compile_<case>.env file is written to RUNDIR_ROOT; the
# build is then either registered as an ecFlow task or run directly through
# ./compile.sh with its output captured in LOG_DIR.
# Globals: compile_case, TEST_NAME, ECFLOW, MACHINE_ID, RT_COMPILER,
#          RUNDIR_ROOT, LOG_DIR and the variables exported below (read);
#          model_found, base_opt, MAKE_OPT, COMPILE_NR (written)
#######################################
build_opnReqTests() {
  local name

  rm -f fv3_std.exe fv3_dbg.exe fv3_bit.exe modules.fv3_std modules.fv3_dbg modules.fv3_bit

  model_found=false
  base_opt=
  find_build
  if [[ $model_found == false ]]; then
    error "build options for $TEST_NAME not found. please take a look at rt.conf"
  fi

  for name in $compile_case; do
    case $name in
      std)
        MAKE_OPT=$base_opt
        ;;
      bit)
        # toggle the 32BIT option relative to the base build
        if [[ $base_opt =~ "-D32BIT=ON" ]]; then
          MAKE_OPT=$(echo $base_opt | sed -e 's/-D32BIT=ON//')
        else
          MAKE_OPT="$base_opt -D32BIT=ON"
        fi
        ;;
      dbg)
        MAKE_OPT="$base_opt -DDEBUG=ON"
        ;;
      *)
        error "unknown compile case: $name"
        ;;
    esac

    MAKE_OPT=$(echo $MAKE_OPT | sed -e 's/^ *//' -e 's/ *$//')
    export COMPILE_NR=${name}

    cat <<EOF > "${RUNDIR_ROOT}/compile_${name}.env"
export MACHINE_ID=${MACHINE_ID}
export JOB_NR=${COMPILE_NR}
export COMPILE_NR=${COMPILE_NR}
export PATHRT=${PATHRT}
export PATHTR=${PATHTR}
export SCHEDULER=${SCHEDULER}
export ACCNR=${ACCNR}
export QUEUE=${COMPILE_QUEUE}
export PARTITION=${PARTITION}
export ROCOTO=${ROCOTO}
export ECFLOW=${ECFLOW}
export REGRESSIONTEST_LOG=${REGRESSIONTEST_LOG}
export LOG_DIR=${LOG_DIR}
export RT_COMPILER=${RT_COMPILER}
EOF

    if [[ $ECFLOW == true ]]; then
      COMPILE_NR=$name
      ecflow_create_compile_task
    else
      echo "compiling $name with compile option $MAKE_OPT"
      ./compile.sh "$MACHINE_ID" "${MAKE_OPT}" "$name" "${RT_COMPILER}" >"${LOG_DIR}/compile_${TEST_NAME}_$name.log" 2>&1
      echo "done compiling $name"
    fi
  done
}

#######################################
# Run every case listed in run_case.
# For each case the test definition and the per-case script from
# opnReqTests/ are sourced, a run_test_<case>.env file is written to
# RUNDIR_ROOT, and the run is either registered as an ecFlow task or
# executed directly through ./run_test.sh.
# Globals: run_case, TEST_NAME, PATHRT, RUNDIR_ROOT, LOG_DIR, ECFLOW and the
#          variables exported below (read); JOB_NR, application, comp_nm,
#          RT_SUFFIX, BL_SUFFIX, CREATE_BASELINE, WLCLK, TEST_NR,
#          COMPILE_NR, DEP_RUN (written)
#######################################
run_opnReqTests() {
  local rc

  JOB_NR=0
  for rc in $run_case; do
    # if TEST_NAME specifies WARM_START true, error and exit
    source "${PATHRT}/tests/$TEST_NAME"
    if [[ ${WARM_START} == .T. ]]; then
      error "test-name cannot be a restart run (i.e. WARM_START=.T.)"
    fi

    if [[ $TEST_NAME =~ regional ]]; then
      application=regional
    elif [[ $TEST_NAME =~ cpld ]]; then
      application=cpld
    elif [[ $TEST_NAME =~ datm ]]; then
      application=datm
    elif [[ $TEST_NAME =~ atmw ]]; then
      application=atmw
    else
      application=global
    fi

    comp_nm=std
    RT_SUFFIX="_${rc}"
    # 10# forces base 10 so that zero-padded values are not read as octal
    JOB_NR=$( printf '%03d' $(( 10#$JOB_NR + 1 )) )

    case $rc in
      std_base)
        CREATE_BASELINE=true
        BL_SUFFIX=_std_base
        source "$PATHRT/opnReqTests/std.sh"
        ;;
      std)
        CREATE_BASELINE=false
        BL_SUFFIX=_std_base
        source "$PATHRT/opnReqTests/std.sh"
        ;;
      thr)
        CREATE_BASELINE=false
        BL_SUFFIX=_std_base
        source "$PATHRT/opnReqTests/thr.sh"
        ;;
      mpi)
        CREATE_BASELINE=false
        BL_SUFFIX=_std_base
        source "$PATHRT/opnReqTests/mpi.sh"
        ;;
      dcp)
        CREATE_BASELINE=false
        BL_SUFFIX=_std_base
        source "$PATHRT/opnReqTests/dcp.sh"
        ;;
      rst)
        # this is not going to work for regional model
        CREATE_BASELINE=false
        BL_SUFFIX=_std_base
        source "$PATHRT/opnReqTests/rst.sh"
        ;;
      bit_base)
        CREATE_BASELINE=true
        BL_SUFFIX=_bit_base
        comp_nm=bit
        source "$PATHRT/opnReqTests/bit.sh"
        ;;
      bit)
        CREATE_BASELINE=false
        BL_SUFFIX=_bit_base
        comp_nm=bit
        source "$PATHRT/opnReqTests/bit.sh"
        ;;
      dbg_base)
        CREATE_BASELINE=true
        BL_SUFFIX=_dbg_base
        comp_nm=dbg
        source "$PATHRT/opnReqTests/dbg.sh"
        WLCLK=60
        ;;
      dbg)
        CREATE_BASELINE=false
        BL_SUFFIX=_dbg_base
        comp_nm=dbg
        source "$PATHRT/opnReqTests/dbg.sh"
        WLCLK=60
        ;;
      fhz)
        CREATE_BASELINE=false
        BL_SUFFIX=_std_base
        source "$PATHRT/opnReqTests/fhz.sh"
        ;;
    esac

    cat <<EOF > "${RUNDIR_ROOT}/run_test${RT_SUFFIX}.env"
export JOB_NR=${JOB_NR}
export MACHINE_ID=${MACHINE_ID}
export RTPWD=${RTPWD}
export INPUTDATA_ROOT=${INPUTDATA_ROOT}
export INPUTDATA_ROOT_WW3=${INPUTDATA_ROOT_WW3}
export INPUTDATA_ROOT_BMIC=${INPUTDATA_ROOT_BMIC}
export PATHRT=${PATHRT}
export PATHTR=${PATHTR}
export NEW_BASELINE=${NEW_BASELINE}
export CREATE_BASELINE=${CREATE_BASELINE}
export RT_SUFFIX=${RT_SUFFIX}
export BL_SUFFIX=${BL_SUFFIX}
export SCHEDULER=${SCHEDULER}
export ACCNR=${ACCNR}
export QUEUE=${QUEUE}
export PARTITION=${PARTITION}
export ROCOTO=${ROCOTO}
export ECFLOW=${ECFLOW}
export REGRESSIONTEST_LOG=${REGRESSIONTEST_LOG}
export LOG_DIR=${LOG_DIR}
export skip_check_results=${skip_check_results}
export delete_rundir=${delete_rundir}
export RT_COMPILER=${RT_COMPILER}
export WLCLK=${WLCLK}
EOF

    if [[ $ECFLOW == true ]]; then
      TEST_NR=${RT_SUFFIX:1}
      COMPILE_NR=$comp_nm
      RT_COMPILER=${RT_COMPILER:-intel}
      # comparison runs depend on the baseline run of their build flavour
      case ${RT_SUFFIX} in
        _std | _thr | _mpi | _dcp | _rst | _fhz)
          DEP_RUN="${TEST_NAME}_${RT_COMPILER}_std_base"
          ;;
        _bit)
          DEP_RUN="${TEST_NAME}_${RT_COMPILER}_bit_base"
          ;;
        _dbg)
          DEP_RUN="${TEST_NAME}_${RT_COMPILER}_dbg_base"
          ;;
        *)
          DEP_RUN=''
          ;;
      esac
      ecflow_create_run_task
    else
      echo "Running test for $rc"
      echo " THRD: $THRD; INPES: $INPES; JNPES: $JNPES; TPN: $TPN"
      TEST_NR=${RT_SUFFIX:1}
      ./run_test.sh "$PATHRT" "$RUNDIR_ROOT" "$TEST_NAME" "$TEST_NR" "$comp_nm" > "$LOG_DIR/run_${TEST_NR}_${TEST_NAME}_${RT_COMPILER}${RT_SUFFIX}.log" #2>&1
    fi
  done
}

# Signal traps exit with the conventional 128+signal status. SIGKILL cannot
# be trapped, so no handler is installed for it.
trap 'echo opnReqTest interrupted; cleanup 130' INT
trap 'echo opnReqTest quit; cleanup 131' QUIT
trap 'echo opnReqTest terminated; cleanup 143' TERM
# Capture the failing status before echo resets $? to 0.
trap 'err_status=$?; echo opnReqTest error on line $LINENO; cleanup $err_status' ERR
trap '
  ret=$?
  if [ "$ret" -ne 0 ]; then
    echo >&2 "Died with error code $ret"
  else
    echo >&2 "opnReqTest finished"
  fi
  cleanup $ret' EXIT

########################################################################
####                       PROGRAM STARTS                           ####
########################################################################
program=$(basename "$0")
readonly program
# PATHRT - Path to operation requirement tests directory
PATHRT=$(cd "$(dirname "$0")" && pwd -P)
readonly PATHRT
# PATHTR - Path to trunk directory
PATHTR=$(cd "${PATHRT}/.." && pwd)
readonly PATHTR
readonly lockdir=${PATHRT}/lock_opnReqTest

export CI_TEST=${CI_TEST:-false}

if [[ $# -eq 0 ]]; then
  usage_and_exit 1
fi

# specify compiler
export RT_COMPILER=${RT_COMPILER:-intel}

# detect_machine sets MACHINE_ID
RT_MACHINE=${RT_MACHINE:-}
source detect_machine.sh
cd "$PATHRT"

# make sure only one instance of opnReqTest is running
if mkdir "$lockdir" 2>/dev/null; then
  lock_acquired=true
  echo $(hostname) $$ > "${lockdir}/PID"
else
  echo "lock directory ${lockdir} already exists; remove it manually if it is stale" >&2
  error "Only one instance of opnReqTest can be running at a time"
fi

# utility functions in rt_utils need to be able to see variables in opnReqTest
source rt_utils.sh
source module-setup.sh

rm -f fail_test* fail_compile*

# Machine-dependent libraries, modules, variables, etc.
if [[ $MACHINE_ID = hera ]]; then

  PYTHONHOME=/scratch1/NCEPDEV/nems/emc.nemspara/soft/miniconda3_new_20210629
  export PATH=$PYTHONHOME/bin:$PATH
  export PYTHONPATH=$PYTHONHOME/lib/python3.7/site-packages

  module load ecflow
  ECFLOW_START=ecflow_start.sh

  PARTITION=
  QUEUE=batch
  COMPILE_QUEUE=batch
  dprefix=/scratch1/NCEPDEV
  DISKNM=$dprefix/nems/emc.nemspara/RT
  STMP=${dprefix}/stmp4
  PTMP=${dprefix}/stmp2
  SCHEDULER=slurm

elif [[ $MACHINE_ID = orion ]]; then

  module load gcc/8.3.0

  export PATH=/work/noaa/nems/emc.nemspara/soft/miniconda3/bin:$PATH
  export PYTHONPATH=/work/noaa/nems/emc.nemspara/soft/miniconda3/lib/python3.8/site-packages
  ECFLOW_START=/work/noaa/nems/emc.nemspara/soft/miniconda3/bin/ecflow_start.sh
  ECF_PORT=$(( $(id -u) + 1500 ))

  QUEUE=batch
  COMPILE_QUEUE=batch
  PARTITION=orion
  dprefix=/work/noaa/stmp/${USER}
  DISKNM=/work/noaa/nems/emc.nemspara/RT
  STMP=$dprefix/stmp
  PTMP=$dprefix/stmp
  SCHEDULER=slurm

elif [[ $MACHINE_ID = hercules ]]; then

  module load contrib rocoto/1.3.5
  ROCOTORUN=$(which rocotorun)
  ROCOTOSTAT=$(which rocotostat)
  ROCOTOCOMPLETE=$(which rocotocomplete)

  module use /work/noaa/epic/role-epic/spack-stack/modulefiles
  module load ecflow/5.8.4-hercules
  ECFLOW_START=/work/noaa/epic/role-epic/spack-stack/ecflow-5.8.4-hercules/bin/ecflow_start.sh
  ECF_PORT=$(( $(id -u) + 1500 ))

  QUEUE=windfall
  COMPILE_QUEUE=windfall
  PARTITION=hercules
  dprefix=/work2/noaa/epic/${USER}
  DISKNM=/work/noaa/epic/hercules/UFS-WM_RT
  STMP=$dprefix/stmp
  PTMP=$dprefix/stmp
  SCHEDULER=slurm

  cp fv3_conf/fv3_slurm.IN_hercules fv3_conf/fv3_slurm.IN
  cp fv3_conf/compile_slurm.IN_hercules fv3_conf/compile_slurm.IN

elif [[ $MACHINE_ID = linux ]]; then

  PARTITION=
  QUEUE=
  COMPILE_QUEUE=
  dprefix=/home/builder
  DISKNM=${dprefix}/data
  STMP=${dprefix}/stmp4
  PTMP=${dprefix}/stmp2
  SCHEDULER=none

else
  error "unknown machine ID. edit detect_machine.sh file"
fi

# default variable values
TEST_NAME=
CREATE_BASELINE=
test_case="std thr mpi dcp rst bit dbg fhz"
ECFLOW=false
ROCOTO=false
std_compare=false
bit_compare=false
dbg_compare=false
keep_rundir=false
skip_compile=false
skip_run=false
skip_check_results=false
delete_rundir=false

# parse command line arguments to fill-in/modify the above default variables
while getopts :a:n:c:ekhbdsxz opt; do
  case $opt in
    a)
      ACCNR=$OPTARG
      ;;
    n)
      TEST_NAME=$OPTARG
      echo "test name: ${TEST_NAME}"
      ;;
    c)
      test_case=$OPTARG
      # turn the comma-separated list into a space-separated one
      test_case=$(echo $test_case | sed -e 's/^ *//' -e 's/ *$//' -e 's/,/ /g')
      for i in $test_case; do
        case $i in
          std | thr | mpi | dcp | rst | bit | dbg | fhz) ;;
          *) error "invalid test case specified: $i" ;;
        esac
      done
      ;;
    e) ECFLOW=true ;;
    k) keep_rundir=true ;;
    b) bit_compare=true ;;
    d) dbg_compare=true ;;
    s) std_compare=true ;;
    x) skip_compile=true ;;
    z) skip_run=true ;;
    h) usage_and_exit 0 ;;
    :)
      # the leading ':' in the optstring makes getopts report a missing
      # option argument here instead of printing its own message
      error "$program: option -$OPTARG requires an argument"
      ;;
    '?')
      error "$program: invalid option -$OPTARG"
      ;;
  esac
done

# some safety guards for input arguments
ACCNR=${ACCNR:-""}
if [[ -z "$ACCNR" ]]; then
  echo "Please use -a to set group account to use on HPC"
  exit 1
fi

if [[ -z $TEST_NAME ]]; then
  error "$program: test name is required. try 'opnReqTest -h' for usage"
fi

if [[ $skip_compile == true && $skip_run == true ]]; then
  error "$program: cannot skip both compile and run"
fi

if [[ $std_compare == true && ! $test_case =~ std ]]; then
  error "$program: std reproducibility test requires specifying std"
fi

if [[ $bit_compare == true && ! $test_case =~ bit ]]; then
  error "$program: bit reproducibility test requires specifying bit"
fi

if [[ $dbg_compare == true && ! $test_case =~ dbg ]]; then
  error "$program: debug reproducibility test requires specifying dbg"
fi

# load default variables and override as necessary
source default_vars.sh

# enumerate which case to compile and run
compile_case=
run_case=
for i in $test_case; do
  case $i in
    thr | mpi | dcp | rst | fhz)
      # these compare against the std baseline built from the std executable
      if [[ ! $compile_case =~ std ]]; then
        compile_case+=" std"
      fi
      if [[ ! $run_case =~ std_base ]]; then
        run_case+=" std_base"
      fi
      run_case+=" $i"
      ;;
    std)
      if [[ ! $compile_case =~ std ]]; then
        compile_case+=" std"
      fi
      if [[ ! $run_case =~ std_base ]]; then
        run_case+=" std_base"
      fi
      if [[ $std_compare == true ]]; then
        run_case+=" std"
      fi
      ;;
    bit)
      compile_case+=" bit"
      run_case+=" bit_base"
      if [[ $bit_compare == true ]]; then
        run_case+=" bit"
      fi
      ;;
    dbg)
      compile_case+=" dbg"
      run_case+=" dbg_base"
      if [[ $dbg_compare == true ]]; then
        run_case+=" dbg"
      fi
      ;;
  esac
done
compile_case=$(echo $compile_case | sed -e 's/^ *//' -e 's/ *$//')
run_case=$(echo $run_case | sed -e 's/^ *//' -e 's/ *$//')

# if there exists std_base in run_case, make sure it is the first
if [[ $run_case =~ std_base && ! $run_case =~ ^std_base ]]; then
  run_case=$(echo $run_case | sed -e 's/ std_base//')
  run_case="std_base $run_case"
fi

# log directory
export LOG_DIR=${PATHRT}/logs/log_opnReqTest_$MACHINE_ID
rm -rf "${LOG_DIR}"
mkdir -p "${LOG_DIR}"

# directory where all simulations are run
RUNDIR_ROOT=${RUNDIR_ROOT:-${PTMP}/${USER}}/FV3_OPNREQ_TEST/opnReqTest_$$
mkdir -p "${RUNDIR_ROOT}"

if [[ $ECFLOW == true ]]; then
  ECFLOW_RUN=${PATHRT}/ecflow_opnReqTest_run
  ECFLOW_SUITE=opnreqtest_$$
  rm -rf "${ECFLOW_RUN}"
  mkdir -p "${ECFLOW_RUN}/${ECFLOW_SUITE}"
  cp head.h tail.h "${ECFLOW_RUN}"

  # start the suite definition from an empty file, then append the header
  : > "${ECFLOW_RUN}/${ECFLOW_SUITE}.def"
  cat <<EOF >> "${ECFLOW_RUN}/${ECFLOW_SUITE}.def"
suite ${ECFLOW_SUITE}
edit ECF_HOME '${ECFLOW_RUN}'
edit ECF_INCLUDE '${ECFLOW_RUN}'
edit ECF_KILL_CMD kill -15 %ECF_RID% > %ECF_JOB%.kill 2>&1
edit ECF_TRIES 1
label src_dir '${PATHTR}'
label rundir_root '${RUNDIR_ROOT}'
limit max_builds 6
limit max_jobs 30
EOF

  if [[ $MACHINE_ID == hera ]]; then
    QUEUE=batch
  elif [[ $MACHINE_ID == hercules ]]; then
    QUEUE=windfall
  elif [[ $MACHINE_ID == orion ]]; then
    QUEUE=batch
  else
    error "ecflow is not supported on this machine $MACHINE_ID"
  fi
fi

opnreqtest_log=${PATHRT}/logs/OpnReqTests_${TEST_NAME}_$MACHINE_ID.log
REGRESSIONTEST_LOG=$opnreqtest_log
rm -f fail_test* fail_compile*
date >"${opnreqtest_log}"
echo "Start Operation Requirement Test" >> "${opnreqtest_log}"
echo >> "${opnreqtest_log}"

########################################################################
####                            COMPILE                             ####
########################################################################
# if skipping compile, check if all required executables exist. if not,
# exit with a non-zero number equal to the number of missing executables
num_of_missing_exe=0
if [[ $skip_compile == true ]]; then
  for i in $compile_case; do
    if [[ ! -f ${PATHRT}/fv3_$i.exe ]]; then
      echo "cannot find the required executable file fv3_$i.exe" >&2
      num_of_missing_exe=$(( num_of_missing_exe + 1 ))
    else
      echo "found the required executable file fv3_$i.exe" >&2
    fi
  done
  if [[ $num_of_missing_exe == 0 ]]; then
    echo "all required executables are found" >&2
  else
    exit $num_of_missing_exe
  fi
else
  echo "cases to compile: $compile_case"
  build_opnReqTests
fi

# if skipping run, stop after build. print out status information and
# exit with a non-zero number equal to the number of failed builds
num_of_failed_compilation=0
if [[ $skip_run == true ]]; then
  for i in $compile_case; do
    if [[ ! -f ${PATHRT}/fv3_$i.exe ]]; then
      echo "failed to generate executable file fv3_$i.exe" >&2
      num_of_failed_compilation=$(( num_of_failed_compilation + 1 ))
    fi
  done
  if [[ $num_of_failed_compilation == 0 ]]; then
    echo "all executables have been generated" >&2
  fi
  exit $num_of_failed_compilation
fi

########################################################################
####                              RUN                               ####
########################################################################
mkdir -p "${STMP}/${USER}"
NEW_BASELINE=${STMP}/${USER}/FV3_OPNREQ_TEST/OPNREQ_TEST
RTPWD=${NEW_BASELINE}
INPUTDATA_ROOT=${INPUTDATA_ROOT:-$DISKNM/NEMSfv3gfs/input-data-20221101}
INPUTDATA_ROOT_WW3=${INPUTDATA_ROOT}/WW3_input_data_20220624/
INPUTDATA_ROOT_BMIC=${INPUTDATA_ROOT_BMIC:-$DISKNM/NEMSfv3gfs/BM_IC-20220207}

rm -rf "$NEW_BASELINE"
mkdir -p "$NEW_BASELINE"

echo "cases to run: $run_case"
run_opnReqTests

if [[ $ECFLOW == true ]]; then
  echo "endsuite" >> "${ECFLOW_RUN}/${ECFLOW_SUITE}.def"
  ecflow_run
fi

########################################################################
####             OPERATION REQUIREMENT TEST STATUS                  ####
########################################################################
# from here on a failing command (e.g. cat on a missing log) must not abort
set +e
cat ${LOG_DIR}/compile_*_time.log | tee -a "${opnreqtest_log}"
cat ${LOG_DIR}/rt_*.log | tee -a "${opnreqtest_log}"

# collect every per-test / per-compile failure marker into one file
for f in fail_test_* fail_compile_*; do
  if [[ -f "$f" ]]; then
    cat "$f" >> fail_test
  fi
done

if [[ -e fail_test ]]; then
  echo "FAILED TESTS: " | tee -a "${opnreqtest_log}"
  while read -r failed_test_name; do
    echo "Test ${failed_test_name} failed " | tee -a "${opnreqtest_log}"
  done < fail_test
  echo "OPERATION REQUIREMENT TEST FAILED" | tee -a "${opnreqtest_log}"
  date >>"${opnreqtest_log}"
  report_elapsed_time
  exit 1
else
  if [[ ${keep_rundir} == false ]]; then
    rm -rf "${RUNDIR_ROOT}"
  fi
  echo "OPERATION REQUIREMENT TEST WAS SUCCESSFUL" | tee -a "${opnreqtest_log}"
  date >>"${opnreqtest_log}"
  report_elapsed_time
  exit 0
fi