#PBS -N hmon%NUM%_post2_%CYC%
#PBS -j oe
#PBS -S /bin/bash
#PBS -q %QUEUE%
#PBS -A %PROJ%-%PROJENVIR%
#PBS -l walltime=02:40:00
#PBS -l place=shared,select=1:ncpus=24:mpiprocs=24:mem=10GB
#PBS -l debug=true

# ecFlow task script for the second HMON post-processing job: it sets up the
# cycle environment, loads the required modules and runs JHMON_POST2.
# Names wrapped in percent signs are ecFlow variables filled in by ecFlow.
# Keep percent signs out of comments; ecFlow would try to interpret them.

# Trace every command into the job log.
set -x

# Shell-local only (not exported) at this point.
model=hmon

export cyc=%CYC%
export PDY=%PDY:""%

# Task count is the number of lines in the PBS node file. Reading through a
# quoted redirect yields just the number, with no file name to strip off.
export TOTAL_TASKS=$(wc -l < "${PBS_NODEFILE:?}")

# NOTE(review): the two include directives below carry no file argument in
# the text under review; confirm the intended headers against the deployed
# script.
%include
%include

export storm_num="%NUM%"

# The *_ver variables are not set in this script; presumably they come from
# the included headers or the job environment (TODO confirm).
module load PrgEnv-intel/${PrgEnv_intel_ver}
module load intel/${intel_ver}
module load craype/${craype_ver}
module load cray-mpich/${cray_mpich_ver}
module load cray-pals/${cray_pals_ver}
module load libjpeg/${libjpeg_ver}
module load grib_util/${grib_util_ver}
module load wgrib2/${wgrib2_ver}
#module load atp/${atp_ver}
module load hdf5/${hdf5_ver}
module load netcdf/${netcdf_ver}
module load python/${python_ver}

# Run the J-job that does the actual post-processing work.
${HOMEhmon}/jobs/JHMON_POST2

# NOTE(review): this include directive also has no file argument in the text
# under review; confirm the intended trailer file.
%include

%manual
PURPOSE:

Runs in parallel with the forecast job, converting native WRF output
files to GRIB2 files.  There are two copies of this job because that is
how many are needed to keep up with the forecast.  They communicate with
one another using lock files and an sqlite3 database, to prevent
duplication of work.

TROUBLESHOOTING

Most failures of this job fall into three categories:

 - model failed
 - operator error
 - system issues

If this job failed, check the model first.  If the model is stuck or
failed, that is why the post1/2 job failed.

What do I mean by "operator error?"

 * ALWAYS KILL AND REQUEUE THE fcst FAMILY AND post FAMILY to rerun the
   forecast model.  Never, under ANY circumstances, rerun just the model!

 * If the forecast runs successfully but you need to rerun post1 or
   post2, rerun both.
%end