#PBS -N hmon%NUM%_post2_%CYC%
#PBS -j oe
#PBS -S /bin/bash
#PBS -q %QUEUE%
#PBS -A %PROJ%-%PROJENVIR%
#PBS -l walltime=02:40:00
#PBS -l place=shared,select=1:ncpus=24:mpiprocs=24:mem=10GB
#PBS -l debug=true

# Trace every command into the job log.
set -x
model=hmon

# Cycle and date are substituted into this template before the job runs;
# the PDY placeholder carries an empty-string default for when no value is
# supplied.
export cyc=%CYC%
export PDY=%PDY:""%

# Task count = number of lines in the PBS node file.
# The path is quoted so that whitespace or glob characters in it cannot split
# the argument, and the file is read on stdin so that wc prints only the count
# (no trailing file name to strip).
# NOTE: the :? guard reports an unset or empty PBS_NODEFILE on stderr, but it
# sits inside the command substitution, so on its own it does not stop the job;
# TOTAL_TASKS is simply left empty in that case.
export TOTAL_TASKS=$(wc -l < "${PBS_NODEFILE:?}")

%include <head.h>
%include <envir-p1.h>

# Storm number for this job instance; the placeholder is substituted into the
# template before the script runs, and the value is exported for the J-job.
export storm_num="%NUM%"

# Load the environment modules used by this job.  Every version comes from a
# *_ver variable that this file does not set; presumably they are defined by
# the included headers or the job environment -- confirm.
# NOTE(review): keep the load order exactly as written; later modules may
# depend on earlier ones.
module load PrgEnv-intel/${PrgEnv_intel_ver}
module load intel/${intel_ver}
module load craype/${craype_ver}
module load cray-mpich/${cray_mpich_ver}
module load cray-pals/${cray_pals_ver}

# Libraries and utilities (the atp load below is intentionally left disabled
# as found in the original).
module load libjpeg/${libjpeg_ver}
module load grib_util/${grib_util_ver}
module load wgrib2/${wgrib2_ver}
#module load atp/${atp_ver}
module load hdf5/${hdf5_ver}
module load netcdf/${netcdf_ver}
module load python/${python_ver}

# Run the POST2 J-job script from the HMON installation directory
# (HOMEhmon is not set in this file and must already be in the environment).
${HOMEhmon}/jobs/JHMON_POST2

%include <tail.h>

%manual

PURPOSE: Runs in parallel with the forecast job, converting
native WRF output files to GRIB2 files.

There are two copies of this job because that is how many are needed to
keep up with the forecast.  They communicate with one another using lock
files and an sqlite3 database, to prevent duplication of work.

TROUBLESHOOTING

Most failures of this job fall in three categories:

  - model failed
  - operator error
  - system issues

If this job failed, check the model first.  If the model is stuck or
failed, that is why the post1/2 job failed.

What do I mean by "operator error?"

* ALWAYS KILL AND REQUEUE THE fcst FAMILY AND post FAMILY to rerun
    the forecast model.  Never, under ANY circumstances, rerun just the model!

* If the forecast runs successfully but you need to rerun post1 or post2, rerun both.

%end