!------------------------------------------------------------------------------
!
! regress_method.f90 -- Make probability forecasts by linear regression.
!
! This routine writes the probability output file, in addition to calculations.
!
! This is one of the top level method routines of the probability
! forecast module, a main component of the NOAA NCO/ARL/PSD bias
! correction system for CMAQ forecast outputs.
!
! 2020-aug-04	Original version.  By Dave Allured, NOAA/ESRL/PSL/CIRES.
!		Algorithm by Irina Djalalova, NOAA/ESRL/PSL/CIRES.
!
! * Remember to update the date in the module_id below.
!
! This probability module incorporates several major functions:
!
! * Read gridded model forecasts for a given forecast cycle.
! * Compute daily averages or 8-hour maximums of obs and corrected site data.
! * Compute cube root best fit of aggregate daily obs and corrected values.
! * Apply linear correction to cube root of forecast grids.
! * Compute final probability grids using cumulative distribution function.
! * Write probability output file.
!
! Primary inputs:
!
! * Gridded raw forecast file for current forecast cycle.
! * Bias corrected forecast values at sites, for full training period.
! * Obs data at site locations, for full training period.
! * Grid coordinates and elevations.
!
! Primary output:
!
! * Single data file with several computed daily probability forecast grids.
!
! Notes:
!
! Input and output file paths are specified with path templates.
! These contain YYYY MM DD HH substitution strings that will
! resolve to desired file names for the given forecast cycle.
! Templates may include full or relative paths, and they may
! begin with an $ENV environnment variable.
!
! Forecast date index:  Integer Gregorian date, relative to
! 1 = January 1 of base_year.  See the index_to_date library
! routine for details.
!
!------------------------------------------------------------------------------

module regress__method
contains

subroutine regress_method (grid_coord_file, output_file_template, &
      target_var, forecast_date, cycle_time, base_year, calendar, &
      filter_result_all, obs_reshaped, corr_grids, vmiss, prob, diag)

   use cdf__to_probability
   use config, only : dp
   use expand__filename
   use index_to_date_mod
   use make__daily
   use probability_type
   use read__grid_coords
   use regress_mod
   use write__probability_netcdf_lr
   implicit none

   character(*), parameter :: &
      module_id = 'regress_method.f90 version 2020-aug-04'

! Input arguments.

   character(*), intent(in) :: grid_coord_file	    ! aux. grid coordinate file
   character(*), intent(in) :: output_file_template ! output filename template
   character(*), intent(in) :: target_var	    ! target variable name

   integer,      intent(in) :: forecast_date	    ! target date index
   integer,      intent(in) :: cycle_time	    ! forecast cycle time
   integer,      intent(in) :: base_year	    ! base year for date indexes
   character(*), intent(in) :: calendar	  	    ! calendar system in use

   real(dp),     intent(in) :: filter_result_all(:,:,:)  ! corrected forecast
						    ! values, all days in period
						    ! (days, hours, sites)

   real(dp),     intent(in) :: obs_reshaped(:,:,:)  ! obs reshaped to model data
   						    ! (days, hours, sites)
						    ! vestigial var dim. removed

   real(dp),     intent(in) :: corr_grids (:,:,:)   ! bias corrected grids from
   						    ! spreading (X, Y, hours)

   real(dp),        intent(in) :: vmiss		    ! common missing value code
   type(prob_type), intent(in) :: prob		    ! probability config params
   integer,         intent(in) :: diag		    ! diag verbosity level, 0-N

! Local variables.

   character fdate_str*24, outfile*200
   character fmt1*60, fmt2*60, fmt3*60

   integer nx, ny
   integer d1, d2, fday, pday, npdays
   integer ndays_filter, ndays_avail, ndays_required
   integer year, month, day

   integer ithresh, nthresh
   integer nmiss_c, nmiss_f, nmiss_g, nmiss_o
   integer nvalid_c, nvalid_f, nvalid_g, nvalid_o

   real(dp) a, b, sigma
   real(dp) cmin, cmax, gmin, gmax, omin, omax

! Dynamic arrays.

   real(dp), allocatable :: grid_lats(:,:)	   ! grid coordinates (X, Y)
   real(dp), allocatable :: grid_lons(:,:)

   logical,  allocatable :: valid_mask(:,:)	   ! valid data mask
						   !   (tdays, sites, pdays)

   real(dp), allocatable :: obs_daily(:,:,:)	   ! daily averages or maximums
   real(dp), allocatable :: filter_daily(:,:,:)	   !   (tdays, sites, pdays)

   real(dp), allocatable :: grid_daily(:,:,:)	   ! daily averages or maximums
   						   !   (X, Y, pdays)

   real(dp), allocatable :: grid_cube_root(:,:)	   ! cube root original and
   real(dp), allocatable :: grid_fitted(:,:)	   !   fitted daily grids (X, Y)

   real(dp), allocatable :: error_variance(:,:)	   ! variance grid for CDF (X,Y)

   real(dp), allocatable :: thresh_cube_root(:)	   ! cube root prob. thresholds

   real(dp), allocatable :: obs_1d(:)		   ! aggregated obs and filter
   real(dp), allocatable :: filter_1d(:)	   !   packed daily values

! Probability output grids.  Map to CMAQ standard dimensions:
!   (X, Y, thresh, days) --> (X, Y, LEV, TSTEP)

   real(dp), allocatable :: probability(:,:,:,:)   ! (X, Y, thresh, pdays)
   real(dp), allocatable :: zscore(:,:,:,:)
   real(dp), allocatable :: cdf(:,:,:,:)

!-------------------------------------------------------
! Initialize.
!-------------------------------------------------------

   if (diag >= 2) print *
   if (diag >= 2) print *, 'regress_method: Begin linear regression method.'
   if (diag >= 2) print *, '  Module ID = ' // module_id

! Get dimensions.

   nx      = size (grid_lats, 1)	! grid dims from CMAQ coordinate grid
   ny      = size (grid_lats, 2)

   nthresh = size (prob%thresh)		! number of requested threshold values

! Decode the date index for the current forecast cycle.

   call index_to_date (forecast_date, year, month, day, base_year, calendar)
      						! get current Y M D integers

! Read CMAQ 2-D coordinate grids.

   if (diag >= 2) print *
   if (diag >= 2) print *, '  Read CMAQ 2-D coordinate grids.'

   call read_grid_coords (grid_coord_file, diag, grid_lats, grid_lons)

! Input diagnostics.

   if (diag >= 2) then
      print *
      print *, '  regress_method: Input summary:'

      fmt1 = '(5x,a,99i6)'
      print fmt1, 'shape (obs_reshaped)      = ', shape (obs_reshaped)
      print fmt1, 'shape (filter_result_all) = ', shape (filter_result_all)
      print fmt1, 'shape (corr_grids)        = ', shape (corr_grids)

      nmiss_o  = count (obs_reshaped      == vmiss)
      nmiss_f  = count (filter_result_all == vmiss)
      nmiss_g  = count (corr_grids        == vmiss)

      nvalid_o = count (obs_reshaped      /= vmiss)
      nvalid_f = count (filter_result_all /= vmiss)
      nvalid_g = count (corr_grids        /= vmiss)

      print *
      fmt2 = "(5x, a, ' valid =', i9, ', missing =', i9))"
      print fmt2, 'obs_reshaped,     ', nvalid_o, nmiss_o
      print fmt2, 'filter_result_all,', nvalid_f, nmiss_f
      print fmt2, 'corr_grids,       ', nvalid_g, nmiss_g
   end if

!-----------------------------------------------------------
! Make daily averages or maximums for best fit period.
!-----------------------------------------------------------

! The best fit period is the last N days of the training period.

   ndays_filter = size (filter_result_all, 1)  ! no. of bias corr. forecast days

   fday = ndays_filter			! forecast day index = last input day
   					! = day after end of training period

   d1 = fday - prob%ndays_best_fit	! start of best fit period
   d1 = max (1, d1)			! keep start day within training period

   d2 = fday - 1			! last day = end of training period

   ndays_avail    = d2 - d1 + 1		! actual days in best fit period
   ndays_required = 2			! minimum # days for best fit period

   if (ndays_avail < ndays_required-999) then
      print *
      print *, '*** regress_method: Insufficient training period for' &
         // ' probability forecast.  Abort.'
      print '(a,i0)',' *** Number of actual days available    = ',ndays_avail
      print '(a,i0)',' *** Number of days required for method = ',ndays_required
      call exit (1)
   end if

   if (diag >= 2) then
      print *
      fmt1 = '(5x,a,99i6)'
      print fmt1, 'd1, d2 for fit period  = ', d1, d2
      print fmt1, 'shape (obs_reshaped)   = ', shape (obs_reshaped)
      print fmt1, 'shape (obs fit subset) = ', shape (obs_reshaped(d1:d2,:,:))
   end if

! Make daily values for hourly input site data and hourly forecast grids,
! over best fit period.  Dimension order is indicated when calling.
! TD = training period days, FD = forecast days

   obs_daily    = make_daily (obs_reshaped(d1:d2,:,:),      'X,H,Y', vmiss, &
                             'obs_reshaped', &
                     cycle_time, prob, diag)	! (TD, S, FD) <-- (TD, H, S)

   filter_daily = make_daily (filter_result_all(d1:d2,:,:), 'X,H,Y', vmiss, &
                             'filter_result_all', &
                     cycle_time, prob, diag)	! (TD, S, FD) <-- (TD, H, S)

   grid_daily   = make_daily (corr_grids(:,:,:),            'X,Y,H', vmiss, &
                             'corr_grids', &
                     cycle_time, prob, diag)	! (X, Y, FD)  <-- (X, Y, H)

   if (diag >= 2) then
      print *
      fmt1 = '(5x,a,99i6)'
      print fmt1, 'shape (obs_daily)    = ', shape (obs_daily)
      print fmt1, 'shape (filter_daily) = ', shape (filter_daily)
      print fmt1, 'shape (grid_daily)   = ', shape (grid_daily)

      nmiss_o  = count (obs_daily    == vmiss)
      nmiss_f  = count (filter_daily == vmiss)
      nmiss_g  = count (grid_daily   == vmiss)

      nvalid_o = count (obs_daily    /= vmiss)
      nvalid_f = count (filter_daily /= vmiss)
      nvalid_g = count (grid_daily   /= vmiss)

      print *
      fmt2 = "(5x, a, ' valid =', i9, ', missing =', i9))"
      print fmt2, 'obs_daily,   ', nvalid_o, nmiss_o
      print fmt2, 'filter_daily,', nvalid_f, nmiss_f
      print fmt2, 'grid_daily,  ', nvalid_g, nmiss_g
   end if

!-------------------------------------------------------
! Initialize for main loop over forecast days.
!-------------------------------------------------------

! Get dimensions.

   nx      = size (grid_lats, 1)	! grid dims from CMAQ coordinate grid
   ny      = size (grid_lats, 2)

   npdays  = size (obs_daily, 3)	! no. probability days from daily maker
   nthresh = size (prob%thresh)		! number of requested threshold values

! Allocate fixed work arrays that span multiple forecast days.

   allocate (probability (nx, ny, nthresh, npdays))	! probability
   allocate (zscore      (nx, ny, nthresh, npdays))	! intermediate and
   allocate (cdf         (nx, ny, nthresh, npdays))	! result arrays

!------------------------------------------------------------
! Main loop over each available probability forecast day.
!------------------------------------------------------------

pday_loop: &
   do pday = 1, npdays

      call fdate (fdate_str)
      print *
      print '(a)','------------------------------------------------------------'
      print '(2a,i0,a)', fdate_str, '  Start probability forecast day ',pday,'.'
      print '(a)','------------------------------------------------------------'
      print *

! Combine all valid (obs + corrected) pairs for all sites into
! unordered 1-D arrays.  Omit missing values.

      valid_mask =       (obs_daily   (:,:,pday) /= vmiss) &
                   .and. (filter_daily(:,:,pday) /= vmiss)

      obs_1d     = pack (obs_daily   (:,:,pday), valid_mask)
      filter_1d  = pack (filter_daily(:,:,pday), valid_mask)
						! (N) <-- (TD, S, FD)

!-------------------------------------------------------
! Input diagnostics for linear regression.
!-------------------------------------------------------

! Summary of input data for current forecast day.

      if (diag >= 2) then
         omin = minval (obs_1d)
         omax = maxval (obs_1d)

         cmin = minval (filter_1d)
         cmax = maxval (filter_1d)

         gmin = minval (grid_daily(:,:,pday), (grid_daily(:,:,pday) /= vmiss))
         gmax = maxval (grid_daily(:,:,pday), (grid_daily(:,:,pday) /= vmiss))

         fmt1 = '(4x,a,2g14.4)'
         fmt3 = '(4x,a,99i6)'

         print '(a,i0,a)', ' regress_method: Daily input summary for' &
            // ' forecast day ', pday, ':'

         print *
         print fmt3, 'shape (obs_1d)    = ', shape (obs_1d)
         print fmt3, 'shape (filter_1d) = ', shape (filter_1d)
         print fmt3, 'size (grid 1 day) = ', size (grid_daily(:,:,pday))

         print *
         print fmt1, 'Daily site obs       min, max =', omin, omax
         print fmt1, 'Daily site corrected min, max =', cmin, cmax
         print fmt1, 'Daily grid corrected min, max =', gmin, gmax

         nmiss_o  = count (obs_daily   (:,:,pday) == vmiss)
         nmiss_c  = count (filter_daily(:,:,pday) == vmiss)
         nmiss_g  = count (grid_daily  (:,:,pday) == vmiss)

         nvalid_o = count (obs_daily   (:,:,pday) /= vmiss)
         nvalid_c = count (filter_daily(:,:,pday) /= vmiss)
         nvalid_g = count (grid_daily  (:,:,pday) /= vmiss)

         print *
         fmt2 = "(4x, a, ' valid =', i9, ', missing =', i9))"
         print fmt2, 'Daily site obs,      ', nvalid_o, nmiss_o
         print fmt2, 'Daily site corrected,', nvalid_c, nmiss_c
         print fmt2, 'Daily grid corrected,', nvalid_g, nmiss_g
         print *
      end if

!-----------------------------------------------------------
! Compute best fit of cube roots, all sites aggregated.
!-----------------------------------------------------------

! Convert to cube roots.  Missing values were previously eliminated.

      if (diag >= 2) print *, ' regress_method: Calculate cube roots.'

      thresh_cube_root = prob%thresh(:) ** (1d0 / 3)

      obs_1d    = obs_1d    ** (1d0 / 3)	! all valid data; missing values
      filter_1d = filter_1d ** (1d0 / 3)	!   were previously excluded

      grid_cube_root = grid_daily(:,:,pday)	! copy 1 grid for current day
      						! include missing values

      where (grid_cube_root(:,:) /= vmiss) &		! preserve missing vals
         grid_cube_root = grid_cube_root ** (1d0 / 3)	! in this calculation

      if (diag >= 2) then
         omin = minval (obs_1d)
         omax = maxval (obs_1d)

         cmin = minval (filter_1d)
         cmax = maxval (filter_1d)

         gmin = minval (grid_cube_root, (grid_cube_root /= vmiss))
         gmax = maxval (grid_cube_root, (grid_cube_root /= vmiss))

         fmt1 = '(4x,a,2g14.4)'
         print fmt1, 'Cube root site obs       min, max =', omin, omax
         print fmt1, 'Cube root site corrected min, max =', cmin, cmax
         print fmt1, 'Cube root grid corrected min, max =', gmin, gmax
         print *
      end if

! Compute best fit by linear regression.

      if (diag >= 2) print *, ' Compute site obs to site corrected best fit,' &
         // ' by linear regression.'

      call regress (obs_1d, filter_1d, diag, b, a, sigma)

      if (diag >= 2) then
         fmt1 = '(4x,a,g14.4)'
         print fmt1, 'a     = ', a
         print fmt1, 'b     = ', b
         print fmt1, 'sigma = ', sigma
         print *
      end if

! Compute fitted cube root daily forecast grid.

      if (diag >= 2) print *, ' Compute fitted cube root daily forecast grid.'

      grid_fitted = grid_cube_root(:,:)		! copy 1 grid for current day
      						! include missing values

      where (grid_cube_root(:,:) /= vmiss) &	! preserve missing values
         grid_fitted = a + b * grid_cube_root	! in this calculation

      if (diag >= 2) then
         gmin = minval (grid_fitted, (grid_fitted /= vmiss))
         gmax = maxval (grid_fitted, (grid_fitted /= vmiss))

         fmt1 = '(4x,a,2g14.4)'
         print fmt1, 'Cube root grid fitted min, max    =', gmin, gmax
         print *
      end if

!-------------------------------------------------------
! Compute probability grids using CDF.
!-------------------------------------------------------

      print *, ' Compute probability grids for ' // trim (target_var) // '.'

! Convert scalar sigma into variance grid for this routine.

      if (.not. allocated (error_variance)) allocate (error_variance(nx, ny))

      error_variance(:,:) = sigma ** 2

! Compute probabiliy grids for each requested threshold.

      do ithresh = 1, nthresh
         print *
         fmt1 = '(4x,2a,f8.3,1x,a)'
         print fmt1, trim (target_var), ', original threshold  =', &
            prob%thresh(ithresh), trim (prob%thresh_units)
         print fmt1, trim (target_var), ', cube root threshold =', &
            thresh_cube_root(ithresh)

         call cdf_to_probability ('fitted', grid_fitted, &
            error_variance, grid_lats, grid_lons, &
            thresh_cube_root(ithresh), vmiss, diag, &
            probability(:,:,ithresh,pday), &
            zscore(:,:,ithresh,pday), cdf(:,:,ithresh,pday))
      end do

   end do pday_loop

!-------------------------------------------------------
! Write probability output file.
!-------------------------------------------------------

   print *
   call fdate (fdate_str)
   print '(2a)', fdate_str, '  regress_method: Write output file.'

! Create actual output file name for current forecast cycle.

   call expand_filename (output_file_template, year, month, day, cycle_time, &
      outfile)

! Write output file.  Data and coordinate vars will be converted from
! double to single precision, when writing to file.

   call write_probability_netcdf_lr (outfile, target_var, grid_lats, &
      grid_lons, probability, prob, diag, grid_daily, zscore, cdf)

   if (diag >= 2) print *, &
      'regress_method: All done.  Return to main program.'

end subroutine regress_method
end module regress__method