#include "w3macros.h"
!/ ------------------------------------------------------------------- /
      MODULE W3PROFSMD
!/
!/                  +-----------------------------------+
!/                  | WAVEWATCH III           NOAA/NCEP |
!/                  |           Aron Roland             |
!/                  |         Fabrice Ardhuin           |
!/                  |                        FORTRAN 90 |
!/                  | Last update :         18-Aug-2016 |
!/                  +-----------------------------------+
!/
!/    XX-Nov-2007 : Origination.                        ( version 3.10 )
!/    03-Nov-2011 : Adding shoreline reflection         ( version 4.04 )
!/    03-Jun-2013 : Removed assign statements           ( version 4.10 )
!/    20-Jun-2013 : Update test output for time steps   ( version 4.10 )
!/    17-Oct-2013 : Removes boundary nodes from CFL     ( version 4.12 )
!/    15-Dec-2013 : Bug fix for implicit scheme         ( version 4.16 )
!/    18-Aug-2016 : Corrected boundary treatment        ( version 4.16 )
!
!  1. Purpose :
!
!     Propagation schemes for unstructured grids using fluctuation splitting
!
!  2. Variables and types :
!
!      Name      Type  Scope    Description
!     ----------------------------------------------------------------
!     ----------------------------------------------------------------
!
!  3. Subroutines and functions :
!
!      Name      Type  Scope    Description
!     ----------------------------------------------------------------
!      W3XYPUG       Subr. Public   Generic fluctuation splitting operations 
!      W3XYPFSN2     Subr. Public   advection with N scheme (Csik et al. 2002)
!      W3XYPFSPSI    Subr. Public   advection with FCT scheme
!      W3XYPFSFCT2   Subr. Public   advection with FCT scheme
!     ----------------------------------------------------------------
!
!  4. Subroutines and functions used :
!
!      Name      Type  Module   Description
!     ----------------------------------------------------------------
!     ----------------------------------------------------------------
!
!  5. Remarks :
!     For a detailed description of the schemes and their properties, see
!     Roland (2008), Ph.D. Thesis, T. U. Darmstadt. 
!
!  6. Switches :
!
!  7. Source code :
!/
!/ ------------------------------------------------------------------- /
!/
      PUBLIC
!/
      CONTAINS
!/ ------------------------------------------------------------------- /
   SUBROUTINE W3XYPUG ( ISP, FACX, FACY, DTG, VQ, VGX, VGY, LCALC )    
!/
!/                  +-----------------------------------+
!/                  | WAVEWATCH III           NOAA/NCEP |
!/                  |            Aron Roland            |
!/                  |                        FORTRAN 90 |
!/                  | Last update :         10-Jan-2011 |
!/                  +-----------------------------------+
!/
!/    10-Jan-2008 : Origination.                        ( version 3.13 )
!/    10-Jan-2011 : Addition of implicit scheme         ( version 3.14.4 )
!/
!  1. Purpose :
!
!     Propagation in physical space for a given spectral component.
!     Gives the choice of scheme on unstructured grid
!
!  2. Method :
!
!     
!
!  3. Parameters :
!
!     Parameter list
!     ----------------------------------------------------------------
!       ISP     Int.   I   Number of spectral bin (IK-1)*NTH+ITH
!       FACX/Y  Real   I   Factor in propagation velocity.
!                          ( 1 or 0 * DT / DX )
!       DTG     Real   I   Total time step.
!       VQ      R.A.  I/O  Field to propagate.
!       VGX/Y   Real   I   Speed of grid.
!     ----------------------------------------------------------------
!
!     Local variables.
!     ----------------------------------------------------------------
!       VCFL0X  R.A.  Local courant numbers for absolute group vel.
!                     using local X-grid step.
!       VCFL0Y  R.A.  Id. in Y.
!     ----------------------------------------------------------------
!
!  4. Subroutines used :
!

!  5. Called by :
!
!       W3WAVE   Wave model routine.
!
!  6. Error messages :
!
!       None.
!
!  7. Remarks :
!              make the interface between the WAVEWATCH and the WWM code. 
!
!  8. Structure :
!
!
!  9. Switches :
!
!       !/S     Enable subroutine tracing.
!
!
! 10. Source code :     
!/ ------------------------------------------------------------------- /
!/
!
      USE CONSTANTS
!
      USE W3TIMEMD, ONLY: DSEC21
!
      USE W3GDATMD, ONLY: NX, NY, NSEA, MAPSF, MAPFS, DTCFL, CLATS,   &
                          FLCX, FLCY, NK, NTH, DTH, XFR,              &
                          ECOS, ESIN, SIG,  PFMOVE,IEN,               &
                          NTRI, TRIGP, CCON ,                         &
                          IE_CELL, POS_CELL, IOBP, IOBPD,             &
                          FSN, FSPSI, FSFCT, FSNIMP, GTYPE, UNGTYPE

      USE W3WDATMD, ONLY: TIME
      USE W3ODATMD, ONLY: TBPI0, TBPIN, FLBPI
      USE W3ADATMD, ONLY: CG, CX, CY, ATRNX, ATRNY, ITIME, CFLXYMAX
      USE W3IDATMD, ONLY: FLCUR
!      USE W3ODATMD, ONLY: NDSE, NDST, FLBPI, NBI, TBPI0, TBPIN,       &
!                          ISBPI, BBPI0, BBPIN
!/S      USE W3SERVMD, ONLY: STRACE
      
      IMPLICIT NONE
!/ ------------------------------------------------------------------- /
!/ Parameter list
!/
      INTEGER, INTENT(IN)     :: ISP
      REAL, INTENT(IN)        :: FACX, FACY, DTG, VGX, VGY
      REAL, INTENT(INOUT)     :: VQ(1-NY:NY*(NX+2))
      LOGICAL, INTENT(IN)     :: LCALC
!/
!/ ------------------------------------------------------------------- /
!/ Local parameters
!/
      INTEGER                 :: ITH, IK, ISEA, IXY
      INTEGER                 :: IX
!/S      INTEGER, SAVE           :: IENT = 0
      REAL                    :: CCOS, CSIN, CCURX, CCURY
      REAL                    :: C(NX,2)
      REAL                    :: RD1, RD2
!/
!/ Automatic work arrays
!/
      REAL                    :: VLCFLX((NX+1)*NY), VLCFLY(NX*NY)
      DOUBLE PRECISION        :: AC(NX)
!/ ------------------------------------------------------------------- /                                     
!
! 1.  Preparations --------------------------------------------------- *
! 1.a Set constants
!      
      
!/S      CALL STRACE (IENT, 'W3XYPUG')      
      ITH    = 1 + MOD(ISP-1,NTH)
      IK     = 1 + (ISP-1)/NTH     
   
      CCOS   = FACX * ECOS(ITH) 
      CSIN   = FACY * ESIN(ITH) 
      CCURX  = FACX 
      CCURY  = FACY  
!
! 1.b Initialize arrays
!
      VLCFLX = 0.
      VLCFLY = 0.
!
! 1.c Set depth 
!
      CALL SETDEPTH
!
!
! 2.  Calculate velocities ---------------- *
! 
      DO ISEA=1, NSEA
        IXY         =  MAPSF(ISEA,3)
        VQ(IXY)     = VQ(IXY) / CG(IK,ISEA) * CLATS(ISEA)
        VLCFLX(IXY) = CCOS * CG(IK,ISEA) / CLATS(ISEA)
        VLCFLY(IXY) = CSIN * CG(IK,ISEA)
!/MGP        VLCFLX(IXY) = VLCFLX(IXY) - CCURX*VGX/CLATS(ISEA)
!/MGP        VLCFLY(IXY) = VLCFLY(IXY) - CCURY*VGY
      END DO  

      IF ( FLCUR ) THEN   
        DO ISEA=1, NSEA
          IXY =  MAPSF(ISEA,3)
!
! Currents are not included on coastal boundaries (IOBP(IXY).EQ.0)
!
          IF (IOBP(IXY) .EQ. 1) THEN                                  
            VLCFLX(IXY) = VLCFLX(IXY) + CCURX*CX(ISEA)/CLATS(ISEA)
            VLCFLY(IXY) = VLCFLY(IXY) + CCURY*CY(ISEA)
          END IF    
        END DO     
      END IF

!
! 3. initialize fluctuation splitting arrays ( to fit with WWM notations)    
!
      AC(1:NX) = DBLE(VQ(1:NX))  
      C(1:NX,1)  = VLCFLX(1:NX)
      C(1:NX,2)  = VLCFLY(1:NX)
!
! 4. Prepares boundary update
!
      IF ( FLBPI ) THEN
        RD1    = DSEC21 ( TBPI0, TIME )
        RD2    = DSEC21 ( TBPI0, TBPIN )
      ELSE
        RD1=1.
        RD2=0.
      END IF
!
! 4. propagate using the selected scheme 
!      
       IF (FSN) THEN
         CALL W3XYPFSN2 (ISP, C, LCALC, RD1, RD2, DTG, AC)
       ELSE IF (FSPSI) THEN
         CALL W3XYPFSPSI2 (ISP, C, LCALC, RD1, RD2, DTG, AC)
       ELSE IF (FSFCT) THEN
         CALL W3XYPFSFCT2 (ISP, C, LCALC, RD1, RD2, DTG, AC)
       ELSE IF (FSNIMP) THEN
         CALL W3XYPFSNIMP(ISP, C, LCALC, RD1, RD2, DTG, AC)
       ENDIF  
!
       DO IX=1,NX
         ISEA=MAPFS(1,IX)
         VQ(IX)=REAL(AC(IX))
         ENDDO          
    
! 6.  Store results in VQ in proper format --------------------------- *
!
!/C90/!DIR$ IVDEP
!
      DO ISEA=1, NSEA
        IXY   = MAPSF(ISEA,3)
        VQ(IXY) =  MAX ( 0. , CG(IK,ISEA)/CLATS(ISEA)*VQ(IXY) )
      END DO
   END SUBROUTINE W3XYPUG
!/ ------------------------------------------------------------------- /
   SUBROUTINE W3CFLUG ( ISEA, NKCFL, FACX, FACY, DT, MAPFS, CFLXYMAX,    &
                          VGX, VGY )    
!/
!/                  +-----------------------------------+
!/                  | WAVEWATCH III           NOAA/NCEP |
!/                  |           Fabrice Ardhuin         |
!/                  |                        FORTRAN 90 |
!/                  | Last update :         20-Jun-2013 |
!/                  +-----------------------------------+
!/
!/    01-Mar-2011 : Origination.                        ( version 3.14 )
!/    20-Jun-2013 : Computes only up to NKCFL for tests ( version 4.10 )
!/     1-Jun-2017 : Rewrite routine for performance     ( version 5.xx ) 
!/
!  1. Purpose :
!
!     Computes the max CFL number for output purposes
!
!  2. Method :
!
!     
!
!  3. Parameters :
!
!     Parameter list
!     ----------------------------------------------------------------
!       ISEA     Int.   I   Index of sea point
!       NKCFL    Int.   I   Maximum frequency index
!       FACX/Y   Real   I   Factor in propagation velocity.
!                          ( 1 or 0 * DT / DX )
!       DT       Real   I   Time step.
!       MAPFS    I.A.   I   Storage map.
!       CFLXYMAX Real       Maxmimum CFL for spatial advection
!       VGX/Y    Real   I   Speed of grid.
!     ----------------------------------------------------------------
!
!     Local variables.
!     ----------------------------------------------------------------
!       VCFL0X  R.A.  Local courant numbers for absolute group vel.
!                     using local X-grid step.
!       VCFL0Y  R.A.  Id. in Y.
!     ----------------------------------------------------------------
!
!  4. Subroutines used :
!

!  5. Called by :
!
!       W3WAVE   Wave model routine.
!
!  6. Error messages :
!
!       None.
!
!  7. Remarks :
!              make the interface between the WAVEWATCH and the WWM code. 
!
!  8. Structure :
!
!
!  9. Switches :
!
!       !/S     Enable subroutine tracing.
!
!
! 10. Source code :     
!/ ------------------------------------------------------------------- /
!/
!
      USE CONSTANTS
!
      USE W3TIMEMD, ONLY: DSEC21
!
      USE W3GDATMD, ONLY: NX, NY, NSEA, MAPSF, DTCFL, CLATS,          &
                          FLCX, FLCY, NK, NTH, DTH, XFR,              &
                          ECOS, ESIN, SIG,  PFMOVE,IEN, INDEX_CELL,   &
                          NTRI, TRIGP, CCON ,                         &
                          IE_CELL, POS_CELL, COUNTRI, SI, IOBP, XYB

      USE W3ADATMD, ONLY: CG, CX, CY, ATRNX, ATRNY, ITIME, DW
      USE W3IDATMD, ONLY: FLCUR
!/T      USE W3ODATMD, ONLY: NDSE, NDST, FLBPI, NBI, TBPI0, TBPIN,       &
!/T                          ISBPI, BBPI0, BBPIN
!/S      USE W3SERVMD, ONLY: STRACE
      
      IMPLICIT NONE
!/ ------------------------------------------------------------------- /
!/ Parameter list
!/
      INTEGER, INTENT(IN)     :: ISEA, NKCFL, MAPFS(NY*NX)
      REAL, INTENT(IN)        :: FACX, FACY, DT, VGX, VGY
      REAL, INTENT(INOUT)     :: CFLXYMAX
!/
!/ ------------------------------------------------------------------- /
!/ Local parameters
!/
      INTEGER                 :: ITH, IK    
      INTEGER                 :: IP, IP2, ISEA2, I, J, IE, IV, I1, I2, I3
!/S      INTEGER, SAVE           :: IENT = 0
      REAL                    :: CCOS, CSIN, CCURX, CCURY
      REAL                    :: C(NX,2)
      REAL*8                  :: KELEM(3), KTMP(3), LAMBDA(2)
      REAL*8                  :: KKSUM, DTMAXEXP
!/
!/  Velocities 
!/
       REAL*8,  PARAMETER :: ONESIXTH  = 1.0d0/6.0d0
       REAL*8,  PARAMETER :: THR8      = TINY(1.0d0)
       REAL,    PARAMETER :: THR       = TINY(1.0) 

!/ ------------------------------------------------------------------- /                                     
!
! 1.  Preparations --------------------------------------------------- *
! 1.a Set constants
!      
      
!/S      CALL STRACE (IENT, 'W3CFLUG')      
      CFLXYMAX=1E-10
      IP = MAPSF(ISEA,3)
!
! CFL no important on boundary 
!
      IF (IOBP(IP).EQ.1) THEN       
      CCURX  = FACX 
      CCURY  = FACY  
!
! Loop over spectral components
!
      DO IK=1,NKCFL
        DO ITH=1,NTH
          CCOS   = FACX * ECOS(ITH) 
          CSIN   = FACY * ESIN(ITH) 
          C(:,:)=0.

!
! 2.  Calculate advection velocities: group speed ---------------- *
! 
!AR: needs to be rewritten for speed ... single node computation is costly ... 
!MA: you are right but now it is only called if CFX and UNST for CFL profiling

          DO I = INDEX_CELL(IP), INDEX_CELL(IP+1)-1
            IE=IE_CELL(I)       ! TRIGP(IE,IV)=IP  with IV=POS_CELL(I)
            DO J=1,3
              IP2=TRIGP(IE,J)
              ISEA2=MAPFS(IP2)
              C(IP2,1) = CCOS * CG(IK,ISEA2) / CLATS(ISEA2)
              C(IP2,2) = CSIN * CG(IK,ISEA2)
!/MGP              C(IP2,1) = C(IP2,1) - CCURX*VGX/CLATS(ISEA2)
!/MGP              C(IP2,2) = C(IP2,2) - CCURY*VGY
              IF ( FLCUR ) THEN   
                IF (IOBP(IP2) .EQ. 1) THEN                                  
                  C(IP2,1) = C(IP2,1) + CCURX*CX(ISEA2)/CLATS(ISEA2)
                  C(IP2,2) = C(IP2,2) + CCURY*CY(ISEA2)
                  END IF    
                END IF  ! end of ( FLCUR )
              END DO
            END DO
!
!3.     Calculate K-Values and contour based quantities ...
!
          KKSUM = 0.d0
          DO I = INDEX_CELL(IP), INDEX_CELL(IP+1)-1
            IE=IE_CELL(I)       ! TRIGP(IE,IV)=IP  
            IV=POS_CELL(I)
            I1 = TRIGP(IE,1) 
            I2 = TRIGP(IE,2)
            I3 = TRIGP(IE,3)
            LAMBDA(1) = ONESIXTH *(C(I1,1)+C(I2,1)+C(I3,1)) ! Advection speed in X direction
            LAMBDA(2) = ONESIXTH *(C(I1,2)+C(I2,2)+C(I3,2)) ! Advection speed in Y direction
            KELEM(1) = LAMBDA(1) * IEN(IE,1) + LAMBDA(2) * IEN(IE,2) ! K-Values - so called Flux Jacobians 
            KELEM(2) = LAMBDA(1) * IEN(IE,3) + LAMBDA(2) * IEN(IE,4) ! K-Values - so called Flux Jacobians 
            KELEM(3) = LAMBDA(1) * IEN(IE,5) + LAMBDA(2) * IEN(IE,6) ! K-Values - so called Flux Jacobians 

            KTMP        = KELEM ! Copy
            KELEM       = MAX(0.d0,KTMP)

            KKSUM = KKSUM  +  KELEM(IV)
            END DO ! COUNTRI
!
          DTMAXEXP = SI(IP)/MAX(DBLE(10.E-10),KKSUM)
          CFLXYMAX = MAX(DBLE(DT)/DTMAXEXP,DBLE(CFLXYMAX))
          END DO
        END DO
        END IF
!
      RETURN    
 END SUBROUTINE W3CFLUG    
!/ ------------------------------------------------------------------- /

      SUBROUTINE W3XYPFSN2 ( ISP, C, LCALC, RD10, RD20, DT, AC)
     
!/
!/
!/                  +-----------------------------------+
!/                  |  WWIII Version of the WWM FS Code |
!/                  |  by Aron Roland                   |
!/                  |     and Fabrice  Ardhuin          |
!/                  |  for use in WWIII                 |
!/                  |  GPL License                      |
!/                  |  Last update :        03-Nov-2011 |
!/                  +-----------------------------------+
!/
!/    19-Dec-2007 : Origination.                        ( version 3.13 )
!/    25-Aug-2011 : Change of method for IOBPD          ( version 4.04 )
!/    03-Nov-2011 : Addition of shoreline reflection    ( version 4.04 )
!/
!/
!  1. Purpose :
!     Advection of a scalar in a arbitary velocity field on unstructured meshes
!     for the conservative hyperbolic equation N,t + (c*N),xy = 0 in spatial space
!     This is the standard explicit N-Scheme from Roe as formulated in Abgrall
!
!  2. Method :
!
!  3. Parameters :
!
!     Parameter list
!     ----------------------------------------------------------------
!     ----------------------------------------------------------------
!
!  4. Subroutines used :
!
!       STRACE   Subroutine tracing (!/S switch)
!
!  5. Called by :
!
!       W3XYPUG   Routine for advection on unstructured grid
!
!  6. Error messages :
!
!       None.
!
!  7. Remarks :
!
!  8. Structure :
!
!     See source code.
!
!  9. Switches :
!
!     !/S  Enable subroutine tracing.
!
! 10. Source code :
!
!/ ------------------------------------------------------------------- /
!/
       USE W3GDATMD, ONLY : NK, NTH, NTRI, NX, CCON, IE_CELL,POS_CELL, SI, &
                            IEN, TRIGP, CLATS, MAPSF, IOBPD, IOBP, IOBDP, IOBPA, XYB
!/REF1   USE W3GDATMD, ONLY : REFPARS
       USE W3WDATMD, ONLY: TIME      
       USE W3ADATMD, ONLY: CG, ITER    
       USE W3ODATMD, ONLY: NDSE, NDST, FLBPI, NBI, TBPI0, TBPIN, ISBPI, BBPI0, BBPIN
       USE W3TIMEMD, ONLY: DSEC21
!/S      USE W3SERVMD, ONLY: STRACE
       IMPLICIT NONE  
     
       INTEGER, INTENT(IN)    :: ISP                   ! Actual Frequency/Wavenumber, actual Wave Direction
       REAL,    INTENT(IN)    :: DT                    ! Time intervall for which the advection should be computed for the given velocity field
       REAL,    INTENT(IN)    :: C(:,:)                ! Velocity field in it's X- and Y- Components, 
       DOUBLE PRECISION, INTENT(INOUT):: AC(:)         ! Wave Action before and after advection
       REAL,    INTENT(IN)    :: RD10, RD20            ! Time interpolation coefficients for boundary conditions
       LOGICAL, INTENT(IN)    :: LCALC                 ! Switch for the calculation of the max. Global Time step
!/
!/ ------------------------------------------------------------------- /
!/ Parameter list
!/
!/
!/ ------------------------------------------------------------------- /
!/ Local parameters
!/
!/S      INTEGER, SAVE           :: IENT = 0

       REAL*8,  PARAMETER :: ONESIXTH  = 1.0d0/6.0d0
       REAL*8,  PARAMETER :: THR8      = TINY(1.0d0)
       REAL,    PARAMETER :: THR       = TINY(1.0) 
!/
!/ ------------------------------------------------------------------- /
!/
!
! local integer
!
         INTEGER :: IP, IE, IT, I1, I2, I3, ITH, IK    
         INTEGER :: IBI, NI(3)
!
! local real
!
         REAL    :: RD1, RD2
!:
! local double
!
         REAL*8  :: UTILDE, BOUNDARY_FORCING
         REAL*8  :: CFLXY
         REAL*8  :: FL11, FL12, FL21, FL22, FL31, FL32
         REAL*8  :: FL111, FL112, FL211, FL212, FL311, FL312
         REAL*8  :: DTSI(NX), U(NX)
         REAL*8  :: DTMAXGL, DTMAXEXP, REST
         REAL*8  :: LAMBDA(2), KTMP(3)
         REAL*8  :: KELEM(3,NTRI), FLALL(3,NTRI)
         REAL*8  :: KKSUM(NX), ST(NX)
         REAL*8  :: NM(NTRI) 

!/S      CALL STRACE (IENT, 'W3XYPFSN')

! 1. initialisation

         ITH    = 1 + MOD(ISP-1,NTH)
         IK     = 1 + (ISP-1)/NTH 
         DTMAXGL = DBLE(10.E10)
!
!2       Propagation 
!2.a     Calculate K-Values and contour based quantities ...
!
         DO IE = 1, NTRI ! I precacalculate this arrays below as I assume that current velocity  changes continusly ... 
           I1 = TRIGP(IE,1) ! Index of the Element Nodes (TRIGP)
           I2 = TRIGP(IE,2)
           I3 = TRIGP(IE,3)
           LAMBDA(1) = ONESIXTH *(C(I1,1)+C(I2,1)+C(I3,1)) ! Linearized advection speed in X and Y direction
           LAMBDA(2) = ONESIXTH *(C(I1,2)+C(I2,2)+C(I3,2))
           KELEM(1,IE) = LAMBDA(1) * IEN(IE,1) + LAMBDA(2) * IEN(IE,2) ! K-Values - so called Flux Jacobians 
           KELEM(2,IE) = LAMBDA(1) * IEN(IE,3) + LAMBDA(2) * IEN(IE,4)
           KELEM(3,IE) = LAMBDA(1) * IEN(IE,5) + LAMBDA(2) * IEN(IE,6)
!
           KTMP        = KELEM(:,IE) ! Copy
           NM(IE)      = - 1.D0/MIN(-THR8,SUM(MIN(0.d0,KTMP))) ! N-Values 
           KELEM(:,IE) = MAX(0.d0,KTMP)
           FL11  = C(I2,1) * IEN(IE,1) + C(I2,2) * IEN(IE,2) ! Weights for Simpson Integration 
           FL12  = C(I3,1) * IEN(IE,1) + C(I3,2) * IEN(IE,2)
           FL21  = C(I3,1) * IEN(IE,3) + C(I3,2) * IEN(IE,4)
           FL22  = C(I1,1) * IEN(IE,3) + C(I1,2) * IEN(IE,4)
           FL31  = C(I1,1) * IEN(IE,5) + C(I1,2) * IEN(IE,6)
           FL32  = C(I2,1) * IEN(IE,5) + C(I2,2) * IEN(IE,6)
           FL111 = 2.d0*FL11+FL12
           FL112 = 2.d0*FL12+FL11
           FL211 = 2.d0*FL21+FL22
           FL212 = 2.d0*FL22+FL21
           FL311 = 2.d0*FL31+FL32
           FL312 = 2.d0*FL32+FL31
           FLALL(1,IE) = (FL311 + FL212) * ONESIXTH + KELEM(1,IE)
           FLALL(2,IE) = (FL111 + FL312) * ONESIXTH + KELEM(2,IE)
           FLALL(3,IE) = (FL211 + FL112) * ONESIXTH + KELEM(3,IE)
           ! IF (I1.EQ.1.OR.I2.EQ.1.OR.I3.EQ.1) WRITE(6,*) 'TEST N1 :',IK,ITH,IP,IE,KELEM(:,IE),'##',LAMBDA
           END DO ! NTRI

           IF (LCALC) THEN ! If the current field or water level changes estimate the iteration number based on the new flow field and the CFL number of the scheme
             KKSUM = 0.d0
             DO IE = 1, NTRI
               NI = TRIGP(IE,:)
               KKSUM(NI) = KKSUM(NI) + KELEM(:,IE)
             END DO ! IE
             DTMAXEXP = 1E10 ! initialize to large number 
             DO IP = 1, NX
               DTMAXEXP = SI(IP)/MAX(DBLE(10.E-10),KKSUM(IP)*IOBDP(IP))
               DTMAXGL  = MIN( DTMAXGL, DTMAXEXP)
             END DO ! IP
             CFLXY = DBLE(DT)/DTMAXGL
             REST  = ABS(MOD(CFLXY,1.0d0))
             IF (REST .LT. THR8) THEN
               ITER(IK,ITH) = ABS(NINT(CFLXY))
             ELSE IF (REST .GT. THR8 .AND. REST .LT. 0.5d0) THEN
               ITER(IK,ITH) = ABS(NINT(CFLXY)) + 1
             ELSE
               ITER(IK,ITH) = ABS(NINT(CFLXY))
             END IF
           END IF ! LCALC
 
           DO IP = 1, NX               
             DTSI(IP) = DBLE(DT)/DBLE(ITER(IK,ITH))/SI(IP) ! Some precalculations for the time integration.
           END DO

           DO IT = 1, ITER(IK,ITH)
             U = AC
             ST = 0.d0
             DO IE = 1, NTRI
               NI     = TRIGP(IE,:)
               UTILDE = NM(IE) * (DOT_PRODUCT(FLALL(:,IE),U(NI)))
               ST(NI) = ST(NI) + KELEM(:,IE) * (U(NI) - UTILDE) ! the 2nd term are the theta values of each node ...
               END DO ! IE

             DO IP = 1, NX
!
! IOBPD=0  : waves coming from land (or outside grid) 
! Possibly set flux to zero by multiplying ST by IOBPD but also in UTILDE multiply U(NI) by IOBPD ...
!
               U(IP) = MAX(0.d0,U(IP)-DTSI(IP)*ST(IP)*(1-IOBPA(IP)))*DBLE(IOBPD(ITH,IP))
!/REF1             IF (REFPARS(3).LT.0.5.AND.IOBPD(ITH,IP).EQ.0.AND.IOBPA(IP).EQ.0) U(IP)= AC(IP) ! restores reflected boundary values 
               END DO
! update spectrum 
             AC = U
!
! 4 Update boundaries: performs interpolation in time as done in rect grids (e.g. w3pro1md.ftn) 
!
             IF ( FLBPI ) THEN 
!
! 4.1 In this case the boundary is read from the nest.ww3 file 
!
               RD1=RD10 - DT * REAL(ITER(IK,ITH)-IT)/REAL(ITER(IK,ITH))
               RD2=RD20
               IF ( RD2 .GT. 0.001 ) THEN
                 RD2    = MIN(1.,MAX(0.,RD1/RD2))
                 RD1    = 1. - RD2
               ELSE
                 RD1    = 0.
                 RD2    = 1.
                 END IF
!
! Overwrites only the incoming energy ( IOBPD(ITH,IP) = 0) 
!
                DO IBI=1, NBI
                  IP = MAPSF(ISBPI(IBI),1)
                  AC(IP) = ( RD1*BBPI0(ISP,IBI) + RD2*BBPIN(ISP,IBI) )   &
                     / CG(IK,ISBPI(IBI)) * CLATS(ISBPI(IBI))
                END DO

            ENDIF
!
           END DO ! End of loop on time steps        
!        CALL EXTCDE ( 99 )
!/
!/ End of W3XYPFSN ----------------------------------------------------- /
!/
      END SUBROUTINE W3XYPFSN2
      
!/ ------------------------------------------------------------------- /  
      SUBROUTINE W3XYPFSPSI2 ( ISP, C, LCALC, RD10, RD20, DT, AC)
     
!/
!/
!/                  +-----------------------------------+
!/                  |  WWIII Version of the WWM FS Code |
!/                  |  by Aron Roland                   |
!/                  |  for use in WWIII                 |
!/                  |  GPL License                      |
!/                  |  Last update :        19-Dec-2007 |
!/                  +-----------------------------------+
!/
!  1. Purpose :
!     Advection of a scalar in a arbitary velocity field on unstructured meshes
!     for the conservative hyperbolic equation N,t + (c*N),xy = 0 in spatial space
!     This is the standard explicit N-Scheme from Roe as formulated in Abgrall
!
!  2. Method :
!
!  3. Parameters :
!
!     Parameter list
!     ----------------------------------------------------------------
!     ----------------------------------------------------------------
!
!  4. Subroutines used :
!
!       STRACE   Subroutine tracing (!/S switch)
!
!  5. Called by :
!
!       W3XYPUG   Routine for advection on unstructured grid
!
!  6. Error messages :
!
!       None.
!
!  7. Remarks :
!
!  8. Structure :
!
!     See source code.
!
!  9. Switches :
!
!     !/S  Enable subroutine tracing.
!
! 10. Source code :
!
!/ ------------------------------------------------------------------- /
!/
       USE W3GDATMD, ONLY : NK, NTH, NTRI, NX, CCON, IE_CELL,POS_CELL, SI, &
                            IEN, TRIGP, CLATS, MAPSF, IOBPA, IOBPD, IOBP, NNZ, IOBDP
!/REF1   USE W3GDATMD, ONLY :  REFPARS
       USE W3WDATMD, ONLY: TIME     
       USE W3ADATMD, ONLY: CG, ITER    
       USE W3ODATMD, ONLY: NDSE, NDST, FLBPI, NBI, TBPI0, TBPIN, ISBPI, BBPI0, BBPIN
       USE W3TIMEMD, ONLY: DSEC21
!/S      USE W3SERVMD, ONLY: STRACE
       IMPLICIT NONE  
     
       INTEGER, INTENT(IN)    :: ISP                   ! Actual Frequency/Wavenumber, actual Wave Direction
       REAL,    INTENT(IN)    :: DT                    ! Time intervall for which the advection should be computed for the given velocity field
       REAL,    INTENT(IN)    :: C(:,:)              ! Velocity field in it's X- and Y- Components, 
       DOUBLE PRECISION,INTENT(INOUT) :: AC(:)         ! Wave Action before and after advection
       REAL,    INTENT(IN)    :: RD10, RD20            ! Time interpolation coefficients for boundary conditions
       LOGICAL, INTENT(IN)    :: LCALC                 ! Switch for the calculation of the max. Global Time step
!/
!/ ------------------------------------------------------------------- /
!/ Parameter list
!/
!/
!/ ------------------------------------------------------------------- /
!/ Local parameters
!/
!/S      INTEGER, SAVE           :: IENT = 0

       REAL*8,  PARAMETER :: ONESIXTH  = 1.0d0/6.0d0
       REAL*8,  PARAMETER :: THR8      = TINY(1.0d0)
       REAL,    PARAMETER :: THR       = TINY(1.0) 
!/
!/ ------------------------------------------------------------------- /
!/
!
! local integer
!
         INTEGER :: IP, IE, IT, I1, I2, I3, ITH, IK    
         INTEGER :: IBI, NI(3)
!
! local real
!
         REAL    :: RD1, RD2
!:
! local double
!
         REAL*8  :: UTILDE, BOUNDARY_FORCING
         REAL*8  :: FT, CFLXY
         REAL*8  :: FL11, FL12, FL21, FL22, FL31, FL32
         REAL*8  :: FL111, FL112, FL211, FL212, FL311, FL312
         REAL*8  :: DTSI(NX), U(NX)
         REAL*8  :: DTMAXGL, DTMAXEXP, REST
         REAL*8  :: LAMBDA(2), KTMP(3), TMP(3)
         REAL*8  :: THETA_L(3), BET1(3), BETAHAT(3)
         REAL*8  :: KELEM(3,NTRI), FLALL(3,NTRI)
         REAL*8  :: KKSUM(NX), ST(NX)
         REAL*8  :: NM(NTRI) 

!/S      CALL STRACE (IENT, 'W3XYPFSN')

! 1. initialisation

         ITH    = 1 + MOD(ISP-1,NTH)
         IK     = 1 + (ISP-1)/NTH 
         DTMAXGL = DBLE(10.E10)
!
!2       Propagation 
!2.a     Calculate K-Values and contour based quantities ...
!
         DO IE = 1, NTRI ! I precacalculate this arrays below as I assume that current velocity  changes continusly ... 
           I1 = TRIGP(IE,1) ! Index of the Element Nodes (TRIGP)
           I2 = TRIGP(IE,2)
           I3 = TRIGP(IE,3)
           LAMBDA(1) = ONESIXTH *(C(I1,1)+C(I2,1)+C(I3,1)) ! Linearized advection speed in X and Y direction
           LAMBDA(2) = ONESIXTH *(C(I1,2)+C(I2,2)+C(I3,2))
           KELEM(1,IE) = LAMBDA(1) * IEN(IE,1) + LAMBDA(2) * IEN(IE,2) ! K-Values - so called Flux Jacobians 
           KELEM(2,IE) = LAMBDA(1) * IEN(IE,3) + LAMBDA(2) * IEN(IE,4)
           KELEM(3,IE) = LAMBDA(1) * IEN(IE,5) + LAMBDA(2) * IEN(IE,6)
           KTMP        = KELEM(:,IE) ! Copy
           NM(IE)      = - 1.D0/MIN(-THR8,SUM(MIN(0.d0,KTMP))) ! N-Values 
           KELEM(:,IE) = MAX(0.d0,KTMP)
           FL11  = C(I2,1) * IEN(IE,1) + C(I2,2) * IEN(IE,2) ! Weights for Simpson Integration 
           FL12  = C(I3,1) * IEN(IE,1) + C(I3,2) * IEN(IE,2)
           FL21  = C(I3,1) * IEN(IE,3) + C(I3,2) * IEN(IE,4)
           FL22  = C(I1,1) * IEN(IE,3) + C(I1,2) * IEN(IE,4)
           FL31  = C(I1,1) * IEN(IE,5) + C(I1,2) * IEN(IE,6)
           FL32  = C(I2,1) * IEN(IE,5) + C(I2,2) * IEN(IE,6)
           FL111 = 2.d0*FL11+FL12
           FL112 = 2.d0*FL12+FL11
           FL211 = 2.d0*FL21+FL22
           FL212 = 2.d0*FL22+FL21
           FL311 = 2.d0*FL31+FL32
           FL312 = 2.d0*FL32+FL31
           FLALL(1,IE) = (FL311 + FL212)! * ONESIXTH + KELEM(1,IE)
           FLALL(2,IE) = (FL111 + FL312)! * ONESIXTH + KELEM(2,IE)
           FLALL(3,IE) = (FL211 + FL112)! * ONESIXTH + KELEM(3,IE)
         END DO ! NTRI

           IF (LCALC) THEN ! If the current field or water level changes estimate the iteration number based on the new flow field and the CFL number of the scheme
             KKSUM = 0.d0
             DO IE = 1, NTRI
               NI = TRIGP(IE,:)
               KKSUM(NI) = KKSUM(NI) + KELEM(:,IE)
             END DO ! IE
             DTMAXEXP = 1E10 ! initialize to large number 
             DO IP = 1, NX
               DTMAXEXP = SI(IP)/MAX(DBLE(10.E-10),KKSUM(IP)*IOBDP(IP))
               DTMAXGL  = MIN( DTMAXGL, DTMAXEXP)
             END DO ! IP
             CFLXY = DBLE(DT)/DTMAXGL
             REST  = ABS(MOD(CFLXY,1.0d0))
             IF (REST .LT. THR8) THEN
               ITER(IK,ITH) = ABS(NINT(CFLXY))
             ELSE IF (REST .GT. THR8 .AND. REST .LT. 0.5d0) THEN
               ITER(IK,ITH) = ABS(NINT(CFLXY)) + 1
             ELSE
               ITER(IK,ITH) = ABS(NINT(CFLXY))
             END IF
           END IF ! LCALC
 
           DO IP = 1, NX               
             DTSI(IP) = DBLE(DT)/DBLE(ITER(IK,ITH))/SI(IP) ! Some precalculations for the time integration.
           END DO

           DO IT = 1, ITER(IK,ITH)
             U = AC

             ST = 0.d0

             DO IE = 1, NTRI
               NI   = TRIGP(IE,:)
               FT     =-ONESIXTH*DOT_PRODUCT(U(NI),FLALL(:,IE))
               UTILDE = NM(IE) * ( DOT_PRODUCT(KELEM(:,IE),U(NI)) - FT )
               THETA_L(:) = KELEM(:,IE) * (U(NI) - UTILDE)
               IF (ABS(FT) .GT. 0.0d0) THEN
                 BET1(:) = THETA_L(:)/FT
                 IF (ANY( BET1 .LT. 0.0d0) ) THEN
                   BETAHAT(1)    = BET1(1) + 0.5d0 * BET1(2)
                   BETAHAT(2)    = BET1(2) + 0.5d0 * BET1(3)
                   BETAHAT(3)    = BET1(3) + 0.5d0 * BET1(1)
                   BET1(1)       = MAX(0.d0,MIN(BETAHAT(1),1.d0-BETAHAT(2),1.d0))
                   BET1(2)       = MAX(0.d0,MIN(BETAHAT(2),1.d0-BETAHAT(3),1.d0))
                   BET1(3)       = MAX(0.d0,MIN(BETAHAT(3),1.d0-BETAHAT(1),1.d0))
                  THETA_L(:) = FT * BET1
                 END IF
               ELSE
                 THETA_L(:) = 0.d0
               END IF
               ST(NI) = ST(NI) + THETA_L ! the 2nd term are the theta values of each node ...
             END DO

             DO IP = 1, NX
               U(IP) = MAX(0.d0,U(IP)-DTSI(IP)*ST(IP)*(1-IOBPA(IP)))*DBLE(IOBPD(ITH,IP))
!/REF1               IF (REFPARS(3).LT.0.5.AND.IOBPD(ITH,IP).EQ.0.AND.IOBPA(IP).EQ.0) U(IP)= AC(IP) ! restores reflected boundary values 
               END DO

! update spectrum 
             AC = U
!
! 4 Update boundaries: performs interpolation in time as done in rect grids (e.g. w3pro1md.ftn) 
!
             IF ( FLBPI ) THEN
!
! 4.1 In this case the boundary is read from the nest.ww3 file 
!
               RD1=RD10 - DT * REAL(ITER(IK,ITH)-IT)/REAL(ITER(IK,ITH))
               RD2=RD20
               IF ( RD2 .GT. 0.001 ) THEN
                 RD2    = MIN(1.,MAX(0.,RD1/RD2))
                 RD1    = 1. - RD2
               ELSE
                 RD1    = 0.
                 RD2    = 1.
                 END IF
!
! Overwrites only the incoming energy ( IOBPD(ITH,IP) = 0) 
!
                DO IBI=1, NBI
                  IP = MAPSF(ISBPI(IBI),1)
                  AC(IP) = ( RD1*BBPI0(ISP,IBI) + RD2*BBPIN(ISP,IBI) )   &
                     / CG(IK,ISBPI(IBI)) * CLATS(ISBPI(IBI))
                END DO

            ENDIF
!
           END DO ! End of loop on time steps      
!      CALL EXTCDE ( 99 )
!/
!/ End of W3XYPFSN ----------------------------------------------------- /
!/
      END SUBROUTINE W3XYPFSPSI2
      
!/ ------------------------------------------------------------------- /  
      SUBROUTINE W3XYPFSNIMP ( ISP, C, LCALC, RD10, RD20, DT, AC)
     
!/
!/
!/                  +-----------------------------------+
!/                  |  WWIII Version of the WWM FS Code |
!/                  |  by Aron Roland                   |
!/                  |  for use in WWIII                 |
!/                  |  GPL License                      |
!/                  |  Last update :        15-Dec-2013 |
!/                  +-----------------------------------+
!/
!/    15-Dec-2013 : Bug fix for implicit scheme         ( version 4.16 )
!/
!  1. Purpose :
!     Advection of a scalar in a arbitary velocity field on unstructured meshes
!     for the conservative hyperbolic equation N,t + (c*N),xy = 0 in spatial space
!     This is the standard explicit N-Scheme from Roe as formulated in Abgrall
!
!  2. Method :
!
!  3. Parameters :
!
!     Parameter list
!     ----------------------------------------------------------------
!     ----------------------------------------------------------------
!
!  4. Subroutines used :
!
!       STRACE   Subroutine tracing (!/S switch)
!
!  5. Called by :
!
!       W3XYPUG   Routine for advection on unstructured grid
!
!  6. Error messages :
!
!       None.
!
!  7. Remarks :
!
!  8. Structure :
!
!     See source code.
!
!  9. Switches :
!
!     !/S  Enable subroutine tracing.
!
! 10. Source code :
!
!/ ------------------------------------------------------------------- /
!/
       USE W3GDATMD, ONLY : NK, NTH, NTRI, NX, CCON, IE_CELL,POS_CELL, SI, &
                            IEN, TRIGP, CLATS, MAPSF, IOBPD, IOBPA, IOBP, IAA, JAA, POSI, &
                            TRIA, NNZ
!/REF1  USE W3GDATMD, ONLY : REFPARS
       USE W3WDATMD, ONLY: TIME      
       USE W3ADATMD, ONLY: CG, ITER    
       USE W3ODATMD, ONLY: NDSE, NDST, FLBPI, NBI, TBPI0, TBPIN, ISBPI, BBPI0, BBPIN
       USE W3TIMEMD, ONLY: DSEC21
!/S      USE W3SERVMD, ONLY: STRACE
       IMPLICIT NONE  
     
       INTEGER, INTENT(IN)    :: ISP                   ! Actual Frequency/Wavenumber, actual Wave Direction
       REAL,    INTENT(IN)    :: DT                    ! Time intervall for which the advection should be computed for the given velocity field
       REAL,    INTENT(IN)    :: C(:,:)              ! Velocity field in it's X- and Y- Components, 
       DOUBLE PRECISION,INTENT(INOUT) :: AC(:)         ! Wave Action before and after advection
       REAL,    INTENT(IN)    :: RD10, RD20            ! Time interpolation coefficients for boundary conditions
       LOGICAL, INTENT(IN)    :: LCALC                 ! Switch for the calculation of the max. Global Time step
!/
!/ ------------------------------------------------------------------- /
!/ Parameter list
!/
!/
!/ ------------------------------------------------------------------- /
!/ Local parameters
!/
!/S      INTEGER, SAVE           :: IENT = 0

       REAL*8,  PARAMETER :: ONESIXTH  = 1.0d0/6.0d0
       REAL*8,  PARAMETER :: ONETHIRD  = 1.0d0/3.0d0
       REAL*8,  PARAMETER :: THR8      = TINY(1.0d0)
       REAL,    PARAMETER :: THR       = TINY(1.0) 
!/
!/ ------------------------------------------------------------------- /
!/
!
! local integer
!
         INTEGER :: IP, IE, POS, I1, I2, I3, I, J, ITH, IK    
         INTEGER :: IBI
!
! local real
!
         REAL    :: RD1, RD2
!:
! local double
!
         REAL*8  :: BOUNDARY_FORCING
         REAL*8  :: FL11, FL12, FL21, FL22, FL31, FL32
         REAL*8  :: U(NX)
         REAL*8  :: DTMAXGL
         REAL*8  :: LAMBDA(2)
         REAL*8  :: KP(3,NTRI)
         REAL*8  :: K1, DTK, TMP3, KM(3), K(3)
         REAL*8  :: NM(NTRI), CRFS(3), DELTAL(3,NTRI)
         REAL*8  :: B(NX), X(NX)
         REAL*8  :: ASPAR(NNZ)

         INTEGER :: IWKSP( 20*NX )
         INTEGER :: FLJU(NX)
         INTEGER :: FLJAU(NNZ+1)


         INTEGER :: POS_TRICK(3,2)

         INTEGER :: IPAR(16)
         INTEGER :: IERROR ! IWK                             ! ERROR Indicator and Work Array Size,
         INTEGER :: JAU(NNZ+1), JU(NX)

         REAL*8  :: FPAR(16)  ! DROPTOL
         REAL*8  :: WKSP( 8 * NX ) ! REAL WORKSPACES
         REAL*8  :: AU(NNZ+1)
         REAL*8  :: INIU(NX)

         external bcgstab

         POS_TRICK(1,1) = 2
         POS_TRICK(1,2) = 3
         POS_TRICK(2,1) = 3
         POS_TRICK(2,2) = 1
         POS_TRICK(3,1) = 1
         POS_TRICK(3,2) = 2

!/S      CALL STRACE (IENT, 'W3XYPFSN')

! 1. initialisation

         ITH    = 1 + MOD(ISP-1,NTH)
         IK     = 1 + (ISP-1)/NTH 
         DTMAXGL = DBLE(10.E10)

        IF (.FALSE.) THEN
           WRITE(*,*) 'NNZ', NNZ
           WRITE(*,*) 'MINVAL IAA,JAA', MINVAL(IAA), MINVAL(JAA)
           WRITE(*,*) 'MINVAL IAA,JAA', MAXVAL(IAA), MAXVAL(JAA)
           WRITE(*,*) 'MAX/MIN POSI',   MAXVAL(POSI), MINVAL(POSI)
           WRITE(*,*) 'AC, AQ', SUM(AC)
         END IF
!
!2       Propagation 
!2.a     Calculate K-Values and contour based quantities ...
!
         DO IE = 1, NTRI ! I precacalculate this arrays below as I assume that current velocity  changes continusly ... 
           I1 = TRIGP(IE,1) ! Index of the Element Nodes (TRIGP)
           I2 = TRIGP(IE,2)
           I3 = TRIGP(IE,3)
           LAMBDA(1) = ONESIXTH * (C(I1,1)+C(I2,1)+C(I3,1))
           LAMBDA(2) = ONESIXTH * (C(I1,2)+C(I2,2)+C(I3,2))
           K(1)  = LAMBDA(1) * IEN(IE,1) + LAMBDA(2) * IEN(IE,2)
           K(2)  = LAMBDA(1) * IEN(IE,3) + LAMBDA(2) * IEN(IE,4)
           K(3)  = LAMBDA(1) * IEN(IE,5) + LAMBDA(2) * IEN(IE,6)
           KP(1,IE) = MAX(0.d0,K(1))
           KP(2,IE) = MAX(0.d0,K(2))
           KP(3,IE) = MAX(0.d0,K(3))
           KM(1) = MIN(0.d0,K(1))
           KM(2) = MIN(0.d0,K(2))
           KM(3) = MIN(0.d0,K(3))
           FL11 = C(I2,1)*IEN(IE,1)+C(I2,2)*IEN(IE,2)
           FL12 = C(I3,1)*IEN(IE,1)+C(I3,2)*IEN(IE,2)
           FL21 = C(I3,1)*IEN(IE,3)+C(I3,2)*IEN(IE,4)
           FL22 = C(I1,1)*IEN(IE,3)+C(I1,2)*IEN(IE,4)
           FL31 = C(I1,1)*IEN(IE,5)+C(I1,2)*IEN(IE,6)
           FL32 = C(I2,1)*IEN(IE,5)+C(I2,2)*IEN(IE,6)
           CRFS(1) =  - ONESIXTH *  (2.0d0 *FL31 + FL32 + FL21 + 2.0d0 * FL22 )
           CRFS(2) =  - ONESIXTH *  (2.0d0 *FL32 + 2.0d0 * FL11 + FL12 + FL31 )
           CRFS(3) =  - ONESIXTH *  (2.0d0 *FL12 + 2.0d0 * FL21 + FL22 + FL11 )
           DELTAL(:,IE) = CRFS(:)- KP(:,IE)
           NM(IE)       = 1.d0/MIN(DBLE(THR),SUM(KM(:)))
         END DO ! NTRI
 
         U = DBLE(AC)
         J = 0 
         ASPAR = 0.d0
         B     = 0.d0
         DO IP = 1, NX 
           DO I = 1, CCON(IP)
             J = J + 1
             IE    =  IE_CELL(J)
             POS   =  POS_CELL(J)
             K1    =  KP(POS,IE) * IOBPD(ITH,IP)
             IF (K1 > 0.) THEN
               DTK   =  K1 * DT
               TMP3  =  DTK * NM(IE)
               I1    =  POSI(1,J)
               I2    =  POSI(2,J)
               I3    =  POSI(3,J)
               ASPAR(I1) =  ONETHIRD * TRIA(IE) + DTK - TMP3 * DELTAL(POS,IE)              + ASPAR(I1) 
               ASPAR(I2) =                         - TMP3 * DELTAL(POS_TRICK(POS,1),IE) + ASPAR(I2)
               ASPAR(I3) =                         - TMP3 * DELTAL(POS_TRICK(POS,2),IE) + ASPAR(I3) 
               B(IP)     =  B(IP) + ONETHIRD * TRIA(IE) * U(IP)
             ELSE
               I1    =  POSI(1,J)
               ASPAR(I1) =  ONETHIRD * TRIA(IE) + ASPAR(I1)
               B(IP)     =  B(IP) + ONETHIRD * TRIA(IE) * U(IP)
             END IF
           END DO ! End loop over connected elements ...
         END DO
!
!2DO setup a semi-implicit integration scheme for source terms only ...
!
         IPAR(1) = 0       ! always 0 to start an iterative solver
         IPAR(2) = 1       ! right preconditioning
         IPAR(3) = 1       ! use convergence test scheme 1
         IPAR(4) = 8*NX  !
         IPAR(5) = 15
         IPAR(6) = 1000    ! use at most 1000 matvec's
         FPAR(1) = DBLE(1.0E-8)  ! relative tolerance 1.0E-6
         FPAR(2) = DBLE(1.0E-10)  ! absolute tolerance 1.0E-10
         FPAR(11) = 0.d0    ! clearing the FLOPS counter

         AU    = 0.
         FLJAU = 0
         FLJU  = 0
         JAU   = 0
         JU    = 0

         CALL ILU0  (NX, ASPAR, JAA, IAA, AU, FLJAU, FLJU, IWKSP, IERROR)

!         WRITE(*,*) 'PRECONDITIONER', IERROR

!         IF (SUM(AC) .GT. 0.) THEN
         IF (.FALSE.) THEN
           WRITE(*,*) SUM(AC)
           WRITE(*,*) 'CALL SOLVER'
           WRITE(*,*) 'WRITE CG', SUM(CG)
           WRITE(*,*) 'B, X, AC, U', SUM(B), SUM(X), SUM(AC), SUM(U)
           WRITE(*,*) 'IPAR, FPAR', SUM(IPAR), SUM(FPAR)
           WRITE(*,*) 'WKSP, INIU', SUM(WKSP), SUM(INIU)
           WRITE(*,*) 'ASPAR, JAA, IAA',SUM(ASPAR), SUM(IAA), SUM(JAA)
           WRITE(*,*) 'AU, FLJAU, FLJU',SUM(AU), SUM(FLJAU), SUM(FLJU)
         END IF

          INIU = U 
          X    = 0.d0

          CALL RUNRC (NX, B, X, IPAR, FPAR, WKSP, INIU, ASPAR, JAA, IAA, AU, FLJAU, FLJU, BCGSTAB)

         IF (.FALSE.) THEN
           WRITE(*,*) 'SOLUTION'
           WRITE(*,*) 'B, X, AC, U', SUM(B), SUM(X), SUM(AC), SUM(U)
           WRITE(*,*) 'IPAR, FPAR', SUM(IPAR), SUM(FPAR)
           WRITE(*,*) 'WKSP, INIU', SUM(WKSP), SUM(INIU)
           WRITE(*,*) 'ASPAR, JAA, IAA', SUM(ASPAR), SUM(JAA), SUM(IAA)
           WRITE(*,*) 'AU, FLJAU, FLJU', SUM(AU), SUM(FLJAU), SUM(FLJU)
         END IF

          DO IP = 1,NX
            U(IP) = MAX(0.d0,X(IP)*DBLE(IOBPD(ITH,IP)))
!/REF1      IF (REFPARS(3).LT.0.5.AND.IOBPD(ITH,IP).EQ.0.AND.IOBPA(IP).EQ.0) U(IP)= AC(IP) ! restores reflected boundary values 
           END DO
!
! update spectrum 
             AC = REAL(U)
!
! 4 Update boundaries: performs interpolation in time as done in rect grids (e.g. w3pro1md.ftn) 
!
            IF ( FLBPI ) THEN
             RD1=RD10
             RD2=RD20
             IF ( RD2 .GT. 0.001 ) THEN
                 RD2    = MIN(1.,MAX(0.,RD1/RD2))
                 RD1    = 1. - RD2
              ELSE
                 RD1    = 0.
                 RD2    = 1.
              END IF
!
! Time interpolation as done in rect grids
!
               DO IBI=1, NBI
                 IP    = MAPSF(ISBPI(IBI),1)
                 AC(IP) = ( RD1*BBPI0(ISP,IBI) + RD2*BBPIN(ISP,IBI) )   &
                           *IOBPA(IP)*(1-IOBPD(ITH,IP)) / CG(IK,ISBPI(IBI)) * CLATS(ISBPI(IBI))
                 END DO
             END IF

!      CALL EXTCDE ( 99 )
!/
!/ End of W3XYPFSNIMP------------------------------------------------- /
!/
      END SUBROUTINE W3XYPFSNIMP
      
!/ ------------------------------------------------------------------- /  

      SUBROUTINE W3XYPFSFCT2 ( ISP, C, LCALC, RD10, RD20, DT, AC)
!/
!/
!/                  +-----------------------------------+
!/                  |  WWIII Version of the WWM FS Code |
!/                  |  by Aron Roland                   |
!/                  |  for use in WWIII                 |
!/                  |  GPL License                      |
!/                  |  Last update :        19-Dec-2007 |
!/                  +-----------------------------------+
!/
!  1. Purpose :
!     Advection of a scalar in a arbitary velocity field on unstructured meshes
!     for the conservative hyperbolic equation N,t + (c*N),xy = 0 in spatial space
!
!  2. Method :
!
!  3. Parameters :
!
!     Parameter list
!     ----------------------------------------------------------------
!     ----------------------------------------------------------------
!
!  4. Subroutines used :
!
!       STRACE   Subroutine tracing (!/S switch)
!
!  5. Called by :
!
!       W3XYPUG   Routine for advection on unstructured grid
!
!  6. Error messages :
!
!       None.
!
!  7. Remarks :
!
!  8. Structure :
!
!     See source code.
!
!  9. Switches :
!
!     !/S  Enable subroutine tracing.
!
! 10. Source code :
!
!/ ------------------------------------------------------------------- /
!/
       USE W3GDATMD, ONLY : NK, NTH, NTRI, NX, CCON, IE_CELL,POS_CELL, SI, &
                            IEN, TRIGP, CLATS, MAPSF, IOBPD, IOBPA, TRIA, IOBDP
!/REF1  USE W3GDATMD, ONLY : REFPARS
       USE W3WDATMD, ONLY: TIME               
       USE W3ADATMD, ONLY: CG, ITER    
       USE W3ODATMD, ONLY: NDSE, NDST, FLBPI, NBI, TBPI0, TBPIN, ISBPI, BBPI0, BBPIN
       USE W3TIMEMD, ONLY: DSEC21
!/S      USE W3SERVMD, ONLY: STRACE
       IMPLICIT NONE  
     
       INTEGER, INTENT(IN)    :: ISP                   ! Actual Frequency/Wavenumber, actual Wave Direction
       REAL,    INTENT(IN)    :: DT                    ! Time intervall for which the advection should be computed for the given velocity field
       REAL,    INTENT(IN)    :: C(:,:)              ! Velocity field in it's X- and Y- Components, 
       DOUBLE PRECISION,    INTENT(INOUT) :: AC(:)               ! Wave Action before and after advection
       REAL,    INTENT(IN)    :: RD10, RD20            ! Time interpolation coefficients for boundary condition
       LOGICAL, INTENT(IN)    :: LCALC                 ! Switch for the calculation of the max. Global Time step
!/
!/ ------------------------------------------------------------------- /
!/ Parameter list
!/
!/
!/ ------------------------------------------------------------------- /
!/ Local parameters
!/
!/S      INTEGER, SAVE           :: IENT = 0

       REAL*8,  PARAMETER :: ONESIXTH  = 1.0d0/6.0d0
       REAL*8,  PARAMETER :: ONETHIRD  = 1.0d0/3.0d0
       REAL*8,  PARAMETER :: THR8      = TINY(1.0d0)
       REAL,    PARAMETER :: THR       = TINY(1.0) 
!/
!/ ------------------------------------------------------------------- /
!/
!
! local integer
!
         INTEGER :: IP, IE, IT, I1, I2, I3, I, ITH, IK    
         INTEGER :: IBI, NI(3)
!
! local real
!
         REAL    :: RD1, RD2
!:
! local double
!
         REAL*8  :: UTILDE, BOUNDARY_FORCING
         REAL*8  :: FT, CFLXY
         REAL*8  :: FL11, FL12, FL21, FL22, FL31, FL32
         REAL*8  :: FL111, FL112, FL211, FL212, FL311, FL312
         REAL*8  :: DTSI(NX), U(NX), DT4AI
         REAL*8  :: DTMAXGL, DTMAXEXP, REST
         REAL*8  :: LAMBDA(2), KTMP(3), TMP(3)
         REAL*8  :: BET1(3), BETAHAT(3)
         REAL*8  :: THETA_L(3,NTRI), THETA_H(3,NTRI), THETA_ACE(3,NTRI), UTMP(3)
         REAL*8  :: WII(2,NX), UL(NX), USTARI(2,NX)

         REAL*8  :: PM(NX), PP(NX), UIM(NX), UIP(NX)

         REAL*8  :: KELEM(3,NTRI), FLALL(3,NTRI)
         REAL*8  :: KKSUM(NX), ST(NX), BETA
         REAL*8  :: NM(NTRI) 

!/S      CALL STRACE (IENT, 'W3XYPFSFCT2')

! 1. initialisation

         ITH    = 1 + MOD(ISP-1,NTH)
         IK     = 1 + (ISP-1)/NTH 
         DTMAXGL = DBLE(10.E10)
!
!2       Propagation 
!2.a     Calculate K-Values and contour based quantities ...
!
         DO IE = 1, NTRI ! I precacalculate this arrays below as I assume that current velocity  changes continusly ... 
           I1 = TRIGP(IE,1) ! Index of the Element Nodes (TRIGP)
           I2 = TRIGP(IE,2)
           I3 = TRIGP(IE,3)
           LAMBDA(1) = ONESIXTH *(C(I1,1)+C(I2,1)+C(I3,1)) ! Linearized advection speed in X and Y direction
           LAMBDA(2) = ONESIXTH *(C(I1,2)+C(I2,2)+C(I3,2))
           KELEM(1,IE) = LAMBDA(1) * IEN(IE,1) + LAMBDA(2) * IEN(IE,2) ! K-Values - so called Flux Jacobians 
           KELEM(2,IE) = LAMBDA(1) * IEN(IE,3) + LAMBDA(2) * IEN(IE,4)
           KELEM(3,IE) = LAMBDA(1) * IEN(IE,5) + LAMBDA(2) * IEN(IE,6)
           KTMP        = KELEM(:,IE) ! Copy
           NM(IE)      = - 1.D0/MIN(-THR8,SUM(MIN(0.d0,KTMP))) ! N-Values 
           FL11  = C(I2,1) * IEN(IE,1) + C(I2,2) * IEN(IE,2) ! Weights for Simpson Integration 
           FL12  = C(I3,1) * IEN(IE,1) + C(I3,2) * IEN(IE,2)
           FL21  = C(I3,1) * IEN(IE,3) + C(I3,2) * IEN(IE,4)
           FL22  = C(I1,1) * IEN(IE,3) + C(I1,2) * IEN(IE,4)
           FL31  = C(I1,1) * IEN(IE,5) + C(I1,2) * IEN(IE,6)
           FL32  = C(I2,1) * IEN(IE,5) + C(I2,2) * IEN(IE,6)
           FL111 = 2.d0*FL11+FL12
           FL112 = 2.d0*FL12+FL11
           FL211 = 2.d0*FL21+FL22
           FL212 = 2.d0*FL22+FL21
           FL311 = 2.d0*FL31+FL32
           FL312 = 2.d0*FL32+FL31
           FLALL(1,IE) = (FL311 + FL212)! * ONESIXTH + KELEM(1,IE)
           FLALL(2,IE) = (FL111 + FL312)! * ONESIXTH + KELEM(2,IE)
           FLALL(3,IE) = (FL211 + FL112)! * ONESIXTH + KELEM(3,IE)
         END DO ! NTRI

         IF (LCALC) THEN ! If the current field or water level changes estimate the iteration number based on the new flow field and the CFL number of the scheme
           KKSUM = 0.d0
           DO IE = 1, NTRI
             NI = TRIGP(IE,:)
             KKSUM(NI) = KKSUM(NI) + KELEM(:,IE)
           END DO ! IE
           DTMAXEXP = 1E10 ! initialize to large number 
           DO IP = 1, NX
             DTMAXEXP = SI(IP)/MAX(DBLE(10.E-10),KKSUM(IP)*IOBDP(IP))
             DTMAXGL  = MIN( DTMAXGL, DTMAXEXP)
           END DO ! IP
           CFLXY = DBLE(DT)/DTMAXGL
           REST  = ABS(MOD(CFLXY,1.0d0))
           IF (REST .LT. THR8) THEN
             ITER(IK,ITH) = ABS(NINT(CFLXY))
           ELSE IF (REST .GT. THR8 .AND. REST .LT. 0.5d0) THEN
             ITER(IK,ITH) = ABS(NINT(CFLXY)) + 1
           ELSE
             ITER(IK,ITH) = ABS(NINT(CFLXY))
           END IF
         END IF ! LCALC

         DT4AI = DBLE(DT)/DBLE(ITER(IK,ITH)) 
         DTSI(:)  = DT4AI/SI(:) ! Some precalculations for the time integration.

         U = AC
         UL = U
!
!  Now loop on sub-timesteps
!
         DO IT = 1, ITER(IK,ITH)

           ST = 0.d0
         
           DO IE = 1, NTRI 
              NI      = TRIGP(IE,:)
              UTMP    = U(NI)
              FT      =  - ONESIXTH*DOT_PRODUCT(UTMP,FLALL(:,IE))
              TMP     =  MAX(0.d0,KELEM(:,IE))
              UTILDE  =  NM(IE) * ( DOT_PRODUCT(TMP,UTMP) - FT )
              THETA_L(:,IE) =  TMP * ( UTMP - UTILDE )
              IF (ABS(FT) .GT. DBLE(THR)) THEN
                BET1(:) = THETA_L(:,IE)/FT
                IF (ANY( BET1 .LT. 0.0d0) ) THEN
                  BETAHAT(1)    = BET1(1) + 0.5d0 * BET1(2)
                  BETAHAT(2)    = BET1(2) + 0.5d0 * BET1(3)
                  BETAHAT(3)    = BET1(3) + 0.5d0 * BET1(1)
                  BET1(1)       = MAX(0.d0,MIN(BETAHAT(1),1.d0-BETAHAT(2),1.d0))
                  BET1(2)       = MAX(0.d0,MIN(BETAHAT(2),1.d0-BETAHAT(3),1.d0))
                  BET1(3)       = MAX(0.d0,MIN(BETAHAT(3),1.d0-BETAHAT(1),1.d0))
                  THETA_L(:,IE) = FT * BET1
                END IF
              ELSE
                THETA_L(:,IE) = 0.d0
              END IF
!              THETA_H(:,IE) = (ONETHIRD+DT4AI/(2.d0*TRIA(IE)) * KELEM(:,IE))*FT ! LAX-WENDROFF
              THETA_H(:,IE) = (1./3.+2./3.* KELEM(:,IE)/SUM(ABS(KELEM(:,IE))) )*FT ! CENTRAL SCHEME
              ! Antidiffusive residual according to the higher order nonmonotone scheme
              THETA_ACE(:,IE) = ((THETA_H(:,IE) - THETA_L(:,IE))) * DT4AI/SI(NI)
              ST(NI)          = ST(NI) + THETA_L(:,IE)*DT4AI/SI(NI)
            END DO
         
!            UL          = MAX(0.d0,U-ST)*DBLE(IOBPD(ITH,:))!*DBLE(IOBDP(:)) ... add for IOBDP dry/wet flag.
         
            DO IP = 1,NX
              UL(IP) = MAX(0.d0,U(IP)-ST(IP))*DBLE(IOBPD(ITH,IP))
              END DO
         
            USTARI(1,:) = MAX(UL,U)
            USTARI(2,:) = MIN(UL,U)
         
            UIP = -THR8
            UIM =  THR8
            PP  = 0.d0
            PM  = 0.d0
            DO IE = 1, NTRI 
              NI = TRIGP(IE,:)
              PP(NI)  = PP(NI) + MAX(  THR8, -THETA_ACE(:,IE)) 
              PM(NI)  = PM(NI) + MIN( -THR8, -THETA_ACE(:,IE)) 
              UIP(NI) = MAX (UIP(NI), MAXVAL( USTARI(1,NI) ))
              UIM(NI) = MIN (UIM(NI), MINVAL( USTARI(2,NI) ))
            END DO
         
            WII(1,:) = MIN(1.0d0,(UIP-UL)/MAX( THR8,PP))
            WII(2,:) = MIN(1.0d0,(UIM-UL)/MIN(-THR8,PM))
         
            ST = 0.d0
            DO IE = 1, NTRI 
              DO I = 1, 3
                IP = TRIGP(IE,I)
                IF (-THETA_ACE(I,IE) .GE. 0.) THEN
                  TMP(I) = WII(1,IP)
                ELSE
                  TMP(I) = WII(2,IP)
                END IF
              END DO
              BETA = MINVAL(TMP)
              NI = TRIGP(IE,:)
              ST(NI) = ST(NI) + BETA * THETA_ACE(:,IE)
            END DO
         
            DO IP = 1,NX
!
! IOBPD is the switch for removing energy coming from the shoreline 
!
              U(IP) = MAX(0.d0,UL(IP)-ST(IP))*DBLE(IOBPD(ITH,IP)) 
!/REF1               IF (REFPARS(3).LT.0.5.AND.IOBPD(ITH,IP).EQ.0.AND.IOBPA(IP).EQ.0) U(IP)= AC(IP) ! restores reflected boundary values 
               END DO
!
! update spectrum 
             AC = U
!
! 4 Update boundaries: performs interpolation in time as done in rect grids (e.g. w3pro1md.ftn) 
!
             IF ( FLBPI ) THEN
!
! 4.1 In this case the boundary is read from the nest.ww3 file 
!
               RD1=RD10 - DT * REAL(ITER(IK,ITH)-IT)/REAL(ITER(IK,ITH))
               RD2=RD20
               IF ( RD2 .GT. 0.001 ) THEN
                 RD2    = MIN(1.,MAX(0.,RD1/RD2))
                 RD1    = 1. - RD2
               ELSE
                 RD1    = 0.
                 RD2    = 1.
                 END IF
!
! Overwrites only the incoming energy ( IOBPD(ITH,IP) = 0) 
!
                DO IBI=1, NBI
                  IP = MAPSF(ISBPI(IBI),1)
                  AC(IP) = ( RD1*BBPI0(ISP,IBI) + RD2*BBPIN(ISP,IBI) )   &
                     / CG(IK,ISBPI(IBI)) * CLATS(ISBPI(IBI))
                END DO

              ENDIF
!
           END DO ! End of loop on time steps      
!      CALL EXTCDE ( 99 )
!/
!/ End of W3XYPFSN ----------------------------------------------------- /
!/
      END SUBROUTINE W3XYPFSFCT2 
!/ ------------------------------------------------------------------- /
      SUBROUTINE SETDEPTH
!/
!/                  +-----------------------------------+
!/                  | WAVEWATCH III           NOAA/NCEP |
!/                  |                                   |  
!/                  | Aron Roland (BGS IT&E GmbH)       |
!/                  | Mathieu Dutour-Sikiric (IRB)      |
!/                  |                                   |
!/                  |                        FORTRAN 90 |
!/                  | Last update :        01-June-2018 |
!/                  +-----------------------------------+
!/
!/    01-June-2018 : Origination.                        ( version 6.04 )
!/
!  1. Purpose : Init pdlib part
!  2. Method :
!  3. Parameters :
!
!     Parameter list
!     ----------------------------------------------------------------
!     ----------------------------------------------------------------
!
!  4. Subroutines used :
!
!      Name      Type  Module   Description
!     ----------------------------------------------------------------
!      STRACE    Subr. W3SERVMD Subroutine tracing.
!     ----------------------------------------------------------------
!
!  5. Called by :
!
!      Name      Type  Module   Description
!     ----------------------------------------------------------------
!     ----------------------------------------------------------------
!
!  6. Error messages :
!  7. Remarks
!  8. Structure :
!  9. Switches :
!
!     !/S  Enable subroutine tracing.
!
! 10. Source code :
!
!/ ------------------------------------------------------------------- /
!/S      USE W3SERVMD, ONLY: STRACE
!
      USE CONSTANTS, ONLY : LPDLIB
      USE W3GDATMD, ONLY: MAPSF, NSEAL, DMIN, IOBDP, MAPSTA, IOBP, MAPFS, NX
      USE W3ADATMD, ONLY: DW

      IMPLICIT NONE
!/
!/ ------------------------------------------------------------------- /
!/ Parameter list
!/
!/ ------------------------------------------------------------------- /
!/ Local PARAMETERs
!/
!/S      INTEGER, SAVE           :: IENT = 0
!/
!/ ------------------------------------------------------------------- /
!
      INTEGER :: JSEA, ISEA, IX, IP
      REAL*8, PARAMETER :: DTHR = 10E-6
!/S      CALL STRACE (IENT, 'SETDEPTH')
      IOBDP = 1
      DO IP=1,NX
        IF (DW(IP) .LT. DMIN + DTHR) IOBDP(IP) = 0
        !WRITE(*,*) ip, ip_glob, MAPSTA(1,IP_glob), IOBP(IP_glob), DW(ISEA), DMIN
      END DO

     END SUBROUTINE

!/ ------------------------------------------------------------------- /
        
END MODULE W3PROFSMD 


!--------------------------------------------------------------------------
!--------------------------------------------------------------------------
!--------------------------------------------------------------------------
!------------------iterative solver ---------------------------------------
!----------------------------------------------------------------------c
!                          S P A R S K I T                             c
!----------------------------------------------------------------------c
!         Basic Iterative Solvers with Reverse Communication           c
!----------------------------------------------------------------------c
!     This file currently has several basic iterative linear system    c
!     solvers. They are:                                               c
!     CG       -- Conjugate Gradient Method                            c
!     CGNR     -- Conjugate Gradient Method (Normal Residual equation) c
!     BCG      -- Bi-Conjugate Gradient Method                         c
!     DBCG     -- BCG with partial pivoting                            c
!     BCGSTAB  -- BCG stabilized                                       c
!     TFQMR    -- Transpose-Free Quasi-Minimum Residual method         c
!     FOM      -- Full Orthogonalization Method                        c
!     GMRES    -- Generalized Minimum RESidual method (no longer available) c
!     FGMRES   -- Flexible version of Generalized Minimum              c
!                 RESidual method                                      c
!     DQGMRES  -- Direct versions of Quasi Generalize Minimum          c
!                 Residual method                                      c
!----------------------------------------------------------------------c
!     They all have the following calling sequence:
!      subroutine solver(n, rhs, sol, ipar, fpar, w)
!      integer n, ipar(16)
!      real*8 rhs(n), sol(n), fpar(16), w(*)
!     Where
!     (1) 'n' is the size of the linear system,
!     (2) 'rhs' is the right-hand side of the linear system,
!     (3) 'sol' is the solution to the linear system,
!     (4) 'ipar' is an integer parameter array for the reverse
!     communication protocol,
!     (5) 'fpar' is an floating-point parameter array storing
!     information to and from the iterative solvers.
!     (6) 'w' is the work space (size is specified in ipar)
!
!     They are preconditioned iterative solvers with reverse
!     communication. The preconditioners can be applied from either
!     from left or right or both (specified by ipar(2), see below).
!
!     Author: Kesheng John Wu (kewu@mail.cs.umn.edu) 1993
!
!     NOTES:
!
!     (1) Work space required by each of the iterative solver
!     routines is as follows:
!       CG      == 5 * n
!       CGNR    == 5 * n
!       BCG     == 7 * n
!       DBCG    == 11 * n
!       BCGSTAB == 8 * n
!       TFQMR   == 11 * n
!       FOM     == (n+3)*(m+2) + (m+1)*m/2 (m = ipar(5), default m=15)
!       GMRES   == (n+3)*(m+2) + (m+1)*m/2 (m = ipar(5), default m=15) 
!          GMRES is no longer available
!       FGMRES  == 2*n*(m+1) + (m+1)*m/2 + 3*m + 2 (m = ipar(5),
!                  default m=15)
!       DQGMRES == n + lb * (2*n+4) (lb=ipar(5)+1, default lb = 16)
!
!     (2) ALL iterative solvers require a user-supplied DOT-product
!     routine named ddot. The prototype of ddot is
!
!     real*8 function ddot(n,x,y)
!     integer n, ix, iy
!     real*8 x(1+(n-1)*ix), y(1+(n-1)*iy)
!
!     This interface of ddot is exactly the same as that of
!     DDOT (or SDOT if real == real*8) from BLAS-1. It should have
!     same functionality as DDOT on a single processor machine. On a
!     parallel/distributed environment, each processor can perform
!     DDOT on the data it has, then perform a summation on all the
!     partial results.
!
!     (3) To use this set of routines under SPMD/MIMD program paradigm,
!     several things are to be noted: (a) 'n' should be the number of
!     vector elements of 'rhs' that is present on the local processor.
!     (b) if RHS(i) is on processor j, it is expected that SOL(i)
!     will be on the same processor, i.e. the vectors are distributed
!     to each processor in the same way. (c) the preconditioning and
!     stopping criteria specifications have to be the same on all
!     processor involved, ipar and fpar have to be the same on each
!     processor. (d) ddot should be replaced by a distributed
!     dot-product function.
!
!     ..................................................................
!     Reverse Communication Protocols
!
!     When a reverse-communication routine returns, it could be either
!     that the routine has terminated or it simply requires the caller
!     to perform one matrix-vector multiplication. The possible matrices
!     that involve in the matrix-vector multiplications are:
!     A       (the matrix of the linear system),
!     A^T     (A transposed),
!     Ml^{-1} (inverse of the left preconditioner),
!     Ml^{-T} (inverse of the left preconditioner transposed),
!     Mr^{-1} (inverse of the right preconditioner),
!     Mr^{-T} (inverse of the right preconditioner transposed).
!     For all the matrix vector multiplication, v = A u. The input and
!     output vectors are supposed to be part of the work space 'w', and
!     the starting positions of them are stored in ipar(8:9), see below.
!
!     The array 'ipar' is used to store the information about the solver.
!     Here is the list of what each element represents:
!
!     ipar(1) -- status of the call/return.
!     A call to the solver with ipar(1) == 0 will initialize the
!     iterative solver. On return from the iterative solver, ipar(1)
!     carries the status flag which indicates the condition of the
!     return. The status information is divided into two categories,
!     (1) a positive value indicates the solver requires a matrix-vector
!     multiplication,
!     (2) a non-positive value indicates termination of the solver.
!     Here is the current definition:
!       1 == request a matvec with A,
!       2 == request a matvec with A^T,
!       3 == request a left preconditioner solve (Ml^{-1}),
!       4 == request a left preconditioner transposed solve (Ml^{-T}),
!       5 == request a right preconditioner solve (Mr^{-1}),
!       6 == request a right preconditioner transposed solve (Mr^{-T}),
!      10 == request the caller to perform stopping test,
!       0 == normal termination of the solver, satisfied the stopping
!            criteria,
!      -1 == termination because iteration number is greater than the
!            preset limit,
!      -2 == return due to insufficient work space,
!      -3 == return due to anticipated break-down / divide by zero,
!            in the case where Arnoldi procedure is used, additional
!            error code can be found in ipar(12), where ipar(12) is
!            the error code of orthogonalization procedure MGSRO:
!               -1: zero input vector
!               -2: input vector contains abnormal numbers
!               -3: input vector is a linear combination of others
!               -4: trianguler system in GMRES/FOM/etc. has nul rank
!      -4 == the values of fpar(1) and fpar(2) are both <= 0, the valid
!            ranges are 0 <= fpar(1) < 1, 0 <= fpar(2), and they can
!            not be zero at the same time
!      -9 == while trying to detect a break-down, an abnormal number is
!            detected.
!     -10 == return due to some non-numerical reasons, e.g. invalid
!            floating-point numbers etc.
!
!     ipar(2) -- status of the preconditioning:
!       0 == no preconditioning
!       1 == left preconditioning only
!       2 == right preconditioning only
!       3 == both left and right preconditioning
!
!     ipar(3) -- stopping criteria (details of this will be
!     discussed later).
!
!     ipar(4) -- number of elements in the array 'w'. if this is less
!     than the desired size, it will be over-written with the minimum
!     requirement. In which case the status flag ipar(1) = -2.
!
!     ipar(5) -- size of the Krylov subspace (used by GMRES and its
!     variants), e.g. GMRES(ipar(5)), FGMRES(ipar(5)),
!     DQGMRES(ipar(5)).
!
!     ipar(6) -- maximum number of matrix-vector multiplies, if not a
!     positive number the iterative solver will run till convergence
!     test is satisfied.
!
!     ipar(7) -- current number of matrix-vector multiplies. It is
!     incremented after each matrix-vector multiplication. If there
!     is preconditioning, the counter is incremented after the
!     preconditioning associated with each matrix-vector multiplication.
!
!     ipar(8) -- pointer to the input vector to the requested matrix-
!     vector multiplication.
!
!     ipar(9) -- pointer to the output vector of the requested matrix-
!     vector multiplication.
!
!     To perform v = A * u, it is assumed that u is w(ipar(8):ipar(8)+n-1)
!     and v is stored as w(ipar(9):ipar(9)+n-1).
!
!     ipar(10) -- the return address (used to determine where to go to
!     inside the iterative solvers after the caller has performed the
!     requested services).
!
!     ipar(11) -- the result of the external convergence test
!     On final return from the iterative solvers, this value
!     will be reflected by ipar(1) = 0 (details discussed later)
!
!     ipar(12) -- error code of MGSRO, it is
!                  1 if the input vector to MGSRO is linear combination
!                    of others,
!                  0 if MGSRO was successful,
!                 -1 if the input vector to MGSRO is zero,
!                 -2 if the input vector contains invalid number.
!
!     ipar(13) -- number of initializations. During each initilization
!                 residual norm is computed directly from M_l(b - A x).
!
!     ipar(14) to ipar(16) are NOT defined, they are NOT USED by
!     any iterative solver at this time.
!
!     Information about the error and tolerance are stored in the array
!     FPAR. So are some internal variables that need to be saved from
!     one iteration to the next one. Since the internal variables are
!     not the same for each routine, we only define the common ones.
!
!     The first two are input parameters:
!     fpar(1) -- the relative tolerance,
!     fpar(2) -- the absolute tolerance (details discussed later),
!
!     When the iterative solver terminates,
!     fpar(3) -- initial residual/error norm,
!     fpar(4) -- target residual/error norm,
!     fpar(5) -- current residual norm (if available),
!     fpar(6) -- current residual/error norm,
!     fpar(7) -- convergence rate,
!
!     fpar(8:10) are used by some of the iterative solvers to save some
!     internal information.
!
!     fpar(11) -- number of floating-point operations. The iterative
!     solvers will add the number of FLOPS they used to this variable,
!     but they do NOT initialize it, nor add the number of FLOPS due to
!     matrix-vector multiplications (since matvec is outside of the
!     iterative solvers). To insure the correct FLOPS count, the
!     caller should set fpar(11) = 0 before invoking the iterative
!     solvers and account for the number of FLOPS from matrix-vector
!     multiplications and preconditioners.
!
!     fpar(12:16) are not used in current implementation.
!
!     Whether the content of fpar(3), fpar(4) and fpar(6) are residual
!     norms or error norms depends on ipar(3). If the requested
!     convergence test is based on the residual norm, they will be
!     residual norms. If the caller want to test convergence based the
!     error norms (estimated by the norm of the modifications applied
!     to the approximate solution), they will be error norms.
!     Convergence rate is defined by (Fortran 77 statement)
!     fpar(7) = log10(fpar(3) / fpar(6)) / (ipar(7)-ipar(13))
!     If fpar(7) = 0.5, it means that approximately every 2 (= 1/0.5)
!     steps the residual/error norm decrease by a factor of 10.
!
!     ..................................................................
!     Stopping criteria,
!
!     An iterative solver may be terminated due to (1) satisfying
!     convergence test; (2) exceeding iteration limit; (3) insufficient
!     work space; (4) break-down. Checking of the work space is
!     only done in the initialization stage, i.e. when it is called with
!     ipar(1) == 0. A complete convergence test is done after each
!     update of the solutions. Other conditions are monitored
!     continuously.
!
!     With regard to the number of iteration, when ipar(6) is positive,
!     the current iteration number will be checked against it. If
!     current iteration number is greater the ipar(6) than the solver
!     will return with status -1. If ipar(6) is not positive, the
!     iteration will continue until convergence test is satisfied.
!
!     Two things may be used in the convergence tests, one is the
!     residual 2-norm, the other one is 2-norm of the change in the
!     approximate solution. The residual and the change in approximate
!     solution are from the preconditioned system (if preconditioning
!     is applied). The DQGMRES and TFQMR use two estimates for the
!     residual norms. The estimates are not accurate, but they are
!     acceptable in most of the cases. Generally speaking, the error
!     of the TFQMR's estimate is less accurate.
!
!     The convergence test type is indicated by ipar(3). There are four
!     type convergence tests: (1) tests based on the residual norm;
!     (2) tests based on change in approximate solution; (3) caller
!     does not care, the solver choose one from above two on its own;
!     (4) caller will perform the test, the solver should simply continue.
!     Here is the complete definition:
!      -2 == || dx(i) || <= rtol * || rhs || + atol
!      -1 == || dx(i) || <= rtol * || dx(1) || + atol
!       0 == solver will choose test 1 (next)
!       1 == || residual || <= rtol * || initial residual || + atol
!       2 == || residual || <= rtol * || rhs || + atol
!     999 == caller will perform the test
!     where dx(i) denote the change in the solution at the ith update.
!     ||.|| denotes 2-norm. rtol = fpar(1) and atol = fpar(2).
!
!     If the caller is to perform the convergence test, the outcome
!     should be stored in ipar(11).
!     ipar(11) = 0 -- failed the convergence test, iterative solver
!     should continue
!     ipar(11) = 1 -- satisfied convergence test, iterative solver
!     should perform the clean up job and stop.
!
!     Upon return with ipar(1) = 10,
!     ipar(8)  points to the starting position of the change in
!              solution Sx, where the actual solution of the step is
!              x_j = x_0 + M_r^{-1} Sx.
!              Exception: ipar(8) < 0, Sx = 0. It is mostly used by
!              GMRES and variants to indicate (1) Sx was not necessary,
!              (2) intermediate result of Sx is not computed.
!     ipar(9)  points to the starting position of a work vector that
!              can be used by the caller.
!
!     NOTE: the caller should allow the iterative solver to perform
!     clean up job after the external convergence test is satisfied,
!     since some of the iterative solvers do not directly
!     update the 'sol' array. A typical clean-up stage includes
!     performing the final update of the approximate solution and
!     computing the convergence information (e.g. values of fpar(3:7)).
!
!     NOTE: fpar(4) and fpar(6) are not set by the accelerators (the
!     routines implemented here) if ipar(3) = 999.
!
!     ..................................................................
!     Usage:
!
!     To start solving a linear system, the user needs to specify
!     first 6 elements of the ipar, and first 2 elements of fpar.
!     The user may optionally set fpar(11) = 0 if one wants to count
!     the number of floating-point operations. (Note: the iterative
!     solvers will only add the floating-point operations inside
!     themselves, the caller will have to add the FLOPS from the
!     matrix-vector multiplication routines and the preconditioning
!     routines in order to account for all the arithmetic operations.)
!
!     Here is an example:
!     ipar(1) = 0       ! always 0 to start an iterative solver
!     ipar(2) = 2       ! right preconditioning
!     ipar(3) = 1       ! use convergence test scheme 1
!     ipar(4) = 10000   ! the 'w' has 10,000 elements
!     ipar(5) = 10      ! use *GMRES(10) (e.g. FGMRES(10))
!     ipar(6) = 100     ! use at most 100 matvec's
!     fpar(1) = 1.0E-6  ! relative tolerance 1.0E-6
!     fpar(2) = 1.0E-10 ! absolute tolerance 1.0E-10
!     fpar(11) = 0.0    ! clearing the FLOPS counter
!
!     After the above specifications, one can start to call an iterative
!     solver, say BCG. Here is a piece of pseudo-code showing how it can
!     be done,
!
! 10   call bcg(n,rhs,sol,ipar,fpar,w)
!      if (ipar(1).eq.1) then
!         call amux(n,w(ipar(8)),w(ipar(9)),a,ja,ia)
!         goto 10
!      else if (ipar(1).eq.2) then
!         call atmux(n,w(ipar(8)),w(ipar(9)),a,ja,ia)
!         goto 10
!      else if (ipar(1).eq.3) then
!         left preconditioner solver
!         goto 10
!      else if (ipar(1).eq.4) then
!         left preconditioner transposed solve
!         goto 10
!      else if (ipar(1).eq.5) then
!         right preconditioner solve
!         goto 10
!      else if (ipar(1).eq.6) then
!         right preconditioner transposed solve
!         goto 10
!      else if (ipar(1).eq.10) then
!         call my own stopping test routine
!         goto 10
!      else if (ipar(1).gt.0) then
!         ipar(1) is an unspecified code
!      else
!         the iterative solver terminated with code = ipar(1)
!      endif
!
!     This segment of pseudo-code assumes the matrix is in CSR format,
!     AMUX and ATMUX are two routines from the SPARSKIT MATVEC module.
!     They perform matrix-vector multiplications for CSR matrices,
!     where w(ipar(8)) is the first element of the input vectors to the
!     two routines, and w(ipar(9)) is the first element of the output
!     vectors from them. For simplicity, we did not show the name of
!     the routine that performs the preconditioning operations or the
!     convergence tests.
!-----------------------------------------------------------------------
      subroutine bcgstab(n, rhs, sol, ipar, fpar, w)
      implicit none
      integer n, ipar(16)
      real*8 rhs(n), sol(n), fpar(16), w(n,8)
!-----------------------------------------------------------------------
!     BCGSTAB --- Bi Conjugate Gradient stabilized (BCGSTAB)
!     This is an improved BCG routine. (1) no matrix transpose is
!     involved. (2) the convergence is smoother.
!
!
!     Algorithm:
!     Initialization - r = b - A x, r0 = r, p = r, rho = (r0, r),
!     Iterate -
!     (1) v = A p
!     (2) alpha = rho / (r0, v)
!     (3) s = r - alpha v
!     (4) t = A s
!     (5) omega = (t, s) / (t, t)
!     (6) x = x + alpha * p + omega * s
!     (7) r = s - omega * t
!     convergence test goes here
!     (8) beta = rho, rho = (r0, r), beta = rho * alpha / (beta * omega)
!         p = r + beta * (p - omega * v)
!
!     in this routine, before successful return, the fpar's are
!     fpar(3) == initial (preconditionied-)residual norm
!     fpar(4) == target (preconditionied-)residual norm
!     fpar(5) == current (preconditionied-)residual norm
!     fpar(6) == current residual norm or error
!     fpar(7) == current rho (rhok = <r, r0>)
!     fpar(8) == alpha
!     fpar(9) == omega
!
!     Usage of the work space W
!     w(:, 1) = r0, the initial residual vector
!     w(:, 2) = r, current residual vector
!     w(:, 3) = s
!     w(:, 4) = t
!     w(:, 5) = v
!     w(:, 6) = p
!     w(:, 7) = tmp, used in preconditioning, etc.
!     w(:, 8) = delta x, the correction to the answer is accumulated
!               here, so that the right-preconditioning may be applied
!               at the end
!-----------------------------------------------------------------------
!     external routines used
!
      real*8 ddot
      logical stopbis, brkdn
      external ddot, stopbis, brkdn
!
      real*8 one
      parameter(one=1.0D0)
!
!     local variables
!
      integer i
      real*8 alpha,beta,rho,omega
      logical lp, rp
      save lp, rp
!
!     where to go
!
      if (ipar(1).gt.0) then
         !!goto (10, 20, 40, 50, 60, 70, 80, 90, 100, 110) ipar(10)     
        SELECT CASE (ipar(10))
          CASE (1)
            GOTO 10
          CASE (2)
            GOTO 20
          CASE (3)
            GOTO 40
          CASE (4)
            GOTO 50
          CASE (5)
            GOTO 60
          CASE (6)
            GOTO 70
          CASE (7)
            GOTO 80
          CASE (8)
            GOTO 90
          CASE (9)
            GOTO 100
          CASE (10)
            GOTO 110
        END SELECT
      else if (ipar(1).lt.0) then
         goto 900
      endif
!
!     call the initialization routine
!
      call bisinit(ipar,fpar,8*n,1,lp,rp,w)
      if (ipar(1).lt.0) return
!
!     perform a matvec to compute the initial residual
!
      ipar(1) = 1
      ipar(8) = 1
      ipar(9) = 1 + n
      do i = 1, n
         w(i,1) = sol(i)
      enddo
      ipar(10) = 1
      return
 10   ipar(7) = ipar(7) + 1
      ipar(13) = ipar(13) + 1
      do i = 1, n
         w(i,1) = rhs(i) - w(i,2)
      enddo
      fpar(11) = fpar(11) + n
      if (lp) then
         ipar(1) = 3
         ipar(10) = 2
         return
      endif
!
 20   if (lp) then
         do i = 1, n
            w(i,1) = w(i,2)
            w(i,6) = w(i,2)
         enddo
      else
         do i = 1, n
            w(i,2) = w(i,1)
            w(i,6) = w(i,1)
         enddo
      endif
!
      fpar(7) = ddot(n,w,w)
      fpar(11) = fpar(11) + 2 * n
      fpar(5) = sqrt(fpar(7))
      fpar(3) = fpar(5)
      if (abs(ipar(3)).eq.2) then
         fpar(4) = fpar(1) * sqrt(ddot(n,rhs,rhs)) + fpar(2)
         fpar(11) = fpar(11) + 2 * n
      else if (ipar(3).ne.999) then
         fpar(4) = fpar(1) * fpar(3) + fpar(2)
      endif
      if (ipar(3).ge.0) fpar(6) = fpar(5)
      if (ipar(3).ge.0 .and. fpar(5).le.fpar(4) .and. ipar(3).ne.999) then
         goto 900
      endif
!
!     beginning of the iterations
!
!     Step (1), v = A p
 30   if (rp) then
         ipar(1) = 5
         ipar(8) = 5*n+1
         if (lp) then
            ipar(9) = 4*n + 1
         else
            ipar(9) = 6*n + 1
         endif
         ipar(10) = 3
         return
      endif
!
 40   ipar(1) = 1
      if (rp) then
         ipar(8) = ipar(9)
      else
         ipar(8) = 5*n+1
      endif
      if (lp) then
         ipar(9) = 6*n + 1
      else
         ipar(9) = 4*n + 1
      endif
      ipar(10) = 4
      return
 50   if (lp) then
         ipar(1) = 3
         ipar(8) = ipar(9)
         ipar(9) = 4*n + 1
         ipar(10) = 5
         return
      endif
!
 60   ipar(7) = ipar(7) + 1
!
!     step (2)
      alpha = ddot(n,w(1,1),w(1,5))
      fpar(11) = fpar(11) + 2 * n
      if (brkdn(alpha, ipar)) goto 900
      alpha = fpar(7) / alpha
      fpar(8) = alpha
!
!     step (3)
      do i = 1, n
         w(i,3) = w(i,2) - alpha * w(i,5)
      enddo
      fpar(11) = fpar(11) + 2 * n
!
!     Step (4): the second matvec -- t = A s
!
      if (rp) then
         ipar(1) = 5
         ipar(8) = n+n+1
         if (lp) then
            ipar(9) = ipar(8)+n
         else
            ipar(9) = 6*n + 1
         endif
         ipar(10) = 6
         return
      endif
!
 70   ipar(1) = 1
      if (rp) then
         ipar(8) = ipar(9)
      else
         ipar(8) = n+n+1
      endif
      if (lp) then
         ipar(9) = 6*n + 1
      else
         ipar(9) = 3*n + 1
      endif
      ipar(10) = 7
      return
 80   if (lp) then
         ipar(1) = 3
         ipar(8) = ipar(9)
         ipar(9) = 3*n + 1
         ipar(10) = 8
         return
      endif
 90   ipar(7) = ipar(7) + 1
!
!     step (5)
      omega = ddot(n,w(1,4),w(1,4))
      fpar(11) = fpar(11) + n + n
      if (brkdn(omega,ipar)) goto 900
      omega = ddot(n,w(1,4),w(1,3)) / omega
      fpar(11) = fpar(11) + n + n
      if (brkdn(omega,ipar)) goto 900
      fpar(9) = omega
      alpha = fpar(8)
!
!     step (6) and (7)
      do i = 1, n
         w(i,7) = alpha * w(i,6) + omega * w(i,3)
         w(i,8) = w(i,8) + w(i,7)
         w(i,2) = w(i,3) - omega * w(i,4)
      enddo
      fpar(11) = fpar(11) + 6 * n + 1
!
!     convergence test
      if (ipar(3).eq.999) then
         ipar(1) = 10
         ipar(8) = 7*n + 1
         ipar(9) = 6*n + 1
         ipar(10) = 9
         return
      endif
      if (stopbis(n,ipar,2,fpar,w(1,2),w(1,7),one))  goto 900
 100  if (ipar(3).eq.999.and.ipar(11).eq.1) goto 900
!
!     step (8): computing new p and rho
!
      rho = fpar(7)
      fpar(7) = ddot(n,w(1,2),w(1,1))
      omega = fpar(9)
      beta = fpar(7) * fpar(8) / (fpar(9) * rho)
      do i = 1, n
         w(i,6) = w(i,2) + beta * (w(i,6) - omega * w(i,5))
      enddo
      fpar(11) = fpar(11) + 6 * n + 3
      if (brkdn(fpar(7),ipar)) goto 900
!
!     end of an iteration
!
      goto 30
!
!     some clean up job to do
!
 900  if (rp) then
         if (ipar(1).lt.0) ipar(12) = ipar(1)
         ipar(1) = 5
         ipar(8) = 7*n + 1
         ipar(9) = ipar(8) - n
         ipar(10) = 10
         return
      endif

 110  if (rp) then
         call tidycg(n,ipar,fpar,sol,w(1,7))
      else
         call tidycg(n,ipar,fpar,sol,w(1,8))
      endif
!
      return
!-----end-of-bcgstab
      end
!-----------------------------------------------------------------------
      subroutine implu(np,umm,beta,ypiv,u,permut,full)
      real*8 umm,beta,ypiv(*),u(*),x, xpiv
      logical full, perm, permut(*)
      integer np,k,npm1
!-----------------------------------------------------------------------
!     performs implicitly one step of the lu factorization of a
!     banded hessenberg matrix.
!-----------------------------------------------------------------------
      if (np .le. 1) goto 12
      npm1 = np - 1
!
!     -- perform  previous step of the factorization-
!
      do 6 k=1,npm1
         if (.not. permut(k)) goto 5
         x=u(k)
         u(k) = u(k+1)
         u(k+1) = x
 5       u(k+1) = u(k+1) - ypiv(k)*u(k)
 6    continue
!-----------------------------------------------------------------------
!     now determine pivotal information to be used in the next call
!-----------------------------------------------------------------------
 12   umm = u(np)
      perm = (beta .gt. abs(umm))
      if (.not. perm) goto 4
      xpiv = umm / beta
      u(np) = beta
      goto 8
 4    xpiv = beta/umm
 8    permut(np) = perm
      ypiv(np) = xpiv
      if (.not. full) return
!     shift everything up if full...
      do 7 k=1,npm1
         ypiv(k) = ypiv(k+1)
         permut(k) = permut(k+1)
 7    continue
      return
!-----end-of-implu
      end
!-----------------------------------------------------------------------
      subroutine uppdir(n,p,np,lbp,indp,y,u,usav,flops)
      implicit none

      integer   :: k,np,n,npm1,j,ju,indp,lbp
      real*8    :: p(n,lbp), y(*), u(*), usav(*), x, flops
      
!-----------------------------------------------------------------------
!     updates the conjugate directions p given the upper part of the
!     banded upper triangular matrix u.  u contains the non zero
!     elements of the column of the triangular matrix..
!-----------------------------------------------------------------------
      real*8 zero
      parameter(zero=0.0D0)
!
      npm1=np-1
      if (np .le. 1) goto 12
      j=indp
      ju = npm1
 10   if (j .le. 0) j=lbp
      x = u(ju) /usav(j)
      if (x .eq. zero) goto 115
      do 11 k=1,n
         y(k) = y(k) - x*p(k,j)
 11   continue
      flops = flops + 2*n
 115  j = j-1
      ju = ju -1
      if (ju .ge. 1) goto 10
 12   indp = indp + 1
      if (indp .gt. lbp) indp = 1
      usav(indp) = u(np)
      do 13 k=1,n
         p(k,indp) = y(k)
 13   continue
      return
!-----------------------------------------------------------------------
!-------end-of-uppdir---------------------------------------------------
      end

      subroutine givens(x,y,c,s)
       implicit none

      real*8   :: x,y,c,s
!-----------------------------------------------------------------------
!     Given x and y, this subroutine generates a Givens' rotation c, s.
!     And apply the rotation on (x,y) ==> (sqrt(x**2 + y**2), 0).
!     (See P 202 of "matrix computation" by Golub and van Loan.)
!-----------------------------------------------------------------------
      real*8   :: t,one,zero
      parameter (zero=0.0D0,one=1.0D0)
!
      if (x.eq.zero .and. y.eq.zero) then
         c = one
         s = zero
      else if (abs(y).gt.abs(x)) then
         t = x / y
         x = sqrt(one+t*t)
         s = sign(one / x, y)
         c = t*s
      else if (abs(y).le.abs(x)) then
         t = y / x
         y = sqrt(one+t*t)
         c = sign(one / y, x)
         s = t*c
      else
!
!     X or Y must be an invalid floating-point number, set both to zero
!
         x = zero
         y = zero
         c = one
         s = zero
      endif
      x = abs(x*y)
!
!     end of givens
!
      return
      end
!-----end-of-givens
!-----------------------------------------------------------------------
      logical function stopbis(n,ipar,mvpi,fpar,r,delx,sx)
      implicit none
      integer n,mvpi,ipar(16)
      real*8 fpar(16), r(n), delx(n), sx, ddot
      external ddot
!-----------------------------------------------------------------------
!     function for determining the stopping criteria. return value of
!     true if the stopbis criteria is satisfied.
!-----------------------------------------------------------------------
      if (ipar(11) .eq. 1) then
         stopbis = .true.
      else
         stopbis = .false.
      endif
      if (ipar(6).gt.0 .and. ipar(7).ge.ipar(6)) then
         ipar(1) = -1
         stopbis = .true.
      endif
      if (stopbis) return
!
!     computes errors
!
      fpar(5) = sqrt(ddot(n,r,r))
      fpar(11) = fpar(11) + 2 * n
      if (ipar(3).lt.0) then
!
!     compute the change in the solution vector
!
         fpar(6) = sx * sqrt(ddot(n,delx,delx))
         fpar(11) = fpar(11) + 2 * n
         if (ipar(7).lt.mvpi+mvpi+1) then
!
!     if this is the end of the first iteration, set fpar(3:4)
!
            fpar(3) = fpar(6)
            if (ipar(3).eq.-1) then
               fpar(4) = fpar(1) * fpar(3) + fpar(2)
            endif
         endif
      else
         fpar(6) = fpar(5)
      endif
!
!     .. the test is struct this way so that when the value in fpar(6)
!       is not a valid number, STOPBIS is set to .true.
!
      if (fpar(6).gt.fpar(4)) then
         stopbis = .false.
         ipar(11) = 0
      else
         stopbis = .true.
         ipar(11) = 1
      endif
!
      return
      end
!-----end-of-stopbis
!-----------------------------------------------------------------------
      subroutine tidycg(n,ipar,fpar,sol,delx)
      implicit none
      integer i,n,ipar(16)
      real*8 fpar(16),sol(n),delx(n)
!-----------------------------------------------------------------------
!     Some common operations required before terminating the CG routines
!-----------------------------------------------------------------------
      real*8 zero
      parameter(zero=0.0D0)
!
      if (ipar(12).ne.0) then
         ipar(1) = ipar(12) 
      else if (ipar(1).gt.0) then
         if ((ipar(3).eq.999 .and. ipar(11).eq.1) .or. &
     &        fpar(6).le.fpar(4)) then
            ipar(1) = 0
         else if (ipar(7).ge.ipar(6) .and. ipar(6).gt.0) then
            ipar(1) = -1
         else
            ipar(1) = -10
         endif
      endif
      if (fpar(3).gt.zero .and. fpar(6).gt.zero .and. ipar(7).gt.ipar(13)) then 
        fpar(7) = log10(fpar(3) / fpar(6)) / dble(ipar(7)-ipar(13))
      else
         fpar(7) = zero
      endif
      do i = 1, n
         sol(i) = sol(i) + delx(i)
      enddo
      return
      end
!-----end-of-tidycg
!-----------------------------------------------------------------------
      logical function brkdn(alpha, ipar)
      implicit none
      integer ipar(16)
      real*8 alpha, beta, zero, one
      parameter (zero=0.0D0, one=1.0D0)
!-----------------------------------------------------------------------
!     test whether alpha is zero or an abnormal number, if yes,
!     this routine will return .true.
!
!     If alpha == 0, ipar(1) = -3,
!     if alpha is an abnormal number, ipar(1) = -9.
!-----------------------------------------------------------------------
      brkdn = .false.
      if (alpha.gt.zero) then
         beta = one / alpha
         if (.not. beta.gt.zero) then
            brkdn = .true.
            ipar(1) = -9
         endif
      else if (alpha.lt.zero) then
         beta = one / alpha
         if (.not. beta.lt.zero) then
            brkdn = .true.
            ipar(1) = -9
         endif
      else if (alpha.eq.zero) then
         brkdn = .true.
         ipar(1) = -3
      else
         brkdn = .true.
         ipar(1) = -9
      endif
      return
      end
!-----end-of-brkdn
!-----------------------------------------------------------------------
      subroutine bisinit(ipar,fpar,wksize,dsc,lp,rp,wk)
      implicit none
      integer i,ipar(16),wksize,dsc
      logical lp,rp
      real*8  fpar(16),wk(*)
!-----------------------------------------------------------------------
!     some common initializations for the iterative solvers
!-----------------------------------------------------------------------
      real*8 zero, one
      parameter(zero=0.0D0, one=1.0D0)
!
!     ipar(1) = -2 inidcate that there are not enough space in the work
!     array
!
      if (ipar(4).lt.wksize) then
         ipar(1) = -2
         ipar(4) = wksize
         return
      endif
!
      if (ipar(2).gt.2) then
         lp = .true.
         rp = .true.
      else if (ipar(2).eq.2) then
         lp = .false.
         rp = .true.
      else if (ipar(2).eq.1) then
         lp = .true.
         rp = .false.
      else
         lp = .false.
         rp = .false.
      endif
      if (ipar(3).eq.0) ipar(3) = dsc
!     .. clear the ipar elements used
      ipar(7) = 0
      ipar(8) = 0
      ipar(9) = 0
      ipar(10) = 0
      ipar(11) = 0
      ipar(12) = 0
      ipar(13) = 0
!
!     fpar(1) must be between (0, 1), fpar(2) must be positive,
!     fpar(1) and fpar(2) can NOT both be zero
!     Normally return ipar(1) = -4 to indicate any of above error
!
      if (fpar(1).lt.zero .or. fpar(1).ge.one .or. fpar(2).lt.zero .or. &
     &     (fpar(1).eq.zero .and. fpar(2).eq.zero)) then
         if (ipar(1).eq.0) then
            ipar(1) = -4
            return
         else
            fpar(1) = 1.0D-6
            fpar(2) = 1.0D-16
         endif
      endif
!     .. clear the fpar elements
      do i = 3, 10
         fpar(i) = zero
      enddo
      if (fpar(11).lt.zero) fpar(11) = zero
!     .. clear the used portion of the work array to zero
      do i = 1, wksize
         wk(i) = zero
      enddo
!
      return
!-----end-of-bisinit
      end
!-----------------------------------------------------------------------
      subroutine mgsro(full,lda,n,m,ind,ops,vec,hh,ierr)
      implicit none
      logical full
      integer lda,m,n,ind,ierr
      real*8  ops,hh(m),vec(lda,m)
!-----------------------------------------------------------------------
!     MGSRO  -- Modified Gram-Schmidt procedure with Selective Re-
!               Orthogonalization
!     The ind'th vector of VEC is orthogonalized against the rest of
!     the vectors.
!
!     The test for performing re-orthogonalization is performed for
!     each indivadual vectors. If the cosine between the two vectors
!     is greater than 0.99 (REORTH = 0.99**2), re-orthogonalization is
!     performed. The norm of the 'new' vector is kept in variable NRM0,
!     and updated after operating with each vector.
!
!     full   -- .ture. if it is necessary to orthogonalize the ind'th
!               against all the vectors vec(:,1:ind-1), vec(:,ind+2:m)
!               .false. only orthogonalize againt vec(:,1:ind-1)
!     lda    -- the leading dimension of VEC
!     n      -- length of the vector in VEC
!     m      -- number of vectors can be stored in VEC
!     ind    -- index to the vector to be changed
!     ops    -- operation counts
!     vec    -- vector of LDA X M storing the vectors
!     hh     -- coefficient of the orthogonalization
!     ierr   -- error code
!               0 : successful return
!               -1: zero input vector
!               -2: input vector contains abnormal numbers
!               -3: input vector is a linear combination of others
!
!     External routines used: real*8 ddot
!-----------------------------------------------------------------------
      integer i,k
      real*8  nrm0, nrm1, fct, thr, ddot, zero, one, reorth
      parameter (zero=0.0D0, one=1.0D0, reorth=0.98D0)
      external ddot
!
!     compute the norm of the input vector
!
      nrm0 = ddot(n,vec(1,ind),vec(1,ind))
      ops = ops + n + n
      thr = nrm0 * reorth
      if (nrm0.le.zero) then
         ierr = - 1
         return
      else if (nrm0.gt.zero .and. one/nrm0.gt.zero) then
         ierr = 0
      else
         ierr = -2
         return
      endif
!
!     Modified Gram-Schmidt loop
!
      if (full) then
         do 40 i = ind+1, m
            fct = ddot(n,vec(1,ind),vec(1,i))
            hh(i) = fct
            do 20 k = 1, n
               vec(k,ind) = vec(k,ind) - fct * vec(k,i)
 20         continue
            ops = ops + 4 * n + 2
            if (fct*fct.gt.thr) then
               fct = ddot(n,vec(1,ind),vec(1,i))
               hh(i) = hh(i) + fct
               do 30 k = 1, n
                  vec(k,ind) = vec(k,ind) - fct * vec(k,i)
 30            continue
               ops = ops + 4*n + 1
            endif
            nrm0 = nrm0 - hh(i) * hh(i)
            if (nrm0.lt.zero) nrm0 = zero
            thr = nrm0 * reorth
 40      continue
      endif
!
      do 70 i = 1, ind-1
         fct = ddot(n,vec(1,ind),vec(1,i))
         hh(i) = fct
         do 50 k = 1, n
            vec(k,ind) = vec(k,ind) - fct * vec(k,i)
 50      continue
         ops = ops + 4 * n + 2
         if (fct*fct.gt.thr) then
            fct = ddot(n,vec(1,ind),vec(1,i))
            hh(i) = hh(i) + fct
            do 60 k = 1, n
               vec(k,ind) = vec(k,ind) - fct * vec(k,i)
 60         continue
            ops = ops + 4*n + 1
         endif
         nrm0 = nrm0 - hh(i) * hh(i)
         if (nrm0.lt.zero) nrm0 = zero
         thr = nrm0 * reorth
 70   continue
!
!     test the resulting vector
!
      nrm1 = sqrt(ddot(n,vec(1,ind),vec(1,ind)))
      ops = ops + n + n
      hh(ind) = nrm1    ! statement label 75 
      if (nrm1.le.zero) then
         ierr = -3
         return
      endif
!
!     scale the resulting vector
!
      fct = one / nrm1
      do 80 k = 1, n
         vec(k,ind) = vec(k,ind) * fct
 80   continue
      ops = ops + n + 1
!
!     normal return
!
      ierr = 0
      return
!     end surbotine mgsro
      end
!----------------------------------------------------------------------c
!                          S P A R S K I T                             c
!----------------------------------------------------------------------c
!          BASIC MATRIX-VECTOR OPERATIONS - MATVEC MODULE              c
!         Matrix-vector Mulitiplications and Triang. Solves            c
!----------------------------------------------------------------------c
! contents: (as of Nov 18, 1991)                                       c
!----------                                                            c
! 1) Matrix-vector products:                                           c
!---------------------------                                           c
! amux  : A times a vector. Compressed Sparse Row (CSR) format.        c
! amuxms: A times a vector. Modified Compress Sparse Row format.       c
! atmux : Transp(A) times a vector. CSR format.                        c
! atmuxr: Transp(A) times a vector. CSR format. A rectangular.         c
! amuxe : A times a vector. Ellpack/Itpack (ELL) format.               c
! amuxd : A times a vector. Diagonal (DIA) format.                     c
! amuxj : A times a vector. Jagged Diagonal (JAD) format.              c
! vbrmv : Sparse matrix-full vector product, in VBR format             c
!                                                                      c
! 2) Triangular system solutions:                                      c
!-------------------------------                                       c
! lsol  : Unit Lower Triang. solve. Compressed Sparse Row (CSR) format.c
! ldsol : Lower Triang. solve.  Modified Sparse Row (MSR) format.      c
! lsolc : Unit Lower Triang. solve. Comp. Sparse Column (CSC) format.  c
! ldsolc: Lower Triang. solve. Modified Sparse Column (MSC) format.    c
! ldsoll: Lower Triang. solve with level scheduling. MSR format.       c
! usol  : Unit Upper Triang. solve. Compressed Sparse Row (CSR) format.c
! udsol : Upper Triang. solve.  Modified Sparse Row (MSR) format.      c
! usolc : Unit Upper Triang. solve. Comp. Sparse Column (CSC) format.  c
! udsolc: Upper Triang. solve.  Modified Sparse Column (MSC) format.   c
!----------------------------------------------------------------------c
! 1)     M A T R I X    B Y    V E C T O R     P R O D U C T S         c
!----------------------------------------------------------------------c
      subroutine amux (n, x, y, a,ja,ia) 
      real*8  x(*), y(*), a(*) 
      integer n, ja(*), ia(*)
!-----------------------------------------------------------------------
!         A times a vector
!----------------------------------------------------------------------- 
! multiplies a matrix by a vector using the dot product form
! Matrix A is stored in compressed sparse row storage.
!
! on entry:
!----------
! n     = row dimension of A
! x     = real array of length equal to the column dimension of
!         the A matrix.
! a, ja,
!    ia = input matrix in compressed sparse row format.
!
! on return:
!-----------
! y     = real array of length n, containing the product y=Ax
!
!-----------------------------------------------------------------------
! local variables
!
      real*8 t
      integer i, k
!-----------------------------------------------------------------------
      do 100 i = 1,n
!
!     compute the inner product of row i with vector x
! 
         t = 0.0d0
         do 99 k=ia(i), ia(i+1)-1
            t = t + a(k)*x(ja(k))
 99      continue
!
!     store result in y(i) 
!
         y(i) = t
 100  continue
!
      return
!---------end-of-amux---------------------------------------------------
!-----------------------------------------------------------------------
      end
!-----------------------------------------------------------------------
      subroutine amuxms (n, x, y, a,ja)
      real*8  x(*), y(*), a(*)
      integer n, ja(*)
!-----------------------------------------------------------------------
!         A times a vector in MSR format
!-----------------------------------------------------------------------
! multiplies a matrix by a vector using the dot product form
! Matrix A is stored in Modified Sparse Row storage.
!
! on entry:
!----------
! n     = row dimension of A
! x     = real array of length equal to the column dimension of
!         the A matrix.
! a, ja,= input matrix in modified compressed sparse row format.
!
! on return:
!-----------
! y     = real array of length n, containing the product y=Ax
!
!-----------------------------------------------------------------------
! local variables
!
      integer i, k
!-----------------------------------------------------------------------
        do 10 i=1, n
        y(i) = a(i)*x(i)
 10     continue
      do 100 i = 1,n
!
!     compute the inner product of row i with vector x
!
         do 99 k=ja(i), ja(i+1)-1
            y(i) = y(i) + a(k) *x(ja(k))
 99      continue
 100  continue
!
      return
!---------end-of-amuxm--------------------------------------------------
!-----------------------------------------------------------------------
      end
!-----------------------------------------------------------------------
      subroutine atmux (n, x, y, a, ja, ia)
      real*8 x(*), y(*), a(*) 
      integer n, ia(*), ja(*)
!-----------------------------------------------------------------------
!         transp( A ) times a vector
!----------------------------------------------------------------------- 
! multiplies the transpose of a matrix by a vector when the original
! matrix is stored in compressed sparse row storage. Can also be
! viewed as the product of a matrix by a vector when the original
! matrix is stored in the compressed sparse column format.
!-----------------------------------------------------------------------
!
! on entry:
!----------
! n     = row dimension of A
! x     = real array of length equal to the column dimension of
!         the A matrix.
! a, ja,
!    ia = input matrix in compressed sparse row format.
!
! on return:
!-----------
! y     = real array of length n, containing the product y=transp(A)*x
!
!-----------------------------------------------------------------------
!     local variables 
!
      integer i, k 
!-----------------------------------------------------------------------
!
!     zero out output vector
! 
      do 1 i=1,n
         y(i) = 0.0
 1    continue
!
! loop over the rows
!
      do 100 i = 1,n
         do 99 k=ia(i), ia(i+1)-1 
            y(ja(k)) = y(ja(k)) + x(i)*a(k)
 99      continue
 100  continue
!
      return
!-------------end-of-atmux---------------------------------------------- 
!-----------------------------------------------------------------------
      end
!----------------------------------------------------------------------- 
      subroutine atmuxr (m, n, x, y, a, ja, ia)
      real*8 x(*), y(*), a(*) 
      integer m, n, ia(*), ja(*)
!-----------------------------------------------------------------------
!         transp( A ) times a vector, A can be rectangular
!----------------------------------------------------------------------- 
! See also atmux.  The essential difference is how the solution vector
! is initially zeroed.  If using this to multiply rectangular CSC 
! matrices by a vector, m number of rows, n is number of columns.
!-----------------------------------------------------------------------
!
! on entry:
!----------
! m     = column dimension of A
! n     = row dimension of A
! x     = real array of length equal to the column dimension of
!         the A matrix.
! a, ja,
!    ia = input matrix in compressed sparse row format.
!
! on return:
!-----------
! y     = real array of length n, containing the product y=transp(A)*x
!
!-----------------------------------------------------------------------
!     local variables 
!
      integer i, k 
!-----------------------------------------------------------------------
!
!     zero out output vector
! 
      do 1 i=1,m
         y(i) = 0.0
 1    continue
!
! loop over the rows
!
      do 100 i = 1,n
         do 99 k=ia(i), ia(i+1)-1 
            y(ja(k)) = y(ja(k)) + x(i)*a(k)
 99      continue
 100  continue
!
      return
!-------------end-of-atmuxr--------------------------------------------- 
!-----------------------------------------------------------------------
      end
!----------------------------------------------------------------------- 
      subroutine amuxe (n,x,y,na,ncol,a,ja)
        implicit none

      integer     :: n, na, ncol, ja(na,*)
      real*8      :: x(n), y(n), a(na,*)
      
!-----------------------------------------------------------------------
!        A times a vector in Ellpack Itpack format (ELL)               
!----------------------------------------------------------------------- 
! multiplies a matrix by a vector when the original matrix is stored 
! in the ellpack-itpack sparse format.
!-----------------------------------------------------------------------
!
! on entry:
!----------
! n     = row dimension of A
! x     = real array of length equal to the column dimension of
!         the A matrix.
! na    = integer. The first dimension of arrays a and ja
!         as declared by the calling program.
! ncol  = integer. The number of active columns in array a.
!         (i.e., the number of generalized diagonals in matrix.)
! a, ja = the real and integer arrays of the itpack format
!         (a(i,k),k=1,ncol contains the elements of row i in matrix
!          ja(i,k),k=1,ncol contains their column numbers) 
!
! on return:
!-----------
! y     = real array of length n, containing the product y=y=A*x
!
!-----------------------------------------------------------------------
! local variables
!
      integer i, j 
!-----------------------------------------------------------------------
      do 1 i=1, n
         y(i) = 0.0 
 1    continue
      do 10 j=1,ncol
         do 25 i = 1,n
            y(i) = y(i)+a(i,j)*x(ja(i,j))
 25      continue
 10   continue
!
      return
!--------end-of-amuxe--------------------------------------------------- 
!-----------------------------------------------------------------------
      end
!-----------------------------------------------------------------------
      subroutine amuxd (n,x,y,diag,ndiag,idiag,ioff) 
      integer n, ndiag, idiag, ioff(idiag) 
      real*8 x(n), y(n), diag(ndiag,idiag)
!-----------------------------------------------------------------------
!        A times a vector in Diagonal storage format (DIA) 
!----------------------------------------------------------------------- 
! multiplies a matrix by a vector when the original matrix is stored 
! in the diagonal storage format.
!-----------------------------------------------------------------------
!
! on entry:
!----------
! n     = row dimension of A
! x     = real array of length equal to the column dimension of
!         the A matrix.
! ndiag  = integer. The first dimension of array adiag as declared in
!         the calling program.
! idiag  = integer. The number of diagonals in the matrix.
! diag   = real array containing the diagonals stored of A.
! idiag  = number of diagonals in matrix.
! diag   = real array of size (ndiag x idiag) containing the diagonals
!          
! ioff   = integer array of length idiag, containing the offsets of the
!          diagonals of the matrix:
!          diag(i,k) contains the element a(i,i+ioff(k)) of the matrix.
!
! on return:
!-----------
! y     = real array of length n, containing the product y=A*x
!
!-----------------------------------------------------------------------
!       local variables 
!
      integer j, k, io, i1, i2 
!-----------------------------------------------------------------------
      do 1 j=1, n
         y(j) = 0.0d0
 1    continue
      do 10 j=1, idiag
         io = ioff(j)
         i1 = max0(1,1-io)
         i2 = min0(n,n-io)
         do 9 k=i1, i2  
            y(k) = y(k)+diag(k,j)*x(k+io)
 9       continue
 10   continue
! 
      return
!----------end-of-amuxd-------------------------------------------------
!-----------------------------------------------------------------------
      end
!-----------------------------------------------------------------------
      subroutine amuxj (n, x, y, jdiag, a, ja, ia)
      integer n, jdiag, ja(*), ia(*)
      real*8 x(n), y(n), a(*)
!-----------------------------------------------------------------------
!        A times a vector in Jagged-Diagonal storage format (JAD) 
!----------------------------------------------------------------------- 
! multiplies a matrix by a vector when the original matrix is stored 
! in the jagged diagonal storage format.
!-----------------------------------------------------------------------
!
! on entry:
!----------
! n      = row dimension of A
! x      = real array of length equal to the column dimension of
!         the A matrix.
! jdiag  = integer. The number of jadded-diagonals in the data-structure.
! a      = real array containing the jadded diagonals of A stored
!          in succession (in decreasing lengths) 
! j      = integer array containing the colum indices of the 
!          corresponding elements in a.
! ia     = integer array containing the lengths of the  jagged diagonals
!
! on return:
!-----------
! y      = real array of length n, containing the product y=A*x
!
! Note:
!------- 
! Permutation related to the JAD format is not performed.
! this can be done by:
!     call permvec (n,y,y,iperm) 
! after the call to amuxj, where iperm is the permutation produced
! by csrjad.
!-----------------------------------------------------------------------
! local variables 
!
      integer i, ii, k1, ilen, j 
!-----------------------------------------------------------------------
      do 1 i=1, n
         y(i) = 0.0d0
 1    continue
      do 70 ii=1, jdiag
         k1 = ia(ii)-1
         ilen = ia(ii+1)-k1-1
         do 60 j=1,ilen
            y(j)= y(j)+a(k1+j)*x(ja(k1+j)) 
 60      continue
 70   continue
!
      return
!----------end-of-amuxj------------------------------------------------- 
!-----------------------------------------------------------------------
      end
!-----------------------------------------------------------------------
      subroutine vbrmv(nr, nc, ia, ja, ka, a, kvstr, kvstc, x, b)
!-----------------------------------------------------------------------
      integer nr, nc, ia(nr+1), ja(*), ka(*), kvstr(nr+1), kvstc(*)
      real*8  a(*), x(*), b(*)
!-----------------------------------------------------------------------
!     Sparse matrix-full vector product, in VBR format.
!-----------------------------------------------------------------------
!     On entry:
!--------------
!     nr, nc  = number of block rows and columns in matrix A
!     ia,ja,ka,a,kvstr,kvstc = matrix A in variable block row format
!     x       = multiplier vector in full format
!
!     On return:
!---------------
!     b = product of matrix A times vector x in full format
!
!     Algorithm:
!---------------
!     Perform multiplication by traversing a in order.
!
!-----------------------------------------------------------------------
!-----local variables
      integer n, i, j, ii, jj, k, istart, istop
      real*8  xjj
!---------------------------------
      n = kvstc(nc+1)-1
      do i = 1, n
         b(i) = 0.d0
      enddo
!---------------------------------
      k = 1
      do i = 1, nr
         istart = kvstr(i)
         istop  = kvstr(i+1)-1
         do j = ia(i), ia(i+1)-1
            do jj = kvstc(ja(j)), kvstc(ja(j)+1)-1
               xjj = x(jj)
               do ii = istart, istop
                  b(ii) = b(ii) + xjj*a(k)
                  k = k + 1
               enddo
            enddo
         enddo
      enddo
!---------------------------------
      return
      end
!-----------------------------------------------------------------------
!----------------------end-of-vbrmv-------------------------------------
!-----------------------------------------------------------------------
!----------------------------------------------------------------------c
! 2)     T R I A N G U L A R    S Y S T E M    S O L U T I O N S       c
!----------------------------------------------------------------------c
      subroutine lsol (n,x,y,al,jal,ial)
      integer n, jal(*),ial(n+1) 
      real*8  x(n), y(n), al(*) 
!-----------------------------------------------------------------------
!   solves    L x = y ; L = lower unit triang. /  CSR format
!----------------------------------------------------------------------- 
! solves a unit lower triangular system by standard (sequential )
! forward elimination - matrix stored in CSR format. 
!-----------------------------------------------------------------------
!
! On entry:
!---------- 
! n      = integer. dimension of problem.
! y      = real array containg the right side.
!
! al,
! jal,
! ial,    = Lower triangular matrix stored in compressed sparse row
!          format. 
!
! On return:
!----------- 
!          x  = The solution of  L x  = y.
!--------------------------------------------------------------------
! local variables 
!
      integer k, j 
      real*8  t
!-----------------------------------------------------------------------
      x(1) = y(1) 
      do 150 k = 2, n
         t = y(k) 
         do 100 j = ial(k), ial(k+1)-1
            t = t-al(j)*x(jal(j))
 100     continue
         x(k) = t 
 150  continue
!
      return
!----------end-of-lsol-------------------------------------------------- 
!-----------------------------------------------------------------------
      end
!----------------------------------------------------------------------- 
      subroutine ldsol (n,x,y,al,jal) 
      integer n, jal(*) 
      real*8 x(n), y(n), al(*) 
!----------------------------------------------------------------------- 
!     Solves L x = y    L = triangular. MSR format 
!-----------------------------------------------------------------------
! solves a (non-unit) lower triangular system by standard (sequential) 
! forward elimination - matrix stored in MSR format 
! with diagonal elements already inverted (otherwise do inversion,
! al(1:n) = 1.0/al(1:n),  before calling ldsol).
!-----------------------------------------------------------------------
!
! On entry:
!---------- 
! n      = integer. dimension of problem.
! y      = real array containg the right hand side.
!
! al,
! jal,   = Lower triangular matrix stored in Modified Sparse Row 
!          format. 
!
! On return:
!----------- 
!        x = The solution of  L x = y .
!--------------------------------------------------------------------
! local variables 
!
      integer k, j 
      real*8 t 
!-----------------------------------------------------------------------
      x(1) = y(1)*al(1) 
      do 150 k = 2, n
         t = y(k) 
         do 100 j = jal(k), jal(k+1)-1
            t = t - al(j)*x(jal(j))
 100     continue
         x(k) = al(k)*t 
 150  continue
      return
!----------end-of-ldsol-------------------------------------------------
!-----------------------------------------------------------------------
      end
!-----------------------------------------------------------------------
      subroutine lsolc (n,x,y,al,jal,ial)
      integer n, jal(*),ial(*) 
      real*8  x(n), y(n), al(*) 
!-----------------------------------------------------------------------
!       SOLVES     L x = y ;    where L = unit lower trang. CSC format
!-----------------------------------------------------------------------
! solves a unit lower triangular system by standard (sequential )
! forward elimination - matrix stored in CSC format. 
!-----------------------------------------------------------------------
!
! On entry:
!---------- 
! n      = integer. dimension of problem.
! y      = real*8 array containg the right side.
!
! al,
! jal,
! ial,    = Lower triangular matrix stored in compressed sparse column 
!          format. 
!
! On return:
!----------- 
!         x  = The solution of  L x  = y.
!-----------------------------------------------------------------------
! local variables 
!
      integer k, j
      real*8 t
!-----------------------------------------------------------------------
      do 140 k=1,n
         x(k) = y(k) 
 140  continue
      do 150 k = 1, n-1
         t = x(k) 
         do 100 j = ial(k), ial(k+1)-1
            x(jal(j)) = x(jal(j)) - t*al(j) 
 100     continue
 150  continue
!
      return
!----------end-of-lsolc------------------------------------------------- 
!-----------------------------------------------------------------------
      end
!-----------------------------------------------------------------------
      subroutine ldsolc (n,x,y,al,jal) 
      integer n, jal(*)
      real*8 x(n), y(n), al(*)
!-----------------------------------------------------------------------
!    Solves     L x = y ;    L = nonunit Low. Triang. MSC format 
!----------------------------------------------------------------------- 
! solves a (non-unit) lower triangular system by standard (sequential) 
! forward elimination - matrix stored in Modified Sparse Column format 
! with diagonal elements already inverted (otherwise do inversion,
! al(1:n) = 1.0/al(1:n),  before calling ldsol).
!-----------------------------------------------------------------------
!
! On entry:
!---------- 
! n      = integer. dimension of problem.
! y      = real array containg the right hand side.
!
! al,
! jal,
! ial,    = Lower triangular matrix stored in Modified Sparse Column
!           format.
!
! On return:
!----------- 
!         x = The solution of  L x = y .
!--------------------------------------------------------------------
! local variables
!
      integer k, j
      real*8 t 
!-----------------------------------------------------------------------
      do 140 k=1,n
         x(k) = y(k) 
 140  continue
      do 150 k = 1, n 
         x(k) = x(k)*al(k) 
         t = x(k) 
         do 100 j = jal(k), jal(k+1)-1
            x(jal(j)) = x(jal(j)) - t*al(j) 
 100     continue
 150  continue
!
      return
!----------end-of-lsolc------------------------------------------------ 
!-----------------------------------------------------------------------
      end
!-----------------------------------------------------------------------  
      subroutine ldsoll (n,x,y,al,jal,nlev,lev,ilev) 
      integer n, nlev, jal(*), ilev(nlev+1), lev(n)
      real*8 x(n), y(n), al(*)
!-----------------------------------------------------------------------
!    Solves L x = y    L = triangular. Uses LEVEL SCHEDULING/MSR format 
!-----------------------------------------------------------------------
!
! On entry:
!---------- 
! n      = integer. dimension of problem.
! y      = real array containg the right hand side.
!
! al,
! jal,   = Lower triangular matrix stored in Modified Sparse Row 
!          format. 
! nlev   = number of levels in matrix
! lev    = integer array of length n, containing the permutation
!          that defines the levels in the level scheduling ordering.
! ilev   = pointer to beginning of levels in lev.
!          the numbers lev(i) to lev(i+1)-1 contain the row numbers
!          that belong to level number i, in the level shcheduling
!          ordering.
!
! On return:
!----------- 
!        x = The solution of  L x = y .
!--------------------------------------------------------------------
      integer ii, jrow, i, k
      real*8 t 
!     
!     outer loop goes through the levels. (SEQUENTIAL loop)
!     
      do 150 ii=1, nlev
!     
!     next loop executes within the same level. PARALLEL loop
!     
         do 100 i=ilev(ii), ilev(ii+1)-1 
            jrow = lev(i)
!
! compute inner product of row jrow with x
! 
            t = y(jrow) 
            do 130 k=jal(jrow), jal(jrow+1)-1 
               t = t - al(k)*x(jal(k))
 130        continue
            x(jrow) = t*al(jrow) 
 100     continue
 150  continue
      return
!-----------------------------------------------------------------------
      end
!-----------------------------------------------------------------------
      subroutine usol (n,x,y,au,jau,iau)
      integer n, jau(*),iau(n+1) 
      real*8  x(n), y(n), au(*) 
!----------------------------------------------------------------------- 
!             Solves   U x = y    U = unit upper triangular. 
!-----------------------------------------------------------------------
! solves a unit upper triangular system by standard (sequential )
! backward elimination - matrix stored in CSR format. 
!-----------------------------------------------------------------------
!
! On entry:
!---------- 
! n      = integer. dimension of problem.
! y      = real array containg the right side.
!
! au,
! jau,
! iau,    = Lower triangular matrix stored in compressed sparse row
!          format. 
!
! On return:
!----------- 
!         x = The solution of  U x = y . 
!-------------------------------------------------------------------- 
! local variables 
!
      integer k, j 
      real*8  t
!-----------------------------------------------------------------------
      x(n) = y(n) 
      do 150 k = n-1,1,-1 
         t = y(k) 
         do 100 j = iau(k), iau(k+1)-1
            t = t - au(j)*x(jau(j))
 100     continue
         x(k) = t 
 150  continue
!
      return
!----------end-of-usol-------------------------------------------------- 
!-----------------------------------------------------------------------
      end
!----------------------------------------------------------------------- 
      subroutine udsol (n,x,y,au,jau) 
      integer n, jau(*) 
      real*8  x(n), y(n),au(*) 
!----------------------------------------------------------------------- 
!             Solves   U x = y  ;   U = upper triangular in MSR format
!-----------------------------------------------------------------------
! solves a non-unit upper triangular matrix by standard (sequential )
! backward elimination - matrix stored in MSR format. 
! with diagonal elements already inverted (otherwise do inversion,
! au(1:n) = 1.0/au(1:n),  before calling).
!-----------------------------------------------------------------------
!
! On entry:
!---------- 
! n      = integer. dimension of problem.
! y      = real array containg the right side.
!
! au,
! jau,    = Lower triangular matrix stored in modified sparse row
!          format. 
!
! On return:
!----------- 
!         x = The solution of  U x = y .
!--------------------------------------------------------------------
! local variables 
!
      integer k, j
      real*8 t
!-----------------------------------------------------------------------
      x(n) = y(n)*au(n)
      do 150 k = n-1,1,-1
         t = y(k) 
         do 100 j = jau(k), jau(k+1)-1
            t = t - au(j)*x(jau(j))
 100     continue
         x(k) = au(k)*t 
 150  continue
!
      return
!----------end-of-udsol-------------------------------------------------
!-----------------------------------------------------------------------
      end
!----------------------------------------------------------------------- 
      subroutine usolc (n,x,y,au,jau,iau)
      real*8  x(*), y(*), au(*) 
      integer n, jau(*),iau(*)
!-----------------------------------------------------------------------
!       SOUVES     U x = y ;    where U = unit upper trang. CSC format
!-----------------------------------------------------------------------
! solves a unit upper triangular system by standard (sequential )
! forward elimination - matrix stored in CSC format. 
!-----------------------------------------------------------------------
!
! On entry:
!---------- 
! n      = integer. dimension of problem.
! y      = real*8 array containg the right side.
!
! au,
! jau,
! iau,    = Uower triangular matrix stored in compressed sparse column 
!          format. 
!
! On return:
!----------- 
!         x  = The solution of  U x  = y.
!-----------------------------------------------------------------------
! local variables 
!     
      integer k, j
      real*8 t
!-----------------------------------------------------------------------
      do 140 k=1,n
         x(k) = y(k) 
 140  continue
      do 150 k = n,1,-1
         t = x(k) 
         do 100 j = iau(k), iau(k+1)-1
            x(jau(j)) = x(jau(j)) - t*au(j) 
 100     continue
 150  continue
!
      return
!----------end-of-usolc------------------------------------------------- 
!-----------------------------------------------------------------------
      end
!-----------------------------------------------------------------------
      subroutine udsolc (n,x,y,au,jau)   
      integer n, jau(*) 
      real*8 x(n), y(n), au(*)  
!-----------------------------------------------------------------------
!    Solves     U x = y ;    U = nonunit Up. Triang. MSC format 
!----------------------------------------------------------------------- 
! solves a (non-unit) upper triangular system by standard (sequential) 
! forward elimination - matrix stored in Modified Sparse Column format 
! with diagonal elements already inverted (otherwise do inversion,
! auuuul(1:n) = 1.0/au(1:n),  before calling ldsol).
!-----------------------------------------------------------------------
!
! On entry:
!---------- 
! n      = integer. dimension of problem.
! y      = real*8 array containg the right hand side.
!
! au,
! jau,   = Upper triangular matrix stored in Modified Sparse Column
!          format.
!
! On return:
!----------- 
!         x = The solution of  U x = y .
!--------------------------------------------------------------------
! local variables 
! 
      integer k, j
      real*8 t
!----------------------------------------------------------------------- 
      do 140 k=1,n
         x(k) = y(k) 
 140  continue
      do 150 k = n,1,-1
         x(k) = x(k)*au(k) 
         t = x(k) 
         do 100 j = jau(k), jau(k+1)-1
            x(jau(j)) = x(jau(j)) - t*au(j) 
 100     continue
 150  continue
!
      return
!----------end-of-udsolc------------------------------------------------ 
!-----------------------------------------------------------------------
      end
!-----------------------------------------------------------------------
        subroutine lusol(n, y, x, alu, jlu, ju)
          implicit none

        integer    :: n, jlu(*), ju(*)
        real*8     :: x(n), y(n), alu(*)
        
!-----------------------------------------------------------------------
        integer    :: i,k
!
! forward solve
!
        do 40 i = 1, n
           x(i) = y(i)
           do 41 k=jlu(i),ju(i)-1
              x(i) = x(i) - alu(k)* x(jlu(k))
 41        continue
 40     continue
        do 90 i = n, 1, -1
           do 91 k=ju(i),jlu(i+1)-1
              x(i) = x(i) - alu(k)*x(jlu(k))
 91        continue
           x(i) = alu(i)*x(i)
 90     continue
!
        return
!----------------end of lusol ------------------------------------------
        end
!-----------------------------------------------------------------------
        subroutine lutsol(n, y, x, alu, jlu, ju) 
          implicit none

        integer     :: n, jlu(*), ju(*)
        real*8      :: x(n), y(n), alu(*)
        
!-----------------------------------------------------------------------
! local variables
!
        integer      :: i,k
!
        do 10 i = 1, n
           x(i) = y(i)
 10     continue
!
! forward solve (with U^T)
!
        do 20 i = 1, n
           x(i) = x(i) * alu(i)
           do 30 k=ju(i),jlu(i+1)-1
              x(jlu(k)) = x(jlu(k)) - alu(k)* x(i)
 30        continue
 20     continue
!     
!     backward solve (with L^T)
!     
        do 40 i = n, 1, -1 
           do 50 k=jlu(i),ju(i)-1
              x(jlu(k)) = x(jlu(k)) - alu(k)*x(i)
 50        continue
 40     continue
!
        return
!----------------end of lutsol -----------------------------------------
!-----------------------------------------------------------------------
         end
!----------------------------------------------------------------------- 
        subroutine qsplit(a,ind,n,ncut)
          implicit none

        integer    :: n, ind(n), ncut
        real*8     :: a(n)
        
!-----------------------------------------------------------------------
!     does a quick-sort split of a real array.
!     on input a(1:n). is a real array
!     on output a(1:n) is permuted such that its elements satisfy:
!
!     abs(a(i)) .ge. abs(a(ncut)) for i .lt. ncut and
!     abs(a(i)) .le. abs(a(ncut)) for i .gt. ncut
!
!     ind(1:n) is an integer array which permuted in the same way as a(*).
!-----------------------------------------------------------------------
        real*8     :: tmp, abskey
        integer    :: itmp, first, last, j, mid
!-----
        first = 1
        last = n
        if (ncut .lt. first .or. ncut .gt. last) return
!
!     outer loop -- while mid .ne. ncut do
!
 1      mid = first
        abskey = abs(a(mid))
        do 2 j=first+1, last
           if (abs(a(j)) .gt. abskey) then
              mid = mid+1
!     interchange
              tmp = a(mid)
              itmp = ind(mid)
              a(mid) = a(j)
              ind(mid) = ind(j)
              a(j)  = tmp
              ind(j) = itmp
           endif
 2      continue
!
!     interchange
!
        tmp = a(mid)
        a(mid) = a(first)
        a(first)  = tmp
!
        itmp = ind(mid)
        ind(mid) = ind(first)
        ind(first) = itmp
!
!     test for while loop
!
        if (mid .eq. ncut) return
        if (mid .gt. ncut) then
           last = mid-1
        else
           first = mid+1
        endif
        goto 1
!----------------end-of-qsplit------------------------------------------
!-----------------------------------------------------------------------
        end
      subroutine runrc(n,rhs,sol,ipar,fpar,wk,guess,a,ja,ia,au,jau,ju,solver)
      implicit none
      integer n,ipar(16),ia(n+1),ja(*),ju(*),jau(*)
      real*8 fpar(16),rhs(n),sol(n),guess(n),wk(*),a(*),au(*)
      external solver
!-----------------------------------------------------------------------
!     the actual tester. It starts the iterative linear system solvers
!     with a initial guess suppied by the user.
!
!     The structure {au, jau, ju} is assumed to have the output from
!     the ILU* routines in ilut.f.
!
!-----------------------------------------------------------------------
!     local variables
!
      integer       :: i, its
!      real          :: dtime, dt(2), time
!     external dtime
      save its
!
!     ipar(2) can be 0, 1, 2, please don't use 3
!
      if (ipar(2).gt.2) then
         WRITE(*,*) 'I can not do both left and right preconditioning.'
         return
      endif

      its = 0
!
      do i = 1, n
         sol(i) = guess(i)
      enddo
!
      ipar(1) = 0
!     time = dtime(dt)

 10   call solver(n,rhs,sol,ipar,fpar,wk)

      if (ipar(7).ne.its) then
         its = ipar(7)
      endif
      if (ipar(1).eq.1) then
         call amux(n, wk(ipar(8)), wk(ipar(9)), a, ja, ia)
         goto 10
      else if (ipar(1).eq.2) then
         call atmux(n, wk(ipar(8)), wk(ipar(9)), a, ja, ia)
         goto 10
      else if (ipar(1).eq.3 .or. ipar(1).eq.5) then
         call lusol(n,wk(ipar(8)),wk(ipar(9)),au,jau,ju)
         goto 10
      else if (ipar(1).eq.4 .or. ipar(1).eq.6) then
         call lutsol(n,wk(ipar(8)),wk(ipar(9)),au,jau,ju)
         goto 10
      else if (ipar(1).le.0) then
         if (ipar(1).eq.0) then
!            WRITE(*,*) 'Iterative sovler has satisfied convergence test.'
         else if (ipar(1).eq.-1) then
             WRITE(*,*) 'Iterative solver has iterated too many times.'
         else if (ipar(1).eq.-2) then
            WRITE(*,*) 'Iterative solver was not given enough work space.'
            WRITE(*,*) 'The work space should at least have ', ipar(4), &
     &           ' elements.'
         else if (ipar(1).eq.-3) then
            WRITE(*,*) 'Iterative sovler is facing a break-down.'
         else
            WRITE(*,*) 'Iterative solver terminated. code =', ipar(1)
         endif
      endif
      end
!-----end-of-runrc
!----------------------------------------------------------------------c
!                          S P A R S K I T                             c
!----------------------------------------------------------------------c
!                   ITERATIVE SOLVERS MODULE                           c
!----------------------------------------------------------------------c
! This Version Dated: August 13, 1996. Warning: meaning of some        c
! ============ arguments have changed w.r.t. earlier versions. Some    c
!              Calling sequences may also have changed                 c
!
      subroutine ilut(n,a,ja,ia,lfil,droptol,alu,jlu,ju,iwk,w,jw,ierr)
!-----------------------------------------------------------------------
      implicit none 
      integer n 
      real*8 a(*),alu(*),w(n+1),droptol
      integer ja(*),ia(n+1),jlu(*),ju(n),jw(2*n),lfil,iwk,ierr
!----------------------------------------------------------------------*
!                      *** ILUT preconditioner ***                     *
!      incomplete LU factorization with dual truncation mechanism      *
!----------------------------------------------------------------------*
!     Author: Yousef Saad *May, 5, 1990, Latest revision, August 1996  *
!----------------------------------------------------------------------*
! PARAMETERS                                                           
!-----------                                                           
!
! on entry:
!========== 
! n       = integer. The row dimension of the matrix A. The matrix 
!
! a,ja,ia = matrix stored in Compressed Sparse Row format.              
!
! lfil    = integer. The fill-in parameter. Each row of L and each row
!           of U will have a maximum of lfil elements (excluding the 
!           diagonal element). lfil must be .ge. 0.
!           ** WARNING: THE MEANING OF LFIL HAS CHANGED WITH RESPECT TO
!           EARLIER VERSIONS. 
!
! droptol = real*8. Sets the threshold for dropping small terms in the
!           factorization. See below for details on dropping strategy.
!
!  
! iwk     = integer. The lengths of arrays alu and jlu. If the arrays
!           are not big enough to store the ILU factorizations, ilut
!           will stop with an error message. 
!
! On return:
!===========
!
! alu,jlu = matrix stored in Modified Sparse Row (MSR) format containing
!           the L and U factors together. The diagonal (stored in
!           alu(1:n) ) is inverted. Each i-th row of the alu,jlu matrix
!           contains the i-th row of L (excluding the diagonal entry=1)
!           followed by the i-th row of U.
!
! ju      = integer array of length n containing the pointers to
!           the beginning of each row of U in the matrix alu,jlu.
!
! ierr    = integer. Error message with the following meaning.
!           ierr  = 0    --> successful return.
!           ierr .gt. 0  --> zero pivot encountered at step number ierr.
!           ierr  = -1   --> Error. input matrix may be wrong.
!                            (The elimination process has generated a
!                            row in L or U whose length is .gt.  n.)
!           ierr  = -2   --> The matrix L overflows the array al.
!           ierr  = -3   --> The matrix U overflows the array alu.
!           ierr  = -4   --> Illegal value for lfil.
!           ierr  = -5   --> zero row encountered.
!
! work arrays:
!=============
! jw      = integer work array of length 2*n.
! w       = real work array of length n+1.
!  
!----------------------------------------------------------------------
! w, ju (1:n) store the working array [1:ii-1 = L-part, ii:n = u] 
! jw(n+1:2n)  stores nonzero indicators
! 
! Notes:
! ------
! The diagonal elements of the input matrix must be  nonzero (at least
! 'structurally'). 
!
!----------------------------------------------------------------------* 
!---- Dual drop strategy works as follows.                             *
!                                                                      *
!     1) Theresholding in L and U as set by droptol. Any element whose *
!        magnitude is less than some tolerance (relative to the abs    *
!        value of diagonal element in u) is dropped.                   *
!                                                                      *
!     2) Keeping only the largest lfil elements in the i-th row of L   * 
!        and the largest lfil elements in the i-th row of U (excluding *
!        diagonal elements).                                           *
!                                                                      *
! Flexibility: one  can use  droptol=0  to get  a strategy  based on   *
! keeping  the largest  elements in  each row  of L  and U.   Taking   *
! droptol .ne.  0 but lfil=n will give  the usual threshold strategy   *
! (however, fill-in is then mpredictible).                             *
!----------------------------------------------------------------------*
!     locals
      integer ju0,k,j1,j2,j,ii,i,lenl,lenu,jj,jrow,jpos,lenn 
      real*8 tnorm, t, abs, s, fact 
      if (lfil .lt. 0) goto 998
!-----------------------------------------------------------------------
!     initialize ju0 (points to next element to be added to alu,jlu)
!     and pointer array.
!-----------------------------------------------------------------------
      ju0 = n+2
      jlu(1) = ju0
!
!     initialize nonzero indicator array. 
!
      do 1 j=1,n
         jw(n+j)  = 0
 1    continue
!-----------------------------------------------------------------------
!     beginning of main loop.
!-----------------------------------------------------------------------
      do 500 ii = 1, n

         j1 = ia(ii)
         j2 = ia(ii+1) - 1

         tnorm = 0.0d0
         do 501 k=j1,j2
            tnorm = tnorm+abs(a(k))
 501     continue
         if (abs(tnorm) .lt. tiny(1.)) goto 999

         tnorm = tnorm/real(j2-j1+1)
!     
!     unpack L-part and U-part of row of A in arrays w 
!     
         lenu = 1
         lenl = 0
         jw(ii) = ii
         w(ii) = 0.0
         jw(n+ii) = ii
!
         do 170  j = j1, j2
            k = ja(j)
            t = a(j)
            if (k .lt. ii) then
               lenl = lenl+1
               jw(lenl) = k
               w(lenl) = t
               jw(n+k) = lenl
            else if (k .eq. ii) then
               w(ii) = t
            else
               lenu = lenu+1
               jpos = ii+lenu-1 
               jw(jpos) = k
               w(jpos) = t
               jw(n+k) = jpos
            endif
 170     continue
         jj = 0
         lenn = 0 
!     
!     eliminate previous rows
!     
 150     jj = jj+1
         if (jj .gt. lenl) goto 160
!-----------------------------------------------------------------------
!     in order to do the elimination in the correct order we must select
!     the smallest column index among jw(k), k=jj+1, ..., lenl.
!-----------------------------------------------------------------------
         jrow = jw(jj)
         k = jj
!     
!     determine smallest column index
!     
         do 151 j=jj+1,lenl
            if (jw(j) .lt. jrow) then
               jrow = jw(j)
               k = j
            endif
 151     continue
!
         if (k .ne. jj) then
!     exchange in jw
            j = jw(jj)
            jw(jj) = jw(k)
            jw(k) = j
!     exchange in jr
            jw(n+jrow) = jj
            jw(n+j) = k
!     exchange in w
            s = w(jj)
            w(jj) = w(k)
            w(k) = s
         endif
!
!     zero out element in row by setting jw(n+jrow) to zero.
!     
         jw(n+jrow) = 0
!
!     get the multiplier for row to be eliminated (jrow).
!     
         fact = w(jj)*alu(jrow)
         if (abs(fact) .le. droptol) goto 150
!     
!     combine current row and row jrow
!
         do 203 k = ju(jrow), jlu(jrow+1)-1
            s = fact*alu(k)
            j = jlu(k)
            jpos = jw(n+j)
            if (j .ge. ii) then
!     
!     dealing with upper part.
!     
               if (jpos .eq. 0) then
!
!     this is a fill-in element
!     
                  lenu = lenu+1
                  if (lenu .gt. n) goto 995
                  i = ii+lenu-1
                  jw(i) = j
                  jw(n+j) = i
                  w(i) = - s
               else
!
!     this is not a fill-in element 
!
                  w(jpos) = w(jpos) - s

               endif
            else
!     
!     dealing  with lower part.
!     
               if (jpos .eq. 0) then
!
!     this is a fill-in element
!     
                  lenl = lenl+1
                  if (lenl .gt. n) goto 995
                  jw(lenl) = j
                  jw(n+j) = lenl
                  w(lenl) = - s
               else
!     
!     this is not a fill-in element 
!     
                  w(jpos) = w(jpos) - s
               endif
            endif
 203     continue
!     
!     store this pivot element -- (from left to right -- no danger of
!     overlap with the working elements in L (pivots). 
!     
         lenn = lenn+1 
         w(lenn) = fact
         jw(lenn)  = jrow
         goto 150
 160     continue
!     
!     reset double-pointer to zero (U-part)
!     
         do 308 k=1, lenu
            jw(n+jw(ii+k-1)) = 0
 308     continue
!     
!     update L-matrix
!     
         lenl = lenn 
         lenn = min0(lenl,lfil)
!     
!     sort by quick-split
!
         call qsplit (w,jw,lenl,lenn)
!
!     store L-part
! 
         do 204 k=1, lenn 
            if (ju0 .gt. iwk) goto 996
            alu(ju0) =  w(k)
            jlu(ju0) =  jw(k)
            ju0 = ju0+1
 204     continue
!     
!     save pointer to beginning of row ii of U
!     
         ju(ii) = ju0
!
!     update U-matrix -- first apply dropping strategy 
!
         lenn = 0
         do k=1, lenu-1
            if (abs(w(ii+k)) .gt. droptol*tnorm) then 
               lenn = lenn+1
               w(ii+lenn) = w(ii+k) 
               jw(ii+lenn) = jw(ii+k) 
            endif
         enddo
         lenu = lenn+1
         lenn = min0(lenu,lfil)
!
         call qsplit (w(ii+1), jw(ii+1), lenu-1,lenn)
!
!     copy
! 
         t = abs(w(ii))
         if (lenn + ju0 .gt. iwk) goto 997
         do 302 k=ii+1,ii+lenn-1 
            jlu(ju0) = jw(k)
            alu(ju0) = w(k)
            t = t + abs(w(k) )
            ju0 = ju0+1
 302     continue
!     
!     store inverse of diagonal element of u
!     
!2do check if it works ... after correction ...
         if (abs(w(ii)) .lt.  tiny(1.d0)) w(ii) = (0.0001d0 + droptol)*tnorm
!     
         alu(ii) = 1.0d0/ w(ii) 
!     
!     update pointer to beginning of next row of U.
!     
         jlu(ii+1) = ju0
!-----------------------------------------------------------------------
!     end main loop
!-----------------------------------------------------------------------
 500  continue
      ierr = 0
      return
!
!     incomprehensible error. Matrix must be wrong.
!     
 995  ierr = -1
      return
!     
!     insufficient storage in L.
!     
 996  ierr = -2
      return
!     
!     insufficient storage in U.
!     
 997  ierr = -3
      return
!     
!     illegal lfil entered.
!     
 998  ierr = -4
      return
!     
!     zero row encountered
!     
 999  ierr = -5
      return
!----------------end-of-ilut--------------------------------------------
!-----------------------------------------------------------------------
      end
!----------------------------------------------------------------------
!       subroutine ilu0(n, a, ja, ia, alu, jlu, ju, iw, ipoint1, ipoint2, ierr)
        subroutine ilu0(n, a, ja, ia, alu, jlu, ju, iw, ierr)

        implicit real*8 (a-h,o-z)
        real*8 a(*), alu(*), tl
        integer n, ju0, ii, jj, i, j, jcol, js, jf, jm, jrow, jw, ierr
        integer ja(*), ia(*), ju(*), jlu(*), iw(n)
!
!-----------------------------------------------------------------------
        ju0 = n+2
        jlu(1) = ju0 !!!
        iw = 0
        do ii = 1, n
           js = ju0
           do j=ia(ii),ia(ii+1)-1
              jcol = ja(j)
              if (jcol .eq. ii) then
                 alu(ii) = a(j)
                 iw(jcol) = ii
                 ju(ii)  = ju0 !!!
              else
                 alu(ju0) = a(j)
                 jlu(ju0) = ja(j)
                 iw(jcol) = ju0
                 ju0 = ju0+1
              endif
           end do
           jlu(ii+1) = ju0 !!!
           jf = ju0-1
           jm = ju(ii)-1
!     exit if diagonal element is reached.
           do j=js, jm
              jrow = jlu(j)
              tl = alu(j)*alu(jrow)
              alu(j) = tl
!     perform  linear combination
              do jj = ju(jrow), jlu(jrow+1)-1
                 jw = iw(jlu(jj))
                 if (jw .ne. 0) then
                   alu(jw) = alu(jw) - tl*alu(jj)
!                   write(*,*) ii, jw, jj
                 end if
              end do
           end do
!     invert  and store diagonal element.
           if (abs(alu(ii)) .lt. tiny(1.)) goto 600
           alu(ii) = 1.0d0/alu(ii)
!     reset pointer iw to zero
           iw(ii) = 0
           do i = js, jf
             iw(jlu(i)) = 0
           end do
        end do

        ierr = 0
        return

!     zero pivot :
 600    ierr = ii
      return
      end
!-----------------------------------------------------------------------
!       subroutine pgmres(n, im, rhs, sol, eps, maxits, ierr)
!        subroutine pgmres(n, im, rhs, sol, eps, maxits, aspar, ierr)
        subroutine pgmres(n, im, rhs, sol, eps, maxits, aspar, nnz, ia, ja, alu, jlu, ju, vv, ierr)
!-----------------------------------------------------------------------
!       use datapool, only : nnz, ia, ja, alu, jlu, ju, vv, aspar!, rhs, sol
       implicit none

       integer :: n, im, maxits, ierr, nnz

       integer :: ja(nnz), ia(n+1)
       integer :: jlu(nnz+1), ju(n)
       real*8  :: vv(n,im+1), alu(nnz+1)
       real*8  :: aspar(nnz)

        real*8  :: rhs(*), sol(*)

       real*8  :: eps
       real*8  :: eps1, epsmac, gam, t, ddot, dnrm2, ro, tl

       integer :: i,i1,j,jj,k,k1,iii,ii,ju0
       integer :: its,jrow,jcol,jf,jm,js,jw

       real*8  :: hh(im+1,im), c(im), s(im), rs(im+1)
       real*8  :: iw(n)

       logical :: lblas = .false.      ! use sparskit matvec and external blas libs (true), don't use them (false)
       logical :: lilu  = .true.      ! use simple ilu preconditioner

       data epsmac/1.d-16/

! ilu0 preconditioner

       if (lilu) then
         ju0 = n+2
         jlu(1) = ju0 !!!
         iw = 0
         do ii = 1, n
            js = ju0
            do j=ia(ii),ia(ii+1)-1
               jcol = ja(j)
               if (jcol .eq. ii) then
                  alu(ii) = aspar(j)
                  iw(jcol) = ii
                  ju(ii)  = ju0 !!!
               else
                  alu(ju0) = aspar(j)
                  jlu(ju0) = ja(j)
                  iw(jcol) = ju0
                  ju0 = ju0+1
               endif
            end do
            jlu(ii+1) = ju0 !!!
            jf = ju0-1
            jm = ju(ii)-1
! exit if diagonal element is reached.
            do j=js, jm
               jrow = jlu(j)
               tl = alu(j)*alu(jrow)
               alu(j) = tl
! perform  linear combination
               do jj = ju(jrow), jlu(jrow+1)-1
                  jw = int(iw(jlu(jj)))
                  if (jw .ne. 0) then
                      alu(jw) = alu(jw) - tl*alu(jj)
!                    write(*,*) ii, jw, jj
                  end if
               end do
            end do
!     invert  and store diagonal element.
            if (abs(alu(ii)) .lt. epsmac) then
              write (*,*) 'zero pivot'
              stop
            end if
            alu(ii) = 1.0d0/alu(ii)
!     reset pointer iw to zero
            iw(ii) = 0
            do i = js, jf
              iw(jlu(i)) = 0
            end do
         end do
! end preconditioner
       end if
!-------------------------------------------------------------
       its = 0
! outer loop starts here..
       if (lblas) then
         call amux (n, sol, vv, aspar, ja, ia)
       else
         do iii = 1, n
           t = 0.0d0
           do k = ia(iii), ia(iii+1)-1
              t = t + aspar(k) * sol(ja(k))
           end do
           vv(iii,1) = t
         end do
       end if
       do j=1,n
          vv(j,1) = rhs(j) - vv(j,1)
       end do
 20    if (lblas) then
         ro = dnrm2(n, vv)
       else
         ro = sqrt(sum(vv(:,1)*vv(:,1)))
       end if
       if (abs(ro) .lt. epsmac) goto 999
       t = 1.0d0 / ro
       do j=1, n
          vv(j,1) = vv(j,1)*t
       end do
       if (its .eq. 0) eps1=eps*ro
!      initialize 1-st term  of rhs of hessenberg system..
       rs(1) = ro
       i = 0
 4     i=i+1
       its = its + 1
       i1 = i + 1
       if (lblas) then
         call lusol (n, vv(1,i), rhs, alu, jlu, ju)
         call amux (n, rhs, vv(1,i1), aspar, ja, ia)
       else
         do iii = 1, n !- lusol
           rhs(iii) = vv(iii,i)
           do k=jlu(iii),ju(iii)-1
             rhs(iii) = rhs(iii) - alu(k)* rhs(jlu(k))
           end do
         end do
         do iii = n, 1, -1
           do k=ju(iii),jlu(iii+1)-1
              rhs(iii) = rhs(iii) - alu(k)*rhs(jlu(k))
           end do
           rhs(iii) = alu(iii)*rhs(iii)
         end do
         do iii = 1, n !- amux
           t = 0.0d0
           do k = ia(iii), ia(iii+1)-1
             t = t + aspar(k) * rhs(ja(k))
           end do
           vv(iii,i1) = t
         end do
       end if
!      modified gram - schmidt...
       if (lblas) then
         do j=1, i
           t = ddot(n, vv(1,j),vv(1,i1))
           hh(j,i) = t
           call daxpy(n, -t, vv(1,j), 1, vv(1,i1), 1)
           t = dnrm2(n, vv(1,i1))
         end do
       else
         do j=1, i
           t = 0.d0
           do iii = 1,n
             t = t + vv(iii,j)*vv(iii,i1)
           end do
           hh(j,i) = t
           vv(:,i1) = vv(:,i1) - t * vv(:,j)
           t = sqrt(sum(vv(:,i1)*vv(:,i1)))
         end do
       end if
       hh(i1,i) = t
       if ( abs(t) .lt. epsmac) goto 58
       t = 1.0d0/t
       do k=1,n
          vv(k,i1) = vv(k,i1)*t
       end do
!     done with modified gram schimd and arnoldi step.. now  update factorization of hh
58     if (i .eq. 1) goto 121
       do k=2,i
          k1 = k-1
          t = hh(k1,i)
          hh(k1,i) = c(k1)*t + s(k1)*hh(k,i)
          hh(k,i) = -s(k1)*t + c(k1)*hh(k,i)
       end do
121    gam = sqrt(hh(i,i)**2 + hh(i1,i)**2)
       if (abs(gam) .lt. epsmac) gam = epsmac
!      get  next plane rotation
       c(i) = hh(i,i)/gam
       s(i) = hh(i1,i)/gam
       rs(i1) = -s(i)*rs(i)
       rs(i) =  c(i)*rs(i)
!      detrermine residual norm and test for convergence-
       hh(i,i) = c(i)*hh(i,i) + s(i)*hh(i1,i)
       ro = abs(rs(i1))
       if (i .lt. im .and. (ro .gt. eps1))  goto 4
!      now compute solution. first solve upper triangular system.
       rs(i) = rs(i)/hh(i,i)
       do ii=2,i
          k=i-ii+1
          k1 = k+1
          t=rs(k)
          do j=k1,i
             t = t-hh(k,j)*rs(j)
          end do
          rs(k) = t/hh(k,k)
       end do
!      form linear combination of v(*,i)'s to get solution
       t = rs(1)
       do k=1, n
          rhs(k) = vv(k,1)*t
       end do
       do j = 2, i
          t = rs(j)
          do k=1, n
             rhs(k) = rhs(k)+t*vv(k,j)
          end do
       end do
!      call preconditioner.
       if (lblas) then
         call lusol (n, rhs, rhs, alu, jlu, ju)
       else
         do iii = 1, n
            do k=jlu(iii),ju(iii)-1
              rhs(iii) = rhs(iii) - alu(k)* rhs(jlu(k))
            end do
         end do
         do iii = n, 1, -1
           do k=ju(iii),jlu(iii+1)-1
              rhs(iii) = rhs(iii) - alu(k)*rhs(jlu(k))
           end do
           rhs(iii) = alu(iii)*rhs(iii)
         end do
       end if
       do k=1, n
          sol(k) = sol(k) + rhs(k)
       end do
!      restart outer loop  when necessary
       if (ro .le. eps1) goto 990
       if (its .ge. maxits) goto 991
!      else compute residual vector and continue..
       do j=1,i
          jj = i1-j+1
          rs(jj-1) = -s(jj-1)*rs(jj)
          rs(jj) = c(jj-1)*rs(jj)
       end do
       do j=1,i1
          t = rs(j)
          if (j .eq. 1)  t = t-1.0d0
          if (lblas) then
            call daxpy (n, t, vv(1,j), 1,  vv, 1)
          else
            vv(:,j) = vv(:,j) + t * vv(:,1)
          end if
       end do
! 199   format('   its =', i4, ' res. norm =', d20.6)
!      restart outer loop.
       goto 20
 990   ierr = 0
       return
 991   ierr = 1
       return
 999   continue
       ierr = -1
       return
!--------------------------------------------------------------------- 
       end
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
!     subroutine from blas1.f90
!-----------------------------------------------------------------------
      DOUBLE PRECISION FUNCTION DNRM2(N,X)
!     .. Scalar Arguments ..
      INTEGER N
!     ..
!     .. Array Arguments ..
      DOUBLE PRECISION X(*)
!     ..
!
!  Purpose
!  =======
!
!  DNRM2 returns the euclidean norm of a vector via the function
!  name, so that
!
!     DNRM2 := sqrt( x'*x )
!
!  Further Details
!  ===============
!
!  -- This version written on 25-October-1982.
!     Modified on 14-October-1993 to inline the call to DLASSQ.
!     Sven Hammarling, Nag Ltd.
!
!  =====================================================================
!
!     .. Parameters ..
      DOUBLE PRECISION ONE,ZERO
      PARAMETER (ONE=1.0D+0,ZERO=0.0D+0)
!     ..
!     .. Local Scalars ..
      DOUBLE PRECISION ABSXI,NORM,SCALE,SSQ
      INTEGER IX
!     ..
!     .. Intrinsic Functions ..
      INTRINSIC ABS,SQRT
!     ..
      IF (N.LT.1 ) THEN
          NORM = ZERO
      ELSE IF (N.EQ.1) THEN
          NORM = ABS(X(1))
      ELSE
          SCALE = ZERO
          SSQ = ONE
!        The following loop is equivalent to this call to the LAPACK
!        auxiliary routine:
!        CALL DLASSQ( N, X, SCALE, SSQ )
!
          DO 10 IX = 1,1 + (N-1)
              IF (X(IX).NE.ZERO) THEN
                  ABSXI = ABS(X(IX))
                  IF (SCALE.LT.ABSXI) THEN
                      SSQ = ONE + SSQ* (SCALE/ABSXI)**2
                      SCALE = ABSXI
                  ELSE
                      SSQ = SSQ + (ABSXI/SCALE)**2
                  END IF
              END IF
   10     CONTINUE
          NORM = SCALE*SQRT(SSQ)
      END IF
!
      DNRM2 = NORM
      RETURN
!
!     End of DNRM2.
!
      END

!-----------------------------------------------------------------------
 SUBROUTINE DLASSQ( N, X, SCALE, SUMSQ )
!
! -- LAPACK auxiliary routine (version 3.1) --
! Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
! November 2006
 INTEGER N
DOUBLE PRECISION SCALE, SUMSQ
DOUBLE PRECISION X( * )
!
! DLASSQ returns the values scl and smsq such that
!
! ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
!
! where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is
! assumed to be non-negative and scl returns the value
!
! scl = max( scale, abs( x( i ) ) ).
!
! SCALE (input/output) DOUBLE PRECISION
! On entry, the value scale in the equation above.
! On exit, SCALE is overwritten with scl , the scaling factor
! for the sum of squares.
        DOUBLE PRECISION ZERO
        PARAMETER ( ZERO = 0.0D+0 )
        INTEGER IX
        DOUBLE PRECISION ABSXI
        INTRINSIC ABS
!
        IF( N.GT.0 ) THEN
        DO IX = 1, 1 + ( N-1 )
          IF( X( IX ).NE.ZERO ) THEN
            ABSXI = ABS( X( IX ) )
            IF( SCALE.LT.ABSXI ) THEN
              SUMSQ = 1 + SUMSQ*( SCALE / ABSXI )**2
              SCALE = ABSXI
            ELSE
         SUMSQ = SUMSQ + ( ABSXI / SCALE )**2
       END IF
          END IF
        END DO
        END IF
        RETURN
        END

!-------------------------------------------------------------------------
      double precision function ddot(n,dx,dy)
!
!     forms the dot product of two vectors.
!     uses unrolled loops for increments equal to one.
!     jack dongarra, linpack, 3/11/78.
!
      double precision dx(*),dy(*),dtemp
      integer i,m,mp1,n
!
      ddot = 0.0d0
      dtemp = 0.0d0
      if(n.le.0)return

   20 m = mod(n,5)
      if( m .eq. 0 ) go to 40
      do 30 i = 1,m
        dtemp = dtemp + dx(i)*dy(i)
   30 continue
      if( n .lt. 5 ) go to 60
   40 mp1 = m + 1
      do 50 i = mp1,n,5
        dtemp = dtemp + dx(i)*dy(i) + dx(i + 1)*dy(i + 1) + &
     &   dx(i + 2)*dy(i + 2) + dx(i + 3)*dy(i + 3) + dx(i + 4)*dy(i + 4)
   50 continue
   60 ddot = dtemp
      return
      end
!----------------------------------------------------------------------
      subroutine daxpy(n,da,dx,incx,dy,incy)
!
!     constant times a vector plus a vector.
!     uses unrolled loops for increments equal to one.
!     jack dongarra, linpack, 3/11/78.
!
      double precision dx(1),dy(1),da
      integer i,incx,incy,ix,iy,m,mp1,n
!
      if(n.le.0)return
      if (abs(da) .lt. tiny(1.d0)) return
      if(incx.eq.1.and.incy.eq.1)go to 20
!
!        code for unequal increments or equal increments
!          not equal to 1
!
      ix = 1
      iy = 1
      if(incx.lt.0)ix = (-n+1)*incx + 1
      if(incy.lt.0)iy = (-n+1)*incy + 1
      do 10 i = 1,n
        dy(iy) = dy(iy) + da*dx(ix)
        ix = ix + incx
        iy = iy + incy
   10 continue
      return
!
!        code for both increments equal to 1
!
!
!        clean-up loop
!
   20 m = mod(n,4)
      if( m .eq. 0 ) go to 40
      do 30 i = 1,m
        dy(i) = dy(i) + da*dx(i)
   30 continue
      if( n .lt. 4 ) return
   40 mp1 = m + 1
      do 50 i = mp1,n,4
        dy(i) = dy(i) + da*dx(i)
        dy(i + 1) = dy(i + 1) + da*dx(i + 1)
        dy(i + 2) = dy(i + 2) + da*dx(i + 2)
        dy(i + 3) = dy(i + 3) + da*dx(i + 3)
   50 continue
      return
      end