/* BARRIER set a barrier; for SPMD versions */ #if defined(MPI) # define BARRIER call mpi_barrier(mpi_comm_hycom,mpierr) #elif defined(SHMEM) # define BARRIER call shmem_barrier_all() #endif /* BARRIER_MP halo synchronization; SHMEM only */ /* BARRIER_NP halo synchronization; SHMEM only */ #if defined(RINGB) #define BARRIER_MP call xctbar(idproc(mproc-1,nproc),idproc(mproc+1,nproc)) #define BARRIER_NP call xctbar(idproc(mproc,nproc-1),idproc(mproc,nproc+1)) #else #define BARRIER_MP BARRIER #define BARRIER_NP BARRIER #endif #if defined(MPI) /* #define MPISR */ /* MTYPE4 mpi type for real*4 */ /* MTYPER mpi type for real */ /* MTYPED mpi type for real*8 */ /* MTYPEI mpi type for integer */ /* MPI_SEND either mpi_send or mpi_ssend */ /* MPI_ISEND either mpi_isend or mpi_issend */ #if defined(NOMPIR8) /* LAM does not support mpi_real[48] */ #if defined(REAL4) # define MTYPE4 mpi_real # define MTYPER mpi_real # define MTYPED mpi_double_precision # define MTYPEI mpi_integer #else /* REAL8 */ # define MTYPE4 mpi_real # define MTYPER mpi_double_precision # define MTYPED mpi_double_precision # define MTYPEI mpi_integer #endif #else /* most MPI's allow mpi_real[48] */ #if defined(REAL4) # define MTYPE4 mpi_real4 # define MTYPER mpi_real4 # define MTYPED mpi_real8 # define MTYPEI mpi_integer #else /* REAL8 */ # define MTYPE4 mpi_real4 # define MTYPER mpi_real8 # define MTYPED mpi_real8 # define MTYPEI mpi_integer #endif #endif #if defined(SSEND) # define MPI_SEND mpi_ssend # define MPI_ISEND mpi_issend #else # define MPI_SEND mpi_send # define MPI_ISEND mpi_isend #endif /* SSEND:else */ #endif /* MPI */ #if defined(SHMEM) /* SHMEM_GET4 get real*4 variables */ /* SHMEM_GETR get real variables */ /* SHMEM_GETD get real*8 variables */ /* SHMEM_GETI get integer variables */ /* SHMEM_MYPE return number of this PE (0...npes-1) */ /* SHMEM_NPES return number of PEs */ #if defined(REAL4) # define SHMEM_GET4 shmem_get32 # define SHMEM_GETR shmem_get32 # define SHMEM_GETD shmem_get64 # define SHMEM_GETI shmem_integer_get #else /* REAL8 */ # define SHMEM_GET4 shmem_get32 # define SHMEM_GETR shmem_get64 # define SHMEM_GETD shmem_get64 # define SHMEM_GETI shmem_integer_get #endif # define SHMEM_MYPE shmem_my_pe # define SHMEM_NPES shmem_n_pes #endif /* SHMEM */ ! !----------------------------------------------------------------------- ! ! auxiliary routines that involve off-processor communication. ! message passing version, contained in module mod_xc. ! ! author: Alan J. Wallcraft, NRL. ! !----------------------------------------------------------------------- ! subroutine xcaget(aa, a, mnflg) implicit none ! real, intent(out) :: aa(itdm,jtdm) real, intent(in) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) integer, intent(in) :: mnflg ! !********** !* ! 1) convert an entire 2-D array from tiled to non-tiled layout. ! ! 3) mnflg selects which nodes must return the array ! = 0; all nodes ! = n; node number n (mnproc=n) ! ! 4) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! aa real output non-tiled target array ! a real input tiled source array ! mnflg integer input node return flag !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif integer i,j,mp,np,mnp #if defined(TIMER) ! if (nxc.eq.0) then call xctmr0( 1) nxc = 1 endif #endif #if defined(RELO) ! if (.not.allocated(al)) then allocate( al(itdm,jdm) ) call mem_stat_add( itdm*jdm ) allocate( at(idm*jdm) ) call mem_stat_add( idm*jdm ) endif #endif ! !
gather each row of tiles onto the first tile in the row. ! #if defined(MPI) if (mproc.eq.mpe_1(nproc)) then do j= 1,jj do i= 1,i0 al(i,j) = vland enddo do i= 1,ii al(i0+i,j) = a(i,j) enddo do i= i0+ii+1,itdm al(i,j) = vland enddo enddo do mp= mpe_1(nproc)+1,mpe_e(nproc) call MPI_RECV(at,ii_pe(mp,nproc)*jj,MTYPER, & idproc(mp,nproc), 9941, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) do j= 1,jj do i= 1,ii_pe(mp,nproc) al(i0_pe(mp,nproc)+i,j) = at(i+(j-1)*ii_pe(mp,nproc)) enddo enddo enddo else !mproc>1 do j= 1,jj do i= 1,ii at(i+(j-1)*ii) = a(i,j) enddo enddo call MPI_SEND(at,ii*jj,MTYPER, & idproc(mpe_1(nproc),nproc), 9941, & mpi_comm_hycom, mpierr) endif #elif defined(SHMEM) do j= 1,jj do i= 1,ii at(i+(j-1)*ii) = a(i,j) enddo enddo BARRIER if (mproc.eq.mpe_1(nproc)) then do j= 1,jj do i= 1,i0 al(i,j) = vland enddo do i= 1,ii al(i0+i,j) = a(i,j) enddo do i= i0+ii+1,itdm al(i,j) = vland enddo enddo do mp= mpe_1(nproc)+1,mpe_e(nproc) call SHMEM_GETR(at, & at,ii_pe(mp,nproc)*jj, idproc(mp,nproc)) do j= 1,jj do i= 1,ii_pe(mp,nproc) al(i0_pe(mp,nproc)+i,j) = at(i+(j-1)*ii_pe(mp,nproc)) enddo enddo enddo endif BARRIER #endif /* MPI:SHMEM */ ! ! gather each row of tiles onto the output array. ! #if defined(MPI) mnp = max(mnflg,1) ! if (mnproc.eq.mnp) then if (mproc.eq.mpe_1(nproc)) then do j= 1,jj do i= 1,itdm aa(i,j+j0) = al(i,j) enddo enddo endif do np= 1,jpr mp = mpe_1(np) if (idproc(mp,np).ne.idproc(mproc,nproc)) then call MPI_RECV(al,itdm*jj_pe(mp,np),MTYPER, & idproc(mp,np), 9942, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) do j= 1,jj_pe(mp,np) do i= 1,itdm aa(i,j+j0_pe(mp,np)) = al(i,j) enddo enddo endif enddo elseif (mproc.eq.mpe_1(nproc)) then call MPI_SEND(al,itdm*jj,MTYPER, & idproc1(mnp), 9942, & mpi_comm_hycom, mpierr) endif ! if (mnflg.eq.0) then call mpi_bcast(aa,itdm*jtdm,MTYPER, & idproc1(1),mpi_comm_hycom,mpierr) endif #elif defined(SHMEM) if (mnflg.eq.0 .or. mnproc.eq.mnflg) then if (mproc.eq.mpe_1(nproc)) then do j= 1,jj do i= 1,itdm aa(i,j+j0) = al(i,j) enddo enddo endif do np= 1,jpr mp = mpe_1(np) if (idproc(mp,np).ne.idproc(mproc,nproc)) then call SHMEM_GETR(al, & al,itdm*jj_pe(mp,np), idproc(mp,np)) do j= 1,jj_pe(mp,np) do i= 1,itdm aa(i,j+j0_pe(mp,np)) = al(i,j) enddo enddo endif enddo endif ! no barrier needed here because of double buffering (at and then al) #endif /* MPI:SHMEM */ #if defined(TIMER) ! if (nxc.eq. 1) then call xctmr1( 1) nxc = 0 endif #endif return end subroutine xcaget subroutine xcaget4(aa, a, mnflg) implicit none ! real*4, intent(out) :: aa(itdm,*) real*4, intent(in) :: a(ii,jj) integer, intent(in) :: mnflg ! !********** !* ! 1) convert an entire 2-D array from tiled to non-tiled layout. ! ! 2) Special version for zaiord and zaiowr only. ! arrays are real*4 and tiled array has no halo. ! ! 3) mnflg selects which nodes must return the array ! =-1; first node in each row returns that row ! = n; node number n (mnproc=n) ! ! 4) The size of aa depends on mnflg ! =-1; aa needed on 1st node in each row, dimension (itdm,jdm) ! = n; aa needed on nth node, dimension (itdm,jtdm) ! ! 5) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! aa real output non-tiled target array ! a real input tiled source array ! mnflg integer input node return flag !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! integer i,j,mp,np,mnp #if defined(TIMER) ! if (nxc.eq.0) then call xctmr0( 1) nxc = 1 endif #endif #if defined(RELO) ! 
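! --- only the lead tile of each row (mproc.eq.mpe_1(nproc)) needs the
! --- whole-row buffers al4 and alt4; every tile allocates the per-tile
! --- buffer at4.  all three are real*4, hence the /2 in the
! --- mem_stat_add word counts below.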
if (.not.allocated(at4)) then if (mproc.eq.mpe_1(nproc)) then allocate( al4(itdm,jdm) ) allocate( alt4(idm*jdm,ipr) ) call mem_stat_add( (itdm*jdm)/2 ) !real*4, so /2 call mem_stat_add( (idm *jdm*jpr)/2 ) !real*4, so /2 endif !first tile in the row allocate( at4(idm*jdm) ) call mem_stat_add( (idm*jdm) /2 ) !real*4, so /2 endif #endif ! ! gather each row of tiles onto the first tile in the row. ! #if defined(MPI) if (mproc.eq.mpe_1(nproc)) then do j= 1,jj do i= 1,i0 al4(i,j) = vland4 enddo do i= 1,ii al4(i0+i,j) = a(i,j) enddo do i= i0+ii+1,itdm al4(i,j) = vland4 enddo enddo do mp= mpe_1(nproc)+1,mpe_e(nproc) call MPI_RECV(at4,ii_pe(mp,nproc)*jj,MTYPE4, & idproc(mp,nproc), 9941, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) do j= 1,jj do i= 1,ii_pe(mp,nproc) al4(i0_pe(mp,nproc)+i,j) = at4(i+(j-1)*ii_pe(mp,nproc)) enddo enddo enddo else !mproc>1 call MPI_SEND(a,ii*jj,MTYPE4, & idproc(mpe_1(nproc),nproc), 9941, & mpi_comm_hycom, mpierr) endif #elif defined(SHMEM) do j= 1,jj do i= 1,ii at4(i+(j-1)*ii) = a(i,j) enddo enddo BARRIER if (mproc.eq.mpe_1(nproc)) then do j= 1,jj do i= 1,i0 al4(i,j) = vland4 enddo do i= 1,ii al4(i0+i,j) = a(i,j) enddo do i= i0+ii+1,itdm al4(i,j) = vland4 enddo enddo do mp= mpe_1(nproc)+1,mpe_e(nproc) call SHMEM_GET4(at4, & at4,ii_pe(mp,nproc)*jj, idproc(mp,nproc)) do j= 1,jj do i= 1,ii_pe(mp,nproc) al4(i0_pe(mp,nproc)+i,j) = at4(i+(j-1)*ii_pe(mp,nproc)) enddo enddo enddo endif BARRIER #endif /* MPI:SHMEM */ if (mnflg.eq.-1) then ! ! we are essentially done. ! if (mproc.eq.mpe_1(nproc)) then do j= 1,jj do i= 1,itdm aa(i,j) = al4(i,j) enddo enddo endif else !mnflg.ne.-1 ! ! gather each row of tiles onto the output array. ! #if defined(MPI) mnp = max(mnflg,1) ! if (mnproc.eq.mnp) then if (mproc.eq.mpe_1(nproc)) then do j= 1,jj do i= 1,itdm aa(i,j+j0) = al4(i,j) enddo enddo endif do np= 1,jpr mp = mpe_1(np) if (idproc(mp,np).ne.idproc(mproc,nproc)) then call MPI_RECV(al4,itdm*jj_pe(mp,np),MTYPE4, & idproc(mp,np), 9942, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) do j= 1,jj_pe(mp,np) do i= 1,itdm aa(i,j+j0_pe(mp,np)) = al4(i,j) enddo enddo endif enddo elseif (mproc.eq.mpe_1(nproc)) then call MPI_SEND(al4,itdm*jj,MTYPE4, & idproc1(mnp), 9942, & mpi_comm_hycom, mpierr) endif #elif defined(SHMEM) if (mnproc.eq.mnflg) then if (mproc.eq.mpe_1(nproc)) then do j= 1,jj do i= 1,itdm aa(i,j+j0) = al4(i,j) enddo enddo endif do np= 1,jpr mp = mpe_1(np) if (idproc(mp,np).ne.idproc(mproc,nproc)) then call SHMEM_GET4(al4, & al4,itdm*jj_pe(mp,np), idproc(mp,np)) do j= 1,jj_pe(mp,np) do i= 1,itdm aa(i,j+j0_pe(mp,np)) = al4(i,j) enddo enddo endif enddo endif ! no barrier needed here because of double buffering (at and then al) #endif /* MPI:SHMEM */ endif !mnflg.eq.-1:else #if defined(TIMER) ! if (nxc.eq. 1) then call xctmr1( 1) nxc = 0 endif #endif return end subroutine xcaget4 subroutine xcaput(aa, a, mnflg) implicit none ! real, intent(inout) :: aa(itdm,jtdm) real, intent(out) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) integer, intent(in) :: mnflg ! !********** !* ! 1) convert an entire 2-D array from non-tiled to tiled layout. ! ! 3) mnflg selects which nodes must contain the non-tiled array ! = 0; all nodes ! = n; node number n (mnproc=n) ! if mnflg.ne.0 the array aa may be broadcast to all nodes, ! so aa must exist and be overwritable on all nodes. ! ! 4) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! aa real input non-tiled source array ! a real output tiled target array ! mnflg integer input node source flag !* !********** ! 
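! --- usage sketch (illustrative only; aglobal and atile stand for
! --- locally declared arrays of the right shapes): a full field can be
! --- gathered, edited on one node, and scattered back by pairing
! --- xcaget with xcaput:
! ---
! ---      call xcaget(aglobal, atile, 1)   !aglobal valid on mnproc=1
! ---      if (mnproc.eq.1) then
! ---        aglobal(1,1) = vland           !edit the non-tiled copy
! ---      endif
! ---      call xcaput(aglobal, atile, 1)   !scatter back to all tiles
! ---
! --- with mnflg=1 the scatter broadcasts aglobal internally, so (as
! --- noted above) aglobal must exist and be overwritable on all nodes.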
#if defined(MPI) include 'mpif.h' integer mpierr ! integer mpireqa(jpr),mpireqb(ipr) #endif ! integer i,j,mp,np,mnp #if defined(TIMER) ! if (nxc.eq.0) then BARRIER call xctmr0( 4) nxc = 4 endif #endif #if defined(RELO) ! if (.not.allocated(al)) then allocate( al(itdm,jdm) ) call mem_stat_add( itdm*jdm ) allocate( at(idm*jdm) ) call mem_stat_add( idm*jdm ) endif #endif ! ! if (mnproc.eq.1) then ! write(lp,'(a,g20.10)') 'xcaput - vland =',vland ! endif ! ! use xclput for now, ! this is slow for mnflg.ne.0, but easy to implement. ! if (mnflg.ne.0) then ! "broadcast" row sections of aa to all processors in the row. if (mnproc.ne.mnflg) then aa(:,:) = vland endif #if defined(MPI) if (mnproc.eq.mnflg) then j = 0 do np= 1,jpr mp = mpe_1(np) if (np.eq.nproc .and. mp.eq.mproc) then al(:,1:jj) = aa(:,j0+1:j0+jj) else j = j + 1 call MPI_ISEND(aa(1,j0_pe(mp,np)+1), & itdm*jj_pe(mp,np),MTYPER, & idproc(mp,np), 9951, & mpi_comm_hycom, mpireqa(j), mpierr) endif enddo call mpi_waitall( j, mpireqa, MPI_STATUSES_IGNORE, mpierr) elseif (mproc.eq.mpe_1(nproc)) then call MPI_RECV(al,itdm*jj,MTYPER, & idproc1(mnflg), 9951, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) endif ! if (mproc.eq.mpe_1(nproc)) then i = 0 do mp= mpe_1(nproc)+1,mpe_e(nproc) i = i + 1 call MPI_ISEND(al,itdm*jj,MTYPER, & idproc(mp,nproc), 9952, & mpi_comm_hycom, mpireqb(i), mpierr) enddo call mpi_waitall( i, mpireqb, MPI_STATUSES_IGNORE, mpierr) else call MPI_RECV(al,itdm*jj,MTYPER, & idproc(mpe_1(nproc),nproc), 9952, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) endif ! aa(:,j0+1:j0+jj) = al(:,1:jj) #elif defined(SHMEM) ! assume aa is in common. BARRIER if (mnproc.ne.mnflg) then do j= 1,jj call SHMEM_GETR(aa(1,j0+j), & aa(1,j0+j),itdm,idproc1(mnflg)) enddo endif BARRIER #endif endif do j= 1,jtdm call xclput(aa(1,j),itdm, a, 1,j,1,0) enddo #if defined(TIMER) ! if (nxc.eq. 4) then call xctmr1( 4) nxc = 0 endif #endif return end subroutine xcaput subroutine xcaput4(aa, a, mnflg) implicit none ! real*4, intent(in) :: aa(itdm,*) real*4, intent(out) :: a(ii,jj) integer, intent(in) :: mnflg ! !********** !* ! 1) convert an entire 2-D array from non-tiled to tiled layout. ! ! 2) Special version for zaiord and zaiowr only. ! arrays are real*4 and tiled array has no halo. ! ! 3) mnflg selects which nodes must contain the non-tiled array ! =-1; first node in each row contains that row ! = n; node number n (mnproc=n) ! the array aa is unchanged on exit, and need not exist ! on the nodes that do not reference it. ! ! 4) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! aa real input non-tiled source array ! a real output tiled target array ! mnflg integer input node source flag !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr ! integer mpireqa(jpr),mpireqb(ipr) #endif ! integer i,msg,j,mp,np,mnp #if defined(TIMER) ! if (nxc.eq.0) then BARRIER call xctmr0( 4) nxc = 4 endif #endif #if defined(RELO) ! if (.not.allocated(at4)) then if (mproc.eq.mpe_1(nproc)) then allocate( al4(itdm,jdm) ) allocate( alt4(idm *jdm,ipr) ) call mem_stat_add( (itdm*jdm)/2 ) !real*4, so /2 call mem_stat_add( (idm *jdm*jpr)/2 ) !real*4, so /2 endif !first tile in the row allocate( at4(idm*jdm) ) call mem_stat_add( (idm*jdm) /2 ) !real*4, so /2 endif #endif ! ! if (mnproc.eq.1) then ! write(lp,'(a,g20.10)') 'xcaput - vland =',vland ! endif ! if (mnflg.gt.0) then #if defined(MPI) ! "broadcast" row sections of aa to first processor in each row. 
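! --- two-stage scatter: the source node (mnflg) first sends each tile
! --- row's section of aa to that row's lead tile (tag 9951), and each
! --- lead tile then forwards the per-tile partial rows to the rest of
! --- its row (tag 9952); the non-blocking sends are completed with
! --- mpi_waitall before the buffers are reused.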
if (mnproc.eq.mnflg) then j = 0 do np= 1,jpr mp = mpe_1(np) if (np.eq.nproc .and. mp.eq.mproc) then al4(:,1:jj) = aa(:,j0+1:j0+jj) else j = j + 1 call MPI_ISEND(aa(1,j0_pe(mp,np)+1), & itdm*jj_pe(mp,np),MTYPE4, & idproc(mp,np), 9951, & mpi_comm_hycom, mpireqa(j), mpierr) endif enddo call mpi_waitall( j, mpireqa, MPI_STATUSES_IGNORE, mpierr) elseif (mproc.eq.mpe_1(nproc)) then call MPI_RECV(al4,itdm*jj,MTYPE4, & idproc1(mnflg), 9951, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) endif ! ! "broadcast" partial row sections to the rest of the row's processors if (mproc.eq.mpe_1(nproc)) then msg = 0 do mp= mpe_1(nproc)+1,mpe_e(nproc) do j= 1,jj do i= 1,ii_pe(mp,nproc) alt4(i+(j-1)*ii_pe(mp,nproc),mp) = & al4(i0_pe(mp,nproc)+i,j) enddo enddo msg = msg + 1 call MPI_ISEND(alt4(1,mp),ii_pe(mp,nproc)*jj,MTYPE4, & idproc(mp,nproc), 9952, & mpi_comm_hycom, mpireqb(msg), mpierr) enddo !mp ! do j= 1,jj do i= 1,ii a(i,j) = al4(i0+i,j) enddo enddo ! call mpi_waitall( msg, mpireqb, MPI_STATUSES_IGNORE, mpierr) else call MPI_RECV(at4,ii*jj,MTYPE4, & idproc(mpe_1(nproc),nproc), 9952, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) do j= 1,jj do i= 1,ii a(i,j) = at4(i+(j-1)*ii) enddo enddo endif #elif defined(SHMEM) ! assume aa is in common. BARRIER ! "broadcast" row sections of aa to first processor in each row. if (mproc.eq.mpe_1(nproc)) then call SHMEM_GET4(al4(1, 1), & aa(1,j0+1),itdm*jj,idproc1(mnflg)) do mp= mpe_1(nproc)+1,mpe_e(nproc) do j= 1,jj do i= 1,ii alt4(i+(j-1)*ii,mp) = al4(i0+i,j) enddo enddo enddo !mp endif BARRIER ! "broadcast" partial row sections to the rest of the row's processors if (mproc.eq.mpe_1(nproc)) then do j= 1,jj do i= 1,ii a(i,j) = al4(i0+i,j) enddo enddo else call SHMEM_GET4( at4(1, 1), & alt4(1,mproc),ii*jj,idproc1(mpe_1(nproc))) do j= 1,jj do i= 1,ii a(i,j) = at4(i+(j-1)*ii) enddo enddo endif BARRIER #endif elseif (mnflg.eq.-1) then #if defined(MPI) ! "broadcast" partial row sections of aa to all processors in the row. if (mproc.eq.mpe_1(nproc)) then msg = 0 do mp= mpe_1(nproc)+1,mpe_e(nproc) do j= 1,jj do i= 1,ii_pe(mp,nproc) alt4(i+(j-1)*ii_pe(mp,nproc),mp) = & aa(i0_pe(mp,nproc)+i,j) enddo enddo msg = msg + 1 call MPI_ISEND(alt4(1,mp),ii_pe(mp,nproc)*jj,MTYPE4, & idproc(mp,nproc), 9953, & mpi_comm_hycom, mpireqb(msg), mpierr) enddo !mp ! do j= 1,jj do i= 1,ii a(i,j) = aa(i0+i,j) enddo enddo ! call mpi_waitall( msg, mpireqb, MPI_STATUSES_IGNORE, mpierr) else call MPI_RECV(at4,ii*jj,MTYPE4, & idproc(mpe_1(nproc),nproc), 9953, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) do j= 1,jj do i= 1,ii a(i,j) = at4(i+(j-1)*ii) enddo enddo endif #elif defined(SHMEM) ! "broadcast" partial row sections to the rest of the row's processors if (mproc.eq.mpe_1(nproc)) then do mp= mpe_1(nproc)+1,mpe_e(nproc) do j= 1,jj do i= 1,ii_pe(mp,nproc) alt4(i+(j-1)*ii_pe(mp,nproc),mp) = & aa(i0_pe(mp,nproc)+i,j) enddo enddo enddo !mp endif BARRIER if (mproc.eq.mpe_1(nproc)) then do j= 1,jj do i= 1,ii a(i,j) = aa(i0+i,j) enddo enddo else call SHMEM_GET4( at4(1, 1), & alt4(1,mproc),ii*jj,idproc1(mpe_1(nproc))) do j= 1,jj do i= 1,ii a(i,j) = at4(i+(j-1)*ii) enddo enddo endif BARRIER #endif endif !mnflg.gt.0:mnflg.eq.-1 #if defined(TIMER) ! if (nxc.eq. 4) then call xctmr1( 4) nxc = 0 endif #endif return end subroutine xcaput4 subroutine xcastr(a, mnflg) implicit none ! real, intent(inout) :: a(:) integer, intent(in) :: mnflg ! !********** !* ! 1) broadcast array a to all tiles. ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! 
a real in/out target array ! mnflg integer input node originator flag !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! integer i,is0,isl,mn,n,nn #if defined(TIMER) ! if (nxc.eq.0) then call xctmr0( 9) nxc = 9 endif #endif ! ! stripmine a. ! n = size(a) ! do is0= 0,n-1,nmax isl = min(is0+nmax,n) nn = isl - is0 if (mnproc.eq.mnflg) then do i= 1,nn b(i) = a(is0+i) enddo endif #if defined(MPI) call mpi_bcast(b,nn,MTYPER, & idproc1(mnflg), & mpi_comm_hycom,mpierr) #elif defined(SHMEM) BARRIER if (mnproc.ne.mnflg) then ! get from source processor call SHMEM_GETR(b,b,nn, idproc1(mnflg)) endif BARRIER #endif if (mnproc.ne.mnflg) then do i= 1,nn a(is0+i) = b(i) enddo endif enddo ! stripmine loop #if defined(TIMER) ! if (nxc.eq. 9) then call xctmr1( 9) nxc = 0 endif #endif return end subroutine xcastr subroutine xceget(aelem, a, ia,ja, mnflg) implicit none ! real, intent(out) :: aelem real, intent(in) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) integer, intent(in) :: ia,ja integer, intent(in), optional :: mnflg ! !********** !* ! 1) find the value of a(ia,ja) on the non-tiled 2-D grid. ! ! 2) mnflg selects which nodes must return the line ! = 0; all nodes (default) ! = n; node number n (mnproc=n) ! note that aelem might, or might not, be returned on other nodes ! ! 3) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! aelem real output required element ! a real input tiled source array ! ia integer input 1st non-tiled index into a ! ja integer input 2nd non-tiled index into a ! mnflg integer input node return flag ! ! 4) the global variable vland is returned when outside active tiles. !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! ! double buffer to reduce the number of required barriers. ! real, save, dimension(0:1) :: elem integer, save :: kdb = 0 ! integer i,j,lmnflg,mp,np #if defined(TIMER) ! ! if (nxc.eq.0) then ! call xctmr0( 2) ! nxc = 2 ! endif #endif ! if (present(mnflg)) then lmnflg = mnflg else lmnflg = 0 ! default endif ! kdb = mod(kdb+1,2) ! the least recently used of the two buffers ! ! find the host tile. ! np = npe_j(max(0,min(jtdm+1,ja)) ) mp = mpe_i(max(0,min(itdm+1,ia)),np) ! if (mp.le.0) then ! ! no tile. ! elem(kdb) = vland elseif (mp.eq.mproc .and. np.eq.nproc) then ! ! this tile. ! i = ia - i0 j = ja - j0 ! elem(kdb) = a(i,j) #if defined(MPI) if (lmnflg.eq.0) then call mpi_bcast(elem(kdb),1,MTYPER, & idproc(mp,np),mpi_comm_hycom,mpierr) elseif (lmnflg.ne.mnproc) then call MPI_SEND(elem(kdb),1,MTYPER, & idproc1(lmnflg), 9932, & mpi_comm_hycom, mpierr) endif !lmnflg #elif defined(SHMEM) BARRIER #endif else ! ! another tile. ! #if defined(MPI) if (lmnflg.eq.0) then call mpi_bcast(elem(kdb),1,MTYPER, & idproc(mp,np),mpi_comm_hycom,mpierr) elseif (lmnflg.eq.mnproc) then call MPI_RECV( elem(kdb),1,MTYPER, & idproc(mp,np), 9932, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) endif !lmnflg #elif defined(SHMEM) BARRIER call SHMEM_GETR(elem(kdb), & elem(kdb),1,idproc(mp,np)) ! no barrier needed here because of double buffering #endif endif aelem = elem(kdb) #if defined(TIMER) ! ! if (nxc.eq. 2) then ! call xctmr1( 2) ! nxc = 0 ! endif #endif return end subroutine xceget subroutine xceput(aelem, a, ia,ja, mnflg) implicit none ! integer, intent(in), optional :: mnflg integer, intent(in) :: ia,ja real, intent(inout) :: aelem real, intent(inout) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) ! !********** !* ! 1) fill a single element in the non-tiled 2-D grid. ! ! 
2) mnflg selects which nodes hold the element on entry ! = 0; all nodes (default) ! = n; node number n (mnproc=n) ! all nodes hold the element on exit ! ! 3) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! aelem real in/out element value ! a real in/out tiled target array ! ia integer input 1st non-tiled index into a ! ja integer input 2nd non-tiled index into a ! mnflg integer input node source flag !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! real, save :: a1(1) #if defined(TIMER) ! ! if (nxc.eq.0) then ! call xctmr0( 5) ! nxc = 5 ! endif #endif ! if (present(mnflg)) then if (mnflg.ne.0) then if (mnproc.eq.mnflg) then a1(1) = aelem call xcastr(a1, mnflg) !broadcast the input element else call xcastr(a1, mnflg) !broadcast the input element aelem = a1(1) endif !mnproc endif !mnflg>0 endif !present ! if (i0.lt.ia .and. ia.le.i0+ii .and. & j0.lt.ja .and. ja.le.j0+jj ) then ! ! this tile. ! a(ia-i0,ja-j0) = aelem endif #if defined(TIMER) ! ! if (nxc.eq. 5) then ! call xctmr1( 5) ! nxc = 0 ! endif #endif return end subroutine xceput subroutine xcgetc(iline) implicit none ! integer, intent(inout) :: iline(81) ! !********** !* ! 1) machine specific routine for broadcasting iline. ! ! 2) only use in zagetc (hence the name). !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! ! broadcast to all processors ! #if defined(MPI) call mpi_bcast(iline,81,MTYPEI, & idproc1(1),mpi_comm_hycom,mpierr) #elif defined(SHMEM) BARRIER if (mnproc.ne.1) then call SHMEM_GETI(iline, & iline,81, idproc1(1)) endif ! no barrier needed here because zagetc is using two buffers #endif return end subroutine xcgetc subroutine xcgput4(aa, ga, mnflg) implicit none ! real*4, intent(in) :: aa(itdm,*) real*4, intent(out) :: ga(itdm,jtdm) integer, intent(in) :: mnflg ! !********** !* ! 1) broadcast an entire 2-D non-tiled array ! ! 2) For zaiordg only. ! arrays are real*4 with no halo. ! ! 3) mnflg selects which nodes must contain the input non-tiled array ! =-1; first node in each row contains that row ! = n; node number n (mnproc=n) ! the array aa is unchanged on exit, and need not exist ! on the nodes that do not reference it. ! ! 4) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! aa real*4 input non-tiled source array ! ga real*4 output non-tiled target array ! mnflg integer input node source flag !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! integer i,j,np #if defined(TIMER) ! if (nxc.eq.0) then BARRIER call xctmr0( 4) nxc = 4 endif #endif ! if (mnflg.gt.0) then #if defined(MPI) if (mnproc.eq.mnflg) then do j= 1,jtdm do i= 1,itdm ga(i,j) = aa(i,j) enddo enddo endif call mpi_bcast(ga,itdm*jtdm,MTYPE4, & idproc1(mnflg), & mpi_comm_hycom,mpierr) #elif defined(SHMEM) call xcstop('xcgput4: shmem not implemented') stop '(xcgput4)' #endif elseif (mnflg.eq.-1) then #if defined(MPI) if (mproc.eq.mpe_1(nproc)) then do j= 1,jj do i= 1,itdm ga(i,j0+j) = aa(i,j) enddo enddo endif do np= 1,jpr call mpi_bcast(ga(1,j0_pe(mpe_1(np),np)+1), & itdm*jj_pe(mpe_1(np),np),MTYPE4, & idproc(mpe_1(np),np), & mpi_comm_hycom,mpierr) enddo #elif defined(SHMEM) call xcstop('xcgput4: shmem not implemented') stop '(xcgput4)' #endif endif !mnflg.gt.0:mnflg.eq.-1 ! #if defined(TIMER) if (nxc.eq. 4) then call xctmr1( 4) nxc = 0 endif #endif return end subroutine xcgput4 subroutine xchalt(cerror) implicit none ! character*(*), intent(in) :: cerror ! !********** !* ! 
1) stop all processes. ! ! 2) only one processes need call this routine, i.e. it is for ! emergency stops. use 'xcstop' for ordinary stops called ! by all processes. ! ! 3) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! cerror char*(*) input error message !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! ! message passing version. ! if (cerror.ne.' ') then write(lp,*) '**************************************************' write(lp,*) cerror endif write(lp,*) '**************************************************' write(lp,*) 'XCHALT CALLED ON PROC = ',mnproc,mproc,nproc write(lp,*) '**************************************************' call flush(lp) ! #if defined(MPI) call mpi_abort(mpi_comm_hycom,9) #else call abort() #endif stop '(xchalt)' end subroutine xchalt subroutine xciget(alist,nl, a, ia,ja, mnflg) implicit none ! integer, intent(in) :: nl real, intent(out) :: alist(nl) real, intent(in) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) integer, intent(in) :: ia(nl),ja(nl) integer, intent(in), optional :: mnflg ! !********** !* ! 1) find the value of a(ia(:),ja(:)) on the non-tiled 2-D grid. ! ! 2) mnflg selects which nodes must return the line ! = 0; all nodes (default) ! = n; node number n (mnproc=n) ! note that alist might, or might not, be returned on other nodes ! ! 3) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! alist real output required elements ! nl integer input dimension of alist ! a real input tiled source array ! ia integer input 1st non-tiled indexes into a ! ja integer input 2nd non-tiled indexes into a ! mnflg integer input node return flag ! ! 4) the global variable vland is returned when outside active tiles. !* !********** ! integer i,lmnflg,nl_sm ! if (present(mnflg)) then lmnflg = mnflg else lmnflg = 0 ! default endif ! ! stripmine alist in itdm+jtdm chunks ! do i= 1,nl,itdm+jtdm nl_sm = min( itdm+jtdm, nl+1-i ) call xciget_sm(alist(i),nl_sm, a, ia(i),ja(i), lmnflg) enddo !i return end subroutine xciget subroutine xciget_sm(alist,nl, a, ia,ja, mnflg) implicit none ! integer, intent(in) :: nl real, intent(out) :: alist(nl) real, intent(in) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) integer, intent(in) :: ia(nl),ja(nl) integer, intent(in) :: mnflg ! !********** !* ! 1) find the value of a(ia(:),ja(:)) on the non-tiled 2-D grid. ! version for nl<=itdm+jtdm, always invoke via xciget ! ! 2) mnflg selects which nodes must return the line ! = 0; all nodes (default) ! = n; node number n (mnproc=n) ! note that alist might, or might not, be returned on other nodes ! ! 3) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! alist real output required elements ! nl integer input dimension of alist ! a real input tiled source array ! ia integer input 1st non-tiled indexes into a ! ja integer input 2nd non-tiled indexes into a ! mnflg integer input node return flag ! ! 4) the global variable vland is returned when outside active tiles. !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! ! double buffer to remove the need for barriers. ! pad alX to guard against false sharing. ! #if defined(RELO) real, save, allocatable, dimension(:,:) :: & als,alr #else real, save, dimension(-47:itdm+jtdm+48,0:1) :: & als,alr #endif integer, save, dimension(0:ijqr-1,0:1) :: nlp,nxp ! integer, save :: kdb = 0 integer :: i,np,mp,mnp,nxpa #if defined(RELO) ! 
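! --- every task executes the same nlp/nxp bookkeeping below, so the
! --- counts and displacements handed to mpi_gatherv are globally
! --- consistent; for mnflg=0 the list is gathered onto the 1st task
! --- and then broadcast, otherwise it is gathered onto task mnflg only.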
if (.not.allocated(als)) then allocate( als(-47:itdm+jtdm+48,0:1), & alr(-47:itdm+jtdm+48,0:1) ) call mem_stat_add( 2*(itdm+jtdm+96)*2 ) endif #endif ! kdb = mod(kdb+1,2) ! the least recently used of the two buffers ! ! --- calculate the gatherv sizes and offsets on all tiles ! nlp(0:ijqr-1,kdb) = 0 do i= 1,nl np = npe_j(max(0,min(jtdm+1,ja(i))) ) mp = mpe_i(max(0,min(itdm+1,ia(i))),np) if (mp.gt.0) then mnp = idproc(mp,np) nlp(mnp,kdb) = nlp(mnp,kdb) + 1 if (mp.eq.mproc .and. np.eq.nproc) then ! --- update the local send buffer als(nlp(mnp,kdb),kdb) = a(ia(i)-i0,ja(i)-j0) endif endif !mp enddo !i ! nxp(0,kdb) = 0 do i= 1,ijqr-1 nxp(i,kdb) = nxp(i-1,kdb) + nlp(i-1,kdb) enddo nxpa = nxp(ijqr-1,kdb) + nlp(ijqr-1,kdb) ! #if defined(MPI) if (mnflg.eq.0) then ! --- gatherv onto the 1st mpi task, then broadcast the result mnp = idproc(mproc,nproc) call mpi_gatherv(als(1,kdb),nlp(mnp,kdb),MTYPER, & alr(1,kdb),nlp(0,kdb),nxp(0,kdb),MTYPER, & idproc1(1), & mpi_comm_hycom, mpierr) call mpi_bcast( alr(1,kdb),nxpa,MTYPER, & idproc1(1), & mpi_comm_hycom, mpierr) else mnp = idproc(mproc,nproc) call mpi_gatherv(als(1,kdb),nlp(mnp,kdb),MTYPER, & alr(1,kdb),nlp(0,kdb),nxp(0,kdb),MTYPER, & idproc1(mnflg), & mpi_comm_hycom, mpierr) endif !mnflg #elif defined(SHMEM) call xcstop('xciget: shmem not implemented') stop '(xciget)' #endif ! if (mnflg.eq.0 .or. mnflg.eq.mnproc) then ! --- repeat the gatherv size logic to unwind the recv buffer nlp(0:ijqr-1,kdb) = 0 do i= 1,nl np = npe_j(max(0,min(jtdm+1,ja(i))) ) mp = mpe_i(max(0,min(itdm+1,ia(i))),np) if (mp.le.0) then alist(i) = vland else mnp = idproc(mp,np) nlp(mnp,kdb) = nlp(mnp,kdb) + 1 alist(i) = alr(nlp(mnp,kdb)+nxp(mnp,kdb),kdb) endif !mp enddo !i endif !mnflg return end subroutine xciget_sm subroutine xciput(alist,nl, a, ia,ja, mnflg) implicit none ! integer, intent(in) :: nl integer, intent(in), optional :: mnflg integer, intent(in) :: ia(nl),ja(nl) real, intent(inout) :: alist(nl) real, intent(inout) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) ! !********** !* ! 1) fill a list of elements in the non-tiled 2-D grid. ! also updates the halo. ! ! 2) mnflg selects which nodes hold the elements on entry ! = 0; all nodes (default) ! = n; node number n (mnproc=n) ! all nodes hold the elements on exit ! ! 3) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! alist real in/out element values ! nl integer input dimension of alist ! a real in/out tiled target array ! ia integer input 1st non-tiled indexes into a ! ja integer input 2nd non-tiled indexes into a ! mnflg integer input node source flag !* !********** ! integer i,iai,jai #if defined(TIMER) ! ! if (nxc.eq.0) then ! call xctmr0( 5) ! nxc = 5 ! endif #endif ! if (present(mnflg)) then if (mnflg.ne.0) then call xcastr(alist, mnflg) !broadcast the input list endif endif ! do i= 1,nl iai = ia(i)-i0 jai = ja(i)-j0 if (iai.ge.1-nbdy .and. iai.le.ii+nbdy .and. & jai.ge.1-nbdy .and. jai.le.jj+nbdy ) then ! ! on this tile with halo. ! a(iai,jai) = alist(i) endif !on tile ! if (nreg.ne.0 .and. ia(i).ge.itdm-nbdy+1) then ! periodic iai = ia(i)-itdm-i0 if (iai.ge.1-nbdy .and. iai.le.ii+nbdy .and. & jai.ge.1-nbdy .and. jai.le.jj+nbdy ) then a(iai,jai) = alist(i) endif !on tile endif !periodic, large ia(i) if (nreg.ne.0 .and. ia(i).le.nbdy) then ! periodic iai = ia(i)+itdm-i0 if (iai.ge.1-nbdy .and. iai.le.ii+nbdy .and. & jai.ge.1-nbdy .and. jai.le.jj+nbdy ) then a(iai,jai) = alist(i) endif !on tile endif !periodic, small ia(i) enddo !i #if defined(TIMER) ! ! if (nxc.eq. 
5) then ! call xctmr1( 5) ! nxc = 0 ! endif #endif return end subroutine xciput subroutine xclget(aline,nl, a, i1,j1,iinc,jinc, mnflg) implicit none ! integer, intent(in) :: nl,i1,j1,iinc,jinc,mnflg real, intent(out) :: aline(nl) real, intent(in) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) ! !********** !* ! 1) extract a line of elements from the non-tiled 2-D grid. ! ! 2) aline(i) == aa(i1+iinc*(i-1),j1+jinc*(i-1)), for i=1...nl. ! where aa is the non-tiled representation of a, and ! iinc and jinc can each be -1, 0, or +1. ! ! 3) mnflg selects which nodes must return the line ! = 0; all nodes ! = n; node number n (mnproc=n) ! ! 4) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! aline real output required line of elements ! nl integer input dimension of aline ! a real input tiled source array ! i1 integer input 1st non-tiled index into a ! j1 integer input 2nd non-tiled index into a ! iinc integer input 1st index increment ! jinc integer input 2nd index increment ! mnflg integer input node return flag ! ! 5) the global variable vland is returned when outside active tiles. !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! ! pad al to guard against false sharing. ! double buffer to reduce the number of required barriers. ! #if defined(RELO) real, save, allocatable, dimension (:,:) :: al #else real, save, dimension(-47:itdm+jtdm+48,0:1) :: al #endif ! integer, save :: kdb = 0 integer, allocatable :: ia(:),ja(:) ! integer i1n,iif,iil,j1n,jjf,jjl,l,lx0,nxl,mp,np #if defined(TIMER) ! if (nxc.eq.0) then call xctmr0( 3) nxc = 3 endif #endif #if defined(RELO) ! if (.not.allocated(al)) then allocate( al(-47:itdm+jtdm+48,0:1) ) call mem_stat_add( (itdm+jtdm+96)*2 ) endif #endif ! kdb = mod(kdb+1,2) ! the least recently used of the two buffers ! if ((jinc.eq.0 .and. iinc.eq.1) .or. nl.eq.1) then ! ! --- horizontal forward line. ! al(1:nl,kdb) = vland ! np = npe_j(j1) do mp= mpe_1(np),mpe_e(np) iif = max(i1, i0_pe(mp,np)+1) iil = min(i1+nl-1,i0_pe(mp,np)+ii_pe(mp,np)) lx0 = iif - i1 nxl = iil - iif + 1 if (nxl.le.0) then cycle ! no elements from tile (mp,np) endif ! if (mp.eq.mproc .and. np.eq.nproc) then ! ! this tile. ! do l= lx0+1,lx0+nxl al(l,kdb) = a(i1+l-1-i0,j1-j0) enddo #if defined(MPI) if (mnflg.eq.0) then call mpi_bcast(al(lx0+1,kdb),nxl,MTYPER, & idproc(mp,np),mpi_comm_hycom,mpierr) elseif (mnflg.ne.mnproc) then call MPI_SEND(al(lx0+1,kdb),nxl,MTYPER, & idproc1(mnflg), 9931, & mpi_comm_hycom, mpierr) endif else ! ! another tile. ! if (mnflg.eq.0) then call mpi_bcast(al(lx0+1,kdb),nxl,MTYPER, & idproc(mp,np),mpi_comm_hycom,mpierr) elseif (mnflg.eq.mnproc) then call MPI_RECV(al(lx0+1,kdb),nxl,MTYPER, & idproc(mp,np), 9931, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) endif #endif /* MPI */ endif ! if (lx0+nxl.eq.nl) then exit endif enddo ! np=1,jpr #if defined(SHMEM) ! ! spliting process into two phases saves on barriers. ! BARRIER do mp= mpe_1(np),mpe_e(np) iif = max(i1, i0_pe(mp,np)+1) iil = min(i1+nl-1,i0_pe(mp,np)+ii_pe(mp,np)) lx0 = iif - i1 nxl = iil - iif + 1 if (nxl.le.0) then cycle ! no elements from tile (mp,np) endif ! if (mp.eq.mproc .and. np.eq.nproc) then ! ! nothing to do here (see 1st phase, above). ! else ! ! another tile. ! if (mnflg.eq.0 .or. mnflg.eq.mnproc) then call SHMEM_GETR(al(lx0+1,kdb), & al(lx0+1,kdb),nxl,idproc(mp,np)) endif endif ! if (lx0+nxl.eq.nl) then exit endif enddo ! np=1,jpr ! no barrier needed here because of double buffering #endif /* SHMEM */ ! if (mnflg.eq.0 .or. 
mnflg.eq.mnproc) then aline(1:nl) = al(1:nl,kdb) endif elseif (iinc.eq.0 .and. jinc.eq.1) then ! ! --- vertical forward line. ! al(1:nl,kdb) = vland ! do np= 1,jpr mp = mpe_i(i1,np) if (mp.le.0) then cycle ! an all-land column endif jjf = max(j1, j0_pe(mp,np)+1) jjl = min(j1+nl-1,j0_pe(mp,np)+jj_pe(mp,np)) lx0 = jjf - j1 nxl = jjl - jjf + 1 if (nxl.le.0) then cycle ! no elements from tile (mp,np) endif ! if (mp.eq.mproc .and. np.eq.nproc) then ! ! this tile. ! do l= lx0+1,lx0+nxl al(l,kdb) = a(i1-i0,j1+l-1-j0) enddo #if defined(MPI) if (mnflg.eq.0) then call mpi_bcast(al(lx0+1,kdb),nxl,MTYPER, & idproc(mp,np),mpi_comm_hycom,mpierr) elseif (mnflg.ne.mnproc) then call MPI_SEND(al(lx0+1,kdb),nxl,MTYPER, & idproc1(mnflg), 9931, & mpi_comm_hycom, mpierr) endif else ! ! another tile. ! if (mnflg.eq.0) then call mpi_bcast(al(lx0+1,kdb),nxl,MTYPER, & idproc(mp,np),mpi_comm_hycom,mpierr) elseif (mnflg.eq.mnproc) then call MPI_RECV(al(lx0+1,kdb),nxl,MTYPER, & idproc(mp,np), 9931, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) endif #endif /* MPI */ endif ! if (lx0+nxl.eq.nl) then exit endif enddo ! np=1,jpr #if defined(SHMEM) ! ! spliting process into two phases saves on barriers. ! BARRIER do np= 1,jpr mp = mpe_i(i1,np) if (mp.le.0) then cycle ! an all-land column endif jjf = max(j1, j0_pe(mp,np)+1) jjl = min(j1+nl-1,j0_pe(mp,np)+jj_pe(mp,np)) lx0 = jjf - j1 nxl = jjl - jjf + 1 if (nxl.le.0) then cycle ! no elements from tile (mp,np) endif ! if (mp.eq.mproc .and. np.eq.nproc) then ! ! nothing to do here (see 1st phase, above). ! else ! ! another tile. ! if (mnflg.eq.0 .or. mnflg.eq.mnproc) then call SHMEM_GETR(al(lx0+1,kdb), & al(lx0+1,kdb),nxl,idproc(mp,np)) endif endif ! if (lx0+nxl.eq.nl) then exit endif enddo ! np=1,jpr ! no barrier needed here because of double buffering #endif /* SHMEM */ ! if (mnflg.eq.0 .or. mnflg.eq.mnproc) then aline(1:nl) = al(1:nl,kdb) endif elseif (nl.eq.1) then ! ! diagonal and reversing lines, single element ! call xceget(aline(1),a,i1,j1, mnflg) else ! ! diagonal and reversing lines - call xciget. ! allocate(ia(nl),ja(nl)) do l= 1,nl ia(l) = i1+iinc*(l-1) ja(l) = j1+jinc*(l-1) enddo !l call xciget(aline,nl, a, ia,ja, mnflg) deallocate(ia,ja) endif #if defined(TIMER) ! if (nxc.eq. 3) then call xctmr1( 3) nxc = 0 endif #endif return end subroutine xclget subroutine xclput(aline,nl, a, i1,j1,iinc,jinc, mnflg) implicit none ! integer, intent(in), optional :: mnflg integer, intent(in) :: nl,i1,j1,iinc,jinc real, intent(inout) :: aline(nl) real, intent(inout) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) ! !********** !* ! 1) fill a line of elements in the non-tiled 2-D grid. ! ! 2) aline(i) == aa(i1+i1*(i-1),j1+j1*(i-1)), for i=1...nl. ! where aa is the non-tiled representation of a, and ! one of iinc and jinc must be 0, and the other must be 1. ! also updates the halo. ! ! 3) mnflg selects which nodes hold the line on entry ! = 0; all nodes (default) ! = n; node number n (mnproc=n) ! all nodes hold the line on exit ! ! 4) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! aline real in/out line of element values ! nl integer input dimension of aline ! a real in/out tiled target array ! i1 integer input 1st non-tiled index into a ! j1 integer input 2nd non-tiled index into a ! iinc integer input 1st index increment ! jinc integer input 2nd index increment ! mnflg integer input node source flag !* !********** ! integer i,j #if defined(TIMER) ! if (nxc.eq.0) then call xctmr0( 5) nxc = 5 endif #endif ! 
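! --- aline(i) corresponds to the non-tiled point
! ---    aa(i1+iinc*(i-1),j1+jinc*(i-1)),  i = 1...nl,
! --- with exactly one of iinc,jinc equal to 1 and the other 0; on
! --- periodic domains (nreg.ne.0) the copy below also fills the
! --- wrap-around halo near i=1 and i=itdm.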
if (present(mnflg)) then if (mnflg.ne.0) then call xcastr(aline, mnflg) !broadcast the input line endif endif ! if (jinc.eq.0) then j = j1-j0 if (j.ge.1-nbdy .and. j.le.jj+nbdy) then do i= max(1-nbdy,i1-i0),min(i1+nl-1-i0,ii+nbdy) a(i,j) = aline(i+i0-i1+1) enddo if (nreg.ne.0 .and. i1+nl-1.ge.itdm-nbdy+1) then ! periodic do i= max(1-nbdy,i1-itdm-i0),min(i1+nl-1-itdm-i0,0) a(i,j) = aline(i+itdm+i0-i1+1) enddo endif if (nreg.ne.0 .and. i1.le.nbdy) then ! periodic do i= max(ii+1,i1+itdm-i0),min(i1+nl-1+itdm-i0,ii+nbdy) a(i,j) = aline(i-itdm+i0-i1+1) enddo endif endif !j elseif (iinc.eq.0) then i = i1-i0 if (i.ge.1-nbdy .and. i.le.ii+nbdy) then do j= max(1-nbdy,j1-j0),min(j1-j0+nl-1,jj+nbdy) a(i,j) = aline(j+j0-j1+1) enddo endif !on tile if (nreg.ne.0 .and. i1.ge.itdm-nbdy+1) then ! periodic i = i1-itdm-i0 !e.g. i1==itdm becomes 0-i0 if (i.ge.1-nbdy .and. i.le.0) then do j= max(1-nbdy,j1-j0),min(j1-j0+nl-1,jj+nbdy) a(i,j) = aline(j+j0-j1+1) enddo endif !in halo endif !near itdm if (nreg.ne.0 .and. i1.le.nbdy) then ! periodic i = i1+itdm-i0 !e.g. i1==1 becomes itdm+1-i0 if (i.ge.ii+1 .and. i.le.ii+nbdy) then do j= max(1-nbdy,j1-j0),min(j1-j0+nl-1,jj+nbdy) a(i,j) = aline(j+j0-j1+1) enddo endif !in halo endif !near 1 endif #if defined(TIMER) ! if (nxc.eq. 5) then call xctmr1( 5) nxc = 0 endif #endif return end subroutine xclput subroutine xcmaxr_0(a, mnflg) implicit none ! real, intent(inout) :: a integer, intent(in) :: mnflg ! !********** !* ! 1) replace scalar a with its element-wise maximum over all tiles. ! ! 2) mnflg selects which nodes must return the minimum ! = 0; all nodes ! = n; node number n (mnproc=n) ! ! 3) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! a real in/out target variable ! mnflg integer input node return flag !* !********** ! real a1(1) ! a1(1) = a call xcmaxr_1(a1, mnflg) a = a1(1) return end subroutine xcmaxr_0 subroutine xcmaxr_1(a, mnflg) implicit none ! real, intent(inout) :: a(:) integer, intent(in) :: mnflg ! !********** !* ! 1) replace array a with its element-wise maximum over all tiles. ! ! 2) mnflg selects which nodes must return the minimum ! = 0; all nodes ! = n; node number n (mnproc=n) ! ! 3) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! a real in/out target array ! mnflg integer input node return flag !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! integer i,is0,isl,mn,mnp,n,nn #if defined(TIMER) ! if (nxc.eq.0) then call xctmr0(10) nxc = 10 endif #endif ! ! stripmine a. ! n = size(a) ! do is0= 0,n-1,nmax isl = min(is0+nmax,n) nn = isl - is0 do i= 1,nn b(i) = a(is0+i) enddo ! #if defined(MPI) if (mnflg.eq.0) then call mpi_allreduce(b,c,nn,MTYPER,mpi_max, & mpi_comm_hycom,mpierr) else call mpi_reduce( b,c,nn,MTYPER,mpi_max, & idproc1(mnflg), & mpi_comm_hycom,mpierr) endif #elif defined(SHMEM) BARRIER mnp = max(1,mnflg) if (mnproc.eq.mnp) then ! form global maximum on one processor do i= 1,nn c(i) = b(i) enddo do mn= 1,ijpr if (mn.ne.mnp) then call SHMEM_GETR(b,b,nn, idproc1(mn)) do i= 1,nn c(i) = max(c(i),b(i)) enddo endif !.ne.mnp enddo BARRIER endif if (mnflg.eq.0) then ! get global maximum from 1st processor BARRIER call SHMEM_GETR(c,c,nn, idproc1(mnp)) endif ! no barrier needed here because using two buffers (b and c) #endif if (mnflg.eq.0 .or. mnflg.eq.mnproc) then do i= 1,nn a(is0+i) = c(i) enddo endif enddo ! stripmine loop #if defined(TIMER) ! 
if (nxc.eq.10) then call xctmr1(10) nxc = 0 endif #endif return end subroutine xcmaxr_1 subroutine xcmaxr_0o(a) implicit none ! real, intent(inout) :: a ! !********** !* ! 1) replace scalar a with its element-wise maximum over all tiles. ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! a real in/out target variable !* !********** ! integer mnflg real a1(1) ! mnflg = 0 !all nodes a1(1) = a call xcmaxr_1(a1, mnflg) a = a1(1) return end subroutine xcmaxr_0o subroutine xcmaxr_1o(a) implicit none ! real, intent(inout) :: a(:) ! !********** !* ! 1) replace array a with its element-wise maximum over all tiles. ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! a real in/out target array !* !********** ! integer mnflg ! mnflg = 0 !all nodes call xcmaxr_1(a, mnflg) return end subroutine xcmaxr_1o subroutine xcminr_0(a, mnflg) implicit none ! real, intent(inout) :: a integer, intent(in) :: mnflg ! !********** !* ! 1) replace scalar a with its element-wise minimum over all tiles. ! ! 2) mnflg selects which nodes must return the minimum ! = 0; all nodes ! = n; node number n (mnproc=n) ! ! 3) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! a real in/out target variable ! mnflg integer input node return flag !* !********** ! real a1(1) ! a1(1) = a call xcminr_1(a1, mnflg) a = a1(1) return end subroutine xcminr_0 subroutine xcminr_1(a, mnflg) implicit none ! real, intent(inout) :: a(:) integer, intent(in) :: mnflg ! !********** !* ! 1) replace array a with its element-wise minimum over all tiles. ! ! 2) mnflg selects which nodes must return the minimum ! = 0; all nodes ! = n; node number n (mnproc=n) ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! a real in/out target array ! mnflg integer input node return flag !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! integer i,is0,isl,mn,mnp,n,nn #if defined(TIMER) ! if (nxc.eq.0) then call xctmr0(10) nxc = 10 endif #endif ! ! stripmine a. ! n = size(a) ! do is0= 0,n-1,nmax isl = min(is0+nmax,n) nn = isl - is0 do i= 1,nn b(i) = a(is0+i) enddo ! #if defined(MPI) if (mnflg.eq.0) then call mpi_allreduce(b,c,nn,MTYPER,mpi_min, & mpi_comm_hycom,mpierr) else call mpi_reduce( b,c,nn,MTYPER,mpi_min, & idproc1(mnflg), & mpi_comm_hycom,mpierr) endif #elif defined(SHMEM) BARRIER mnp = max(1,mnflg) if (mnproc.eq.mnp) then ! form global minimum on one processor do i= 1,nn c(i) = b(i) enddo do mn= 1,ijpr if (mn.ne.mnp) then call SHMEM_GETR(b,b,nn, idproc1(mn)) do i= 1,nn c(i) = min(c(i),b(i)) enddo endif !.ne.mnp enddo BARRIER endif if (mnflg.eq.0) then ! get global minimum from 1st processor BARRIER call SHMEM_GETR(c,c,nn, idproc1(mnp)) endif ! no barrier needed here because using two buffers (b and c) #endif if (mnflg.eq.0 .or. mnflg.eq.mnproc) then do i= 1,nn a(is0+i) = c(i) enddo endif enddo #if defined(TIMER) ! if (nxc.eq.10) then call xctmr1(10) nxc = 0 endif #endif return end subroutine xcminr_1 subroutine xcminr_0o(a) implicit none ! real, intent(inout) :: a ! !********** !* ! 1) replace scalar a with its element-wise minimum over all tiles. ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! a real in/out target variable !* !********** ! integer mnflg real a1(1) ! 
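! --- the _0o and _1o entry points take no mnflg argument: they call
! --- the mnflg forms with mnflg = 0, so the reduced value is returned
! --- on every node.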
mnflg = 0 !all nodes a1(1) = a call xcminr_1(a1, mnflg) a = a1(1) return end subroutine xcminr_0o subroutine xcminr_1o(a) implicit none ! real, intent(inout) :: a(:) ! !********** !* ! 1) replace array a with its element-wise minimum over all tiles. ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! a real in/out target array !* !********** ! integer mnflg ! mnflg = 0 !all nodes call xcminr_1(a, mnflg) return end subroutine xcminr_1o #if defined(OCEANS2) subroutine xcpipe(aa_master,what_master, aa_slave,what_slave) implicit none ! real, intent(out) :: aa_master(itdm,jtdm) real, intent(in) :: aa_slave(itdm,jtdm) character*12, intent(out) :: what_master character*12, intent(in) :: what_slave ! !********** !* ! 1) copy aa_slave on task idp1_2 to aa_master on task idp1_1 ! only called from pipe_compare ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! aa_master real output non-tiled target array ! aa_slave real input non-tiled source array ! what_master character*12 output non-tiled target array name ! what_slave character*12 input non-tiled source array name !* !********** ! include 'mpif.h' integer mpierr ! integer iwhat(12) save iwhat ! integer i ! ! only the 1st task on master and slave are involved in the copy. ! note the use of mpi_comm_world ! if (mnproc.eq.1) then if (nocean.eq.1) then ! --- master call MPI_RECV(iwhat,12,MTYPEI, & idp1_2, 9943, & mpi_comm_world, MPI_STATUS_IGNORE, mpierr) do i= 1,12 what_master(i:i) = char(iwhat(i)) enddo !i call MPI_RECV(aa_master,itdm*jtdm,MTYPER, & idp1_2, 9944, & mpi_comm_world, MPI_STATUS_IGNORE, mpierr) else !nocean==2 ! --- slave do i= 1,12 iwhat(i) = ichar(what_slave(i:i)) enddo !i call MPI_SEND(iwhat,12,MTYPEI, & idp1_1, 9943, & mpi_comm_world, mpierr) call MPI_SEND(aa_slave,itdm*jtdm,MTYPER, & idp1_1, 9944, & mpi_comm_world, mpierr) endif !master:slave endif !mnproc==1 return end subroutine xcpipe #endif /* OCEANS2 */ subroutine xcspmd(mpi_comm_vm) implicit none integer, optional :: mpi_comm_vm ! !********** !* ! 1) initialize data structures that identify the tiles. ! ! 2) data structures (public): ! ipr - 1st 2-D node dimension (<=iqr) ! jpr - 2nd 2-D node dimension (<=jqr) ! ijpr - 1-D node dimension (ipr*jpr) ! mproc - 1st 2-D node index ! nproc - 2nd 2-D node index ! mnproc - 1-D node index ! mp_1st - 1st node in this row of 2-D nodes, mpe_1(nproc) ! i0 - 1st dimension offset for this tile, i0_pe(mproc,nproc) ! ii - 1st dimension extent for this tile, ii_pe(mproc,nproc) ! j0 - 2nd dimension offset for this tile, j0_pe(mproc,nproc) ! jj - 2nd dimension extent for this tile, jj_pe(mproc,nproc) ! nreg - region type ! vland - fill value for land (standard value 0.0) ! ! 3) data structures (private): ! idproc - 2-D node addresses, with periodic wrap ! idproc1 - 1-D node addresses, with periodic wrap ! idhalo - left and right halo target nodes ! i0_pe - 1st dimension tile offsets ! ii_pe - 1st dimension tile extents (<=idm) ! j0_pe - 2nd dimension tile offsets ! jj_pe - 2nd dimension tile extents (<=jdm) ! mpe_1 - 1st node in each row of 2-D nodes ! mpe_e - end node in each row of 2-D nodes ! mpe_i - mapping from 1st global dimension to 2-D nodes ! npe_j - mapping from 2nd global dimension to 2-D nodes ! i1sum - local index of 1st partial sum on each tile ! iisum - number of partial sums on each tile ! m0_top - tile offset: top neighbors (0:jpr-1) ! mm_top - tile extent: top neighbors (<=jpr) ! 
i0_st - halo offsets: send top neighbors ! ii_st - halo lengths: send top neighbors ! i0_gt - halo offsets: get top neighbors ! ii_gt - halo lengths: get top neighbors ! m0_bot - tile offset: bot neighbors (0:jpr-1) ! mm_bot - tile extent: bot neighbors (<=jpr) ! i0_sb - halo offsets: send bot neighbors ! ii_sb - halo lengths: send bot neighbors ! i0_gb - halo offsets: get bot neighbors ! ii_gb - halo lengths: get bot neighbors ! ! 4) all data structures are based on the processor number and ! the patch distribution file, 'patch.input'. !* !********** ! integer i,idm_in,itdm_in,ios,ixsum, & j,jdm_in,jtdm_in,l,m,mm,mn,mypei,n,npesi ! character*80 cfile ! #if defined(MPI) include 'mpif.h' integer mpierr ! ! standard mpi (message passing) version. ! if(present(mpi_comm_vm)) then call mpi_comm_dup(mpi_comm_vm, mpi_comm_hycom, mpierr) lp = 6 else #if ! defined(OCEANS2) call mpi_init(mpierr) call mpi_comm_dup(mpi_comm_world, mpi_comm_hycom, mpierr) lp = 6 #else ! --- two copies of HYCOM on distinct MPI tasks, master/slave via mod_pipe. call mpi_init(mpierr) call mpi_comm_rank(mpi_comm_world, mypei, mpierr) open(unit=uoff+99,file='./patch.input', iostat=ios) if (ios.ne.0) then if (mypei.eq.0) then write(6,'(a)') "xcspmd: can't open ./patch.input" call flush(6) endif !1st task call mpi_finalize(mpierr) stop '(xcspmd)' endif !ios!=0 read(uoff+99,'(/i6/)',iostat=ios) ijpr if (ios.ne.0) then if (mypei.eq.0) then write(6,'(a)') "xcspmd: can't read ./patch.input" call flush(6) endif !1st task call mpi_finalize(mpierr) stop '(xcspmd)' endif !ios!=0 close(uoff+99) idp1_1 = 0 !1st mpi task of 1st ocean idp1_2 = ijpr !1st mpi task of 2nd ocean if (mypei.lt.idp1_2) then nocean = 1 !master ocean call mpi_comm_split(mpi_comm_world, nocean,mypei, & mpi_comm_hycom, mpierr) lp = 6 else nocean = 2 !slave ocean call mpi_comm_split(mpi_comm_world, nocean,mypei, & mpi_comm_hycom, mpierr) lp = uoff+6 write(cfile,"(a,i6.6,a)") "./OCEAN2/stdout",mypei+1,".log" open(unit=lp,file=trim(cfile),status='new') endif #endif /* !OCEANS2:else */ endif !mpi_comm_vm:else ! call mpi_comm_rank(mpi_comm_hycom, mypei, mpierr) call mpi_comm_size(mpi_comm_hycom, npesi, mpierr) ! mnproc = mypei + 1 ! mnproc counts from 1 #if defined(DEBUG_ALL) write(lp,'(a,i5)') 'mnproc =',mnproc call xcsync(flush_lp) #endif #elif defined(SHMEM) integer SHMEM_MYPE,SHMEM_NPES ! ! shmem version. ! call start_pes(0) ! lp = 6 ! mypei = SHMEM_MYPE() npesi = SHMEM_NPES() ! mnproc = mypei + 1 ! mnproc counts from 1 #else lp = 6 ! write(lp,*) write(lp,*) '***** ERROR - UNDEFINED SPMD MACHINE TYPE *****' write(lp,*) call flush(lp) stop '(xcspmd)' #endif /* MPI:SHMEM:else */ ! vland = 0.0 vland4 = 0.0 ! #if defined(T3E) open(unit=lp,delim='none') ! forces open on stdout #endif ! ! initialize timers early to allow xcstop to work. ! call xctmri #if defined(RELO) ! ! -- placeholders to allow xcstop to work. ! idm = 1 jdm = 1 kdm = 1 #endif ! ! read in the tile locations and sizes. ! patch distibution file on unit 21 (fort.21). ! ! here is an example of a patch file, with a leading "!" added: ! ! npes npe mpe idm jdm ibig jbig nreg ! 16 4 4 57 52 19 15 0 ! !ispt( 1) = 22 35 42 49 !iipe( 1) = 13 7 7 7 !ispt( 2) = 1 15 25 34 !iipe( 2) = 14 10 9 9 !ispt( 3) = 2 21 29 38 !iipe( 3) = 19 8 9 10 !ispt( 4) = 18 28 35 42 !iipe( 4) = 10 7 7 9 ! !jspt( 1) = 1 15 26 38 !jjpe( 1) = 14 11 12 15 ! ! ispt(j) is the 1st i-point on each tile in the j-th row ! iipe(j) is the i-extent of each tile in the j-th row ! jspt(1) is the 1st j-point on each tile in all columns ! 
jjpe(1) is the j-extent of each tile in all columns ! ! note that each tile row can have a different tile layout, ! but each tile column is identical. So a tile can have at ! most one neighbor in the E and W directions, but several ! neighbors in the N and S directions. ! ! iipe can be zero, indicating an empty tile (not included in the ! active tile count, npes) and therefore at least a halo width of ! land separation between active tiles to its east and west. In ! periodic cases, the last non-empty tile can periodically wrap to the ! first tile in the row (i.e. trailing "empty" tiles can be null ! tiles, rather than all-land tiles). ! #if defined(OCEANS2) if (nocean.eq.2) then open(unit=uoff+99,file="./OCEAN2/patch.input", & iostat=ios) else open(unit=uoff+99,file='./patch.input', & iostat=ios) endif #else open(unit=uoff+99,file='./patch.input', & iostat=ios) #endif if (ios.ne.0) then call xcstop('xcspmd: error opening patch.input') stop '(xcspmd)' endif read(uoff+99,'(/8i6/)',iostat=ios) ijpr,ipr,jpr, & itdm_in,jtdm_in,idm_in,jdm_in,nreg if (ios.ne.0) then call xcstop('xcspmd: error reading patch.input') stop '(xcspmd)' elseif (ijpr.gt.ijqr .or. ipr.gt.iqr .or. jpr.gt.jqr) then if (mnproc.eq.1) then write(lp,'(a,3i5)') 'input: ijpr,ipr,jpr =',ijpr,ipr,jpr write(lp,'(a,3i5)') 'param: ijqr,iqr,jqr =',ijqr,iqr,jqr call flush(lp) endif call xcstop('xcspmd: patch.input for wrong ipr,jpr,ijpr') stop '(xcspmd)' #if defined(ARCTIC) elseif (nreg.ne.3) then ! not arctic if (mnproc.eq.1) then write(lp,'(a,i5)') 'input: nreg =',nreg call flush(lp) endif call xcstop('xcspmd: patch.input must be for arctic') stop '(xcspmd)' #else elseif (nreg.lt.0 .or. nreg.gt.2) then ! not closed or periodic ! use TYPE=one/omp for f-plane if (mnproc.eq.1) then write(lp,'(a,i5)') 'input: nreg =',nreg call flush(lp) endif call xcstop('xcspmd: patch.input for wrong nreg') stop '(xcspmd)' #endif /* ARCTIC:else */ endif ! #if defined(RELO) ! --- region's horizontal dimensions are from patch.input: ! itdm = itdm_in jtdm = jtdm_in idm = idm_in jdm = jdm_in #else if (itdm_in.ne.itdm .or. jtdm_in.ne.jtdm) then if (mnproc.eq.1) then write(lp,'(a,2i5)') 'input: itdm,jtdm =',itdm_in,jtdm_in write(lp,'(a,2i5)') 'param: itdm,jtdm =',itdm, jtdm call flush(lp) endif call xcstop('xcspmd: patch.input for wrong itdm,jtdm') stop '(xcspmd)' elseif (idm_in.gt.idm .or. jdm_in.gt.jdm) then if (mnproc.eq.1) then write(lp,'(a,2i5)') 'input: idm,jdm =',idm_in,jdm_in write(lp,'(a,2i5)') 'param: idm,jdm =',idm, jdm call flush(lp) endif call xcstop('xcspmd: patch.input for wrong idm,jdm') stop '(xcspmd)' endif #endif ! ! individual tile rows. ! do n= 1,jpr read( uoff+99,'(12x,8i6)') (i0_pe(m,n),m=1,ipr) ! ispt=i0+1 read( uoff+99,'(12x,8i6)') (ii_pe(m,n),m=1,ipr) ! iipe if (maxval(ii_pe(1:ipr,n)).le.0) then call xcstop('xcspmd: patch.input has an empty row') stop '(xcspmd)' endif do m= 1,ipr i0_pe(m,n) = i0_pe(m,n) - 1 enddo enddo #if defined(ARCTIC) ! ! --- all arctic patch tiles must be the same size or empty, ! --- and empty tiles must be "twinned" across the top boundary. !
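! --- "twinned" means tile m of the top row pairs with tile ipr+1-m
! --- across the arctic seam (the same mapping used for idproc(m,jpr+1)
! --- further below), so either both tiles are empty or neither is.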
if (ipr.gt.1) then do m= 1,ipr if (ii_pe(m,jpr).eq.0) then if (ii_pe(ipr+1-m,jpr).ne.0) then if (mnproc.eq.1) then write(lp,'(a,i3,a,i3,a)') & 'error - tile',m,',jpr is empty but tile', & ipr+1-m,',jpr is not' call flush(lp) endif call xcstop('xcspmd: arctic empty tiles are not twins') stop '(xcspmd)' endif elseif (ii_pe(m,jpr).ne.itdm/ipr) then if (mnproc.eq.1) then write(lp,'(a,i5)') & 'error - arctic patch tiles should have ii =',itdm/ipr call flush(lp) endif call xcstop('xcspmd: arctic tiles are not the right size') stop '(xcspmd)' endif enddo !m endif #endif /* ARCTIC */ ! ! the generic tile column (must cover entire column). ! read( uoff+99,*) read( uoff+99,'(12x,8i6)') (j0_pe(1,n),n=1,jpr) ! jspt = io+1 read( uoff+99,'(12x,8i6)') (jj_pe(1,n),n=1,jpr) ! jjpe if (j0_pe(1,1).ne.1) then call xcstop('xcspmd: patch.input for wrong jspt') stop '(xcspmd)' endif j0_pe(1,1) = 0 do n= 2,jpr j0_pe(1,n) = j0_pe(1,n) - 1 if (j0_pe(1,n).ne.j0_pe(1,n-1)+jj_pe(1,n-1)) then call xcstop('xcspmd: patch.input non-contiguous') stop '(xcspmd)' endif enddo if (j0_pe(1,jpr)+jj_pe(1,jpr).ne.jtdm) then call xcstop('xcspmd: patch.input for wrong jjpe') stop '(xcspmd)' endif do m= 2,ipr j0_pe(m,:) = j0_pe(1,:) jj_pe(m,:) = jj_pe(1,:) enddo close(uoff+99) ! #if defined(RELO) allocate( mpe_i(0:itdm+1,0:jqr), npe_j(0:jtdm+1) ) call mem_stat_add( (itdm+2)*(jqr+1)+jtdm+2 ) #endif ! #if defined(MPI) ! ! do we have the right number of pes? ! if (npesi.ne.ijpr) then if (mnproc.eq.1) then write(lp,*) write(lp,*) '***** ERROR - WRONG MPI SIZE *****' write(lp,*) write(lp,*) 'NPES = ',npesi write(lp,*) ' IJPR = ', ijpr write(lp,*) 'IPR,JPR = ',ipr,jpr write(lp,*) call flush(lp) endif call xcstop('Error in xcspmd') stop endif ! ! mpi messages are sent and received by pe number (0:ijpr-1). ! null_tile = mpi_proc_null ! mn = 0 do n= 1,jpr mpe_1(n) = 0 do m= 1,ipr if (ii_pe(m,n).eq.0) then idproc(m,n) = null_tile else idproc1(mn+1) = mn idproc(m,n) = mn mn = mn + 1 if (mnproc.eq.mn) then mproc = m nproc = n endif mpe_e(n) = m if (mpe_1(n).eq.0) then mpe_1(n) = m endif endif enddo enddo ! mp_1st = mpe_1(nproc) !1st node in this row (public) ! ! mpi-2 i/o group (public), see mod_za. ! if (mproc.eq.mp_1st) then i = 1 call mpi_comm_split(mpi_comm_hycom, i,0, & group_1st_in_row, mpierr) else i = 0 call mpi_comm_split(mpi_comm_hycom, i,0, & group_1st_in_row, mpierr) call mpi_comm_free( group_1st_in_row, mpierr) endif #elif defined(SHMEM) ! ! do we have the right number of pes? ! if (npesi.ne.ijpr) then if (mnproc.eq.1) then write(lp,*) write(lp,*) '***** ERROR - WRONG SHMEM SIZE *****' write(lp,*) write(lp,*) 'NPES = ',npesi write(lp,*) ' IJPR = ', ijpr write(lp,*) 'IPR,JPR = ',ipr,jpr write(lp,*) call flush(lp) endif call xcstop('Error in xcspmd') stop endif ! ! shmem messages are sent and received by pe number (0:ijpr-1). ! null_tile = -1 ! mn = 0 do n= 1,jpr mpe_1(n) = 0 do m= 1,ipr if (ii_pe(m,n).eq.0) then idproc(m,n) = null_tile else idproc1(mn+1) = mn idproc(m,n) = mn mn = mn + 1 if (mnproc.eq.mn) then mproc = m nproc = n endif mpe_e(n) = m if (mpe_1(n).eq.0) then mpe_1(n) = m endif endif enddo enddo ! mp_1st = mpe_1(nproc) !1st node in this row (public) #endif ! if (mn.ne.ijpr) then if (mnproc.eq.1) then write(lp,'(a,i5)') 'input: ijpr =',ijpr write(lp,'(a,i5)') 'calc: ijpr =',mn call flush(lp) endif call xcstop('xcspmd: wrong number of sea tiles') stop '(xcspmd)' endif ! if (nreg.eq.0) then ! ! longitudinal tile dimension is closed (not periodic) ! 
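! --- idproc carries extra tile slots at 0 and ipr+1 so that the halo
! --- logic can reference idproc(mproc-1,nproc) and idproc(mproc+1,nproc)
! --- without special cases; in the closed case both extras are
! --- null_tile, while the periodic branch below points slot 0 at the
! --- row's last active tile and the slot just past it back at the
! --- first tile, whenever the row spans the full itdm width.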
do n= 1,jpr idproc( 0,n) = null_tile idproc(ipr+1,n) = null_tile enddo else ! ! longitudinal tile dimension is potentially periodic. ! do n= 1,jpr idproc( 0,n) = null_tile idproc(ipr+1,n) = null_tile ! i = maxval((i0_pe(1:ipr,n)+ii_pe(1:ipr,n))) if (i0_pe(1,n).eq.0 .and. i.eq.itdm) then idproc( 0,n) = idproc(mpe_e(n),n) idproc(mpe_e(n)+1,n) = idproc( 1,n) endif enddo endif #if defined(ARCTIC) ! ! must have ipr even or 1 for arctic boundary case. ! if (ipr.gt.1 .and. mod(ipr,2).ne.0) then call xcstop('Error in xcspmd (arctic) - ipr must be even') stop '(xcspmd)' endif ! ! latitudinal tile dimension is closed/arctic. ! do m= 1,ipr idproc(m, 0) = null_tile idproc(m,jpr+1) = idproc(ipr+1-m,jpr) !arctic tile mapping enddo idproc( 0, 0) = null_tile idproc(ipr+1, 0) = null_tile idproc( 0,jpr+1) = idproc(ipr,jpr+1) idproc(ipr+1,jpr+1) = idproc(1, jpr+1) #else ! ! latitudinal tile dimension is closed ! do m= 0,ipr+1 idproc(m, 0) = null_tile idproc(m,jpr+1) = null_tile enddo #endif /* ARCTIC:else */ ! ! 1-d tiling logic is easier if assumed periodic. ! idproc1( 0) = idproc1(ijpr) idproc1(ijpr+1) = idproc1( 1) ! ! mapping from global i,j to mp,np. ! ia,ja is on tile mpe_i(ia,npe_j(ja)),npe_j(ja), ! or on no tile if mpe_i(ia,npe_j(ja)) is 0 or -1. ! mpe_i( :,0) = -1 !for npe_j out of range mpe_i( 0,:) = -1 !for i out of range mpe_i(itdm+1,:) = -1 !for i out of range do n= 1,jpr mpe_i(1:itdm,n) = 0 ! default is an empty tile do m= 1,ipr ! i-map potentially varies with n if (ii_pe(m,n).gt.0) then do i= i0_pe(m,n)+1,i0_pe(m,n)+ii_pe(m,n) mpe_i(i,n) = m enddo if (m.ne.ipr) then if (ii_pe(m+1,n).gt.0) then do i= i0_pe(m,n)+ii_pe(m,n)+1,i0_pe(m+1,n) mpe_i(i,n) = -1 ! gap between tiles enddo endif endif endif enddo m = 1 ! only one j-map do j= j0_pe(m,n)+1,j0_pe(m,n)+jj_pe(m,n) npe_j(j) = n enddo npe_j( 0) = 0 !for j out of range npe_j(jtdm+1) = 0 !for j out of range enddo ! ! do each partial sum on the tile that owns its center point. ! i1sum - local index of 1st partial sum on each tile ! iisum - number of partial sums on each tile ! see xcsum for how i1sum and iisum are used. ! ixsum = 0 do n= 1,jpr do m= 1,ipr if (ii_pe(m,n).le.0) then i1sum(m,n) = 0 iisum(m,n) = 0 else ! --- partial sum specific definition of idhalo idhalo(1) = idproc(m-1,n) idhalo(2) = idproc(m+1,n) if (idhalo(1).ne.null_tile .and. m.ne.1) then if (i0_pe(m,n).ne.i0_pe(m-1,n)+ii_pe(m-1,n)) then idhalo(1) = null_tile endif endif if (m.eq.mpe_e(n)) then idhalo(2) = null_tile elseif (idhalo(2).ne.null_tile) then if (i0_pe(m,n)+ii_pe(m,n).ne.i0_pe(m+1,n)) then idhalo(2) = null_tile endif endif i1sum(m,n) = -99 iisum(m,n) = 0 do i= 1+nbdy,itdm+nbdy,2*nbdy+1 if (i0_pe(m,n).lt.i .and. & i.le.i0_pe(m,n)+ii_pe(m,n)) then iisum(m,n) = iisum(m,n) + 1 if (iisum(m,n).eq.1) then i1sum(m,n) = i - nbdy - i0_pe(m,n) endif elseif (idhalo(1).eq.null_tile .and. & i.gt.i0_pe(m,n)-nbdy .and. & i.le.i0_pe(m,n) ) then iisum(m,n) = iisum(m,n) + 1 if (iisum(m,n).eq.1) then i1sum(m,n) = i - nbdy - i0_pe(m,n) endif elseif (idhalo(2).eq.null_tile .and. & i.gt.i0_pe(m,n)+ii_pe(m,n) .and. & i.le.i0_pe(m,n)+ii_pe(m,n)+nbdy ) then iisum(m,n) = iisum(m,n) + 1 if (iisum(m,n).eq.1) then i1sum(m,n) = i - nbdy - i0_pe(m,n) endif endif enddo endif ixsum = max( ixsum, iisum(m,n) ) enddo !m enddo !n ! ! local tile extents. ! i0 = i0_pe(mproc,nproc) ii = ii_pe(mproc,nproc) j0 = j0_pe(mproc,nproc) jj = jj_pe(mproc,nproc) ! ! left and right halo targets ! idhalo(1) = idproc(mproc-1,nproc) idhalo(2) = idproc(mproc+1,nproc) ! if (idhalo(1).ne.null_tile .and. mproc.ne.1) then ! ! 
is the left tile touching this one? ! if (i0.ne.i0_pe(mproc-1,nproc)+ii_pe(mproc-1,nproc)) then idhalo(1) = null_tile endif endif ! if (idhalo(2).ne.null_tile .and. mproc.ne.mpe_e(nproc)) then ! ! is the right tile touching this one? ! if (i0+ii.ne.i0_pe(mproc+1,nproc)) then idhalo(2) = null_tile endif endif ! ! local halo exchange data structures ! ! m0_top - tile offset: top neighbors ! mm_top - tile extent: top neighbors (<=jpr) ! i0_st - halo offsets: send top neighbors ! ii_st - halo lengths: send top neighbors ! i0_gt - halo offsets: get top neighbors ! ii_gt - halo lengths: get top neighbors ! m0_bot - tile offset: bot neighbors ! mm_bot - tile extent: bot neighbors (<=jpr) ! i0_sb - halo offsets: send bot neighbors ! ii_sb - halo lengths: send bot neighbors ! i0_gb - halo offsets: get bot neighbors ! ii_gb - halo lengths: get bot neighbors ! ! note that send is also receive, and is w.r.t. the local tile. ! similarly get is also put, and is w.r.t. the remote tile. ! if (nproc.eq.jpr) then #if defined(ARCTIC) ! single, same size, top arctic nieghbor m0_top = mproc - 1 mm_top = 1 i0_st(1) = 0 i0_gt(1) = 0 ii_st(1) = ii ii_gt(1) = ii #else ! no top nieghbor (closed boundary) m0_top = 0 mm_top = 0 #endif /* ARCTIC:else */ else n = nproc + 1 m0_top = 0 mm_top = 0 m = 0 do i= 1,ii if (mpe_i(i0+i,n).ne.m) then if (mm_top.eq.0) then m0_top = mpe_i(i0+i,n) - 1 elseif (m.ne.-1) then ii_st(mm_top) = i-1 - i0_st(mm_top) ii_gt(mm_top) = ii_st(mm_top) endif m = mpe_i(i0+i,n) if (m.gt.0) then mm_top = mm_top + 1 i0_st(mm_top) = i-1 i0_gt(mm_top) = i-1 + i0-i0_pe(m,n) elseif (m.eq.0) then mm_top = mm_top + 1 i0_st(mm_top) = i-1 i0_gt(mm_top) = i0_gt(mm_top-1) + ii_gt(mm_top-1) ! elseif (m.eq.-1) then !do nothing endif endif enddo if (mm_top.gt.0) then if (m.gt.0) then ii_st(mm_top) = ii - i0_st(mm_top) ii_gt(mm_top) = ii_st(mm_top) elseif (m.eq.0) then mm_top = mm_top-1 ! elseif (m.eq.-1) then !do nothing endif endif endif !nproc.eq.1:else ! if (nproc.eq.1) then ! no bottom nieghbor (closed boundary) m0_bot = 0 mm_bot = 0 else n = nproc - 1 m0_bot = 0 mm_bot = 0 m = 0 do i= 1,ii if (mpe_i(i0+i,n).ne.m) then if (mm_bot.eq.0) then m0_bot = mpe_i(i0+i,n) - 1 elseif (m.ne.-1) then ii_sb(mm_bot) = i-1 - i0_sb(mm_bot) ii_gb(mm_bot) = ii_sb(mm_bot) endif m = mpe_i(i0+i,n) if (m.gt.0) then mm_bot = mm_bot + 1 i0_sb(mm_bot) = i-1 i0_gb(mm_bot) = i-1 + i0-i0_pe(m,n) elseif (m.eq.0) then mm_bot = mm_bot + 1 i0_sb(mm_bot) = i-1 i0_gb(mm_bot) = i0_gb(mm_bot-1) + ii_gb(mm_bot-1) ! elseif (m.eq.-1) then !do nothing endif endif enddo if (mm_bot.gt.0) then if (m.gt.0) then ii_sb(mm_bot) = ii - i0_sb(mm_bot) ii_gb(mm_bot) = ii_sb(mm_bot) elseif (m.eq.0) then mm_bot = mm_bot-1 ! elseif (m.eq.-1) then !do nothing endif endif endif !nproc.eq.1:else ! ! printout the tile data structures. ! 
if (mnproc.eq.1) then write(lp,'(/a)') & 'mnproc mproc nproc i0 ii j0 jj i1sum iisum' mn = 0 do n= 1,jpr do m= 1,ipr if (ii_pe(m,n).ne.0) then mn= mn + 1 write(lp,'(i6,2i6,i7,i5,i7,i5,i7,i6)') & mn,m,n, & i0_pe(m,n),ii_pe(m,n), & j0_pe(m,n),jj_pe(m,n), & i1sum(m,n),iisum(m,n) endif enddo !m enddo !n write(lp,*) #if defined(ARCTIC) write(lp,'(a)') & 'mnproc mproc nproc mnarct' mn = 0 do n= 1,jpr do m= 1,ipr if (ii_pe(m,n).ne.0) then mn= mn + 1 if (n.eq.jpr) then write(lp,'(i6,2i6,i7)') & mn,m,n,idproc(m,n+1) endif endif enddo !m enddo !n write(lp,*) #endif /* ARCTIC */ #if defined(DEBUG_ALL) do n= 1,jpr write(lp,*) 'mpe_1,mpe_e = ',mpe_1(n),mpe_e(n) enddo do n= 1,jpr write(lp,*) 'mpe_i = ',mpe_i(:,n) enddo write(lp,*) write(lp,*) 'npe_j = ',npe_j(:) write(lp,*) #endif endif call xcsync(flush_lp) ! #if defined(DEBUG_ALL) do n= 1,jpr do m= 1,ipr if (mproc.eq.m .and. nproc.eq.n) then write(lp,'(a,2i3,i4,i3,16(i5,i4))') & 'm,n,_top,_st = ', & m,n,m0_top,mm_top,(i0_st(l),ii_st(l), l= 1,mm_top) #if defined(SHMEM) write(lp,'(a,2i3,i4,i3,16(i5,i4))') & 'm,n,_top,_gt = ', & m,n,m0_top,mm_top,(i0_gt(l),ii_gt(l), l= 1,mm_top) #endif if (m.eq.ipr) then write(lp,*) !blank line endif endif call xcsync(flush_lp) enddo !m do m= 1,ipr if (mproc.eq.m .and. nproc.eq.n) then write(lp,'(a,2i3,i4,i3,16(i5,i4))') & 'm,n,_bot,_sb = ', & m,n,m0_bot,mm_bot,(i0_sb(l),ii_sb(l), l= 1,mm_bot) #if defined(SHMEM) write(lp,'(a,2i3,i4,i3,16(i5,i4))') & 'm,n,_bot,_gb = ', & m,n,m0_bot,mm_bot,(i0_gb(l),ii_gb(l), l= 1,mm_bot) #endif if (m.eq.ipr) then write(lp,*) !blank line endif endif call xcsync(flush_lp) enddo !m do m= 1,ipr if (mproc.eq.m .and. nproc.eq.n) then write(lp,'(a,2i3,3i5)') & 'm,n,id,idhalo = ', & m,n,idproc(m,n),idhalo if (m.eq.ipr) then write(lp,*) !blank line endif endif call xcsync(flush_lp) enddo !m enddo !n #endif /* DEBUG_ALL */ #if defined(USE_ESMF4) || defined(ESPC_COUPLE) ! ! --- ESMF tiling fills the global array (1:itdm,1:jtdm) ! mn=0 do n=1,jpr do m=1,ipr if(ii_pe(m,n).ne.0)then mn= mn + 1 #if defined(ARCTIC) ! --- don't include the last row (it is a copy of the previous one) deBlockList(2,1,mn) = j0_pe(m,n)+1 deBlockList(2,2,mn) = min( j0_pe(m,n)+jj_pe(m,n), jtdm-1 ) #else deBlockList(2,1,mn) = j0_pe(m,n)+1 deBlockList(2,2,mn) = j0_pe(m,n)+jj_pe(m,n) #endif /* ARCTIC:else */ if (m.eq.mpe_1(n)) then if (m.eq.mpe_e(n)) then ! --- only tile in row deBlockList(1,1,mn) = 1 deBlockList(1,2,mn) = itdm else ! --- 1st tile in row, right edge may be extended later deBlockList(1,1,mn) = 1 deBlockList(1,2,mn) = i0_pe(m,n)+ii_pe(m,n) endif elseif (m.eq.mpe_e(n)) then ! --- last tile in row deBlockList(1,1,mn) = i0_pe(m,n)+1 deBlockList(1,2,mn) = itdm if (deBlockList(1,1,mn) .ne. & deBlockList(1,2,mn-1)+1 ) then ! --- a hole in the HYCOM tiling, over land, is filled here deBlockList(1,2,mn-1) = (deBlockList(1,2,mn-1)+ & deBlockList(1,1,mn) )/2 deBlockList(1,1,mn) = deBlockList(1,2,mn-1)+1 endif else ! --- middle tile in row, right edge may be extended later deBlockList(1,1,mn) = i0_pe(m,n)+1 deBlockList(1,2,mn) = i0_pe(m,n)+ii_pe(m,n) if (deBlockList(1,1,mn) .ne. & deBlockList(1,2,mn-1)+1 ) then ! --- a hole in the HYCOM tiling, over land, is filled here deBlockList(1,2,mn-1) = (deBlockList(1,2,mn-1)+ & deBlockList(1,1,mn) )/2 deBlockList(1,1,mn) = deBlockList(1,2,mn-1)+1 endif endif endif enddo !m enddo !n ! 
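!
! --- note: the blocks built above must cover i=1:itdm (and j=1:jtdm-1
! --- under ARCTIC, j=1:jtdm otherwise) with no gaps, so the first and
! --- last blocks in each row are extended to i=1 and i=itdm and any
! --- land hole between neighboring HYCOM tiles is split at its
! --- midpoint between the two blocks.
!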
if (mnproc.eq.1) then write(lp,'(/a)') & 'mnproc mproc nproc BL11 BL21 BL12 BL22' mn = 0 do n= 1,jpr do m= 1,ipr if (ii_pe(m,n).ne.0) then mn=mn+1 write(lp,'(i6,2i6,4i7)') & mn,m,n, & deBlockList(1,1,mn), & deBlockList(2,1,mn), & deBlockList(1,2,mn), & deBlockList(2,2,mn) endif enddo !m enddo !n write(lp,*) endif !1st tile call xcsync(flush_lp) #endif ! ! mxsum large enough? ! #if defined(RELO) mxsum = ((idm+4*nbdy)/(2*nbdy+1))*jdm ! #endif if (ixsum.gt.mxsum) then if (mnproc.eq.1) then write(lp,'(a,2i5)') 'mxsum,ixsum =',mxsum,ixsum call flush(lp) endif call xcstop('Error in xcspmd - mxsum too small') stop '(xcspmd)' endif ! #if defined(TIMER) ! initialize timer names. ! call xctmrn( 1,'xcaget') call xctmrn( 2,'xceget') call xctmrn( 3,'xclget') call xctmrn( 4,'xcaput') call xctmrn( 5,'xcXput') call xctmrn( 6,'xcsum ') call xctmrn( 9,'xcastr') call xctmrn(10,'xcmaxr') call xctmrn(12,'xctilr') #if defined(ARCTIC) call xctmrn(13,'xctila') #endif #endif return end subroutine xcspmd subroutine xcstop(cerror) implicit none ! character*(*), intent(in) :: cerror ! !********** !* ! 1) stop all processes. ! ! 2) all processes must call this routine. ! use 'xchalt' for emergency stops. ! ! 3) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! cerror char*(*) input error message !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! ! print active timers. ! call xctmrp ! ! message passing version, set barrier and then stop everything. ! use xcsync as the barrier so that stdout is flushed. ! note that the system will hang unless all processes call xcstop. ! call xcsync(flush_lp) if (mnproc.eq.1 .and. cerror.ne.' ') then write(lp,*) '**************************************************' write(lp,*) cerror write(lp,*) '**************************************************' write(lp,*) endif call xcsync(flush_lp) ! #if defined(USE_ESMF4) || defined(OCEANS2) || defined(ESPC_COUPLE) ! ! --- we may not be running on all processes, so call mpi_abort ! if (mnproc.eq.1) then call mpi_abort(mpi_comm_hycom,9) endif call xcsync(flush_lp) #elif defined(MPI) write(lp,*) 'mpi_finalize called on processor ',mnproc call xcsync(flush_lp) call mpi_finalize(mpierr) ! #endif #if defined(STOP2003) ! --- Fortran 2003 STOP outputs many lines, replace with exit call exit(0) #else stop '(xcstop)' #endif end subroutine xcstop subroutine xcsum(sum, a,mask) implicit none ! real*8, intent(out) :: sum real, intent(inout) :: a( 1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) integer, intent(in) :: mask(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) ! !********** !* ! 1) sum a 2-d array, where mask==1 ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! sum real*8 output sum of a ! a real input source array ! mask integer input mask array ! ! 3) sum is bit for bit reproducable for the same halo size, nbdy. !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! real*8 zero8 parameter (zero8=0.0) ! real*8 sum8 real vsave integer i,i1,j,l,mp,np #if defined(TIMER) ! if (nxc.eq.0) then call xctmr0( 6) nxc = 6 endif #endif #if defined(RELO) ! if (.not.allocated(sum8t)) then allocate( sum8t(mxsum), sum8j(jdm) ) call mem_stat_add( mxsum + jdm ) endif #endif ! ! halo update so that 2*nbdy+1 wide strips are on chip. ! vsave = vland vland = 0.0 call xctilr(a,1,1, nbdy,0, halo_ps) vland = vsave ! ! row sums in 2*nbdy+1 wide strips. ! 
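!
! --- the partial sums below are over fixed global strips of width
! --- 2*nbdy+1 starting at global column 1 (e.g. for nbdy=6 the strips
! --- cover columns 1:13, 14:26, ...), each strip summed on the tile
! --- that owns its center point (i1sum/iisum from xcspmd).  the halo
! --- update above makes the off-tile part of a strip available locally,
! --- so the order of additions, and hence the result, is independent
! --- of the tile layout for a given nbdy.
!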
!$OMP PARALLEL DO PRIVATE(j,i1,i,l,sum8) & !$OMP SCHEDULE(STATIC,jblk) do j=1,jj do l= 1,iisum(mproc,nproc) i1 = i1sum(mproc,nproc) + (l-1)*(2*nbdy+1) sum8 = zero8 do i= max(i1,1-nbdy),min(i1+2*nbdy,ii+nbdy,itdm-i0) if (mask(i,j).eq.1) then sum8 = sum8 + a(i,j) endif enddo sum8t(l + (j-1)*iisum(mproc,nproc)) = sum8 enddo enddo !$OMP END PARALLEL DO ! ! complete row sums on first processor in each row. ! #if defined(SHMEM) BARRIER #endif if (mproc.eq.mpe_1(nproc)) then do j=1,jj sum8j(j) = zero8 do l= 1,iisum(mproc,nproc) sum8j(j) = sum8j(j) + sum8t(l + (j-1)*iisum(mproc,nproc)) enddo ! write(lp,'(a,i3,i5,f12.2)') 'xcsum: np,j,sum = ', ! & 1,j,sum8j(j) enddo ! ! remote sums. ! do mp= mpe_1(nproc)+1,mpe_e(nproc) l = iisum(mp,nproc)*jj if (l.gt.0) then #if defined(MPI) call MPI_RECV(sum8t,l,MTYPED, & idproc(mp,nproc), 9900, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) #elif defined(SHMEM) call SHMEM_GETD(sum8t, & sum8t,l,idproc(mp,nproc)) #endif do j=1,jj do l= 1,iisum(mp,nproc) sum8j(j) = sum8j(j) + sum8t(l + (j-1)*iisum(mp,nproc)) enddo ! write(lp,'(a,i3,i5,f12.2)') 'xcsum: np,j,sum = ', ! & mp,j,sum8j(j) enddo endif enddo #if defined(MPI) else l = iisum(mproc,nproc)*jj if (l.gt.0) then call MPI_SEND(sum8t,l,MTYPED, & idproc(mpe_1(nproc),nproc), 9900, & mpi_comm_hycom, mpierr) endif #endif endif ! ! sum of row sums, on first processor. ! #if defined(SHMEM) BARRIER #endif if (mnproc.eq.1) then sum8 = zero8 do j= 1,jj sum8 = sum8 + sum8j(j) enddo ! write(lp,'(a,i5,f12.2)') 'xcsum: jj,sum = ',jj,sum8 ! do np= 2,jpr mp = mpe_1(np) #if defined(MPI) call MPI_RECV(sum8j,jj_pe(mp,np),MTYPED, & idproc(mp,np), 9901, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) #elif defined(SHMEM) call SHMEM_GETD(sum8j, & sum8j,jj_pe(mp,np),idproc(mp,np)) #endif do j= 1,jj_pe(mp,np) sum8 = sum8 + sum8j(j) enddo ! write(lp,'(a,i5,f12.2)') 'xcsum: jj,sum = ', ! & jj_pe(mp,np),sum8 enddo sum8s = sum8 #if defined(MPI) elseif (mproc.eq.mpe_1(nproc)) then call MPI_SEND(sum8j,jj,MTYPED, & idproc1(1), 9901, & mpi_comm_hycom, mpierr) #endif endif ! ! broadcast result to all processors. ! #if defined(MPI) call mpi_bcast(sum8s,1,MTYPED, & idproc1(1),mpi_comm_hycom,mpierr) #elif defined(SHMEM) BARRIER if (mnproc.ne.1) then call SHMEM_GETD(sum8s, & sum8s,1,idproc1(1)) endif #endif ! sum = sum8s #if defined(TIMER) ! if (nxc.eq. 6) then call xctmr1( 6) nxc = 0 endif #endif return end subroutine xcsum subroutine xcsumj(sumj, a,mask) implicit none ! real*8, intent(out) :: sumj(jtdm) real, intent(inout) :: a( 1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) integer, intent(in) :: mask(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) ! !********** !* ! 1) rwo-sum of a 2-d array, where mask==1, on first processor only. ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! sumj real*8 output row-sum of a ! a real input source array ! mask integer input mask array ! ! 3) sum is bit for bit reproducable for the same halo size, nbdy. !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! real*8 zero8 parameter (zero8=0.0) ! real*8 sum8 real vsave integer i,i1,j,l,mp,np #if defined(TIMER) ! if (nxc.eq.0) then call xctmr0( 6) nxc = 6 endif #endif #if defined(RELO) ! if (.not.allocated(sum8t)) then allocate( sum8t(mxsum), sum8j(jdm) ) call mem_stat_add( mxsum + jdm ) endif #endif ! ! halo update so that 2*nbdy+1 wide strips are on chip. ! vsave = vland vland = 0.0 call xctilr(a,1,1, nbdy,0, halo_ps) vland = vsave ! ! row sums in 2*nbdy+1 wide strips. ! 
!$OMP PARALLEL DO PRIVATE(j,i1,i,l,sum8) & !$OMP SCHEDULE(STATIC,jblk) do j=1,jj do l= 1,iisum(mproc,nproc) i1 = i1sum(mproc,nproc) + (l-1)*(2*nbdy+1) sum8 = zero8 do i= max(i1,1-nbdy),min(i1+2*nbdy,ii+nbdy,itdm-i0) if (mask(i,j).eq.1) then sum8 = sum8 + a(i,j) endif enddo sum8t(l + (j-1)*iisum(mproc,nproc)) = sum8 enddo enddo !$OMP END PARALLEL DO ! ! complete row sums on first processor in each row. ! #if defined(SHMEM) BARRIER #endif if (mproc.eq.mpe_1(nproc)) then do j=1,jj sum8j(j) = zero8 do l= 1,iisum(mproc,nproc) sum8j(j) = sum8j(j) + sum8t(l + (j-1)*iisum(mproc,nproc)) enddo ! write(lp,'(a,i3,i5,f12.2)') 'xcsum: np,j,sum = ', ! & 1,j,sum8j(j) enddo ! ! remote sums. ! do mp= mpe_1(nproc)+1,mpe_e(nproc) l = iisum(mp,nproc)*jj if (l.gt.0) then #if defined(MPI) call MPI_RECV(sum8t,l,MTYPED, & idproc(mp,nproc), 9900, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) #elif defined(SHMEM) call SHMEM_GETD(sum8t, & sum8t,l,idproc(mp,nproc)) #endif do j=1,jj do l= 1,iisum(mp,nproc) sum8j(j) = sum8j(j) + sum8t(l + (j-1)*iisum(mp,nproc)) enddo ! write(lp,'(a,i3,i5,f12.2)') 'xcsum: np,j,sum = ', ! & mp,j,sum8j(j) enddo endif enddo #if defined(MPI) else l = iisum(mproc,nproc)*jj if (l.gt.0) then call MPI_SEND(sum8t,l,MTYPED, & idproc(mpe_1(nproc),nproc), 9900, & mpi_comm_hycom, mpierr) endif #endif endif ! ! send row sums to first processor. ! #if defined(SHMEM) BARRIER #endif if (mnproc.eq.1) then do j= 1,jj sumj(j) = sum8j(j) enddo ! do np= 2,jpr mp = mpe_1(np) #if defined(MPI) call MPI_RECV(sum8j,jj_pe(mp,np),MTYPED, & idproc(mp,np), 9901, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) #elif defined(SHMEM) call SHMEM_GETD(sum8j, & sum8j,jj_pe(mp,np),idproc(mp,np)) #endif do j= 1,jj_pe(1,np) sumj(j+j0_pe(1,np)) = sum8j(j) enddo enddo #if defined(MPI) elseif (mproc.eq.mpe_1(nproc)) then call MPI_SEND(sum8j,jj,MTYPED, & idproc1(1), 9901, & mpi_comm_hycom, mpierr) #endif endif #if defined(TIMER) ! if (nxc.eq. 6) then call xctmr1( 6) nxc = 0 endif #endif return end subroutine xcsumj subroutine xcsumr_0(a, mnflg) implicit none ! real, intent(inout) :: a integer, intent(in) :: mnflg ! !********** !* ! 1) replace scalar a with its element-wise sum over all tiles. ! ! 2) mnflg selects which nodes must return the sum ! = 0; all nodes ! = n; node number n (mnproc=n) ! ! 3) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! a real in/out target variable ! mnflg integer input node return flag !* !********** ! real a1(1) ! a1(1) = a call xcsumr_1(a1, mnflg) a = a1(1) return end subroutine xcsumr_0 subroutine xcsumr_1(a, mnflg) implicit none ! real, intent(inout) :: a(:) integer, intent(in) :: mnflg ! !********** !* ! 1) replace array a with its element-wise sum over all tiles. ! ! 2) mnflg selects which nodes must return the sum ! = 0; all nodes ! = n; node number n (mnproc=n) ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! a real in/out target array ! mnflg integer input node return flag !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! integer i,is0,isl,mn,mnp,n,nn #if defined(TIMER) ! if (nxc.eq.0) then call xctmr0(10) nxc = 10 endif #endif ! ! stripmine a. ! n = size(a) ! do is0= 0,n-1,nmax isl = min(is0+nmax,n) nn = isl - is0 do i= 1,nn b(i) = a(is0+i) enddo ! 
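!
! --- each pass handles at most nmax elements: the slice of a is copied
! --- into scratch buffer b, the element-wise sum across all tiles is
! --- formed in c (mpi_allreduce or mpi_reduce under MPI, an explicit
! --- gather-and-add under SHMEM), and c is copied back into a on the
! --- node(s) selected by mnflg.
!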
#if defined(MPI) if (mnflg.eq.0) then call mpi_allreduce(b,c,nn,MTYPER,mpi_sum, & mpi_comm_hycom,mpierr) else call mpi_reduce( b,c,nn,MTYPER,mpi_sum, & idproc1(mnflg), & mpi_comm_hycom,mpierr) endif #elif defined(SHMEM) BARRIER mnp = max(1,mnflg) if (mnproc.eq.mnp) then ! form global sum on one processor do i= 1,nn c(i) = b(i) enddo do mn= 1,ijpr if (mn.ne.mnp) then call SHMEM_GETR(b,b,nn, idproc1(mn)) do i= 1,nn c(i) = c(i) + b(i) enddo endif !.ne.mnp enddo BARRIER endif if (mnflg.eq.0) then ! get global sum from 1st processor BARRIER call SHMEM_GETR(c,c,nn, idproc1(mnp)) endif ! no barrier needed here because using two buffers (b and c) #endif if (mnflg.eq.0 .or. mnflg.eq.mnproc) then do i= 1,nn a(is0+i) = c(i) enddo endif enddo #if defined(TIMER) ! if (nxc.eq.10) then call xctmr1(10) nxc = 0 endif #endif return end subroutine xcsumr_1 subroutine xcsync(lflush) implicit none ! logical, intent(in) :: lflush ! !********** !* ! 1) barrier, no processor exits until all arrive (and flush stdout). ! ! 2) some MPI implementations only flush stdout as a collective ! operation, and hence the lflush=.true. option to flush stdout. ! ! 3) typically this is just a wrapper to the /* BARRIER */ macro. !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! if (lflush) then #if defined(MPI) && defined(AIX) && ! (defined(AIX_NOFL) || defined(USE_ESMF4) || defined(ESPC_COUPLE) || defined(OCEANS2)) call mp_flush(1) ! flushes stdout, and implies a barrier #else call flush(lp) BARRIER #endif else BARRIER endif return end subroutine xcsync #if defined(SHMEM) && defined(RINGB) subroutine xctbar(ipe1,ipe2) implicit none ! integer, intent(in) :: ipe1,ipe2 ! !********** !* ! 1) sync with processors ipe1 and ipe2. ! ! 2) this is a global collective operation, and the calls on ipe1 ! and ipe2 must list this processor as one of the two targets. ! ! 3) this is used in place of a global barrier in halo operations, ! but it only provides syncronization of one or two processors ! with the local processor. ! ! 4) ipe1 and/or ipe2 can be null_tile, to indicate no processor. !* !********** ! integer cache_line,ilarge parameter (cache_line=32, ilarge=2**30) ! integer, save, dimension(cache_line,-1:ijqr-1) :: ibp ! integer i ! integer icount save icount data icount / -1 / ! icount = mod(icount+1,ilarge) if (icount.eq.0) then call shmem_barrier_all() do i= -1,ijpr-1 ibp(1,i) = -1 enddo call shmem_barrier_all() endif ! ibp(1,-1) = icount if (ipe1.ne.null_tile) then call shmem_integer_put(ibp(1,mnproc-1),icount,1,ipe1) endif if (ipe2.ne.null_tile) then call shmem_integer_put(ibp(1,mnproc-1),icount,1,ipe2) endif call shmem_fence ! i = -1 do while (i.lt.icount) ! this assignment statement must not be optimized away. !dir$ suppress i = min(ibp(1,ipe1),ibp(1,ipe2)) enddo return end subroutine xctbar #endif /* SHMEM && RINGB */ #if defined(ARCTIC) recursive subroutine xctila(a,l1,ld,itype) implicit none ! integer, intent(in) :: l1,ld,itype real, intent(inout) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy,ld) ! !********** !* ! 1) update the top row of a real array. ! only needed when importing a tripole grid array. ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! a real in/out target array ! l1 integer input 3rd dim. start index ! ld integer input 3rd dimension of a ! mh integer input 1st (EW) update halo size ! nh integer input 2nd (NS) update halo size ! itype integer input grid and field type ! ! 3) itype selects both the grid and field type ! 
itype= 1; p-grid, scalar field ! itype=11; p-grid, vector field ! ! 4) this version for a global tripole (arctic bipolar patch) grid !* !********** ! ! halo buffer (in common for enhanced MPI safety). ! #if defined(RELO) real, save, allocatable, dimension(:,:) :: ai #else integer, parameter :: ilen=idm*kdm real, save, dimension(ilen,2) :: ai #endif integer i,io,k,l,lg0,ls0,lm,m real sarc ! #if defined(MPI) include 'mpif.h' integer mpierr ! ! persistent communication handles. ! integer mpireqa(4*iqr),nreqa, & klnold save mpireqa, nreqa, & klnold ! data nreqa,klnold / 0,0 / #endif /* MPI */ ! #if defined(RELO) if (.not.allocated(ai)) then allocate( ai(idm*kdm,2) ) call mem_stat_add( idm*kdm*2 ) endif #endif ! ! --- split large requests into smaller pieces ! if (ld-l1+1.gt.kdm) then do k= l1,ld,kdm l = min(k+kdm-1,ld) call xctila(a,k,l,itype) enddo return endif ! #if defined(TIMER) ! if (nxc.eq.0) then call xctmr0(13) nxc = 13 endif #endif ! #if defined(SHMEM) ! --- not needed if ipr.eq.1, but simpler to include it BARRIER #endif if (nproc.eq.jpr) then !tiles involved in arctic boundary if (itype.lt.10) then sarc = 1.0 else sarc = -1.0 endif ! if (ipr.eq.1) then do k= l1,ld do i= 1,ii io = ii-mod(i-1,ii) if (a(io,jj-1,k).ne.vland) then a(i,jj,k) = sarc*a(io,jj-1,k) else a(i,jj,k) = vland endif enddo !i enddo !k else l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case io = ii+1-i !ii:1:-1 do k= l1,ld l = l + 1 if (a(io,jj-1,k).ne.vland) then ai(l,1) = sarc*a(io,jj-1,k) else ai(l,1) = vland endif enddo !k enddo !i ! write(lp,'(a,5i6)') 'xctila - l1,ld,ii,l,mnproc = ', ! & l1,ld,ii,l,mnproc ! #if defined(MPI) if (klnold.ne.(ld-l1+1)) then !new mpi init needed #if defined(DEBUG_ALL) ! if (mproc.eq.mp_1st) then ! write(lp,'(a,3i6)') 'xctila - l1,ld,itype = ', ! & l1,ld,itype ! endif ! call xcsync(flush_lp) #endif do i= 1,nreqa call mpi_request_free(mpireqa(i), mpierr) enddo klnold = ld-l1+1 ! ! loop through all neigboring tiles. ! l = 0 do m= 1,mm_top l = l + 1 ls0 = i0_st(m)*(ld-l1+1) lm = ii_st(m)*(ld-l1+1) call mpi_send_init( & ai(ls0+1,1),lm,MTYPER, & idproc(m0_top+m,nproc+1), 99053, & mpi_comm_hycom, mpireqa(l), mpierr) enddo do m= 1,mm_top l = l + 1 ls0 = i0_st(m)*(ld-l1+1) lm = ii_st(m)*(ld-l1+1) call mpi_recv_init( & ai(ls0+1,2),lm,MTYPER, & idproc(m0_top+m,nproc+1), 99053, & mpi_comm_hycom, mpireqa(l), mpierr) enddo nreqa = l endif !new mpi_init if (nreqa.gt.0) then call mpi_startall(nreqa, mpireqa, mpierr) call mpi_waitall( nreqa, mpireqa, MPI_STATUSES_IGNORE, & mpierr) endif #elif defined(SHMEM) ! ! loop through all neigboring tiles, barrier issued above. ! do m= 1,mm_top if (idproc(m0_top+m,nproc+1).ne.null_tile) then lg0 = i0_gt(m)*(ld-l1+1) ls0 = i0_st(m)*(ld-l1+1) lm = ii_st(m)*(ld-l1+1) if (nproc.eq.jpr) then call SHMEM_GETR(ai(ls0+1,2), & ai(lg0+1,1),lm, & !buffer 1 idproc(m0_top+m,nproc+1)) endif endif enddo #endif /* MPI:SHMEM */ ! l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case do k= l1,ld l = l + 1 a(i,jj,k) = ai(l,2) enddo enddo endif !ipr.eq.1:else endif !nproc.eq.jpr ! #if defined(TIMER) if (nxc.eq.13) then call xctmr1(13) nxc = 0 endif #endif return end subroutine xctila recursive subroutine xctilr(a,l1,ld,mh,nh,itype) implicit none ! integer, intent(in) :: l1,ld,mh,nh,itype real, intent(inout) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy,ld) ! !********** !* ! 1) update the tile overlap halo of a real array. ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! a real in/out target array ! 
l1 integer input 3rd dim. start index ! ld integer input 3rd dimension of a ! mh integer input 1st (EW) update halo size ! nh integer input 2nd (NS) update halo size ! itype integer input grid and field type ! ! 3) itype selects both the grid and field type ! itype= 1; p-grid, scalar field ! itype= 2; q-grid, scalar field ! itype= 3; u-grid, scalar field ! itype= 4; v-grid, scalar field ! itype=11; p-grid, vector field ! itype=12; q-grid, vector field ! itype=13; u-grid, vector field ! itype=14; v-grid, vector field ! ! 4) the global variable vland is returned by halos over land. ! ! 5) this version for a global tripole (arctic bipolar patch) grid !* !********** ! ! halo buffer ! #if defined(RELO) real, save, allocatable, dimension(:,:) :: ai,aj,aia #else integer, parameter :: ilen= idm *kdm*nbdy+64, & jlen=(jdm+2*nbdy)*kdm*nbdy+64 real, save, dimension (ilen,4) :: ai real, save, dimension (jlen,4) :: aj real, save, dimension (kdm*nbdy+64,2) :: aia #endif ! integer i,io,itynew,j,k,l,lg0,ls0,lm,m,mhl,nhl real sarc ! #if defined(MPI) include 'mpif.h' integer mpierr ! ! persistent communication handles. ! integer mpireqa(4*iqr),mpireqb(4),nreqa, & klmold,klnold,mhlold,nhlold,ityold save mpireqa,mpireqb,nreqa, & klmold,klnold,mhlold,nhlold,ityold ! data nreqa,klmold,klnold,mhlold,nhlold,ityold / 0,0,0,0,0,0 / #endif /* MPI */ #if defined(RELO) ! if (.not.allocated(ai)) then allocate( ai( idm *kdm*nbdy+64 ,4) ) call mem_stat_add( ( idm *kdm*nbdy+64)*4 ) allocate( aj((jdm+2*nbdy)*kdm*nbdy+64 ,4) ) call mem_stat_add( ((jdm+2*nbdy)*kdm*nbdy+64)*4 ) allocate( aia( kdm*nbdy+64, 2) ) call mem_stat_add( (kdm*nbdy+64)*2 ) endif #endif ! ! --- split large requests into smaller pieces ! if (ld-l1+1.gt.kdm) then do k= l1,ld,kdm l = min(k+kdm-1,ld) call xctilr(a,k,l,mh,nh,itype) enddo return endif ! #if defined(TIMER) ! if (nxc.eq.0) then call xctmr0(12) nxc = 12 endif #endif ! mhl = max(0,min(mh,nbdy)) nhl = max(0,min(nh,nbdy)) ! if (itype.lt.10) then sarc = 1.0 else sarc = -1.0 endif ! if (nhl.gt.0) then if (ipr.eq.1 .and. jpr.eq.1) then do k= l1,ld do j= 1,nhl do i= 1,ii a(i, 1-j,k) = vland enddo if (itype.eq.1 .or. itype.eq.11) then do i= 1,ii io = ii-mod(i-1,ii) if (a(io,jj-1-j,k).ne.vland) then a(i,jj+j,k) = sarc*a(io,jj-1-j,k) else a(i,jj+j,k) = vland endif enddo !i elseif (itype.eq.2 .or. itype.eq.12) then do i= 1,ii io = mod(ii-(i-1),ii)+1 if (a(io,jj-j,k).ne.vland) then a(i,jj+j,k) = sarc*a(io,jj-j,k) else a(i,jj+j,k) = vland endif enddo !i elseif (itype.eq.3 .or. itype.eq.13) then do i= 1,ii io = mod(ii-(i-1),ii)+1 if (a(io,jj-1-j,k).ne.vland) then a(i,jj+j,k) = sarc*a(io,jj-1-j,k) else a(i,jj+j,k) = vland endif enddo !i elseif (itype.eq.4 .or. itype.eq.14) then do i= 1,ii io = ii-mod(i-1,ii) if (a(io,jj-j,k).ne.vland) then a(i,jj+j,k) = sarc*a(io,jj-j,k) else a(i,jj+j,k) = vland endif enddo !i endif !itype enddo !j enddo !k else if (nproc.ne.jpr) then l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case do k= l1,ld do j= 1,nhl l = l + 1 ai(l,1) = a(i,jj+1-j,k) ai(l,2) = a(i, j,k) ai(l,3) = vland ai(l,4) = vland enddo !j enddo !k enddo !i elseif (itype.eq.1 .or. itype.eq.11) then !p-grid l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case io = ii+1-i !ii:1:-1 do k= l1,ld do j= 1,nhl l = l + 1 if (a(io,jj-1-j,k).ne.vland) then ai(l,1) = sarc*a(io,jj-1-j,k) else ai(l,1) = vland endif ai(l,2) = a(i,j,k) ai(l,3) = vland ai(l,4) = vland enddo !j enddo !k enddo !i elseif (itype.eq.3 .or. itype.eq.13) then !u-grid l = 0 do i= 1,ii ! 
outer loop to simplify multiple neighbor case io = ii+2-i !ii+1:2:-1 do k= l1,ld do j= 1,nhl l = l + 1 if (a(io,jj-1-j,k).ne.vland) then ai(l,1) = sarc*a(io,jj-1-j,k) else ai(l,1) = vland endif ai(l,2) = a(i,j,k) ai(l,3) = vland ai(l,4) = vland enddo !j enddo !k enddo !i l = 0 do k= l1,ld do j= 1,nhl l = l + 1 if (a(1,jj-1-j,k).ne.vland) then aia(l,1) = sarc*a(1,jj-1-j,k) else aia(l,1) = vland endif aia(l,2) = vland enddo !j enddo !k elseif (itype.eq.2 .or. itype.eq.12) then !q-grid l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case io = ii+2-i !ii+1:2:-1 do k= l1,ld do j= 1,nhl l = l + 1 if (a(io,jj-j,k).ne.vland) then ai(l,1) = sarc*a(io,jj-j,k) else ai(l,1) = vland endif ai(l,2) = a(i,j,k) ai(l,3) = vland ai(l,4) = vland enddo !j enddo !k enddo !i l = 0 do k= l1,ld do j= 1,nhl l = l + 1 if (a(1,jj-j,k).ne.vland) then aia(l,1) = sarc*a(1,jj-j,k) else aia(l,1) = vland endif aia(l,2) = vland enddo !j enddo !k else !v-grid l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case io = ii+1-i !ii:1:-1 do k= l1,ld do j= 1,nhl l = l + 1 if (a(io,jj-j,k).ne.vland) then ai(l,1) = sarc*a(io,jj-j,k) else ai(l,1) = vland endif ai(l,2) = a(i,j,k) ai(l,3) = vland ai(l,4) = vland enddo !j enddo !k enddo !i endif !itype ! write(lp,'(a,6i6)') 'xctilr - nhl,l1,ld,ii,l,mnproc = ', ! & nhl,l1,ld,ii,l,mnproc ! call xcsync(flush_lp) ! #if defined(MPI) if (itype.eq.2 .or. itype.eq.12 .or. & !q-grid itype.eq.3 .or. itype.eq.13 ) then !u-grid itynew = 2 else itynew = 1 endif if (klnold.ne.(ld-l1+1) .or. & nhlold.ne.nhl .or. & ityold.ne.itynew ) then !new mpi init needed #if defined(DEBUG_ALL) ! if (mnproc.eq.1) then ! write(lp,'(a,4i6)') 'xctilr - nhl,l1,ld,itype = ', ! & nhl,l1,ld,itype ! endif ! call xcsync(flush_lp) #endif do i= 1,nreqa call mpi_request_free(mpireqa(i), mpierr) enddo ! ! loop through all neigboring tiles. ! l = 0 do m= 1,mm_top l = l + 1 ls0 = i0_st(m)*nhl*(ld-l1+1) lm = ii_st(m)*nhl*(ld-l1+1) if (nproc.ne.jpr) then call mpi_send_init( & ai(ls0+1,1),lm,MTYPER, & idproc(m0_top+m,nproc+1), 9905, & mpi_comm_hycom, mpireqa(l), mpierr) else !arctic call mpi_send_init( & ai(ls0+1,1),lm,MTYPER, & idproc(m0_top+m,nproc+1), 99051, & mpi_comm_hycom, mpireqa(l), mpierr) endif enddo if (nproc.eq.jpr) then !arctic if (itype.eq.2 .or. itype.eq.12 .or. & !q-grid itype.eq.3 .or. itype.eq.13 ) then !u-grid l = l + 1 lm = nhl*(ld-l1+1) call mpi_send_init( & aia(1,1),lm,MTYPER, & idproc(mod(ipr+1-mproc,ipr)+1,nproc), 99052, & mpi_comm_hycom, mpireqa(l), mpierr) endif !q-grid,u-grid endif do m= 1,mm_bot l = l + 1 ls0 = i0_sb(m)*nhl*(ld-l1+1) lm = ii_sb(m)*nhl*(ld-l1+1) call mpi_send_init( & ai(ls0+1,2),lm,MTYPER,idproc(m0_bot+m,nproc-1), 9906, & mpi_comm_hycom, mpireqa(l), mpierr) enddo do m= 1,mm_top l = l + 1 ls0 = i0_st(m)*nhl*(ld-l1+1) lm = ii_st(m)*nhl*(ld-l1+1) if (nproc.ne.jpr) then call mpi_recv_init( & ai(ls0+1,4),lm,MTYPER, & idproc(m0_top+m,nproc+1), 9906, & mpi_comm_hycom, mpireqa(l), mpierr) else !arctic call mpi_recv_init( & ai(ls0+1,4),lm,MTYPER, & idproc(m0_top+m,nproc+1), 99051, & mpi_comm_hycom, mpireqa(l), mpierr) endif enddo if (nproc.eq.jpr) then !arctic if (itype.eq.2 .or. itype.eq.12 .or. & !q-grid itype.eq.3 .or. 
itype.eq.13 ) then !u-grid l = l + 1 lm = nhl*(ld-l1+1) call mpi_recv_init( & aia(1,2),lm,MTYPER, & idproc(mod(ipr+1-mproc,ipr)+1,nproc), 99052, & mpi_comm_hycom, mpireqa(l), mpierr) endif !q-grid,u-grid endif do m= 1,mm_bot l = l + 1 ls0 = i0_sb(m)*nhl*(ld-l1+1) lm = ii_sb(m)*nhl*(ld-l1+1) call mpi_recv_init( & ai(ls0+1,3),lm,MTYPER,idproc(m0_bot+m,nproc-1), 9905, & mpi_comm_hycom, mpireqa(l), mpierr) enddo nreqa = l endif if (nreqa.gt.0) then call mpi_startall(nreqa, mpireqa, mpierr) call mpi_waitall( nreqa, mpireqa, MPI_STATUSES_IGNORE, & mpierr) endif #elif defined(SHMEM) BARRIER ! ! loop through all neigboring tiles. ! do m= 1,mm_top if (idproc(m0_top+m,nproc+1).ne.null_tile) then lg0 = i0_gt(m)*nhl*(ld-l1+1) ls0 = i0_st(m)*nhl*(ld-l1+1) lm = ii_st(m)*nhl*(ld-l1+1) if (nproc.ne.jpr) then call SHMEM_GETR(ai(ls0+1,4), & ai(lg0+1,2),lm, & idproc(m0_top+m,nproc+1)) else !arctic call SHMEM_GETR(ai(ls0+1,4), & ai(lg0+1,1),lm, & !buffer 1 idproc(m0_top+m,nproc+1)) endif endif enddo if (nproc.eq.jpr) then !arctic if (itype.eq.2 .or. itype.eq.12 .or. & !q-grid itype.eq.3 .or. itype.eq.13 ) then !u-grid lm = nhl*(ld-l1+1) call SHMEM_GETR(aia(1,2), & aia(1,1),lm, & idproc(mod(ipr+1-mproc)+1,nproc)) endif !q-grid,u-grid endif ! do m= 1,mm_bot if (idproc(m0_bot+m,nproc-1).ne.null_tile) then lg0 = i0_gb(m)*nhl*(ld-l1+1) ls0 = i0_sb(m)*nhl*(ld-l1+1) lm = ii_sb(m)*nhl*(ld-l1+1) call SHMEM_GETR(ai(ls0+1,3), & ai(lg0+1,1),lm, idproc(m0_bot+m,nproc-1)) endif enddo #endif /* MPI:SHMEM */ ! if (nproc.eq.jpr) then !arctic if (itype.eq.2 .or. itype.eq.12 .or. & !q-grid itype.eq.3 .or. itype.eq.13 ) then !u-grid l = 0 do k= l1,ld do j= 1,nhl l = l + 1 ai(l,4) = aia(l,2) enddo !j enddo !k endif !q-grid,u-grid endif !arctic l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case do k= l1,ld do j= 1,nhl l = l + 1 a(i, 1-j,k) = ai(l,3) a(i,jj+j,k) = ai(l,4) enddo enddo enddo endif ! jpr.eq.1:else endif ! nhl.gt.0 ! if (mhl.gt.0) then if (ipr.eq.1) then if (nreg.eq.0) then do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl a( 1-i,j,k) = vland a(ii+i,j,k) = vland enddo enddo enddo else do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl a( 1-i,j,k) = a(ii+1-i,j,k) a(ii+i,j,k) = a( i,j,k) enddo enddo enddo endif else l = 0 do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl l = l + 1 aj(l,1) = a(ii+1-i,j,k) aj(l,2) = a( i,j,k) aj(l,3) = vland aj(l,4) = vland enddo enddo enddo ! write(lp,'(a,6i6)') 'xctilr - mhl,l1,ld,jj,l,mnproc = ', ! & mhl,l1,ld,jj,l,mnproc ! call xcsync(flush_lp) #if defined(MPISR) call mpi_sendrecv( & aj(1,1),l,MTYPER,idhalo(2), 9907, & aj(1,4),l,MTYPER,idhalo(2), 9908, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) call mpi_sendrecv( & aj(1,2),l,MTYPER,idhalo(1), 9908, & aj(1,3),l,MTYPER,idhalo(1), 9907, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) #elif defined(MPI) if (klmold.ne.(ld-l1+1) .or. & mhlold.ne.mhl .or. & nhlold.ne.nhl ) then !new mpi init #if defined(DEBUG_ALL) ! if (mnproc.eq.1) then ! write(lp,'(a,3i6)') 'xctilr - mhl,l1,ld = ', ! & mhl,l1,ld ! endif ! 
call xcsync(flush_lp) #endif if (klmold.ne.0) then call mpi_request_free(mpireqb(1), mpierr) call mpi_request_free(mpireqb(2), mpierr) call mpi_request_free(mpireqb(3), mpierr) call mpi_request_free(mpireqb(4), mpierr) endif call mpi_send_init( & aj(1,1),l,MTYPER,idhalo(2), 9907, & mpi_comm_hycom, mpireqb(1), mpierr) call mpi_send_init( & aj(1,2),l,MTYPER,idhalo(1), 9908, & mpi_comm_hycom, mpireqb(2), mpierr) call mpi_recv_init( & aj(1,3),l,MTYPER,idhalo(1), 9907, & mpi_comm_hycom, mpireqb(3), mpierr) call mpi_recv_init( & aj(1,4),l,MTYPER,idhalo(2), 9908, & mpi_comm_hycom, mpireqb(4), mpierr) endif call mpi_startall(4, mpireqb, mpierr) call mpi_waitall( 4, mpireqb, MPI_STATUSES_IGNORE, mpierr) #elif defined(SHMEM) BARRIER_MP if (idhalo(1).ne.null_tile) then call SHMEM_GETR(aj(1,3), & aj(1,1),l,idhalo(1)) endif if (idhalo(2).ne.null_tile) then call SHMEM_GETR(aj(1,4), & aj(1,2),l,idhalo(2)) endif BARRIER_MP #endif l = 0 do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl l = l + 1 a( 1-i,j,k) = aj(l,3) a(ii+i,j,k) = aj(l,4) enddo enddo enddo endif ! ipr.eq.1:else endif ! mhl.gt.0 ! klnold = ld-l1+1 klmold = ld-l1+1 nhlold = nhl mhlold = mhl ityold = itynew #if defined(TIMER) ! if (nxc.eq.12) then call xctmr1(12) nxc = 0 endif #endif return end subroutine xctilr #else /* !ARCTIC */ recursive subroutine xctilr(a,l1,ld,mh,nh,itype) implicit none ! integer, intent(in) :: l1,ld,mh,nh,itype real, intent(inout) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy,ld) ! !********** !* ! 1) update the tile overlap halo of a real array. ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! a real in/out target array ! l1 integer input 3rd dim. start index ! ld integer input 3rd dimension of a ! mh integer input 1st (EW) update halo size ! nh integer input 2nd (NS) update halo size ! itype integer input grid and field type ! ! 3) itype selects both the grid and field type ! itype= 1; p-grid, scalar field ! itype= 2; q-grid, scalar field ! itype= 3; u-grid, scalar field ! itype= 4; v-grid, scalar field ! itype=11; p-grid, vector field ! itype=12; q-grid, vector field ! itype=13; u-grid, vector field ! itype=14; v-grid, vector field ! it is ignored here because all types are the same unless ! the grid includes the arctic ocean ! ! 4) the global variable vland is returned by halos over land. !* !********** ! ! halo buffer. ! #if defined(RELO) real, save, allocatable, dimension (:,:) :: ai,aj #else integer, parameter :: ilen= idm *kdm*nbdy+64, & jlen=(jdm+2*nbdy)*kdm*nbdy+64 real, save, dimension(ilen,4) :: ai real, save, dimension(jlen,4) :: aj #endif ! integer i,j,k,l,lg0,ls0,lm,m,mhl,nhl ! #if defined(MPI) include 'mpif.h' integer mpierr ! ! persistent communication handles. ! integer mpireqa(4*iqr),mpireqb(4), & ilold,jlold,nreqa save mpireqa,mpireqb, & ilold,jlold,nreqa ! data ilold,jlold / 0,0 / #endif /* MPI */ #if defined(RELO) ! if (.not.allocated(ai)) then allocate( ai( idm *kdm*nbdy+64 ,4) ) call mem_stat_add( ( idm *kdm*nbdy+64)*4 ) allocate( aj((jdm+2*nbdy)*kdm*nbdy+64 ,4) ) call mem_stat_add( ((jdm+2*nbdy)*kdm*nbdy+64)*4 ) endif #endif ! ! --- split large requests into smaller pieces ! if (ld-l1+1.gt.kdm) then do k= l1,ld,kdm l = min(k+kdm-1,ld) call xctilr(a,k,l,mh,nh,itype) enddo return endif ! #if defined(TIMER) ! if (nxc.eq.0) then call xctmr0(12) nxc = 12 endif #endif ! mhl = max(0,min(mh,nbdy)) nhl = max(0,min(nh,nbdy)) ! 
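!
! --- the halo is updated in two phases: north/south strips of width
! --- nhl first, then east/west strips of width mhl over rows
! --- 1-nhl:jj+nhl, so corner halo points are filled via the east/west
! --- neighbors from rows they updated in the first phase.
!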
if (nhl.gt.0) then if (jpr.eq.1) then do k= l1,ld do j= 1,nhl do i= 1,ii a(i, 1-j,k) = vland a(i,jj+j,k) = vland enddo enddo enddo else l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case do k= l1,ld do j= 1,nhl l = l + 1 ai(l,1) = a(i,jj+1-j,k) ai(l,2) = a(i, j,k) ai(l,3) = vland ai(l,4) = vland enddo enddo enddo ! write(lp,'(a,6i6)') 'xctilr - nhl,l1,ld,ii,l,mnproc = ', ! & nhl,l1,ld,ii,l,mnproc ! call xcsync(flush_lp) ! #if defined(MPI) if (jlold.ne.l) then if (jlold.ne.0) then do i= 1,nreqa call mpi_request_free(mpireqa(i), mpierr) enddo endif jlold = l ! ! loop through all neigboring tiles. ! l = 0 do m= 1,mm_top l = l + 1 ls0 = i0_st(m)*nhl*(ld-l1+1) lm = ii_st(m)*nhl*(ld-l1+1) call mpi_send_init( & ai(ls0+1,1),lm,MTYPER,idproc(m0_top+m,nproc+1), 9905, & mpi_comm_hycom, mpireqa(l), mpierr) enddo do m= 1,mm_bot l = l + 1 ls0 = i0_sb(m)*nhl*(ld-l1+1) lm = ii_sb(m)*nhl*(ld-l1+1) call mpi_send_init( & ai(ls0+1,2),lm,MTYPER,idproc(m0_bot+m,nproc-1), 9906, & mpi_comm_hycom, mpireqa(l), mpierr) enddo do m= 1,mm_top l = l + 1 ls0 = i0_st(m)*nhl*(ld-l1+1) lm = ii_st(m)*nhl*(ld-l1+1) call mpi_recv_init( & ai(ls0+1,4),lm,MTYPER,idproc(m0_top+m,nproc+1), 9906, & mpi_comm_hycom, mpireqa(l), mpierr) enddo do m= 1,mm_bot l = l + 1 ls0 = i0_sb(m)*nhl*(ld-l1+1) lm = ii_sb(m)*nhl*(ld-l1+1) call mpi_recv_init( & ai(ls0+1,3),lm,MTYPER,idproc(m0_bot+m,nproc-1), 9905, & mpi_comm_hycom, mpireqa(l), mpierr) enddo nreqa = l endif if (nreqa.gt.0) then call mpi_startall(nreqa, mpireqa, mpierr) call mpi_waitall( nreqa, mpireqa, MPI_STATUSES_IGNORE, & mpierr) endif #elif defined(SHMEM) BARRIER ! ! loop through all neigboring tiles. ! do m= 1,mm_top if (idproc(m0_top+m,nproc+1).ne.null_tile) then lg0 = i0_gt(m)*nhl*(ld-l1+1) ls0 = i0_st(m)*nhl*(ld-l1+1) lm = ii_st(m)*nhl*(ld-l1+1) call SHMEM_GETR(ai(ls0+1,4), & ai(lg0+1,2),lm, idproc(m0_top+m,nproc+1)) endif enddo ! do m= 1,mm_bot if (idproc(m0_bot+m,nproc-1).ne.null_tile) then lg0 = i0_gb(m)*nhl*(ld-l1+1) ls0 = i0_sb(m)*nhl*(ld-l1+1) lm = ii_sb(m)*nhl*(ld-l1+1) call SHMEM_GETR(ai(ls0+1,3), & ai(lg0+1,1),lm, idproc(m0_bot+m,nproc-1)) endif enddo #endif /* MPI:SHMEM */ ! l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case do k= l1,ld do j= 1,nhl l = l + 1 a(i, 1-j,k) = ai(l,3) a(i,jj+j,k) = ai(l,4) enddo enddo enddo endif ! jpr.eq.1:else endif ! nhl.gt.0 ! if (mhl.gt.0) then if (ipr.eq.1) then if (nreg.eq.0) then do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl a( 1-i,j,k) = vland a(ii+i,j,k) = vland enddo enddo enddo else do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl a( 1-i,j,k) = a(ii+1-i,j,k) a(ii+i,j,k) = a( i,j,k) enddo enddo enddo endif else l = 0 do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl l = l + 1 aj(l,1) = a(ii+1-i,j,k) aj(l,2) = a( i,j,k) aj(l,3) = vland aj(l,4) = vland enddo enddo enddo ! write(lp,'(a,6i6)') 'xctilr - mhl,l1,ld,jj,l,mnproc = ', ! & mhl,l1,ld,jj,l,mnproc ! 
call xcsync(flush_lp) #if defined(MPISR) call mpi_sendrecv( & aj(1,1),l,MTYPER,idhalo(2), 9907, & aj(1,4),l,MTYPER,idhalo(2), 9908, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) call mpi_sendrecv( & aj(1,2),l,MTYPER,idhalo(1), 9908, & aj(1,3),l,MTYPER,idhalo(1), 9907, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) #elif defined(MPI) if (ilold.ne.l) then if (ilold.ne.0) then call mpi_request_free(mpireqb(1), mpierr) call mpi_request_free(mpireqb(2), mpierr) call mpi_request_free(mpireqb(3), mpierr) call mpi_request_free(mpireqb(4), mpierr) endif ilold = l call mpi_send_init( & aj(1,1),l,MTYPER,idhalo(2), 9907, & mpi_comm_hycom, mpireqb(1), mpierr) call mpi_send_init( & aj(1,2),l,MTYPER,idhalo(1), 9908, & mpi_comm_hycom, mpireqb(2), mpierr) call mpi_recv_init( & aj(1,3),l,MTYPER,idhalo(1), 9907, & mpi_comm_hycom, mpireqb(3), mpierr) call mpi_recv_init( & aj(1,4),l,MTYPER,idhalo(2), 9908, & mpi_comm_hycom, mpireqb(4), mpierr) endif call mpi_startall(4, mpireqb, mpierr) call mpi_waitall( 4, mpireqb, MPI_STATUSES_IGNORE, mpierr) #elif defined(SHMEM) BARRIER_MP if (idhalo(1).ne.null_tile) then call SHMEM_GETR(aj(1,3), & aj(1,1),l,idhalo(1)) endif if (idhalo(2).ne.null_tile) then call SHMEM_GETR(aj(1,4), & aj(1,2),l,idhalo(2)) endif BARRIER_MP #endif l = 0 do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl l = l + 1 a( 1-i,j,k) = aj(l,3) a(ii+i,j,k) = aj(l,4) enddo enddo enddo endif ! ipr.eq.1:else endif ! mhl.gt.0 #if defined(TIMER) ! if (nxc.eq.12) then call xctmr1(12) nxc = 0 endif #endif return end subroutine xctilr #endif /* ARCTIC:else */ subroutine xctmri implicit none ! !********** !* ! 1) initialize timers. ! ! 2) timers 1:32 are for message passing routines, ! timers 33:80 are for general hycom routines, ! timers 81:96 are for user selected routines. ! timer 97 is the total time. ! ! 3) call xctmri to initialize timers (called in xcspmd), ! call xctmr0(n) to start timer n, ! call xctmr1(n) to stop timer n and add event to timer sum, ! call xctnrn(n,cname) to register a name for timer n, ! call xctnrs(n,ncan,ncin) to set nca(n) and nci(n), ! call xctmrp to printout timer statistics (called by xcstop). ! ! 4) time every nci(n)-th event above nca(n), default 50 and 5,000. !* !********** ! integer i ! real*8 zero8 parameter (zero8=0.0) ! nxc = 0 do i= 1,97 cc( i) = ' ' nc( i) = 0 nca(i) = 5000 !default nci(i) = 50 !default tc( i) = zero8 enddo ! call xctmrn(97,'total ') call xctmr0(97) return end subroutine xctmri subroutine xctmr0(n) implicit none ! integer, intent(in) :: n ! !********** !* ! 1) start timer n. ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! n integer input timer number ! ! 3) time every nci(n)-th event above nca(n), default 50 and 5,000. !* !********** ! real*8 wtime ! #if defined(DEBUG_TIMER_ALL) if ( cc(n).ne.' ') then write(lp,'(i5,2x,a,a)') mnproc,'call ',cc(n) call flush(lp) endif #endif #if defined(DEBUG_TIMER) if (n.gt.32 .and. cc(n).ne.' ') then if (mnproc.eq.1) then write(lp,*) 'call ',cc(n) call flush(lp) endif endif #endif if (timer_on) then if (mod(nc(n),nci(n)).eq.0 .or. nc(n).le.nca(n)) then t0(n) = wtime() endif endif !timer_on return end subroutine xctmr0 subroutine xctmr1(n) implicit none ! integer, intent(in) :: n ! !********** !* ! 1) add time since call to xctim0 to timer n. ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! n integer input timer number ! ! 3) time every nci(n)-th event above nca(n), default 50 and 5,000. !* !********** ! 
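!
! --- sampling: once nc(n) exceeds nca(n) only every nci(n)-th call is
! --- actually timed, with the measured interval weighted by nci(n);
! --- xctmrp applies a final nc/(nc-mod(nc,nci)) correction when
! --- printing, so reported times approximate the total over all calls
! --- at greatly reduced timing overhead.
!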
real*8 wtime ! if (timer_on) then if (nc(n).gt.nca(n)) then if (mod(nc(n),nci(n)).eq.0) then tc(n) = tc(n) + nci(n)*(wtime() - t0(n)) endif else tc(n) = tc(n) + (wtime() - t0(n)) endif nc(n) = nc(n) + 1 endif !timer_on #if defined(DEBUG_TIMER_ALL) if ( cc(n).ne.' ') then write(lp,'(i5,2x,a,a)') mnproc,'exit ',cc(n) call flush(lp) endif #endif #if defined(DEBUG_TIMER) if (n.gt.32 .and. cc(n).ne.' ') then if (mnproc.eq.1) then write(lp,*) 'exit ',cc(n) call flush(lp) endif endif #endif return end subroutine xctmr1 subroutine xctmrn(n,cname) implicit none ! character*6, intent(in) :: cname integer, intent(in) :: n ! !********** !* ! 1) register name of timer n. ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! n integer input timer number ! cname char*(6) input timer name !* !********** ! cc(n) = cname return end subroutine xctmrn subroutine xctmrs(n,ncan,ncin) implicit none ! integer, intent(in) :: n,ncan,ncin ! !********** !* ! 1) set nca(n) and nci(n) ! ! 2) parameters: ! name type usage description ! ---------- ---------- ------- ---------------------------- ! n integer input timer number ! ncan integer input value for nca(n) ! ncin integer input value for nci(n) ! ! 3) ncan must be a multiple of ncin !* !********** ! if (mod(ncan,ncin).ne.0) then if (mnproc.eq.1) then write(lp,'(a,i3,2i8)') 'n,ncan,ncin =',n,ncan,ncin call flush(lp) endif call xcstop('Error in xctmrs - ncan not a multiple of ncin') stop '(xcspmd)' endif ! nca(n) = ncan nci(n) = ncin return end subroutine xctmrs #if defined(TIMER_ALLOUT) subroutine xctmrp implicit none ! !********** !* ! 1) print all active timers, on all processors. ! ! 2) on exit all timers are reset to zero. !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! real*8 tci integer i,mn character*16 cmnproc ! real*8 zero8 parameter (zero8=0.0) ! ! get total time. ! call xctmr1(97) ! call xcsync(flush_lp) !includes a barrier for shmem if (mnproc.ne.1) then #if defined(MPI) call MPI_SEND(tc,97,MTYPED, & idproc1(1), 9949, & mpi_comm_hycom, mpierr) #endif else !mnproc.eq.1 do mn= 1,ijpr if (mn.ne.1) then #if defined(MPI) call MPI_RECV(tc,97,MTYPED, & idproc1(mn), 9949, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) #elif defined(SHMEM) call SHMEM_GETD(tc,tc,97,idproc1(mn)) #endif endif write(lp,6000) mn,ijpr tcxc(1) = zero8 do i= 1,32 if (nc(i).ne.0) then if (nc(i).le.nca(i)) then tci = tc(i) else !correct for timing every nci(i)-th event tci = (tc(i)*nc(i))/(nc(i)-mod(nc(i),nci(i))) endif if (cc(i).ne.' ') then write(lp,6100) cc(i),nc(i),tci,tci/nc(i) else write(lp,6150) i, nc(i),tci,tci/nc(i) endif if (cc(i)(1:2).eq.'xc') then tcxc(1) = tcxc(1) + tci !communication overhead endif endif !nc(i)>0 enddo !i write(lp,6100) 'xc****',1,tcxc(1),tcxc(1) do i= 33,97 if (nc(i).ne.0) then if (nc(i).le.nca(i)) then tci = tc(i) else !correct for timing every nci(i)-th event tci = (tc(i)*nc(i))/(nc(i)-mod(nc(i),nci(i))) endif if (cc(i).ne.' ') then write(lp,6100) cc(i),nc(i),tci,tci/nc(i) else write(lp,6150) i, nc(i),tci,tci/nc(i) endif endif enddo !i enddo !mn call mem_stat_print('processor 1:') endif !mnproc.ne.1:else call xcsync(flush_lp) !includes a barrier for shmem if (nproc.eq.2 .and. mproc.eq.mpe_1(nproc)) then write(cmnproc,'(a,i6,a)') 'processor',mnproc,':' call mem_stat_print(cmnproc) endif call xcsync(flush_lp) if (mnproc.eq.ijpr) then write(cmnproc,'(a,i6,a)') 'processor',mnproc,':' call mem_stat_print(cmnproc) write(lp,*) endif call xcsync(flush_lp) ! ! reset timers to zero. ! 
do i= 1,97 nc(i) = 0 tc(i) = zero8 enddo tcxc(1) = zero8 ! ! start a new total time measurement. ! call xctmr0(97) return ! 6000 format(/ / & 3x,' timer statistics, processor',i5,' out of',i5 / & 3x,'-----------------------------------------------' /) 6100 format(3x,a6, & ' calls =',i9, & ' time =',f11.5, & ' time/call =',f14.8) 6150 format(3x,' #',i2, & ' calls =',i9, & ' time =',f11.5, & ' time/call =',f14.8) end subroutine xctmrp #else subroutine xctmrp implicit none ! !********** !* ! 1) print all active timers. ! ! 2) on exit all timers are reset to zero. !* !********** ! #if defined(MPI) include 'mpif.h' integer mpierr #endif ! real*8 tci integer i,mn,mnloc character*16 cmnproc ! real*8 zero8 parameter (zero8=0.0) ! ! get total time. ! call xctmr1(97) ! ! report time on the processor with the least communication overhead ! tcxc(2) = mnproc tcxc(1) = zero8 do i= 1,32 if (nc(i).ne.0 .and. cc(i)(1:2).eq.'xc') then tcxc(1) = tcxc(1) + tc(i) !communication overhead endif enddo !i ! if (ijpr.ne.1) then #if defined(MPI) tcxl(1) = tcxc(1) tcxl(2) = tcxc(2) call mpi_allreduce(tcxl,tcxc,1, & mpi_2double_precision,mpi_minloc, & mpi_comm_hycom,mpierr) mnloc = tcxc(2) !processor with the least comm. overhead if (mnproc.eq.1) then if (mnloc.ne.1) then call MPI_RECV(tc,97,MTYPED, & idproc1(mnloc), 9949, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) endif elseif (mnproc.eq.mnloc) then call MPI_SEND(tc,97,MTYPED, & idproc1(1), 9949, & mpi_comm_hycom, mpierr) endif #elif defined(SHMEM) BARRIER if (mnproc.eq.1) then mnloc = 1 do mn= 2,ijpr call SHMEM_GETD(tcxc(2),tcxc(1),1,idproc1(mn)) if (tcxc(2).gt.tcxc(1)) then tcxc(1) = tcxc(2) mnloc = mn endif enddo tcxc(2) = mnloc !processor with the least comm. overhead endif if (mnloc.ne.1) then call SHMEM_GETD(tc,tc,97,idproc1(mnloc)) endif BARRIER #endif endif ! call xcsync(flush_lp) if (mnproc.eq.1) then write(lp,6000) mnloc,ijpr do i= 1,32 if (nc(i).ne.0) then if (nc(i).le.nca(i)) then tci = tc(i) else !correct for timing every nci(i)-th event tci = (tc(i)*nc(i))/(nc(i)-mod(nc(i),nci(i))) endif if (cc(i).ne.' ') then write(lp,6100) cc(i),nc(i),tci,tci/nc(i) else write(lp,6150) i, nc(i),tci,tci/nc(i) endif if (cc(i)(1:2).eq.'xc') then tcxc(1) = tcxc(1) + tci !communication overhead endif endif !nc(i)>0 enddo !i write(lp,6100) 'xc****',1,tcxc(1),tcxc(1) do i= 33,97 if (nc(i).ne.0) then if (nc(i).le.nca(i)) then tci = tc(i) else !correct for timing every nci(i)-th event tci = (tc(i)*nc(i))/(nc(i)-mod(nc(i),nci(i))) endif if (cc(i).ne.' ') then write(lp,6100) cc(i),nc(i),tci,tci/nc(i) else write(lp,6150) i, nc(i),tci,tci/nc(i) endif endif enddo !i call mem_stat_print('processor 1:') endif !mnproc.eq.1 call xcsync(flush_lp) if (nproc.eq.2 .and. mproc.eq.mpe_1(nproc)) then write(cmnproc,'(a,i6,a)') 'processor',mnproc,':' call mem_stat_print(cmnproc) endif call xcsync(flush_lp) if (mnproc.eq.ijpr) then write(cmnproc,'(a,i6,a)') 'processor',mnproc,':' call mem_stat_print(cmnproc) write(lp,*) endif call xcsync(flush_lp) ! ! reset timers to zero. ! do i= 1,97 nc(i) = 0 tc(i) = zero8 enddo tcxc(1) = zero8 ! ! start a new total time measurement. ! call xctmr0(97) return ! 6000 format(/ / & 3x,' timer statistics, processor',i5,' out of',i5 / & 3x,'-----------------------------------------------' /) 6100 format(3x,a6, & ' calls =',i9, & ' time =',f11.5, & ' time/call =',f14.8) 6150 format(3x,' #',i2, & ' calls =',i9, & ' time =',f11.5, & ' time/call =',f14.8) end subroutine xctmrp #endif /* TIMER_ALLOUT:else */ ! ! Revision history: ! !> Nov. 
2009 - iisum bugfix for periodic domains
!> Nov. 2011 - time every 50-th event above 5,000 (was 1,000).
!> Mar. 2012 - added optional mnflg to xclput
!> Mar. 2012 - bugfix to periodic case in xclput
!> Apr. 2012 - added optional mnflg to xceget and xceput
!> Apr. 2012 - added xciget and xciput
!> Nov. 2012 - added the /* OCEANS2 */ macro and xcpipe
!> Dec. 2012 - iisum bugfix for xcsumj
!> Jan. 2014 - replaced mpistat with MPI_STATUS[ES]_IGNORE
!> Jan. 2014 - ARCTIC xctilr bugfix for when called after xcsum
!> Feb. 2015 - reduced buffering memory requirements
!> Dec. 2016 - added nca and nci
!> July 2017 - added xcgput4
!> Dec. 2018 - mpi_comm_vm now an optional argument to xcspmd
!> Oct. 2019 - bugfix to xcmaxr and xcminr for SHMEM
!> Oct. 2019 - added xcsumr