!PDLIB Software License
!
!Software, as understood herein, shall be broadly interpreted as being inclusive of algorithms,
!source code, object code, data bases and related documentation, all of which shall be furnished
!free of charge to the Licensee. Corrections, upgrades or enhancements may be furnished and, if
!furnished, shall also be furnished to the Licensee without charge. NOAA, however, is not
!required to develop or furnish such corrections, upgrades or enhancements.
!Roland & Partner software, whether that initially furnished or corrections or upgrades,
!are furnished "as is". Roland & Partner furnishes its software without any warranty
!whatsoever and is not responsible for any direct, indirect or consequential damages
!that may be incurred by the Licensee. Warranties of merchantability, fitness for any
!particular purpose, title, and non-infringement, are specifically negated.
!The Licensee is not required to develop any software related to the licensed software.
!However, in the event that the Licensee does so, the Licensee is required to offer same
!to Roland & Partner for inclusion under the instant licensing terms with Roland & Partner
!licensed software along with documentation regarding its principles, use and its advantages.
!This includes changes to the wave model proper including numerical and physical approaches
!to wave modeling, and boundary layer parameterizations embedded in the wave model.
!A Licensee may reproduce sufficient software to satisfy its needs.
!All copies shall bear the name of the software with any version number
!as well as replicas of any applied copyright notice, trademark notice,
!other notices and credit lines. Additionally, if the copies have been modified,
!e.g. with deletions or additions, this shall be so stated and identified.
!All of Licensee's employees who have a need to use the software may have access
!to the software but only after reading the instant license and stating, in writing,
!that they have read and understood the license and have agreed to its terms.
!Licensee is responsible for employing reasonable efforts to assure
!that only those of its employees that should have access to the software, in fact, have access.
!The Licensee may use the software for any purpose relating to sea state prediction.
!No disclosure of any portion of the software, whether by means of a media or verbally,
!may be made to any third party by the Licensee or the Licensee's employees.
!The Licensee is responsible for compliance with any applicable export or
!import control laws of the United States, the European Union and Germany.
!
!© 2009 Roland&Partner, Georgenstr.32, 64297 Germany. All rights reserved.
!PDLIB is a trademark of Roland & Partner. No unauthorized use without permission.
!
!> \file yowpdlibmain.F90
!> \brief initialization
!> \author Thomas Huxhorn
!> \date 2011-2012

module yowpdlibMain
  use yowerr
  use yowDatapool, only : rkind
  use w3servmd, only : print_memcheck
  !module default
  implicit none
  private
  public :: initFromGridDim, finalizePD

  contains

  !> @param[in] MNP number of nodes, global
  !> @param[in] MNE number of elements, global
  !> @param[in] INE_global element connectivity array
  !> @param[in] secDim size of the second dimension to exchange
  !> @param[in] MPIcomm MPI communicator to use with pdlib
  !> @overload initPD1
  subroutine initFromGridDim(MNP, MNE, INE_global, secDim, MPIcomm)
    use yowDatapool,       only: myrank, debugPrePartition, debugPostPartition
    use yowNodepool,       only: np_global, np, np_perProcSum, ng, ipgl, iplg, npa
    use yowElementpool,    only: ne_global, ne
    use yowSidepool,       only: ns, ns_global
    use yowExchangeModule, only: nConnDomains, setDimSize
    use yowRankModule,     only: initRankModule, ipgl_npa
    integer, intent(in) :: MNP, MNE
    integer, intent(in) :: INE_global(3,MNE)
    integer, intent(in) :: secDim
    integer, intent(in) :: MPIcomm
    integer :: istat, memunit

    ! note: myrank = 0 until after initMPI is called, so only the rank-0 file
    ! contains the 'SECTION 1' information
    memunit = 70000+myrank
    call setDimSize(secDim)
    call print_memcheck(memunit, 'memcheck_____:'//' WW3_PDLIB SECTION 1')
#ifdef W3_DEBUGINIT
    Print *, '1: MPIcomm=', MPIcomm
#endif
    call initMPI(MPIcomm)
    memunit = 70000+myrank
    call print_memcheck(memunit, 'memcheck_____:'//' WW3_PDLIB SECTION 2')
#ifdef W3_DEBUGINIT
    Print *, '2: After initMPI'
#endif
    call assignMesh(MNP, MNE)
    call print_memcheck(memunit, 'memcheck_____:'//' WW3_PDLIB SECTION 3')
#ifdef W3_DEBUGINIT
    Print *, '3: After assignMesh'
#endif
    call prePartition()
    call print_memcheck(memunit, 'memcheck_____:'//' WW3_PDLIB SECTION 4')
#ifdef W3_DEBUGINIT
    Print *, '3: After prePartition'
#endif
    call findConnNodes(INE_global)
    call print_memcheck(memunit, 'memcheck_____:'//' WW3_PDLIB SECTION 5')
#ifdef W3_DEBUGINIT
    Print *, '4: After findConnNodes'
#endif
    if(debugPrePartition) then
      if(myrank == 0) then
        write(*,*) "pre-partition"
        write(*,*) "# Nodes: ", np_global
        write(*,*) "# Elements: ", ne_global
        write(*,*) "# Sides ", ns_global
        write(*,*) "np_perProcSum :", np_perProcSum
      end if
      write(*,*) "Thread", myrank, "# local nodes ", np
      write(*,*) "Thread", myrank, "# local sides" , ns
    endif
    ! call writeMesh()
#ifdef W3_DEBUGINIT
    Print *, '4.1: After findConnNodes'
#endif
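    ! Illustrative sketch (comment only, hypothetical numbers): for
    ! np_global = 10 nodes on nTasks = 4 ranks, prePartition above assigned
    ! blocks of consecutive global ids, e.g.
    !   np_perProc    = (/ 2, 2, 2, 4 /)   ! ranks 0..3 before ParMETIS
    !   iplg (rank 1) = (/ 3, 4 /)         ! provisional local -> global map
    ! runParmetis below replaces this block distribution by the ParMETIS
    ! partition, after which every nodes_global(i)%domainID holds the owning
    ! rank + 1.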
!    CALL REAL_MPI_BARRIER_PDLIB(MPIcomm, "Before call to runParmetis")
    call runParmetis(MNP)
    call print_memcheck(memunit, 'memcheck_____:'//' WW3_PDLIB SECTION 6')
!    CALL REAL_MPI_BARRIER_PDLIB(MPIcomm, "After call to runParmetis")
#ifdef W3_DEBUGINIT
    Print *, '5: After runParmetis'
#endif
    call postPartition
    call print_memcheck(memunit, 'memcheck_____:'//' WW3_PDLIB SECTION 7')
#ifdef W3_DEBUGINIT
    Print *, 'Before findGhostNodes'
#endif
    call findGhostNodes
    call print_memcheck(memunit, 'memcheck_____:'//' WW3_PDLIB SECTION 8')
    call findConnDomains
    call print_memcheck(memunit, 'memcheck_____:'//' WW3_PDLIB SECTION 9')
    call exchangeGhostIds
    call print_memcheck(memunit, 'memcheck_____:'//' WW3_PDLIB SECTION 10')
    call postPartition2(INE_global)
    call print_memcheck(memunit, 'memcheck_____:'//' WW3_PDLIB SECTION 11')
    call initRankModule
    call print_memcheck(memunit, 'memcheck_____:'//' WW3_PDLIB SECTION 12')
    call ComputeTRIA_IEN_SI_CCON
    call print_memcheck(memunit, 'memcheck_____:'//' WW3_PDLIB SECTION 13')
    call ComputeIA_JA_POSI_NNZ
    call print_memcheck(memunit, 'memcheck_____:'//' WW3_PDLIB SECTION 14')
    if(debugPostPartition) then
      if(myrank == 0) then
        write(*,*) "New data after partition"
        write(*,*) "# Nodes: ", np_global
        write(*,*) "# Elements: ", ne_global
        write(*,*) "# Sides ", ns_global
        write(*,*) "np_perProcSum :", np_perProcSum
      end if
      write(*,*) "Thread", myrank, "# local elements ", ne
      write(*,*) "Thread", myrank, "# local nodes ", np
      write(*,*) "Thread", myrank, "# local sides" , ns
      write(*,*) "Thread", myrank, "# of ghosts", ng
      write(*,*) "Thread", myrank, "# of neighbor domains", nConnDomains
    endif
  end subroutine initFromGridDim

  SUBROUTINE REAL_MPI_BARRIER_PDLIB(TheComm, string)
    INCLUDE "mpif.h"
    integer, intent(in) :: TheComm
    character(*), intent(in) :: string
    integer NbProc, eRank
    integer :: istatus(MPI_STATUS_SIZE)
    integer ierr, iField(1), iProc
!    Print *, 'Start of REAL_MPI_BARRIER_PDLIB'
    CALL MPI_COMM_RANK(TheComm, eRank, ierr)
    CALL MPI_COMM_SIZE(TheComm, NbProc, ierr)
!    Print *, ' eRank=', eRank, ' NbProc=', NbProc
    iField(1)=1
    IF (eRank .eq. 0) THEN
      DO iProc=2,NbProc
!        Print *, ' Before MPI_RECV 1 iProc=', iProc
        CALL MPI_RECV(iField, 1, MPI_INTEGER, iProc-1, 711, TheComm, istatus, ierr)
!        Print *, ' Before MPI_SEND 1'
        CALL MPI_SEND(iField, 1, MPI_INTEGER, iProc-1, 712, TheComm, ierr)
      END DO
    ELSE
!      Print *, ' Before MPI_SEND 2 eRank=', eRank
      CALL MPI_SEND(iField, 1, MPI_INTEGER, 0, 711, TheComm, ierr)
!      Print *, ' Before MPI_RECV 2 eRank=', eRank
      CALL MPI_RECV(iField, 1, MPI_INTEGER, 0, 712, TheComm, istatus, ierr)
    END IF
!    Print *, 'Passing barrier string=', string
  END SUBROUTINE REAL_MPI_BARRIER_PDLIB

  !--------------------------------------------------------------------------
  ! Init MPI
  !--------------------------------------------------------------------------
  !> initialize MPI.
  subroutine initMPI(MPIcomm)
    use yowDatapool, only: comm, nTasks, myrank
    use yowerr
    use MPI
    integer, intent(in) :: MPIcomm
    logical :: flag
    integer :: ierr
#ifdef W3_DEBUGINIT
    Print *, '2: MPIcomm=', MPIcomm
#endif
    if(MPIcomm == MPI_COMM_NULL) then
      CALL ABORT("A null communicator is not allowed")
    endif
    comm = MPIcomm

    call MPI_Initialized(flag, ierr)
    if(ierr/=MPI_SUCCESS) call parallel_abort(error=ierr)
    if(flag .eqv. .false.) then
#ifdef W3_DEBUGINIT
      Print *, 'Before MPI_INIT yowpdlibmain'
#endif
      call mpi_init(ierr)
#ifdef W3_DEBUGINIT
      Print *, 'After MPI_INIT yowpdlibmain'
#endif
      if(ierr/=MPI_SUCCESS) call parallel_abort(error=ierr)
    endif

    ! Get number of processors
    call mpi_comm_size(comm, nTasks,ierr)
    if(ierr/=MPI_SUCCESS) call parallel_abort(error=ierr)
    ! Get rank
    call mpi_comm_rank(comm, myrank,ierr)
    if(ierr/=MPI_SUCCESS) call parallel_abort(error=ierr)
  end subroutine initMPI

  !> @param[in] MNP number of nodes, global
  !> @param[in] MNE number of elements, global
  !> alter: np_global, nodes_global(), ne_global
  subroutine assignMesh(MNP, MNE)
    use yowNodepool, only: nodes_global, np_global
    use yowElementpool, only: ne_global
    use yowerr, only: parallel_abort
    integer, intent(in) :: MNP, MNE
    !integer, intent(in) :: INE(3,MNE)
    integer :: stat, i

    np_global=MNP
    if(allocated(nodes_global)) deallocate(nodes_global)
    allocate(nodes_global(np_global), stat=stat)
    if(stat/=0) CALL ABORT('nodes_global() allocate failure')
    do i =1, np_global
      nodes_global(i)%id_global = i
    end do

    ne_global=MNE
    !if(allocated(INE_global)) deallocate(INE_global)
    !allocate(INE_global(3, ne_global), stat=stat);
    !if(stat/=0) CALL ABORT('INE_global allocate failure')
    !INE_global = INE
  end subroutine assignMesh

  !-------------------------------------------------------------------------
  ! pre-partition: divide the mesh into nTasks parts.
  !-------------------------------------------------------------------------
  !> pre-partition the mesh
  !> just divide the mesh into nTasks parts
  !> and create a premature iplg
  !> alter: np_perProc, np_perProcSum, np, iplg
  subroutine prePartition
    use yowDatapool, only: nTasks ,myrank
    use yowerr, only: parallel_abort
    use yowNodepool, only: np_global, np, np_perProc, np_perProcSum, iplg
    integer :: i, stat

    ! determine an equal number of nodes for each processor (except for the last one)
    ! and create a provisional node local to global mapping iplg.
    ! start the arrays from 0, because the first thread id is 0
    if(allocated(np_perProc)) deallocate(np_perProc)
    allocate(np_perProc(0:nTasks-1), stat=stat)
    if(stat/=0) call parallel_abort('np_perProc allocation failure')
    if(allocated(np_perProcSum)) deallocate(np_perProcSum)
    allocate(np_perProcSum(0:nTasks), stat=stat)
    if(stat/=0) call parallel_abort('np_perProcSum allocation failure')
    np_perProcSum = 0

    np = np_global / nTasks
    do i = 0, nTasks-2
      np_perProc(i) = np
      np_perProcSum(i+1) = np_perProcSum(i) + np
    end do
    np_perProc(nTasks-1) = np_global - np_perProcSum(nTasks-1)
    np_perProcSum(nTasks) = np_perProcSum(nTasks-1) + np_perProc(nTasks-1)
    np = np_perProc(myrank)

    ! create a provisional node local to global mapping iplg
    ! this will be overridden later with the data from parmetis
    if(allocated(iplg)) deallocate(iplg)
    allocate(iplg(np), stat=stat)
    if(stat/=0) call parallel_abort(' iplg allocate failure')
    do i = 1, np
      iplg(i) = i + np_perProcSum(myrank)
    end do
  end subroutine prePartition

  !-------------------------------------------------------------------------
  ! Create the connected Nodes array
  !-------------------------------------------------------------------------
  !> create the connected Nodes array
  !> loop over all elements and their nodes, then get the neighbor nodes.
  !> finally calculate the number of sides
  !> alter: maxConnNodes, connNodes_data, ns, ns_global, node%nConnNodes
  subroutine findConnNodes(INE_global)
    use yowerr, only: parallel_abort
    use yowNodepool, only: np, np_global, nodes_global, nodes, maxConnNodes, t_Node, connNodes_data
    use yowElementpool, only: ne_global
    use yowSidepool, only: ns, ns_global
    integer, intent(in) :: INE_global(3,ne_global)
    integer :: i, j, stat
    type(t_Node), pointer :: node
    integer JPREV, JNEXT
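    ! Illustrative sketch (comment only, hypothetical node ids): for a single
    ! triangle with global nodes (5, 8, 9), the two passes below record
    !   connNodes of 5 : (/ 8, 9 /)
    !   connNodes of 8 : (/ 9, 5 /)
    !   connNodes of 9 : (/ 5, 8 /)
    ! so every edge appears twice, once from each of its end nodes.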
    ! Loop over all elements and look at their nodes:
    ! take node 1, insert nodes 2 and 3 into its connected nodes array,
    ! then do the same for nodes 2 and 3.
    ! The implementation is slightly different:
    ! loop over all elements to get the # of connected nodes,
    ! allocate space,
    ! then loop a second time to insert the connected nodes.

    ! first loop
    do i = 1, ne_global
      do j = 1, 3
        node => nodes_global(INE_global(j,i))
        call node%insertConnNode()
        call node%insertConnNode()
      end do
    end do

    maxConnNodes = maxval(nodes_global(:)%nConnNodes)
    nodes_global(:)%nConnNodes = 0

    ! allocate space
    !> \todo we allocate more than we really need
    if(allocated(connNodes_data)) deallocate(connNodes_data)
    allocate(connNodes_data(np_global, maxConnNodes), stat = stat)
    if(stat/=0) call parallel_abort('connNodes allocation failure')

    ! second loop
    do i = 1, ne_global
      DO J=1,3
        IF (J .eq. 3) THEN
          JNEXT = 1
        ELSE
          JNEXT = J + 1
        END IF
        IF (J .eq. 1) THEN
          JPREV = 3
        ELSE
          JPREV = J - 1
        END IF
        node => nodes_global(INE_global(J,i))
        call node%insertConnNode(INE_global(JNEXT,i))
        call node%insertConnNode(INE_global(JPREV,i))
      END DO
    end do

    ns = 0
    ! calc # local sides
    do i = 1, np
      node => nodes(i)
      ns = ns + node%nConnNodes
    end do
    do i = 1, np_global
      node => nodes_global(i)
      ns_global = ns_global + node%nConnNodes
    end do
  end subroutine findConnNodes

  !------------------------------------------------------------------------
  ! Collect all data for parmetis and partition the mesh
  !------------------------------------------------------------------------
  !> Collect all data for parmetis and partition the mesh
  !> after that, we know the domain ID for every node
  !> alter: t_Node::domainID
  subroutine runParmetis(MNP)
    use yowerr, only: parallel_abort
    use yowDatapool, only: debugParmetis,debugPartition, nTasks, myrank, itype, comm
    use yowNodepool, only: np, npa, np_global, nodes, nodes_global, t_Node, iplg, np_perProcSum, np_perProc
    use yowSidepool, only: ns
    use yowElementpool, only: ne, ne_global
    use w3gdatmd, only: xgrd, ygrd
    use MPI
    integer, intent(in) :: MNP

    ! Parmetis
    ! Node neighbor information
    integer :: wgtflag, numflag, ndims, nparts, edgecut, ncon
    integer, allocatable :: xadj(:), part(:), vwgt(:), adjwgt(:), vtxdist(:), options(:), adjncy(:), iweights(:)
    ! parmetis needs single precision
    real(4), allocatable :: xyz(:), tpwgts(:), ubvec(:)
    integer :: IP_glob, itmp
    integer :: ref
    logical :: lexist = .false.
    ! Node to domain mapping.
    ! np_global long. gives the domain number for the global node number
    integer, allocatable :: node2domain(:)
    ! Misc
    integer :: i, j, stat, ierr
    type(t_Node), pointer :: node, nodeNeighbor

!    CALL REAL_MPI_BARRIER_PDLIB(comm, "runParmetis, step 1")
    ! Create the xadj and adjncy arrays. They hold the node neighbors in CSR format.
    ! Here, the adjacency structure of a graph is represented by two arrays,
    ! xadj[n+1] and adjncy[m]; n vertices and m edges. Every edge is listed twice.
    allocate(adjncy(ns), stat=stat)
    if(stat/=0) call parallel_abort('adjncy allocation failure')
    allocate(xadj(np+1), stat=stat)
    if(stat/=0) call parallel_abort('xadj allocation failure')
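    ! Illustrative CSR sketch (comment only, hypothetical graph): with
    ! Fortran-style numbering (numflag = 1), a local graph of 3 nodes (global
    ! ids assumed equal to the local ones) where node 1 neighbors {2,3},
    ! node 2 neighbors {1} and node 3 neighbors {1} is stored as
    !   xadj   = (/ 1, 3, 4, 5 /)
    !   adjncy = (/ 2, 3, 1, 1 /)   ! global ids, each edge listed twice
    ! i.e. the neighbors of local node i are adjncy(xadj(i):xadj(i+1)-1).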
!    CALL REAL_MPI_BARRIER_PDLIB(comm, "runParmetis, step 2")
    xadj = 0
    xadj(1) = 1
    adjncy = 0
    do i=1,np
      node => nodes(i)
      xadj(i+1) = xadj(i) + node%nConnNodes
      if(debugParmetis) write(710+myrank,*) i, xadj(i), xadj(i+1)
      if(node%nConnNodes == 0) then
        write(*,*) "Thread", myrank,"global node has no conn nodes", node%id_global
      endif
      do j=1, node%nConnNodes
        nodeNeighbor => node%connNodes(j)
        adjncy(j + xadj(i) - 1) = nodeNeighbor%id_global
        if(debugParmetis) write(710+myrank,*) i, j, j + xadj(i) - 1, adjncy(j + xadj(i) - 1), nodeNeighbor%id_global
      end do
    end do
!    CALL REAL_MPI_BARRIER_PDLIB(comm, "runParmetis, step 3")

    ! Options for Parmetis
    allocate(options(3))
    options(1)=1     ! 0: default options; 1: user options
    if(debugParmetis) then
      options(2)=15
    else
      options(2)=0   ! Level of information returned: see defs.h in ParMETIS-Lib dir
    endif
    options(3)=15    ! Random number seed
    CALL REAL_MPI_BARRIER_PDLIB(comm, "runParmetis, step 4")

    ! Fortran-style numbering that starts from 1
    numflag = 1
    ! # dimensions of the space in which the graph is embedded
    ndims = 2
    ! # parts the mesh is divided into. Usually nTasks
    nparts = nTasks
    ! # weights per node
    ncon = 1
    ! wgtflag: 0: none (vwgt and adjwgt are NULL); 1: edges (vwgt is NULL); 2: vertices (adjwgt is NULL); 3: both vertices & edges;
#ifdef WEIGHTS
    wgtflag = 2
    INQUIRE ( FILE='weights.ww3', EXIST = lexist )
    IF (lexist) THEN
      OPEN(100001,FILE='weights.ww3',FORM='FORMATTED',status='unknown')
      allocate(iweights(np_global)); iweights = 0
      do i = 1, np_global
        read(100001,*) iweights(i)
      enddo
      CLOSE(100001)
    ELSE
      wgtflag = 0
    ENDIF
#else
    wgtflag = 0
#endif

    ! Create weights
    allocate(vwgt(np*ncon), stat=stat)
    if(stat/=0) call parallel_abort('vwgt allocation failure')
#ifdef WEIGHTS
    if (lexist) then
      do i = 1, np
        itmp = max(1,int(real((iweights(iplg(i))+100))))
        !vwgt(i) = max(1,int(real(iweights(iplg(i)))/maxval(iweights)) * 10)
        !vwgt(i) = max(1,iweights(iplg(i)))
        !vwgt(i) = max(1,itmp)
        vwgt(i) = itmp
      enddo
      vwgt = 1
      deallocate(iweights)
    else
      vwgt = 1
    endif
#else
    vwgt = 1
#endif
    allocate(adjwgt(ns), stat=stat)
    if(stat/=0) call parallel_abort('adjwgt allocation failure')
    !> \todo
    adjwgt = 1

    ! Vertex weight fraction
    allocate(tpwgts(ncon*nTasks),stat=stat)
    if(stat/=0) call parallel_abort('partition: tpwgts allocation failure')
    tpwgts=1.0/real(nTasks)

    ! Imbalance tolerance
    allocate(ubvec(ncon),stat=stat)
    if(stat/=0) call parallel_abort('partition: ubvec allocation failure')
    ubvec=1.01
    CALL REAL_MPI_BARRIER_PDLIB(comm, "runParmetis, step 5")

    ! Partition dual graph
    allocate(xyz(2*np),stat=stat)
    if(stat/=0) call parallel_abort('xyz allocation failure')
    if(debugParmetis) write(710+myrank,*) 'np_global, ne_global, np, npa, ne'
    if(debugParmetis) write(710+myrank,*) np_global, ne_global, np, npa, ne
    do i = 1, np
      IP_glob = iplg(i)
      !AR: this is questionable ...
      xyz(2*(i-1)+1) = REAL(xgrd(1,IP_glob))
      xyz(2*(i-1)+2) = REAL(ygrd(1,IP_glob))
      if(debugParmetis) then
        write(710+myrank,*) i, np, xyz(2*(i-1)+1), xyz(2*(i-1)+2)
        call flush(710+myrank)
      endif
    end do
    CALL REAL_MPI_BARRIER_PDLIB(comm, "runParmetis, step 6")

    ! Partition array returned from ParMeTiS
    allocate(part(np),stat=stat)
    if(stat/=0) call parallel_abort('part allocation failure')

    ! ParMeTiS vertex distribution array (starts at 1)
    allocate(vtxdist(nTasks+1),stat=stat)
    if(stat/=0) call parallel_abort('partition: vtxdist allocation failure')
    call mpi_allgather(np_perProcSum(myrank)+1, 1, itype, vtxdist, 1, itype, comm, ierr)
    if(ierr/=MPI_SUCCESS) call parallel_abort('partition: mpi_allgather',ierr)
    vtxdist(nTasks+1)=np_global+1
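    ! Illustrative sketch (comment only, hypothetical numbers): for
    ! np_global = 10 on nTasks = 4, the allgather above yields
    !   vtxdist = (/ 1, 3, 5, 7, 11 /)
    ! so rank r (0-based) owns global vertices vtxdist(r+1) .. vtxdist(r+2)-1
    ! of the pre-partition block distribution.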
!    CALL REAL_MPI_BARRIER_PDLIB(comm, "runParmetis, step 7")
    ! check vtxdist
    ! myrank starts from 0
    if((vtxdist(myrank+2) - vtxdist(myrank+1)) < 1) then
      write(*,*) "Thread", myrank, "has no nodes"
      write(*,*) "vtxdist", vtxdist
      CALL ABORT("Poor initial vertex distribution detected")
    endif

    ! My notes from the manual:
    ! p: # of processors;
    ! n: total # of vertices (local) in graph sense;
    ! m: total # of neighboring vertices ("edges"); double counted between neighboring vertices u and v.
    ! ncon: # of weights for each vertex;
    ! int(in) vtxdist(p+1): Processor j stores vertices vtxdist(j):vtxdist(j+1)-1
    ! int(in) xadj(n+1), adjncy(m):
    !    locally, vertex j's neighboring vertices are adjncy(xadj(j):xadj(j+1)-1). adjncy points to global index;
    ! int(in) vwgt(ncon*n), adjwgt(m): weights at vertices and "edges". Format of adjwgt follows adjncy;
    ! int(in) wgtflag: 0: none (vwgt and adjwgt are NULL);
    !    1: edges (vwgt is NULL); 2: vertices (adjwgt is NULL); 3: both vertices & edges;
    ! int(in) numflag: 0: C-style numbering from 0; 1: FORTRAN style from 1;
    ! int(in) ndims: 2 or 3 (D);
    ! float(in) xyz(ndims*n): coordinate for vertex j is xyz(j*ndims:(j+1)*ndims-1);
    ! int(in) nparts: # of desired sub-domains (usually nTasks);
    ! float(in) tpwgts(ncon*nparts): =1/nparts if sub-domains are to be of same size for each vertex weight;
    ! float(in) ubvec(ncon): imbalance tolerance for each weight;
    ! int(in) options: additional parameters for the routine (see above);
    ! int(out) edgecut: # of edges that are cut by the partitioning;
    ! int(out) part(): array size = # of local vertices. It stores, for each local vertex, the assigned sub-domain.

!    write(1112+myrank,*) "Thread",myrank,"sum;vtxdist", sum(vtxdist), vtxdist
!    write(1112+myrank,*) "Thread",myrank,"sum;xadj", sum(xadj), xadj
!    write(1112+myrank,*) "Thread",myrank,"sum;adjncy", sum(adjncy), adjncy
    CALL REAL_MPI_BARRIER_PDLIB(comm, "runParmetis, step 8")
    if(debugParmetis) then
      write(710+myrank,*) vtxdist, xadj, adjncy, &
        vwgt, &     !vwgt - ignore weights
        adjwgt, &   ! adjwgt - ignore weights
        wgtflag, &
        numflag,ndims,ncon,nparts,tpwgts,ubvec,options, &
        edgecut,part,comm
      call flush(710+myrank)
    endif
    !if(debugParmetis) write(710+myrank,*) "Run ParMETIS now..."
#ifdef W3_SCOTCH
    call SCOTCH_ParMETIS_V3_PartGeomKway(vtxdist, xadj, adjncy, &
      vwgt, &     !vwgt - ignore weights
      adjwgt, &   ! adjwgt - ignore weights
      wgtflag, &
      numflag,ndims,xyz,ncon,nparts,tpwgts,ubvec,options, &
      edgecut,part, comm,ref)
#endif
#ifdef W3_METIS
    call ParMETIS_V3_PartGeomKway(vtxdist, xadj, adjncy, &
      vwgt, &     !vwgt - ignore weights
      adjwgt, &   ! adjwgt - ignore weights
      wgtflag, &
      numflag,ndims,xyz,ncon,nparts,tpwgts,ubvec,options, &
      edgecut,part, comm)
#endif
    CALL REAL_MPI_BARRIER_PDLIB(comm, "runParmetis, step 9")
    if(nTasks == 1) then
!      write(*,*) myrank, "minval part", minval(part)
      if(minval(part) == 0) then
        part(:) = part(:) + 1
      endif
    endif
!    write(*,*) myrank, "edge cuted", edgecut

    ! Collect the parmetis data from all threads
    ! and create a global node to domain number mapping
    allocate(node2domain(np_global),stat=stat)
    if(stat/=0) call parallel_abort(' node2domain allocation failure')
!
    call mpi_allgatherv(part, np, itype, node2domain, np_perProc, np_perProcSum, itype, comm, ierr)
    if(ierr/=MPI_SUCCESS) call parallel_abort('mpi_allgatherv ',ierr)
!
    do i = 1, np_global
      node => nodes_global(i)
      node%domainID = node2domain(node%id_global)
    end do
!    CALL REAL_MPI_BARRIER_PDLIB(comm, "runParmetis, step 10")

    ! write out partition info for katerfempresenter
    if(debugPartition) write(600,*) node2domain
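    ! Illustrative sketch (comment only, hypothetical values): after the
    ! allgatherv, node2domain holds one domainID per global node, e.g. for
    ! np_global = 10 on 4 ranks
    !   node2domain = (/ 1, 1, 2, 3, 3, 2, 4, 4, 1, 2 /)
    ! i.e. global node 4 now lives on rank 2 (domainID 3 = rank index 2 + 1).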
!    Print *, 'runparmetis step 1'
    if(allocated(xadj)) deallocate(xadj)
!    Print *, 'runparmetis step 2'
    if(allocated(adjncy)) deallocate(adjncy)
!    Print *, 'runparmetis step 3'
    if(allocated(part)) deallocate(part)
!    Print *, 'runparmetis step 4'
    if(allocated(vwgt)) deallocate(vwgt)
!    Print *, 'runparmetis step 5'
    if(allocated(adjwgt)) deallocate(adjwgt)
!    Print *, 'runparmetis step 6'
    if(allocated(xyz)) deallocate(xyz)
!    Print *, 'runparmetis step 7'
    if(allocated(tpwgts)) deallocate(tpwgts)
!    Print *, 'runparmetis step 8'
    if(allocated(ubvec)) deallocate(ubvec)
!    Print *, 'runparmetis step 9'
    if(allocated(vtxdist)) deallocate(vtxdist)
!    Print *, 'runparmetis step 10'
    if(allocated(node2domain)) deallocate(node2domain)
!    Print *, 'runparmetis step 11'
  end subroutine runParmetis

  !------------------------------------------------------------------------
  ! with the new data from parmetis, recalculate some variables
  !------------------------------------------------------------------------
  ! Create for every boundary node a dummy node and for every dummy node two local elements
  ! extend x,y,z, INE
  ! rebuild NCONE, CONE, NCONN, CONN
  ! alter nd, nde
  !subroutine dummyNodes
  !  use yowExchangeModule
  !  use yowNodepool, only: npa, nb, nd, PDLIB_NCONN, PDLIB_CONN, boundaryNodes, nodes, x, y, z, outwardNormal, XY, dummy2boundary, BNDprevnext
  !  use yowElementpool, only: ne, nde, INE, NCONE, CONE
  !  implicit none
  !  integer :: i, IP, IPprev, IPnext, newIP, newIE
  !  integer, allocatable :: INEnew(:,:)
  !  real(rkind) :: vec(2), newPoint(2), newLength
  !  real(rkind), allocatable :: xNew(:), yNew(:), zNew(:)
  !
  !  nd = nb
  !
  !  allocate(dummy2boundary(nd))
  !
  !  nde = 2*nd
  !  allocate(INEnew(3,ne+nde))
  !  INEnew(:,1:ne) = INE(:,1:ne)
  !
  !  allocate(xNew(npa+nd), yNew(npa+nd), zNew(npa+nd))
  !  xNew(1:npa) = x(1:npa)
  !  yNew(1:npa) = y(1:npa)
  !  zNew(1:npa) = z(1:npa)
  !
  !  nd = 0
  !  nde = 0
  !  do i=1, nb
  !    nd = nd + 1
  !    IP = boundaryNodes(i)
  !    dummy2boundary(nd) = IP
  !
  !    call BNDprevnext(IP, IPprev, IPnext)
  !    newLength = max(distanceTo(XY(IP), XY(IPprev)), distanceTo(XY(IP), XY(IPnext)))
  !    newPoint = XY(IP) + outwardNormal(:,i) * newLength
  !    newIP = npa+i
  !    xNew(newIP) = newPoint(1)
  !    yNew(newIP) = newPoint(2)
  !    zNew(newIP) = z(IP)
  !    NCONN(IP) = NCONN(IP) + 1
  !    CONN(NCONN(IP),IP) = newIP
  !
  !    nde = nde +1
  !    newIE = ne+nde
  !    INEnew(1,newIE) = IP
  !    INEnew(2,newIE) = IPprev
  !    INEnew(3,newIE) = newIP
  !    NCONE(IP) = NCONE(IP) +1
  !    CONE(NCONE(IP),IP) = newIE
  !
  !    nde = nde +1
  !    newIE = ne+nde
  !    INEnew(1,newIE) = IP
  !    INEnew(2,newIE) = newIP
  !    INEnew(3,newIE) = IPnext
  !    NCONE(IP) = NCONE(IP) +1
  !    CONE(NCONE(IP),IP) = newIE
  !  end do
  !
  !  call move_alloc(INEnew, INE)
  !  call move_alloc(xNew, x)
  !  call move_alloc(yNew, y)
  !  call move_alloc(zNew, z)
  !
  !  ! The number of connected nodes for ghost nodes is needed for findBoundaryNodes and dummyNodes.
  !  call exchange(NCONN)
  !  call exchange(NCONE)
  !end subroutine

  !> recalculate some variables
  !> parmetis changes the number of sides per domain, so recalculate some variables
  !> alter: np, ns, np_perProc, np_perProcSum, iplg, ipgl, nodes_global%id
  !> @note connNodes_data(:) has not changed after the call to parmetis.
  subroutine postPartition
    use yowerr, only: parallel_abort
    use yowDatapool, only: myrank, nTasks
    use yowNodepool, only: np_global, np, nodes_global, nodes, np_perProc, np_perProcSum, iplg, ipgl, t_Node
    use yowSidepool, only: ns
    integer :: i, j, stat
    type(t_Node), pointer :: node
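    ! Illustrative sketch (comment only): with the hypothetical node2domain
    ! from the comments above, rank 0 (domainID 1) owns global nodes 1, 2 and 9,
    ! so the loops below produce
    !   iplg    = (/ 1, 2, 9 /)                 ! local -> global
    !   ipgl(1) = 1, ipgl(2) = 2, ipgl(9) = 3   ! global -> local, 0 elsewhere
    ! iplg/ipgl are rebuilt here because ParMETIS moved nodes between ranks.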
    ! determine how many nodes now belong to which thread
    ! and set the nodes' local id
    np_perProc = 0
    do i = 1, np_global
      ! fortran counts from 1. np_perProc from 0
      np_perProc(nodes_global(i)%domainID-1) = np_perProc(nodes_global(i)%domainID-1)+1
      ! set the new local id
      nodes_global(i)%id = np_perProc(nodes_global(i)%domainID-1)
    end do

    np_perProcSum(0) = 0
    do i = 1, nTasks-1
      np_perProcSum(i) = np_perProcSum(i-1) + np_perProc(i-1)
    end do
    np = np_perProc(myrank)

    ! create the new node local to global mapping iplg. This is not the final one;
    ! an iplg that includes the ghost nodes is created in findGhostNodes.
    if(allocated(iplg)) deallocate(iplg)
    allocate(iplg(np), stat=stat)
    if(stat/=0) call parallel_abort('iplg second allocation failure')
    iplg = 0
    if(allocated(ipgl)) deallocate(ipgl)
    allocate(ipgl(np_global), stat=stat)
    if(stat/=0) call parallel_abort('ipgl allocation failure')
    ipgl = 0

    j = 1
    do i = 1, np_global
      node => nodes_global(i)
      if(node%domainID == myrank+1) then
        iplg(j) = i
        ipgl(i) = j
        j = j + 1
      endif
    end do

    ! calc # local sides again, because the nodes may now belong to another domain
    ns = 0
    do i = 1, np
      node => nodes(i)
      ns = ns + node%nConnNodes
    end do
  end subroutine postPartition

  !----------------------------------------------------------------------
  ! find the ghost nodes of the local domain
  !----------------------------------------------------------------------
  !> find the ghost nodes of the local domain
  !> alter: ng, ghosts(), ghostlg, ghostgl, npa, iplg
  subroutine findGhostNodes
    use yowerr, only: parallel_abort
    use yowDatapool, only: myrank
    use yowNodepool, only: t_Node, np, nodes, ghosts, nodes_global, ng, ghostlg, ghostgl, npa, np_global, iplg
    integer :: i, j, k, stat
    type(t_Node), pointer :: node, nodeNeighbor, nodeGhost
    !> temporarily holds the ghost numbers
    integer, save, allocatable :: ghostTemp(:)

#ifdef W3_DEBUGINIT
    Print *, 'Passing in findGhostNodes'
#endif
    ! iterate over all local nodes and look at their neighbors;
    ! if a neighbor has another domain id, then it is a ghost.
    ! The implementation is slightly different:
    ! loop over all nodes to get the # of ghost nodes,
    ! allocate space,
    ! then loop a second time to insert the ghost nodes.
    !> \todo make this faster. don't check all neighbors of all local nodes; mark a node once it has been checked.

    ! first loop. find out how many ghost nodes we have (with double entries)
    ng = 0
    do i = 1, np
      node => nodes(i)
      do j = 1, node%nConnNodes
        nodeNeighbor => node%connNodes(j)
        if(nodeNeighbor%domainID /= node%domainID) then
          ! yes, we found a ghost
          ng = ng + 1
        end if
      end do
    end do

    allocate(ghostTemp(ng), stat=stat)
    if(stat/=0) call parallel_abort('ghostTemp allocation failure')
#ifdef W3_DEBUGINIT
    Print *, 'np_global=', np_global
#endif
    IF (allocated(ghostgl)) THEN
      Print *, 'ghostgl is already allocated'
    END IF
    allocate(ghostgl(np_global), stat=stat)
    if(stat/=0) call parallel_abort('ghostgl allocation failure')
    ghostgl = 0

    ! second loop. fill ghostTemp. ignore double entries
    ng = 0
    ! iterate over all local nodes
    do i = 1, np
      node => nodes(i)
      ! check their neighbors
      secondloop: do j = 1, node%nConnNodes
        nodeNeighbor => node%connNodes(j)
        if(nodeNeighbor%domainID /= node%domainID) then
          ! yes, we found a ghost
          ! check if this ghost is already in the ghost list
          do k = 1, ng
            nodeGhost => nodes_global(ghostTemp(k))
            if(nodeNeighbor%id_global == nodeGhost%id_global) then
              ! yes, we already know this ghost.
              ! check the next neighbor
              cycle secondloop
            end if
          end do
          ! no, we don't know this ghost. insert it
          ng = ng + 1
          ghostTemp(ng) = nodeNeighbor%id_global
          ghostgl(nodeNeighbor%id_global) = ng
        end if
      end do secondloop
    end do
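    ! Illustrative sketch (comment only, hypothetical ids): if the local nodes
    ! of this rank touch the foreign global nodes 7 and 12 only, the loops above
    ! end with
    !   ng = 2, ghostTemp(1:2) = (/ 7, 12 /), ghostgl(7) = 1, ghostgl(12) = 2
    ! ghostlg below keeps just those ng entries, and npa becomes np + ng.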
    ! reallocate the ghostTemp array because it is longer than the new ng
    if(allocated(ghostlg)) deallocate(ghostlg)
    allocate(ghostlg(ng), stat=stat)
    if(stat/=0) call parallel_abort('ghostlg allocation failure')
    ghostlg = ghostTemp(1:ng)
    deallocate(ghostTemp)
    npa = np + ng

    ! check that ghostlg contains only unique values
    do i=1, ng
      do j=i+1, ng
        if(ghostlg(i) == ghostlg(j)) then
          write(*,*) "double global ghost id in ghostlg(i,j)", i, j
          stop "double global ghost id in ghostlg(i,j)"
        endif
      end do
    end do

    ! create the new node local to global mapping iplg including ghosts. This is the final one.
    if(allocated(iplg)) deallocate(iplg)
    allocate(iplg(npa), stat=stat)
    if(stat/=0) call parallel_abort('iplg second allocation failure')
    iplg = 0
    j = 1
    do i = 1, np_global
      node => nodes_global(i)
      if(node%domainID == myrank+1) then
        iplg(j) = i
        j = j + 1
      endif
    end do
    iplg(np+1: npa) = ghostlg(1:ng)
  end subroutine findGhostNodes

  !-------------------------------------------------------------------------------
  ! find the number of connected domains and their ghosts
  !-------------------------------------------------------------------------------
  !> find the number of connected domains and their ghosts
  !> 1) Iterate over all ghost nodes and look at their thread id to find the # of neighbor domains
  !> 2) assign the ghost nodes to their domains
  !> alter: neighborDomains(), nConnDomains
  subroutine findConnDomains
    use yowerr, only: parallel_abort
    use yowNodepool, only: ghosts, ng, t_Node
    use yowDatapool, only: nTasks, myrank
    use yowExchangeModule, only: neighborDomains, initNbrDomains
    integer :: i, stat, itemp
    type(t_Node), pointer :: ghost
    ! # of ghosts per neighbor domain
    integer, allocatable :: numberGhostPerNeighborDomainTemp(:)
    ! look up table. domainID to neighbor (ID)
    integer, allocatable :: domainID2NeighborTemp(:)

    ! Part 1) find the # of neighbor domains
    ! allocate this array with a fixed size of nTasks. Even if we do not have
    ! that many neighbor domains, nTasks will never be very large.
    allocate(numberGhostPerNeighborDomainTemp(nTasks), stat=stat)
    if(stat/=0) call parallel_abort('numberGhostPerNeighborDomainTemp allocation failure')
    numberGhostPerNeighborDomainTemp = 0
    allocate(domainID2NeighborTemp(nTasks), stat=stat)
    if(stat/=0) call parallel_abort('domainID2NeighborTemp allocation failure')
    domainID2NeighborTemp = 0

    ! iterate over all ghost nodes and get their thread id
    itemp = 0
    do i = 1, ng
      ghost => ghosts(i)
      ! count how many ghosts belong to the ghost's domainID
      numberGhostPerNeighborDomainTemp(ghost%domainID) = numberGhostPerNeighborDomainTemp(ghost%domainID) + 1
      ! check if this ghost's domainID is already in the domains list
      if(domainID2NeighborTemp(ghost%domainID) /= 0) then
        ! yes, we have already inserted this domain id
        cycle
      end if
      ! no, we don't know this domain id. insert it
      itemp = itemp + 1
      domainID2NeighborTemp(ghost%domainID) = itemp
    end do
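    ! Illustrative sketch (comment only, hypothetical ids): if this rank has
    ! ghosts on domains 3 and 7 only, the loop above ends with itemp = 2 and
    !   domainID2NeighborTemp(3) = 1, domainID2NeighborTemp(7) = 2
    !   numberGhostPerNeighborDomainTemp(3), (7) = ghost counts per domain
    ! Part 2 below then sizes neighborDomains(1:2) from these counts.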
    ! Part 2) assign the ghost nodes to their domains
    call initNbrDomains(itemp)

    do i = 1, nTasks
      if(numberGhostPerNeighborDomainTemp(i) /= 0) then
        neighborDomains(domainID2NeighborTemp(i) )%domainID = i
        allocate(neighborDomains(domainID2NeighborTemp(i))%nodesToReceive(numberGhostPerNeighborDomainTemp(i)), stat=stat)
        if(stat/=0) call parallel_abort('neighborDomains%ghosts allocation failure')
        neighborDomains(domainID2NeighborTemp(i))%nodesToReceive = 0
      end if
    end do

    do i = 1, ng
      ghost => ghosts(i)
      itemp = domainID2NeighborTemp(ghost%domainID)
      neighborDomains(itemp)%numNodesToReceive = neighborDomains(itemp)%numNodesToReceive + 1
      neighborDomains(itemp)%nodesToReceive(neighborDomains(itemp)%numNodesToReceive) = ghost%id_global
    end do

    if(allocated(numberGhostPerNeighborDomainTemp)) deallocate(numberGhostPerNeighborDomainTemp)
    if(allocated(domainID2NeighborTemp)) deallocate(domainID2NeighborTemp)
  end subroutine findConnDomains

  !-------------------------------------------------------------------------------
  ! exchange ghost ids so every thread knows which nodes it has to send to the
  ! other partitions
  !-------------------------------------------------------------------------------
  !> exchange ghost ids so every thread knows which nodes it has to send to the
  !> other partitions. every partition has a list of ghost nodes from the other
  !> partitions. this data is stored in the neighborDomains variable
  !> \todo make a better explanation
  !> 1) send to each neighbor domain which ghost nodes we want from it
  !> 2) receive from each neighbor domain which ghost nodes we must send to it
  !> alter: neighborDomains()%{numNodesToSend, nodesToSend}
  subroutine exchangeGhostIds
    use yowerr
    use yowNodepool, only: np, t_node, nodes
    use yowDatapool, only: nTasks, myrank, comm
    use yowExchangeModule, only: neighborDomains, nConnDomains, createMPITypes
    use MPI
    integer :: i, j, k
    integer :: ierr
    ! unique tag that identifies the sender and which information it sends
    integer :: tag
    ! we use non-blocking send and recv subroutines
    ! store the send status
    integer :: sendRequest(nConnDomains)
    ! store the recv status
    integer :: recvRequest(nConnDomains)
    ! status to verify if one communication fails or not
    integer :: status(MPI_STATUS_SIZE, nConnDomains)
    type(t_node), pointer :: node

    ! send to all domain neighbors how many ghost nodes we want from them and which ones
    do i=1, nConnDomains
      ! create a unique tag for this domain
      tag = neighborDomains(i)%domainID*10 + 1
      ! send to the neighbor how many ghost nodes we want from it
      call MPI_Isend(neighborDomains(i)%numNodesToReceive, &
                     1, &
                     MPI_INT, &
                     neighborDomains(i)%domainID-1, &
                     tag, &
                     comm, &
                     sendRequest(i), &
                     ierr)
      if(ierr/=MPI_SUCCESS) then
        write(*,*) "mpi send failure"
      endif

      tag = neighborDomains(i)%domainID*10 + 2
      ! send to the neighbor which ghost nodes we want from it
      call MPI_Isend(neighborDomains(i)%nodesToReceive, &
                     neighborDomains(i)%numNodesToReceive, &
                     MPI_INT, &
                     neighborDomains(i)%domainID-1, &
                     tag, &
                     comm, &
                     !> todo use a second sendRequest array here
                     sendRequest(i), &
                     ierr)
      if(ierr/=MPI_SUCCESS) then
        write(*,*) "mpi send failure"
      endif

      ! receive from the neighbor how many ghost nodes we have to send it
      tag = (myrank+1)*10 + 1
      call MPI_Irecv(neighborDomains(i)%numNodesToSend, &
                     1, &
                     MPI_INT, &
                     neighborDomains(i)%domainID-1, &
                     tag, &
                     comm, &
                     recvRequest(i), &
                     ierr)
      if(ierr/=MPI_SUCCESS) then
        write(*,*) "mpi recv failure"
      endif
    end do

    ! wait for the communication to end
    call MPI_Waitall(nConnDomains, recvRequest, status, ierr)
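    ! Illustrative sketch (comment only): the tag encodes the receiving domain
    ! and the message kind, e.g. for a message sent to domainID 3 (hypothetical)
    !   tag = 3*10 + 1   ! "how many ghost nodes I want from you"
    !   tag = 3*10 + 2   ! "which ghost nodes I want from you"
    ! while the matching receives on this rank use (myrank+1)*10 + 1 or + 2.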
    ! test for all neighbor domains
    do i=1, nConnDomains
      ! test if the neighbor wants more ghost nodes than we have
      if(neighborDomains(i)%numNodesToSend > np) then
        write(*,'(i5, a, i5, a, i5,a, i5, a, /)', advance='no') myrank, " ERROR neighbordomain ", neighborDomains(i)%domainID, &
          " wants ", neighborDomains(i)%numNodesToSend, &
          " nodes, but we have only ", np, " nodes"
        CALL ABORT("")
      end if
    end do

    ! receive from all neighbor domains which nodes we must send them
    do i=1, nConnDomains
      allocate(neighborDomains(i)%nodesToSend(neighborDomains(i)%numNodesToSend))
      neighborDomains(i)%nodesToSend = 0
      ! receive from the neighbor which nodes we must send
      tag = (myrank+1)*10 + 2
      call MPI_Irecv(neighborDomains(i)%nodesToSend, &
                     neighborDomains(i)%numNodesToSend, &
                     MPI_INT, &
                     neighborDomains(i)%domainID-1, &
                     tag, &
                     comm, &
                     recvRequest(i), &
                     ierr)
      if(ierr/=MPI_SUCCESS) then
        CALL PARALLEL_ABORT("mpi recv failure", ierr)
      endif
    end do

    ! wait for the communication to end
    call MPI_Waitall(nConnDomains, recvRequest, status, ierr)

    ! test for all neighbor domains
    do i=1, nConnDomains
      ! test if the neighbor wants nodes that we don't own
      outerloop: do j=1, neighborDomains(i)%numNodesToSend
        ! compare with all local nodes
        do k=1, np
          node => nodes(k)
          if(node%id_global == neighborDomains(i)%nodesToSend(j)) then
            cycle outerloop
          end if
        end do
        write(*,*) myrank, "Neighbordomain", neighborDomains(i)%domainID, &
          " wants node", neighborDomains(i)%nodesToSend(j), &
          " but we don't own this node"
        stop
      end do outerloop
    end do

    call createMPITypes()
  end subroutine exchangeGhostIds

  !> this collects all data which depends on ghost information
  !> alter: ne, INE, x, y, z, ielg
  subroutine postPartition2(INE_global)
    use yowElementpool, only: ne, ne_global, INE, belongto, ielg
    use yowerr, only: parallel_abort
    use yowDatapool, only: myrank
    use yowNodepool, only: np_global, np, nodes_global, iplg, t_Node, ghostlg, ng, npa
    use yowNodepool, only: x, y, z
    use w3gdatmd, only: xgrd, ygrd, zb
    integer, intent(in) :: INE_global(3,ne_global)
    integer :: i, j, k, stat, IP_glob
    type(t_Node), pointer :: node
    logical :: assigned

    ! to find the local elements, iterate over all global elements and check their node domain IDs
    ! step 1: calc the number of local elements
    ne = 0
    do i=1, ne_global
      if (belongto(INE_global(:,i))) then
        ne = ne +1
      endif
    end do

    ! step 2: fill the local element index array
    if(allocated(INE)) deallocate(INE)
    allocate(INE(3, ne), stat=stat)
    if(stat/=0) call parallel_abort('INE allocation failure')
    INE = 0
    ne = 0
    do i=1, ne_global
      ! does this element belong to this domain?
      if (belongto(INE_global(:,i))) then
        ne = ne + 1
        do j=1, 3
          assigned = .false.
          node => nodes_global(INE_global(j,i))
          if(node%domainID == myrank+1) then
            INE(j, ne) = node%id
            assigned = .true.
          else
            ! the element has a ghost node
            !> \todo create some localnode to localghost mapping
            !> What number is this ghost
            do k=1, ng
              if(node%id_global == ghostlg(k)) then
                ! conversion: the ghost nodes are stored behind the local nodes.
                if(INE(j,ne) /= 0) then
                  write(*,*) "will write to INE(j, ne) but there is already a value", j, ne, INE(j, ne)
                endif
                INE(j, ne) = np + k
                node%id = np+k
                assigned = .true.
!                write(*,*) myrank, "node to ele", node%id_global-1, i-1, np+k
                exit
              endif
            end do
          endif
          if(assigned .eqv. .false.) then
            write(*,*) "Can't assign global node to INE", node%id_global
          endif
        end do
      endif
    end do

    ! check that INE contains no 0.
    do i=1, ne
      if(MINVAL(ABS(INE(:,i))) == 0) then
        write(*,*) "0 in INE ne=", ne
        stop "0 in INE"
      endif
    end do
    if(MAXVAL(INE) /= npa) then
      write(*,*) "MAXVAL(INE) /= npa ERROR?"
    endif
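    ! Illustrative sketch (comment only, hypothetical ids): for a local element
    ! whose global nodes are (5, 8, 9), where 5 and 8 are owned here with local
    ! ids 2 and 3, and 9 is the first ghost (ghostlg(1) = 9), the loop above
    ! stores
    !   INE(:,ie) = (/ 2, 3, np+1 /)
    ! i.e. ghosts are numbered np+1 .. npa, directly behind the owned nodes.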
    ! create the element local to global mapping ielg
    if(allocated(ielg)) deallocate(ielg)
    allocate(ielg(ne), stat=stat)
    if(stat/=0) call parallel_abort('ielg allocation failure')
    ielg = 0
    j = 0
    do i=1, ne_global
      if (belongto(INE_global(:,i))) then
        j = j +1
        ielg(j) = i
      end if
    end do

    ! fill the local x,y,z arrays
    if(allocated(x)) deallocate(x)
    allocate(x(npa), stat=stat)
    if(stat/=0) call parallel_abort('x allocation failure')
    if(allocated(y)) deallocate(y)
    allocate(y(npa), stat=stat)
    if(stat/=0) call parallel_abort('y allocation failure')
    if(allocated(z)) deallocate(z)
    allocate(z(npa), stat=stat)
    if(stat/=0) call parallel_abort('z allocation failure')

    do i=1, np
      IP_glob = iplg(i)
      x(i) = xgrd(1,IP_glob)
      y(i) = ygrd(1,IP_glob)
      z(i) = zb(IP_glob)
    end do
    do i=1, ng
      IP_glob = ghostlg(i)
      x(np+i) = xgrd(1,IP_glob)
      y(np+i) = ygrd(1,IP_glob)
      z(np+i) = zb(IP_glob)
    end do
  end subroutine postPartition2

  !**********************************************************************
  !*                                                                    *
  !**********************************************************************
  subroutine ComputeTRIA_IEN_SI_CCON
    use yowElementpool, only: ne, ne_global, INE, ielg
    use yowExchangeModule, only : PDLIB_exchange1Dreal
    use yowerr, only: parallel_abort
    use yowDatapool, only: myrank
    use yowNodepool, only: np_global, np, iplg, t_Node, ghostlg, ng, npa
    use yowNodepool, only: x, y, z, PDLIB_SI, PDLIB_IEN, PDLIB_TRIA, PDLIB_CCON, PDLIB_TRIA03
    integer I1, I2, I3, stat, IE, NI(3)
    real(rkind) :: DXP1, DXP2, DXP3, DYP1, DYP2, DYP3, DBLTMP, TRIA03
    logical :: CROSSES_DATELINE

    allocate(PDLIB_SI(npa), PDLIB_CCON(npa), PDLIB_IEN(6,ne), PDLIB_TRIA(ne), PDLIB_TRIA03(ne), stat=stat)
    if(stat/=0) call parallel_abort('SI allocation failure')
    PDLIB_SI(:)   = 0.0d0   ! Median dual patch area of each node
    PDLIB_CCON(:) = 0       ! Number of connected elements

    DO IE = 1 , ne
      I1 = INE(1,IE)
      I2 = INE(2,IE)
      I3 = INE(3,IE)
      NI = INE(:,IE)
      DXP1=x(I2) - x(I1)
      DYP1=y(I2) - y(I1)
      DXP2=x(I3) - x(I2)
      DYP2=y(I3) - y(I2)
      DXP3=x(I1) - x(I3)
      DYP3=y(I1) - y(I3)
      CALL ELEMENT_CROSSES_DATELINE(DXP1, DXP2, DXP3, CROSSES_DATELINE)
      IF (CROSSES_DATELINE) THEN
        CALL CORRECT_DX_GT180(DXP1)
        CALL CORRECT_DX_GT180(DXP2)
        CALL CORRECT_DX_GT180(DXP3)
      ENDIF
      PDLIB_IEN(1,IE) = - DYP2
      PDLIB_IEN(2,IE) =   DXP2
      PDLIB_IEN(3,IE) = - DYP3
      PDLIB_IEN(4,IE) =   DXP3
      PDLIB_IEN(5,IE) = - DYP1
      PDLIB_IEN(6,IE) =   DXP1
      DBLTMP = (DXP3*DYP1 - DYP3*DXP1)*0.5
      PDLIB_TRIA(IE) = DBLTMP
      IF (PDLIB_TRIA(IE) .lt. TINY(1.)) THEN
        WRITE(*,*) PDLIB_IEN(:,IE)
        WRITE(*,*)
        WRITE(*,*) 'AREA SMALLER ZERO IN PDLIB', IE, NE, PDLIB_TRIA(IE)
        STOP
      ENDIF
      PDLIB_CCON(I1) = PDLIB_CCON(I1) + 1
      PDLIB_CCON(I2) = PDLIB_CCON(I2) + 1
      PDLIB_CCON(I3) = PDLIB_CCON(I3) + 1
      TRIA03 = PDLIB_TRIA(IE)/3.d0
      PDLIB_SI(I1) = PDLIB_SI(I1) + TRIA03
      PDLIB_SI(I2) = PDLIB_SI(I2) + TRIA03
      PDLIB_SI(I3) = PDLIB_SI(I3) + TRIA03
      PDLIB_TRIA03(IE) = TRIA03
    ENDDO
    CALL PDLIB_exchange1Dreal(PDLIB_SI)
  end subroutine ComputeTRIA_IEN_SI_CCON

  !**********************************************************************
  !*                                                                    *
  !**********************************************************************
  subroutine ELEMENT_CROSSES_DATELINE(RX1, RX2, RX3, CROSSES_DATELINE)
    ! Purpose: determine whether an element crosses the dateline.
    ! The inputs are the zonal differences along the element edges; for an
    ! element crossing the dateline (e.g. one node near lon = 180 and the
    ! others near lon = -180), exactly two of them exceed 180 in magnitude.
    REAL(rkind), INTENT(IN) :: RX1, RX2, RX3
    LOGICAL, INTENT(OUT) :: CROSSES_DATELINE
    INTEGER :: R1GT180, R2GT180, R3GT180
    R1GT180 = MERGE(1, 0, ABS(RX1).GT.180)
    R2GT180 = MERGE(1, 0, ABS(RX2).GT.180)
    R3GT180 = MERGE(1, 0, ABS(RX3).GT.180)
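    ! Illustrative sketch (comment only, hypothetical longitudes): for a
    ! triangle with node longitudes (179, -179, 178), the edge differences are
    ! (-358, 357, 1), so exactly two of them exceed 180 in magnitude and the
    ! element is flagged as crossing the dateline below.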
    ! if R1GT180+R2GT180+R3GT180 .eq. 0 the element does not cross the dateline
    ! if R1GT180+R2GT180+R3GT180 .eq. 1 the element contains the pole
    ! if R1GT180+R2GT180+R3GT180 .eq. 2 the element crosses the dateline
    CROSSES_DATELINE = R1GT180+R2GT180+R3GT180 .EQ. 2
  end subroutine ELEMENT_CROSSES_DATELINE

  !**********************************************************************
  !*                                                                    *
  !**********************************************************************
  subroutine CORRECT_DX_GT180(DXP)
    ! Purpose: the absolute zonal distance between 2 points is always <= 180.
    ! This subroutine corrects the zonal distance to satisfy this requirement.
    REAL(rkind), INTENT(INOUT) :: DXP
    IF (DXP .le. -180) THEN
      DXP=DXP + 360
    END IF
    IF (DXP .ge. 180) THEN
      DXP=DXP - 360
    END IF
  end subroutine CORRECT_DX_GT180

  !**********************************************************************
  !*                                                                    *
  !**********************************************************************
  subroutine ComputeIA_JA_POSI_NNZ
    use yowElementpool, only: ne, ne_global, INE, ielg
    use yowerr, only: parallel_abort
    use yowDatapool, only: myrank
    use yowNodepool, only: np_global, np, nodes_global, iplg, t_Node, ghostlg, ng, npa
    use yowNodepool, only: PDLIB_CCON, PDLIB_IA, PDLIB_JA, PDLIB_JA_IE, PDLIB_IA_P, PDLIB_JA_P
    use yowNodepool, only: PDLIB_NNZ, PDLIB_POSI, PDLIB_IE_CELL, PDLIB_POS_CELL, PDLIB_IE_CELL2
    use yowNodepool, only: PDLIB_POS_CELL2, PDLIB_I_DIAG
    integer CHILF(npa)
    integer istat
    integer MAXMNECON
    integer IE, J, I, IP, K, IP_J, IP_K, IP_I
    integer I1, I2, I3, POS, POS_J, POS_K
    integer COUNT_MAX
    integer, allocatable :: CELLVERTEX(:,:,:), PTABLE(:,:)
    integer :: ITMP(npa)

    MAXMNECON = MAXVAL(PDLIB_CCON)
    ALLOCATE(CELLVERTEX(npa,MAXMNECON,2), stat=istat)
    IF (istat/=0) CALL PARALLEL_ABORT('ComputeIA_JA_POSI_NNZ, allocate error 4')
!    CELLVERTEX(:,:,:) = 0
    CHILF = 0
    DO IE = 1, ne
      DO J=1,3
        I = INE(J,IE)
        CHILF(I) = CHILF(I)+1
        CELLVERTEX(I,CHILF(I),1) = IE
        CELLVERTEX(I,CHILF(I),2) = J
      END DO
    ENDDO
!
! Emulate the loop structure and count the max. entries in the different pointers that have to be defined
!
    J = 0
    DO IP = 1, npa
      DO I = 1, PDLIB_CCON(IP)
        J = J + 1
      END DO
    END DO
    COUNT_MAX = J

    ALLOCATE (PDLIB_IE_CELL(COUNT_MAX), PDLIB_POS_CELL(COUNT_MAX), stat=istat)
    IF (istat/=0) CALL PARALLEL_ABORT('wwm_fluctsplit, allocate error 5a')
    ALLOCATE (PDLIB_IE_CELL2(MAXMNECON,NPA), PDLIB_POS_CELL2(MAXMNECON, NPA), stat=istat)
    IF (istat/=0) CALL PARALLEL_ABORT('wwm_fluctsplit, allocate error 5b')

    ! Just a remapping from CELLVERTEX ... Element number in the
    ! order of the occurrence in the loop during runtime
    PDLIB_IE_CELL = 0
    ! Just a remapping from CELLVERTEX ... Position of the node
    ! in the element index -"-
    PDLIB_POS_CELL = 0

    J = 0
    DO IP = 1, npa
      DO I = 1, PDLIB_CCON(IP)
        J = J + 1
        PDLIB_IE_CELL(J) = CELLVERTEX(IP,I,1)
        PDLIB_POS_CELL(J) = CELLVERTEX(IP,I,2)
        PDLIB_IE_CELL2(I,IP) = CELLVERTEX(IP,I,1)
        PDLIB_POS_CELL2(I,IP) = CELLVERTEX(IP,I,2)
      END DO
    END DO
    deallocate(CELLVERTEX)

    ALLOCATE(PTABLE(COUNT_MAX,7), stat=istat)
    IF (istat/=0) CALL PARALLEL_ABORT('wwm_fluctsplit, allocate error 6')
    ALLOCATE(PDLIB_JA_IE(3,3,ne), stat=istat)
    IF (istat/=0) CALL PARALLEL_ABORT('wwm_fluctsplit, allocate error 6.1')

    J = 0
    PTABLE(:,:) = 0.
    DO IP = 1, npa
      DO I = 1, PDLIB_CCON(IP)
        J = J + 1
        IE  = PDLIB_IE_CELL(J)
        POS = PDLIB_POS_CELL(J)
        I1 = INE(1,IE)
        I2 = INE(2,IE)
        I3 = INE(3,IE)
        IF (POS == 1) THEN
          POS_J = 2
          POS_K = 3
        ELSE IF (POS == 2) THEN
          POS_J = 3
          POS_K = 1
        ELSE
          POS_J = 1
          POS_K = 2
        END IF
        IP_I = IP
        IP_J = INE(POS_J,IE)
        IP_K = INE(POS_K,IE)
        PTABLE(J,1) = IP_I   ! Node numbers of the connected elements
        PTABLE(J,2) = IP_J
        PTABLE(J,3) = IP_K
        PTABLE(J,4) = POS    ! Position of the nodes in the element index
        PTABLE(J,5) = POS_J
        PTABLE(J,6) = POS_K
        PTABLE(J,7) = IE     ! Element numbers, same as PDLIB_IE_CELL
      END DO
    END DO
!
! Count the number of nonzero entries in the matrix ...
! Basically, each connected element may have two off-diagonal
! contributions and one diagonal entry related to the connected vertex itself ...
!
    J = 0
    PDLIB_NNZ = 0
    ITMP = 0
    DO IP = 1, npa
      ITMP = 0
      DO I = 1, PDLIB_CCON(IP)
        J = J + 1
        IP_J = PTABLE(J,2)
        IP_K = PTABLE(J,3)
        ITMP(IP) = 1
        ITMP(IP_J) = 1
        ITMP(IP_K) = 1
      END DO
      PDLIB_NNZ = PDLIB_NNZ + SUM(ITMP)
    END DO
!
! Allocate the sparse matrix pointers using the Compressed Sparse Row (CSR) format ... this is for now done only for npa nodes.
! The next step is to do it for the whole matrix npa * MSC * MDC
! see ...:x
!
    ALLOCATE (PDLIB_JA(PDLIB_NNZ), PDLIB_IA(npa+1), PDLIB_JA_P(PDLIB_NNZ), stat=istat)
    IF (istat/=0) CALL PARALLEL_ABORT('wwm_fluctsplit, allocate error 6a')
    ALLOCATE (PDLIB_IA_P(npa+1), PDLIB_POSI(3,COUNT_MAX), PDLIB_I_DIAG(npa), stat=istat)
    IF (istat/=0) CALL PARALLEL_ABORT('wwm_fluctsplit, allocate error 6b')
    PDLIB_JA   = 0
    PDLIB_IA   = 0
    PDLIB_JA_P = 0
    PDLIB_IA_P = 0
    PDLIB_POSI = 0

    ! Points to the position of the matrix entry in the mass matrix
    ! according to the CSR matrix format, see p. 124
    J = 0
    K = 0
    PDLIB_IA (1) = 1
    PDLIB_IA_P(1) = 0
    DO IP = 1, npa               ! Run through all rows
      ITMP=0
      DO I = 1, PDLIB_CCON(IP)   ! Check how many entries there are ...
        J = J + 1
        IP_J = PTABLE(J,2)
        IP_K = PTABLE(J,3)
        ITMP(IP) = 1
        ITMP(IP_J) = 1
        ITMP(IP_K) = 1
      END DO
      DO I = 1, npa              ! Run through all columns
        IF (ITMP(I) .GT. 0) THEN
          K = K + 1
          PDLIB_JA(K) = I
          PDLIB_JA_P(K) = I-1
        END IF
      END DO
      PDLIB_IA (IP + 1) = K + 1
      PDLIB_IA_P(IP + 1) = K
    END DO

    PDLIB_POSI = 0
    J = 0
    DO IP = 1, npa
      DO I = 1, PDLIB_CCON(IP)
        J = J + 1
        IP_J = PTABLE(J,2)
        IP_K = PTABLE(J,3)
        DO K = PDLIB_IA(IP), PDLIB_IA(IP+1) - 1
          IF (IP   == PDLIB_JA(K)) PDLIB_POSI(1,J) = K
          IF (IP   == PDLIB_JA(K)) PDLIB_I_DIAG(IP) = K
          IF (IP_J == PDLIB_JA(K)) PDLIB_POSI(2,J) = K
          IF (IP_K == PDLIB_JA(K)) PDLIB_POSI(3,J) = K
        END DO
      END DO
    END DO

    J=0
    DO IP=1,npa
      DO I = 1, PDLIB_CCON(IP)
        J = J + 1
        IE  = PDLIB_IE_CELL(J)
        POS = PDLIB_POS_CELL(J)
        I1 = PDLIB_POSI(1,J)
        I2 = PDLIB_POSI(2,J)
        I3 = PDLIB_POSI(3,J)
        PDLIB_JA_IE(POS,1,IE) = I1
        PDLIB_JA_IE(POS,2,IE) = I2
        PDLIB_JA_IE(POS,3,IE) = I3
      END DO
    END DO
    deallocate(PTABLE)
  end subroutine ComputeIA_JA_POSI_NNZ

  !**********************************************************************
  !*                                                                    *
  !**********************************************************************
  subroutine finalizePD()
    use yowExchangeModule, only: finalizeExchangeModule
    use yowNodepool, only: finalizeNodepool
    use yowElementpool, only: finalizeElementpool
    use yowRankModule, only: finalizeRankModule

    call finalizeRankModule()
    call finalizeExchangeModule()
    call finalizeElementpool()
    call finalizeNodepool()
  end subroutine finalizePD
end module yowpdlibMain
!**********************************************************************
!*                                                                    *
!**********************************************************************
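! Illustrative usage sketch (comment only, not compiled): a minimal driver for
! this module, under the assumption that the w3gdatmd grid arrays (xgrd, ygrd,
! zb) have already been filled by WW3, since runParmetis and postPartition2
! read them. The 4-node/2-triangle mesh and MPI_COMM_WORLD are placeholders.
!
!   program pdlib_demo
!     use yowpdlibMain, only: initFromGridDim, finalizePD
!     use MPI
!     implicit none
!     integer :: ierr, INE_demo(3,2)
!     call MPI_Init(ierr)
!     INE_demo(:,1) = (/ 1, 2, 3 /)   ! triangle 1
!     INE_demo(:,2) = (/ 2, 4, 3 /)   ! triangle 2
!     call initFromGridDim(4, 2, INE_demo, 1, MPI_COMM_WORLD)
!     call finalizePD()
!     call MPI_Finalize(ierr)
!   end program pdlib_demo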