!     gfortran -ffree-line-length-none -O2 -static -static-libgfortran -finit-local-zero -fopenmp -o test1 test1.f90 
!     pgf90 -O2 -mcmodel=medium -Mbounds -mp -o test1 test1.f90
!     ifort -g -mcmodel=medium  -Bstatic -assume byterecl -qopenmp -o test1 test1.f90

!Warning: best to check if the parallel loop size >  # of threads. When they r equal, strange things happen on gcc.
!Accroding to some ref, dynamic arrays (alloc) can only be shared

      module glbl
      implicit none

      integer, save :: n
      real*8, save, allocatable :: a2d(:,:)
      end module

      program main
      use glbl
      implicit none
      real*8, parameter :: a1=1
      real*8, parameter :: b1=-1
      integer :: i,j,tid,nth,nth0,omp_get_thread_num,omp_get_num_threads,icount, & !,icount(2)
     &imax,jmax,imax2,jmax2
      real*8 :: sum1,sum2,rmax1,rmax2,rmax3
      real*8, allocatable :: aa(:,:),bb(:),cc(:),dd(:,:),ee(:),ff(:),ww(:,:)
      real*8 :: tau(3),tmp1

!     Find # of threads
!$OMP parallel default(shared)
!$OMP master
!$      nth0=omp_get_num_threads()
!$OMP end master
!$OMP end parallel

      n=5
      allocate(aa(3,n),bb(n),cc(3),dd(n,n),ee(n),ff(n*n),a2d(n,10),ww(3,n))
      bb=-1.
      do j=1,10
        do i=1,n
          a2d(i,j)=i-2*j
        enddo !i
      enddo !j

      sum1=0; sum2=0
!$OMP parallel if(n>nth0) default(shared) private(i,j,tid,nth,imax2,jmax2,rmax2,tau)
!$      tid=omp_get_thread_num()
!$      nth=omp_get_num_threads() !=1 if n<=nth0!!
!        if(nth/=nth0) then
!          print*, '# of threads mismatch,',nth,nth0
!          stop
!        endif
!$      print*, 'from thread ',tid,nth

!     The following can be init'ed outside parallel region also
!     Since they are shared, they become known to all threads after this
!$OMP master
      rmax1=-huge(1.d0)
      rmax3=rmax1
      icount=0
      imax=-1; jmax=-1
!no implied barriers for master construct
!$OMP end master
!$OMP barrier

!     Although the outer loop is parallel (and executed in random order), the inner loop is serial and
!     is executed in the intended order for correctness
!$OMP do
      do i=1,n
        ww(1,i)=-i !bottom
        do j=2,3
          ww(j,i)=ww(j-1,i)+1
        enddo !j
      enddo !i
!$OMP end do

!$OMP do reduction(+: sum1,sum2)
      !reduction vars cannot be an elem of an array e.g. rmax1(3)
      do i=1,n
        do j=1,3
          tau(j)=i+j
          print*, 'each thread',tid,i,j,tau(j)

          if(i<4.and.j<2) then
            aa(j,i)=bb(i)-n-i+j+a1
            sum1=sum1+aa(j,i)
          else
            aa(j,i)=i+j-b1
            sum2=sum2+aa(j,i)
          endif
        enddo !j
      enddo !i
!$OMP end do nowait

!$OMP workshare
      bb(1:n:2)=bb(1:n:2)-1; tmp1=maxval(abs(ww))
!$OMP end workshare

!$OMP do reduction(max: rmax1)
      do i=1,n
        imax2=-1; jmax2=-1;  rmax2=-huge(1.d0)
        do j=1,3
          if(i<5.and.j/=3) then
            rmax1=max(rmax1,aa(j,i))
!$OMP       atomic
!            icount(1)=icount(1)+1 <-atomic option cannot be applied to array elem!
            icount=icount+1 

            !Find loc in non-parallel dimension first
            if(aa(j,i)>rmax2) then
              rmax2=aa(j,i)
              imax2=i; jmax2=j
            endif
          endif !i<5.a
        enddo !j

!$OMP   critical
        if(i<5.and.rmax2>rmax3) then
          rmax3=rmax2
          imax=imax2; jmax=jmax2
        endif !aa
!$OMP   end critical
      enddo !i
!$OMP end do

!$OMP end parallel 

!     Only master thread in serial portion
       print*, 'max(ww)=',maxval(abs(ww)),tmp1
      do i=1,n
        print*, i,'ww=',ww(:,i)
        print*, i,'aa=',aa(:,i)
      enddo !i
      print*, 'sum1,2=',sum1,sum2,sum(aa)  
      print*, 'max, loc=',rmax1,rmax3,icount,imax,jmax

      print*, 'bb=',bb

!     Matrix-vector product
      call mxv(3,n,aa,bb,cc)

      print*, 'aa*bb=',cc

!     Another Matrix-vector product (no race condition)
!$OMP parallel default(shared) private(i,j,nth)
!$      nth=omp_get_num_threads()
!$      print*, 'nth(new)=',nth
!$OMP workshare
      ff=(/((i-j,i=1,n),j=1,n)/)
      dd(:,:)=reshape(ff,(/n,n/)) !transpose of the final matrix D for cache efficiency
      ee(:)=0
!$OMP end workshare

!$OMP master
      do i=1,n
        print*, i,'dd(,i)=',dd(:,i)
      enddo !i
!$OMP end master
!no implied barriers for master construct

!$OMP do schedule(guided)
      do i=1,n
        do j=1,n
          !Note the order of indices for dd; no race
          ee(i)=ee(i)+dd(j,i)*bb(j)
        enddo !j
      enddo !i
!$OMP end do
!$OMP end parallel

      print*, 'transpose(dd)*bb=',ee

!     Routine call
!$OMP parallel default(private) shared(n,ee)
      nth=omp_get_num_threads()
      tid=omp_get_thread_num()
!$OMP do
      do i=1,n
        print*, 'tst2:',tid,i
        call tst2(i,tid,ee)
      enddo !i
!$OMP end do
!$OMP end parallel

!     Nested loop
      bb=0
!$OMP parallel default(shared) private(j,i)
      do j=1,5
!$OMP   do
        do i=1,n
          bb(i)=bb(i)+i-j
        enddo !i
!$OMP   end do
      enddo !j
!$OMP end parallel

!     Compare with serial
      ee=0
      do j=1,5
        do i=1,n
          ee(i)=ee(i)+i-j
        enddo !i
      enddo !j

      print*, 'After nested loop:',bb
      print*, 'After nested loop (serial):',ee

      stop
      end

      subroutine tst2(i,tid,ee)
      use glbl
      implicit none
      integer, intent(in) :: i,tid
      real*8, intent(in) :: ee(n)

      integer :: i2,j

      i2=i*i
      write(99,*)'output from tst2 and thread ',tid,i,n,ee
      write(99,*)'i2=',i2 !still private
      do j=1,n
        write(99,*)j,'a2d:',a2d(j,:) !global a2d accesible, but better not to write to it!
      enddo !j

      end subroutine tst2

      subroutine mxv(m,n,aa,bb,cc)
      implicit none
      integer, intent(in) :: m,n
      real*8, intent(in) :: aa(m,n),bb(n)
      real*8, intent(out) :: cc(m)

      integer :: i,j

!$OMP parallel default(shared) private(i,j)
!$OMP workshare
      cc(:)=0
!$OMP end workshare

!array reduction - each thread takes a chunk of j and do array sum (1:m)
!$OMP do reduction(+: cc)
      do j=1,n
!        do i=1,m
        cc(1:m)=cc(1:m)+aa(1:m,j)*bb(j)
!        enddo !i
      enddo !j
!$OMP end parallel

      end subroutine mxv