!-----------------------------------------------------------------------------
!
! tokenize.f90 -- Parse line of text into list of names and delimiters.
!
! This is a simplistic, general-application expression parser.
!
! Its original application is for the NCEP/PSL bias correction system.
! It is used to parse formula expressions in some of the program
! configuration files.
!
! 2022-apr-16	Original version.  By Dave Allured, NOAA/PSL/CIRES.
!
! This is a low level token parser that does not apply any
! sequence rules or higher level understanding to the token
! sequence.  Higher level interpretation is left up to the caller.
!
! Caller supplies a line of text, and a list of valid delimiter
! characters.
!
! The line is broken up into a list of consecutive tokens, which
! are returned to the caller as the primary output.  The output
! array is auto-allocated.
!
! The secondary output is a list of one-character codes of token
! types.  Names are returned as the letter "n", and delimiters
! are returned as themselves.  In other words, a pattern string
! is returned.  This simplifies interpretation and validation.
!
!     Pattern example for summing:        n=n+n+n
!     Pattern example for function call:  n=n(n,n)
!
! If the input string is empty or all spaces, then the returned
! token list holds a single blank token, and the returned pattern
! is also all spaces.
!
!-----------------------------------------------------------------------------
!
! Parsing rules:
!
! * Valid tokens consist of names and delimiters.  There are no
!   other possibilities.
!
! * A break between tokens is signalled by any listed delimiter,
!   or by a space character.
!
! * All delimiters except for spaces are treated as single-
!   character tokens.  Each delimiter occurrence is included in
!   the output token list, even if consecutive.
!
! * Spaces are treated specially.  They are never returned as
!   tokens, except for the single blank token that is returned
!   when the entire input line is blank.
!
! * Leading and trailing spaces are ignored when adjacent to
!   other tokens.
!
! * However, spaces between two names, with no other delimiter,
!   are treated as a break between two name tokens.
!
! * In this case, two consecutive name tokens are output, with
!   no intervening delimiter token.
!
! * No quoting is currently supported.
!
! * There is no special processing or detection for comments,
!   control characters, or other strange characters.
!
! * Every character in the fortran input string is processed
!   literally, according to the simple parsing rules above.
!
! * A consequence of these simple rules is that space and all
!   listed delimiter characters are prohibited within names.
!
!-----------------------------------------------------------------------------

module tokenize_mod
   implicit none
   private
   public :: tokenize

contains

!-----------------------------------------------------------------------------
!
! tokenize -- Split a line of text into name and delimiter tokens.
!
! Input:
!
!   line        Text to parse.  Trailing blanks are ignored.
!   delimiters  Set of single characters that act as delimiter
!               tokens.  A space always breaks tokens, whether or
!               not it is listed here, and is never a token itself.
!
! Output:
!
!   tokens      Allocated by this routine, one element per token, in
!               order of appearance.  The character length is taken
!               from the caller's actual argument; longer tokens are
!               truncated by normal fortran assignment.  If the line
!               contains no tokens, a single blank element is
!               returned.
!
!   pattern     One code character per token, in the same order:
!               "n" for a name, or the delimiter character itself.
!               Unused positions are blank.  If there are more
!               tokens than len (pattern), the surplus codes are
!               dropped; the pattern string is never written beyond
!               its declared length.  All tokens are still returned.
!
!-----------------------------------------------------------------------------

subroutine tokenize (line, delimiters, tokens, pattern)

   character(*), intent(in)               :: line
   character(*), intent(in)               :: delimiters

   character(*), intent(out), allocatable :: tokens(:)
   character(*), intent(out)              :: pattern

! Local variables.

   character(len (delimiters) + 1) :: breakers   ! space + all delimiters
   character(1) :: code                  ! current char, then pattern code

   integer :: pos                        ! current scan position in line
   integer :: brk                        ! offset of next breaking char
   integer :: line_len                   ! length of line, less trailing blanks
   integer :: pattern_len                ! capacity of caller's pattern string
   integer :: ti, ntokens

   integer, allocatable :: starts(:), ends(:)    ! token bounds within line

! Initialize.

   line_len    = len_trim (line)
   pattern_len = len (pattern)
   breakers    = ' ' // delimiters       ! every char that ends a name

   pattern = ' '                         ! blank pattern string to start
   ntokens = 0

! Every token has at least one character, so the token count
! can not exceed the trimmed line length.

   allocate (starts(line_len), ends(line_len))

! Main loop -- Each pass skips one space, or consumes one whole token.

   pos = 1

   do while (pos <= line_len)
      code = line(pos:pos)

      if (code == ' ') then              ! spaces only separate tokens
         pos = pos + 1
         cycle
      end if

      ntokens         = ntokens + 1      ! any non-blank starts a new token
      starts(ntokens) = pos

      if (index (delimiters, code) /= 0) then

! DELIMITER token.  Always a single character, and its pattern
! code is the delimiter character itself.

         ends(ntokens) = pos

      else

! NAME token.  Runs up to, but not including, the next space or
! delimiter, or else to the end of the trimmed line.
! The first character is known to be a non-breaker, so a non-zero
! scan result is always 2 or greater.

         brk = scan (line(pos:line_len), breakers)

         if (brk == 0) then
            ends(ntokens) = line_len
         else
            ends(ntokens) = pos + brk - 2
         end if

         code = 'n'
      end if

! Record the pattern code, but only if it fits.  Writing past the
! end of the caller's pattern string would be out of bounds.

      if (ntokens <= pattern_len) pattern(ntokens:ntokens) = code

      pos = ends(ntokens) + 1            ! resume just after this token
   end do

! End of line.  Copy the tokens into the output array.
! A blank line still returns one element, holding a blank token.

   allocate (tokens(max (ntokens, 1)))

   tokens(1) = ' '

   do ti = 1, ntokens
      tokens(ti) = line(starts(ti):ends(ti))
   end do

end subroutine tokenize
end module tokenize_mod