#!/usr/bin/env python
from contextlib import closing
import os, sys, time, getopt, re
import datetime
from six.moves import urllib
import ssl

from Canadian_Sites import Canadian_Sites


def main(argv):
    """
    Parse the command line arguments.

    Return: (output directory, site file)
    """
    outputdir = ''
    sitefile = ''
    try:
        opts, args = getopt.getopt(argv, "ho:s:", ["odir=", "sites="])
    except getopt.GetoptError:
        print( 'usage: canadian_flow_retrieval.py -o <outputdir> -s <sitefile>' )
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print( 'usage: canadian_flow_retrieval.py -o <outputdir> -s <sitefile>' )
            sys.exit()
        elif opt in ("-o", "--odir"):
            outputdir = arg
            if not os.path.exists( outputdir ):
                os.makedirs( outputdir )
        elif opt in ("-s", "--sites"):
            sitefile = arg
            if not os.path.exists( sitefile ):
                raise RuntimeError( 'FATAL ERROR: sitefile ' +
                                    sitefile + ' does not exist!' )
    print( 'Output dir is "' + outputdir + '"' )
    return (outputdir, sitefile)

#-----------------------------------------------------------
#
# Download real-time streamflow data from the Canadian "datamart"
# for a given list of stations.
#
def fetch_ca_sites( sites, odir ):
    good_sites = ()
    fail_sites = ()
    today = datetime.datetime.now()
    twodaysago = today - datetime.timedelta(days=2)
    #
    # Loop through each station
    #
    for site_id in sites:
        print( datetime.datetime.now(), end = " --- " )
        print( 'downloading ', site_id )
        #
        # Construct the query URL.
        #
        # 47 is the parameter id for discharge; see
        # https://wateroffice.ec.gc.ca/services/links_e.html
        #
        # Example URL:
        # https://wateroffice.ec.gc.ca/services/real_time_data/csv/inline?stations[]=02FF002&parameters[]=47&start_date=2023-11-28%2000:00:00&end_date=2023-11-30%2023:59:59
        #
        filename = site_id + '_hourly_hydrometric.csv'
        URL = 'https://wateroffice.ec.gc.ca/services/real_time_data/csv/inline?stations[]=' + \
              site_id + '&parameters[]=47&start_date=' + \
              twodaysago.strftime("%Y-%m-%d%%2000:00:00") + \
              '&end_date=' + today.strftime("%Y-%m-%d%%2023:59:59")
        localFile = odir + os.path.sep + filename
        print( datetime.datetime.now(), end = " --- " )
        print( "URL: " + URL )
        print( datetime.datetime.now(), end = " --- " )
        print( "local file: " + localFile )
        #
        # Connect to the server.  On success, remember the station id in
        # good_sites; on failure, remember it in fail_sites and continue.
        #
        try:
            lFile = urllib.request.urlretrieve(URL, localFile)
            good_sites = good_sites + (site_id,)
        except IOError as e:
            #print( 'WARNING: site : ', site_id, ' skipped - ', e.reason )
            fail_sites = fail_sites + (site_id,)
    return good_sites, fail_sites
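#
# Example usage of fetch_ca_sites (a sketch only; the station ids below are
# taken from the comments in this file, and the output directory is an
# assumption, not a path this script actually uses):
#
#   good, fail = fetch_ca_sites( ('02FF002', '02AB006'), '/tmp/ca_flow' )
#   print( 'good: ', good, ' failed: ', fail )
#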
#-----------------------------------------------------------
#
# Build the list of station ids whose files on the datamart are newer
# than our local copies.
#
#   province = 'ON' or 'QC' (Ontario or Quebec)
#   odir     = output directory
#
def build_download_list( province, odir ):
    #
    # Get the UTC offset, in seconds, then convert that to a
    # timedelta object.
    #
    tz_offset = datetime.timedelta(seconds=time.timezone)
    #
    # Define the regular expression patterns that will be used repeatedly
    # on the web page listing.
    # site_pattern identifies sites that are in the Great Lakes basin only
    # ("02" is the key); ts_pattern parses the modification timestamp.
    #
    site_pattern = re.compile(province + '_02' + '.{5}' + '_hourly_hydrometric.csv')
    ts_pattern = re.compile(r'[\d]{4}-[\d]{2}-[\d]{2} [\d]{2}:[\d]{2}')
    #
    # Define some useful date constants
    #
    missing_time = datetime.datetime(1000, 1, 1, 1, 1, 1, 1)
    future_offset = datetime.timedelta(days=99999)
    #
    # Get the directory listing from the web page and parse each line.
    # If it is a file entry, check the "modified time" against the local
    # file's timestamp.  If the modified time is more recent, the file
    # needs to be updated, so add its station id to the list.
    #
    total_sites = 0
    sitelist = list()
    urllib.request.urlcleanup()
    urlstring = 'https://dd.weather.gc.ca/hydrometric/csv/' + province + '/hourly/'
    print( datetime.datetime.now(), end = " --- " )
    print( urlstring )
    with closing(urllib.request.urlopen(urlstring)) as dirlisting:
        for line in dirlisting:
            s = line.decode('ascii')
            match_site = site_pattern.search(s)
            if match_site:
                total_sites += 1
                fname = match_site.group(0)  # e.g. "ON_02AB006_hourly_hydrometric.csv"
                site_id = fname[3:10]        # e.g. "02AB006"
                #
                # Get the timestamp of the existing local file with that same
                # id (if it exists).  Adjust to UTC, because the modification
                # times reported on the remote server are given in UTC.
                # If the remote file's timestamp is newer, it has been updated
                # since the last time we downloaded a file for that site.
                #
                local_file = odir + os.path.sep + fname
                l_mod_time = missing_time  # default invalid value, long ago
                if os.path.isfile(local_file):
                    try:
                        lfm = os.path.getmtime(local_file)  # local file mod time (float)
                        l_mod_time = datetime.datetime.fromtimestamp(lfm) + tz_offset
                    except OSError:
                        l_mod_time = missing_time
                #
                # Extract the modified time for the remote file from the html
                # dir listing.  The format (in testing, at least) is, e.g.,
                # "2018-08-16 18:46".  These are UTC times.
                #
                # If the time retrieval fails (or the formatting has changed
                # since this code was updated), the remote file is assigned a
                # modification date far into the future, which forces the file
                # to be flagged as "new" and needing to be updated.  The
                # resulting sitelist will contain the site ids of the files
                # that need to be updated.
                #
                match_ts = ts_pattern.search(s)
                if match_ts:
                    ds = match_ts.group(0)
                    try:
                        r_mod_time = datetime.datetime.strptime(ds, '%Y-%m-%d %H:%M')
                    except ValueError:
                        r_mod_time = datetime.datetime.now()
                else:
                    r_mod_time = l_mod_time + future_offset  # default future value
                #
                # Compare timestamps and add to the list if needed.
                #
                try:
                    if r_mod_time > l_mod_time:
                        sitelist.append(site_id)
                except Exception:
                    pass
    print( datetime.datetime.now(), end = " --- " )
    print(len(sitelist), ' of ', total_sites, ' need to be updated')
    return sitelist
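#
# Example usage of build_download_list (a sketch only; 'ON' comes from the
# comments above, and the output directory is an assumption).  The returned
# ids can then be passed to fetch_ca_sites():
#
#   stale = build_download_list( 'ON', '/tmp/ca_flow' )   # e.g. ['02AB006', ...]
#   good, fail = fetch_ca_sites( stale, '/tmp/ca_flow' )
#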
#-----------------------------------------------------------
#
# The main function to download real-time streamflow data.
#
def canadian_flow_retrieval( odir, sitefile ):
    #
    # This fixes the certificate verify error:
    # ERROR: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:777)
    #
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        # Legacy Python that doesn't verify HTTPS certificates by default
        pass
    else:
        # Handle target environment that doesn't support HTTPS verification
        ssl._create_default_https_context = _create_unverified_https_context
    #
    # How often to check for new files (in minutes).
    # It appears (by simple observation) that the files are updated about
    # every 30 minutes.  Not all files are updated at the same time, so an
    # interval of about 10 minutes seems like a reasonable value, but users
    # should adjust it to whatever they deem appropriate.
    #
    download_frequency = 10
    #
    # Time stamp when the process starts
    #
    loop_start = time.time()
    sites = Canadian_Sites( sitefile )
    #
    # Infinite loop.
    # For testing, disable the "while True:" line and enable one of the
    # loop-count lines instead.
    #
    lnum = 0
    # while True:
    # while lnum < 25:
    while lnum < 1:
        lnum = lnum + 1
        list_of_updated_sites = sites.getStationIds()
        #
        # How many files need to be updated?
        #
        upd_count = 0
        if list_of_updated_sites:
            upd_count = len(list_of_updated_sites)
        #
        # If the count > 0, update the files in the list.
        # Two tuples are returned:
        #   good_sites = sites that were successfully updated
        #   fail_sites = sites that failed when trying to update them
        #
        if upd_count > 0:
            good_sites, fail_sites = fetch_ca_sites( list_of_updated_sites, odir )
            try:
                i = len(good_sites)
            except TypeError:
                i = 0
            try:
                j = len(fail_sites)
            except TypeError:
                j = 0
            print( datetime.datetime.now(), end = " --- " )
            print('There were ', i, ' good downloads and ', j, ' failed downloads')
        #
        # Compute the correct amount of time to sleep before
        # doing this again.
        #
        loop_end = time.time()
        elapsed = loop_end - loop_start          # seconds
        sleep_time = (download_frequency * 60.0) - elapsed
        #
        # If the download took longer than the check interval, just sleep
        # for an arbitrary 10 seconds so that everything gets back to a
        # fully reset state.
        #
        if sleep_time <= 10:
            sleep_time = 10
        #
        # Sleep for the computed number of seconds.
        #
        # print('sleeping for ', sleep_time, ' seconds')
        # time.sleep( sleep_time )
        loop_start = time.time()

#-----------------------------------------------------------
#-----------------------------------------------------------
#MyDir = '/gpfs/hps3/ptmp/Zhengtao.Cui/CanDA/test1'
#MyDir = '/gpfs/hps3/ptmp/Zhengtao.Cui/wscxml2'
#canadian_flow_retrieval( odir )
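#
# Command line entry point (a sketch: the original script leaves the driver
# call commented out above, so this wiring is an assumption).  main() parses
# -o/-s and returns (outputdir, sitefile); canadian_flow_retrieval() then
# runs the download loop.
#
if __name__ == '__main__':
    odir, sitefile = main( sys.argv[1:] )
    canadian_flow_retrieval( odir, sitefile )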