#!/usr/bin/env python3

import sys
import os
import datetime

import netCDF4
import numpy
from numpy.ma import is_masked

# set to True to output more info
DEBUG = False

# constant values that will be used for every observation
MESSAGE_TYPE = 'ARGO'
ELEVATION = 'NA'
LEVEL = 'NA'
QC_STRING = '1'

# skip data if PRES_ADJUSTED_ERROR value is greater than this value
MAX_PRESSURE_ERROR = 20

"""Read and format the input 11-column observations:
(1)  string:  Message_Type
(2)  string:  Station_ID
(3)  string:  Valid_Time (YYYYMMDD_HHMMSS)
(4)  numeric: Lat (Deg North)
(5)  numeric: Lon (Deg East)
(6)  numeric: Elevation (msl)
(7)  string:  Var_Name (or GRIB_Code)
(8)  numeric: Level
(9)  numeric: Height (msl or agl)
(10) string:  QC_String
(11) numeric: Observation_Value
"""


def get_string_value(var):
    """!Get string value from NetCDF variable. The string variables are
    stored as bytes, so decode them and strip any surrounding whitespace.

    @param var NetCDF variable to read
    @returns string value from variable
    """
    return var[:].tobytes().decode('utf-8').strip()


def get_val_check_qc(nc_obj, field_name, idx_p, idx_l=None, error_max=None):
    """!Get field value unless quality control checks do not pass.
    The conditions to skip data are as follows:
    1) If {field_name}_QC is masked.
    2) If {field_name} is masked.
    3) If {field_name}_QC value is not equal to 1 (recommended by the
       ARGO docs).
    4) If error_max is set, skip if {field_name}_ERROR is not masked and
       greater than the error_max value.

    @param nc_obj NetCDF object
    @param field_name name of field to read
    @param idx_p index of profile (1st dimension of data)
    @param idx_l (optional) index of level (2nd dimension of data).
     Defaults to None, which assumes the field is not 2D
    @param error_max (optional) value to compare to {field_name}_ERROR.
     Skip if the error value is greater than this value
    @returns numpy.float64 field value if all QC checks pass or None
    """
    if idx_l is None:
        qc = nc_obj.variables[f'{field_name}_QC'][idx_p]
        field = nc_obj.variables[field_name][idx_p]
        data_str = f'{field_name}[{idx_p}]'
    else:
        qc = nc_obj.variables[f'{field_name}_QC'][idx_p][idx_l]
        field = nc_obj.variables[field_name][idx_p][idx_l]
        data_str = f'{field_name}[{idx_p}][{idx_l}]'

    if is_masked(qc):
        if DEBUG:
            print(f"Skip {data_str}: {field_name}_QC is masked")
        return None

    if is_masked(field):
        if DEBUG:
            print(f"Skip {data_str}: {field_name} is masked")
        return None

    if int(qc) != 1:
        if DEBUG:
            print(f"Skip {data_str}: {field_name}_QC value ({int(qc)}) != 1")
        return None

    if error_max is not None:
        err = nc_obj.variables.get(f'{field_name}_ERROR')
        if err is not None:
            err = err[idx_p] if idx_l is None else err[idx_p][idx_l]
            if not is_masked(err) and err > error_max:
                if DEBUG:
                    print(f"Skip {data_str}: {field_name}_ERROR > {error_max}")
                return None

    return numpy.float64(field)


def get_valid_time(ref_dt, nc_obj, idx):
    """!Get valid time by adding julian days to the reference date time.

    @param ref_dt datetime object of the reference date time
    @param nc_obj NetCDF object
    @param idx index of profile to read
    @returns string of valid time in YYYYMMDD_HHMMSS format
    """
    julian_days = nc_obj.variables['JULD'][idx]
    day_offset = datetime.timedelta(days=float(julian_days))
    valid_dt = ref_dt + day_offset
    return valid_dt.strftime('%Y%m%d_%H%M%S')
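
# For reference, a minimal sketch of the conversion get_valid_time() performs,
# assuming the usual ARGO reference time of 1950-01-01 00:00:00 (the actual
# reference is always read from the file's REFERENCE_DATE_TIME variable, not
# hard-coded):
#
#   >>> ref = datetime.datetime(1950, 1, 1)
#   >>> (ref + datetime.timedelta(days=25567.5)).strftime('%Y%m%d_%H%M%S')
#   '20200101_120000'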


def get_lat_lon(nc_obj, idx):
    """!Read latitude and longitude values from NetCDF file at profile index.

    @param nc_obj NetCDF object
    @param idx index of profile to read
    @returns tuple of lat and lon values as floats
    """
    return (numpy.float64(nc_obj.variables['LATITUDE'][idx]),
            numpy.float64(nc_obj.variables['LONGITUDE'][idx]))


if len(sys.argv) < 2:
    print(f"ERROR: {__file__} - Must provide at least 1 input file argument")
    sys.exit(1)

is_ok = True
input_files = []
for arg in sys.argv[1:]:
    if arg.endswith('debug'):
        print('Debugging output turned on')
        DEBUG = True
        continue

    input_file = os.path.expandvars(arg)
    if not os.path.exists(input_file):
        print(f'ERROR: Input file does not exist: {input_file}')
        is_ok = False
        continue

    input_files.append(input_file)

if not is_ok:
    sys.exit(1)

print(f'Number of input files: {len(input_files)}')

point_data = []
for input_file in input_files:
    print(f'Processing file: {input_file}')
    nc_in = netCDF4.Dataset(input_file, 'r')

    # get reference date time
    time_str = get_string_value(nc_in.variables['REFERENCE_DATE_TIME'])
    reference_date_time = datetime.datetime.strptime(time_str, '%Y%m%d%H%M%S')

    # get number of profiles and levels
    num_profiles = nc_in.dimensions['N_PROF'].size
    num_levels = nc_in.dimensions['N_LEVELS'].size

    new_point_data = []
    for index_p in range(num_profiles):
        # check QC and mask of JULD to skip profiles with bad time info
        if get_val_check_qc(nc_in, 'JULD', index_p) is None:
            continue

        valid_time = get_valid_time(reference_date_time, nc_in, index_p)
        station_id = get_string_value(
            nc_in.variables['PLATFORM_NUMBER'][index_p]
        )
        lat, lon = get_lat_lon(nc_in, index_p)

        # loop through levels
        for index_l in range(num_levels):
            # read pressure data to get height in meters of sea water (msw)
            height = get_val_check_qc(nc_in, 'PRES_ADJUSTED', index_p,
                                      index_l, error_max=MAX_PRESSURE_ERROR)
            if height is None:
                continue

            # get temperature and ocean salinity values
            for var_name in ('TEMP', 'PSAL'):
                observation_value = get_val_check_qc(
                    nc_in, f'{var_name}_ADJUSTED', index_p, index_l
                )
                if observation_value is None:
                    continue

                point = [
                    MESSAGE_TYPE,
                    station_id,
                    valid_time,
                    lat,
                    lon,
                    ELEVATION,
                    var_name,
                    LEVEL,
                    height,
                    QC_STRING,
                    observation_value,
                ]
                new_point_data.append(point)
                if DEBUG:
                    print(', '.join([str(val) for val in point]))

    point_data.extend(new_point_data)
    nc_in.close()

print(" point_data: Data Length:\t" + repr(len(point_data)))
print(" point_data: Data Type:\t" + repr(type(point_data)))
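
# Usage sketch (the input file name below is hypothetical). The script can be
# run directly for a quick check of the parsed observations; append 'debug'
# to print each 11-column point as it is built:
#
#   python3 read_argo.py nodc_D1901125_200.nc debug
#
# The point_data list built above follows the 11-column point observation
# format read by MET's Python embedding (e.g. via ascii2nc); see the MET
# User's Guide for the exact invocation, which is assumed here rather than
# shown.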