#!/usr/bin/env python
"""
Extract variables from GFS output and format for HEMCO (i.e., as if MERRA-2).
"""
from pathlib import Path

# MERRA-2 data looks like this:
# netcdf MERRA2.20190101.A1.05x0625 {
# dimensions:
#         time = 24 ;
#         lat = 361 ;
#         lon = 576 ;
# variables:
#         int time(time) ;
#                 time:calendar = "gregorian" ;
#                 time:long_name = "time" ;
#                 time:standard_name = "time" ;
#                 time:units = "minutes since 2019-01-01 00:00:00.0" ;
#                 time:delta_t = "0000-00-00 01:00:00" ;
#                 time:begin_date = "20190101" ;
#                 time:begin_time = "000000" ;
#                 time:time_increment = "010000" ;
#         float lat(lat) ;
#                 lat:long_name = "latitude" ;
#                 lat:standard_name = "latitude" ;
#                 lat:units = "degrees_north" ;
#         float lon(lon) ;
#                 lon:long_name = "longitude" ;
#                 lon:standard_name = "longitude" ;
#                 lon:units = "degrees_east" ;
#
# In xarray:
#
# Dimensions:  (time: 24, lat: 361, lon: 576)
# Coordinates:
#   * time     (time) datetime64[ns] 2019-01-01T00:30:00 ... 2019-01-01T23:30:00
#   * lat      (lat) float32 -90.0 -89.5 -89.0 -88.5 ... 88.5 89.0 89.5 90.0
#   * lon      (lon) float32 -180.0 -179.4 -178.8 -178.1 ... 178.1 178.8 179.4
# ...

M2_DS_ATTRS = {
    # "Title": "MERRA2 1-hour time-averaged parameters (A1), processed for GEOS-Chem input",
    # "Contact": "GEOS-Chem Support Team (geos-chem-support@as.harvard.edu)",
    # "References": "www.geos-chem.org; wiki.geos-chem.org",
    # "Filename": "MERRA2_Glob",
    # "History": "File generated on: 2019/03/02 17:56:33 GMT-0400",
    # "ProductionDateTime": "File generated on: 2019/03/02 17:56:33 GMT-0400",
    # "ModificationDateTime": "File generated on: 2019/03/02 17:56:33 GMT-0400",
    "Format": "NetCDF-4",
    "SpatialCoverage": "global",
    "Conventions": "COARDS",
    # "Version": "MERRA2",
    # "VersionID": "5.12.4",
    # "Nlayers": "72",
    # "Start_Date": "20190101",
    # "Start_Time": "00:00:00.0",
    # "End_Date": "20190101",
    # "End_Time": "23:59:59.99999",
    # "Delta_Time": "010000",
    "Delta_Lon": "0.625",
    "Delta_Lat": "0.5",
}

M2_TIME_ATTRS = {
    "long_name": "time",
    "standard_name": "time",
    # "delta_t": "0000-00-00 01:00:00",
    # "begin_date": "20190101",
    # "begin_time": "000000",
    # "time_increment": "010000",
}

M2_LAT_ATTRS = {
    "long_name": "latitude",
    "standard_name": "latitude",
    "units": "degrees_north",
}

M2_LON_ATTRS = {
    "long_name": "longitude",
    "standard_name": "longitude",
    "units": "degrees_east",
}

M2_DATA_VAR_INFO = {
    "T2M": {
        "gfs_name": "tmp2m",
        "attrs": {
            "long_name": "2-meter_air_temperature",
            "standard_name": "2-meter_air_temperature",
            "units": "K",
            "gamap_category": "GMAO-2D",
        },
    },
    "GWETROOT": {
        "gfs_name": "soilw4",
        "attrs": {
            "long_name": "root_zone_soil_wetness",
            "standard_name": "root_zone_soil_wetness",
            "units": "1",
            "gamap_category": "GMAO-2D",
        },
    },
    "PARDF": {
        "gfs_name": "vddsf_ave",
        "attrs": {
            "long_name": "surface_downwelling_par_diffuse_flux",
            "standard_name": "surface_downwelling_par_diffuse_flux",
            "units": "W m-2",
            "gamap_category": "GMAO-2D",
        },
    },
    "PARDR": {
        "gfs_name": "vbdsf_ave",
        "attrs": {
            "long_name": "surface_downwelling_par_beam_flux",
            "standard_name": "surface_downwelling_par_beam_flux",
            "units": "W m-2",
            "gamap_category": "GMAO-2D",
        },
    },
}

M2_DATA_VAR_OLD_TO_NEW = {d["gfs_name"]: k for k, d in M2_DATA_VAR_INFO.items()}

# https://github.com/NCAR/ccpp-physics/blob/c348f3e363f066c2c513b0449690859d3104bac8/physics/set_soilveg.f#L258
DRYSMC = [
    None,  # for vtype 0
    0.010, 0.025, 0.010, 0.010, 0.010, 0.010, 0.010, 0.010, 0.010, 0.010,
    0.010, 0.010, 0.010, 0.010, 0.010, 0.010, 0.010, 0.010, 0.010, 0.000,
    0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000,
]
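
# DRYSMC (above) and MAXSMC (below) are the dry and saturated volumetric soil
# moisture limits from CCPP's set_soilveg.f, indexed here by vtype to match
# the normalization loop later in this script, which rescales soilw4 to a
# 0-1 root-zone wetness:
#     wetness = (soilw4 - DRYSMC[vt]) / (MAXSMC[vt] - DRYSMC[vt])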
# https://github.com/NCAR/ccpp-physics/blob/c348f3e363f066c2c513b0449690859d3104bac8/physics/set_soilveg.f#L270
MAXSMC = [
    None,  # for vtype 0
    0.395, 0.421, 0.434, 0.476, 0.476, 0.439, 0.404, 0.464, 0.465, 0.406,
    0.468, 0.457, 0.464, 0.421, 0.200, 0.421, 0.457, 0.200, 0.395, 0.000,
    0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000,
]


def main(i_fps, o_fp):
    """
    Parameters
    ----------
    i_fps : list of Path
        GFS files to be loaded.
        Can be a single path whose file name is a glob expression.
    o_fp : Path, optional
        Desired path of the output file.
        If None, defaults to ``gfs-bio_YYYYMMDD.nc`` in the current directory.
    """
    import netCDF4 as nc
    import numpy as np
    from scipy.interpolate import interp1d

    if len(i_fps) == 1:
        maybe_glob = i_fps[0]
        if "?" in maybe_glob.name or "*" in maybe_glob.name:
            # Expand
            files = sorted(maybe_glob.parent.glob(maybe_glob.name))
        else:
            # Not a glob, just a single file
            files = [maybe_glob]
    else:
        files = sorted(i_fps)
    assert len(files) >= 2, "need at least 2 files for time interp and time calcs"

    #
    # Get old grid info from first GFS file
    #

    # The GFS files are 3-hourly and look like this:
    # netcdf gfs.t00z.sfcf030 {
    # dimensions:
    #         grid_xt = 3072 ;
    #         grid_yt = 1536 ;
    #         time = 1 ;
    # variables:
    #         double grid_xt(grid_xt) ;
    #                 grid_xt:cartesian_axis = "X" ;
    #                 grid_xt:long_name = "T-cell longitude" ;
    #                 grid_xt:units = "degrees_E" ;
    #         double lon(grid_yt, grid_xt) ;
    #                 lon:long_name = "T-cell longitude" ;
    #                 lon:units = "degrees_E" ;
    #         double grid_yt(grid_yt) ;
    #                 grid_yt:cartesian_axis = "Y" ;
    #                 grid_yt:long_name = "T-cell latiitude" ;
    #                 grid_yt:units = "degrees_N" ;
    #         double lat(grid_yt, grid_xt) ;
    #                 lat:long_name = "T-cell latitude" ;
    #                 lat:units = "degrees_N" ;
    #         double time(time) ;
    #                 time:long_name = "time" ;
    #                 time:units = "hours since 2022-11-30 00:00:00" ;
    #                 time:cartesian_axis = "T" ;
    #                 time:calendar_type = "JULIAN" ;
    #                 time:calendar = "JULIAN" ;
    # ...
    ds = nc.Dataset(files[0], "r")
    lat_gfs_deg = ds["grid_yt"][:]
    lon_gfs_deg = ds["grid_xt"][:]

    # grid_yt starts at the N Pole, so we are already ascending once converted to colat
    colat_gfs_deg = 90 - lat_gfs_deg
    colat_gfs = np.deg2rad(colat_gfs_deg)
    assert (np.diff(colat_gfs) > 0).all(), "colat should be ascending"
    assert colat_gfs.min() > 0 and colat_gfs.max() < np.pi

    # grid_xt is [0, 360), so we already meet the conditions
    lon_gfs = np.deg2rad(lon_gfs_deg)
    assert (np.diff(lon_gfs) > 0).all(), "lon should be ascending"
    assert -np.pi <= lon_gfs[0] < np.pi and lon_gfs[-1] <= lon_gfs[0] + 2 * np.pi
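
    # Note: the colat/lon domain checks above match the input requirements of
    # spherical regridders such as scipy.interpolate.RectSphereBivariateSpline
    # (an assumption about their intent; no regridding is actually done here).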

    # Soil type (so we can set non-soil points to 0 instead of 1)
    sotyp = ds["sotyp"][:].squeeze()

    # Veg type (needed for normalizing the soil water with respect to min/max)
    vtype = ds["vtype"][:].squeeze().astype(int)
    unique_vtypes = sorted(np.unique(vtype))

    # t0
    t0 = nc.num2date(ds["time"][0], units=ds["time"].units, calendar=ds["time"].calendar)

    ds.close()

    #
    # Get GFS file times and grid
    #
    gfs_times = []
    for i, fp in enumerate(files):
        ds = nc.Dataset(fp, "r")

        # Get time
        assert ds.dimensions["time"].size == 1
        t_num = ds["time"][0]
        t = nc.num2date(t_num, units=ds["time"].units, calendar=ds["time"].calendar)
        gfs_times.append(t_num)
        print(t_num, t, fp)

        # Get grid
        if i == 0:
            # TODO: move this up and instead check here that the grid is the same in all files
            gfs_lon_1d = ds["grid_xt"][:]
            gfs_lat_1d = ds["grid_yt"][:]
            gfs_time_units = ds["time"].units
            gfs_time_calendar = ds["time"].calendar
            gfs_time_dtype = ds["time"].dtype

        ds.close()

    assert (np.diff(gfs_lon_1d) > 0).all(), "lon should be ascending"
    lat_needs_flip = gfs_lat_1d[0] > gfs_lat_1d[-1]
    if lat_needs_flip:
        print("Will flip lat")

    #
    # Create and initialize new dataset
    #
    if o_fp is None:
        o_fp = Path.cwd() / t0.strftime(r"gfs-bio_%Y%m%d.nc")
    ds_new = nc.Dataset(o_fp, "w", format="NETCDF4")

    ds_new.title = "Biogenic inputs from GFS for NEXUS/HEMCO"
    ds_new.history = (
        "NOAA GFS data reformatted to fit the COARDS conventions "
        "and be used in NEXUS/HEMCO"
    )
    for k, v in M2_DS_ATTRS.items():
        ds_new.setncattr(k, v)

    ntime_gfs = len(files)  # e.g. 25 (0:3:72)
    ntime_m2 = int(gfs_times[-1] - gfs_times[0] + 1)  # e.g. 73 (0:1:72)
    # NOTE: ^ assumes GFS times are on the hour

    ds_new.createDimension("time", ntime_m2)
    time = ds_new.createVariable("time", gfs_time_dtype, ("time",))
    for k, v in M2_TIME_ATTRS.items():
        setattr(time, k, v)
    time.axis = "T"

    lat_dim = ds_new.createDimension("lat", gfs_lat_1d.size)
    lat = ds_new.createVariable("lat", gfs_lat_1d.dtype, ("lat",))
    lat[:] = gfs_lat_1d[::-1] if lat_needs_flip else gfs_lat_1d
    for k, v in M2_LAT_ATTRS.items():
        setattr(lat, k, v)
    lat.axis = "Y"

    lon_dim = ds_new.createDimension("lon", gfs_lon_1d.size)
    lon = ds_new.createVariable("lon", gfs_lon_1d.dtype, ("lon",))
    lon[:] = gfs_lon_1d
    for k, v in M2_LON_ATTRS.items():
        setattr(lon, k, v)
    lon.axis = "X"

    ds_new_pre = {}
    for vn, d in M2_DATA_VAR_INFO.items():
        var = ds_new.createVariable(vn, np.float32, ("time", "lat", "lon"))
        var[:] = 0
        for k, v in d["attrs"].items():
            setattr(var, k, v)
        ds_new_pre[vn] = np.empty((ntime_gfs, lat_dim.size, lon_dim.size), dtype=np.float32)

    #
    # Load variables
    #
    print("Loading variables")
    for i, (fp, t) in enumerate(zip(files, gfs_times)):
        print(f"{fp.as_posix()} ({t})")
        ds = nc.Dataset(fp, "r")
        for vn_old, vn_new in M2_DATA_VAR_OLD_TO_NEW.items():
            print(f"{vn_old} -> {vn_new}")
            data = ds[vn_old][:].squeeze()  # squeeze singleton time
            # TODO: deal with `.missing_value`/`_FillValue`? (both are set)
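            # One way to address the TODO (an untested sketch): netCDF4 returns
            # a masked array where `_FillValue`/`missing_value` applies, so the
            # mask could be filled before any further math, e.g.:
            #     if np.ma.isMaskedArray(data):
            #         data = data.filled(0)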
            if vn_old == "soilw4":
                # Set non-soil to 0 (from 1)
                # 0: Water
                # 16: Antarctica
                data[(sotyp == 0) | (sotyp == 16)] = 0

                # Normalize the soil moisture data based on min/max by vtype
                for vt in unique_vtypes:
                    if vt == 0 or vt > 19:
                        continue
                    max_vt = MAXSMC[vt]
                    min_vt = DRYSMC[vt]
                    assert 0 < min_vt < max_vt < 1, f"{vt} ({min_vt}, {max_vt})"
                    is_vt = vtype == vt
                    data[is_vt] = (data[is_vt] - min_vt) / (max_vt - min_vt)

            if vn_old == "soilw4":
                data_new = np.clip(data, 0, 1)
            else:
                data_new = np.clip(data, 0, None)  # no negatives

            ds_new_pre[vn_new][i, :, :] = data_new[::-1, :] if lat_needs_flip else data_new

        ds.close()

    #
    # Time interpolation of data vars and set times
    #
    gfs_times = np.array(gfs_times, dtype=gfs_time_dtype)
    gfs_is_hourly = (np.diff(gfs_times) == 1).all()
    assert (np.floor(gfs_times) == gfs_times).all(), "times should be on the hour"
    assert gfs_time_units.startswith("hours since ")
    m2_times = np.arange(gfs_times[0], gfs_times[-1] + 1, 1, dtype=gfs_time_dtype)
    assert m2_times.size == ntime_m2
    time[:] = m2_times
    time.calendar = gfs_time_calendar
    time.units = gfs_time_units

    x = gfs_times
    x_new = m2_times
    print("Time interp")
    if gfs_is_hourly:
        assert (gfs_times == m2_times).all()
        print(
            "(but the GFS input is already hourly, so we won't actually do time interp, "
            "just load variables)"
        )
    for vn in M2_DATA_VAR_INFO:
        print(vn)
        if gfs_is_hourly:
            tmp = ds_new_pre[vn]
        else:
            f = interp1d(x, ds_new_pre[vn], kind="linear", axis=0, copy=False, assume_sorted=True)
            tmp = np.clip(f(x_new), 0, None)
        ds_new[vn][:] = tmp

    print(f"Writing out new dataset to {o_fp.as_posix()}")
    ds_new.close()
    print("Done")

    return 0


def parse_args(argv=None):
    import argparse

    parser = argparse.ArgumentParser(
        description="Extract and format bio inputs for NEXUS from GFS data",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-i",
        "--input",
        type=Path,
        nargs="+",
        help=(
            "paths to the GFS files to be loaded. "
            "Can be an expanded or unexpanded glob expression. "
            "If unexpanded, the wildcards must be in the file name part of the path."
        ),
        required=True,
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        help="output file path. Defaults to gfs-bio_YYYYMMDD.nc if not specified.",
        required=False,
    )

    args = parser.parse_args(argv)

    return {
        "i_fps": args.input,
        "o_fp": args.output,
    }


if __name__ == "__main__":
    raise SystemExit(main(**parse_args()))
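
# Example invocations (the script and data file names here are hypothetical):
#     python gfs_bio.py -i "gfs.t00z.sfcf???.nc"      # quoted: the script expands the glob
#     python gfs_bio.py -i gfs.t00z.sfcf0*.nc -o out.nc  # unquoted: the shell expands it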