#!/usr/bin/env python3
###############################################################################################
#
#  final_lmp_cv_fcst.py - formatting utility
#
#  Purpose:
#      format lmp web data files with final forecast and relevant threshold
#
#  INPUT:
#      $1 = lmp u660 formatted file with cv prob forecasts (script keys on header names)
#      $2 = lmp u660 formatted file with cv thresholds (script keys on header names)
#
#  HISTORY:  DECEMBER 2017   SCHNAPP   MDL - CREATED
#            FEBRUARY 2021   HUANG     MDL - MODIFIED TO ADD CCG, CVS MELD PROBS AND THRESHOLDS
#
#  LIBRARY REQUIREMENTS:
#      Python 3.6
#      pandas (works with 0.20.1)
#      numpy  (works with 1.12.1)
#
###############################################################################################
import sys

import numpy as np
import pandas as pd


def final(f, t):
    """Build final forecast and threshold frames, keeping the highest-priority version of each element."""
    # forecast elements and versions in order of priority
    fcsts = {'CIG#2': ['CIGMD#2', 'CIGBP#2', 'CIGBS#2'],
             'CIG#3': ['CIGMD#3', 'CIGBP#3', 'CIGBS#3'],
             'CIG#5': ['CIGMD#5', 'CIGBP#5', 'CIGBS#5'],
             'CCG#2': ['CCGMD#2', 'CCGBP#2', 'CCGBS#2'],
             'CCG#3': ['CCGMD#3', 'CCGBP#3', 'CCGBS#3'],
             'CCG#5': ['CCGMD#5', 'CCGBP#5', 'CCGBS#5'],
             'VIS#2': ['VISMD#2', 'VISBP#2', 'VISBS#2'],
             'VIS#4': ['VISMD#4', 'VISBP#4', 'VISBS#4'],
             'VIS#5': ['VISMD#5', 'VISBP#5', 'VISBS#5'],
             'CVS#2': ['CVSMD#2', 'CVSBP#2', 'CVSBS#2'],
             'CVS#4': ['CVSMD#4', 'CVSBP#4', 'CVSBS#4'],
             'CVS#5': ['CVSMD#5', 'CVSBP#5', 'CVSBS#5'],
             'PPO': ['LMPPPOP', 'LMPPPOS'],
             'PRBFRZ': ['PRBFRZP', 'PRBFRZS'],
             'PRBSNW': ['PRBSNWP', 'PRBSNWS']}

    # an explicit mapping is needed because there is little consistency between
    # the prob labels and the corresponding thresh labels
    #                FCST       THRESH
    map_f_to_t = {
        # CIG
        'CIGMD#2': 'CIGM#2', 'CIGMD#3': 'CIGM#3', 'CIGMD#5': 'CIGM#5',
        'CIGBP#2': 'CIGP#2', 'CIGBP#3': 'CIGP#3', 'CIGBP#5': 'CIGP#5',
        'CIGBS#2': 'CIGS#2', 'CIGBS#3': 'CIGS#3', 'CIGBS#5': 'CIGS#5',
        # CONDITIONAL CIG
        'CCGMD#2': 'CCGM#2', 'CCGBP#2': 'CCGP#2', 'CCGBS#2': 'CCGS#2',
        'CCGMD#3': 'CCGM#3', 'CCGBP#3': 'CCGP#3', 'CCGBS#3': 'CCGS#3',
        'CCGMD#5': 'CCGM#5', 'CCGBP#5': 'CCGP#5', 'CCGBS#5': 'CCGS#5',
        # VIS
        'VISMD#2': 'VISM#2', 'VISMD#4': 'VISM#4', 'VISMD#5': 'VISM#5',
        'VISBP#2': 'VISP#2', 'VISBP#4': 'VISP#4', 'VISBP#5': 'VISP#5',
        'VISBS#2': 'VISS#2', 'VISBS#4': 'VISS#4', 'VISBS#5': 'VISS#5',
        # CONDITIONAL VIS
        'CVSMD#2': 'CVSM#2', 'CVSBP#2': 'CVSP#2', 'CVSBS#2': 'CVSS#2',
        'CVSMD#4': 'CVSM#4', 'CVSBP#4': 'CVSP#4', 'CVSBS#4': 'CVSS#4',
        'CVSMD#5': 'CVSM#5', 'CVSBP#5': 'CVSP#5', 'CVSBS#5': 'CVSS#5',
        # PPO
        'LMPPPOP': 'PPOP#1', 'LMPPPOS': 'PPOS#1',
        # FREEZING
        'PRBFRZP': 'FRZPRI', 'PRBFRZS': 'FRZSEC',
        # SNOW
        'PRBSNWP': 'SNWPRI', 'PRBSNWS': 'SNWSEC'}

    # lists of series for the output dataframes
    forecasts = [f['VARIABLE']]
    thresholds = [t['VARIABLE']]

    # iteratively check the forecasts in reversed order of priority,
    # filling in higher-priority forecasts where they exist
    for key in fcsts.keys():
        # fs and ts hold the forecasts and thresholds for a single forecast element
        # and category; initialize them with the lowest-priority forecast and threshold
        fs = f[fcsts[key][-1]].astype('float')
        ts = t[map_f_to_t[fcsts[key][-1]]].astype('float')
        for ver in reversed(fcsts[key]):
            # overwrite fs and ts with higher-priority data where it exists
            # (starting from the initialized series keeps the logic simple at
            # negligible performance cost)
            fs = f[ver].where(f[ver].astype('float') < 999, fs).astype('float')
            ts = t[map_f_to_t[ver]].where(f[ver].astype('float') < 999, ts).astype('float')
        fs.name = key
        ts.name = key
        forecasts.append(fs)
        thresholds.append(ts)

    df_f = pd.concat(forecasts, axis=1)
    df_t = pd.concat(thresholds, axis=1)

    return df_f, df_t
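
# The helper below is an illustrative sketch only and is never called by the script.  It shows,
# with toy data, the Series.where() priority-fill pattern used in final() above; the column
# names and the use of 999 as a missing-value sentinel are assumptions mirrored from final().
def _demo_priority_fill():
    """Illustrative example: fill a low-priority series with higher-priority values where valid."""
    low = pd.Series([10.0, 20.0, 30.0], name='low_priority')
    high = pd.Series([999.0, 5.0, 999.0], name='high_priority')  # 999 marks missing data
    # keep the high-priority value where it is valid (< 999); otherwise fall back to low
    merged = high.where(high < 999, low)
    return merged  # -> [10.0, 5.0, 30.0]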
def data_reader(file):
    '''
    IDENTIFY LOCATIONS OF HEADER SECTIONS.
    HEADERS HAVE THE FOLLOWING FORMAT...
        "\n"
        " DATA FOR YYYYMMDDHH FOR VARIABLE GROUP X"
        "\n"
        " VARIABLE  all variable headers...."
    HEADERS ARE FOLLOWED BY DATA.
    A DATA SECTION TERMINATES WHEN A NEW HEADER OR EOF FOLLOWS.
    '''
    frames = []
    with open(file, 'r') as fcsts:
        more_data = True
        while more_data:
            header_found = False
            for line in fcsts:
                if 'DATA FOR' in line:
                    # hit a header
                    next(fcsts)                        # skip the blank line that follows every DATA FOR line
                    head_labels = next(fcsts).split()  # record the header labels
                    header_found = True
                    break
            if not header_found:
                more_data = False
                break
            data = []
            for line in fcsts:
                if line.strip():
                    data.append(line.split()[1:])
                else:
                    break
            df = pd.DataFrame(data, columns=head_labels)
            frames.append(df)

    return frames


def data_writer(old_file, new_frames):
    newfile = old_file + '_cv'
    with open(old_file, 'r') as old, open(newfile, 'wb') as new:
        frame_index = 0
        for write_line in old:
            if 'CIG' in write_line:
                # write the formatted header
                cols = len(new_frames[frame_index].columns)
                header_fmt = '%10s ' + '%9s' * (cols - 2)
                np.savetxt(new, [new_frames[frame_index].columns.values[1:]], fmt=header_fmt)
                # write the formatted data
                data_fmt = '%5i %-7s' + '%8.3f ' * (cols - 2)
                np.savetxt(new, new_frames[frame_index], fmt=data_fmt)
                frame_index += 1
                # skip the old data section that was just replaced by the new frame
                skipping = True
                while skipping:
                    try:
                        line = next(old)
                    except StopIteration:
                        skipping = False
                        break
                    if not line.strip():
                        new.write(line.encode())
                        skipping = False
                        continue
            else:
                new.write(write_line.encode())


def main():
    fcst_file = sys.argv[1]
    thresh_file = sys.argv[2]

    # read data
    fcst_frames = data_reader(fcst_file)
    thresh_frames = data_reader(thresh_file)

    final_f_frames = []
    final_t_frames = []

    # perform data manipulation/organization
    for f_frame, t_frame in zip(fcst_frames, thresh_frames):
        final_f_frame, final_t_frame = final(f_frame, t_frame)
        # add the index as a column (starting at 1)
        final_f_frame.insert(0, 'idx', final_f_frame.index + 1)
        final_t_frame.insert(0, 'idx', final_t_frame.index + 1)
        # store all the dataframes in lists
        final_f_frames.append(final_f_frame)
        final_t_frames.append(final_t_frame)

    # write data
    data_writer(fcst_file, final_f_frames)
    data_writer(thresh_file, final_t_frames)


if __name__ == "__main__":
    main()
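
# Example invocation (filenames are hypothetical; the two arguments are the u660-formatted
# forecast and threshold files described in the header comment):
#     ./final_lmp_cv_fcst.py lmp_cv_fcst.u660 lmp_cv_thresh.u660
# Output is written alongside each input as <input_file>_cv.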