import sys import netCDF4import xarray as xrimport numpy as npimport matplotlib.pyplot as pltfrom scipy.stats import gaussian_kde as kdeimport pandas as pdimport plotly.graph_objects as gofrom sklearn.preprocessing import StandardScaler, MinMaxScaler
Importing data
Code
# 2019 campaignpath ='C:/Users/Sothea Has/Postdoc_Sothea/Datasets/Rani_offline/offline_v9_Strateole_Sothea/offline_v9_Strateole_Sothea/hourly_data/'# path = '/home/shas/Data/'Input = netCDF4.Dataset(path +'Input_ERA5_data_all_balloons.nc')print('\n* Input data from analysis\n==========================')print('- Number of observations: '+str(Input.variables['time'].shape[0]))print('- Variables: '+str(list(Input.variables.keys())))# Ballon data (Rani)# -------------------------print('\n* Hourly balloon data from Rani\n===============================')Bal_data = netCDF4.Dataset(path +'All_STRATEOLE2_Balloon_1hrs15min.nc')print('- Number of observations: '+str(Bal_data.variables['time'].shape[0]))print('- Variables: '+str(list(Bal_data.variables.keys())))bal_levels = [49, 49, 51, 51, 51, 49, 49, 51]#[49, 51, 55, 53, 53, 49, 49, 51] # Balloon levels from 1 to 8index_level = [int((n-1)/2) for n in bal_levels]balloon_indices = [[1,2565], [2566,5036], [5037,7464], [7465,9056], [9057,10974], [10975,12356], [12357,14351], [14352,16197]]balloon_sizes = [balloon_indices[i][1]-balloon_indices[i][0] +1for i inrange(len(balloon_indices))]print(sum(balloon_sizes))# 2021 campaignpath2021 ="C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Dataset/Strateole2_2021/"Input2021 = netCDF4.Dataset(path2021 +"Input_ERA5_data_all_variables_balloons_ph2.nc")Bal_data2 = netCDF4.Dataset(path2021 +'All_STRATEOLE2_Balloon_ph2_1hrs15min.nc')# balloon_index2021 = [1, 30, 328, 1076, 1787, 2887, 4027, 5087, 6378, 7994, 8905, 9785, 11009, 12710, 14062, 14695, 15186]balloon_index2021 = [[1,30],[31,328],[329, 1076],[1077,1787],[1788,2987],[2988,4027],[4028,5087],[5088,6378],[6379,7994],[7995,8905],[8906,9785],[9786,11099],[11100,12710],[12711,14062],[14063,14695],[14696,15186]]bal_size2 = [balloon_index2021[i][1]-balloon_index2021[i][0]+1for i inrange(len(balloon_index2021))]
* Input data from analysis
==========================
- Number of observations: 16197
- Variables: ['time', 'longitude', 'latitude', 'level', 'tp', 'lnsp', 'temp', 'vitu', 'vitv']
* Hourly balloon data from Rani
===============================
- Number of observations: 16197
- Variables: ['time', 'lon', 'lat', 'qdm_u', 'qdm_v', 'qdm_u_abs', 'qdm_v_abs', 'qdm_u_east', 'qdm_u_west', 'qdm_v_north', 'qdm_v_south', 'qdm_total', 'alt', 'sza', 'pressure']
16197
def transform(y):return ydef reverse(y):return ydef modify_resolution(resolution, X, y): group = [] start =0 balloon = [0]for i inrange(8): s = balloon_sizes[i] // resolution r = balloon_sizes[i] % resolution group = group +list(np.repeat(range(start, start + s), resolution)) +list(np.repeat(start + s,r))if r ==0: start += selse: start += (s +1) balloon.append(max(group)+1) x_ = X.groupby(by = group).mean() y_ = y.groupby(by = group).mean()return x_, y_, balloondef modify_resolution2(resolution, X, y): group = [] start =0 balloon = [0]for i inrange(16): s = bal_size2[i] // resolution r = bal_size2[i] % resolution group = group +list(np.repeat(range(start, start + s), resolution)) +list(np.repeat(start + s, r))if r ==0: start += selse: start += (s +1) balloon.append(max(group)+1) x_ = X.groupby(by = group).mean() y_ = y.groupby(by = group).mean()return x_, y_, balloondef av_resolution(resulution, y): a =len(y) // resulution r =len(y) % aif r < resulution/2: group = np.concatenate((np.repeat(list(range(0,a)), resulution), np.repeat(a-1, r)))else: group = np.concatenate((np.repeat(list(range(0,a)), resulution), np.repeat(a, r)))return y.groupby(by = group).mean()def lag(va, lag, id, bal =1, test =False):if test: tem = x_.loc[id[bal-1]:id[bal],va]return np.concatenate([np.repeat(np.nan, lag), tem[:-lag]])else: tem = np.zeros(len(x_) -id[bal] +id[bal-1]) a =0for i inrange(len(id)-1): st =int(0)if i != bal -1: a = x_.loc[:,va][id[i]:id[i+1]] tem[st:(st+id[i+1]-id[i])] = np.concatenate([np.repeat(np.nan, lag), a[:-lag]]) st +=id[i+1]-id[i]return temx_, y_, cut_id = modify_resolution(resolution, X_u, u)scaler = StandardScaler()x_train_scaled = scaler.fit_transform(x_)y_train = y_[va_]x_test_scaled = scaler.transform(X_u2)y_test = u2[va_]x_test_new_resol, y_test_new_resol, id_new_resol = modify_resolution2(24, X_u2, u2)
Build models using data of 2019 campaign
We simply just build the models using the optimal hyperparameters obtained from the tuning process of 2019 campaign. In sequel, the comparisons are done in 1day time resolution.
We took the models built entirely using the data from 2019 to predict the data of 2021 campaign.
Code
y_test1 = av_resolution(24, pd.DataFrame(y_test))ex_pred1 = av_resolution(24, pd.DataFrame(ex_pred))rf_pred1 = av_resolution(24, pd.DataFrame(rf_pred))boo_pred1 = av_resolution(24, pd.DataFrame(boo_pred))ex_cor = np.round(np.corrcoef(ex_pred1.values.reshape(-1), y_test1.values.reshape(-1))[0][1], 3)rf_cor = np.round(np.corrcoef(rf_pred1.values.reshape(-1), y_test1.values.reshape(-1))[0][1], 3)boo_cor = np.round(np.corrcoef(boo_pred1.values.reshape(-1), y_test1.values.reshape(-1))[0][1], 3)names = {'abs' : "absolute", 'east' : "eastward", 'west': "westward"}fig = go.Figure(data = [go.Scatter( x =list(range(1, len(y_test1)+1)), y = y_test1.values.reshape(-1), name ="True "+ names[sub] +" GWMF 2021", showlegend=True, mode ="lines", line =dict(color ="black"))])fig.add_trace(go.Scatter(x =list(range(1, len(y_test1)+1)), y = rf_pred1.values.reshape(-1), name ="Random forest ({})".format(rf_cor), showlegend=True, mode ="lines", line =dict(color ="#9002F8")))fig.add_trace(go.Scatter(x =list(range(1, len(y_test1)+1)), y = ex_pred1.values.reshape(-1), name ="Extra-trees ({})".format(ex_cor), showlegend=True, mode ="lines", line =dict(color ="#0F2BF8")))fig.add_trace(go.Scatter(x =list(range(1, len(y_test1)+1)), y = boo_pred1.values.reshape(-1), name ="Adaboost ({})".format(boo_cor), showlegend=True, mode ="lines", line =dict(color ="#FD0F0B")))fig.update_layout(title_text ="True vs predicted GWMF", width =800, height =500)col_map = ["#566573", "white"]for i inrange(16): fig.add_vrect( x0=id_new_resol[i], x1 = id_new_resol[i+1], annotation =dict(text =str(i+2)), fillcolor = col_map[i%2], opacity =0.2, line =dict(width =0.5, color ="black"))fig.show()print("\n\n\n* RMSE of (RF, Extra, Adaboost) = ({}, {}, {})".format( np.round(np.sqrt(mean_squared_error(y_test1, rf_pred1))/np.mean(y_test1), 3), np.round(np.sqrt(mean_squared_error(y_test1, ex_pred1))/np.mean(y_test1),3), np.round(np.sqrt(mean_squared_error(y_test1, boo_pred1))/np.mean(y_test1), 3)))print("\n\n\n* Correlation of (RF, Extra, Adaboost) = ({}, {}, {})".format( rf_cor, ex_cor, boo_cor))
* RMSE of (RF, Extra, Adaboost) = (0.636, 0.637, 0.677)
* Correlation of (RF, Extra, Adaboost) = (0.498, 0.495, 0.48)
Aggregate model 1: Gradient COBRA
This aggregation method requires some observations from the new source to fine tune the key parameter. We use the first three balloons comprising of 1077 (1h time resolution) observations to estimate the bandwidth parameter for the aggregation. However, the comparison on the remaining data is done in daily time resolution.
Code
from gradientcobra.gradientcobra import GradientCOBRAPred_agg = np.column_stack([rf_pred, ex_pred, boo_pred])n_size = balloon_index2021[3][0]bal_sect = np.array(id_new_resol[3:]) - id_new_resol[3]gc = GradientCOBRA( opt_method="grid", bandwidth_list=np.linspace(0.001, 10, 500)).fit( X=Pred_agg[:n_size,:] *5, y = y_test.values.reshape(-1)[:n_size])gc.draw_learning_curve()gc_pred = gc.predict(X=Pred_agg[n_size:,:] *5)gc_pred1 = av_resolution(24, pd.DataFrame(gc_pred))y_test2 = av_resolution(24, pd.DataFrame(y_test.values.reshape(-1)[n_size:]))fig = go.Figure(data = [go.Scatter( x =list(range(1, len(y_test) - n_size +1)), y = y_test2.values.reshape(-1), name ="True "+ names[sub] +" GWMF 2021", showlegend=True, mode ="lines", line =dict(color ="black"))])ag_cor1 = np.round(np.corrcoef(y_test2.values.reshape(-1), gc_pred1.values.reshape(-1))[0,1],3)fig.add_trace(go.Scatter(x =list(range(1, len(y_test) - n_size +1)), y = gc_pred1.values.reshape(-1), name ="Aggregation 1 ({})".format(ag_cor1), showlegend=True, mode ="lines", line =dict(color ="#FD0F0B")))fig.update_layout(title_text ="True vs aggregation", width =800, height =500)col_map = ["#566573", "white"]fig.add_vrect( x0=bal_sect[0], x1 = bal_sect[1], annotation =dict(text ="Bal-4", align ="center"), fillcolor ="white", opacity =0.1, line =dict(width =0))for i inrange(12): fig.add_vrect( x0=bal_sect[i+1], x1 = bal_sect[i+2], annotation =dict(text =str(i+5)), fillcolor = col_map[i%2], opacity =0.2, line =dict(width =0.5, color ="black"))fig.show()print("\n\n\n* RMSE = {}".format(np.round(np.sqrt(mean_squared_error(gc_pred1, y_test2))/np.mean(y_test2),3)))print("\n\n\n* Corr = {}".format(ag_cor1))
-> Grid search algorithm with radial kernel is in progress...
~ Full process|--------------------------------------------------|100%
~ Processing|==================================================|100%
* RMSE = 0.641
* Corr = 0.504
Aggregate model 2: MixCOBRA
This is another aggregation method taking into account the input part.
Code
from gradientcobra.mixcobra import MixCOBRARegressorgc2 = MixCOBRARegressor( alpha_list=np.linspace(0.0001, 1000, 100), beta_list=np.linspace(0.0001, 30, 100)).fit( X=x_test_scaled[:n_size,:] *1000, Pred_features=Pred_agg[:n_size,:] *1000, y=y_test.values.reshape(-1)[:n_size])gc2.draw_learning_curve()gc_pred2 = gc2.predict(X=x_test_scaled[n_size:,:] *1000, Pred_X = Pred_agg[n_size:,:]*1000)gc_pred2 = av_resolution(24, pd.DataFrame(gc_pred2))fig = go.Figure(data = [go.Scatter( x =list(range(1, len(y_test) - n_size +1)), y = y_test2.values.reshape(-1), name ="True "+ names[sub] +" GWMF 2021", showlegend=True, mode ="lines", line =dict(color ="black"))])ag_cor2 = np.round(np.corrcoef(y_test2.values.reshape(-1), gc_pred2.values.reshape(-1))[0,1],3)fig.add_trace(go.Scatter(x =list(range(1, len(y_test) - n_size +1)), y = gc_pred2.values.reshape(-1), name ="Aggregation 2 ({})".format(ag_cor2), showlegend=True, mode ="lines", line =dict(color ="#FD0F0B")))fig.update_layout(title_text ="True vs aggregation", width =800, height =500)for i inrange(12): fig.add_vrect( x0=bal_sect[i+1], x1 = bal_sect[i+2], annotation =dict(text =str(i+5)), fillcolor = col_map[i%2], opacity =0.2, line =dict(width =0.5, color ="black"))fig.show()print("\n\n\n* RMSE = {}".format(np.round(np.sqrt(mean_squared_error(gc_pred2, y_test2))/np.mean(y_test2),3)))print("\n\n\n* Corr = {}".format(ag_cor2))
* Grid search algorithm of two parameters with radial kernel is in progress...
~ Full process|--------------------------------------------------|100%
~ Processing|==================================================|100%
* RMSE = 0.729
* Corr = 0.369
Source Code
---title: "Building models using 2019 Campaign and predict on the 2021"author: "[`Sothea HAS`](https://hassothea.github.io/)"format: html: anchor-sections: true code-tools: true code-fold: truetoc: truetoc-depth: 3jupyter: python3---# Installing and importing packages```{python}import sys import netCDF4import xarray as xrimport numpy as npimport matplotlib.pyplot as pltfrom scipy.stats import gaussian_kde as kdeimport pandas as pdimport plotly.graph_objects as gofrom sklearn.preprocessing import StandardScaler, MinMaxScaler```# Importing data```{python}# 2019 campaignpath ='C:/Users/Sothea Has/Postdoc_Sothea/Datasets/Rani_offline/offline_v9_Strateole_Sothea/offline_v9_Strateole_Sothea/hourly_data/'# path = '/home/shas/Data/'Input = netCDF4.Dataset(path +'Input_ERA5_data_all_balloons.nc')print('\n* Input data from analysis\n==========================')print('- Number of observations: '+str(Input.variables['time'].shape[0]))print('- Variables: '+str(list(Input.variables.keys())))# Ballon data (Rani)# -------------------------print('\n* Hourly balloon data from Rani\n===============================')Bal_data = netCDF4.Dataset(path +'All_STRATEOLE2_Balloon_1hrs15min.nc')print('- Number of observations: '+str(Bal_data.variables['time'].shape[0]))print('- Variables: '+str(list(Bal_data.variables.keys())))bal_levels = [49, 49, 51, 51, 51, 49, 49, 51]#[49, 51, 55, 53, 53, 49, 49, 51] # Balloon levels from 1 to 8index_level = [int((n-1)/2) for n in bal_levels]balloon_indices = [[1,2565], [2566,5036], [5037,7464], [7465,9056], [9057,10974], [10975,12356], [12357,14351], [14352,16197]]balloon_sizes = [balloon_indices[i][1]-balloon_indices[i][0] +1for i inrange(len(balloon_indices))]print(sum(balloon_sizes))# 2021 campaignpath2021 ="C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Dataset/Strateole2_2021/"Input2021 = netCDF4.Dataset(path2021 +"Input_ERA5_data_all_variables_balloons_ph2.nc")Bal_data2 = netCDF4.Dataset(path2021 +'All_STRATEOLE2_Balloon_ph2_1hrs15min.nc')# balloon_index2021 = [1, 30, 328, 1076, 1787, 2887, 4027, 5087, 6378, 7994, 8905, 9785, 11009, 12710, 14062, 14695, 15186]balloon_index2021 = [[1,30],[31,328],[329, 1076],[1077,1787],[1788,2987],[2988,4027],[4028,5087],[5088,6378],[6379,7994],[7995,8905],[8906,9785],[9786,11099],[11100,12710],[12711,14062],[14063,14695],[14696,15186]]bal_size2 = [balloon_index2021[i][1]-balloon_index2021[i][0]+1for i inrange(len(balloon_index2021))]```# Select the data and parameters```{python}lmd =0.6bal_ =1resolution =3beta =0.2typ ='u1h'sub ="abs"va_ ="qdm_u_"+ subimage_format ='png'if sub !='west': tranf =Trueelse: tranf =False# Saving imagesnamefig = ['rf', 'extra', 'boost']fignames = ['Random Forest', 'Extra Trees', 'Adaboost']addname ='fig_'+ typ +'_bal'+str(bal_) +'_3h_24h_'+ sub +'.png'```# Preparing data```{python}input_lookup =list(Input.variables.keys())input_dict = {}input_dict2 = {}leng =len(Input.variables['time'])for va in input_lookup:if va in ['lnsp']: input_dict[va] = np.array(Input.variables[va][:,2,2]).astype('float32') input_dict2[va] = np.array(Input2021.variables[va][:,2,2]).astype('float32')if va in ['temp', 'vitu', 'vitv']:for lev in [int((n-1)/2) for n in [51, 85, 111, 137]]: input_dict[va+str(int(2*lev+1))] = np.array(Input.variables[va][:, lev, 2, 2]).astype('float32') input_dict2[va+str(int(2*lev+1))] = np.array(Input2021.variables[va][:, lev, 2, 2]).astype('float32')if va =='tp': input_dict['tp'] = np.array(Input.variables[va][:,2,2]).astype('float32') input_dict2['tp'] = np.array(Input2021.variables[va][:,2,2]).astype('float32')input_dict['sza'] = Bal_data['sza'][:]input_dict['tp_mean'] = np.mean(Input.variables['tp'][:,:,:], axis=(1,2))input_dict['tp_sd'] = np.std(Input.variables['tp'][:,:,:], axis=(1,2))input_dict2['sza'] = Bal_data2['sza'][:]input_dict2['tp_mean'] = np.mean(Input2021.variables['tp'][:,:,:], axis=(1,2))input_dict2['tp_sd'] = np.std(Input2021.variables['tp'][:,:,:], axis=(1,2))# All inputsX_u = pd.DataFrame(input_dict)# Input for zonal wind (u)u = pd.DataFrame({'qdm_u': Bal_data.variables['qdm_u'][:], 'qdm_u_abs': Bal_data.variables['qdm_u_abs'][:],'qdm_u_west': Bal_data.variables['qdm_u_west'][:],'qdm_u_east': Bal_data.variables['qdm_u_east'][:]})print(X_u.shape)print(u.shape)print(X_u.columns)# InputsX_u2 = pd.DataFrame(input_dict2)# Input for zonal wind (u)u2 = pd.DataFrame({'qdm_u': Bal_data2.variables['qdm_u'][:], 'qdm_u_abs': Bal_data2.variables['qdm_u_abs'][:],'qdm_u_west': Bal_data2.variables['qdm_u_west'][:],'qdm_u_east': Bal_data2.variables['qdm_u_east'][:]})print(X_u2.shape)print(u2.shape)print(X_u2.columns)```# Time resolution transformation (3h)```{python}def transform(y):return ydef reverse(y):return ydef modify_resolution(resolution, X, y): group = [] start =0 balloon = [0]for i inrange(8): s = balloon_sizes[i] // resolution r = balloon_sizes[i] % resolution group = group +list(np.repeat(range(start, start + s), resolution)) +list(np.repeat(start + s,r))if r ==0: start += selse: start += (s +1) balloon.append(max(group)+1) x_ = X.groupby(by = group).mean() y_ = y.groupby(by = group).mean()return x_, y_, balloondef modify_resolution2(resolution, X, y): group = [] start =0 balloon = [0]for i inrange(16): s = bal_size2[i] // resolution r = bal_size2[i] % resolution group = group +list(np.repeat(range(start, start + s), resolution)) +list(np.repeat(start + s, r))if r ==0: start += selse: start += (s +1) balloon.append(max(group)+1) x_ = X.groupby(by = group).mean() y_ = y.groupby(by = group).mean()return x_, y_, balloondef av_resolution(resulution, y): a =len(y) // resulution r =len(y) % aif r < resulution/2: group = np.concatenate((np.repeat(list(range(0,a)), resulution), np.repeat(a-1, r)))else: group = np.concatenate((np.repeat(list(range(0,a)), resulution), np.repeat(a, r)))return y.groupby(by = group).mean()def lag(va, lag, id, bal =1, test =False):if test: tem = x_.loc[id[bal-1]:id[bal],va]return np.concatenate([np.repeat(np.nan, lag), tem[:-lag]])else: tem = np.zeros(len(x_) -id[bal] +id[bal-1]) a =0for i inrange(len(id)-1): st =int(0)if i != bal -1: a = x_.loc[:,va][id[i]:id[i+1]] tem[st:(st+id[i+1]-id[i])] = np.concatenate([np.repeat(np.nan, lag), a[:-lag]]) st +=id[i+1]-id[i]return temx_, y_, cut_id = modify_resolution(resolution, X_u, u)scaler = StandardScaler()x_train_scaled = scaler.fit_transform(x_)y_train = y_[va_]x_test_scaled = scaler.transform(X_u2)y_test = u2[va_]x_test_new_resol, y_test_new_resol, id_new_resol = modify_resolution2(24, X_u2, u2)```------------------# Build models using data of 2019 campaign> We simply just build the models using the optimal hyperparameters obtained from the tuning process of 2019 campaign. In sequel, the comparisons are done in 1day time resolution.```{python}from sklearn.tree import DecisionTreeRegressorfrom sklearn.model_selection import RepeatedKFold, GridSearchCVfrom sklearn.metrics import mean_squared_errorfrom sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressorva ='qdm_u_'+ sub# param_path = "C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/New_loss/5th_ty_E2_Cor9_Beta/Beta025/output/"param_path ="C:/Users/Sothea Has/Postdoc_Sothea/Codes/ERA5_to_QDM/Computation10/Parameters/output2/"param = typ +"_parameters_bal"+str(bal_) +"_3h_24h"+ sub +".nc"parameters = xr.open_dataset(param_path+param)# data splitting# kl_id = np.random.permutation(range(len(y_train)))# k = len(kl_id)//2# X_k, X_l, y_k, y_l = x_train_scaled.iloc[kl_id[:k],:].reset_index(drop='index'), x_train_scaled.iloc[kl_id[k:],:].reset_index(drop='index'), y_train.iloc[kl_id[:k]].reset_index(drop='index'), y_train.iloc[kl_id[k:]].reset_index(drop='index')extra = ExtraTreesRegressor( n_estimators=int(parameters['n_estimators_extra']), max_features=int(parameters['max_features_extra']), min_samples_leaf=int(parameters['min_samples_leaf_extra']) ).fit(x_train_scaled, y_train)rf = RandomForestRegressor( n_estimators=int(parameters['n_estimators_rf']), max_features=int(parameters['max_features_rf']), min_samples_leaf=int(parameters['min_samples_leaf_rf']) ).fit(x_train_scaled, y_train)boost = AdaBoostRegressor( estimator=DecisionTreeRegressor(max_depth=int(parameters['max_depth_boost']), max_features=int(parameters['max_features_boost'])), n_estimators=int(parameters['n_estimators_boost']), learning_rate=1 ).fit(x_train_scaled, y_train)ex_pred, rf_pred, boo_pred = extra.predict(x_test_scaled), rf.predict(x_test_scaled), boost.predict(x_test_scaled)```------------------# Predictions on 2021 campaign> We took the models built entirely using the data from 2019 to predict the data of 2021 campaign.```{python}y_test1 = av_resolution(24, pd.DataFrame(y_test))ex_pred1 = av_resolution(24, pd.DataFrame(ex_pred))rf_pred1 = av_resolution(24, pd.DataFrame(rf_pred))boo_pred1 = av_resolution(24, pd.DataFrame(boo_pred))ex_cor = np.round(np.corrcoef(ex_pred1.values.reshape(-1), y_test1.values.reshape(-1))[0][1], 3)rf_cor = np.round(np.corrcoef(rf_pred1.values.reshape(-1), y_test1.values.reshape(-1))[0][1], 3)boo_cor = np.round(np.corrcoef(boo_pred1.values.reshape(-1), y_test1.values.reshape(-1))[0][1], 3)names = {'abs' : "absolute", 'east' : "eastward", 'west': "westward"}fig = go.Figure(data = [go.Scatter( x =list(range(1, len(y_test1)+1)), y = y_test1.values.reshape(-1), name ="True "+ names[sub] +" GWMF 2021", showlegend=True, mode ="lines", line =dict(color ="black"))])fig.add_trace(go.Scatter(x =list(range(1, len(y_test1)+1)), y = rf_pred1.values.reshape(-1), name ="Random forest ({})".format(rf_cor), showlegend=True, mode ="lines", line =dict(color ="#9002F8")))fig.add_trace(go.Scatter(x =list(range(1, len(y_test1)+1)), y = ex_pred1.values.reshape(-1), name ="Extra-trees ({})".format(ex_cor), showlegend=True, mode ="lines", line =dict(color ="#0F2BF8")))fig.add_trace(go.Scatter(x =list(range(1, len(y_test1)+1)), y = boo_pred1.values.reshape(-1), name ="Adaboost ({})".format(boo_cor), showlegend=True, mode ="lines", line =dict(color ="#FD0F0B")))fig.update_layout(title_text ="True vs predicted GWMF", width =800, height =500)col_map = ["#566573", "white"]for i inrange(16): fig.add_vrect( x0=id_new_resol[i], x1 = id_new_resol[i+1], annotation =dict(text =str(i+2)), fillcolor = col_map[i%2], opacity =0.2, line =dict(width =0.5, color ="black"))fig.show()print("\n\n\n* RMSE of (RF, Extra, Adaboost) = ({}, {}, {})".format( np.round(np.sqrt(mean_squared_error(y_test1, rf_pred1))/np.mean(y_test1), 3), np.round(np.sqrt(mean_squared_error(y_test1, ex_pred1))/np.mean(y_test1),3), np.round(np.sqrt(mean_squared_error(y_test1, boo_pred1))/np.mean(y_test1), 3)))print("\n\n\n* Correlation of (RF, Extra, Adaboost) = ({}, {}, {})".format( rf_cor, ex_cor, boo_cor))```------------------# Aggregate model 1: Gradient COBRA> This aggregation method requires some observations from the new source to fine tune the key parameter. We use the first three balloons comprising of 1077 (1h time resolution) observations to estimate the bandwidth parameter for the aggregation. However, the comparison on the remaining data is done in daily time resolution.```{python}from gradientcobra.gradientcobra import GradientCOBRAPred_agg = np.column_stack([rf_pred, ex_pred, boo_pred])n_size = balloon_index2021[3][0]bal_sect = np.array(id_new_resol[3:]) - id_new_resol[3]gc = GradientCOBRA( opt_method="grid", bandwidth_list=np.linspace(0.001, 10, 500)).fit( X=Pred_agg[:n_size,:] *5, y = y_test.values.reshape(-1)[:n_size])gc.draw_learning_curve()gc_pred = gc.predict(X=Pred_agg[n_size:,:] *5)gc_pred1 = av_resolution(24, pd.DataFrame(gc_pred))y_test2 = av_resolution(24, pd.DataFrame(y_test.values.reshape(-1)[n_size:]))fig = go.Figure(data = [go.Scatter( x =list(range(1, len(y_test) - n_size +1)), y = y_test2.values.reshape(-1), name ="True "+ names[sub] +" GWMF 2021", showlegend=True, mode ="lines", line =dict(color ="black"))])ag_cor1 = np.round(np.corrcoef(y_test2.values.reshape(-1), gc_pred1.values.reshape(-1))[0,1],3)fig.add_trace(go.Scatter(x =list(range(1, len(y_test) - n_size +1)), y = gc_pred1.values.reshape(-1), name ="Aggregation 1 ({})".format(ag_cor1), showlegend=True, mode ="lines", line =dict(color ="#FD0F0B")))fig.update_layout(title_text ="True vs aggregation", width =800, height =500)col_map = ["#566573", "white"]fig.add_vrect( x0=bal_sect[0], x1 = bal_sect[1], annotation =dict(text ="Bal-4", align ="center"), fillcolor ="white", opacity =0.1, line =dict(width =0))for i inrange(12): fig.add_vrect( x0=bal_sect[i+1], x1 = bal_sect[i+2], annotation =dict(text =str(i+5)), fillcolor = col_map[i%2], opacity =0.2, line =dict(width =0.5, color ="black"))fig.show()print("\n\n\n* RMSE = {}".format(np.round(np.sqrt(mean_squared_error(gc_pred1, y_test2))/np.mean(y_test2),3)))print("\n\n\n* Corr = {}".format(ag_cor1))```------------------# Aggregate model 2: MixCOBRA> This is another aggregation method taking into account the input part.```{python}from gradientcobra.mixcobra import MixCOBRARegressorgc2 = MixCOBRARegressor( alpha_list=np.linspace(0.0001, 1000, 100), beta_list=np.linspace(0.0001, 30, 100)).fit( X=x_test_scaled[:n_size,:] *1000, Pred_features=Pred_agg[:n_size,:] *1000, y=y_test.values.reshape(-1)[:n_size])gc2.draw_learning_curve()gc_pred2 = gc2.predict(X=x_test_scaled[n_size:,:] *1000, Pred_X = Pred_agg[n_size:,:]*1000)gc_pred2 = av_resolution(24, pd.DataFrame(gc_pred2))fig = go.Figure(data = [go.Scatter( x =list(range(1, len(y_test) - n_size +1)), y = y_test2.values.reshape(-1), name ="True "+ names[sub] +" GWMF 2021", showlegend=True, mode ="lines", line =dict(color ="black"))])ag_cor2 = np.round(np.corrcoef(y_test2.values.reshape(-1), gc_pred2.values.reshape(-1))[0,1],3)fig.add_trace(go.Scatter(x =list(range(1, len(y_test) - n_size +1)), y = gc_pred2.values.reshape(-1), name ="Aggregation 2 ({})".format(ag_cor2), showlegend=True, mode ="lines", line =dict(color ="#FD0F0B")))fig.update_layout(title_text ="True vs aggregation", width =800, height =500)for i inrange(12): fig.add_vrect( x0=bal_sect[i+1], x1 = bal_sect[i+2], annotation =dict(text =str(i+5)), fillcolor = col_map[i%2], opacity =0.2, line =dict(width =0.5, color ="black"))fig.show()print("\n\n\n* RMSE = {}".format(np.round(np.sqrt(mean_squared_error(gc_pred2, y_test2))/np.mean(y_test2),3)))print("\n\n\n* Corr = {}".format(ag_cor2))```