---
title: "Aggregation of ML and Parametrization (2019)"
author: "[`Sothea HAS`](https://hassothea.github.io/)"
format:
html:
anchor-sections: true
code-tools: true
code-fold: true
toc: false
toc-depth: 3
jupyter: python3
---
# Installing and importing packages
```{python}
import sys
import netCDF4
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde as kde
import pandas as pd
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns
```
# Aggregation 1: Gradient COBRA
 {width="1000px"}
with, for example, $$K_h(x)=\exp(-\|x/h\|^2).$$
One needs to estimate the best smoothing parameter $h$ for the aggregation.
# Aggregation 2: MixCOBRA
 {width="1000px"}
with, for example, $$K_{a,b}(x)=\exp(-\|(x/a,y/b)\|^2).$$
One needs to estimate $a$ and $b$.
# Aggregation 3: Super learner
 {width="1000px"}
# Report result
The following result are:
- The first graphic: time series of true gravity wave momentum fluxes and the three aggregation methods.
- The second graphic: PDF of the shown time series (logarithmic scale on the y axis).
- The barplot of correlations of the 3 ML methods and the 11 parametrizations.
```{python}
def av_resolution(resulution, y):
a = len (y) // resulution
r = len (y) % a
if r < resulution/ 2 :
group = np.concatenate((np.repeat(list (range (0 ,a)), resulution), np.repeat(a- 1 , r)))
else :
group = np.concatenate((np.repeat(list (range (0 ,a)), resulution), np.repeat(a, r)))
return y.groupby(by = group).mean()
```
```{python}
#| warning: false
#| messages: false
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error
version = 4
typ = 'abs'
# ml_coef = {
# '1' : [0.56, 0.57, 0.58],
# '2' : [0.70, 0.67, 0.74],
# '3' : [0.45, 0.48, 0.49],
# '4' : [0.44, 0.43, 0.47],
# '5' : [0.51, 0.56, 0.55],
# '6' : [0.72, 0.74, 0.75],
# '7' : [0.51, 0.53, 0.48],
# '8' : [0.74, 0.76, 0.72]
# }
cols = {'y_test' : 'black' , 'Agg1' : '#4095C2' , 'Agg2' : '#558C5E' , 'Agg3' : '#D45A56' }
index_bal_day = [0 , 107 , 210 , 311 , 377 , 457 , 515 , 598 , 675 ]
path0 = "C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Use2021_predict2019/models/output_ml_param_both_v2/output/"
path = "C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Use2021_predict2019/models/output_ml_param/output/new_result_2024/result_param3/"
path1 = "C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Use2021_predict2019/models/output_ml_param/output/"
ml_pred = xr.open_dataset(path + f'pred_ml_2019_ { typ} .nc' )
rf_pred0 = av_resolution(8 , pd.DataFrame(ml_pred['rf' ].values)).values.reshape(- 1 )
extra_pred0 = av_resolution(8 , pd.DataFrame(ml_pred['extra' ].values)).values.reshape(- 1 )
boost_pred0 = av_resolution(8 , pd.DataFrame(ml_pred['boost' ].values)).values.reshape(- 1 )
# subfig_title = ["ML","ML + Parametrization","ML + LMDz", "Parametrization"]
subfig_title = ["ML" ,"ML + 3 Params" ,"3 Params" , 'Agg_Param_All_No_OldCMAM' ]
fig = make_subplots(
rows= 2 , cols= 2 ,
specs= [[{}, {}],[{}, {}]],
horizontal_spacing = 0.075 ,
vertical_spacing= 0.09 ,
subplot_titles= ("Among MLs" ,"MLs with 3 Params" ,"Among the 3 Params" , "Among all params except 'Old CMAM'" ))
fig_scatter = make_subplots(
rows= 2 , cols= 2 ,
specs= [[{}, {}],[{}, {}]],
horizontal_spacing = 0.075 ,
vertical_spacing= 0.09 ,
subplot_titles= ("Among MLs" ,"MLs with 3 Params" ,"Among the 3 Params" , "Among all params except 'Old CMAM'" ))
sns.set ()
first = True
rs = {1 : 1 , 2 : 1 , 3 : 2 , 4 : 2 }
cs = {1 : 1 , 2 : 2 , 3 : 1 , 4 : 2 }
all_correlation_coef = []
all_rmse = []
for i in range (8 ):
df_ml = xr.open_dataset(path + f'pred_bal { i+ 1 } _u1h_ { typ} _ML_v6.nc' )
df_ml_param = xr.open_dataset(path + f'pred_bal { i+ 1 } _u1h_ { typ} _ML_Param_3M.nc' )
df_param = xr.open_dataset(path + f'pred_bal { i+ 1 } _u1h_ { typ} _Param_No_OldCMAM_v6.nc' )
df_ml_lmdz = xr.open_dataset(path + f'pred_bal { i+ 1 } _u1h_ { typ} _Param_3M.nc' )
all_pred = {'ml' : df_ml, 'param' : df_param, 'ml_param' : df_ml_param, 'ml_lmdz' : df_ml_lmdz}
k = 1
show = True
fig.data = []
fig_scatter.data = []
f, axs = plt.subplots(nrows= 2 , ncols= 2 , figsize= (8 , 6 ))
f.tight_layout(pad= 2.0 )
corr_agg = []
rmse_agg = []
corr_type = []
agg_type = []
jj = 1
nam = {'ml' : 'Agg_ML' , 'ml_param' : 'ML_Param_3M' , 'ml_lmdz' : 'Param_3M' , 'param' : 'Agg_Param_All_No_OldCMAM' }
for j in ['ml' , 'ml_param' , 'ml_lmdz' , 'param' ]:
pred1 = 1000 * av_resolution(8 , pd.DataFrame(all_pred[j].variables['agg1' ][:].values)).values.reshape(- 1 )
pred2 = 1000 * av_resolution(8 , pd.DataFrame(all_pred[j].variables['agg2' ][:].values)).values.reshape(- 1 )
pred3 = 1000 * av_resolution(8 , pd.DataFrame(all_pred[j].variables['agg3' ][:].values)).values.reshape(- 1 )
# pred3 = av_resolution(8, pd.DataFrame(all_pred[j].variables['agg3'][:].values)).values.reshape(-1)
y_test1 = 1000 * av_resolution(8 , pd.DataFrame(all_pred[j].variables['y_test' ][:].values)).values.reshape(- 1 )
rmse1 = np.round (np.sqrt(mean_squared_error(y_test1, pred1)), 5 )
coef1 = np.round (np.corrcoef(y_test1, pred1)[0 ,1 ], 2 )
rmse2 = np.round (np.sqrt(mean_squared_error(y_test1, pred2)), 5 )
coef2 = np.round (np.corrcoef(y_test1, pred2)[0 ,1 ], 2 )
rmse3 = np.round (np.sqrt(mean_squared_error(y_test1, pred3)), 5 )
coef3 = np.round (np.corrcoef(y_test1, pred3)[0 ,1 ], 2 )
corr_agg.extend([coef1, coef2, coef3])
rmse_agg.extend([rmse1, rmse2, rmse3])
# print(rmse_agg)
corr_type.extend([nam[j]+ '1' , nam[j]+ '2' , nam[j]+ '3' ])
agg_type.extend(['Agg1' , 'Agg2' , 'Agg3' ])
if j == 'ml' :
pred_rf = 1000 * np.array(rf_pred0[index_bal_day[i]:index_bal_day[i+ 1 ]])
pred_ex = 1000 * np.array(extra_pred0[index_bal_day[i]:index_bal_day[i+ 1 ]])
pred_boost = 1000 * np.array(boost_pred0[index_bal_day[i]:index_bal_day[i+ 1 ]])
coef_rf = np.round (np.corrcoef(y_test1, pred_rf)[0 ,1 ], 2 )
coef_ex = np.round (np.corrcoef(y_test1, pred_ex)[0 ,1 ], 2 )
coef_boost = np.round (np.corrcoef(y_test1, pred_boost)[0 ,1 ], 2 )
rmse_rf = np.round (np.sqrt(mean_squared_error(y_test1, pred_rf)),5 )
rmse_ex = np.round (np.sqrt(mean_squared_error(y_test1, pred_ex)),5 )
rmse_boost = np.round (np.sqrt(mean_squared_error(y_test1, pred_boost)), 5 )
aggregations = pd.DataFrame(
{
'True GWMF' : y_test1,
'Agg 1 ( {} )' .format (coef1) : pred1,
'Agg 2 ( {} )' .format (coef2) : pred2,
'Agg 3 ( {} )' .format (coef3) : pred3
})
fig.add_trace(go.Scatter(
x= list (range (1 , len (y_test1)+ 1 )),
y= y_test1,
name = "True" ,
mode= "lines" ,
showlegend= show,
line = dict (color = "#2474B4" )),
row= rs[k], col= cs[k])
fig.add_trace(go.Scatter(
x= list (range (1 , len (y_test1))),
y= pred1,
name = "Agg1" ,
mode= "lines" ,
showlegend= show,
line = dict (color = "#E09C39" )),
row= rs[k], col= cs[k])
fig.add_trace(go.Scatter(
x= list (range (1 , len (y_test1)+ 1 )),
y= pred2,
name = "Agg2" ,
mode= "lines" ,
showlegend= show,
line = dict (color = "#52A23C" )),
row= rs[k], col= cs[k])
fig.add_trace(go.Scatter(
x= list (range (1 , len (y_test1)+ 1 )),
y= pred3,
name = "Agg3" ,
mode= "lines" ,
showlegend= show,
line = dict (color = "#B83813" )),
row= rs[k], col= cs[k])
if k == 1 :
xaxis = 'xaxis'
yaxis = 'yaxis'
else :
xaxis = 'xaxis' + str (k)
yaxis = 'yaxis' + str (k)
if k in [3 ,4 ]:
fig['layout' ][xaxis]['title' ] = "Flight duration (day)"
if k in [1 ,3 ]:
fig['layout' ][yaxis]['title' ] = "GWMF (mPa)"
# fig['layout'][yaxis].update(range=[0, np.max(np.max(pred1, pred2, pred3))])
# Scatterplot
fig_scatter.add_trace(go.Scatter(
x= y_test1,
y= y_test1,
name = "True" ,
mode= "lines" ,
opacity= 0.75 ,
showlegend= show,
line = dict (color = "#2474B4" )),
row= rs[k], col= cs[k])
fig_scatter.add_trace(go.Scatter(
x= y_test1,
y= pred1,
name = "Agg1" ,
mode= "markers" ,
opacity= 0.75 ,
showlegend= show,
marker = dict (color = "#E09C39" )),
row= rs[k], col= cs[k])
fig_scatter.add_trace(go.Scatter(
x= y_test1,
y= pred2,
name = "Agg2" ,
mode= "markers" ,
opacity= 0.75 ,
showlegend= show,
marker = dict (color = "#52A23C" )),
row= rs[k], col= cs[k])
fig_scatter.add_trace(go.Scatter(
x= y_test1,
y= pred3,
name = "Agg3" ,
mode= "markers" ,
opacity= 0.75 ,
showlegend= show,
marker = dict (color = "#B83813" )),
row= rs[k], col= cs[k])
if k in [3 ,4 ]:
fig_scatter['layout' ][xaxis]['title' ] = "Reference GWMF (mPa)"
if k in [1 ,3 ]:
fig_scatter['layout' ][yaxis]['title' ] = "Prediction (mPa)"
fig_scatter['layout' ][yaxis]['range' ] = [0 , np.max ([pred1, pred2, pred3])* 1.1 ]
sns.histplot(data= aggregations, ax = axs[rs[k]- 1 , cs[k]- 1 ], legend= True , palette = list (cols.values()))
# axs[rs[k]-1, cs[k]-1].set_yscale('log')
axs[rs[k]- 1 , cs[k]- 1 ].set_title(subfig_title[k- 1 ])
# axs[(k-1) // 2, (k-1) % 2].set_xticks(axs[(k-1) // 2, (k-1) % 2].get_xticks(), rotation = 30)
if k == 1 :
show = False
k += 1
path1 = "C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Dataset/for_Sothea/phase1/"
param = pd.read_csv(path1 + f"Pred_Balloon_ { i+ 1 } _Amplitude_hourly.DAT" )
param = param.drop(columns= "old_CMAM" )
cor_param = [np.round (np.corrcoef(av_resolution(24 , param[param.columns[i]]).values.reshape(- 1 ), y_test1), 2 )[0 ,1 ] for i in range (param.shape[1 ])]
rmse_param = [np.round (np.sqrt(mean_squared_error(1000 * av_resolution(24 , param[param.columns[i]]).values.reshape(- 1 ), y_test1)), 5 ) for i in range (param.shape[1 ])]
agg_methods_ = np.repeat(['Agg_ML' , 'Agg_ML_3Param' , 'Agg_3Param' , 'Agg_all_Param' ], 3 )
bar_data = pd.DataFrame({
'Corr' : [coef_rf, coef_ex, coef_boost] + corr_agg + cor_param ,
'Method' : ['RF' , 'Extra' , 'Boost' ] + corr_type + list (param.columns),
'Type' : ['ML' , 'ML' , 'ML' ] + agg_type + list (np.repeat('Param' , param.shape[1 ]))
})
all_correlation_coef.append([coef_rf, coef_ex, coef_boost] + corr_agg + cor_param)
fig.update_layout(title_text= "Aggregations on balloon {} " .format (i+ 1 ), width = 820 , height = 600 )
fig.show()
fig_scatter.update_layout(title_text= "Scatterplots on {} " .format (i+ 1 ), width = 820 , height = 600 )
fig_scatter.show()
plt.figure(figsize= (9 ,5 ))
ax = sns.barplot(
bar_data,
x = "Method" ,
y = "Corr" ,
hue = 'Type'
)
ax.set_xticklabels(bar_data.Method, rotation= 45 , ha= "right" )
ax.bar_label(ax.containers[0 ], rotation = 45 )
ax.bar_label(ax.containers[1 ], rotation = 45 )
ax.bar_label(ax.containers[2 ], rotation = 45 )
ax.bar_label(ax.containers[3 ], rotation = 45 )
ax.bar_label(ax.containers[4 ], rotation = 45 )
plt.title("Correlations of all the methods." , fontsize = 15 )
plt.show()
rmse_data = pd.DataFrame({
'RMSE' : np.array([rmse_rf, rmse_ex, rmse_boost] + rmse_agg + rmse_param),
'Method' : ['RF' , 'Extra' , 'Boost' ] + corr_type + list (param.columns),
'Type' : ['ML' , 'ML' , 'ML' ] + list (agg_methods_) + list (np.repeat('Param' , param.shape[1 ]))
})
ax2 = sns.barplot(
rmse_data,
x = "Method" ,
y = "RMSE" ,
hue = 'Type'
)
ax2.set_xticklabels(rmse_data.Method, rotation= 45 , ha= "right" )
ax2.bar_label(ax2.containers[0 ], rotation = 85 )
ax2.bar_label(ax2.containers[1 ], rotation = 85 )
ax2.bar_label(ax2.containers[2 ], rotation = 85 )
ax2.bar_label(ax2.containers[3 ], rotation = 85 )
ax2.bar_label(ax2.containers[4 ], rotation = 85 )
ax2.bar_label(ax2.containers[5 ], rotation = 85 )
ax2.set_ylabel("RMSE (mPa)" , fontsize = 15 )
plt.title("RMSE (mPa) of all the methods." , fontsize = 15 )
plt.show()
print ("Balloon " + str (i+ 1 ) + "... Done!" )
df = pd.DataFrame(all_correlation_coef, columns = ['RF' , 'Extra' , 'Boost' ] + corr_type + list (param.columns))
df.to_csv("C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Use2021_predict2019/Aggregation_with_3_Parameterization/Slides/all_corr_abs.csv" )
df_rmse = pd.DataFrame(all_rmse, columns = ['RF' , 'Extra' , 'Boost' ] + corr_type + list (param.columns))
df_rmse.to_csv("C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Use2021_predict2019/Aggregation_with_3_Parameterization/Slides/all_rmse_abs.csv" )
```