Aggregation of ML and Parametrization (2019)

Author

Installing and importing packages

Code
import sys 
import netCDF4
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde as kde
import pandas as pd
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns

Aggregation 1: Gradient COBRA

with, for example, \[K_h(x)=\exp(-\|x/h\|^2).\]

One needs to estimate the best smoothing parameter \(h\) for the aggregation.

Aggregation 2: MixCOBRA

with, for example, \[K_{a,b}(x)=\exp(-\|(x/a,y/b)\|^2).\]

One needs to estimate \(a\) and \(b\).

Aggregation 3: Super learner

Report result

The following result are:

  • The first graphic: time series of true gravity wave momentum fluxes and the three aggregation methods.
  • The second graphic: PDF of the shown time series (logarithmic scale on the y axis).
  • The barplot of correlations of the 3 ML methods and the 11 parametrizations.
Code
def av_resolution(resulution, y):
    a = len(y) // resulution
    r = len(y) % a
    if r < resulution/2:
        group = np.concatenate((np.repeat(list(range(0,a)), resulution), np.repeat(a-1, r)))
    else:
        group = np.concatenate((np.repeat(list(range(0,a)), resulution), np.repeat(a, r)))
    return y.groupby(by = group).mean()
Code
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error

version = 4
typ = 'abs'
# ml_coef = {
#     '1' : [0.56, 0.57, 0.58],
#     '2' : [0.70, 0.67, 0.74],
#     '3' : [0.45, 0.48, 0.49],
#     '4' : [0.44, 0.43, 0.47],
#     '5' : [0.51, 0.56, 0.55],
#     '6' : [0.72, 0.74, 0.75],
#     '7' : [0.51, 0.53, 0.48],
#     '8' : [0.74, 0.76, 0.72]
# }
cols = {'y_test': 'black', 'Agg1' : '#4095C2', 'Agg2': '#558C5E', 'Agg3': '#D45A56'}
index_bal_day = [0, 107, 210, 311, 377, 457, 515, 598, 675]
path0 = "C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Use2021_predict2019/models/output_ml_param_both_v2/output/"
path = "C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Use2021_predict2019/models/output_ml_param/output/new_result_2024/result_param3/"
path1 = "C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Use2021_predict2019/models/output_ml_param/output/"
ml_pred = xr.open_dataset(path + f'pred_ml_2019_{typ}.nc')
rf_pred0 = av_resolution(8, pd.DataFrame(ml_pred['rf'].values)).values.reshape(-1)
extra_pred0 = av_resolution(8, pd.DataFrame(ml_pred['extra'].values)).values.reshape(-1)
boost_pred0 = av_resolution(8, pd.DataFrame(ml_pred['boost'].values)).values.reshape(-1)
# subfig_title = ["ML","ML + Parametrization","ML + LMDz", "Parametrization"]
subfig_title = ["ML","ML + 3 Params","3 Params", 'Agg_Param_All_No_OldCMAM']
fig = make_subplots(
    rows=2, cols=2,
    specs=[[{}, {}],[{}, {}]],
    horizontal_spacing = 0.075,
    vertical_spacing= 0.09,
    subplot_titles=("Among MLs","MLs with 3 Params","Among the 3 Params", "Among all params except 'Old CMAM'"))

fig_scatter = make_subplots(
    rows=2, cols=2,
    specs=[[{}, {}],[{}, {}]],
    horizontal_spacing = 0.075,
    vertical_spacing= 0.09,
    subplot_titles=("Among MLs","MLs with 3 Params","Among the 3 Params", "Among all params except 'Old CMAM'"))
sns.set()
first = True
rs = {1 : 1, 2: 1, 3: 2, 4: 2}
cs = {1 : 1, 2: 2, 3: 1, 4: 2}

all_correlation_coef = []
all_rmse = []

for i in range(8): 
    df_ml = xr.open_dataset(path + f'pred_bal{i+1}_u1h_{typ}_ML_v6.nc')
    df_ml_param = xr.open_dataset(path + f'pred_bal{i+1}_u1h_{typ}_ML_Param_3M.nc')
    df_param = xr.open_dataset(path + f'pred_bal{i+1}_u1h_{typ}_Param_No_OldCMAM_v6.nc')
    df_ml_lmdz = xr.open_dataset(path + f'pred_bal{i+1}_u1h_{typ}_Param_3M.nc') 
    all_pred = {'ml' : df_ml, 'param' : df_param, 'ml_param' : df_ml_param, 'ml_lmdz' : df_ml_lmdz}
    k = 1
    show = True
    fig.data = []
    fig_scatter.data = []
    f, axs = plt.subplots(nrows=2, ncols=2, figsize=(8, 6))
    f.tight_layout(pad=2.0)
    corr_agg = []
    rmse_agg = []
    corr_type = []
    agg_type = []
    jj = 1
    nam = {'ml': 'Agg_ML', 'ml_param': 'ML_Param_3M', 'ml_lmdz': 'Param_3M', 'param': 'Agg_Param_All_No_OldCMAM'}
    for j in ['ml', 'ml_param', 'ml_lmdz', 'param']:
        pred1 = 1000*av_resolution(8, pd.DataFrame(all_pred[j].variables['agg1'][:].values)).values.reshape(-1)
        pred2 = 1000*av_resolution(8, pd.DataFrame(all_pred[j].variables['agg2'][:].values)).values.reshape(-1)
        pred3 = 1000*av_resolution(8, pd.DataFrame(all_pred[j].variables['agg3'][:].values)).values.reshape(-1)
        # pred3 = av_resolution(8, pd.DataFrame(all_pred[j].variables['agg3'][:].values)).values.reshape(-1)
        y_test1 = 1000*av_resolution(8, pd.DataFrame(all_pred[j].variables['y_test'][:].values)).values.reshape(-1)

        rmse1 = np.round(np.sqrt(mean_squared_error(y_test1, pred1)), 5)
        coef1 = np.round(np.corrcoef(y_test1, pred1)[0,1], 2)

        rmse2 = np.round(np.sqrt(mean_squared_error(y_test1, pred2)), 5)
        coef2 = np.round(np.corrcoef(y_test1, pred2)[0,1], 2)

        rmse3 = np.round(np.sqrt(mean_squared_error(y_test1, pred3)), 5)
        coef3 = np.round(np.corrcoef(y_test1, pred3)[0,1], 2)
        corr_agg.extend([coef1, coef2, coef3])
        rmse_agg.extend([rmse1, rmse2, rmse3])
        # print(rmse_agg)
        corr_type.extend([nam[j]+'1', nam[j]+'2', nam[j]+'3'])
        agg_type.extend(['Agg1', 'Agg2', 'Agg3'])
        if j == 'ml':
            pred_rf = 1000*np.array(rf_pred0[index_bal_day[i]:index_bal_day[i+1]])
            pred_ex = 1000*np.array(extra_pred0[index_bal_day[i]:index_bal_day[i+1]])
            pred_boost = 1000*np.array(boost_pred0[index_bal_day[i]:index_bal_day[i+1]])
            coef_rf = np.round(np.corrcoef(y_test1, pred_rf)[0,1], 2)
            coef_ex = np.round(np.corrcoef(y_test1, pred_ex)[0,1], 2)
            coef_boost = np.round(np.corrcoef(y_test1, pred_boost)[0,1], 2)

            rmse_rf = np.round(np.sqrt(mean_squared_error(y_test1, pred_rf)),5)
            rmse_ex = np.round(np.sqrt(mean_squared_error(y_test1, pred_ex)),5)
            rmse_boost = np.round(np.sqrt(mean_squared_error(y_test1, pred_boost)), 5)
        aggregations = pd.DataFrame(
            {
                'True GWMF' : y_test1,
                'Agg 1 ({})'.format(coef1) : pred1,
                'Agg 2 ({})'.format(coef2) : pred2,
                'Agg 3 ({})'.format(coef3) : pred3
            })

        fig.add_trace(go.Scatter(
            x=list(range(1, len(y_test1)+1)), 
            y=y_test1, 
            name = "True",
            mode="lines",
            showlegend=show,
            line = dict(color = "#2474B4")),
                        row=rs[k], col=cs[k])
        fig.add_trace(go.Scatter(
            x=list(range(1, len(y_test1))), 
            y=pred1, 
            name = "Agg1",
            mode="lines",
            showlegend=show,
            line = dict(color = "#E09C39")),
                        row=rs[k], col=cs[k])
        fig.add_trace(go.Scatter(
            x=list(range(1, len(y_test1)+1)), 
            y=pred2, 
            name = "Agg2",
            mode="lines",
            showlegend=show,
            line = dict(color = "#52A23C")),
                        row=rs[k], col=cs[k])
        fig.add_trace(go.Scatter(
            x=list(range(1, len(y_test1)+1)), 
            y=pred3, 
            name = "Agg3",
            mode="lines",
            showlegend=show,
            line = dict(color = "#B83813")),
                        row=rs[k], col=cs[k])
        if k == 1:
            xaxis = 'xaxis'
            yaxis = 'yaxis'
        else:
            xaxis = 'xaxis' + str(k)
            yaxis = 'yaxis' + str(k)
        if k in [3,4]:
            fig['layout'][xaxis]['title'] = "Flight duration (day)"
        if k in [1,3]:
            fig['layout'][yaxis]['title'] = "GWMF (mPa)"
        # fig['layout'][yaxis].update(range=[0, np.max(np.max(pred1, pred2, pred3))])
        # Scatterplot
        fig_scatter.add_trace(go.Scatter(
            x=y_test1, 
            y=y_test1, 
            name = "True",
            mode="lines",
            opacity=0.75,
            showlegend=show,
            line = dict(color = "#2474B4")),
                        row=rs[k], col=cs[k])
        fig_scatter.add_trace(go.Scatter(
            x=y_test1, 
            y=pred1,
            name = "Agg1",
            mode="markers",
            opacity=0.75,
            showlegend=show,
            marker = dict(color = "#E09C39")),
                        row=rs[k], col=cs[k])
        fig_scatter.add_trace(go.Scatter(
            x=y_test1, 
            y=pred2, 
            name = "Agg2",
            mode="markers",
            opacity=0.75,
            showlegend=show,
            marker = dict(color = "#52A23C")),
                        row=rs[k], col=cs[k])
        fig_scatter.add_trace(go.Scatter(
            x=y_test1,
            y=pred3, 
            name = "Agg3",
            mode="markers",
            opacity=0.75,
            showlegend=show,
            marker = dict(color = "#B83813")),
                        row=rs[k], col=cs[k])
        if k in [3,4]:
            fig_scatter['layout'][xaxis]['title'] = "Reference GWMF (mPa)"
        if k in [1,3]:
            fig_scatter['layout'][yaxis]['title'] = "Prediction (mPa)"
        fig_scatter['layout'][yaxis]['range'] = [0, np.max([pred1, pred2, pred3])*1.1]
        sns.histplot(data=aggregations, ax = axs[rs[k]-1, cs[k]-1], legend=True, palette = list(cols.values()))
        # axs[rs[k]-1, cs[k]-1].set_yscale('log')
        axs[rs[k]-1, cs[k]-1].set_title(subfig_title[k-1])
        # axs[(k-1) // 2, (k-1) % 2].set_xticks(axs[(k-1) // 2, (k-1) % 2].get_xticks(), rotation = 30)
        if k == 1:
            show = False
        k += 1
    path1 = "C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Dataset/for_Sothea/phase1/"
    param = pd.read_csv(path1 + f"Pred_Balloon_{i+1}_Amplitude_hourly.DAT")
    param = param.drop(columns="old_CMAM")
    cor_param = [np.round(np.corrcoef(av_resolution(24, param[param.columns[i]]).values.reshape(-1), y_test1), 2)[0,1] for i in range(param.shape[1])]
    rmse_param = [np.round(np.sqrt(mean_squared_error(1000*av_resolution(24, param[param.columns[i]]).values.reshape(-1), y_test1)), 5) for i in range(param.shape[1])]
    agg_methods_ = np.repeat(['Agg_ML', 'Agg_ML_3Param', 'Agg_3Param', 'Agg_all_Param'], 3)
    bar_data = pd.DataFrame({
        'Corr' : [coef_rf, coef_ex, coef_boost] + corr_agg + cor_param ,
        'Method' : ['RF', 'Extra', 'Boost'] + corr_type + list(param.columns),
        'Type' : ['ML', 'ML', 'ML'] + agg_type + list(np.repeat('Param', param.shape[1]))
    })
    all_correlation_coef.append([coef_rf, coef_ex, coef_boost] + corr_agg + cor_param)
    fig.update_layout(title_text="Aggregations on balloon {}".format(i+1), width = 820, height = 600)
    fig.show()
    fig_scatter.update_layout(title_text="Scatterplots on {}".format(i+1), width = 820, height = 600)
    fig_scatter.show()
    plt.figure(figsize=(9,5))
    ax = sns.barplot(
        bar_data,
        x = "Method",
        y = "Corr",
        hue = 'Type'
    )
    ax.set_xticklabels(bar_data.Method, rotation=45, ha="right")
    ax.bar_label(ax.containers[0], rotation = 45)
    ax.bar_label(ax.containers[1], rotation = 45)
    ax.bar_label(ax.containers[2], rotation = 45)
    ax.bar_label(ax.containers[3], rotation = 45)
    ax.bar_label(ax.containers[4], rotation = 45)
    plt.title("Correlations of all the methods.", fontsize = 15)
    plt.show()
    rmse_data = pd.DataFrame({
        'RMSE' : np.array([rmse_rf, rmse_ex, rmse_boost] + rmse_agg + rmse_param),
        'Method' : ['RF', 'Extra', 'Boost'] + corr_type + list(param.columns),
        'Type' : ['ML', 'ML', 'ML'] + list(agg_methods_) + list(np.repeat('Param', param.shape[1]))
    })
    ax2 = sns.barplot(
        rmse_data,
        x = "Method",
        y = "RMSE",
        hue = 'Type'
    )
    ax2.set_xticklabels(rmse_data.Method, rotation=45, ha="right")
    ax2.bar_label(ax2.containers[0], rotation = 85)
    ax2.bar_label(ax2.containers[1], rotation = 85)
    ax2.bar_label(ax2.containers[2], rotation = 85)
    ax2.bar_label(ax2.containers[3], rotation = 85)
    ax2.bar_label(ax2.containers[4], rotation = 85)
    ax2.bar_label(ax2.containers[5], rotation = 85)
    ax2.set_ylabel("RMSE (mPa)", fontsize = 15)
    plt.title("RMSE (mPa) of all the methods.", fontsize = 15)
    plt.show()
    print("Balloon " + str(i+1) + "... Done!")

df = pd.DataFrame(all_correlation_coef, columns = ['RF', 'Extra', 'Boost'] + corr_type + list(param.columns))
df.to_csv("C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Use2021_predict2019/Aggregation_with_3_Parameterization/Slides/all_corr_abs.csv")

df_rmse = pd.DataFrame(all_rmse, columns = ['RF', 'Extra', 'Boost'] + corr_type + list(param.columns))
df_rmse.to_csv("C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Use2021_predict2019/Aggregation_with_3_Parameterization/Slides/all_rmse_abs.csv")

Balloon 1... Done!
Balloon 2... Done!
Balloon 3... Done!
Balloon 4... Done!
Balloon 5... Done!
Balloon 6... Done!
Balloon 7... Done!
Balloon 8... Done!