Building models using the 2019 campaign and predicting on the 2021 campaign

Installing and importing packages

Code
import sys 
import netCDF4
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde as kde
import pandas as pd
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler, MinMaxScaler

Importing data

Code
# 2019 campaign
path = 'C:/Users/Sothea Has/Postdoc_Sothea/Datasets/Rani_offline/offline_v9_Strateole_Sothea/offline_v9_Strateole_Sothea/hourly_data/'
# path = '/home/shas/Data/'
Input = netCDF4.Dataset(path + 'Input_ERA5_data_all_balloons.nc')
print('\n* Input data from analysis\n==========================')
print('- Number of observations: ' + str(Input.variables['time'].shape[0]))
print('- Variables: ' + str(list(Input.variables.keys())))

# Balloon data (Rani)
# -------------------------
print('\n* Hourly balloon data from Rani\n===============================')
Bal_data = netCDF4.Dataset(path + 'All_STRATEOLE2_Balloon_1hrs15min.nc')
print('- Number of observations: ' + str(Bal_data.variables['time'].shape[0]))
print('- Variables: ' + str(list(Bal_data.variables.keys())))

bal_levels = [49, 49, 51, 51, 51, 49, 49, 51]  # levels for balloons 1 to 8 (earlier alternative: [49, 51, 55, 53, 53, 49, 49, 51])
index_level = [int((n-1)/2) for n in bal_levels]
balloon_indices = [[1,2565], [2566,5036], [5037,7464], [7465,9056], [9057,10974], [10975,12356], [12357,14351], [14352,16197]]
balloon_sizes = [balloon_indices[i][1]-balloon_indices[i][0] + 1 for i in range(len(balloon_indices))]
print(sum(balloon_sizes))
# 2021 campaign
path2021 = "C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/Dataset/Strateole2_2021/"
Input2021 = netCDF4.Dataset(path2021 + "Input_ERA5_data_all_variables_balloons_ph2.nc")
Bal_data2 = netCDF4.Dataset(path2021 + 'All_STRATEOLE2_Balloon_ph2_1hrs15min.nc')

# balloon_index2021 = [1, 30, 328, 1076, 1787, 2887, 4027, 5087, 6378, 7994, 8905, 9785, 11009, 12710, 14062, 14695, 15186]
balloon_index2021 = [[1,30],[31,328],[329, 1076],[1077,1787],[1788,2987],[2988,4027],[4028,5087],[5088,6378],[6379,7994],[7995,8905],[8906,9785],[9786,11099],[11100,12710],[12711,14062],[14063,14695],[14696,15186]]
bal_size2 = [balloon_index2021[i][1]-balloon_index2021[i][0]+1 for i in range(len(balloon_index2021))]

* Input data from analysis
==========================
- Number of observations: 16197
- Variables: ['time', 'longitude', 'latitude', 'level', 'tp', 'lnsp', 'temp', 'vitu', 'vitv']

* Hourly balloon data from Rani
===============================
- Number of observations: 16197
- Variables: ['time', 'lon', 'lat', 'qdm_u', 'qdm_v', 'qdm_u_abs', 'qdm_v_abs', 'qdm_u_east', 'qdm_u_west', 'qdm_v_north', 'qdm_v_south', 'qdm_total', 'alt', 'sza', 'pressure']
16197

Select the data and parameters

Code
lmd = 0.6
bal_ = 1
resolution = 3
beta = 0.2
typ = 'u1h'
sub = "abs"
va_ = "qdm_u_" + sub
image_format = 'png'

tranf = (sub != 'west')

# Saving images
namefig = ['rf', 'extra', 'boost']
fignames = ['Random Forest', 'Extra Trees', 'Adaboost']
addname = 'fig_' + typ + '_bal' + str(bal_)  + '_3h_24h_' + sub + '.png'

Preparing data

Code
input_lookup = list(Input.variables.keys())
input_dict = {}
input_dict2 = {}
leng = len(Input.variables['time'])
for va in input_lookup:
    if va in ['lnsp']:
        input_dict[va] = np.array(Input.variables[va][:,2,2]).astype('float32')
        input_dict2[va] = np.array(Input2021.variables[va][:,2,2]).astype('float32')
    if va in ['temp', 'vitu', 'vitv']:
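        # model levels 51, 85, 111, 137; the stored arrays appear to keep every
        # other model level, so level n is read from array index (n-1)/2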
        for lev in [int((n-1)/2) for n in [51, 85, 111, 137]]:
            input_dict[va+str(int(2*lev+1))] = np.array(Input.variables[va][:, lev, 2, 2]).astype('float32')
            input_dict2[va+str(int(2*lev+1))] = np.array(Input2021.variables[va][:, lev, 2, 2]).astype('float32')
    if va == 'tp':
        input_dict['tp'] = np.array(Input.variables[va][:,2,2]).astype('float32')
        input_dict2['tp'] = np.array(Input2021.variables[va][:,2,2]).astype('float32')
input_dict['sza'] = Bal_data['sza'][:]
input_dict['tp_mean'] = np.mean(Input.variables['tp'][:,:,:], axis=(1,2))
input_dict['tp_sd'] = np.std(Input.variables['tp'][:,:,:], axis=(1,2))

input_dict2['sza'] = Bal_data2['sza'][:]
input_dict2['tp_mean'] = np.mean(Input2021.variables['tp'][:,:,:], axis=(1,2))
input_dict2['tp_sd'] = np.std(Input2021.variables['tp'][:,:,:], axis=(1,2))

# All inputs
X_u = pd.DataFrame(input_dict)
# Input for zonal wind (u)
u = pd.DataFrame({
    'qdm_u': Bal_data.variables['qdm_u'][:], 
    'qdm_u_abs': Bal_data.variables['qdm_u_abs'][:],
    'qdm_u_west': Bal_data.variables['qdm_u_west'][:],
    'qdm_u_east': Bal_data.variables['qdm_u_east'][:]})

print(X_u.shape)
print(u.shape)
print(X_u.columns)

# Inputs
X_u2 = pd.DataFrame(input_dict2)
# Input for zonal wind (u)
u2 = pd.DataFrame({
    'qdm_u': Bal_data2.variables['qdm_u'][:], 
    'qdm_u_abs': Bal_data2.variables['qdm_u_abs'][:],
    'qdm_u_west': Bal_data2.variables['qdm_u_west'][:],
    'qdm_u_east': Bal_data2.variables['qdm_u_east'][:]})
print(X_u2.shape)
print(u2.shape)
print(X_u2.columns)
(16197, 17)
(16197, 4)
Index(['tp', 'lnsp', 'temp51', 'temp85', 'temp111', 'temp137', 'vitu51',
       'vitu85', 'vitu111', 'vitu137', 'vitv51', 'vitv85', 'vitv111',
       'vitv137', 'sza', 'tp_mean', 'tp_sd'],
      dtype='object')
(15186, 17)
(15186, 4)
Index(['tp', 'lnsp', 'temp51', 'temp85', 'temp111', 'temp137', 'vitu51',
       'vitu85', 'vitu111', 'vitu137', 'vitv51', 'vitv85', 'vitv111',
       'vitv137', 'sza', 'tp_mean', 'tp_sd'],
      dtype='object')

Time resolution transformation (3h)

Code
# Identity placeholders for the target transformation and its inverse
def transform(y):
    return y

def reverse(y):
    return y

def modify_resolution(resolution, X, y):
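    # Coarsen the 8 balloons of the 2019 campaign to blocks of `resolution`
    # consecutive observations: X and y are block-averaged (a remainder forms
    # a shorter final block) and `balloon` records where each balloon starts.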
    group = []
    start = 0
    balloon = [0]
    for i in range(8):
        s = balloon_sizes[i] // resolution
        r = balloon_sizes[i] % resolution
        group = group + list(np.repeat(range(start, start + s), resolution)) +  list(np.repeat(start + s,r))
        if r == 0:
            start += s
        else:
            start += (s + 1)
        balloon.append(max(group)+1)
    x_ = X.groupby(by = group).mean()
    y_ = y.groupby(by = group).mean()
    return x_, y_, balloon

def modify_resolution2(resolution, X, y):
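    # Same as modify_resolution, but for the 16 balloons of the 2021 campaign.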
    group = []
    start = 0
    balloon = [0]
    for i in range(16):
        s = bal_size2[i] // resolution
        r = bal_size2[i] % resolution
        group = group + list(np.repeat(range(start, start + s), resolution)) +  list(np.repeat(start + s, r))
        if r == 0:
            start += s
        else:
            start += (s + 1)
        balloon.append(max(group)+1)
    x_ = X.groupby(by = group).mean()
    y_ = y.groupby(by = group).mean()
    return x_, y_, balloon

def av_resolution(resolution, y):
    # Average y over consecutive blocks of `resolution` observations; the
    # remainder is merged into the last block when it is short
    # (< resolution/2), otherwise it forms its own final block.
    a = len(y) // resolution
    r = len(y) % resolution
    if r < resolution / 2:
        group = np.concatenate((np.repeat(list(range(0, a)), resolution), np.repeat(a - 1, r)))
    else:
        group = np.concatenate((np.repeat(list(range(0, a)), resolution), np.repeat(a, r)))
    return y.groupby(by = group).mean()

def lag(va, lag, id, bal = 1, test = False):
    # Shift variable `va` backward by `lag` steps within each balloon, the
    # balloons being delimited by the cut indices `id` (the first `lag` values
    # of each balloon become NaN). With test=True only balloon `bal` is
    # returned; otherwise the remaining balloons are concatenated.
    if test:
        tem = x_.loc[id[bal-1]:id[bal],va]
        return np.concatenate([np.repeat(np.nan, lag), tem[:-lag]])
    else:
        tem = np.zeros(len(x_) - id[bal] + id[bal-1])
        st = 0
        for i in range(len(id)-1):
            if i != bal - 1:
                a = x_.loc[:,va][id[i]:id[i+1]]
                tem[st:(st+id[i+1]-id[i])] = np.concatenate([np.repeat(np.nan, lag), a[:-lag]])
                st += id[i+1]-id[i]
        return tem

x_, y_, cut_id = modify_resolution(resolution, X_u, u)

# Standardise with the statistics of the 2019 (3-h) training inputs, then
# apply the same scaling to the 2021 inputs
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_)
y_train = y_[va_]
x_test_scaled = scaler.transform(X_u2)
y_test = u2[va_]
x_test_new_resol, y_test_new_resol, id_new_resol = modify_resolution2(24, X_u2, u2)

Building models using the 2019 campaign data

We simply build the models using the optimal hyperparameters obtained from the tuning process on the 2019 campaign. In the sequel, the comparisons are done at 1-day (24 h) time resolution.
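For reference, the printed "RMSE" values below are normalised by the mean of the observed GWMF, so they are relative RMSEs. A minimal sketch of this metric (the helper name is ours, not from the source):

Code
import numpy as np

def relative_rmse(y_true, y_pred):
    # RMSE divided by the mean of the observations, matching the expression
    # np.sqrt(mean_squared_error(y, y_hat)) / np.mean(y) used below
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.sqrt(np.mean((y_true - y_pred) ** 2)) / y_true.mean()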

Code
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor

va = 'qdm_u_' + sub
# param_path = "C:/Users/Sothea Has/Postdoc_Sothea/Codes/Second_Part/New_loss/5th_ty_E2_Cor9_Beta/Beta025/output/"
param_path = "C:/Users/Sothea Has/Postdoc_Sothea/Codes/ERA5_to_QDM/Computation10/Parameters/output2/"
param = typ + "_parameters_bal" + str(bal_) + "_3h_24h" + sub + ".nc"
parameters = xr.open_dataset(param_path+param)

# data splitting
# kl_id = np.random.permutation(range(len(y_train)))
# k = len(kl_id)//2
# X_k, X_l, y_k, y_l = x_train_scaled.iloc[kl_id[:k],:].reset_index(drop='index'), x_train_scaled.iloc[kl_id[k:],:].reset_index(drop='index'), y_train.iloc[kl_id[:k]].reset_index(drop='index'), y_train.iloc[kl_id[k:]].reset_index(drop='index')

extra = ExtraTreesRegressor(
            n_estimators=int(parameters['n_estimators_extra']), 
            max_features=int(parameters['max_features_extra']), 
            min_samples_leaf=int(parameters['min_samples_leaf_extra'])
        ).fit(x_train_scaled, y_train)
rf = RandomForestRegressor(
        n_estimators=int(parameters['n_estimators_rf']), 
        max_features=int(parameters['max_features_rf']), 
        min_samples_leaf=int(parameters['min_samples_leaf_rf'])
    ).fit(x_train_scaled, y_train)
boost = AdaBoostRegressor(
            estimator=DecisionTreeRegressor(max_depth=int(parameters['max_depth_boost']), 
            max_features=int(parameters['max_features_boost'])), 
            n_estimators=int(parameters['n_estimators_boost']),
            learning_rate=1
        ).fit(x_train_scaled, y_train)
ex_pred, rf_pred, boo_pred = extra.predict(x_test_scaled), rf.predict(x_test_scaled), boost.predict(x_test_scaled)

Predictions on the 2021 campaign

We take the models built entirely on the 2019 data and use them to predict the 2021 campaign.

Code
y_test1 = av_resolution(24, pd.DataFrame(y_test))
ex_pred1 = av_resolution(24, pd.DataFrame(ex_pred))
rf_pred1 = av_resolution(24, pd.DataFrame(rf_pred))
boo_pred1 = av_resolution(24, pd.DataFrame(boo_pred))

ex_cor = np.round(np.corrcoef(ex_pred1.values.reshape(-1), y_test1.values.reshape(-1))[0][1], 3)
rf_cor = np.round(np.corrcoef(rf_pred1.values.reshape(-1), y_test1.values.reshape(-1))[0][1], 3)
boo_cor = np.round(np.corrcoef(boo_pred1.values.reshape(-1), y_test1.values.reshape(-1))[0][1], 3)

names = {'abs' : "absolute", 'east' : "eastward", 'west': "westward"}

fig = go.Figure(data = [go.Scatter(
        x = list(range(1, len(y_test1)+1)),
        y = y_test1.values.reshape(-1),
        name = "True "+ names[sub] +" GWMF 2021",
        showlegend=True,
        mode = "lines",
        line = dict(color = "black"))])
fig.add_trace(go.Scatter(x = list(range(1, len(y_test1)+1)),
                 y = rf_pred1.values.reshape(-1),
                 name = "Random forest ({})".format(rf_cor),
                 showlegend=True,
                 mode = "lines",
                 line = dict(color = "#9002F8")))
fig.add_trace(go.Scatter(x = list(range(1, len(y_test1)+1)),
                 y = ex_pred1.values.reshape(-1),
                 name = "Extra-trees ({})".format(ex_cor),
                 showlegend=True,
                 mode = "lines",
                 line = dict(color = "#0F2BF8")))
fig.add_trace(go.Scatter(x = list(range(1, len(y_test1)+1)),
                 y = boo_pred1.values.reshape(-1),
                 name = "Adaboost ({})".format(boo_cor),
                 showlegend=True,
                 mode = "lines",
                 line = dict(color = "#FD0F0B")))
fig.update_layout(title_text = "True vs predicted GWMF",
                width = 800,
                height = 500)
col_map = ["#566573", "white"]
for i in range(16):
    fig.add_vrect(
    x0=id_new_resol[i], 
    x1 = id_new_resol[i+1],
    annotation = dict(text = str(i+2)),
    fillcolor = col_map[i%2],
    opacity = 0.2,
    line = dict(width = 0.5, color = "black")
)
fig.show()

print("\n\n\n* RMSE of (RF, Extra, Adaboost) = ({}, {}, {})".format(
    np.round(np.sqrt(mean_squared_error(y_test1, rf_pred1))/np.mean(y_test1), 3),
    np.round(np.sqrt(mean_squared_error(y_test1, ex_pred1))/np.mean(y_test1),3),
    np.round(np.sqrt(mean_squared_error(y_test1, boo_pred1))/np.mean(y_test1), 3)
))
print("\n\n\n* Correlation of (RF, Extra, Adaboost) = ({}, {}, {})".format(
    rf_cor,
    ex_cor,
    boo_cor
))



* RMSE of (RF, Extra, Adaboost) = (0.636, 0.637, 0.677)



* Correlation of (RF, Extra, Adaboost) = (0.498, 0.495, 0.48)

Aggregate model 1: Gradient COBRA

This aggregation method requires some observations from the new source to fine-tune its key parameter. We use the first three balloons, comprising 1077 observations at the 1-h time resolution, to estimate the bandwidth parameter of the aggregation. The comparison on the remaining data, however, is done at daily time resolution.
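Conceptually, COBRA-type aggregation predicts with a kernel-weighted average of the calibration targets, where proximity is measured between the machines' predictions. Below is a minimal sketch of this idea with a Gaussian (radial) kernel; it only illustrates the mechanism and is not the gradientcobra implementation, which also optimises the bandwidth:

Code
import numpy as np

def radial_kernel_aggregate(pred_cal, y_cal, pred_new, bandwidth):
    # pred_cal, pred_new: (n, M) and (m, M) arrays of the M machines'
    # predictions; y_cal: the n calibration targets. Each new point is
    # predicted by a Gaussian kernel-weighted average of y_cal, with weights
    # driven by closeness in prediction space.
    d2 = ((pred_new[:, None, :] - pred_cal[None, :, :]) ** 2).sum(axis=-1)
    w = np.exp(-d2 / (2 * bandwidth ** 2))
    return (w @ np.asarray(y_cal)) / np.maximum(w.sum(axis=1), 1e-300)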

Code
from gradientcobra.gradientcobra import GradientCOBRA
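# Stack the three models' hourly 2021 predictions: columns are RF,
# Extra-Trees, Adaboost. NB: the `* 5` rescaling below is applied
# consistently at fit and predict time.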
Pred_agg = np.column_stack([rf_pred, ex_pred, boo_pred])
n_size = balloon_index2021[3][0]
bal_sect = np.array(id_new_resol[3:]) - id_new_resol[3]
gc = GradientCOBRA(
    opt_method="grid",
    bandwidth_list=np.linspace(0.001, 10, 500)).fit(
    X=Pred_agg[:n_size,:] * 5, 
    y = y_test.values.reshape(-1)[:n_size])
gc.draw_learning_curve()
gc_pred = gc.predict(X=Pred_agg[n_size:,:] * 5)

gc_pred1 = av_resolution(24, pd.DataFrame(gc_pred))
y_test2 = av_resolution(24, pd.DataFrame(y_test.values.reshape(-1)[n_size:]))
fig = go.Figure(data = [go.Scatter(
        x = list(range(1, len(y_test2) + 1)),
        y = y_test2.values.reshape(-1),
        name = "True "+ names[sub] +" GWMF 2021",
        showlegend=True,
        mode = "lines",
        line = dict(color = "black"))])

ag_cor1 = np.round(np.corrcoef(y_test2.values.reshape(-1), gc_pred1.values.reshape(-1))[0,1],3)

fig.add_trace(go.Scatter(x = list(range(1, len(y_test2) + 1)),
                 y = gc_pred1.values.reshape(-1),
                 name = "Aggregation 1 ({})".format(ag_cor1),
                 showlegend=True,
                 mode = "lines",
                 line = dict(color = "#FD0F0B")))
fig.update_layout(title_text = "True vs aggregation", width = 800, height = 500)
col_map = ["#566573", "white"]
fig.add_vrect(
    x0=bal_sect[0],
    x1 = bal_sect[1],
    annotation = dict(text = "Bal-4", align = "center"),
    fillcolor = "white",
    opacity = 0.1,
    line = dict(width = 0)
)

for i in range(12):
    fig.add_vrect(
    x0=bal_sect[i+1], 
    x1 = bal_sect[i+2],
    annotation = dict(text = str(i+5)),
    fillcolor = col_map[i%2],
    opacity = 0.2,
    line = dict(width = 0.5, color = "black")
)
fig.show()
print("\n\n\n* RMSE = {}".format(np.round(np.sqrt(mean_squared_error(gc_pred1, y_test2))/np.mean(y_test2),3)))
print("\n\n\n* Corr = {}".format(ag_cor1))

    -> Grid search algorithm with radial kernel is in progress...
        ~ Full process|--------------------------------------------------|100%
        ~   Processing|==================================================|100%



* RMSE = 0.641



* Corr = 0.504

Aggregate model 2: MixCOBRA

This is another aggregation method, one that also takes the input features into account in addition to the individual predictions.
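Conceptually, MixCOBRA extends the previous scheme by weighting the calibration targets by proximity in both the input space (parameter alpha) and the prediction space (parameter beta). A minimal sketch with Gaussian kernels, again illustrative rather than the library's exact implementation:

Code
import numpy as np

def mix_kernel_aggregate(x_cal, pred_cal, y_cal, x_new, pred_new, alpha, beta):
    # Gaussian weights combining input-space distances (scaled by alpha) with
    # prediction-space distances (scaled by beta).
    dx2 = ((x_new[:, None, :] - x_cal[None, :, :]) ** 2).sum(axis=-1)
    dp2 = ((pred_new[:, None, :] - pred_cal[None, :, :]) ** 2).sum(axis=-1)
    w = np.exp(-(alpha * dx2 + beta * dp2))
    return (w @ np.asarray(y_cal)) / np.maximum(w.sum(axis=1), 1e-300)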

Code
from gradientcobra.mixcobra import MixCOBRARegressor
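# As with Gradient COBRA above, inputs and predictions are rescaled
# (`* 1000`) consistently at fit and predict time.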
gc2 = MixCOBRARegressor(
    alpha_list=np.linspace(0.0001, 1000, 100),
    beta_list=np.linspace(0.0001, 30, 100)).fit(
    X=x_test_scaled[:n_size,:] * 1000,
    Pred_features=Pred_agg[:n_size,:] * 1000,
    y=y_test.values.reshape(-1)[:n_size])
gc2.draw_learning_curve()
gc_pred2 = gc2.predict(X=x_test_scaled[n_size:,:] * 1000,
                       Pred_X = Pred_agg[n_size:,:]* 1000)
gc_pred2 = av_resolution(24, pd.DataFrame(gc_pred2))
fig = go.Figure(data = [go.Scatter(
        x = list(range(1, len(y_test2) + 1)),
        y = y_test2.values.reshape(-1),
        name = "True "+ names[sub] +" GWMF 2021",
        showlegend=True,
        mode = "lines",
        line = dict(color = "black"))])
ag_cor2 = np.round(np.corrcoef(y_test2.values.reshape(-1), gc_pred2.values.reshape(-1))[0,1],3)
fig.add_trace(go.Scatter(x = list(range(1, len(y_test2) + 1)),
                 y = gc_pred2.values.reshape(-1),
                 name = "Aggregation 2 ({})".format(ag_cor2),
                 showlegend=True,
                 mode = "lines",
                 line = dict(color = "#FD0F0B")))
fig.update_layout(title_text = "True vs aggregation", width = 800, height = 500)
for i in range(12):
    fig.add_vrect(
    x0=bal_sect[i+1], 
    x1 = bal_sect[i+2],
    annotation = dict(text = str(i+5)),
    fillcolor = col_map[i%2],
    opacity = 0.2,
    line = dict(width = 0.5, color = "black")
)
fig.show()
print("\n\n\n* RMSE = {}".format(np.round(np.sqrt(mean_squared_error(gc_pred2, y_test2))/np.mean(y_test2),3)))
print("\n\n\n* Corr = {}".format(ag_cor2))

    * Grid search algorithm of two parameters with radial kernel is in progress...
        ~ Full process|--------------------------------------------------|100%
        ~   Processing|==================================================|100%



* RMSE = 0.729



* Corr = 0.369