import kagglehub
# Download latest version
path = kagglehub.dataset_download("mahmoudreda55/satellite-image-classification")
TP6 - Clustering
Course: Advanced Machine Learning
Lecturer: Sothea HAS, PhD
Objective: Clustering is an unsupervised learning method that aims to group data into clusters based on their similarities. In this TP, we will use various clustering algorithms we have seen to solve practical tasks such as image and data segmentation.
- The notebook of this TP can be downloaded here: TP6_Clustering.ipynb.
1. Satellite Image Segmentation
A. Assembling data
- Download satellite images from the following Kaggle repository: Satellite Images.
- There are four folders of different areas captured by satellite images:
  - cloudy (\(1500\times 256\times 256\))
  - desert (\(1131\times 256\times 256\))
  - green_area (\(1500\times 64\times 64\))
  - water (\(1500\times 64\times 64\))
- Assemble these four types of images (convert them to \(64\times 64\) resolution) and save the result as satellite_images.npy. You may find the following libraries useful: cv2, glob, PIL.
import cv2
import glob
from PIL import Image

folder_names = ['cloudy', 'desert', 'green_area', 'water']
ext = ['jpg'] # Add image formats here

resized_images = []
for name in folder_names:
    imdir = path + '/data/' + name + '/'
    files = []
    [files.extend(glob.glob(imdir + '*.' + e)) for e in ext]
    images = [cv2.imread(file) for file in files]
    # Resize images
    if images[0].shape[0] == 256:
        resized_images.extend([img[::4, ::4, :] for img in images])
    else:
        resized_images.extend(images)

len(resized_images)
5631
# Save it
import numpy as np
# data = np.array(resized_images)
# np.save(path + '/data/satellite_images.npy', data)
data = np.load(path + '/data/satellite_images.npy')
label = np.repeat(folder_names, (1500, 1131, 1500, 1500))
data.shape
(5631, 64, 64, 3)
B. Clustering.
- Load the assembled data and perform different clustering algorithms on the data.
- Detect the optimal number of clusters. Is the result reasonable?
- Explore if the clustering algorithms cluster images into their real categories.
import matplotlib.pyplot as plt
import numpy as np

_, axs = plt.subplots(3, 4, figsize=(10, 8))
for i in range(12):
    ids = np.random.choice(5631, 1)
    axs[i//4, i%4].imshow(data[ids, :, :, :].reshape(64, 64, 3))
    axs[i//4, i%4].axis('off')
    axs[i//4, i%4].set_title(label[ids][0])
plt.show()
data = data.reshape(-1, 64*64*3)
data.shape

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)
KMeans algorithm
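As a reminder, the quantity tracked below (KMeans' `inertia_`) is the within-cluster sum of squares (WSS):

\[
\mathrm{WSS}(K) \;=\; \sum_{k=1}^{K} \sum_{x_i \in C_k} \lVert x_i - \mu_k \rVert^2,
\]

where \(C_k\) is the \(k\)-th cluster and \(\mu_k\) its centroid. Since WSS always decreases as \(K\) grows, we look for an elbow in the curve rather than a minimum.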
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering

k_list = list(range(1, 7))
wss = []
sh_avg = []
for k in k_list:
    km = KMeans(n_clusters=k)
    km = km.fit(data_scaled)
    wss.append(km.inertia_)
    clusters = km.labels_
import pandas as pd
df_wss = pd.DataFrame({
    "K": k_list,
    "WSS": wss
})

import seaborn as sns
sns.lineplot(df_wss, x="K", y="WSS", markers=True)
plt.scatter(x=df_wss.K, y=df_wss.WSS)
plt.title("WSS as a function of the number of clusters")
plt.show()
# Let's detect the elbow
dif = np.diff(wss)
id_opt = np.argmax(dif[:-1]/dif[1:]) + 1
print(f'The optimal number of clusters is {k_list[id_opt]}')

The optimal number of clusters is 2
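The heuristic above selects the elbow as the \(K\) whose incoming drop in WSS is largest relative to the drop just after it:

\[
K^{*} \;=\; \arg\max_{K}\; \frac{\mathrm{WSS}(K-1)-\mathrm{WSS}(K)}{\mathrm{WSS}(K)-\mathrm{WSS}(K+1)},
\]

which here gives \(K^{*}=2\): the curve flattens sharply after two clusters.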
Silhouette Score
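For reference, the silhouette coefficient of a point \(i\) compares its mean distance to its own cluster with its mean distance to the nearest other cluster:

\[
s(i) \;=\; \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}},
\]

where \(a(i)\) is the mean distance from \(i\) to the other points of its cluster and \(b(i)\) is the smallest mean distance from \(i\) to the points of any other cluster. Values close to 1 indicate well-separated clusters; the average of \(s(i)\) over all points is what `silhouette_score` returns.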
from sklearn.metrics import silhouette_samples, silhouette_score

km = KMeans(n_clusters=2, max_iter=100, n_init=2)
km = km.fit(data_scaled)
clusters = km.labels_
silhouette_avg = silhouette_score(data_scaled, clusters)
sample_silhouette_values = silhouette_samples(data_scaled, clusters)

# Plot silhouette scores
fig, ax1 = plt.subplots(1, 1, figsize=(6, 3))
y_lower = 10
for k in range(km.n_clusters):
    ith_cluster_silhouette_values = sample_silhouette_values[clusters == k]
    ith_cluster_silhouette_values.sort()
    size_cluster_k = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_k
    ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values)
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_k, str(k+1))
    y_lower = y_upper + 10
ax1.set_title("Silhouette coefficients for K=2")
ax1.set_xlabel("Silhouette coefficient values")
ax1.set_ylabel("Cluster label")
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
plt.show()
- We shall check whether \(K=2\) is also the optimal number of clusters according to the Silhouette score:
sh_avg = []
for k in k_list[1:]:
    km = KMeans(n_clusters=k)
    km = km.fit(data_scaled)
    clusters = km.labels_
    silh_avg = silhouette_score(data_scaled, clusters)
    sh_avg.append(silh_avg)

import seaborn as sns
df_wss = pd.DataFrame({
    "K": k_list[1:],
    "Silhouette Coefficients": sh_avg
})
sns.lineplot(df_wss, x="K", y="Silhouette Coefficients", markers=True)
plt.scatter(x=df_wss.K, y=df_wss['Silhouette Coefficients'])
plt.title("Silhouette Coefficients as a function of the number of clusters")
plt.show()
Based on the above graph, \(K=2\) also maximizes the Silhouette score, which suggests that it is a suitable number of clusters. We can inspect the composition of each cluster by counting the image types it contains, as follows.
# Re-fit with K = 2 (the variable `clusters` was overwritten by the loop above)
km = KMeans(n_clusters=2)
km = km.fit(data_scaled)
clusters = km.labels_

_, axs = plt.subplots(1, 2, figsize=(7, 2))
df_label = pd.DataFrame({'label': label})

sns.countplot(df_label.loc[clusters == 0], x="label", ax=axs[0])
axs[0].set_title("Type of images in cluster 1")

sns.countplot(df_label.loc[clusters == 1], x="label", ax=axs[1])
axs[1].set_title("Type of images in cluster 2")

plt.show()
Let's look at the water views that were wrongly grouped with the green areas.
_, axs = plt.subplots(2, 3, figsize=(10, 6))
wrong_1 = data[(km.labels_ == 1) & (label == "water"), :]
for i in range(6):
    axs[i//3, i%3].imshow(wrong_1[i, :].reshape(64, 64, 3))
Here are desert views that were wrongly clustered with cloudy skies.
_, axs = plt.subplots(2, 3, figsize=(10, 6))
wrong_1 = data[(km.labels_ == 0) & (label == "desert"), :]
for i in range(6):
    axs[i//3, i%3].imshow(wrong_1[i, :].reshape(64, 64, 3))
- Finally, we can cluster the images into 4 classes and see whether images from the same group are assigned to the same cluster.
km = KMeans(n_clusters=4)
km = km.fit(data_scaled)
clusters = km.labels_

_, axs = plt.subplots(1, 4, figsize=(12, 3))
df_label = pd.DataFrame({'label': label})

for i in range(4):
    sns.countplot(df_label.loc[clusters == i], x="label", ax=axs[i])
    axs[i].set_title(f"Type of images in cluster {i}")
    axs[i].tick_params(axis='x', labelrotation=45)

plt.tight_layout()
plt.show()
It is clear that KMeans does not cluster the images into their real types.
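This mismatch can also be quantified rather than judged from the bar plots alone. One option (not part of the original output above) is the adjusted Rand index, which is close to 0 when a partition agrees with the true labels no better than chance, together with a contingency table:

# Quantify agreement between the K = 4 partition and the true image types
from sklearn.metrics import adjusted_rand_score

print(f"Adjusted Rand index: {adjusted_rand_score(label, clusters):.3f}")

# Contingency table: true types (rows) vs. clusters (columns)
print(pd.crosstab(label, clusters, rownames=["type"], colnames=["cluster"]))

The other algorithms imported earlier, such as AgglomerativeClustering, can be substituted for KMeans in the same pipeline to address the "different clustering algorithms" part of the task.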
C. Predictive Models
- Create a target of four categories \(y=\) [‘cloudy’, ‘desert’, ‘forest’, ‘water’].
- Randomly select 10% of each category and store them as test data.
- Train ML models to predict the category of images.
- Report the accuracy of the models.
# Train-test split
from sklearn.model_selection import train_test_split, GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(
    data.reshape(data.shape[0], -1)/np.max(data), label, test_size=0.1, random_state=42)
print(X_train.shape)
(5067, 12288)
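Note that the task asks for 10% of each category, while the plain random split above only approximates that. If an exact per-category split is preferred, a stratified split is a minimal alternative (same variable names; this is not what produced the results below):

# Stratified 10% test split: each category contributes 10% of its images
X_train, X_test, y_train, y_test = train_test_split(
    data.reshape(data.shape[0], -1)/np.max(data), label,
    test_size=0.1, random_state=42, stratify=label)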
We will explore some models, including:
- KNN
- Random Forest
- Extra-trees
- DNN
# KNN
from sklearn.neighbors import KNeighborsClassifier

param = {
    "n_neighbors": np.arange(100, 151, 5, dtype=int)
}
knn = KNeighborsClassifier()
grid_cv = GridSearchCV(knn, param, cv=10, scoring="neg_log_loss")
grid_cv = grid_cv.fit(X_train, y_train)
print(f'Optimal number of neighbors: {grid_cv.best_params_}')

y_hat = grid_cv.best_estimator_.predict(X_test)
print(f'Test accuracy: {np.mean(y_test == y_hat)}')

df_acc = pd.DataFrame({
    str(grid_cv.best_params_['n_neighbors'])+"NN": [np.mean(y_test == y_hat)]
}, index=["Accuracy"])
df_acc
Optimal number of neighbors: {'n_neighbors': 125}
Test accuracy: 0.900709219858156
|          | 125NN    |
|----------|----------|
| Accuracy | 0.900709 |
# Random Forest & Extra-trees
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

param = {
    "n_estimators": [100, 300, 500],
    "max_features": [40, 50, 70, 100],
    "min_samples_leaf": [5, 10, 20, 30, 50]
}

# mask = np.random.choice([True, False], replace=True, p=[0.1, 0.99], size=len(y_train))
rf = RandomForestClassifier()
grid_cv = GridSearchCV(rf, param, cv=10, scoring="neg_log_loss")
grid_cv = grid_cv.fit(X_train, y_train)
print(f'Optimal parameters: {grid_cv.best_params_}')

y_hat = grid_cv.best_estimator_.predict(X_test)
print(f'Test accuracy: {np.mean(y_test == y_hat)}')

df_acc = pd.concat([df_acc, pd.DataFrame({
    "RF": [np.mean(y_test == y_hat)]
}, index=["Accuracy"])], axis=1)
df_acc
Optimal parameters: {'max_features': 50, 'min_samples_leaf': 30, 'n_estimators': 100}
Test accuracy: 0.9148936170212766
|          | 125NN    | RF       |
|----------|----------|----------|
| Accuracy | 0.900709 | 0.914894 |
# Extra-trees
# mask = np.random.choice([True, False], replace=True, p=[0.01, 0.99], size=len(y_train))
ex_tr = ExtraTreesClassifier()
grid_cv = GridSearchCV(ex_tr, param, cv=10, scoring="neg_log_loss")
grid_cv = grid_cv.fit(X_train, y_train)
print(f'Optimal parameters: {grid_cv.best_params_}')

y_hat = grid_cv.best_estimator_.predict(X_test)
print(f'Test accuracy: {np.mean(y_test == y_hat)}')

df_acc = pd.concat([df_acc, pd.DataFrame({
    "Ex-trees": [np.mean(y_test == y_hat)]
}, index=["Accuracy"])], axis=1)
df_acc
Optimal parameters: {'max_features': 50, 'min_samples_leaf': 30, 'n_estimators': 100}
Test accuracy: 0.9131205673758865
|          | 125NN    | RF       | Ex-trees |
|----------|----------|----------|----------|
| Accuracy | 0.900709 | 0.914894 | 0.913121 |
# XGboost
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from itertools import product

param_grid = {
    'colsample_bytree': [0.2, 0.5, 0.8],
    'max_depth': [10, 15, 20],
    'n_estimators': [200, 500]
}

# XGBoost needs numeric targets
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

n_cv = 5
kf = KFold(n_splits=n_cv, shuffle=True, random_state=42)

# Perform the parameter search manually
list_params = list(product(*param_grid.values()))
loss_cv = np.zeros(shape=(len(list_params),))
j = 1
for train_index, test_index in kf.split(X_train):
    X_tr, X_te = X_train[train_index, :], X_train[test_index, :]
    y_tr, y_te = y_train_encoded[train_index], y_train_encoded[test_index]
    loss_ = np.zeros(shape=(len(list_params),))
    for i, params in enumerate(list_params):
        param_dict = dict(zip(param_grid.keys(), params))
        model = XGBClassifier(**param_dict)
        model.fit(X_tr, y_tr)
        # Validation log-loss of this parameter combination on the held-out fold
        loss_[i] = log_loss(y_te, model.predict_proba(X_te))
    loss_cv = loss_cv + loss_
    print(f"* Fold: {j} / {n_cv}")
    j += 1
loss_cv /= n_cv

opt_param = dict(zip(param_grid.keys(), list_params[np.argmin(loss_cv)]))
print(opt_param)

# Refit with the selected parameters and evaluate on the test set
model = XGBClassifier(**opt_param)
model = model.fit(X_train, y_train_encoded)
y_hat = model.predict(X_test)
print(f"Accuracy: {np.mean(y_test_encoded == y_hat)}")

df_acc = pd.concat([df_acc, pd.DataFrame({
    "XGboost": [np.mean(y_test_encoded == y_hat)]
}, index=["Accuracy"])], axis=1)
DNN model
# This is an example with Keras
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

onehot = OneHotEncoder()
y_train_encoded = onehot.fit_transform(y_train.reshape(-1, 1)).toarray()
y_test_encoded = onehot.transform(y_test.reshape(-1, 1)).toarray()

from keras.callbacks import Callback

# Input dimension
d = X_train.shape[1]

model = Sequential()
model.add(Input(shape=(d,)))

# Hidden layers and softmax output
model.add(Dense(128, activation="relu"))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(4, activation="softmax"))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Only print every N epochs
class custom_callback(Callback):
    def __init__(self, N):
        super(custom_callback, self).__init__()
        self.N = N
    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.N == 0:
            print(f'Epoch {epoch + 1}: loss = {logs["loss"]}, accuracy = {logs["accuracy"]}')

print_callback = custom_callback(200)
history = model.fit(X_train, y_train_encoded, epochs=1000, batch_size=256,
                    validation_split=0.1, verbose=0, callbacks=[print_callback])

train_loss = history.history['loss']
val_loss = history.history['val_loss']
Epoch 200: loss = 0.273344486951828, accuracy = 0.8901315927505493
Epoch 400: loss = 0.23533892631530762, accuracy = 0.9059210419654846
Epoch 600: loss = 0.1987672746181488, accuracy = 0.9179824590682983
Epoch 800: loss = 0.17246949672698975, accuracy = 0.9344298243522644
Epoch 1000: loss = 0.15364068746566772, accuracy = 0.9445175528526306
import plotly.graph_objs as go

# Plot the learning curves
epochs = list(range(1, len(train_loss) + 1))
fig1 = go.Figure(go.Scatter(x=epochs, y=train_loss, name="Training loss"))
fig1.add_trace(go.Scatter(x=epochs, y=val_loss, name="Validation loss"))
fig1.update_layout(title="Training and Validation Loss",
                   width=800, height=500,
                   xaxis=dict(title="Epoch", type="log"),
                   yaxis=dict(title="Loss"))
fig1.show()
y_hat = model.predict(X_test).argmax(axis=1)
y_test_reverse = np.argmax(y_test_encoded, axis=1)
test_acc = np.mean(y_hat == y_test_reverse)
print(f"Accuracy: {test_acc}")
18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step
Accuracy: 0.9184397163120568
df_acc = pd.concat([df_acc, pd.DataFrame({
    "DNN": [test_acc]
}, index=["Accuracy"])], axis=1)
df_acc
|          | 125NN    | RF       | Ex-trees | DNN     |
|----------|----------|----------|----------|---------|
| Accuracy | 0.900709 | 0.914894 | 0.913121 | 0.91844 |
2. Revisit Spam dataset
Task: Perform clustering algorithms on the Spam dataset. Can clustering algorithms distinguish spam from non-spam emails based on their characteristics?
import pandas as pd

path = "https://raw.githubusercontent.com/hassothea/MLcourses/main/data/spam.txt"
data = pd.read_csv(path, sep=" ")
data.head(5)
| | Id | make | address | all | num3d | our | over | remove | internet | order | ... | charSemicolon | charRoundbracket | charSquarebracket | charExclamation | charDollar | charHash | capitalAve | capitalLong | capitalTotal | type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.00 | 0.64 | 0.64 | 0.0 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.000 | 0.0 | 0.778 | 0.000 | 0.000 | 3.756 | 61 | 278 | spam |
1 | 2 | 0.21 | 0.28 | 0.50 | 0.0 | 0.14 | 0.28 | 0.21 | 0.07 | 0.00 | ... | 0.00 | 0.132 | 0.0 | 0.372 | 0.180 | 0.048 | 5.114 | 101 | 1028 | spam |
2 | 3 | 0.06 | 0.00 | 0.71 | 0.0 | 1.23 | 0.19 | 0.19 | 0.12 | 0.64 | ... | 0.01 | 0.143 | 0.0 | 0.276 | 0.184 | 0.010 | 9.821 | 485 | 2259 | spam |
3 | 4 | 0.00 | 0.00 | 0.00 | 0.0 | 0.63 | 0.00 | 0.31 | 0.63 | 0.31 | ... | 0.00 | 0.137 | 0.0 | 0.137 | 0.000 | 0.000 | 3.537 | 40 | 191 | spam |
4 | 5 | 0.00 | 0.00 | 0.00 | 0.0 | 0.63 | 0.00 | 0.31 | 0.63 | 0.31 | ... | 0.00 | 0.135 | 0.0 | 0.135 | 0.000 | 0.000 | 3.537 | 40 | 191 | spam |
5 rows × 59 columns
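A possible starting point for this task, mirroring the approach of Section 1 (a sketch, not a full solution): standardize the numeric features, run KMeans with \(K=2\), and cross-tabulate the clusters against the type column.

# Sketch: K = 2 clustering of the standardized spam features
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

X = data.drop(columns=["Id", "type"])   # keep only the numeric features
X_scaled = StandardScaler().fit_transform(X)

km = KMeans(n_clusters=2, n_init=10, random_state=42).fit(X_scaled)

# Compare the two clusters with the spam / nonspam label
print(pd.crosstab(km.labels_, data["type"], rownames=["cluster"]))

From there, the other algorithms and the elbow/Silhouette analysis used for the satellite images can be applied in the same way.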
References
\(^{\text{📚}}\) Linder, T. (2002).
\(^{\text{📚}}\) Luxburg (2007).
\(^{\text{📚}}\) Satellite Images.