import os
import ilwis
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import lightgbm as lgb
import pandas as pd


#use the Python function os.getcwd() to retrieve the current folder used and add a sub-folder containing the sample data
work_dir = os.getcwd() + '/ILWISPy_ImageClassification'

#set the working directory for ILWISPy
ilwis.setWorkingCatalog(work_dir)
print(work_dir)
ilwis.version()

D:\jupyter\notebook_scripts\ilwispy_tutorial\tutorial_release/ILWISPy_ImageClassification

'1.0 build 20230403'


# Load the multi-spectral image 
s2_raster = ilwis.RasterCoverage('s2A_sub.mpl')

#print some metadata
print(s2_raster.size())

Size(2600, 1750, 9)


rcsat = np.fromiter(iter(s2_raster), np.float64, s2_raster.size().linearSize()) 
rcsat = rcsat.reshape((s2_raster.size().zsize, s2_raster.size().ysize, s2_raster.size().xsize))
rcsat.shape

(9, 1750, 2600)


#reshape the order
x_sat = np.moveaxis(rcsat, 0, -1)
x_sat = x_sat.reshape(rcsat.shape[1], rcsat.shape[2], rcsat.shape[0])
x_sat.shape

(1750, 2600, 9)


gt_data = ilwis.RasterCoverage('samples.mpr')
print(gt_data.size())

Size(2600, 1750, 1)


y_data = np.fromiter(iter(gt_data), np.ubyte, gt_data.size().linearSize()) 
y_data = y_data.reshape((gt_data.size().ysize, gt_data.size().xsize))
y_data.shape

(1750, 2600)


# Find how many non-zero entries we have -- i.e. how many training data samples?
n_samples = (y_data > 0).sum()
print('We have {n} samples'.format(n=n_samples))

# What are our classification labels?
labels = np.unique(y_data[y_data > 0])
print('The training data include {n} classes: {classes}'.format(n=labels.size, classes=labels))


X = x_sat[y_data > 0] 
y = y_data[y_data > 0]

print('Our X matrix is sized: {sz}'.format(sz=X.shape))
print('Our y array is sized: {sz}'.format(sz=y.shape))

We have 67062 samples
The training data include 9 classes: [1 2 3 4 5 6 7 8 9]
Our X matrix is sized: (67062, 9)
Our y array is sized: (67062,)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (46943, 9) (46943,)
Test set: (20119, 9) (20119,)


# initialize our model with 500 trees
# calculate the out of bag score
# use multi core processing
rf = RandomForestClassifier(n_estimators=500, oob_score=True, n_jobs=-1)

# Fit our model to training data
rf = rf.fit(X_train, y_train)


print('Our OOB prediction of accuracy is: {oob}%'.format(oob=rf.oob_score_ * 100))

Our OOB prediction of accuracy is: 96.49148967897237%


bands = [1, 2, 3, 4, 5, 6, 7, 8, 9]

for b, imp in zip(bands, rf.feature_importances_):
    print('Band {b} importance: {imp}'.format(b=b, imp=imp))

Band 1 importance: 0.06087870405050715
Band 2 importance: 0.06823245041959516
Band 3 importance: 0.12514457165133938
Band 4 importance: 0.08443655010223311
Band 5 importance: 0.08683527313849859
Band 6 importance: 0.12196750324109212
Band 7 importance: 0.0961785154357118
Band 8 importance: 0.16959718792180534
Band 9 importance: 0.18672924403921742


df_train = pd.DataFrame()
df_train['truth'] = y_train
df_train['predict'] = rf.predict(X_train)

# Show cross-tabulate predictions
print(pd.crosstab(df_train['truth'], df_train['predict'], margins=True))

predict     1     2    3     4     5    6     7     8     9    All
truth                                                             
1        7872     0    0     0     0    0     0     0     0   7872
2           0  6912    0     0     0    0     0     0     0   6912
3           0     0  526     0     0    0     0     0     0    526
4           0     0    0  9698     0    0     0     0     0   9698
5           0     0    0     0  2635    0     0     0     0   2635
6           0     0    0     0     0  636     0     0     0    636
7           0     0    0     0     0    0  7885     0     0   7885
8           0     0    0     0     0    0     0  4876     0   4876
9           0     0    0     0     0    0     0     0  5903   5903
All      7872  6912  526  9698  2635  636  7885  4876  5903  46943


df_test = pd.DataFrame()
df_test['truth'] = y_test
df_test['predict'] = rf.predict(X_test)

# Show cross-tabulate predictions
print(pd.crosstab(df_test['truth'], df_test['predict'], margins=True))

predict     1     2    3     4     5    6     7     8     9    All
truth                                                             
1        3362     0    0    31     0    0     6    20     6   3425
2           2  2898   11     2     0    0     1    15     1   2930
3           0    37  201     0     0    0     0     0     0    238
4          40     0    0  4178     0    0     6     2     0   4226
5           0     0    0     0  1123    0     0     0     0   1123
6           0     1    0     0     0  278     0     0     0    279
7          27     0    0    11     0    0  3354    14     3   3409
8          77     4    0     1     0    0    22  1775   158   2037
9          16     1    1     1     0    0    22   179  2232   2452
All      3524  2941  213  4224  1123  278  3411  2005  2400  20119


rf_pred = rf.predict(X_test)


print(f"Accuracy: {accuracy_score(y_test, rf_pred)*100}")

Accuracy: 96.43123415676723


print(classification_report(y_test, rf_pred))

              precision    recall  f1-score   support

           1       0.95      0.98      0.97      3425
           2       0.99      0.99      0.99      2930
           3       0.94      0.84      0.89       238
           4       0.99      0.99      0.99      4226
           5       1.00      1.00      1.00      1123
           6       1.00      1.00      1.00       279
           7       0.98      0.98      0.98      3409
           8       0.89      0.87      0.88      2037
           9       0.93      0.91      0.92      2452

    accuracy                           0.96     20119
   macro avg       0.96      0.95      0.96     20119
weighted avg       0.96      0.96      0.96     20119


# Take our full image and reshape into long 2d array (nrow * ncol, nband) for classification
new_shape = (x_sat.shape[0] * x_sat.shape[1], x_sat.shape[2])

img_as_array = x_sat[:, :, :9].reshape(new_shape)
print('Reshaped from {o} to {n}'.format(o = x_sat.shape, n = img_as_array.shape))

# Now predict for each pixel
rf_class = rf.predict(img_as_array)

# Reshape our classification map
rf_class = rf_class.reshape(x_sat[:, :, 0].shape)
rf_class.shape

Reshaped from (1750, 2600, 9) to (4550000, 9)

(1750, 2600)


fig = plt.figure(figsize = (20, 20))
plt.imshow(rf_class, interpolation='none', cmap=ListedColormap(['yellow','#acf734', '#2ced09', 'red', 'blue', '#e1e8f2', 'green',  '#45c22f', 'lightgreen']))
loc = np.arange(0.5,9.5,0.9)
labels = np.arange(0,10,1)
cb = plt.colorbar(shrink=0.45)
cb.set_ticks(loc)
cb.set_ticklabels(labels)
plt.axis('off')
plt.show()


x = np.moveaxis(rcsat, 0, -1)
x.shape

(1750, 2600, 9)


X_data = x.reshape(-1, 9)

scaler = StandardScaler().fit(X_data)

X_scaled = scaler.transform(X_data)

X_scaled.shape

(4550000, 9)


X_scaled_new = (rcsat.shape[1], rcsat.shape[2], rcsat.shape[0])
X_array = X_scaled.reshape(X_scaled_new)
X_array.shape

(1750, 2600, 9)


# Find how many non-zero entries we have -- i.e. how many training data samples?
n_samples = (y_data > 0).sum()
print('We have {n} samples'.format(n=n_samples))

# What are our classification labels?
labels = np.unique(y_data[y_data > 0])
print('The training data include {n} classes: {classes}'.format(n=labels.size, classes=labels))


X1 = X_array[y_data > 0] 
y1 = y_data[y_data > 0]

print('Our X matrix is sized: {sz}'.format(sz=X1.shape))
print('Our y array is sized: {sz}'.format(sz=y1.shape))

We have 67062 samples
The training data include 9 classes: [1 2 3 4 5 6 7 8 9]
Our X matrix is sized: (67062, 9)
Our y array is sized: (67062,)


#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.3, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (46943, 9) (46943,)
Test set: (20119, 9) (20119,)


Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
ConfustionMx = [];
for n in range(1,Ks):
    
    #Train Model and Predict  
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    yhat=neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)
    std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])

mean_acc

array([0.96302003, 0.95695611, 0.96560465, 0.96227447, 0.9650082 ,
       0.96316914, 0.96401412, 0.96167802, 0.96232417])


plt.plot(range(1,Ks),mean_acc,'r')
plt.fill_between(range(1,Ks),mean_acc - 1 * std_acc,mean_acc + 1 * std_acc, alpha=0.10)
plt.legend(('Accuracy ', '+/- 1xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Neighbors (k)')
plt.tight_layout()
plt.show()


print( "The best accuracy was with", mean_acc.max()*100, "with k=", mean_acc.argmax()+1)

The best accuracy was with 96.56046523187037 with k= 3


k = 3 #number of neighbours defined having highest accuracy

knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1) # n_jobs set so it supports multi core processing
knn.fit(X_train, y_train)

knn_pred = knn.predict(X_test)


print(f"Accuracy: {accuracy_score(y_test, knn_pred)*100}")

Accuracy: 96.56046523187037


print(classification_report(y_test, knn_pred))

              precision    recall  f1-score   support

           1       0.95      0.99      0.97      3425
           2       0.98      0.99      0.99      2930
           3       0.90      0.82      0.86       238
           4       0.99      0.99      0.99      4226
           5       1.00      1.00      1.00      1123
           6       1.00      1.00      1.00       279
           7       0.99      0.99      0.99      3409
           8       0.88      0.88      0.88      2037
           9       0.94      0.91      0.92      2452

    accuracy                           0.97     20119
   macro avg       0.96      0.95      0.96     20119
weighted avg       0.97      0.97      0.97     20119


# Classification using K-NNC
knn_class = knn.predict(X_scaled).reshape(1750,2600)
knn_class.shape

(1750, 2600)


# Plot of classfied K-NNC map
plt.figure()
fig = plt.figure(figsize = (20, 20))
plt.imshow(knn_class, interpolation='none', cmap=ListedColormap(['#F2EB8A', '#74A0FF', '#9C6AF9', '#FF6464', '#0000FF', '#4068E1', '#549484', '#7CFC00', '#407165']))
cb = plt.colorbar(shrink=0.45)
loc = np.arange(0.5,9.5,0.9)
labels = np.arange(0,10,1)
cb.set_ticks(loc)
cb.set_ticklabels(labels)
plt.axis('off')
plt.show()

<Figure size 432x288 with 0 Axes>


from sklearn.svm import SVC

svm = SVC(C=3.0, kernel='linear', degree=6, cache_size=1024)
svm.fit(X_train, y_train)

SVC(C=3.0, cache_size=1024, degree=6, kernel='linear')

SVC(C=3.0, cache_size=1024, degree=6, kernel='linear')


svm_pred = svm.predict(X_test)


print(f"Accuracy: {accuracy_score(y_test, svm_pred)*100}")

Accuracy: 95.31288831452855


print(classification_report(y_test, svm_pred))

              precision    recall  f1-score   support

           1       0.93      0.97      0.95      3425
           2       0.98      0.99      0.98      2930
           3       0.95      0.77      0.85       238
           4       0.99      0.98      0.98      4226
           5       1.00      1.00      1.00      1123
           6       1.00      1.00      1.00       279
           7       0.97      0.98      0.98      3409
           8       0.85      0.83      0.84      2037
           9       0.92      0.90      0.91      2452

    accuracy                           0.95     20119
   macro avg       0.95      0.94      0.94     20119
weighted avg       0.95      0.95      0.95     20119


# Visualize Classification Map of SVM
svm_class = svm.predict(X_scaled).reshape(1750, 2600)


# Plot of classfied SVM map
plt.figure()
fig = plt.figure(figsize = (20, 20))
plt.imshow(svm_class, interpolation='none', cmap=ListedColormap(['#F2EB8A', '#74A0FF', '#9C6AF9', '#FF6464', '#0000FF', '#4068E1', '#549484', '#7CFC00', '#407165']))
cb = plt.colorbar(shrink=0.45)
loc = np.arange(0.5,9.5,0.9)
labels = np.arange(0,10,1)
cb.set_ticks(loc)
cb.set_ticklabels(labels)
plt.axis('off')
plt.show()

<Figure size 432x288 with 0 Axes>


d_train = lgb.Dataset(X_train, label=y_train)

params={}
params['learning_rate']=0.03
params['boosting_type']='gbdt' #GradientBoostingDecisionTree
params['objective']='multiclass' #Multi-class target feature
params['metric']='multi_logloss' #metric for multi-class
params['max_depth']=20
params['num_leaves']=11
params['num_class']=11 #no.of unique values in the target class not inclusive of the end value

clf = lgb.train(params, d_train, 100)

[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 46943, number of used features: 9
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -1.785622
[LightGBM] [Info] Start training from score -1.915675
[LightGBM] [Info] Start training from score -4.491388
[LightGBM] [Info] Start training from score -1.577014
[LightGBM] [Info] Start training from score -2.880051
[LightGBM] [Info] Start training from score -4.301491
[LightGBM] [Info] Start training from score -1.783972
[LightGBM] [Info] Start training from score -2.264609
[LightGBM] [Info] Start training from score -2.073473
[LightGBM] [Info] Start training from score -34.538776


lgb_predictions = clf.predict(X_test)


print(lgb_predictions[1], np.argmax(lgb_predictions[1]))

[4.53103852e-16 7.03841949e-03 5.76394779e-03 7.76375444e-04
 6.47154721e-03 1.57906327e-03 3.98287592e-04 5.29266485e-03
 3.72742754e-01 5.99936940e-01 4.53103852e-16] 9


lgb_pred = np.argmax(lgb_predictions, axis=1)
lgb_pred.shape

(20119,)


print(f"Accuracy: {accuracy_score(y_test, lgb_pred)*100}")

Accuracy: 94.5971469754958


print(classification_report(y_test, lgb_pred))

              precision    recall  f1-score   support

           1       0.91      0.96      0.94      3425
           2       0.98      0.98      0.98      2930
           3       0.94      0.80      0.86       238
           4       0.99      0.98      0.98      4226
           5       1.00      1.00      1.00      1123
           6       1.00      1.00      1.00       279
           7       0.97      0.98      0.98      3409
           8       0.84      0.79      0.81      2037
           9       0.90      0.88      0.89      2452

    accuracy                           0.95     20119
   macro avg       0.95      0.93      0.94     20119
weighted avg       0.95      0.95      0.95     20119


#  Classification using LightGBM
gbm_class = np.argmax(clf.predict(X_scaled), axis=1).reshape((1750, 2600))


# Plot of classfied LightGBM map
plt.figure()
fig = plt.figure(figsize = (20, 20))
plt.imshow(gbm_class, interpolation='none', cmap=ListedColormap(['#F2EB8A', '#74A0FF', '#9C6AF9', '#FF6464', '#0000FF', '#4068E1', '#549484', '#7CFC00', '#407165']))
cb = plt.colorbar(shrink=0.45)
loc = np.arange(0.5,9.5,0.9)
labels = np.arange(0,10,1)
cb.set_ticks(loc)
cb.set_ticklabels(labels)
plt.axis('off')
plt.show()

<Figure size 432x288 with 0 Axes>


# envelope =  min X, max Y, max X, min Y,  here 499980 1600020 609780 1490220 (metric units - meters)
# gridsize =  no columns, no rows
# note name of georeference - here the name of output file created further below is used

grf = ilwis.GeoReference('code=georef:type=corners, csy=epsg:29636, envelope=514780 1599920 566780 1564920, gridsize=2600 1750, cornerofcorners=yes, name=class')
dfNum = ilwis.DataDefinition(ilwis.NumericDomain('code=value'), ilwis.NumericRange(0, 10, 1)) 
rcNew = ilwis.RasterCoverage()
rcNew.setSize(ilwis.Size(2600, 1750))
rcNew.setGeoReference(grf)
rcNew.setDataDef(dfNum)


rcrf = rcNew
print(rf_class.dtype)
rcrf.array2raster(rf_class.astype(np.uint8)) 
print(rcrf.size())
rcrf.store('rf_cl.mpr')
print(rf_class.dtype)

uint8
Size(2600, 1750, 1)
uint8


rcknn = rcNew
print(knn_class.dtype)
rcknn.array2raster(knn_class.astype(np.uint8)) 
print(rcknn.size())
rcknn.store('knn_cl.mpr')
print(knn_class.dtype)

uint8
Size(2600, 1750, 1)
uint8


rcsvm = rcNew
print(svm_class.dtype)
rcsvm.array2raster(svm_class.astype(np.uint8)) 
print(rcsvm.size())
rcsvm.store('svm_cl.mpr')
print(svm_class.dtype)

uint8
Size(2600, 1750, 1)
uint8


rcgbm = rcNew
print(gbm_class.dtype)
rcgbm.array2raster(gbm_class.astype(np.uint8)) 
print(rcgbm.size())
rcgbm.store('gbm_cl.mpr')
print(gbm_class.dtype)

int64
Size(2600, 1750, 1)
int64


#transform to dtype uint8
gbm_classuint8 = np.uint8(gbm_class)
print(gbm_classuint8.dtype)

uint8


# create stack
cl_stack = np.array([rf_class, knn_class, svm_class, gbm_classuint8])

print(cl_stack.shape)
print(cl_stack.dtype)

(4, 1750, 2600)
uint8


# envelope =  min X, max Y, max X, min Y,  here 499980 1600020 609780 1490220 (metric units - meters)
# gridsize =  no columns, no rows
# note name of georeference - here the name of output file created further below is used

grf = ilwis.GeoReference('code=georef:type=corners, csy=epsg:29636, envelope=514780 1599920 566780 1564920, gridsize=2600 1750, cornerofcorners=yes, name=class')
dfNum = ilwis.DataDefinition(ilwis.NumericDomain('code=value'), ilwis.NumericRange(0, 10, 1)) 
rcout = ilwis.RasterCoverage()
rcout.setSize(ilwis.Size(2600, 1750, 4))
rcout.setGeoReference(grf)
rcout.setDataDef(dfNum)


rcout.array2raster(cl_stack) 
print(rcout.size())

Size(2600, 1750, 4)


#rcout.store('cl_stack.mpl')


ilwis.operationMetaData('aggregaterasterstatistics')

'aggregaterasterstatistics(inputraster,statisticalmarker=mean|variance|standarddev|totalsumsquares|skew|kurtosis|max|min|maxindex|minindex|median|sum)'


# calculate median and standard deviation for the classified image stack
rc_median = ilwis.do('aggregaterasterstatistics', rcout, 'median')
rc_std = ilwis.do('aggregaterasterstatistics', rcout, 'standarddev')


#rc_median.store('cl_median.mpr')
#rc_std.store('cl_std.mpr')


# create a plot of the numpy arrays using MatPlotLib - Imshow
# using the Median and standard deviation
median_2np = np.fromiter(iter(rc_median), np.uint8, rc_median.size().linearSize()) 
median_2np = median_2np.reshape((rc_median.size().ysize, rc_median.size().xsize))

std_2np = np.fromiter(iter(rc_std), np.uint8, rc_std.size().linearSize()) 
std_2np = std_2np.reshape((rc_std.size().ysize, rc_std.size().xsize))

fig1 = plt.figure(figsize=(20, 15))
plt.subplot(1, 2, 1)
plt.imshow(median_2np, interpolation='none', cmap=ListedColormap(['#F2EB8A', '#74A0FF', '#9C6AF9', '#FF6464', '#0000FF', '#4068E1', '#549484', '#7CFC00', '#407165']))
plt.title('Median of all classifications')
loc = np.arange(0.5,9.5,0.9)
labels = np.arange(0,10,1)
cb.set_ticks(loc)
cb.set_ticklabels(labels)
plt.axis('on')
plt.colorbar(shrink=0.35)

plt.subplot(1, 2, 2)
plt.imshow(std_2np, interpolation = 'none', cmap = 'jet')
plt.title('Standard deviation')
plt.axis('on')
plt.colorbar(shrink=0.3)

<matplotlib.colorbar.Colorbar at 0x21747cedbe0>

Advanced image classification using ILWISPy in conjuntion with SciKit¶

Import Modules¶

Read Data and get meta data information¶

Ground truth¶

Pairing Y with X¶

Train Test Split¶

The Random Forest classifier¶

Training the Random Forest¶

Classifying the image stack using the RF classifier¶

Scaling¶

K-Nearest Neighbor Classifier (K-NNC)¶

Support Vector Machine (SVM)¶

Light Gradient Boosting Classifier (GBM)¶

Evaluation of your results¶

Final comment:¶