Random Forest Model for Crop Type and Land Classification
Using a RandomForest classifier for crop type mapping with data from Google Earth Engine.
• Drew Bollinger, Zhuang-Fang Yi, Alex Mandel • 13 min read
- Setup Notebook
- Connect Google Drive
- Label data preparation
- Reading imagery from Google Earth Engine (GEE)
- Model training
- Using the Model
This notebook teaches you how to read satellite imagery (Sentinel-2) from Google Earth Engine and use it for crop type mapping with a RandomForest classifier.
We will use data created by SERVIR East Africa, RCMRD, and FEWSNET to demonstrate how to train a RandomForest classifier over Trans Nzoia County, Kenya.
# Requirements, will skip if already installed
%pip install geopandas rasterio rasterstats shapely
%pip install folium earthengine-api
%pip install scikit-learn
%pip install treeinterpreter
from os import path as op
import pickle
import geopandas as gpd
import shapely as shp
import matplotlib.pyplot as plt
import numpy as np
import rasterio
from rasterio.features import rasterize
from rasterstats.io import bounds_window
import rasterstats
import folium
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from treeinterpreter import treeinterpreter as ti
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)
Mounted at /content/drive/
# your root directory for outputs is set to your google drive
# you should create a sub-folder called data under your 'Colab Notebooks'.
# An example is in https://developmentseed.org/sat-ml-training/GettingStarted#Explore-your-drive
my_root_dir = "/content/drive/My Drive/Colab Notebooks/data"
# read in the training data polygons, created as GeoJSON, from a shared directory
training_data = '/content/drive/Shared drives/servir-sat-ml/data/training_data.geojson'
training_vectors = gpd.read_file(training_data)
# make a bounding box and centroid for mapping
bbox = training_vectors.total_bounds
center = shp.geometry.box(bbox[0], bbox[1], bbox[2], bbox[3]).centroid
# show the first 5 rows
training_vectors.head()
| | name | description | geometry |
|---|---|---|---|
| 0 | Shadow | None | MULTIPOLYGON (((34.83383 1.18204, 34.83577 1.1... |
| 1 | Forestland | None | MULTIPOLYGON (((35.30961 1.01328, 35.30964 1.0... |
| 2 | Maize | early reproductive | MULTIPOLYGON (((34.90904 1.09515, 34.90907 1.0... |
| 3 | Sugarcane | no change..maize farm on the right and far lef... | MULTIPOLYGON (((34.90750 1.08934, 34.90753 1.0... |
| 4 | Maize | reproductive good crop | MULTIPOLYGON (((34.87144 0.82953, 34.87147 0.8... |
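Before extracting pixel values it is worth checking how many polygons each class has, since the classes are not balanced; a quick check (sketch):

# number of training polygons per class
training_vectors['name'].value_counts()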
# If not on Colab you'll need to install the earth-engine Python API
#!pip install earthengine-api #earth-engine Python API
# Authenticate to your GEE account.
!earthengine authenticate
# Earth Engine Python API
import ee
ee.Initialize()
We will now search for Sentinel 2 imagery, a multispectral satellite with ~10m resolution and repeat coverage every 5 days.
Filters will include selecting bands, a date range, and only imagery within a defined Area of Interest (AOI).
# From GEE
#training_vectors.total_bounds.tolist()
aoi = ee.Geometry.Rectangle(training_vectors.total_bounds.tolist())
band_sel = ('B2', 'B3', 'B4', 'B8', 'B11', 'B12')
sentinel_scenes = ee.ImageCollection("COPERNICUS/S2")\
.filterBounds(aoi)\
.filterDate('2019-05-02', '2019-05-03')\
.select(band_sel)
scenes = sentinel_scenes.getInfo()
[print(scene['id']) for scene in scenes["features"]]
sentinel_mosaic = sentinel_scenes.mean().rename(band_sel)
sentinel_mosaic.getInfo()
COPERNICUS/S2/20190502T074621_20190502T080204_T36NXF
COPERNICUS/S2/20190502T074621_20190502T080204_T36NXG
COPERNICUS/S2/20190502T074621_20190502T080204_T36NYF
COPERNICUS/S2/20190502T074621_20190502T080204_T36NYG
{'bands': [{'crs': 'EPSG:4326', 'crs_transform': [1, 0, 0, 0, 1, 0], 'data_type': {'max': 65535, 'min': 0, 'precision': 'double', 'type': 'PixelType'}, 'id': 'B2'}, {'crs': 'EPSG:4326', 'crs_transform': [1, 0, 0, 0, 1, 0], 'data_type': {'max': 65535, 'min': 0, 'precision': 'double', 'type': 'PixelType'}, 'id': 'B3'}, {'crs': 'EPSG:4326', 'crs_transform': [1, 0, 0, 0, 1, 0], 'data_type': {'max': 65535, 'min': 0, 'precision': 'double', 'type': 'PixelType'}, 'id': 'B4'}, {'crs': 'EPSG:4326', 'crs_transform': [1, 0, 0, 0, 1, 0], 'data_type': {'max': 65535, 'min': 0, 'precision': 'double', 'type': 'PixelType'}, 'id': 'B8'}, {'crs': 'EPSG:4326', 'crs_transform': [1, 0, 0, 0, 1, 0], 'data_type': {'max': 65535, 'min': 0, 'precision': 'double', 'type': 'PixelType'}, 'id': 'B11'}, {'crs': 'EPSG:4326', 'crs_transform': [1, 0, 0, 0, 1, 0], 'data_type': {'max': 65535, 'min': 0, 'precision': 'double', 'type': 'PixelType'}, 'id': 'B12'}], 'type': 'Image'}
# To make a map we first need some helper functions
# Define the URL format used for Earth Engine generated map tiles.
EE_TILES = 'https://earthengine.googleapis.com/map/{mapid}/{{z}}/{{x}}/{{y}}?token={token}'
#@title Mapdisplay: Display GEE objects using folium.
def Mapdisplay(center, dicc, Tiles="OpenStreetMap", zoom_start=10):
    '''
    :param center: Center of the map (latitude and longitude).
    :param dicc: Dictionary of Earth Engine geometries or tiles.
    :param Tiles: Basemap name, e.g. OpenStreetMap, Stamen Terrain, Stamen Toner, CartoDB positron.
    :param zoom_start: Initial zoom level for the map.
    :return: A folium.Map object.
    '''
    mapViz = folium.Map(location=center, tiles=Tiles, zoom_start=zoom_start)
    for k, v in dicc.items():
        if gpd.geodataframe.GeoDataFrame == type(v):
            folium.GeoJson(
                data=v,
                name=k
            ).add_to(mapViz)
        elif ee.image.Image in [type(x) for x in v.values()]:
            folium.TileLayer(
                tiles=v["tile_fetcher"].url_format,
                attr='Google Earth Engine',
                overlay=True,
                name=k
            ).add_to(mapViz)
        else:
            folium.GeoJson(
                data=v,
                name=k
            ).add_to(mapViz)
    mapViz.add_child(folium.LayerControl())
    return mapViz
s2_vis_params = {
'bands': ['B4', 'B3', 'B2'],
'min': 0,
'max': 3000,
}
Mapdisplay(center=[center.y, center.x],
dicc={'S2':sentinel_mosaic.getMapId(s2_vis_params),
'TrainingData':training_vectors},
zoom_start=12)
# We will save it to Google Drive for later reuse
raster_name = op.join(my_root_dir,'sentinel_mosaic-Trans_Nzoia')
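The export call itself isn't shown in this notebook; below is a minimal sketch of what it could look like with ee.batch.Export.image.toDrive. The folder, file name prefix, and scale are assumptions, so adjust them to your own Drive layout.

# Sketch of the Drive export (parameter values here are assumptions, not the original settings)
task = ee.batch.Export.image.toDrive(
    image=sentinel_mosaic,
    description='sentinel_mosaic-Trans_Nzoia',
    folder='data',                               # Drive sub-folder (assumed)
    fileNamePrefix='sentinel_mosaic-Trans_Nzoia',
    region=aoi,
    scale=10,                                    # ~10 m Sentinel-2 resolution (assumed)
    maxPixels=1e13)
task.start()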
# If you want to keep track of the export you can run this code
# However, if you run this, you will need to wait for it to finish before running additional code
import time
while task.active():
    print('Polling for task (id: {}).'.format(task.id))
    time.sleep(15)
# Reference the raster on disk.
raster_path = op.join(my_root_dir, raster_name)
raster_file = ".".join([raster_path, "tif"])
# Alternate reference already prepared file on Google Drive, uncomment next line to use
#raster_file = '/content/drive/Shared drives/servir-sat-ml/data/Trans_nzoia_2019_05-02.tif'
print(raster_file)
/content/drive/My Drive/Colab Notebooks/data/sentinel_mosaic-Trans_Nzoia.tif
Now we're going to work on training a model to identify classes of land use based on the training data and the satellite imagery.
First we will need to do some preparation to organize the training data into the correct Python types, and to extract sample pixels from the intersecting imagery.
# find all unique values of training data names to use as classes
classes = np.unique(training_vectors.name)
classes
array(['Built', 'Cloud', 'Fallow', 'Forestland', 'Grassland', 'Maize', 'Shadow', 'Sugarcane', 'Sunflower', 'Waterbody'], dtype=object)
# create a dictionary to convert class names into integers for modeling
class_dict = dict(zip(classes, range(len(classes))))
class_dict
{'Built': 0, 'Cloud': 1, 'Fallow': 2, 'Forestland': 3, 'Grassland': 4, 'Maize': 5, 'Shadow': 6, 'Sugarcane': 7, 'Sunflower': 8, 'Waterbody': 9}
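It can also be handy to keep the inverse mapping, so integer predictions can be turned back into class names later on; a small sketch:

# inverse lookup: integer class value -> class name
inv_class_dict = {v: k for k, v in class_dict.items()}
inv_class_dict[5]  # 'Maize'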
This section loops through the training classes and their polygons, reading the imagery raster and extracting the pixel values under each polygon.
# raster information
##If you want to read the data directly from the shared folder, uncomment the following line.
# raster_file = '/content/drive/Shared drives/servir-sat-ml/data/Trans_nzoia_2019_05-02.tif'
# a custom function for getting each value from the raster
def all_values(x):
    return x
# this larger cell reads data from a raster file for each training vector
X_raw = []
y_raw = []
with rasterio.open(raster_file, 'r') as src:
    for (label, geom) in zip(training_vectors.name, training_vectors.geometry):
        # read the raster data matching the geometry bounds
        window = bounds_window(geom.bounds, src.transform)
        # store our window information
        window_affine = src.window_transform(window)
        fsrc = src.read(window=window)
        # rasterize the geometry into the larger shape and affine
        mask = rasterize(
            [(geom, 1)],
            out_shape=fsrc.shape[1:],
            transform=window_affine,
            fill=0,
            dtype='uint8',
            all_touched=True
        ).astype(bool)
        # for each label pixel (places where the mask is true)
        label_pixels = np.argwhere(mask)
        for (row, col) in label_pixels:
            # add a pixel of data to X
            data = fsrc[:, row, col]
            one_x = np.nan_to_num(data, nan=1e-3)
            X_raw.append(one_x)
            # add the label to y
            y_raw.append(class_dict[label])
# convert the training data lists into the appropriate numpy array shape and format for scikit-learn
X = np.array(X_raw)
y = np.array(y_raw)
(X.shape, y.shape)
((160461, 6), (160461,))
In addition to the raw pixel values we will calculate a couple of indices that help with some classifications.
- Normalized Difference Vegetation Index (NDVI) - great for identifying photosynthesizing plants.
- Normalized Difference Water Index (NDWI) - great for identifying open water (when there isn't a lot of glare)
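Both are normalized differences of two bands: NDVI = (NIR − Red) / (NIR + Red) and NDWI = (Green − NIR) / (Green + NIR). With the band order used here (B2, B3, B4, B8, B11, B12), Green is index 1, Red is index 2, and NIR is index 3, which is why the calls below are band_index(X, 3, 2) for NDVI and band_index(X, 1, 3) for NDWI.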
# helper function for calculating ND*I indices (bands in the final dimension)
def band_index(arr, a, b):
    return np.expand_dims((arr[..., a] - arr[..., b]) / (arr[..., a] + arr[..., b]), axis=1)
ndvi = band_index(X, 3, 2)
ndwi = band_index(X, 1, 3)
X = np.concatenate([X, ndvi, ndwi], axis=1)
X.shape
(160461, 8)
Now we're going to split off 20% of the data to reserve for testing the quality of the trained model.
# split the data into test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Since we don't have the same amount of training data for each class, we're going to calculate the relative quantities and tell the model so it can adjust to reduce bias.
# calculate class weights to allow for training on imbalanced training samples
labels, counts = np.unique(y_train, return_counts=True)
class_weight_dict = dict(zip(labels, 1 / counts))
class_weight_dict
{0: 0.00046882325363338024, 1: 0.001597444089456869, 2: 0.0004928536224741252, 3: 1.970093973482535e-05, 4: 0.000819000819000819, 5: 1.5704257424187697e-05, 6: 0.0002473410833539451, 7: 0.0002824858757062147, 8: 0.05263157894736842, 9: 0.003115264797507788}
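As an alternative to building the weight dictionary by hand, scikit-learn can derive inverse-frequency weights itself; a minimal sketch with the same intent (not the exact weights used above):

# Alternative: let scikit-learn compute weights inversely proportional to class frequency
clf_balanced = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    max_depth=6,
    n_jobs=-1,
    random_state=0)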
# initialize a RandomForestClassifier
clf = RandomForestClassifier(
n_estimators=200,
class_weight=class_weight_dict,
max_depth=6,
n_jobs=-1,
verbose=1,
random_state=0)
# fit the model to the training split
clf.fit(X_train, y_train)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 46 tasks | elapsed: 6.9s
[Parallel(n_jobs=-1)]: Done 196 tasks | elapsed: 27.7s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 28.3s finished
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight={0: 0.00046882325363338024, 1: 0.001597444089456869, 2: 0.0004928536224741252, 3: 1.970093973482535e-05, 4: 0.000819000819000819, 5: 1.5704257424187697e-05, 6: 0.0002473410833539451, 7: 0.0002824858757062147, 8: 0.05263157894736842, 9: 0.003115264797507788}, criterion='gini', max_depth=6, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1, oob_score=False, random_state=0, verbose=1, warm_start=False)
# predict on X_test to evaluate the model
preds = clf.predict(X_test)
cm = confusion_matrix(y_test, preds, labels=labels)
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 46 tasks | elapsed: 0.2s
[Parallel(n_jobs=2)]: Done 196 tasks | elapsed: 0.9s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed: 0.9s finished
# (optional) save the trained model as python pickle file
model_name = op.join(my_root_dir,'random_forest.sav')
with open(model_name, 'wb') as modelfile:
    pickle.dump(clf, modelfile)
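When you come back to this notebook later, the saved model can be loaded straight back from Drive instead of retraining; a minimal sketch:

# Reload the trained model from Google Drive
with open(model_name, 'rb') as modelfile:
    clf = pickle.load(modelfile)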
A confusion matrix compares the true class from the test data with the class the model predicted. Since we normalize it below, values range from 0 to 1. The diagonal from top left to bottom right compares each class with itself and should be high; off-diagonal values represent misclassifications and should be low.
# plot the confusion matrix
%matplotlib inline
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(xticks=np.arange(cm.shape[1]),
yticks=np.arange(cm.shape[0]),
# ... and label them with the respective list entries
xticklabels=classes, yticklabels=classes,
title='Normalized Confusion Matrix',
ylabel='True label',
xlabel='Predicted label')
# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
# Loop over data dimensions and create text annotations.
fmt = '.2f'
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, format(cm[i, j], fmt),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")
fig.tight_layout()
You can see the most often confused classes are Shadows with Clouds, Maize with Shadows, and Maize with Sugarcane.
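For a numeric summary to go with the plot, scikit-learn's classification_report gives per-class precision, recall, and F1 on the same held-out predictions; a quick sketch:

from sklearn.metrics import classification_report

# per-class precision, recall, and F1 on the test set
print(classification_report(y_test, preds, target_names=classes))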
# predict again with the tree interpreter to see how much each band contributes to the classification
sample = 100
prediction, bias, contributions = ti.predict(clf, X_test[:sample])
c = np.sum(contributions, axis=0)
# plot the contributions
band_names = ['Blue', 'Green', 'Red', 'NIR', 'SWIR1', 'SWIR2', 'NDVI', 'NDWI']
gdf = gpd.GeoDataFrame(c, columns=classes, index=band_names)
gdf.style.background_gradient(cmap='viridis')
| | Built | Cloud | Fallow | Forestland | Grassland | Maize | Shadow | Sugarcane | Sunflower | Waterbody |
|---|---|---|---|---|---|---|---|---|---|---|
| Blue | -0.702971 | -0.450047 | -2.665426 | 5.865529 | -0.405301 | 0.415711 | -0.113953 | -0.486660 | -0.216748 | -1.240135 |
| Green | -1.088079 | 0.001056 | -1.594761 | 2.925666 | -0.276179 | 1.407419 | 0.533859 | -0.236484 | -0.781711 | -0.890786 |
| Red | -1.856321 | -0.581717 | 0.275449 | 3.093265 | 0.144207 | 0.698167 | -0.238189 | -0.635007 | -0.700595 | -0.199260 |
| NIR | 0.101407 | -0.275263 | -3.239440 | 2.886731 | -0.747007 | 1.295758 | 0.706776 | 0.528081 | -1.053634 | -0.203409 |
| SWIR1 | 0.144890 | 0.060224 | -0.970734 | 2.995083 | -0.630481 | 0.885438 | 0.517873 | 0.340152 | -0.945101 | -2.397344 |
| SWIR2 | -0.688075 | -0.459099 | 0.721128 | 2.733324 | -0.166478 | 0.222193 | -0.259353 | -0.388720 | -0.971504 | -0.743416 |
| NDVI | -1.002602 | -0.195639 | 2.239791 | 3.626398 | -0.968151 | 1.197682 | -0.272888 | -0.399344 | -2.373592 | -1.851654 |
| NDWI | 0.098464 | 0.039940 | 1.066409 | 2.931208 | -1.106715 | 0.695973 | 0.550015 | -0.465344 | -1.953002 | -1.856948 |
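The fitted forest also exposes aggregate feature importances, which give a quicker (if less detailed) view of which bands drive the model; a minimal check:

# overall (impurity-based) feature importances from the fitted forest
for band, importance in zip(band_names, clf.feature_importances_):
    print(f"{band}: {importance:.3f}")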
### Generate predictions over the full image
Now that we've trained and tested the model on the existing data, we can use the trained RandomForestClassifier clf over the whole satellite image, which covers a much larger area than the training polygons. We split the job into small tiles to limit memory use and potentially run the classification in parallel.
# in this case, we predict over the entire input image
# (only small portions were used for training)
new_image = raster_file
# specify the output
output_image = op.join(my_root_dir, "classification.tif")
with rasterio.open(new_image, 'r') as src:
    profile = src.profile
    profile.update(
        dtype=rasterio.uint8,
        count=1,
    )
    with rasterio.open(output_image, 'w', **profile) as dst:
        # perform prediction on each small image patch to minimize required memory
        patch_size = 500
        for i in range((src.shape[0] // patch_size) + 1):
            for j in range((src.shape[1] // patch_size) + 1):
                # define the pixels to read (and write) with rasterio windows reading
                window = rasterio.windows.Window(
                    j * patch_size,
                    i * patch_size,
                    # don't read past the image bounds
                    min(patch_size, src.shape[1] - j * patch_size),
                    min(patch_size, src.shape[0] - i * patch_size))
                # read the image into the proper format
                data = src.read(window=window)
                # add the NDVI and NDWI indices to match the training features
                img_swp = np.moveaxis(data, 0, 2)
                img_flat = img_swp.reshape(-1, img_swp.shape[-1])
                img_ndvi = band_index(img_flat, 3, 2)
                img_ndwi = band_index(img_flat, 1, 3)
                img_w_ind = np.concatenate([img_flat, img_ndvi, img_ndwi], axis=1)
                # remove no data values, store the indices for later use
                m = np.ma.masked_invalid(img_w_ind)
                to_predict = img_w_ind[~m.mask].reshape(-1, img_w_ind.shape[-1])
                # skip empty inputs
                if not len(to_predict):
                    continue
                # predict
                img_preds = clf.predict(to_predict)
                # add the prediction back to the valid pixels (using only the first band of the mask to decide on validity)
                # makes the assumption that all bands have identical no-data value arrangements
                output = np.zeros(img_flat.shape[0])
                output[~m.mask[:, 0]] = img_preds.flatten()
                # resize to the original image dimensions
                output = output.reshape(*img_swp.shape[:-1])
                # create our final mask
                mask = (~m.mask[:, 0]).reshape(*img_swp.shape[:-1])
                # write to the final files
                dst.write(output.astype(rasterio.uint8), 1, window=window)
                dst.write_mask(mask, window=window)
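With the classification written to disk, a quick sanity check is to count how many pixels were assigned to each class; a rough sketch (note that masked/no-data pixels also show up as 0 here, mixed in with the 'Built' class):

# count pixels per predicted class value in the output raster
with rasterio.open(output_image, 'r') as classified:
    predicted = classified.read(1)
values, counts = np.unique(predicted, return_counts=True)
inv_class_dict = {v: k for k, v in class_dict.items()}
for value, count in zip(values, counts):
    print(inv_class_dict.get(value, value), count)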
### Visualize the results
import matplotlib.pyplot as plt
from rasterio.plot import show
%matplotlib inline
# Load the classification result
# (here we point to a pre-computed result from the shared drive; you can instead keep the output_image you created above)
output_image = '/content/drive/Shared drives/servir-sat-ml/data/RF_classification.tif'
def linear_rescale(image, in_range=(0, 1), out_range=(1, 255)):
    imin, imax = in_range
    omin, omax = out_range
    image = np.clip(image, imin, imax) - imin
    image = image / float(imax - imin)
    return image * (omax - omin) + omin
with rasterio.open(output_image, 'r') as class_raster:
    # show(class_raster)
    classification = class_raster.read()
# Compare side by side
with rasterio.open(raster_file, 'r') as s2_raster:
    # show(s2_raster)
    s2 = s2_raster.read([1, 2, 3])
    for band in range(s2.shape[0]):
        s2[band] = linear_rescale(
            s2[band],
            in_range=(0, 3000),
            out_range=[0, 255]
        )
    s2 = s2.astype(np.uint8)
#print(s2.shape)
fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(10, 4), sharey=True)
show(classification, transform=class_raster.transform, ax=ax1)
show(s2[[2, 1, 0], :, :], transform=s2_raster.transform, adjust='linear', ax=ax2)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3e1795f0f0>
You can also download and view the results in your favorite GIS application. There's a QGIS style file in the Shared drive for visualizing.
Imagery can often appear dark; you can check how well the data distribution spans the possible display values with a histogram. We applied a rescale function (just as we did when visualizing with Google Earth Engine earlier) in order to get a good RGB display of the imagery reflectance values.
This method can also be useful for exploring the distribution of classes in the model result.
# Exploring why the imagery is so dark.
# Looks like it will need manual stretching to make a nice image
rasterio.plot.show_hist(s2,
bins=50,
histtype='stepfilled',
lw=0.0,
stacked=False,
alpha=0.3)
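The same histogram view works on the classification itself, to see how the predicted classes are distributed across the image; a minimal sketch using the classification array read above:

# histogram of predicted class values (roughly one bin per class)
rasterio.plot.show_hist(classification,
                        bins=len(class_dict),
                        histtype='stepfilled',
                        lw=0.0,
                        alpha=0.5)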