Collection of training data for remote sensing model building

Tutorial: EON Summer School 2023

Author

Paul Magdon, University of Applied Sciences and Arts (HAWK)

Published

August 29, 2023

#install.packages("devtools")
devtools::install_github("bleutner/RStoolbox")
Skipping install of 'RStoolbox' from a github remote, the SHA1 (ec6cac23) has not changed since last install.
  Use `force = TRUE` to force installation
library(sf)
Linking to GEOS 3.11.2, GDAL 3.6.2, PROJ 9.2.0; sf_use_s2() is TRUE
library(RStoolbox)
The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
which was just loaded, will retire in October 2023.
Please refer to R-spatial evolution reports for details, especially
https://r-spatial.org/r/2023/05/15/evolution4.html.
It may be desirable to make the sf package available;
package maintainers should consider adding sf to Suggests:.
The sp package is now running under evolution status 2
     (status 2 uses the sf package in place of rgdal)
library(raster)
Lade nötiges Paket: sp
library(knitr)
library(ggplot2)
library(rmarkdown)
library(patchwork)

Attache Paket: 'patchwork'
Das folgende Objekt ist maskiert 'package:raster':

    area
library(mapview)
library(kableExtra)
library(rmarkdown)
library("rprojroot")

Data sets

In this tutorial we will work with a Sentinel-2 scene from 18/06/2022 from the National Park, Harz. We will also use the boundary of the National Park to define our study area. Before we can start you may download the S2 Scene from the following link: S2-download. Place this file into the data folder of this tutorial.

# create a string containing the current working directory
wd=paste0(find_rstudio_root_file(),"/paul_tutorial/data/")

#Import the boundary of the n
np_boundary = st_transform(st_read(paste0(wd,"nlp-harz_aussengrenze.gpkg")),25832)
Reading layer `nlp-harz_aussengrenze' from data source 
  `C:\Users\49160\Documents\courses\EON\EON2023\paul_tutorial\data\nlp-harz_aussengrenze.gpkg' 
  using driver `GPKG'
Simple feature collection with 1 feature and 3 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 591196.6 ymin: 5725081 xmax: 619212.6 ymax: 5751232
Projected CRS: WGS 84 / UTM zone 32N
s2  <- raster::brick(paste0(wd,"S2B_MSIL2A_20220618T102559_N0400_R10_resampled_harz_np.tif"))

names(s2)<-c('blue','green','red','vnir1','vnir2','vnir3','nir1','nir2','swir')

s2 <-raster::mask(s2,np_boundary)

Anaylsing the spectral variablity within the study area

If we have no access to prior information on our target variable in the study area we can use the spectral variability as a proxy for the variability of the target variable. By using the spectral variability as a sampling criterion we also ensure, that we cover the spectral range with our sampling.

Dimension reduction (PCA)

In a fist step we reduce the dimensions of the 9 Sentinel-2 bands while maintaining most of the information, using a principal component analysis (PCA).

#PCA
pca<-RStoolbox::rasterPCA(s2,nSamples = 5000, spca=TRUE )

ggRGB(pca$map,1,2,3, stretch="lin", q=0)

From the output of the PCA we see that we can capture 92% of the variability with the first two components. Thus we will only use the PC1 and PC2 for the subsequent analysis.

Unsupervised clustering

In the next step we run an unsupervised classification of the PC1 and PC2 to get a clustered map. For the unsupervised classification we need to take a decision on the number of classes/clusters to be created. Here we will take n=5 classes. However, depending on the target variable this value need to be adjusted.

set.seed(2222)
cluster <- unsuperClass(pca$map[[c('PC1','PC2')]], nSamples = 100, nClasses = 5, nStarts = 5)


## Plots
colors <- rainbow(5)
plot(cluster$map, col = colors, legend = TRUE, axes = TRUE, box =TRUE)

The map shows a clear spatial patterns related to the elevation, tree species and vitality status of the Nationalpark forests.

Create a stratified sample

In the next step we take a stratified random sample with \(n=10\) points from each of the 5 spectral classes.

samples <- sampleStratified(cluster$map,size = 10,na.rm=T,xy=T,sp=T)
samples$class <- as.factor(samples$class)
my.palette <- rainbow(5)
point.size <- 0.5

np.layer <- list("sp.polygons", as_Spatial(np_boundary))


spplot(samples,'class', col.regions = my.palette, sp.layout = np.layer,
    cex = point.size, main = "Stratified random sample for training")

We can now print the sample plot list as following:

kable(samples@data[c('x','y','class')], caption='Training plot list') %>%
  kable_styling(fixed_thead = T) %>% scroll_box(height = "400px")
Training plot list
x y class
611335 5750495 1
591385 5726975 1
611115 5748515 1
594375 5727165 1
606975 5729505 1
593825 5725735 1
607275 5726725 1
592305 5728525 1
611725 5749605 1
594425 5726745 1
604685 5737625 2
608785 5731095 2
605745 5739955 2
616025 5740945 2
595955 5728155 2
618615 5736595 2
609835 5734025 2
608895 5735735 2
606685 5739485 2
607075 5731635 2
613995 5742695 3
608905 5740045 3
608725 5739125 3
615335 5738535 3
608635 5741785 3
612345 5744955 3
607445 5740635 3
608705 5732285 3
611185 5737695 3
617605 5738225 3
614425 5747095 4
611505 5740905 4
611595 5737865 4
601625 5733415 4
598565 5734455 4
612825 5736295 4
604665 5735905 4
607065 5740135 4
601365 5734725 4
603315 5732365 4
607675 5738015 5
614105 5737465 5
596355 5730455 5
595455 5730655 5
601505 5736215 5
615025 5740415 5
610685 5744915 5
610185 5741695 5
613605 5744835 5
612755 5746185 5

Implement a plot design

# Create a training data set by extracting the mean value of all pixels touching
# a buffered area with 13m around the plot center
train<-raster::extract(s2,samples,sp=T,buffer=13,fun='mean')
mapview(train, zcol="class",
        map.types = c("Esri.WorldShadedRelief", "OpenStreetMap.DE"))+
  mapview(np_boundary,alpha.regions = 0.2, aplha = 1)

Compare the pixel value range between the sample and the image

image.sample<-raster::sampleRandom(s2,size=100000)
image.sample<- as.data.frame(image.sample)
image.sample$group<-'image'

train.df<- train@data[,names(s2)]
train.df$group<-'train'

df <- rbind(image.sample,train.df)

blue <-ggplot(df, aes(blue,fill=group)) + theme_classic()+
        geom_histogram(
        aes(y=after_stat(density)),alpha=0.2, color='gray80',
        position='identity',bins=30)

green <-ggplot(df, aes(green,fill=group)) + theme_classic()+
        geom_histogram(
        aes(y=after_stat(density)),alpha=0.2, color='gray80',
        position='identity',bins=30)

nir1<-ggplot(df, aes(nir1,fill=group)) + theme_classic()+
      geom_histogram(
      aes(y=after_stat(density)),alpha=0.2, color='gray80',
      position='identity',bins=30)

swir<-ggplot(df, aes(swir,fill=group)) + theme_classic()+
      geom_histogram(
      aes(y=after_stat(density)),alpha=0.2, color='gray80',
      position='identity',bins=30)

blue+green+nir1+swir+plot_layout(ncol=2)