Maxent SDM

SDM start to finish in one RMarkdown file

Set-up

You will need to install maxnet if you have not already. This checks if it is installed.

if(!require(maxnet))
   devtools::install_github("BigelowLab/maxnet")

Load the necessary libraries. If you see errors that a library is not installed, you will need to install with install.packages("packagename").

suppressPackageStartupMessages({
library(maxnet)
library(dplyr)
library(maxnet)
library(sf)
library(stars)
library(geodata)
library(dismo)
library(lubridate)
library(sdmpredictors)
library(ggplot2)
library(cmocean)
library(janitor)
library(DT)
library(here)
library(rnaturalearth)
library(rnaturalearthdata)
library(raster)
library(ggspatial)
library(tidyverse)
library(robis)
})

Tell R that the root should be where this RMarkdown file resides. All our data files will be stored here.

sdm_dir <- "SDM"
here::i_am(paste0(sdm_dir,"/Turtle_maxnet.Rmd"))
here() starts at /Users/eli.holmes/Documents/GitHub/tutorials_marine_sdm

Set up the spatial region

Create a bounding box

We create a bounding box using minimum and maximum coordinate pairs and assign a standared WGS 84 coordinate reference system. This creates a sfs_POLYGON.

extent_polygon <- sf::st_bbox(c(xmin = 41.875, xmax = 65.125, 
                            ymax = -0.125, ymin = 32.125), 
                          crs = sf::st_crs(4326)) %>% 
  sf::st_as_sfc()

Save the bounding box for future use.

fil <- here::here(sdm_dir, "sdm_data", "BoundingBox.shp")
sf::write_sf(extent_polygon, fil)

Get the polygon in text format.

pol_geometry <- sf::st_as_text(extent_polygon[[1]])
pol_geometry
[1] "POLYGON ((41.875 32.125, 65.125 32.125, 65.125 -0.125, 41.875 -0.125, 41.875 32.125))"

Create a world map with our region

This allows us to check our polygon of interest is located in the correct region.

#Getting base map
world <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf")

#Plotting map
world_box <- ggplot() + 
  #Adding base map
  geom_sf(data = world) +
  #Adding bounding box
  geom_sf(data = extent_polygon, color = "red", fill = NA)+
  #Setting theme of plots to not include a grey background
  theme_bw()

world_box

Save the plot.

fil <- here::here(sdm_dir,  "sdm_data", "world_box.rda")
save(world_box, file=fil)

Create a region map

Create a base map of our region and save it.

base_region_map <- ggplot()+
  #Adding base layer (world map)
  geom_sf(data = world, fill = "antiquewhite")+
  #Constraining map to original bounding box
  lims(x = c(st_bbox(extent_polygon)$xmin, sf::st_bbox(extent_polygon)$xmax),
       y = c(sf::st_bbox(extent_polygon)$ymin, sf::st_bbox(extent_polygon)$ymax))
base_region_map

Save it

fil <- here::here(sdm_dir,  "sdm_data", "base_region_map.rda")
save(base_region_map, file=fil)

We will add some more features to our map: colors, scale and compass.

region_map <- base_region_map +
  #Add scale bar on the top right of the plot
  annotation_scale(location = "tr", width_hint = 0.5)+
  #Add north arrow on the top left of plot
  annotation_north_arrow(location = "tl", which_north = "true",
                         #Include small buffer from plot edge
                         pad_x = unit(0.01, "in"), pad_y = unit(0.05, "in"),
                         #Set style of north arrow
                         style = north_arrow_fancy_orienteering) +
  #Changing color, type and size of grid lines
  theme(panel.grid.major = element_line(color = gray(.5), linetype = "dashed", size = 0.5), 
  #Change background of map
  panel.background = element_rect(fill = "aliceblue")) +
  labs(x = "longitude", y = "latitude")
Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.
region_map
Scale on map varies by more than 10%, scale bar may be inaccurate

Save.

fil <- here::here(sdm_dir,  "sdm_data", "region_map.rda")
save(region_map, file=fil)

We add some labels for the countries.

#Extracting labels for countries in base map
world_points <- world %>% 
  sf::st_make_valid(world) %>%
  #Getting centroids for all polygons in the world base map
  sf::st_centroid(geometry) %>% 
  #Getting coordinates for each centroid
  sf::st_coordinates() %>% 
  #Adding centroids to original base map
  dplyr::bind_cols(world)

#Do not use spherical geometry
sf::sf_use_s2(FALSE)

#Adding labels to map
region_map_label <- region_map +
  geom_text(data = world_points, 
            #Point to coordinates and column with country names
            aes(x = X, y = Y, label = name),
            #Changing color and size of labels
            color = "darkblue", size = 3, 
            #Avoid label overlap
            check_overlap = TRUE)
# Save
fil <- here::here(sdm_dir,  "sdm_data",  "region_map_label.rda")
save(region_map_label, file=fil)

#Checking final map
region_map_label

Loading in the saved files

Later when we need the extent polygon, we use

#Loading bounding box for the area of interest
fil <- here::here(sdm_dir, "sdm_data", "BoundingBox.shp")
extent_polygon <- sf::read_sf(fil)

We often will need a sf bbox (bounding box object). To create that use

bbox <- sf::st_bbox(extent_polygon)

We load the maps as

fil <- here::here(sdm_dir, "sdm_data", "region_map_label.rda")
load(fil)

Get occurrence data from robis

pol_geometry is defined above.

pol_geometry
[1] "POLYGON ((41.875 32.125, 65.125 32.125, 65.125 -0.125, 41.875 -0.125, 41.875 32.125))"

Get the data. We use eval=redo so that we do not redownload data if we do not need to.

redo <- FALSE

Set the species we want

spp <- c("Chelonia mydas", "Caretta caretta", "Eretmochelys imbricata", "Lepidochelys olivacea", "Natator depressus", "Dermochelys coriacea")

Download the data.

obs <- robis::occurrence(spp, startdate = as.Date("2000-01-01"), geometry = pol_geometry)

This has many columns that we don’t need. We reduce to fewer columns.

cols.to.use <- c("occurrenceID", "scientificName", 
                 "dateIdentified", "eventDate", 
                 "decimalLatitude", "decimalLongitude", "coordinateUncertaintyInMeters",
                 "individualCount","lifeStage", "sex",
                 "bathymetry",  "shoredistance", "sst", "sss")
obs <- obs[,cols.to.use]

We also add a cleaner date with YYYY-MM-DD format.

obs$date <- as.Date(obs$eventDate)

Save our data.

obs_csv <- here::here(sdm_dir,  "sdm_data", "sdm_data_all.csv")
readr::write_csv(obs, obs_csv)

Clean and prep data

Clean and prepare the data for our model and save to a new file name.

Load data in

# presence data
fil <- here::here(sdm_dir, "sdm_data", "occ_all.csv")
occ_all <- read.csv(fil)

We will call the cleaned data occ.

# subset the occurences to include just those in the water
occ <- occ_all %>% 
  subset(bathymetry > 0 & 
        shoredistance > 0 & 
        coordinateUncertaintyInMeters < 200)

# seeing how often each species occurs
table(occ$scientificName) 

Caretta caretta  Chelonia mydas 
           5141            7060 

After cleaning we discover that we only have loggerhead and green sea turtles. Also there are only juvenile loggerheads and we do not know the life-stage of the green turtles.

table(occ$lifeStage, occ$scientificName, useNA="ifany")
          
           Caretta caretta Chelonia mydas
  Juvenile            5141              0
  <NA>                   0           7060

Select columns and add a common name column.

colnames(occ)
 [1] "occurrenceID"                  "scientificName"               
 [3] "dateIdentified"                "eventDate"                    
 [5] "decimalLatitude"               "decimalLongitude"             
 [7] "coordinateUncertaintyInMeters" "individualCount"              
 [9] "lifeStage"                     "sex"                          
[11] "bathymetry"                    "shoredistance"                
[13] "sst"                           "sss"                          
[15] "date"                         

We want these. The last two are sea surface temperature and salinity.

cols <- c("scientificName", "date", "decimalLatitude", "decimalLongitude", "lifeStage", "bathymetry", "sst", "sss")

Subset the columns.

occ.sub <- occ %>% dplyr::select(all_of(cols))

Change the column names.

colnames(occ.sub) <- c("sci.name", "date", "lat", "lon", "life.stage", "bathy", "SST", "SSS")

Add common.name column.

occ.sub <- occ.sub %>% 
  mutate(common.name = case_when(sci.name == "Caretta caretta" ~ "Loggerhead",
                                 sci.name == "Chelonia mydas" ~ "Green"))

Save the cleaned file

fil <- here::here(sdm_dir,  "sdm_data", "occ_clean.csv")
if (redo) readr::write_csv(occ.sub, fil)

Create background data

We will get random samples from our region.

Get a marine raster layer

We just need one because we use this to sample lat/lons from the marine environment. sdmpredictors will download many files so we need to specify a directory.

# set a default data directory
options(sdmpredictors_datadir = here::here(sdm_dir, "sdm_data"))

# choosing marine
env_datasets <- sdmpredictors::list_datasets(terrestrial = FALSE, marine = TRUE)
env_stack <- sdmpredictors::load_layers("MS_bathy_5m")
env_stack <- env_stack %>% raster::crop(extent_polygon)

Plot to check that the layer looks ok. This is bathymetry.

plot(env_stack)

Look at the raster to get some info on it.

env_stack
class      : RasterBrick 
dimensions : 388, 280, 108640, 1  (nrow, ncol, ncell, nlayers)
resolution : 0.08333333, 0.08333333  (x, y)
extent     : 41.83333, 65.16667, -0.1666667, 32.16667  (xmin, xmax, ymin, ymax)
crs        : +proj=longlat +datum=WGS84 +no_defs 
source     : memory
names      : MS_bathy_5m 
min values :       -5468 
max values :          -1 

Sample points from this

It returns a sf points object.

nsamp <- 1000
absence <- dismo::randomPoints(env_stack[[1]], nsamp)
colnames(absence) <- c("lon", "lat")

Make a plot.

absence_sf <- absence %>% 
    as_tibble() %>% 
    sf::st_as_sf(coords = c(x="lon", y="lat"), crs = 4326)
mapview::mapview(absence_sf, col.regions = "gray")
Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
multiple of vector length (arg 1)

Save the absence locations to a file.

fil <- here::here(sdm_dir,  "sdm_data", "absence.csv")
write.csv(absence, file=fil, row.names = FALSE)

Download sdmpredictors layers

Set datasets to marine.

datasets <- sdmpredictors::list_datasets(terrestrial = FALSE, marine = TRUE)
layers <- list_layers(datasets)
#View(layers) # if you want to view

Choose layers.

layercodes = c("BO_sstmean", "BO_bathymean", "BO22_ph", "BO2_dissoxmean_bdmean", "BO2_salinitymean_ss", "BO2_chlomean_ss", "BO21_nitratemean_ss")

Download layers.

env <- sdmpredictors::load_layers(layercodes, rasterstack = TRUE)
env_crop <- env %>% raster::crop(extent_polygon)

Look at our layers.

plot(env_crop)

Save the raster brick for later reloading.

env.stars <- stars::st_as_stars(env_crop) # convert to stars object
fil <- here::here(sdm_dir,  "sdm_data", "env_stack.tif")
stars::write_stars(env.stars, fil)
#y <- stars::read_stars(fil)
# We need to do this for sampling
env.stars <- terra::split(env.stars)

Environmental predictors for points

We will use the stars package to sample from our raster layers.

Load in our point data as data frames.

# presence data
fil <- here::here(sdm_dir, "sdm_data", "occ_clean.csv")
df.occ <- read.csv(fil) 

# absence data
fil <- here::here(sdm_dir, "sdm_data", "absence.csv")
df.abs <- read.csv(fil)

Convert data frames to sf points objects. This is what stars needs.

df.abs <- na.omit(df.abs) # just in case
sf.abs <- sf::st_as_sf(df.abs, coords = c("lon", "lat"), crs = 4326)
sf.occ <- sf::st_as_sf(df.occ, coords = c("lon", "lat"), crs = 4326)

Get environment values for the absence points. Each row in sf.abs is a row in env.abs.

env.abs <- stars::st_extract(env.stars, sf::st_coordinates(sf.abs)) %>% 
  dplyr::as_tibble() %>% 
  na.omit()

head(env.abs)
# A tibble: 6 × 7
  BO_sstmean BO_bathymean BO22_ph BO2_dissoxmean_bdmean BO2_salinitymean_ss
       <dbl>        <dbl>   <dbl>                 <dbl>               <dbl>
1       27.7        -4108    8.19                161.                  36.0
2       29.6        -4243    8.19                176.                  35.2
3       27.1        -3462    8.13                128.                  36.5
4       26.1          -52    8.14                219.                  38.9
5       26.4        -3037    8.14                135.                  36.1
6       26.3         -856    8.13                  4.43                36.2
# ℹ 2 more variables: BO2_chlomean_ss <dbl>, BO21_nitratemean_ss <dbl>

Get environment values for the occurence points. Each row in sf.occ is a row in env.occ.

env.occ <- stars::st_extract(env.stars, sf::st_coordinates(sf.occ)) %>% 
  dplyr::as_tibble() %>% 
  na.omit()

head(env.occ)
# A tibble: 6 × 7
  BO_sstmean BO_bathymean BO22_ph BO2_dissoxmean_bdmean BO2_salinitymean_ss
       <dbl>        <dbl>   <dbl>                 <dbl>               <dbl>
1       26.4          -59    8.17                  183.                35.6
2       28.6        -3158    8.18                  154.                35.7
3       26.4          -92    8.17                  145.                35.6
4       26.8        -2764    8.15                  130.                36.1
5       27.9           -4    8.13                  198.                38.7
6       27.6           -8    8.13                  198.                38.7
# ℹ 2 more variables: BO2_chlomean_ss <dbl>, BO21_nitratemean_ss <dbl>

Now make this into one data frame with a pa column for 1 is a occurrence row and 0 if an absence row.

pres <- c(rep(1, nrow(env.occ)), rep(0, nrow(env.abs)))
sdm_data <- data.frame(pa = pres, rbind(env.occ, env.abs))
head(sdm_data)
  pa BO_sstmean BO_bathymean BO22_ph BO2_dissoxmean_bdmean BO2_salinitymean_ss
1  1     26.431          -59   8.173              182.6581            35.63274
2  1     28.648        -3158   8.181              154.3245            35.73521
3  1     26.428          -92   8.173              145.4299            35.64719
4  1     26.822        -2764   8.148              130.0710            36.09258
5  1     27.851           -4   8.133              197.8163            38.67305
6  1     27.585           -8   8.133              197.6590            38.71139
  BO2_chlomean_ss BO21_nitratemean_ss
1        0.337094            2.043796
2        0.107879            0.132562
3        0.343290            2.086616
4        0.217175            0.615217
5        0.140721            0.000003
6        0.141303            0.000003

Save to a file. We will use for other models.

fil <- here::here(sdm_dir,  "sdm_data",  "sdm_data.csv")
write.csv(sdm_data, row.names = FALSE, file=fil)

Fit Maxnet model

maxnet::maxnet(pres, environ)

  • pres string of 1s and 0s for whether the row is a occurrence or a absence.
  • environ a data frame of the environmental variables only
pres <- sdm_data$pa
environ <- sdm_data %>% dplyr::select(-pa)
sdm.model <- maxnet::maxnet(pres, environ)

Model metrics

responses <- plot(sdm.model, type = "cloglog")

We have some bathymetry values > 0 which might be a problem.

table(environ$BO_bathymean>0)

FALSE  TRUE 
12617   277 

Predicting

clamp <- TRUE       # see ?predict.maxnet for details
type <- "cloglog"
bb <- sf::st_bbox(extent_polygon) # make a sf bounding box 
predicted <- predict(sdm.model, 
                     env.stars %>% sf::st_crop(bb), 
                     clamp = clamp, type = type)
although coordinates are longitude/latitude, st_intersects assumes that they
are planar
predicted
stars object with 2 dimensions and 1 attribute
attribute(s):
             Min.   1st Qu.    Median     Mean   3rd Qu.      Max.  NA's
pred  0.003818902 0.1137863 0.1655903 0.174387 0.2148277 0.9999996 52802
dimension(s):
  from  to offset    delta                       refsys x/y
x    1 280  41.83  0.08333 +proj=longlat +datum=WGS8... [x]
y    1 388  32.17 -0.08333 +proj=longlat +datum=WGS8... [y]

Visualization

We can plot the predictions like so, but the default palette is not great and are map is distorted.

ggplot() +
    geom_stars(data = predicted)

We can try the cmocean palette and fix the coordinates.

ggplot() +
  geom_stars(data = predicted) +
  scale_fill_cmocean() +
  coord_equal()

Or other palettes and annotation.

predplot <- ggplot() +
  geom_stars(data = predicted) +
  scale_fill_cmocean(name = "ice", direction = -1, guide = guide_colorbar(barwidth = 1, barheight = 10, ticks = FALSE, nbin = 1000, frame.colour = "black"), limits = c(0, 1)) +
  theme_linedraw() +
  coord_equal() +
  theme(panel.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  labs(title = "Loggerhead and green sea turtle SDM in the Arabian Sea",
       x = "Longitude",
       y = "Latitude",
       fill = "Probability",
       shape = "Species (presence)",
       subtitle = "Environmental predictors: mean SS temp, mean SS salinity, mean bathymetry, \nmean pH, mean DO, mean SS chlorophyll-a, mean SS nitrate")

predplot

# ggsave("SDM_loggerhead_green_w points.pdf", height = 6, width = 8.5)

Without the occurrence data points.

predplot +
  geom_point(sf.occ, mapping = aes(shape = common.name, geometry = geometry), stat = "sf_coordinates", alpha = 0.3, color = "purple") +
  geom_point(sf.abs, mapping = aes(geometry = geometry), stat = "sf_coordinates", alpha = 0.3, color = "green")

# ggsave("SDM_loggerhead_green.pdf", height = 6, width = 8.5)

Discussion

We did not do much cleaning of the data and we combined loggerheads and green sea turtles. We should separate these. Also some of the data are clearly tagging data and we should subsample that data to remove some of the temporal autocorrelation. We should also experiment with higher and lower numbers of background points.