Using R for dNHC Research

Setup

See 00_Setup.R

Check installed packages

Create a list of required packages

list.of.packages <- c("dplyr", # v.0.8.5
                      "tidyr", # v.1.0.2
                      "plyr", # v.1.8.6
                      "spocc", # v.1.0.8
                      "ridigbio", # v.0.3.5
                      "tibble", # v.3.0.0
                      "rbison",
                      "CoordinateCleaner",
                      "lubridate",
                      "ggplot2",
                      "gtools",
                      "leaflet", 
                      "sp", 
                      "rgeos", 
                      "ggspatial", 
                      "raster", 
                      "agricolae", 
                      "gridExtra")

Compare to installed packages

Compare list of required to installed packages. Retain only missing packages (!).

new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]

Install missing packages

Run for loop to install any missing packages. This loops through the list of new.packages and installs each.

for(i in 1:length(new.packages)){
  install.packages(new.packages[i])
}

Cleaning Data

See 01_CleaningData.R
Download data and clean it.

Load Packages

library(CoordinateCleaner)
library(lubridate)

Load Functions

This is a function I created with Natalie Patten (a very talent undergrad at UF). It will be part of her R package gatoRs (Geographic And Taxonomic Occurrence R-based Scrubbing).

source("functions/DownloadingDataMore.R")

Download Shortia galacifolia

Use spocc_combined to create a csv containing occurrence records from iDigBio, GBIF, and BISON. Find out more about this function here.

spocc_combine("Shortia galacifolia", "data/Shortia_galacifolia_raw.csv")

Read in downloaded data frame

rawdf <- read.csv("data/Shortia_galacifolia_raw.csv")

Cleaning

Inspect data frame

What columns are included?

names(rawdf)

##  [1] "name"                          "basis"                        
##  [3] "date"                          "institutionID"                
##  [5] "collectionCode"                "collectionID"                 
##  [7] "country"                       "county"                       
##  [9] "state"                         "locality"                     
## [11] "Latitude"                      "Longitude"                    
## [13] "ID"                            "coordinateUncertaintyInMeters"
## [15] "informationWithheld"           "habitat"                      
## [17] "prov"                          "spocc.latitude"               
## [19] "spocc.longitude"               "spocc.prov"                   
## [21] "spocc.date"                    "spocc.name"

How many observations do we start with?

nrow(rawdf)

## [1] 635

1. Resolve taxon names

Inspect scientific names included in the raw df.

unique(rawdf$name)

## [1] shortia galacifolia                Shortia galacifolia Torr. & A.Gray
## Levels: shortia galacifolia Shortia galacifolia Torr. & A.Gray

Create a list of accepted names based on the name column in your data frame

search <-  c("Shortia galacifolia")

Filter to only include accepted namesUsing the R package dplyr, we: 1. Filter the data frame to only include rows with the accepted names. 2. Filter out any rows with NA for dwc.scientificName. 3. Create a column called name and set it equal to “Shortia galacifolia”.

df <- rawdf %>% 
      filter(grepl(search, name, ignore.case =  TRUE)) %>% 
      mutate(new.name = "Shortia galacifolia")

How many observations do we have now?

nrow(df)

## [1] 635

2. Decrease number of columns

Merge the two locality columns

df$Latitude <-  dplyr::coalesce(df$Latitude, df$spocc.latitude)
df$Longitude <-  dplyr::coalesce(df$Longitude, df$spocc.longitude)

Merge the two date columns

df$date <-  dplyr::coalesce(df$date, df$spocc.date)

Subset columns

Using the R package dplyr, we select and rename columns.

df <- df %>% 
      dplyr::select(ID = ID,
                name = new.name,
                basis = basis,
                coordinateUncertaintyInMeters = coordinateUncertaintyInMeters,
                informationWithheld = informationWithheld,
                lat = Latitude,
                long = Longitude,
                date = date)

3. Clean localities

Remove NAs

Using the R package dplyr, we: 1. Filter out(!) any rows where long ‘is.na’. 2. Filter out(!) any rows where lat ‘is.na’.

df <- df %>% 
      filter(!is.na(long)) %>% 
      filter(!is.na(lat))

How many observations do we have now?

nrow(df)

## [1] 9

Fix precision

Using the R base function ‘round’, we round lat and long to two decimal places.

df$lat <- round(df$lat, digits = 2)
df$long <- round(df$long, digits = 2)

Remove unlikely points

Remove points at 0.00, 0.00 Using the R package dplyr, we:
1. Filter to retain rows where long is NOT(!) equal to 0.00. 2. Filter to retain rows where long is NOT(!) equal to 0.00.

df <- df %>% 
      filter(long != 0.00) %>% 
      filter(lat != 0.00)

Remove bad coordinates

Remove coordinates in cultivated zones, botanical gardens, and outside our desired range Using the R package CoordinateCleaner, we first if points are at biodiversity institutions and remove any points that are occurring at institutions.

df <- cc_inst(df, 
              lon = "long",
              lat = "lat",
              species = "name")

## Testing biodiversity institutions

## Removed 0 records.

Next, we look for geographic outliers and remove outliers.

df <- cc_outl(df,
              lon = "long",
              lat = "lat",
              species = "name")

## Testing geographic outliers

## Removed 1 records.

How many observations do we have now?

nrow(df)

## [1] 8

4. Remove Duplicates

Fix dates

Using the R package lubridate, we first parse the date into the same format.

df$date <- lubridate::ymd(df$date)

Next you are going to seperate date into year, month, and day - where every column only contains one set of information.

df <- df %>% 
      dplyr::mutate(year = lubridate::year(date),
                month = lubridate::month(date),
                day = lubridate::day(date))

Remove rows with identical lat, long, year, month, and day

If a specimen shares lat, long, and event date we are assuming that it is identical. Many specimen lack date and lat/long, so this may be getting rid of information you would want to keep.

df <- distinct(df, lat, long, year, month, day, .keep_all = TRUE)

How many observations do we have now?

nrow(df)

## [1] 8

5. Save Cleaned .csv

write.csv(df, "data/Shortia_galacifolia_061521-cleaned.csv", row.names = FALSE)

Mapping

See 02_Mapping.R.
Map occurrence records using ggplot2 and leaflet.

Load Packages

library(ggplot2)
library(leaflet)
library(sp)
library(rgeos)
library(ggspatial)

Read data file

df <- read.csv("data/Shortia_galacifolia_061521-cleaned.csv")

Map using ggplot2

Make a simple map using ggplot2.

Set base maps

Here we download a USA map, a state level USA map, and a county level USA map.

USA <- borders(database = "usa", colour = "gray50", fill = "gray50")
states <-  borders("state", colour = "black", fill = NA)
county <- borders(database = "county", colour = "gray40", fill = NA)

Plot

Here we are going to add our USA map and the occurrence points. Next, we zoom into our area of interest. Then, we fix the x and y labels. Finally, we add a scale and a north arrow

simple_map <- ggplot() +
              USA +
              # Better to add state or county here - order matters! 
              geom_point(df, 
                         mapping = aes(x = long, y = lat), 
                         col = "blue") +
              coord_sf(xlim = c(-86, -76), ylim = c(30, 38)) +
              xlab("Longitude") +
              ylab("Latitude") +
              annotation_scale() +
              annotation_north_arrow( height = unit(1, "cm"),
                                      width = unit(1, "cm"), 
                                      location = "tl")

simple_map

Add States

simple_map + states

Add Counties

simple_map + county

Map using Leaflet

Find out more about leaflet here. Basically it uses JavaScript to make interactive maps.

Make points spatial

pts <- SpatialPoints(df[,7:6])

Make a simple leaflet map

m <- leaflet(pts) %>% 
     addCircles() %>% 
     addTiles() 
m

Set extent and zoom out

m <- fitBounds(m, -86, 30, -76, 38)
m

Make fancy icon

ShortiaIcon <- makeIcon(
  iconUrl = "https://live.staticflickr.com/2914/32392790054_27192e77e6_b.jpg",
  iconWidth = 18, iconHeight = 28,
  iconAnchorX = 17, iconAnchorY = 27
)

m <- leaflet(pts) %>% 
     addTiles()  %>% 
     addMarkers(icon = ShortiaIcon) 
m

Add label to see long, lat

m <- leaflet(pts) %>% 
     addTiles()  %>% 
     addMarkers(label = paste0(df$long, ", ", df$lat))  
m

Add rectangle marking extent

m <- leaflet(pts) %>% 
     addTiles()  %>% 
     addMarkers(label = paste0(df$long, ", ", df$lat))   %>%
     addRectangles(
        lng1 = max(df$long), lat1 = max(df$lat),
        lng2 = min(df$long), lat2 = min(df$lat),
        fillColor = "transparent")
m

Point Statistics

See 03_PointStats.R.
Point sample and run a simple ANOVA.

Load Packages

library(gtools)
library(raster)
library(dplyr)
library(tibble)
library(agricolae)
library(gridExtra)

Read data files

dfS <- read.csv("data/Shortia_galacifolia_061521-cleaned.csv")
dfG <- read.csv("data/Galax_urceolata_061521-cleaned.csv")

Modify the data frame

Combined the two df

df <- rbind(dfS, dfG)

Subset spatial points

pts <- SpatialPoints(df[,7:6])

Import BioClim files

List BioClim files

list <- list.files("data/bio/", 
                    full.names = TRUE,
                    recursive = FALSE)

Order list using gtools

list <- mixedsort(sort(list))

Load the rasters

envtStack <- raster::stack(list)

Point Sample

Extract value for each point

ptExtracted <- raster::extract(envtStack, pts)

Convert to data frame

ptExtracteddf <- as.data.frame(ptExtracted)

Add species name

ptExtracteddf <- ptExtracteddf %>%
                 mutate(name = as.character(df$name))

ANOVA

ANOVA or an analysis of variance. To run an ANOVA, you first set the linear model using lm. Next, the aov function will run the ANOVA.

bio1aov <- aov(lm(ptExtracteddf[, 1] ~ name, data = ptExtracteddf))
bio1aov

## Call:
##    aov(formula = lm(ptExtracteddf[, 1] ~ name, data = ptExtracteddf))
## 
## Terms:
##                      name Residuals
## Sum of Squares   14.98401 153.62829
## Deg. of Freedom         1        31
## 
## Residual standard error: 2.226152
## Estimated effects may be unbalanced
## 2 observations deleted due to missingness

Tukey HSD test

Here we will do a multiple comparisons of treatments by means of Tukey.

b1 <- HSD.test(bio1aov, trt = "name", alpha = 0.05)  
b1

## $statistics
##    MSerror Df     Mean       CV
##   4.955751 31 13.07235 17.02947
## 
## $parameters
##    test name.t ntr StudentizedRange alpha
##   Tukey   name   2         2.884308  0.05
## 
## $means
##                     ptExtracteddf[, 1]      std  r       Min      Max      Q25
## Galax urceolata               12.69117 2.386864 25  8.533334 17.05833 11.06667
## Shortia galacifolia           14.26354 1.553680  8 10.841666 15.25833 13.70729
##                          Q50      Q75
## Galax urceolata     13.52917 14.14583
## Shortia galacifolia 15.15833 15.18229
## 
## $comparison
## NULL
## 
## $groups
##                     ptExtracteddf[, 1] groups
## Shortia galacifolia           14.26354      a
## Galax urceolata               12.69117      a
## 
## attr(,"class")
## [1] "group"

Separate the groups

bio1_group <- b1$groups
bio1_group <- rownames_to_column(bio1_group, var = "name")
bio1_group <- bio1_group %>%
              dplyr::select(name, groups)
bio1_group

##                  name groups
## 1 Shortia galacifolia      a
## 2     Galax urceolata      a

Subset from the original df

Here we subset the name and bio_1 column from the original df.

part <- ptExtracteddf %>%
        dplyr::select(name, bio_1)

Join the subset df to the bio1_group data frame

bio1pl <- left_join(part, bio1_group, by = "name")

Plot

bio1_aov_plot <- ggplot(bio1pl, aes(x = name, y = bio_1)) +
                  geom_boxplot(aes(fill = groups)) +
                  geom_text(data = bio1_group, 
                            mapping = aes(x = name,
                                          y = 20, 
                                          label = groups), 
                            size = 5, inherit.aes = FALSE) +
                  theme(axis.text.x = element_text(angle = 90, 
                                                   size = 8, 
                                                   face = 'italic'))
bio1_aov_plot

Loop through all variables

First, set a list of the variable names and an empty list to store the resulting plots in.

variablelist <- colnames(ptExtracteddf)[1:20]
plotlist <- c()

for(i in 1:20){
      ### ANOVA 
      bioaov <- aov(lm(ptExtracteddf[, i] ~ name, data = ptExtracteddf))
      ### Tukey HSD test
      b <- HSD.test(bioaov, trt = "name", alpha = 0.05)    
      ### Separate the groups
      bio_group <- b$groups
      bio_group <- rownames_to_column(bio_group, var = "name")
      bio_group <- bio_group %>%
                   dplyr::select(name, groups)
      ### Subset from the original df
      part <- ptExtracteddf %>%
              dplyr::select(name, variablelist[i])
      #### Join the subset df to the bio_group data frame
      biopl <- left_join(part, bio_group, by = "name")
      #### Plot
      plotlist[[i]] <- ggplot(biopl, aes(x = name, y = biopl[,2])) +
                       geom_boxplot(aes(fill = groups)) +
                       geom_text(data = bio_group, 
                                mapping = aes(x = name,
                                              y = (max(na.omit(biopl[,2]))* 1.3), 
                                              label = groups), 
                                size = 5, inherit.aes = FALSE) +
                               scale_x_discrete(labels = c('G', 'S')) +
                       ggtitle(label  = paste(variablelist[i]))
      
}

gridExtra::grid.arrange(grobs = plotlist)