See 00_Setup.R
Create a list of required packages
list.of.packages <- c("dplyr", # v.0.8.5
"tidyr", # v.1.0.2
"plyr", # v.1.8.6
"spocc", # v.1.0.8
"ridigbio", # v.0.3.5
"tibble", # v.3.0.0
"rbison",
"CoordinateCleaner",
"lubridate",
"ggplot2",
"gtools",
"leaflet",
"sp",
"rgeos",
"ggspatial",
"raster",
"agricolae",
"gridExtra")
Compare list of required to installed packages. Retain only missing packages (!).
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
Run for loop to install any missing packages. This loops through the list of new.packages and installs each.
for(i in 1:length(new.packages)){
install.packages(new.packages[i])
}
See 01_CleaningData.R
Download data and clean it.
library(CoordinateCleaner)
library(lubridate)
This is a function I created with Natalie Patten (a very talent undergrad at UF). It will be part of her R package gatoRs (Geographic And Taxonomic Occurrence R-based Scrubbing).
source("functions/DownloadingDataMore.R")
Use spocc_combined to create a csv containing occurrence records from iDigBio, GBIF, and BISON. Find out more about this function here.
spocc_combine("Shortia galacifolia", "data/Shortia_galacifolia_raw.csv")
rawdf <- read.csv("data/Shortia_galacifolia_raw.csv")
What columns are included?
names(rawdf)
## [1] "name" "basis"
## [3] "date" "institutionID"
## [5] "collectionCode" "collectionID"
## [7] "country" "county"
## [9] "state" "locality"
## [11] "Latitude" "Longitude"
## [13] "ID" "coordinateUncertaintyInMeters"
## [15] "informationWithheld" "habitat"
## [17] "prov" "spocc.latitude"
## [19] "spocc.longitude" "spocc.prov"
## [21] "spocc.date" "spocc.name"
How many observations do we start with?
nrow(rawdf)
## [1] 635
Inspect scientific names included in the raw df.
unique(rawdf$name)
## [1] shortia galacifolia Shortia galacifolia Torr. & A.Gray
## Levels: shortia galacifolia Shortia galacifolia Torr. & A.Gray
Create a list of accepted names based on the name column in your data frame
search <- c("Shortia galacifolia")
Filter to only include accepted namesUsing the R package dplyr, we: 1. Filter the data frame to only include rows with the accepted names. 2. Filter out any rows with NA for dwc.scientificName. 3. Create a column called name and set it equal to “Shortia galacifolia”.
df <- rawdf %>%
filter(grepl(search, name, ignore.case = TRUE)) %>%
mutate(new.name = "Shortia galacifolia")
How many observations do we have now?
nrow(df)
## [1] 635
df$Latitude <- dplyr::coalesce(df$Latitude, df$spocc.latitude)
df$Longitude <- dplyr::coalesce(df$Longitude, df$spocc.longitude)
df$date <- dplyr::coalesce(df$date, df$spocc.date)
Using the R package dplyr, we select and rename columns.
df <- df %>%
dplyr::select(ID = ID,
name = new.name,
basis = basis,
coordinateUncertaintyInMeters = coordinateUncertaintyInMeters,
informationWithheld = informationWithheld,
lat = Latitude,
long = Longitude,
date = date)
Using the R package dplyr, we: 1. Filter out(!) any rows where long ‘is.na’. 2. Filter out(!) any rows where lat ‘is.na’.
df <- df %>%
filter(!is.na(long)) %>%
filter(!is.na(lat))
How many observations do we have now?
nrow(df)
## [1] 9
Using the R base function ‘round’, we round lat and long to two decimal places.
df$lat <- round(df$lat, digits = 2)
df$long <- round(df$long, digits = 2)
Remove points at 0.00, 0.00 Using the R package dplyr, we:
1. Filter to retain rows where long is NOT(!) equal to 0.00. 2. Filter to retain rows where long is NOT(!) equal to 0.00.
df <- df %>%
filter(long != 0.00) %>%
filter(lat != 0.00)
Remove coordinates in cultivated zones, botanical gardens, and outside our desired range Using the R package CoordinateCleaner, we first if points are at biodiversity institutions and remove any points that are occurring at institutions.
df <- cc_inst(df,
lon = "long",
lat = "lat",
species = "name")
## Testing biodiversity institutions
## Removed 0 records.
Next, we look for geographic outliers and remove outliers.
df <- cc_outl(df,
lon = "long",
lat = "lat",
species = "name")
## Testing geographic outliers
## Removed 1 records.
How many observations do we have now?
nrow(df)
## [1] 8
Using the R package lubridate, we first parse the date into the same format.
df$date <- lubridate::ymd(df$date)
Next you are going to seperate date into year, month, and day - where every column only contains one set of information.
df <- df %>%
dplyr::mutate(year = lubridate::year(date),
month = lubridate::month(date),
day = lubridate::day(date))
If a specimen shares lat, long, and event date we are assuming that it is identical. Many specimen lack date and lat/long, so this may be getting rid of information you would want to keep.
df <- distinct(df, lat, long, year, month, day, .keep_all = TRUE)
How many observations do we have now?
nrow(df)
## [1] 8
write.csv(df, "data/Shortia_galacifolia_061521-cleaned.csv", row.names = FALSE)
See 02_Mapping.R.
Map occurrence records using ggplot2 and leaflet.
library(ggplot2)
library(leaflet)
library(sp)
library(rgeos)
library(ggspatial)
df <- read.csv("data/Shortia_galacifolia_061521-cleaned.csv")
Make a simple map using ggplot2.
Here we download a USA map, a state level USA map, and a county level USA map.
USA <- borders(database = "usa", colour = "gray50", fill = "gray50")
states <- borders("state", colour = "black", fill = NA)
county <- borders(database = "county", colour = "gray40", fill = NA)
Here we are going to add our USA map and the occurrence points. Next, we zoom into our area of interest. Then, we fix the x and y labels. Finally, we add a scale and a north arrow
simple_map <- ggplot() +
USA +
# Better to add state or county here - order matters!
geom_point(df,
mapping = aes(x = long, y = lat),
col = "blue") +
coord_sf(xlim = c(-86, -76), ylim = c(30, 38)) +
xlab("Longitude") +
ylab("Latitude") +
annotation_scale() +
annotation_north_arrow( height = unit(1, "cm"),
width = unit(1, "cm"),
location = "tl")
simple_map
simple_map + states
simple_map + county
Find out more about leaflet here. Basically it uses JavaScript to make interactive maps.
pts <- SpatialPoints(df[,7:6])
m <- leaflet(pts) %>%
addCircles() %>%
addTiles()
m
m <- fitBounds(m, -86, 30, -76, 38)
m
ShortiaIcon <- makeIcon(
iconUrl = "https://live.staticflickr.com/2914/32392790054_27192e77e6_b.jpg",
iconWidth = 18, iconHeight = 28,
iconAnchorX = 17, iconAnchorY = 27
)
m <- leaflet(pts) %>%
addTiles() %>%
addMarkers(icon = ShortiaIcon)
m
m <- leaflet(pts) %>%
addTiles() %>%
addMarkers(label = paste0(df$long, ", ", df$lat))
m
m <- leaflet(pts) %>%
addTiles() %>%
addMarkers(label = paste0(df$long, ", ", df$lat)) %>%
addRectangles(
lng1 = max(df$long), lat1 = max(df$lat),
lng2 = min(df$long), lat2 = min(df$lat),
fillColor = "transparent")
m
See 03_PointStats.R.
Point sample and run a simple ANOVA.
library(gtools)
library(raster)
library(dplyr)
library(tibble)
library(agricolae)
library(gridExtra)
dfS <- read.csv("data/Shortia_galacifolia_061521-cleaned.csv")
dfG <- read.csv("data/Galax_urceolata_061521-cleaned.csv")
df <- rbind(dfS, dfG)
pts <- SpatialPoints(df[,7:6])
list <- list.files("data/bio/",
full.names = TRUE,
recursive = FALSE)
list <- mixedsort(sort(list))
envtStack <- raster::stack(list)
ptExtracted <- raster::extract(envtStack, pts)
ptExtracteddf <- as.data.frame(ptExtracted)
ptExtracteddf <- ptExtracteddf %>%
mutate(name = as.character(df$name))
ANOVA or an analysis of variance. To run an ANOVA, you first set the linear model using lm. Next, the aov function will run the ANOVA.
bio1aov <- aov(lm(ptExtracteddf[, 1] ~ name, data = ptExtracteddf))
bio1aov
## Call:
## aov(formula = lm(ptExtracteddf[, 1] ~ name, data = ptExtracteddf))
##
## Terms:
## name Residuals
## Sum of Squares 14.98401 153.62829
## Deg. of Freedom 1 31
##
## Residual standard error: 2.226152
## Estimated effects may be unbalanced
## 2 observations deleted due to missingness
Here we will do a multiple comparisons of treatments by means of Tukey.
b1 <- HSD.test(bio1aov, trt = "name", alpha = 0.05)
b1
## $statistics
## MSerror Df Mean CV
## 4.955751 31 13.07235 17.02947
##
## $parameters
## test name.t ntr StudentizedRange alpha
## Tukey name 2 2.884308 0.05
##
## $means
## ptExtracteddf[, 1] std r Min Max Q25
## Galax urceolata 12.69117 2.386864 25 8.533334 17.05833 11.06667
## Shortia galacifolia 14.26354 1.553680 8 10.841666 15.25833 13.70729
## Q50 Q75
## Galax urceolata 13.52917 14.14583
## Shortia galacifolia 15.15833 15.18229
##
## $comparison
## NULL
##
## $groups
## ptExtracteddf[, 1] groups
## Shortia galacifolia 14.26354 a
## Galax urceolata 12.69117 a
##
## attr(,"class")
## [1] "group"
bio1_group <- b1$groups
bio1_group <- rownames_to_column(bio1_group, var = "name")
bio1_group <- bio1_group %>%
dplyr::select(name, groups)
bio1_group
## name groups
## 1 Shortia galacifolia a
## 2 Galax urceolata a
Here we subset the name and bio_1 column from the original df.
part <- ptExtracteddf %>%
dplyr::select(name, bio_1)
bio1pl <- left_join(part, bio1_group, by = "name")
bio1_aov_plot <- ggplot(bio1pl, aes(x = name, y = bio_1)) +
geom_boxplot(aes(fill = groups)) +
geom_text(data = bio1_group,
mapping = aes(x = name,
y = 20,
label = groups),
size = 5, inherit.aes = FALSE) +
theme(axis.text.x = element_text(angle = 90,
size = 8,
face = 'italic'))
bio1_aov_plot
First, set a list of the variable names and an empty list to store the resulting plots in.
variablelist <- colnames(ptExtracteddf)[1:20]
plotlist <- c()
for(i in 1:20){
### ANOVA
bioaov <- aov(lm(ptExtracteddf[, i] ~ name, data = ptExtracteddf))
### Tukey HSD test
b <- HSD.test(bioaov, trt = "name", alpha = 0.05)
### Separate the groups
bio_group <- b$groups
bio_group <- rownames_to_column(bio_group, var = "name")
bio_group <- bio_group %>%
dplyr::select(name, groups)
### Subset from the original df
part <- ptExtracteddf %>%
dplyr::select(name, variablelist[i])
#### Join the subset df to the bio_group data frame
biopl <- left_join(part, bio_group, by = "name")
#### Plot
plotlist[[i]] <- ggplot(biopl, aes(x = name, y = biopl[,2])) +
geom_boxplot(aes(fill = groups)) +
geom_text(data = bio_group,
mapping = aes(x = name,
y = (max(na.omit(biopl[,2]))* 1.3),
label = groups),
size = 5, inherit.aes = FALSE) +
scale_x_discrete(labels = c('G', 'S')) +
ggtitle(label = paste(variablelist[i]))
}
gridExtra::grid.arrange(grobs = plotlist)