Chapter 3 Data Cleaning

3.1 Load data

3.1.1 Load the river herring database and its tables

library(tidyverse)
library(DBI)

#Load database
rh_db <- dbConnect(RSQLite::SQLite(),
                        "../DataClean/rh.db")

#Load tables
fish <- dbGetQuery(rh_db, "SELECT * FROM fish;")
envir <- dbGetQuery(rh_db, "SELECT * FROM envir;")
diet <- dbGetQuery(rh_db, "SELECT * FROM diet;")
diet_size <- dbGetQuery(rh_db, "SELECT * FROM diet_size;")
zp_biomass <- dbGetQuery(rh_db, "SELECT * FROM zp_biomass;")
zp_size <- dbGetQuery(rh_db, "SELECT * FROM zp_size;")

3.2 Data prep, fish and prey size

3.2.1 The steps for the data preparation are as follows:

Join the fish and diet size data
Calculate the average prey length for each prey type in each fish
Calculate the estimated total length for prey items that were not measured (were not within the first 20 of that prey type) for each prey type in each fish
Calculate the total prey length for each prey type in each fish
Calculate the total prey length for all prey types in each fish
Calculate the total number of prey within each fish
Calculate the average prey length

#Prep fish and prey size data
avgfishpreylengthstep1 <- diet_size %>%
  left_join(fish, by = "fish_id") %>% #Step 1
  mutate(rowavg = rowMeans(subset(diet_size, select = c(6:25)), 
                           na.rm = TRUE)) %>% #Step 2 
  mutate(over20bm = if_else(prey_tot>20, (prey_tot-20)*(rowavg), 0)) #Step 3

avgfishpreylength <- avgfishpreylengthstep1 %>%
  mutate(preylengthsum = rowSums(subset(avgfishpreylengthstep1, #Step 4
                                        select = c(6:25,32)), na.rm = TRUE)) %>% 
  group_by(fish_id) %>% 
  mutate(fishpreylength = sum(preylengthsum)) %>% #Step 5
  mutate(fishpreytot = sum(prey_tot)) %>% #Step 6
  mutate(fishavgpreylength = fishpreylength/fishpreytot) #Step 7

3.3 Data prep, prey selectivity

3.3.1 The steps for the zooplankton data preparation are as follows:

Calculate total number of sites sampled in each cove for that month
Calculate the average number of individuals per zooplankton category across sampling sites for each cove month combo
Calculate the total number of individuals for each zooplankton category sampled at each cove month combo
Calculate the total number of individuals sampled at each cove month combo
Calculate the percent of the total zooplankton individuals that each zooplankton category makes up in each cove month combo
Simplify data set to needed columns and rows
Add a column stating that this is water column data
Rename zp_order to category

#Prep the water column data
zpwc <- zp_biomass %>%
  group_by(covemonth) %>%
  mutate(sites_tot = length(unique(site))) %>%
  ungroup() %>% #Step 1
  group_by(covemonth, zp_genus) %>%
  mutate(cmavgindiv = sum(zp_indiv_per_m3)/sites_tot) %>% 
  select(-c(zp_id, site, zp_indiv_per_m3, zp_microg_per_m3)) %>%
  distinct() %>%
  ungroup() %>% #Step 2
  group_by(covemonth, zp_order) %>%
  mutate(cattot = sum(cmavgindiv)) %>%
  ungroup() %>% #Step 3
  group_by(covemonth) %>%
  mutate(covmonthtot = sum(cmavgindiv)) %>%
  ungroup %>% #Step 4
  mutate(perctot = cattot/covmonthtot) %>% #Step 5
  select(1:4, 11) %>%
  distinct() %>% #Step 6
  mutate(gutwater = "watercolumn") %>% #Step 7
  rename(category = zp_order) #Step 8

zpwc$category <- str_replace_all(zpwc$category, 'Cyclopoida', 'Copepoda')

3.3.2 The steps for the prey data preparation are as follows:

Join fish data to diet data
Calculate the total number of prey individuals for each prey category sampled at each cove month combo
Calculate the total number of prey individuals sampled at each cove month combo
Calculate the percent of the total prey that each prey category is for each cove month combo
Simplify data set to needed columns and rows
Add a column stating that this is gut content data
Reorder the columns to desired structure
Rename prey_cat to category

#Prep the gut content data
zpgc <- diet %>%
  left_join(fish, by = "fish_id") %>% #Step 1
  group_by(covemonth, prey_cat) %>%
  mutate(cattot = sum(total)) %>%
  ungroup() %>% #Step 2
  group_by(covemonth) %>%
  mutate(covmonthtot = sum(total)) %>%
  ungroup() %>% #Step 3
  mutate(perctot = cattot/covmonthtot) %>% #Step 4
  select(4, 7:9, 14) %>%
  distinct() %>% #Step 5
  mutate(gutwater = "gut") %>% #Step 6
  relocate(prey_cat, .after = covemonth) %>% #Step 7
  rename(category = prey_cat) #Step 8

3.3.3 The steps for combining the data sets and calculating selectivity are as follows:

Bind the two data sets
Filter the data set to only include the top three categories— Cladocera, Copepoda, Ploima
Filter the data set to only include July, August, and September
Factor the month and cove levels
Pivot the data set wider
Add in actual 0s
Create data set for Ostracoda
Add row for Wethersfield data in July
Convert to percent
Factor month and cove levels

#Create data set with water column and gut content data
zpwcgc <- rbind(zpwc, zpgc) %>% #Step 1
  filter(category == "Cladocera" | 
           category == "Copepoda" | 
           category == "Ploima") %>% #Step 2
  filter(month == "July" |
           month == "August" |
           month == "September") #Step 3
  

zpwcgc$month <- factor(zpwcgc$month, levels = c("July", "August", 
                                                "September"))
zpwcgc$cove <- factor(zpwcgc$cove, levels = c("wethersfield", "chapman", 
                                              "hamburg")) #Step 4

zpwcgcwide <- zpwcgc %>%
  pivot_wider(names_from = gutwater,
              values_from = perctot) %>%
  mutate(selectivity = (gut-watercolumn)/(gut+watercolumn)) #Step 5

zpwcgcwide[9,6] = 0 #Replace NA in Wethersfield/Sept/Ploima with actual 0
zpwcgcwide[27,5] = 0 #Replace NA in Chapman/July/Copepoda with actual 0 #Step 6

#Cannot calculate selectivity values for Ostracoda because they were not sampled
#from the environment. Instead a data set for Ostracoda percent of total prey
#items is needed created

ostra <- zpgc %>%
  filter(category == "Ostracoda") %>% #Step 7
  add_row (cove = "wethersfield", month = "July", 
           covemonth = "wethersfieldJuly", category = "Ostracoda", perctot = 0, 
           gutwater = "gut") %>% #Step 8
  mutate(actperc = perctot * 100) #Step 9
  #Add a row with actual 0 for Wethersfield/July/Ostracoda

ostra$month <- factor(ostra$month, levels = c("July", "August", "September"))
ostra$cove <- factor(ostra$cove, levels = c("wethersfield", "chapman", 
                                            "hamburg")) #Step 10