Chapter 3 Data Cleaning

3.1 Load data

3.1.1 Load the river herring database and its tables

library(tidyverse)
library(DBI)

#Load database
rh_db <- dbConnect(RSQLite::SQLite(),
                        "../DataClean/rh.db")

#Load tables
fish <- dbGetQuery(rh_db, "SELECT * FROM fish;")
envir <- dbGetQuery(rh_db, "SELECT * FROM envir;")
diet <- dbGetQuery(rh_db, "SELECT * FROM diet;")
diet_size <- dbGetQuery(rh_db, "SELECT * FROM diet_size;")
zp_biomass <- dbGetQuery(rh_db, "SELECT * FROM zp_biomass;")
zp_size <- dbGetQuery(rh_db, "SELECT * FROM zp_size;")

3.2 Data prep, fish and prey size

3.2.1 The steps for the data preparation are as follows:

  1. Join the fish and diet size data

  2. Calculate the average prey length for each prey type in each fish

  3. Calculate the estimated total length for prey items that were not measured (were not within the first 20 of that prey type) for each prey type in each fish

  4. Calculate the total prey length for each prey type in each fish

  5. Calculate the total prey length for all prey types in each fish

  6. Calculate the total number of prey within each fish

  7. Calculate the average prey length

#Prep fish and prey size data
avgfishpreylengthstep1 <- diet_size %>%
  left_join(fish, by = "fish_id") %>% #Step 1
  mutate(rowavg = rowMeans(subset(diet_size, select = c(6:25)), 
                           na.rm = TRUE)) %>% #Step 2 
  mutate(over20bm = if_else(prey_tot>20, (prey_tot-20)*(rowavg), 0)) #Step 3

avgfishpreylength <- avgfishpreylengthstep1 %>%
  mutate(preylengthsum = rowSums(subset(avgfishpreylengthstep1, #Step 4
                                        select = c(6:25,32)), na.rm = TRUE)) %>% 
  group_by(fish_id) %>% 
  mutate(fishpreylength = sum(preylengthsum)) %>% #Step 5
  mutate(fishpreytot = sum(prey_tot)) %>% #Step 6
  mutate(fishavgpreylength = fishpreylength/fishpreytot) #Step 7

3.3 Data prep, prey selectivity

3.3.1 The steps for the zooplankton data preparation are as follows:

  1. Calculate total number of sites sampled in each cove for that month

  2. Calculate the average number of individuals per zooplankton category across sampling sites for each cove month combo

  3. Calculate the total number of individuals for each zooplankton category sampled at each cove month combo

  4. Calculate the total number of individuals sampled at each cove month combo

  5. Calculate the percent of the total zooplankton individuals that each zooplankton category makes up in each cove month combo

  6. Simplify data set to needed columns and rows

  7. Add a column stating that this is water column data

  8. Rename zp_order to category

#Prep the water column data
zpwc <- zp_biomass %>%
  group_by(covemonth) %>%
  mutate(sites_tot = length(unique(site))) %>%
  ungroup() %>% #Step 1
  group_by(covemonth, zp_genus) %>%
  mutate(cmavgindiv = sum(zp_indiv_per_m3)/sites_tot) %>% 
  select(-c(zp_id, site, zp_indiv_per_m3, zp_microg_per_m3)) %>%
  distinct() %>%
  ungroup() %>% #Step 2
  group_by(covemonth, zp_order) %>%
  mutate(cattot = sum(cmavgindiv)) %>%
  ungroup() %>% #Step 3
  group_by(covemonth) %>%
  mutate(covmonthtot = sum(cmavgindiv)) %>%
  ungroup %>% #Step 4
  mutate(perctot = cattot/covmonthtot) %>% #Step 5
  select(1:4, 11) %>%
  distinct() %>% #Step 6
  mutate(gutwater = "watercolumn") %>% #Step 7
  rename(category = zp_order) #Step 8

zpwc$category <- str_replace_all(zpwc$category, 'Cyclopoida', 'Copepoda')

3.3.2 The steps for the prey data preparation are as follows:

  1. Join fish data to diet data

  2. Calculate the total number of prey individuals for each prey category sampled at each cove month combo

  3. Calculate the total number of prey individuals sampled at each cove month combo

  4. Calculate the percent of the total prey that each prey category is for each cove month combo

  5. Simplify data set to needed columns and rows

  6. Add a column stating that this is gut content data

  7. Reorder the columns to desired structure

  8. Rename prey_cat to category

#Prep the gut content data
zpgc <- diet %>%
  left_join(fish, by = "fish_id") %>% #Step 1
  group_by(covemonth, prey_cat) %>%
  mutate(cattot = sum(total)) %>%
  ungroup() %>% #Step 2
  group_by(covemonth) %>%
  mutate(covmonthtot = sum(total)) %>%
  ungroup() %>% #Step 3
  mutate(perctot = cattot/covmonthtot) %>% #Step 4
  select(4, 7:9, 14) %>%
  distinct() %>% #Step 5
  mutate(gutwater = "gut") %>% #Step 6
  relocate(prey_cat, .after = covemonth) %>% #Step 7
  rename(category = prey_cat) #Step 8

3.3.3 The steps for combining the data sets and calculating selectivity are as follows:

  1. Bind the two data sets

  2. Filter the data set to only include the top three categories— Cladocera, Copepoda, Ploima

  3. Filter the data set to only include July, August, and September

  4. Factor the month and cove levels

  5. Pivot the data set wider

  6. Add in actual 0s

  7. Create data set for Ostracoda

  8. Add row for Wethersfield data in July

  9. Convert to percent

  10. Factor month and cove levels

#Create data set with water column and gut content data
zpwcgc <- rbind(zpwc, zpgc) %>% #Step 1
  filter(category == "Cladocera" | 
           category == "Copepoda" | 
           category == "Ploima") %>% #Step 2
  filter(month == "July" |
           month == "August" |
           month == "September") #Step 3
  

zpwcgc$month <- factor(zpwcgc$month, levels = c("July", "August", 
                                                "September"))
zpwcgc$cove <- factor(zpwcgc$cove, levels = c("wethersfield", "chapman", 
                                              "hamburg")) #Step 4

zpwcgcwide <- zpwcgc %>%
  pivot_wider(names_from = gutwater,
              values_from = perctot) %>%
  mutate(selectivity = (gut-watercolumn)/(gut+watercolumn)) #Step 5

zpwcgcwide[9,6] = 0 #Replace NA in Wethersfield/Sept/Ploima with actual 0
zpwcgcwide[27,5] = 0 #Replace NA in Chapman/July/Copepoda with actual 0 #Step 6

#Cannot calculate selectivity values for Ostracoda because they were not sampled
#from the environment. Instead a data set for Ostracoda percent of total prey
#items is needed created

ostra <- zpgc %>%
  filter(category == "Ostracoda") %>% #Step 7
  add_row (cove = "wethersfield", month = "July", 
           covemonth = "wethersfieldJuly", category = "Ostracoda", perctot = 0, 
           gutwater = "gut") %>% #Step 8
  mutate(actperc = perctot * 100) #Step 9
  #Add a row with actual 0 for Wethersfield/July/Ostracoda

ostra$month <- factor(ostra$month, levels = c("July", "August", "September"))
ostra$cove <- factor(ostra$cove, levels = c("wethersfield", "chapman", 
                                            "hamburg")) #Step 10