Chapter 2 Database Creation
A variety of data was collected for this project; including data specific to individual juvenile river herring, gut content information for each of those fish, specific measurements for individual gut content items, environmental data, zooplankton samples, and specific measurements for individual zooplankton. To better organize and store this data, I have created a database using R and SQL (Figure 2.1).
Figure 2.1: Schematic of database design
To recreate this database, simply execute the code below. Note, each table is referred to by the title provided to each box in the above figure.
2.1 Load required package and initate database building
Load the DBI package into R and initiate a connection between R and SQLite by creating the database file.
library(DBI)
#Create Database ----
rh_db <- dbConnect(RSQLite::SQLite(), "../DataClean/rh.db")2.2 Create FISH table
The fish table includes the following data: fish_id (primary key), month captured, cove captured at, covemonth combination (foreign key), fish age in days, and TL in mm.
##Create fish table in sql
dbExecute(rh_db, "CREATE TABLE fish (
fish_id char(5) NOT NULL,
cove varchar(12) CHECK (cove IN ('chapman', 'wethersfield', 'hamburg')),
month varchar(10) CHECK (month IN ('July', 'August', 'September')),
covemonth varchar(22) CHECK (covemonth IN ('hamburgAugust', 'hamburgJuly',
'wethersfieldJuly', 'wethersfieldAugust', 'wethersfieldSeptember', 'chapmanJuly',
'chapmanAugust', 'chapmanSeptember')),
age_days integer(2),
TL_mm integer(2),
PRIMARY KEY (fish_id)
);")
##Load fish csv into R
fish <- read.csv("../DataClean/Fish.csv",
stringsAsFactors = FALSE)
##Check that names align
names(fish)
##Write loaded csv into sql table
dbWriteTable(rh_db, "fish", fish, append = TRUE)2.3 Create ENVIR table
The envir table includes the following data: covemonth (primary key), month, cove, average conductivity (avg_cond), average temperature in celsius (avg_temp), average percent dissolved oxygen (avg_DOperc), and average milligrams of dissolved oxygen per liter of water (avg_DOmgl). These are averages of measurements taken over a range of water depths.
##Create envir table in sql
dbExecute(rh_db, "CREATE TABLE envir (
cove varchar(12) CHECK (cove IN ('chapman', 'wethersfield', 'hamburg')),
month varchar(10) CHECK (month IN ('June', 'July', 'August', 'September')),
covemonth varchar(22) NOT NULL PRIMARY KEY CHECK (covemonth IN ('hamburgAugust', 'hamburgJuly',
'hamburgJune', 'hamburgSeptember', 'wethersfieldJune', 'wethersfieldJuly',
'wethersfieldAugust', 'wethersfieldSeptember', 'chapmanJune', 'chapmanJuly',
'chapmanAugust', 'chapmanSeptember')),
avg_cond real,
avg_temp real,
avg_DOperc real,
avg_DOmgl real
);")
##Load fish csv into R
envir <- read.csv("../DataClean/Envir.csv",
stringsAsFactors = FALSE)
##Check that names align
names(envir)
##Write loaded csv into sql table
dbWriteTable(rh_db, "envir", envir, append = TRUE)2.4 Create DIET table
The diet table includes the following data: diet_id (primary key), fish_id (foreign key), lowest possible taxonomic classification of prey (prey_type), prey Order (prey_cat), total number of that type of prey found in the fish, and total biomass (micrograms) of that type of prey found in the fish.
##Create diet table in sql
dbExecute(rh_db, "CREATE TABLE diet (
diet_id char(7) PRIMARY KEY NOT NULL,
fish_id char(5),
prey_type varchar(20),
prey_cat varchar(20),
total integer(3),
preybiomass_microg double,
FOREIGN KEY (fish_id) REFERENCES fish(fish_id)
);")
##Load diet csv into R
diet <- read.csv("../DataClean/Diet.csv",
stringsAsFactors = FALSE)
##Check that names align
names(diet)
##Write loaded csv into sql table
dbWriteTable(rh_db, "diet", diet, append = TRUE)2.5 Create DIET_SIZE table
The diet_size table includes the following data: dietsize_id (primary key), fish_id (foreign key), lowest possible taxonomic classification of prey (prey_type), prey Order (prey_cat), and the total length (micrometers) of the first twenty individuals of that prey type identified in a fish.
##Create diet_size table in sql
dbExecute(rh_db, "CREATE TABLE diet_size (
dietsize_id char(11) PRIMARY KEY NOT NULL,
fish_id char(5),
prey_tot double,
prey_type varchar(20),
prey_cat varchar(20),
TL1 double,
TL2 double,
TL3 double,
TL4 double,
TL5 double,
TL6 double,
TL7 double,
TL8 double,
TL9 double,
TL10 double,
TL11 double,
TL12 double,
TL13 doulbe,
TL14 double,
TL15 double,
TL16 double,
TL17 double,
TL18 double,
TL19 double,
TL20 double,
FOREIGN KEY (fish_id) REFERENCES fish(fish_id)
);")
##Load diet_size csv into R
diet_size <- read.csv("../DataClean/Diet_Size.csv",
stringsAsFactors = FALSE)
##Check that names align
names(diet_size)
##Write loaded csv into sql table
dbWriteTable(rh_db, "diet_size", diet_size, append = TRUE)2.6 Create ZP_BIOMASS table
The zp_biomass table includes the following data: zp_id (primary key), cove, month, covemonth(foreign key), sampling site (approximately three unique sites were sampled at each cove, each month), zooplankton genus, zooplankton family, zooplankton order, the number of individual zooplankton per cubic meter of water, and the biomass of zooplankton (micrograms) per cubic meter of water.
##Create zp_biomass table in sql
dbExecute(rh_db, "CREATE TABLE zp_biomass (
zp_id char(5) PRIMARY KEY NOT NULL,
cove varchar(12) CHECK (cove IN ('chapman', 'wethersfield', 'hamburg')),
month varchar(10) CHECK (month IN ('June', 'July', 'August', 'September')),
covemonth varchar(22) CHECK (covemonth IN ('hamburgAugust', 'hamburgJuly',
'hamburgJune', 'hamburgSeptember', 'wethersfieldJune', 'wethersfieldJuly',
'wethersfieldAugust', 'wethersfieldSeptember', 'chapmanJune', 'chapmanJuly',
'chapmanAugust', 'chapmanSeptember')),
site integer(1),
zp_order varchar(20),
zp_family varchar(20),
zp_genus varchar(20),
zp_indiv_per_m3 double,
zp_microg_per_m3 double,
FOREIGN KEY (covemonth) REFERENCES envir(covemonth)
);")
##Load diet csv into R
zp_biomass <- read.csv("../DataClean/ZP_Biomass.csv",
stringsAsFactors = FALSE)
##Check that names align
names(zp_biomass)
##Write loaded csv into sql table
dbWriteTable(rh_db, "zp_biomass", zp_biomass, append = TRUE)2.7 Create ZP_SIZE table
The zp_size table includes the following data: zpsize_id (primary key), cove, month, covemonth(foreign key), sampling site (approximately three unique sites were sampled at each cove, each month), lowest possible taxonomic classification of zooplankton (zp_type), zooplankton order,and the total length (micrometers) of the first twenty individuals of that zooplankton identified at each site at each covemonth combination.
##Create zp_size table in sql
dbExecute(rh_db, "CREATE TABLE zp_size (
zpsize_id char(8) PRIMARY KEY NOT NULL,
cove varchar(12) CHECK (cove IN ('chapman', 'wethersfield', 'hamburg')),
month varchar(10) CHECK (month IN ('June', 'July', 'August', 'September')),
covemonth varchar(22) CHECK (covemonth IN ('hamburgAugust', 'hamburgJuly',
'hamburgJune', 'hamburgSeptember', 'wethersfieldJune', 'wethersfieldJuly',
'wethersfieldAugust', 'wethersfieldSeptember', 'chapmanJune', 'chapmanJuly',
'chapmanAugust', 'chapmanSeptember')),
site integer(1),
zp_type varchar(20),
zp_order varchar(20),
TL1 double,
TL2 double,
TL3 double,
TL4 double,
TL5 double,
TL6 double,
TL7 double,
TL8 double,
TL9 double,
TL10 double,
TL11 double,
TL12 double,
TL13 doulbe,
TL14 double,
TL15 double,
TL16 double,
TL17 double,
TL18 double,
TL19 double,
TL20 double,
TL21 double,
TL22 double,
TL23 double,
TL24 double,
TL25 double,
TL26 double,
TL27 double,
TL28 double,
TL29 double,
TL30 double,
TL31 double,
TL32 double,
TL33 double,
TL34 double,
TL35 double,
TL36 double,
TL37 double,
TL38 double,
TL39 double,
TL40 double,
TL41 double,
TL42 double,
TL43 double,
TL44 double,
TL45 double,
TL46 double,
TL47 double,
TL48 double,
TL49 double,
TL50 double,
TL51 double,
TL52 double,
TL53 double,
TL54 double,
TL55 double,
TL56 double,
TL57 double,
TL58 double,
TL59 double,
TL60 double,
FOREIGN KEY (covemonth) REFERENCES envir(covemonth)
);")
##Load diet_size csv into R
zp_size <- read.csv("../DataClean/ZP_Size.csv",
stringsAsFactors = FALSE)
zp_size[zp_size == ''] <- NA
##Check that names align
names(zp_size)
##Write loaded csv into sql table
dbWriteTable(rh_db, "zp_size", zp_size, append = TRUE)2.8 Check data
#Confirm that data was loaded properly
library(DBI)
rh_db <- dbConnect(RSQLite::SQLite(), "../DataClean/rh.db")
##fish table
dbGetQuery(rh_db, "SELECT * FROM fish LIMIT 3;")## fish_id cove month covemonth age_days TL_mm
## 1 HC065 hamburg August hamburgAugust 49 47
## 2 HC066 hamburg August hamburgAugust 64 50
## 3 HC067 hamburg August hamburgAugust 62 53
##envir table
dbGetQuery(rh_db, "SELECT * FROM envir LIMIT 3;")## cove month covemonth avg_cond avg_temp avg_DOperc avg_DOmgl
## 1 chapman June chapmanJune 157.4400 20.66000 NA NA
## 2 chapman July chapmanJuly 183.5167 27.61667 116.3000 9.45
## 3 chapman August chapmanAugust 198.4833 27.51667 107.1333 8.40
##diet table
dbGetQuery(rh_db, "SELECT * FROM diet LIMIT 3;")## diet_id fish_id prey_type prey_cat total preybiomass_microg
## 1 diet001 CM016 BosminidaeHead Cladocera 4 0.2120711
## 2 diet002 CM016 Chydoridae Cladocera 1 1.3981294
## 3 diet003 CM016 Copepod Copepoda 11 13.0078145
##diet_size table
dbGetQuery(rh_db, "SELECT * FROM diet_size LIMIT 3;")## dietsize_id fish_id prey_tot prey_type prey_cat TL1 TL2
## 1 dietsize001 HC065 15 Ostracoda Ostracoda 0.5700000 0.5030000
## 2 dietsize002 HC065 4 Bosminidaehead Cladocera 0.2468393 0.2468393
## 3 dietsize003 HC065 1 Harpacticoida Harpacticoida 0.4570000 NA
## TL3 TL4 TL5 TL6 TL7 TL8 TL9 TL10 TL11 TL12 TL13 TL14
## 1 0.6500000 0.6430000 0.693 0.461 0.641 0.655 0.621 0.57 0.63 0.634 0.605 0.465
## 2 0.2468393 0.2468393 NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA NA NA
## TL15 TL16 TL17 TL18 TL19 TL20
## 1 0.546 NA NA NA NA NA
## 2 NA NA NA NA NA NA
## 3 NA NA NA NA NA NA
##zp_biomass table
dbGetQuery(rh_db, "SELECT * FROM zp_biomass LIMIT 3;")## zp_id cove month covemonth site zp_order zp_family
## 1 zp316 wethersfield August wethersfieldAugust 3 Ploima Synchaetidae
## 2 zp015 chapman June chapmanJune 1 Cladocera Chydoridae
## 3 zp035 chapman June chapmanJune 2 Cladocera Chydoridae
## zp_genus zp_indiv_per_m3 zp_microg_per_m3
## 1 Synchaeta 864.5461 17.7512
## 2 Allonella 1037.4553 853.6859
## 3 Allonella 1252.1012 951.6167
##zp_size table
dbGetQuery(rh_db, "SELECT * FROM zp_size LIMIT 3;")## zpsize_id cove month covemonth site zp_type zp_order TL1 TL2 TL3
## 1 zpsize01 chapman July chapmanJuly 1 Keratella Ploima 0.092 0.086 0.082
## 2 zpsize02 chapman July chapmanJuly 2 Keratella Ploima 0.091 0.076 0.076
## 3 zpsize03 chapman July chapmanJuly 3 Keratella Ploima 0.082 0.061 0.096
## TL4 TL5 TL6 TL7 TL8 TL9 TL10 TL11 TL12 TL13 TL14 TL15 TL16
## 1 0.083 0.084 0.088 0.091 0.102 0.091 0.089 0.080 0.071 0.088 0.099 0.125 0.083
## 2 0.082 0.127 0.079 0.092 0.081 0.082 0.115 0.075 0.073 0.086 0.107 0.092 0.074
## 3 0.092 0.082 0.087 0.087 0.087 0.093 0.091 0.083 0.091 0.077 0.110 0.090 0.085
## TL17 TL18 TL19 TL20 TL21 TL22 TL23 TL24 TL25 TL26 TL27 TL28 TL29
## 1 0.115 0.091 0.087 0.082 0.107 0.085 0.075 0.078 0.084 0.096 0.079 0.089 0.090
## 2 0.071 0.084 0.083 0.081 0.080 0.113 0.079 0.082 0.130 0.085 0.083 0.119 0.124
## 3 0.105 0.096 0.082 0.090 0.112 0.107 0.087 0.087 0.081 0.105 0.081 0.077 0.081
## TL30 TL31 TL32 TL33 TL34 TL35 TL36 TL37 TL38 TL39 TL40 TL41 TL42
## 1 0.085 0.087 0.088 0.093 0.102 0.081 0.085 0.083 0.087 0.071 0.077 0.096 0.078
## 2 0.126 0.118 0.087 0.084 0.096 0.094 0.086 0.081 0.083 NA NA NA NA
## 3 0.083 0.082 0.079 0.086 0.091 0.078 0.083 0.090 0.087 0.068 0.077 0.080 0.090
## TL43 TL44 TL45 TL46 TL47 TL48 TL49 TL50 TL51 TL52 TL53 TL54 TL55
## 1 0.078 0.067 0.074 0.080 0.073 0.089 0.089 0.093 0.089 0.087 0.090 0.100 0.115
## 2 NA NA NA NA NA NA NA NA NA NA NA NA NA
## 3 0.076 0.076 0.080 0.078 0.081 0.082 0.078 0.089 0.084 0.084 0.092 0.072 0.083
## TL56 TL57 TL58 TL59 TL60
## 1 0.085 0.09 0.082 0.089 0.093
## 2 NA NA NA NA NA
## 3 0.084 0.09 0.076 0.089 NA