The WHAP 2019 field data CSV on ServCat was found to have significant errors when checked in May 2023. Those errors were corrected during analysis, but the corrected file uses a different format than the current WHAP scripts expect. This notebook documents the process of re-transforming the corrected data into the current format.
# User directory layout
data_dir <- "data"
code_dir <- "src"
results_dir <- "results"

# Input and output file names
corrected_csv_file_name <- "qdt2019c.csv"
servcat_csv_file_name <- "WHAP_2019_Quadrats_20211019.csv"
output_csv_file_name <- "WHAP_2019_Quadrats_20230525.csv"

# Packages this notebook depends on
required_packages <- c("tidyverse", "here")

# (optional) Anchor here::here() at the project root
here::i_am("src/whap_2019_data_retransform.Rmd")
here() starts at C:/Users/cbdavies/OneDrive - DOI/WHAP/2019 Data Re-transform
# Load our (optional) function to make package loading more graceful.
# The helper is fetched from ServCat; if the download fails, try() keeps
# the notebook running and we fall back to plain library() calls below.
public_download_api <-
  "https://ecos.fws.gov/ServCatServices/servcat/v4/rest/DownloadFile/"
try(source(paste0(public_download_api, "228091")))
if (exists("load_and_install_packages")) {
  load_and_install_packages(required_packages)
} else {
  message("Loading required packages: ", toString(required_packages))
  # lapply() is type-stable, unlike sapply(), whose return type depends
  # on its input (a known anti-pattern in non-interactive code)
  loaded_packages <- lapply(
    required_packages,
    \(x) library(x, character.only = TRUE, quietly = TRUE)
  )
}
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.2 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.2 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# (Optional) function for nicer messages in HTML RMarkdown docs
try(source(paste0(public_download_api, "227695")))
if (exists("make_errors_prettier") && !interactive()) {
  make_errors_prettier()
}

# Create any required directories that do not exist yet
for (dir_name in c(data_dir, results_dir)) {
  if (!dir.exists(here(dir_name))) {
    dir.create(here(dir_name))
  }
}
The corrected 2019 data was received from Emilio Laca via email on 10 May 2023.
The uncorrected 2019 data was pulled from ServCat on 26 April 2023.
# Announce and load Emilio Laca's corrected 2019 quadrat data
cli::cli_alert_info("Loading {corrected_csv_file_name}")
ℹ Loading qdt2019c.csv
corrected_data <- read_csv(here(data_dir, corrected_csv_file_name))
New names:
• `` -> `...1`
Rows: 543 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): LIT, Date, Observer, Unit_Name, Subunit_Name, Combined, CirclePlot…
dbl (17): …1, OBJECTID, ITIS_TSN, PlantHeight, n_seed_heads, seed_head1_F2…
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Announce and load the original (uncorrected) ServCat export
cli::cli_alert_info("Loading {servcat_csv_file_name}")
ℹ Loading WHAP_2019_Quadrats_20211019.csv
servcat_data <- read_csv(here(data_dir, servcat_csv_file_name))
Rows: 3662 Columns: 19
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (12): LIT, unitName, subunitName, recordedBy, eventDate, quadratSize, ma…
dbl (7): GlobalID, decimalLatitude, decimalLongitude, taxonID, plantHeight,…
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Some of the columns in the ServCat data are already in the form we want — the issues are mostly with the taxon columns and the measurement columns.
quadrats_from_servcat <- servcat_data %>%
  # distinct() with explicit columns both selects those columns (in the
  # order given) and drops duplicate rows, like select() %>% distinct()
  dplyr::distinct(
    GlobalID, LIT, unitName, subunitName, recordedBy, eventDate,
    decimalLatitude, decimalLongitude, managementAction,
    quadratSize, plantHeight, nSeedHeads, eventRemarks
  ) %>%
  dplyr::mutate(
    # The latitude and longitude fields are switched; exchange them via
    # a temporary holder column, then drop the holder
    swap_holder = decimalLongitude,
    decimalLongitude = decimalLatitude,
    decimalLatitude = swap_holder,
    swap_holder = NULL
  )
head(quadrats_from_servcat)
We only need a few columns from the seedhead data. First we will get the taxonomy and stratum columns.
# Taxonomy and stratum columns from the corrected data, normalized to the
# vocabulary used by the current WHAP scripts
corrected_taxonomy <- corrected_data %>%
  dplyr::select(
    GlobalID = OBJECTID,
    vernacularName = CommonName,
    taxonID = ITIS_TSN,
    stratum = Stratum
  ) %>%
  dplyr::mutate(
    # "Timothy" in the corrected file means swamp timothy
    vernacularName = if_else(
      vernacularName == "Timothy", "Swamp Timothy", vernacularName
    )
  ) %>%
  dplyr::mutate(
    # Expand the "Med" abbreviation, then lowercase all stratum values
    stratum = if_else(stratum == "Med", "Medium", stratum),
    stratum = stringr::str_to_lower(stratum)
  ) %>%
  dplyr::distinct()
head(corrected_taxonomy)
There are ten measurement columns we want to pivot and extract data from.
# Identify the ten seed-head measurement columns to pivot
seed_head_cols <- names(corrected_data) %>%
  grep("^seed_head", ., value = TRUE)
pivoted_measures <- corrected_data %>%
  dplyr::select(GlobalID = OBJECTID, all_of(seed_head_cols)) %>%
  tidyr::pivot_longer(
    # Wrap the external vector in all_of(): bare external vectors in
    # tidyselect were deprecated in tidyselect 1.1.0 and emit a warning
    all_of(seed_head_cols),
    names_to = "measurementType",
    values_to = "measurementValue",
    values_drop_na = TRUE
  ) %>%
  dplyr::mutate(
    # Generate a GlobalID for the seed head measures:
    # "<quadrat GlobalID>_<measure number>", where the measure number is
    # the first digit 1-5 in the column name past position 0
    GlobalID_seed = paste(
      GlobalID,
      stringr::str_extract(
        measurementType,
        "(?!^seed_head)[1-5]"
      ),
      sep = "_"
    ),
    # Replace measurement types with the names used in current data
    measurementType = if_else(
      stringr::str_detect(measurementType, "F2TLength"),
      "f2t_length_mm",
      "sh_length_mm"
    )
  )
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(seed_head_cols)
  # Now:
  data %>% select(all_of(seed_head_cols))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# Preview the reshaped seed-head measurements
head(pivoted_measures)
We now have three datasets to stick together:
quadrats_from_servcat
corrected_taxonomy
pivoted_measures
# Join the three pieces back into one table keyed on GlobalID
full_corrected_data <- quadrats_from_servcat %>%
  dplyr::inner_join(corrected_taxonomy, by = join_by(GlobalID)) %>%
  dplyr::inner_join(pivoted_measures, by = join_by(GlobalID)) %>%
  # Just for convenience, put the columns in the original ServCat csv
  # order; all_of() avoids the tidyselect deprecation warning for bare
  # external vectors (deprecated in tidyselect 1.1.0)
  dplyr::relocate(all_of(names(servcat_data))) %>%
  dplyr::arrange(GlobalID, GlobalID_seed)
head(full_corrected_data)
I used GlobalID_seed to find records that appeared in the ServCat data but not the full corrected data, and vice versa.
These records appear in the full corrected data but not the ServCat data. In all cases I checked, it was because measurements were incorrectly labeled as F2TLength instead of regular seed head length (which is not measured in swamp timothy).
1_4, 1_5, 3_4, 3_5, 4_4, 4_5, 5_4, 5_5, 6_4, 6_5, 7_2, 8_4, 8_5, 9_2, 10_4, 10_5, 11_4, 11_5, 12_4, 12_5, 13_4, 13_5, 15_4, 15_5, 16_4, 16_5, 17_2, 18_4, 18_5, 272_1
These records appear in the ServCat data but not the full corrected data. Mostly these are records where the measurements are impossible, uncorrectable, or could not be resolved to a correct unit of measurement (like seed heads that were measured at 0.05 mm long).
88_1, 91_2, 91_3, 91_5, 91_1, 91_4, 100_4, 100_1, 100_2, 100_3, 100_5, 158_4, 158_3, 158_5, 158_2, 158_1, 180_3, 180_4, 180_2, 180_1, 180_5, 221_5, 221_1, 221_2, 221_4, 221_3, 233_3, 233_4, 233_1, 233_5, 233_2, 251_4, 251_3, 251_5, 251_1, 251_2, 252_4, 252_3, 252_5, 252_1, 252_2, 253_4, 253_3, 253_5, 253_1, 253_2, 254_4, 254_3, 254_5, 254_1, 254_2, 255_4, 255_3, 255_5, 255_1, 255_2, 256_4, 256_3, 256_5, 256_1, 256_2, 257_4, 257_3, 257_5, 257_1, 257_2, 258_4, 258_3, 258_5, 258_1, 258_2, 259_4, 259_3, 259_5, 259_1, 259_2, 260_4, 260_3, 260_5, 260_1, 260_2, 261_4, 261_3, 261_5, 261_1, 261_2, 262_4, 262_3, 262_5, 262_1, 262_2, 263_4, 263_3, 263_5, 263_1, 263_2, 345_2, 345_1, 345_5, 345_3, 345_4, 359_1, 359_3, 359_5, 359_2, 359_4, 367_4, 367_5, 367_1, 367_2, 367_3, 376_1, 376_2, 376_4, 376_3, 376_5, 488_5, 488_2, 488_3, 488_4, 488_1, 506_3, 506_1, 506_2, 506_4, 506_5, 511_1, 511_3, 511_4, 511_5, 511_2, 523_1, 530_5, 530_2, 530_1, 530_3, 530_4
# Write the re-transformed 2019 data in the current WHAP csv format
write_csv(full_corrected_data, here(results_dir, output_csv_file_name))