The WHAP 2019 field data CSV on ServCat had significant errors when checked in May 2023. Those errors were corrected during analysis, but the corrected file uses a different format than the current WHAP scripts expect. This notebook documents the process of re-transforming the corrected data back into that format.

# Set up user directories
results_dir <- "results"
code_dir <- "src"
data_dir <- "data"

# Set up file names
corrected_csv_file_name <- "qdt2019c.csv"
servcat_csv_file_name <- "WHAP_2019_Quadrats_20211019.csv"
output_csv_file_name <- "WHAP_2019_Quadrats_20230525.csv"

required_packages <- c(
  "tidyverse", 
  "here"
)

# (optional) Initialize here::here()
here::i_am("src/whap_2019_data_retransform.Rmd")

here() starts at C:/Users/cbdavies/OneDrive - DOI/WHAP/2019 Data Re-transform

# Load our (optional) function to make package loading more graceful
public_download_api <- 
  "https://ecos.fws.gov/ServCatServices/servcat/v4/rest/DownloadFile/"
try(source(paste0(public_download_api, "228091")))

if (exists("load_and_install_packages")) {
  load_and_install_packages(required_packages)
} else {
  message("Loading required packages: ", toString(required_packages))
  loaded_packages <- sapply(required_packages, 
         \(x) library(x, character.only = TRUE, quietly = TRUE)
  )
}

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──

✔ dplyr 1.1.2 ✔ readr 2.1.4

✔ forcats 1.0.0 ✔ stringr 1.5.0

✔ ggplot2 3.4.2 ✔ tibble 3.2.1

✔ lubridate 1.9.2 ✔ tidyr 1.3.0

✔ purrr 1.0.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──

✖ dplyr::filter() masks stats::filter()

✖ dplyr::lag() masks stats::lag()

ℹ Use the conflicted package (http://conflicted.r-lib.org/) to force all conflicts to become errors

# (Optional) function for nicer messages in HTML RMarkdown docs
try(source(paste0(public_download_api, "227695")))
if (exists("make_errors_prettier") && !interactive()) make_errors_prettier()

# Make sure desired directories exist
if (!dir.exists(here(data_dir))) dir.create(here(data_dir))
if (!dir.exists(here(results_dir))) dir.create(here(results_dir))

Load the input CSVs

The corrected 2019 data was received from Emilio Laca via email on 10 May 2023.

The uncorrected 2019 data was pulled from ServCat on 26 April 2023.
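
If the ServCat CSV is not already in the data directory, it could be pulled with the same public download API used above for the helper scripts. This is only a sketch: the reference code below is a placeholder, not the real ServCat code for the quadrat CSV.

# Hypothetical: substitute the actual ServCat file reference code
servcat_csv_code <- "000000"

if (!file.exists(here(data_dir, servcat_csv_file_name))) {
  download.file(
    paste0(public_download_api, servcat_csv_code),
    destfile = here(data_dir, servcat_csv_file_name),
    mode = "wb"
  )
}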

cli::cli_alert_info("Loading {corrected_csv_file_name}")

ℹ Loading qdt2019c.csv

corrected_data <- read_csv(here(data_dir, corrected_csv_file_name))

New names:

• `` -> `...1`

Rows: 543 Columns: 30

── Column specification ────────────────────────────────────────────────────────

Delimiter: ","

chr (13): LIT, Date, Observer, Unit_Name, Subunit_Name, Combined, CirclePlot…

dbl (17): …1, OBJECTID, ITIS_TSN, PlantHeight, n_seed_heads, seed_head1_F2…

ℹ Use spec() to retrieve the full column specification for this data.

ℹ Specify the column types or set show_col_types = FALSE to quiet this message.

cli::cli_alert_info("Loading {servcat_csv_file_name}")

ℹ Loading WHAP_2019_Quadrats_20211019.csv

servcat_data <- read_csv(here(data_dir, servcat_csv_file_name))

Rows: 3662 Columns: 19

── Column specification ────────────────────────────────────────────────────────

Delimiter: ","

chr (12): LIT, unitName, subunitName, recordedBy, eventDate, quadratSize, ma…

dbl (7): GlobalID, decimalLatitude, decimalLongitude, taxonID, plantHeight,…

ℹ Use spec() to retrieve the full column specification for this data.

ℹ Specify the column types or set show_col_types = FALSE to quiet this message.

Get event-level good data from ServCat data

Some of the columns in the ServCat data are already in the form we want; the issues are mostly with the taxonomy and measurement columns.

quadrats_from_servcat <- servcat_data %>% 
  dplyr::select(GlobalID, LIT, unitName, subunitName, recordedBy, eventDate, 
                decimalLatitude, decimalLongitude, managementAction, 
                quadratSize, plantHeight, nSeedHeads, eventRemarks
  ) %>% 
  dplyr::distinct() %>% 
  dplyr::mutate(
    # The latitude and longitude fields are switched, just fixing them
    tempLatitude = decimalLongitude,
    decimalLongitude = decimalLatitude, 
    decimalLatitude = tempLatitude,
    tempLatitude = NULL
    ) 

head(quadrats_from_servcat)
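
As a quick sanity check on the coordinate swap (a sketch, not part of the original workflow; it assumes every plot lies in the northern and western hemispheres, so latitudes should be positive and longitudes negative):

quadrats_from_servcat %>% 
  dplyr::summarise(
    latitudes_positive = all(decimalLatitude > 0, na.rm = TRUE),
    longitudes_negative = all(decimalLongitude < 0, na.rm = TRUE)
  )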

Get seedhead-level good data from corrected data

We only need a few columns from the seedhead data. First we will get the taxonomy and stratum columns.

corrected_taxonomy <- corrected_data %>% 
  dplyr::select(GlobalID = OBJECTID, vernacularName = CommonName,
                taxonID = ITIS_TSN, stratum = Stratum
  ) %>% 
  dplyr::mutate(
    vernacularName = if_else(
      vernacularName == "Timothy",
      "Swamp Timothy",
      vernacularName
    ),
    stratum = if_else(
      stratum == "Med",
      "Medium",
      stratum
    ),
    stratum = stringr::str_to_lower(stratum)
  ) %>% 
  dplyr::distinct()

head(corrected_taxonomy)
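
Because corrected_taxonomy will be inner joined to the quadrat data by GlobalID, it is worth confirming that distinct() left at most one taxonomy row per GlobalID; otherwise the join would duplicate quadrat records. A minimal check (not part of the original workflow) that should return zero rows:

corrected_taxonomy %>% 
  dplyr::count(GlobalID) %>% 
  dplyr::filter(n > 1)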

Transform seedhead-level measure data from corrected data

There are ten seed head measure columns we want to pivot from wide to long format (one column per measurement type for each of up to five seed heads).

seed_head_cols <- names(corrected_data) %>% 
  grep("^seed_head", ., value = TRUE )

pivoted_measures <- corrected_data %>% 
  dplyr::select(GlobalID = OBJECTID, all_of(seed_head_cols)) %>% 
  tidyr::pivot_longer(
    all_of(seed_head_cols),
    names_to = "measurementType",
    values_to = "measurementValue",
    values_drop_na = TRUE
  ) %>% 
  dplyr::mutate(
    # Generate a GlobalID for the seed head measures
    GlobalID_seed = paste(
      GlobalID, 
      stringr::str_extract(
        measurementType,
        # Pull the seed head number (1-5) that follows "seed_head" in the column name
        "(?<=seed_head)[1-5]"
      ),
      sep = "_"
    ),
    # Replace measurement types with the names used in current data
    measurementType = if_else(
      stringr::str_detect(measurementType, "F2TLength"),
      "f2t_length_mm",
      "sh_length_mm"
    )
  ) 

head(pivoted_measures)
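
As a quick spot check on the pivot (again, a sketch rather than part of the original workflow), counting rows per measurement type shows how many values were classified as f2t_length_mm versus sh_length_mm:

pivoted_measures %>% 
  dplyr::count(measurementType)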

Join all the data back together

We now have three datasets to join back together:

quadrats_from_servcat

corrected_taxonomy

pivoted_measures

full_corrected_data <- quadrats_from_servcat %>% 
  dplyr::inner_join(corrected_taxonomy, by = join_by(GlobalID)) %>% 
  dplyr::inner_join(pivoted_measures, by = join_by(GlobalID)) %>%
  # Just for convenience, put the columns in the original ServCat csv order
  dplyr::relocate(all_of(names(servcat_data))) %>% 
  dplyr::arrange(GlobalID, GlobalID_seed)

head(full_corrected_data)

Checks on data completeness

I used GlobalID_seed to find records that appeared in the ServCat data but not the full corrected data, and vice versa.
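
A sketch of that check, assuming the ServCat data also carries a GlobalID_seed column identifying each seed head measurement:

only_in_corrected <- full_corrected_data %>% 
  dplyr::anti_join(servcat_data, by = join_by(GlobalID_seed)) %>% 
  dplyr::pull(GlobalID_seed)

only_in_servcat <- servcat_data %>% 
  dplyr::anti_join(full_corrected_data, by = join_by(GlobalID_seed)) %>% 
  dplyr::pull(GlobalID_seed)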

These records appear in the full corrected data but not the ServCat data. In all cases I checked, this was because measurements were incorrectly labeled as F2TLength instead of regular seed head length (which is not measured in swamp timothy).

1_4, 1_5, 3_4, 3_5, 4_4, 4_5, 5_4, 5_5, 6_4, 6_5, 7_2, 8_4, 8_5, 9_2, 10_4, 10_5, 11_4, 11_5, 12_4, 12_5, 13_4, 13_5, 15_4, 15_5, 16_4, 16_5, 17_2, 18_4, 18_5, 272_1

These records appear in the ServCat data but not the full corrected data. Mostly these are records where the measurements are impossible, uncorrectable, or could not be resolved to a correct unit of measurement (like seed heads that were measured at 0.05 mm long).

88_1, 91_2, 91_3, 91_5, 91_1, 91_4, 100_4, 100_1, 100_2, 100_3, 100_5, 158_4, 158_3, 158_5, 158_2, 158_1, 180_3, 180_4, 180_2, 180_1, 180_5, 221_5, 221_1, 221_2, 221_4, 221_3, 233_3, 233_4, 233_1, 233_5, 233_2, 251_4, 251_3, 251_5, 251_1, 251_2, 252_4, 252_3, 252_5, 252_1, 252_2, 253_4, 253_3, 253_5, 253_1, 253_2, 254_4, 254_3, 254_5, 254_1, 254_2, 255_4, 255_3, 255_5, 255_1, 255_2, 256_4, 256_3, 256_5, 256_1, 256_2, 257_4, 257_3, 257_5, 257_1, 257_2, 258_4, 258_3, 258_5, 258_1, 258_2, 259_4, 259_3, 259_5, 259_1, 259_2, 260_4, 260_3, 260_5, 260_1, 260_2, 261_4, 261_3, 261_5, 261_1, 261_2, 262_4, 262_3, 262_5, 262_1, 262_2, 263_4, 263_3, 263_5, 263_1, 263_2, 345_2, 345_1, 345_5, 345_3, 345_4, 359_1, 359_3, 359_5, 359_2, 359_4, 367_4, 367_5, 367_1, 367_2, 367_3, 376_1, 376_2, 376_4, 376_3, 376_5, 488_5, 488_2, 488_3, 488_4, 488_1, 506_3, 506_1, 506_2, 506_4, 506_5, 511_1, 511_3, 511_4, 511_5, 511_2, 523_1, 530_5, 530_2, 530_1, 530_3, 530_4

write_csv(full_corrected_data, here(results_dir, output_csv_file_name))