TITLE: Prototype taxonomic name checking function for SEOSAW
DATE: 2025-07-09
AUTHOR: John L. Godlee
====================================================================
We are hoping to move the SEOSAW database to use World Flora Online
(WFO) as its taxonomic backbone, rather than the African Plant
Database (APD) which we have been using since 2019.
[SEOSAW]:
https://seosaw.github.io/
[World Flora Online]:
https://www.worldfloraonline.org/
[African Plant Database]:
https://africanplantdatabase.ch/
The WorldFlora R package, developed by Roeland Kindt, provides code
for querying a downloaded copy of the WFO database and providing
taxonomic information. The key features we want for SEOSAW are:
check the validity of taxonomic names in tree inventory data, fuzzy
find accepted names to catch spelling errors, and provide accepted
names for synonyms.
[WorldFlora R package]:
https://cran.r-project.org/web//packages//WorldFlora/index.html
I've written a wrapper function around the code in the WorldFlora R
package. Some features:
- Optionally submit a lookup table to replace unmatched names.
- Consolidate consecutive whitespaces in the WFO database to a
single space, which can lead to poor matching.
- Use WorldFlora::WFO.prepare() with a default set of replacements
to fix common orthographic errors.
- Use WorldFlora::WFO.one() to find the best match where multiple
names are fuzzy matched by WorldFlora::WFO.match().
- Extract subspecies and variety epithets from matched names.
- Optionally return unmatched or multiply-matched names.
#' Replace taxonomic names using lookup tables
#'
#' @param x vector of species names
#' @param lookup a single dataframe or a list of dataframes
containing lookup
#' tables. The first column should contain names in `x` to
be changed. The
#' second column should contain the new names.
#'
#' @return Vector of corrected species names
#'
#' @details Lookup tables are run in order through the list of
lookup tables, meaning
#' names may change incrementally multiple times.
#'
#' @export
#'
synonymyFix <- function(x, lookup) {
# Make list if not already
if (!inherits(lookup, "list")) {
lookup <- list(lookup)
}
# Combine lookup tables into a single dataframe
lookup_combi <- as.data.frame(fastRbind(lookup))
# Check no NAs
if (any(is.na(lookup_combi))) {
stop("Lookup table cannot contain NA entries")
}
# Do substitution
out <- lookup_combi[,2][match(x, lookup_combi[,1])]
out[is.na(out)] <- x[is.na(out)]
return(out)
}
#' Return default pattern substitution for `taxonCheck()`
#'
#' @return vector of regex patterns for use with `taxonCheck()`
in argument
#' `sub.pattern`
#'
#' @export
#'
WFO.prepare_default <- function() {
c(
" indet$",
" sp[.]",
" spp[.]",
" ssp[.]",
" pl[.]",
" indet[.]",
" ind[.]",
" gen[.]",
" g[.]",
" fam[.]",
" nov[.]",
" prox[.]",
" cf[.]",
" aff[.]",
" s[.]s[.]",
" s[.]l[.]",
" p[.]p[.]",
" p[.] p[.]",
"[?]",
" inc[.]",
" stet[.]",
"nom[.] cons[.]",
"nom[.] dub[.]",
" nom[.] err[.]",
" nom[.] illeg[.]",
" nom[.] inval[.]",
" nom[.] nov[.]",
" nom[.] nud[.]",
" nom[.] obl[.]",
" nom[.] prot[.]",
" nom[.] rej[.]",
" nom[.] supp[.]",
" sensu auct[.]"
)
}
#' Correct and match taxonomic names to the World Flora
Taxonomic Backbone
#'
#' @param x vector of taxonomic names
#' @param WFO.file optional file name of static copy of World
Flora Online
#' Taxonomic Backbone. If not NULL, data will be reloaded
from this file
#' @param WFO.data optional dataset with static copy of World
Flora Online
#' Taxonomic backbone. Ignored if `WFO.file` is not NULL
#' @param lookup optional a single dataframe or a list of
dataframes containing
#' lookup tables. The first column should contain names in
`x` to be
#' changed. The second column should contain the new names.
#' @param ret_wfo logical, if TRUE the function stops after
#' `WorldFlora::WFO.match()` and returns the raw output
from this function.
#' @param ret_unk logical, if TRUE taxa not matched in the World
#' Flora Online are returned to the user as a vector
containing
#' the unmatched values. If FALSE these taxa are returned
as NA.
#' @param ret_multi logical, if TRUE taxa matching multiple
records in the
#' World Flora Online are returned to the user as a list
with one element
#' for each original name containing the unmatched values.
If FALSE the
#' "best" name is selected by `WorldFlora::WFO.one()`
#' @param sub.pattern vector with regular expressions defining
sections of `x`
#' to be removed during correction of common orthographic
errors by
#' `WorldFlora::WFO.prepare()`
#' @param fuzzy If larger than 0, then attempt fuzzy matching.
See `WorldFlora::WFO.match()`
#' @param ... Additional arguments passed to
`WorldFlora::WFO.match()`
#'
#' @return Dataframe with cleaned taxonomic names and metadata
#'
#' @details
#' Taxonomic names are matched against the World Flora Online
database using
#' `WorldFlora::WFO.match()`.
#'
#' The search algorithm is as follows:
#' \enumerate{
#' \item{Optionally replace names with `lookup`}
#' \item{Correct common orthographic errors with
`WorldFlora::WFO.prepare()`}
#' \item{Query `WorldFlora::WFO.match()` for accepted
#' name and taxonomic rank information}
#' \item{Optionally return multiple matches or
unsuccessful matches}
#' \item{Consolidate multiple matches with
`WorldFlora::WFO.one()`}
#' \item{Return formatted dataframe}
#' }
#'
#' Names that cannot be matched should be replaced with "Indet
indet" in
#' `lookup`. These are replaced with NA_character_ before
`WorldFlora::WFO.match()`
#'
#' @importFrom data.table fread data.table
#' @importFrom WorldFlora WFO.prepare WFO.match WFO.one
#'
#' @export
#'
taxonCheck <- function(x, WFO.file = NULL, WFO.data = NULL,
lookup = NULL, ret_wfo = FALSE, ret_unk = FALSE, ret_multi =
FALSE,
sub.pattern = WFO.prepare_default(), fuzzy = 0.1, ...) {
# Check WFO data is available
if (is.null(WFO.data) & is.null(WFO.file)) {
stop("Either WFO.data or WFO.file must be provided")
}
if (is.null(WFO.data)) {
message(paste("Reading WFO data"))
if (!file.exists(WFO.file)) {
stop("If WFO.data is NULL, a valid WFO.file must be
provided. See WorldFlora::WFO.download()")
}
WFO.data <- data.table::fread(WFO.file, encoding = "UTF-8")
} else {
WFO.data <- data.table::data.table(WFO.data)
}
WFO.data$scientificName <- gsub("\\s+", " ",
WFO.data$scientificName)
# Get unique taxonomic names
xu <- unique(x)
# Substitute names with lookup table
if (!is.null(lookup)) {
message("Substituting names with `lookup`")
xf <- synonymyFix(xu, lookup = lookup)
} else {
xf <- xu
}
# Prepare taxonomic names for WFO query
xs <- WorldFlora::WFO.prepare(xf, sub.pattern =
sub.pattern)$spec.name
# Replace Indet genera with ""
xi <- xs
xi[xi == "Indet"] <- ""
# Run WFO matching
message("Querying World Flora Online")
wfo <- WorldFlora::WFO.match(unique(xi),
WFO.data = WFO.data, Fuzzy = fuzzy, ...)
# Optionally return raw WFO output
if (ret_wfo) {
# Add original names
wfo_all <- dplyr::bind_rows(lapply(seq_along(xu),
function(i) {
orig <- xu[i]
cbind("taxon_name_orig" = orig, wfo[wfo$spec.name.ORIG ==
xi[i],])
}))
# Check all original names are matched back
stopifnot(all(!is.na(wfo_all$taxon_name_orig)))
return(wfo_all)
}
# Consolidate to single best name per taxon
wfo_one <- WorldFlora::WFO.one(wfo, verbose = FALSE)
# Add original names
wfo_one_all <- dplyr::bind_rows(lapply(seq_along(xu),
function(i) {
orig <- xu[i]
cbind("taxon_name_orig" = orig,
wfo_one[wfo_one$spec.name.ORIG == xi[i],])
}))
# Check all original names are matched back
stopifnot(all(!is.na(wfo_one_all$taxon_name_orig)))
wfo_sel <- wfo_one_all[,c(
"taxon_name_orig",
"spec.name.ORIG", # taxon_name_sanit
"Old.name", # taxon_name_syn
"Old.ID", # taxon_wfo_syn
"scientificName", # taxon_name_acc
"taxonID", # taxon_wfo_acc
"scientificNameAuthorship", # taxon_auth_acc
"taxonRank", # taxon_rank_acc
"parentNameUsageID", # taxon_wfo_parent
"specificEpithet", # taxon_epithet_acc
"genus", # taxon_genus_acc
"family" # taxon_family_acc
)]
wfo_sel <- unique(wfo_sel)
# All submitted names should be included in WFO output
stopifnot(all(sort(unique(wfo_sel$spec.name.ORIG)) ==
sort(unique(xi))))
# Consolidate genus and species
wfo_sel$species <- trimws(paste(wfo_sel$genus,
wfo_sel$specificEpithet))
wfo_sel$species <- ifelse(!wfo_sel$taxonRank %in%
c("species", "subspecies", "variety", "subvariety",
"form", "subform", "prole", "unranked"),
NA_character_, wfo_sel$species)
# Extract subsp. and var. epithets from accepted names
wfo_sel$taxon_subspecies_acc <- gsub(".*subsp\\.\\s", "",
wfo_sel$scientificName)
wfo_sel$taxon_subspecies_acc[!grepl("\\ssubsp\\.\\s",
wfo_sel$scientificName)] <- NA_character_
wfo_sel$taxon_variety_acc <- gsub(".*var\\.\\s", "",
wfo_sel$scientificName)
wfo_sel$taxon_variety_acc[!grepl("\\svar\\.\\s",
wfo_sel$scientificName)] <- NA_character_
# Fill wfo ID of synonyms
wfo_sel$Old.ID <- ifelse(wfo_sel$Old.ID == "",
wfo_sel$taxonID, wfo_sel$Old.ID)
wfo_sel$Old.name <- ifelse(wfo_sel$Old.name == "",
wfo_sel$scientificName, wfo_sel$Old.name)
# Add date of processing
wfo_sel$taxon_wfo_date <- Sys.Date()
# Create output dataframe
out <- wfo_sel[,c(
"taxon_name_orig",
"spec.name.ORIG", # taxon_name_sanit
"Old.name", # taxon_name_syn
"Old.ID", # taxon_wfo_syn
"scientificName", # taxon_name_acc
"taxonID", # taxon_wfo_acc
"scientificNameAuthorship", # taxon_auth_acc
"taxonRank", # taxon_rank_acc
"parentNameUsageID", # taxon_wfo_parent
"taxon_variety_acc",
"taxon_subspecies_acc",
"specificEpithet", # taxon_epithet_acc
"species", # taxon_species_acc
"genus", # taxon_genus_acc
"family", # taxon_family_acc
"taxon_wfo_date")]
names(out) <- c(
"taxon_name_orig",
"taxon_name_sanit",
"taxon_name_syn",
"taxon_wfo_syn",
"taxon_name_acc",
"taxon_wfo_acc",
"taxon_auth_acc",
"taxon_rank_acc",
"taxon_wfo_parent",
"taxon_variety_acc",
"taxon_subspecies_acc",
"taxon_epithet_acc",
"taxon_species_acc",
"taxon_genus_acc",
"taxon_family_acc",
"taxon_wfo_date")
# Optionally return unmatched names
if (ret_unk & any(out$taxon_name_sanit != out$taxon_name_syn,
na.rm = TRUE)) {
unmatched <- out$taxon_name_orig[
(out$taxon_name_sanit != out$taxon_name_syn) |
is.na(out$taxon_name_syn) | is.na(out$taxon_name_sanit)]
warning("Some taxonomic names not matched by WFO.match,
returning original names")
return(unmatched)
}
# Optionally return names with multiple matches
if (ret_multi & any(duplicated(wfo$taxon_name_orig))) {
multis <-
wfo$taxon_name_orig[duplicated(wfo$taxon_name_orig)]
multis_df <- wfo[wfo$taxon_name_orig %in% multis,]
multis_list <- split(multis_df, multis_df$taxon_name_orig)
warning("Some taxonomic names matched to multiple names by
WFO.match, returning options")
return(multis_list)
}
# Change "" to NA in all columns
out[] <- lapply(out, function(x) {
if (is.character(x)) {
x[x == ""] <- NA_character_
}
x
})
# All original names should be filled
stopifnot(all(!is.na(out$species[out$species_sanit != "Indet
indet"])))
# Return
return(out)
}