TITLE: Prototype taxonomic name checking function for SEOSAW
DATE: 2025-07-09
AUTHOR: John L. Godlee
====================================================================


We are hoping to move the SEOSAW database to use World Flora Online
(WFO) as its taxonomic backbone, rather than the African Plant
Database (APD) which we have been using since 2019.

 [SEOSAW]: https://seosaw.github.io/
 [World Flora Online]: https://www.worldfloraonline.org/
 [African Plant Database]: https://africanplantdatabase.ch/

The WorldFlora R package, developed by Roeland Kindt, provides code
for querying a downloaded copy of the WFO database and providing
taxonomic information. The key features we want for SEOSAW are:
check the validity of taxonomic names in tree inventory data, fuzzy
find accepted names to catch spelling errors, and provide accepted
names for synonyms.

 [WorldFlora R package]:
https://cran.r-project.org/web//packages//WorldFlora/index.html

I've written a wrapper function around the code in the WorldFlora R
package. Some features:

- Optionally submit a lookup table to replace unmatched names.
- Consolidate consecutive whitespaces in the WFO database to a
single space, which can lead to poor matching.
- Use WorldFlora::WFO.prepare() with a default set of replacements
to fix common orthographic errors.
- Use WorldFlora::WFO.one() to find the best match where multiple
names are fuzzy matched by WorldFlora::WFO.match().
- Extract subspecies and variety epithets from matched names.
- Optionally return unmatched or multiply-matched names.

   #' Replace taxonomic names using lookup tables
   #'
   #' @param x vector of species names
   #' @param lookup a single dataframe or a list of dataframes
containing lookup
   #'     tables. The first column should contain names in `x` to
be changed. The
   #'     second column should contain the new names.
   #'
   #' @return Vector of corrected species names
   #'
   #' @details Lookup tables are run in order through the list of
lookup tables, meaning
   #' names may change incrementally multiple times.
   #'
   #' @export
   #'
   synonymyFix <- function(x, lookup) {

     # Make list if not already
     if (!inherits(lookup, "list")) {
       lookup <- list(lookup)
     }

     # Combine lookup tables into a single dataframe
     lookup_combi <- as.data.frame(fastRbind(lookup))

     # Check no NAs
     if (any(is.na(lookup_combi))) {
       stop("Lookup table cannot contain NA entries")
     }

     # Do substitution
     out <- lookup_combi[,2][match(x, lookup_combi[,1])]
     out[is.na(out)] <- x[is.na(out)]

     return(out)
   }

   #' Return default pattern substitution for `taxonCheck()`
   #'
   #' @return vector of regex patterns for use with `taxonCheck()`
in argument
   #'     `sub.pattern`
   #'
   #' @export
   #'
   WFO.prepare_default <- function() {
     c(
       " indet$",
       " sp[.]",
       " spp[.]",
       " ssp[.]",
       " pl[.]",
       " indet[.]",
       " ind[.]",
       " gen[.]",
       " g[.]",
       " fam[.]",
       " nov[.]",
       " prox[.]",
       " cf[.]",
       " aff[.]",
       " s[.]s[.]",
       " s[.]l[.]",
       " p[.]p[.]",
       " p[.] p[.]",
       "[?]",
       " inc[.]",
       " stet[.]",
       "nom[.] cons[.]",
       "nom[.] dub[.]",
       " nom[.] err[.]",
       " nom[.] illeg[.]",
       " nom[.] inval[.]",
       " nom[.] nov[.]",
       " nom[.] nud[.]",
       " nom[.] obl[.]",
       " nom[.] prot[.]",
       " nom[.] rej[.]",
       " nom[.] supp[.]",
       " sensu auct[.]"
     )
   }

   #' Correct and match taxonomic names to the World Flora
Taxonomic Backbone
   #'
   #' @param x vector of taxonomic names
   #' @param WFO.file optional file name of static copy of World
Flora Online
   #'     Taxonomic Backbone. If not NULL, data will be reloaded
from this file
   #' @param WFO.data optional dataset with static copy of World
Flora Online
   #'     Taxonomic backbone. Ignored if `WFO.file` is not NULL
   #' @param lookup optional a single dataframe or a list of
dataframes containing
   #'     lookup tables. The first column should contain names in
`x` to be
   #'     changed. The second column should contain the new names.
   #' @param ret_wfo logical, if TRUE the function stops after
   #'     `WorldFlora::WFO.match()` and returns the raw output
from this function.
   #' @param ret_unk logical, if TRUE taxa not matched in the World
   #'     Flora Online are returned to the user as a vector
containing
   #'     the unmatched values. If FALSE these taxa are returned
as NA.
   #' @param ret_multi logical, if TRUE taxa matching multiple
records in the
   #'     World Flora Online are returned to the user as a list
with one element
   #'     for each original name containing the unmatched values.
If FALSE the
   #'     "best" name is selected by `WorldFlora::WFO.one()`
   #' @param sub.pattern vector with regular expressions defining
sections of `x`
   #'     to be removed during correction of common orthographic
errors by
   #'     `WorldFlora::WFO.prepare()`
   #' @param fuzzy If larger than 0, then attempt fuzzy matching.
See `WorldFlora::WFO.match()`
   #' @param ... Additional arguments passed to
`WorldFlora::WFO.match()`
   #'
   #' @return Dataframe with cleaned taxonomic names and metadata
   #'
   #' @details
   #' Taxonomic names are matched against the World Flora Online
database using
   #' `WorldFlora::WFO.match()`.
   #'
   #' The search algorithm is as follows:
   #'     \enumerate{
   #'       \item{Optionally replace names with `lookup`}
   #'       \item{Correct common orthographic errors with
`WorldFlora::WFO.prepare()`}
   #'       \item{Query `WorldFlora::WFO.match()` for accepted
   #'             name and taxonomic rank information}
   #'       \item{Optionally return multiple matches or
unsuccessful matches}
   #'       \item{Consolidate multiple matches with
`WorldFlora::WFO.one()`}
   #'       \item{Return formatted dataframe}
   #'     }
   #'
   #' Names that cannot be matched should be replaced with "Indet
indet" in
   #' `lookup`. These are replaced with NA_character_ before
`WorldFlora::WFO.match()`
   #'
   #' @importFrom data.table fread data.table
   #' @importFrom WorldFlora WFO.prepare WFO.match WFO.one
   #'
   #' @export
   #'
   taxonCheck <- function(x, WFO.file = NULL, WFO.data = NULL,
      lookup = NULL, ret_wfo = FALSE, ret_unk = FALSE, ret_multi =
FALSE,
      sub.pattern = WFO.prepare_default(), fuzzy = 0.1, ...) {

     # Check WFO data is available
     if (is.null(WFO.data) & is.null(WFO.file)) {
       stop("Either WFO.data or WFO.file must be provided")
     }

     if (is.null(WFO.data)) {
       message(paste("Reading WFO data"))
       if (!file.exists(WFO.file)) {
         stop("If WFO.data is NULL, a valid WFO.file must be
provided. See WorldFlora::WFO.download()")
       }
       WFO.data <- data.table::fread(WFO.file, encoding = "UTF-8")
     } else {
       WFO.data <- data.table::data.table(WFO.data)
     }

     WFO.data$scientificName <- gsub("\\s+", " ",
WFO.data$scientificName)

     # Get unique taxonomic names
     xu <- unique(x)

     # Substitute names with lookup table
     if (!is.null(lookup)) {
       message("Substituting names with `lookup`")
       xf <- synonymyFix(xu, lookup = lookup)
     } else {
       xf <- xu
     }

     # Prepare taxonomic names for WFO query
     xs <- WorldFlora::WFO.prepare(xf, sub.pattern =
sub.pattern)$spec.name

     # Replace Indet genera with ""
     xi <- xs
     xi[xi == "Indet"] <- ""

     # Run WFO matching
     message("Querying World Flora Online")
     wfo <- WorldFlora::WFO.match(unique(xi),
       WFO.data = WFO.data, Fuzzy = fuzzy, ...)


     # Optionally return raw WFO output
     if (ret_wfo) {
       # Add original names
       wfo_all <- dplyr::bind_rows(lapply(seq_along(xu),
function(i) {
         orig <- xu[i]
         cbind("taxon_name_orig" = orig, wfo[wfo$spec.name.ORIG ==
xi[i],])
       }))

       # Check all original names are matched back
       stopifnot(all(!is.na(wfo_all$taxon_name_orig)))

       return(wfo_all)
     }

     # Consolidate to single best name per taxon
     wfo_one <- WorldFlora::WFO.one(wfo, verbose = FALSE)

     # Add original names
     wfo_one_all <- dplyr::bind_rows(lapply(seq_along(xu),
function(i) {
       orig <- xu[i]
       cbind("taxon_name_orig" = orig,
wfo_one[wfo_one$spec.name.ORIG == xi[i],])
     }))

     # Check all original names are matched back
     stopifnot(all(!is.na(wfo_one_all$taxon_name_orig)))

     wfo_sel <- wfo_one_all[,c(
       "taxon_name_orig",
       "spec.name.ORIG",  # taxon_name_sanit
       "Old.name",  # taxon_name_syn
       "Old.ID",  # taxon_wfo_syn
       "scientificName",  # taxon_name_acc
       "taxonID",  # taxon_wfo_acc
       "scientificNameAuthorship",  # taxon_auth_acc
       "taxonRank",  # taxon_rank_acc
       "parentNameUsageID",  # taxon_wfo_parent
       "specificEpithet",  # taxon_epithet_acc
       "genus",  # taxon_genus_acc
       "family"  # taxon_family_acc
     )]
     wfo_sel <- unique(wfo_sel)

     # All submitted names should be included in WFO output
     stopifnot(all(sort(unique(wfo_sel$spec.name.ORIG)) ==
sort(unique(xi))))

     # Consolidate genus and species
     wfo_sel$species <- trimws(paste(wfo_sel$genus,
wfo_sel$specificEpithet))

     wfo_sel$species <- ifelse(!wfo_sel$taxonRank %in%
         c("species", "subspecies", "variety", "subvariety",
             "form", "subform", "prole", "unranked"),
       NA_character_, wfo_sel$species)

     # Extract subsp. and var. epithets from accepted names
     wfo_sel$taxon_subspecies_acc <- gsub(".*subsp\\.\\s", "",
wfo_sel$scientificName)
     wfo_sel$taxon_subspecies_acc[!grepl("\\ssubsp\\.\\s",
wfo_sel$scientificName)] <- NA_character_

     wfo_sel$taxon_variety_acc <- gsub(".*var\\.\\s", "",
wfo_sel$scientificName)
     wfo_sel$taxon_variety_acc[!grepl("\\svar\\.\\s",
wfo_sel$scientificName)] <- NA_character_

     # Fill wfo ID of synonyms
     wfo_sel$Old.ID <- ifelse(wfo_sel$Old.ID == "",
       wfo_sel$taxonID, wfo_sel$Old.ID)

     wfo_sel$Old.name <- ifelse(wfo_sel$Old.name == "",
       wfo_sel$scientificName, wfo_sel$Old.name)

     # Add date of processing
     wfo_sel$taxon_wfo_date <- Sys.Date()

     # Create output dataframe
     out <- wfo_sel[,c(
       "taxon_name_orig",
       "spec.name.ORIG",  # taxon_name_sanit
       "Old.name",  # taxon_name_syn
       "Old.ID",  # taxon_wfo_syn
       "scientificName",  # taxon_name_acc
       "taxonID",  # taxon_wfo_acc
       "scientificNameAuthorship",  # taxon_auth_acc
       "taxonRank",  # taxon_rank_acc
       "parentNameUsageID",  # taxon_wfo_parent
       "taxon_variety_acc",
       "taxon_subspecies_acc",
       "specificEpithet",  # taxon_epithet_acc
       "species",  # taxon_species_acc
       "genus",  # taxon_genus_acc
       "family",  # taxon_family_acc
       "taxon_wfo_date")]

     names(out) <- c(
       "taxon_name_orig",
       "taxon_name_sanit",
       "taxon_name_syn",
       "taxon_wfo_syn",
       "taxon_name_acc",
       "taxon_wfo_acc",
       "taxon_auth_acc",
       "taxon_rank_acc",
       "taxon_wfo_parent",
       "taxon_variety_acc",
       "taxon_subspecies_acc",
       "taxon_epithet_acc",
       "taxon_species_acc",
       "taxon_genus_acc",
       "taxon_family_acc",
       "taxon_wfo_date")

     # Optionally return unmatched names
     if (ret_unk & any(out$taxon_name_sanit != out$taxon_name_syn,
na.rm = TRUE)) {
       unmatched <- out$taxon_name_orig[
         (out$taxon_name_sanit != out$taxon_name_syn) |
           is.na(out$taxon_name_syn) | is.na(out$taxon_name_sanit)]
       warning("Some taxonomic names not matched by WFO.match,
returning original names")
       return(unmatched)
     }

     # Optionally return names with multiple matches
     if (ret_multi & any(duplicated(wfo$taxon_name_orig))) {
       multis <-
wfo$taxon_name_orig[duplicated(wfo$taxon_name_orig)]
       multis_df <- wfo[wfo$taxon_name_orig %in% multis,]
       multis_list <- split(multis_df, multis_df$taxon_name_orig)
       warning("Some taxonomic names matched to multiple names by
WFO.match, returning options")
       return(multis_list)
     }

     # Change "" to NA in all columns
     out[] <- lapply(out, function(x) {
       if (is.character(x)) {
         x[x == ""] <- NA_character_
       }
       x
     })

     # All original names should be filled
     stopifnot(all(!is.na(out$species[out$species_sanit != "Indet
indet"])))

     # Return
     return(out)
   }