TITLE: Scraping museum catalogues
DATE: 2021-07-05
AUTHOR: John L. Godlee
====================================================================


My partner is visiting some museums and art galleries in the
eastern United States in the autumn, to look at Maya, Aztec and
Mixtec artefacts that relate to slavery, captivity and forced
labour. To find artefacts, she was looking through the online
catalogues of each institution, and at the same time wanted to
record metadata about the objects to refer back to later.
Unfortunately, harvesting the metadata was taking a long time, with
all the copying and pasting and manually saving images. I tried to
help by writing a few scripts to scrape the online object records
and save the metadata in a consistent, organised format.

Some of the institutions provide decent APIs for getting artefact
data, but others only provide web pages, so I had to use a mixture
of methods to scrape the information. The Met's public API,
sketched just after the list below, is a good example of the
former.

The institutions I scraped were:

-   Dumbarton Oaks
-   Museum of Fine Arts Boston
-   Nasher Museum of Art at Duke University
-   The Metropolitan Museum of Art New York
-   Yale Peabody Museum of Natural History
-   Penn Museum
-   Princeton University Art Museum
-   Smithsonian National Museum of Natural History
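
The Met is a good example of the API route: its public collection
API returns a single JSON record per object ID and needs no key.
Below is a minimal sketch of that kind of request, using the same
{httr} and {jsonlite} packages that come up again further down;
the field names and placeholder object ID are illustrative rather
than copied from the script I actually ran.

   library(httr)
   library(jsonlite)

   # NULL-safe helper, in case a field is missing from the JSON
   ifnull <- function(x) if (is.null(x)) NA else x

   # One GET per object ID; no API key needed
   met_base <- "https://collectionapi.metmuseum.org/public/collection/v1/objects/"
   obj_id <- 1  # placeholder, real IDs come from the links file

   resp <- GET(paste0(met_base, obj_id))
   rec <- fromJSON(content(resp, as = "text", encoding = "UTF-8"),
     simplifyVector = FALSE)

   # Pull out a handful of fields as a single-row dataframe
   data.frame(
     objectid = ifnull(rec$objectID),
     accession = ifnull(rec$accessionNumber),
     title = ifnull(rec$title),
     medium = ifnull(rec$medium),
     image_url = ifnull(rec$primaryImage))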

For each of the institutions I was given a .txt file of links. I
used R to scrape the information, as that's what I know best. For
the institutions that don't have APIs, i.e. Dumbarton Oaks, Museum
of Fine Arts Boston, Nasher, Yale Peabody, and Penn Museum, I used
{rvest} to parse the HTML pages. For example, for Nasher:

   # Packages
   library(rvest)
   library(dplyr)

   # List record URLs
   urls <- readLines("links.txt")

   # Download pages
   lapply(urls, function(x) {
     download.file(x, destfile = file.path("html",
       gsub("/.*", "",
         gsub("https://emuseum.nasher.duke.edu/objects/", "", x))))
   })

   # List html files
   html_files <- list.files("html", full.names = TRUE)

   # For each file
   out_list <- lapply(html_files, function(x) {
     x <- read_html(x)

     # Get object title
     obj_title <- x %>%
       html_nodes("div.titleField") %>%
       html_nodes("h1") %>%
       html_text()

     # Get object metadata
     obj_labels <- x %>%
       html_nodes("span.detailFieldLabel") %>%
       html_text() %>%
       gsub(":.*", "", .)

     obj_values <- x %>%
       html_nodes("span.detailFieldValue") %>%
       html_text()

     # Create dataframe
     out <- as.data.frame(t(data.frame(obj_values)))
     names(out) <- obj_labels

     # Extract image IDs
     main_img_id <- x %>%
       html_nodes("div.emuseum-img-wrap") %>%
       html_nodes("img") %>%
       html_attr("src") %>%
       gsub("/internal/media/dispatcher/", "", .) %>%
       gsub("/.*", "", .) %>%
       unique()

     sec_img_id <- x %>%
       html_nodes("div.secondarymedia-item") %>%
       html_nodes("a") %>%
       html_attr("data-media-id") %>%
       unique()

     img_id <- unique(c(main_img_id, sec_img_id))

     # Construct image URLs
     img_url <- paste0(
       "https://emuseum.nasher.duke.edu/internal/media/dispatcher/",
       img_id,
       "/resize%3Aformat%3Dfull")

     # Create filenames
     img_filenames <- paste0(out$`Object number`, "_", img_id, ".jpg")

     # Download images
     if (length(img_url[!is.na(img_url)]) > 1) {
       download.file(img_url, destfile = file.path("img", img_filenames),
         method = "libcurl")
     } else if (length(img_url[!is.na(img_url)]) == 1) {
       download.file(img_url, destfile = file.path("img", img_filenames))
     }

     return(out)
   })

   # Write metadata to csv
   out <- do.call(bind_rows, out_list)

   write.csv(out, "all.csv", row.names = FALSE)

I think Princeton probably had the nicest and simplest API to use,
while the Smithsonian had the most difficult. To be fair, the
complexity of the Smithsonian API probably comes from the fact
that it serves lots of institutions and a very diverse range of
records through the same interface. A rough sketch of a
Smithsonian query is at the end of this post.

To query the API I used {httr}, and to parse the JSON returned by
the APIs I used {jsonlite}. Using the Princeton API as an example:

   library(httr)
   library(jsonlite)
   library(dplyr)

   base <- "https://data.artmuseum.princeton.edu/objects/"

   # Import links
   links <- readLines("links.txt")

   # Get IDs
   ids <- gsub(".*/", "", links)

   # For each ID, get record
   out_list <- lapply(ids, function(x) {
     message(x)
     # Get record
     resp <- GET(paste0(base, x))

     # Parse JSON
     resp_parsed <- content(resp, as = "parsed")

     # Save JSON
     write(content(resp, as = "text"), file.path("json", paste0(x, ".json")))

     ifnull <- function(x) {
       if (is.null(x)) {
         return("NA")
       } else {
         return(x)
       }
     }

     # Extract description
     desc_df <- data.frame(
       displayperiod = ifnull(resp_parsed$displayperiod),
       displayculture = ifnull(resp_parsed$displayculture),
       classification = ifnull(resp_parsed$classification),
       daterange = ifnull(resp_parsed$daterange),
       description = ifnull(paste(lapply(resp_parsed$texts, function(x) {
         x$textentryhtml
       }), collapse = "; ")),
       accessionyear = ifnull(resp_parsed$accessionyear),
       title = ifnull(resp_parsed$titles[[1]]$title),
       catalograisonne = ifnull(resp_parsed$catalograisonne),
       objectnumber = ifnull(resp_parsed$objectnumber),
       objectid = ifnull(resp_parsed$objectid),
       department = ifnull(resp_parsed$department),
       country = ifnull(resp_parsed$geography[[1]]$country),
       locale = ifnull(resp_parsed$geography[[1]]$locale),
       region = ifnull(resp_parsed$geography[[1]]$region),
       subcontinent = ifnull(resp_parsed$geography[[1]]$subcontinent),
       locus = ifnull(resp_parsed$geography[[1]]$locus),
       county = ifnull(resp_parsed$geography[[1]]$county),
       excavation = ifnull(resp_parsed$geography[[1]]$excavation),
       state = ifnull(resp_parsed$geography[[1]]$state),
       latitude = ifnull(resp_parsed$geography[[1]]$location$lat),
       longitude = ifnull(resp_parsed$geography[[1]]$location$lon),
       river = ifnull(resp_parsed$geography[[1]]$location$river),
       continent = ifnull(resp_parsed$geography[[1]]$continent),
       medium = ifnull(resp_parsed$medium),
       dimensions = ifnull(paste(lapply(resp_parsed$dimensionelements, function(x) {
         paste(x$type, x$dimension, x$element, x$units, sep = ":")
       }), collapse = "; "))
     )

     img_list <- lapply(resp_parsed$media, function(x) {
       c(x$uri, x$id)
     })

     img_filenames <- paste0(x, "_", lapply(img_list, "[", 2), ".jpg")

     img_urls <- paste0(lapply(img_list, "[", 1), "/full/full/0/default.jpg")

     if (length(img_list[!is.na(img_list)]) > 1) {
       try(download.file(img_urls, destfile = file.path("img", img_filenames),
         method = "libcurl"))
     } else if (length(img_list[!is.na(img_list)]) == 1) {
       try(download.file(img_urls, destfile = file.path("img", img_filenames)))
     }

     return(desc_df)
   })

   # Write metadata to csv
   out <- do.call(bind_rows, out_list)

   write.csv(out, "all.csv", row.names = FALSE)
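
And for comparison, this is roughly what a single search against
the Smithsonian Open Access API looks like. It needs a free
api.data.gov key, and every query goes through a generic search
endpoint shared by all of the Smithsonian units, which is a big
part of why it felt more awkward than Princeton's. The endpoint,
parameters and response structure below are as I understand them
from the API documentation, so treat this as a sketch rather than
the exact code I used.

   library(httr)
   library(jsonlite)

   # Free API key from api.data.gov, read from an environment variable here
   api_key <- Sys.getenv("SI_API_KEY")

   # A single free-text search across the Smithsonian collections
   resp <- GET("https://api.si.edu/openaccess/api/v1.0/search",
     query = list(
       q = "Maya",  # example query term
       rows = 10,
       api_key = api_key))

   parsed <- fromJSON(content(resp, as = "text", encoding = "UTF-8"),
     simplifyVector = FALSE)

   # Records come back as a list of rows, with descriptive metadata
   # nested several levels deep under each row's content element
   titles <- sapply(parsed$response$rows, function(x) x$title)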