Extract information from JSON data

Silvie Cinková

2025-12-15

library(tidyverse)
if(system.file(package = "jsonlite") == "") {
  cat("Package 'jsonlite' is needed. Installing.\n")
}
if (require("jsonlite") == FALSE) {
  library(jsonlite)
}

JSON

  • JavaScript Object Notation
{"array": [ "object1":
             {"name1": "property-string/bool", 
               "name2": number},
           "object2": {"name222": "property..."}
]
}
   

FBI list of Wanted Persons

Individual info

FBI API

https://www.fbi.gov/wanted/api

Interactive demo

https://api.fbi.gov/docs#/wanted/_wanted__wanted_get

Experiment with parameters

the basic api url: https://api.fbi.gov/wanted/v1/list

Concatenate parameters:

  • first param: use ?param=value

  • 2nd+ param: use &param=value

https://api.fbi.gov/wanted/v1/list?race=black&sex=female&poster_classification=missing

Encode URL

  • Diacritics, punctuation, spaces!!!

  • reserved characters TRUE, too

URLencode("Bärbel Müller?!", repeated = FALSE, reserved = TRUE)
[1] "B%C3%A4rbel%20M%C3%BCller%3F%21"
URLencode("Bärbel Müller?!", repeated = FALSE, reserved = FALSE)
[1] "B%C3%A4rbel%20M%C3%BCller?!"

Structure of FBI Wanted API responses

https://api.fbi.gov/wanted/v1/list?field_offices=seattle

Objects (FBI Wanted) in the items list

Raw JSON file - indented

Raw JSON file - no indentation

API request with httr::GET in R

  • you want the content but get more
request <- httr::GET("https://api.fbi.gov/wanted/list")
class(request); mode(request); names(request)
[1] "response"
[1] "list"
 [1] "url"         "status_code" "headers"     "all_headers" "cookies"    
 [6] "content"     "date"        "times"       "request"     "handle"     

Extract content from response

req_content <- request %>% httr::content() 
mode(req_content); names(req_content)
[1] "list"
[1] "total" "items" "page" 

Extract content as text

  • when you are not happy with the direct parsing
text <-  httr::GET("https://api.fbi.gov/wanted/v1/list?page=1") %>%
  httr::content(as = "text")
text %>% stringr::str_trunc(width = 350, ellipsis = ".....")
[1] "{\"total\":1060,\"items\":[{\"possible_states\":null,\"warning_message\":null,\"field_offices\":[\"lasvegas\"],\"details\":\"<p>The Federal Bureau of Investigation's Las Vegas Field Office in Nevada is seeking the public's assistance in identifying the unknown suspect responsible for the defacement of Federal Buildings located at 501 South Las Vegas Boulevar....."

R object from JSON (text)

  • jsonlite::fromJSON
text_fromjson <- httr::GET("https://api.fbi.gov/wanted/v1/list?page=1") %>%
  httr::content(as = "text") %>% jsonlite::fromJSON()
mode(text_fromjson); class(text_fromjson); names(text_fromjson)
[1] "list"
[1] "list"
[1] "total" "items" "page" 

Structure of FBI Wanted items

names(req_content)
[1] "total" "items" "page" 
req_content$total # total count of items (all pages)
[1] 1060
req_content$page # which page 
[1] 1
length(req_content$items) # count of items on page 
[1] 20

Zoom in on items

items <- req_content$items 
class(items); names(items); length(items)
[1] "list"
NULL
[1] 20

Element names in items

names(items[[1]]) # one item 
 [1] "possible_states"        "warning_message"        "field_offices"         
 [4] "details"                "locations"              "age_range"             
 [7] "path"                   "occupations"            "eyes_raw"              
[10] "scars_and_marks"        "weight"                 "poster_classification" 
[13] "possible_countries"     "eyes"                   "files"                 
[16] "modified"               "age_min"                "caution"               
[19] "description"            "person_classification"  "hair"                  
[22] "reward_max"             "title"                  "coordinates"           
[25] "place_of_birth"         "languages"              "race_raw"              
[28] "reward_min"             "complexion"             "aliases"               
[31] "url"                    "ncic"                   "height_min"            
[34] "race"                   "publication"            "weight_max"            
[37] "subjects"               "images"                 "suspects"              
[40] "remarks"                "reward_text"            "nationality"           
[43] "legat_names"            "dates_of_birth_used"    "status"                
[46] "build"                  "weight_min"             "hair_raw"              
[49] "uid"                    "sex"                    "height_max"            
[52] "additional_information" "age_max"                "pathId"                

Get items as a data frame

  • parse the JSON with jsonlite

  • make structure as flat as possible (almost default)

class(text_fromjson); names(text_fromjson) 
[1] "list"
[1] "total" "items" "page" 

items as a data frame

class(text_fromjson$items); colnames(text_fromjson$items)
[1] "data.frame"
 [1] "possible_states"        "warning_message"        "field_offices"         
 [4] "details"                "locations"              "age_range"             
 [7] "path"                   "occupations"            "eyes_raw"              
[10] "scars_and_marks"        "weight"                 "poster_classification" 
[13] "possible_countries"     "eyes"                   "files"                 
[16] "modified"               "age_min"                "caution"               
[19] "description"            "person_classification"  "hair"                  
[22] "reward_max"             "title"                  "coordinates"           
[25] "place_of_birth"         "languages"              "race_raw"              
[28] "reward_min"             "complexion"             "aliases"               
[31] "url"                    "ncic"                   "height_min"            
[34] "race"                   "publication"            "weight_max"            
[37] "subjects"               "images"                 "suspects"              
[40] "remarks"                "reward_text"            "nationality"           
[43] "legat_names"            "dates_of_birth_used"    "status"                
[46] "build"                  "weight_min"             "hair_raw"              
[49] "uid"                    "sex"                    "height_max"            
[52] "additional_information" "age_max"                "pathId"                

Nested dataframe items_df

items_df <- text_fromjson$items
nonatomic <- items_df %>% select(where(~ !is.atomic(.x))) 
colnames(nonatomic)
[1] "field_offices"       "occupations"         "files"              
[4] "coordinates"         "languages"           "aliases"            
[7] "subjects"            "images"              "dates_of_birth_used"

What class are the non-atomic columns?

for (i in seq_along(colnames(nonatomic))) {
  nonatomic %>% pull(i) %>% class() %>% print()
}
[1] "list"
[1] "list"
[1] "list"
[1] "list"
[1] "list"
[1] "list"
[1] "list"
[1] "list"
[1] "list"

Use purrr to avoid loops

items_df %>% keep(~ !is.atomic(.x)) %>% map_chr(~ class(.x)) 
      field_offices         occupations               files         coordinates 
             "list"              "list"              "list"              "list" 
          languages             aliases            subjects              images 
             "list"              "list"              "list"              "list" 
dates_of_birth_used 
             "list" 

What is inside the lists?

items_df %>% 
  purrr::keep(~ !is.atomic(.x)) %>% 
  glimpse()
Rows: 20
Columns: 9
$ field_offices       <list> "lasvegas", "louisville", <NULL>, "miami", "dalla…
$ occupations         <list> <NULL>, <NULL>, <NULL>, <NULL>, <NULL>, <NULL>, <…
$ files               <list> [<data.frame[1 x 2]>], [<data.frame[1 x 2]>], [<d…
$ coordinates         <list> [], [], [], [], [], [], [], [], [], [], [], [], […
$ languages           <list> <NULL>, <NULL>, <NULL>, <NULL>, <NULL>, <"English…
$ aliases             <list> <NULL>, <NULL>, <NULL>, <NULL>, <"Cecilia Rodrigu…
$ subjects            <list> "Seeking Information", "Criminal Enterprise Inves…
$ images              <list> [<data.frame[5 x 4]>], [<data.frame[2 x 4]>], [<d…
$ dates_of_birth_used <list> <NULL>, "September 25, 1980", <NULL>, "March 27, …

Zoom in on nested columns

(langs_df <- items_df %>% 
    select(title, languages, files, field_offices,
      dates_of_birth_used) %>% 
  filter(map_lgl(languages, ~!is.null(.x))) ) %>%
  glimpse()
Rows: 2
Columns: 5
$ title               <chr> "ELLA MAE BEGAY", "MIGUEL ANGEL AGUILAR OJEDA"
$ languages           <list> <"English", "Navajo">, <"Spanish", "English">
$ files               <list> [<data.frame[3 x 2]>], [<data.frame[2 x 2]>]
$ field_offices       <list> <"albuquerque", "phoenix">, "philadelphia"
$ dates_of_birth_used <list> "September 28, 1958", "August 5, 1990"

tidyr::unnest

  • unnest,

  • unnest_longer,

  • unnest_wider

tidyr::unnest_longer

langs_df %>% 
  unnest_longer(languages) %>%
  glimpse()
Rows: 4
Columns: 5
$ title               <chr> "ELLA MAE BEGAY", "ELLA MAE BEGAY", "MIGUEL ANGEL …
$ languages           <chr> "English", "Navajo", "Spanish", "English"
$ files               <list> [<data.frame[3 x 2]>], [<data.frame[3 x 2]>], [<da…
$ field_offices       <list> <"albuquerque", "phoenix">, <"albuquerque", "phoe…
$ dates_of_birth_used <list> "September 28, 1958", "September 28, 1958", "Augu…

tidyr::unnest_wider

langs_df %>% 
  unnest_wider(languages, names_sep = "--") %>% 
  glimpse()
Rows: 2
Columns: 6
$ title               <chr> "ELLA MAE BEGAY", "MIGUEL ANGEL AGUILAR OJEDA"
$ `languages--1`      <chr> "English", "Spanish"
$ `languages--2`      <chr> "Navajo", "English"
$ files               <list> [<data.frame[3 x 2]>], [<data.frame[2 x 2]>]
$ field_offices       <list> <"albuquerque", "phoenix">, "philadelphia"
$ dates_of_birth_used <list> "September 28, 1958", "August 5, 1990"

unnest_longer several columns

(un01_df <- langs_df %>% 
    unnest_longer(c(field_offices, languages))) %>%
  glimpse()
Rows: 4
Columns: 5
$ title               <chr> "ELLA MAE BEGAY", "ELLA MAE BEGAY", "MIGUEL ANGEL …
$ languages           <chr> "English", "Navajo", "Spanish", "English"
$ files               <list> [<data.frame[3 x 2]>], [<data.frame[3 x 2]>], [<da…
$ field_offices       <chr> "albuquerque", "phoenix", "philadelphia", "philad…
$ dates_of_birth_used <list> "September 28, 1958", "September 28, 1958", "Augus…

unnest_longer column by column

(un02_df <- langs_df %>% 
    unnest_longer(field_offices) %>%
    unnest_longer(languages)) #%>%
# A tibble: 6 × 5
  title                      languages files  field_offices dates_of_birth_used
  <chr>                      <chr>     <list> <chr>         <list>             
1 ELLA MAE BEGAY             English   <df>   albuquerque   <chr [1]>          
2 ELLA MAE BEGAY             Navajo    <df>   albuquerque   <chr [1]>          
3 ELLA MAE BEGAY             English   <df>   phoenix       <chr [1]>          
4 ELLA MAE BEGAY             Navajo    <df>   phoenix       <chr [1]>          
5 MIGUEL ANGEL AGUILAR OJEDA Spanish   <df>   philadelphia  <chr [1]>          
6 MIGUEL ANGEL AGUILAR OJEDA English   <df>   philadelphia  <chr [1]>          
 # glimpse()

Unnesting a data frame

(un04_df <- langs_df %>% 
    select(title, files) %>% 
    unnest(files)) %>% 
  glimpse()
Rows: 5
Columns: 3
$ title <chr> "ELLA MAE BEGAY", "ELLA MAE BEGAY", "ELLA MAE BEGAY", "MIGUEL AN…
$ url   <chr> "https://www.fbi.gov/wanted/kidnap/ella-mae-begay/download.pdf",…
$ name  <chr> "English", "DIN BIZAAD KEHGO", "EN ESPAOL", "English", "EN ESPAO…

Nested data frame with unnest_wider

(un05_df <- langs_df %>% 
    select(title, files) %>%
    unnest_wider(files, names_sep = "---")) %>%
  glimpse()
Rows: 2
Columns: 3
$ title          <chr> "ELLA MAE BEGAY", "MIGUEL ANGEL AGUILAR OJEDA"
$ `files---url`  <list<chr>> <"https://www.fbi.gov/wanted/kidnap/ella-mae-begay/down…
$ `files---name` <list<chr>> <"English", "DIN BIZAAD KEHGO", "EN ESPAOL">, <"E…

Lists in nested columns 😧

langs_FilesAsList %>% pull(files)
[[1]]
[[1]]$url
[1] "https://www.fbi.gov/wanted/kidnap/ella-mae-begay/download.pdf"                                                       
[2] "https://www.fbi.gov/wanted/kidnap/ella-mae-begay/ella-mae-begay_navajo.pdf/@@download/file/ella-mae-begay_Navajo.pdf"
[3] "https://www.fbi.gov/wanted/kidnap/ella-mae-begay/begayspanish.pdf/@@download/file/BegaySpanish.pdf"                  

[[1]]$name
[1] "English"          "DIN BIZAAD KEHGO" "EN ESPAOL"       


[[2]]
[[2]]$url
[1] "https://www.fbi.gov/wanted/murders/miguel-angel-aguilar-ojeda/download.pdf"                                       
[2] "https://www.fbi.gov/wanted/murders/miguel-angel-aguilar-ojeda/ojedaspanish2.pdf/@@download/file/OjedaSpanish2.pdf"

[[2]]$name
[1] "English"   "EN ESPAOL"

hoist nested list elements into columns

 langs_FilesAsList %>% 
  tidyr::hoist(files, POSTER_URL = list( 1L), 
    POSTER_NAMES = list(2L)) 
                       title        languages
1             ELLA MAE BEGAY  English, Navajo
2 MIGUEL ANGEL AGUILAR OJEDA Spanish, English
                                                                                                                                                                                                                                                                               POSTER_URL
1 https://www.fbi.gov/wanted/kidnap/ella-mae-begay/download.pdf, https://www.fbi.gov/wanted/kidnap/ella-mae-begay/ella-mae-begay_navajo.pdf/@@download/file/ella-mae-begay_Navajo.pdf, https://www.fbi.gov/wanted/kidnap/ella-mae-begay/begayspanish.pdf/@@download/file/BegaySpanish.pdf
2                                                                                           https://www.fbi.gov/wanted/murders/miguel-angel-aguilar-ojeda/download.pdf, https://www.fbi.gov/wanted/murders/miguel-angel-aguilar-ojeda/ojedaspanish2.pdf/@@download/file/OjedaSpanish2.pdf
                          POSTER_NAMES        field_offices dates_of_birth_used
1 English, DIN BIZAAD KEHGO, EN ESPAOL albuquerque, phoenix  September 28, 1958
2                   English, EN ESPAOL         philadelphia      August 5, 1990

hoist, then unnest.

langs_FilesAsList %>% 
 tidyr::hoist(files, POSTER_URL = list( 1L), POSTER_NAMES = list(2L)) %>%
  unnest_longer(c(starts_with("POSTER")))
# A tibble: 5 × 6
  title      languages POSTER_URL POSTER_NAMES field_offices dates_of_birth_used
  <chr>      <list>    <chr>      <chr>        <list>        <list>             
1 ELLA MAE … <chr [2]> https://w… English      <chr [2]>     <chr [1]>          
2 ELLA MAE … <chr [2]> https://w… DIN BIZAAD … <chr [2]>     <chr [1]>          
3 ELLA MAE … <chr [2]> https://w… EN ESPAOL    <chr [2]>     <chr [1]>          
4 MIGUEL AN… <chr [2]> https://w… English      <chr [1]>     <chr [1]>          
5 MIGUEL AN… <chr [2]> https://w… EN ESPAOL    <chr [1]>     <chr [1]>          

Do anything inside nested columns

  • operations on lists in general: purrr

  • purrr::map(<your list>, ~ <some function on your list>(.x, <other arguments>))

langs_FilesAsList <- langs_df %>% 
  mutate(across(files, ~ map(.x, ~ as.list(.x)))) 

Transform the dataframes column with a loop

test_df <- langs_df
for (i in 1:nrow(langs_df)) {
  test_df$files[[i]] <- as.list(test_df$files[[i]])
}

test_df$files[[1]]
$url
[1] "https://www.fbi.gov/wanted/kidnap/ella-mae-begay/download.pdf"                                                       
[2] "https://www.fbi.gov/wanted/kidnap/ella-mae-begay/ella-mae-begay_navajo.pdf/@@download/file/ella-mae-begay_Navajo.pdf"
[3] "https://www.fbi.gov/wanted/kidnap/ella-mae-begay/begayspanish.pdf/@@download/file/BegaySpanish.pdf"                  

$name
[1] "English"          "DIN BIZAAD KEHGO" "EN ESPAOL"