tidyr and stringr

Silvie Cinková

2025-08-13

Libraries

library(readr)
library(dplyr)
library(tidyr)
library(magrittr)
library(gapminder)
library(ggplot2)
library(tidytext)
library(stringr)

tidyr

  • lumps and splits column values into new columns

  • transforms several variable columns into categories of a new variable and the other way round

  • completes observations with missing values

  • drops rows with missing values in specified columns

  • expands observations with all possible groups

  • manages nested columns (vector/list inside column!)

Data

billionaires_df <- read_tsv("~/DATA.NPFL112/billionaires_combined.tsv", show_col_types = FALSE)
bil_unite <- billionaires_df %>% distinct(name.x, birth_comb) %>% drop_na(birth_comb)
bil_separate <- billionaires_df %>% distinct(name.x, person) %>% drop_na(name.x)
gap_cze_ger_gdp <- gapminder %>% 
  filter(country %in% c("Czech Republic", "Germany")) %>% 
  select(country, year, gdpPercap)

billionaires_df (selected columns)

#|echo: false
slice_head(billionaires_df, n = 5) %>% 
  select(c(person, name.x, industry, countries, time)) %>% 
  kableExtra::kable()
person name.x industry countries time
a_jayson_adair A. Jayson Adair Automotive usa 2021
a_jayson_adair A. Jayson Adair Automotive usa 2022
a_jerrold_perenchio A. Jerrold Perenchio Media & Entertainment; Media;Entertainment usa 2002
a_jerrold_perenchio A. Jerrold Perenchio Media & Entertainment; Media;Entertainment usa 2003
a_jerrold_perenchio A. Jerrold Perenchio Media & Entertainment; Media;Entertainment usa 2004

tidyr::unite (before)

bil_unite %>% slice(1:10)
# A tibble: 10 × 2
   name.x                         birth_comb
   <chr>                               <dbl>
 1 A. Jerrold Perenchio                 1931
 2 Abdulla bin Ahmad Al Ghurair         1955
 3 Abdullah bin Sulaiman Al Rajhi       1929
 4 Abdulsamad Rabiu                     1960
 5 Abhay Soi                            1973
 6 Abhay Vakil                          1952
 7 Abigail Johnson                      1962
 8 Abilio dos Santos Diniz              1937
 9 Achal Bakeri                         1961
10 Acharya Balakrishna                  1972

tidyr::unite (after)

bil_unite %>% slice(1:10) %>%
  unite(col = ID, name.x, birth_comb, sep = "***", remove = FALSE) %>% 
  kableExtra::kable()
ID name.x birth_comb
A. Jerrold Perenchio***1931 A. Jerrold Perenchio 1931
Abdulla bin Ahmad Al Ghurair***1955 Abdulla bin Ahmad Al Ghurair 1955
Abdullah bin Sulaiman Al Rajhi***1929 Abdullah bin Sulaiman Al Rajhi 1929
Abdulsamad Rabiu***1960 Abdulsamad Rabiu 1960
Abhay Soi***1973 Abhay Soi 1973
Abhay Vakil***1952 Abhay Vakil 1952
Abigail Johnson***1962 Abigail Johnson 1962
Abilio dos Santos Diniz***1937 Abilio dos Santos Diniz 1937
Achal Bakeri***1961 Achal Bakeri 1961
Acharya Balakrishna***1972 Acharya Balakrishna 1972

Operations on strings

  • preparation for tidyr::separate

  • Regular Expressions (“_regex_es”)

  • the stringr library (or base R grep & Co.)

  • regex tutorial https://www.regexone.com/

Isolated strings: prep for tidyr::separate

  • split values into two or more new columns
  • like stringr::str_split or base::strsplit on strings
  • eats the separators
  • define the separator as a fixed string or escape with \

Isolated strings: prep for tidyr::separate

stringr::str_split(string = "I*like*dogs", pattern = stringr::fixed("*"), simplify = TRUE)
     [,1] [,2]   [,3]  
[1,] "I"  "like" "dogs"
stringr::str_split(string = "I*like*dogs", pattern = stringr::fixed("*"), simplify = TRUE)
     [,1] [,2]   [,3]  
[1,] "I"  "like" "dogs"

A more general regex

str_split(string = "I*A3like*B3dogs", pattern = "\\*[A-Z]\\d", simplify = TRUE)
     [,1] [,2]   [,3]  
[1,] "I"  "like" "dogs"
str_split(string = "I*A3like*B3pineapples83dogs", pattern = "\\*[^*]*\\d", simplify = TRUE)
     [,1] [,2]   [,3]  
[1,] "I"  "like" "dogs"

Split into columns

set.seed(23)
bil_separate %>% slice_sample(n = 10) %>%
  separate_wider_delim(col = name.x, cols_remove = FALSE,
    names = c("firstnames", "middlenames", "lastnames" ), 
    names_repair = "minimal",
    delim = " ", 
    too_few = "align_end",
    too_many = "merge"
    )
# A tibble: 10 × 5
   firstnames middlenames lastnames   name.x              person             
   <chr>      <chr>       <chr>       <chr>               <chr>              
 1 <NA>       Xiong       Haitao      Xiong Haitao        xiong_haitao       
 2 <NA>       Paul        Gauselmann  Paul Gauselmann     paul_gauselmann    
 3 <NA>       Duan        Yongping    Duan Yongping       duan_yongping      
 4 <NA>       Vladimir    Gruzdev     Vladimir Gruzdev    vladimir_gruzdev   
 5 <NA>       Dinesh      Nandwana    Dinesh Nandwana     dinesh_nandwana    
 6 Forrest    Mars,       Jr.         Forrest Mars, Jr.   forrest_mars_jr    
 7 Wilbur     Ross,       Jr.         Wilbur Ross, Jr.    wilbur_ross_jr     
 8 <NA>       Andreas     Struengmann Andreas Struengmann andreas_struengmann
 9 <NA>       Huang       Qiaolong    Huang Qiaolong      huang_qiaolong     
10 <NA>       Hasmukh     Chudgar     Hasmukh Chudgar     hasmukh_chudgar    

separate_wider_delim

set.seed(1950)
bil_unite %>% select(!c(birth_comb)) %>%
slice_sample(n = 10) %>%
  separate_wider_delim(col = name.x, 
    too_few = "debug", too_many = "debug",
    names = c("first_name", "surname"), delim = " ") 
# A tibble: 10 × 6
   first_name surname  name.x           name.x_ok name.x_pieces name.x_remainder
   <chr>      <chr>    <chr>            <lgl>             <int> <chr>           
 1 Peter      Spuhler  Peter Spuhler    TRUE                  2 ""              
 2 G.         M.       G. M. Rao        FALSE                 3 " Rao"          
 3 Lorenzo    Mendoza  Lorenzo Mendoza  TRUE                  2 ""              
 4 Vadim      Yakunin  Vadim Yakunin    TRUE                  2 ""              
 5 Kutayba    Alghanim Kutayba Alghanim TRUE                  2 ""              
 6 Minoru     Mori     Minoru Mori      TRUE                  2 ""              
 7 Li         Guangyu  Li Guangyu       TRUE                  2 ""              
 8 Baokun     Bai      Baokun Bai       TRUE                  2 ""              
 9 Luiza      Helena   Luiza Helena Tr… FALSE                 3 " Trajano"      
10 Han        Xiao     Han Xiao         TRUE                  2 ""              

separate_wider_regex

set.seed(1950)
bil_unite %>% select(!c(birth_comb)) %>%
slice_sample(n = 10) %>%
  separate_wider_regex(col = name.x, 
    too_few = "debug", 
    patterns = c(first_name = "(?:^[^\\s]+)",
      middle = "(?:\\1? ?[^ ]+)?",
      surname = "(?:.*\\s[^\\s]+$)"
    
  ))
# A tibble: 10 × 7
   first_name middle    surname name.x name.x_ok name.x_matches name.x_remainder
   <chr>      <chr>     <chr>   <chr>  <lgl>              <int> <chr>           
 1 Peter      ""        " Spuh… Peter… TRUE                   3 ""              
 2 G.         " M."     " Rao"  G. M.… TRUE                   3 ""              
 3 Lorenzo    ""        " Mend… Loren… TRUE                   3 ""              
 4 Vadim      ""        " Yaku… Vadim… TRUE                   3 ""              
 5 Kutayba    ""        " Algh… Kutay… TRUE                   3 ""              
 6 Minoru     ""        " Mori" Minor… TRUE                   3 ""              
 7 Li         ""        " Guan… Li Gu… TRUE                   3 ""              
 8 Baokun     ""        " Bai"  Baoku… TRUE                   3 ""              
 9 Luiza      " Helena" " Traj… Luiza… TRUE                   3 ""              
10 Han        ""        " Xiao" Han X… TRUE                   3 ""              

Pivoting

  • longer table to wider and back

gapminder

How would you

gapminder <- gapminder

gapminder %>% filter(country %in% c("Germany", "Czech Republic", "Poland")) %>% 
  slice_max(by = year, order_by = year, n = 6, with_ties = TRUE)
# A tibble: 36 × 6
   country        continent  year lifeExp      pop gdpPercap
   <fct>          <fct>     <int>   <dbl>    <int>     <dbl>
 1 Czech Republic Europe     1952    66.9  9125183     6876.
 2 Germany        Europe     1952    67.5 69145952     7144.
 3 Poland         Europe     1952    61.3 25730551     4029.
 4 Czech Republic Europe     1957    69.0  9513758     8256.
 5 Germany        Europe     1957    69.1 71019069    10188.
 6 Poland         Europe     1957    65.8 28235346     4734.
 7 Czech Republic Europe     1962    69.9  9620282    10137.
 8 Germany        Europe     1962    70.3 73739117    12902.
 9 Poland         Europe     1962    67.6 30329617     5339.
10 Czech Republic Europe     1967    70.4  9835109    11399.
# ℹ 26 more rows

tidyr::pivot_wider

gap_czger_gdp_wide <- gap_cze_ger_gdp %>% 
  pivot_wider(names_from = country, values_from = gdpPercap)
gap_czger_gdp_wide %>% slice(1:3) %>% kableExtra::kable()
year Czech Republic Germany
1952 6876.140 7144.114
1957 8256.344 10187.827
1962 10136.867 12902.463
gap_czger_gdp_wide %$% 
  cor(x = `Czech Republic`, y = Germany, method = "pearson")
[1] 0.946459

tidyr::pivot_longer

gap_czger_gdp_wide %>% slice(1:10) %>% 
  pivot_longer(cols = c(`Czech Republic`, Germany), names_to = "COUNTRY", values_to = "GDPperCap") 
# A tibble: 20 × 3
    year COUNTRY        GDPperCap
   <int> <chr>              <dbl>
 1  1952 Czech Republic     6876.
 2  1952 Germany            7144.
 3  1957 Czech Republic     8256.
 4  1957 Germany           10188.
 5  1962 Czech Republic    10137.
 6  1962 Germany           12902.
 7  1967 Czech Republic    11399.
 8  1967 Germany           14746.
 9  1972 Czech Republic    13108.
10  1972 Germany           18016.
11  1977 Czech Republic    14800.
12  1977 Germany           20513.
13  1982 Czech Republic    15377.
14  1982 Germany           22032.
15  1987 Czech Republic    16310.
16  1987 Germany           24639.
17  1992 Czech Republic    14297.
18  1992 Germany           26505.
19  1997 Czech Republic    16049.
20  1997 Germany           27789.

Separating into rows I

# A tibble: 6 × 6
  person               time countries industry       income_groups world_6region
  <chr>               <dbl> <chr>     <chr>          <chr>         <chr>        
1 a_jayson_adair       2021 usa       Automotive     high_income   america      
2 a_jayson_adair       2022 usa       Automotive     high_income   america      
3 a_jerrold_perenchio  2002 usa       Media & Enter… high_income   america      
4 a_jerrold_perenchio  2003 usa       Media & Enter… high_income   america      
5 a_jerrold_perenchio  2004 usa       Media & Enter… high_income   america      
6 a_jerrold_perenchio  2005 usa       Media & Enter… high_income   america      

Separating into rows II

industry_terms <- billionaires_df %>% 
  select(c(person, time, countries, industry, income_groups, world_6region)) %>%
  separate_longer_delim(cols = c("industry"), delim = ";") 
set.seed(33)
industry_terms %>% slice_sample(n = 10)
# A tibble: 10 × 6
   person          time countries industry           income_groups world_6region
   <chr>          <dbl> <chr>     <chr>              <chr>         <chr>        
 1 yang_huiyan     2022 chn       "Real Estate, Edu… upper_middle… east_asia_pa…
 2 horst_wortmann  2016 deu       "Apparel & Textil… high_income   europe_centr…
 3 wang_wenxue     2018 chn       "Real Estate"      upper_middle… east_asia_pa…
 4 b_wayne_hughes  2005 usa       "Service"          high_income   america      
 5 daniela_herz    2013 deu       "Investments"      high_income   europe_centr…
 6 edward_lampert  2017 usa       " Finance"         high_income   america      
 7 xu_jingren      2020 chn       "Healthcare"       upper_middle… east_asia_pa…
 8 qiu_guanghe     2021 chn       "Apparel"          upper_middle… east_asia_pa…
 9 elon_musk       2019 usa       "Automotive"       high_income   america      
10 ennio_doris     2009 ita       " Finance"         high_income   europe_centr…

continuation

industry_terms %<>%  
  separate_longer_delim(cols = c("industry"), delim = "&") %>%
  separate_longer_delim(cols = c("industry"), delim = "and") %>%
  separate_longer_delim(cols = c("industry"), delim = ",")
set.seed(10)
industry_terms %>% slice_sample(n = 10)
# A tibble: 10 × 6
   person                  time countries industry   income_groups world_6region
   <chr>                  <dbl> <chr>     <chr>      <chr>         <chr>        
 1 christy_walton          2013 usa       "Fashion " high_income   america      
 2 ty_warner               2004 usa       "Hospital… high_income   america      
 3 mustafa_rahmi_koc       2018 tur       " Gas"     upper_middle… europe_centr…
 4 oleg_deripaska          2008 rus       "Metals "  upper_middle… europe_centr…
 5 doris_fisher            2021 usa       " Fashion… high_income   america      
 6 gerald_ford             2017 usa       "Finance " high_income   america      
 7 gregorio_perez_companc  2005 arg       " Food"    upper_middle… america      
 8 robert_bass             2012 usa       "Energy"   high_income   america      
 9 norma_lerner            2018 usa       "Financia… high_income   america      
10 yuri_kovalchuk          2018 rus       "Finance " upper_middle… europe_centr…

TF*IDF: domain how typical of person?

industry_tf_idf <- industry_terms %>% filter(nchar(industry) > 1) %>% 
  group_by(industry, countries, world_6region) %>% 
  count(name = "freq") %>% ungroup() %>%
  tidytext::bind_tf_idf(term = industry, 
                        document = countries, 
                        n = freq) %>% 
  group_by(countries, world_6region) %>% 
  slice_max(order_by = tf_idf, n = 3) %>% ungroup()

Social network analysis

  • entities and relations

  • two types of entities: bipartite network

    • countries are connected through terms

    • terms are connected through countries

Nodes and edges

library(tidygraph)
library(ggraph)

# Prepare edge list
edges <- industry_tf_idf %>%
  select(countries, industry, tf_idf)

# Create node list
nodes <- tibble(name = unique(c(edges$countries, edges$industry))) %>%
  mutate(type = if_else(name %in% edges$countries, "country", "industry"))

Graph object

# Create graph object
graph <- tbl_graph(nodes = nodes, edges = edges, directed = FALSE)

Plot the graph object with

# Plot with ggraph
p <- ggraph(graph, layout = "auto") +
  geom_edge_link(aes(edge_width = tf_idf), alpha = 0.6) +
  geom_node_point(aes(color = type), size = 0.3) +
  geom_node_text(aes(label = name, color = type), repel = TRUE, size = 2, max.overlaps = 100) +
  scale_edge_width(range = c(0.2, 2)) +
  theme_void() +
  labs(title = "Bipartite Network: Countries and Industries")
p
ggsave(plot = p, filename = "~/NPFL112_2025_ZS/OUTPUT_FILES/billionairs_industry_country_ggraph.svg", width = 7 * 2, height = 7 * 2.2, limitsize = FALSE)

Plot as image file