tidyr and stringr2025-08-13
tidyrlumps and splits column values into new columns
transforms several variable columns into categories of a new variable and the other way round
completes observations with missing values
drops rows with missing values in specified columns
expands observations with all possible groups
manages nested columns (vector/list inside column!)
billionaires_df <- read_tsv("~/DATA.NPFL112/billionaires_combined.tsv", show_col_types = FALSE)
bil_unite <- billionaires_df %>% distinct(name.x, birth_comb) %>% drop_na(birth_comb)
bil_separate <- billionaires_df %>% distinct(name.x, person) %>% drop_na(name.x)
gap_cze_ger_gdp <- gapminder %>%
filter(country %in% c("Czech Republic", "Germany")) %>%
select(country, year, gdpPercap)billionaires_df (selected columns)#|echo: false
slice_head(billionaires_df, n = 5) %>%
select(c(person, name.x, industry, countries, time)) %>%
kableExtra::kable()| person | name.x | industry | countries | time |
|---|---|---|---|---|
| a_jayson_adair | A. Jayson Adair | Automotive | usa | 2021 |
| a_jayson_adair | A. Jayson Adair | Automotive | usa | 2022 |
| a_jerrold_perenchio | A. Jerrold Perenchio | Media & Entertainment; Media;Entertainment | usa | 2002 |
| a_jerrold_perenchio | A. Jerrold Perenchio | Media & Entertainment; Media;Entertainment | usa | 2003 |
| a_jerrold_perenchio | A. Jerrold Perenchio | Media & Entertainment; Media;Entertainment | usa | 2004 |
tidyr::unite (before)# A tibble: 10 × 2
name.x birth_comb
<chr> <dbl>
1 A. Jerrold Perenchio 1931
2 Abdulla bin Ahmad Al Ghurair 1955
3 Abdullah bin Sulaiman Al Rajhi 1929
4 Abdulsamad Rabiu 1960
5 Abhay Soi 1973
6 Abhay Vakil 1952
7 Abigail Johnson 1962
8 Abilio dos Santos Diniz 1937
9 Achal Bakeri 1961
10 Acharya Balakrishna 1972
tidyr::unite (after)bil_unite %>% slice(1:10) %>%
unite(col = ID, name.x, birth_comb, sep = "***", remove = FALSE) %>%
kableExtra::kable()| ID | name.x | birth_comb |
|---|---|---|
| A. Jerrold Perenchio***1931 | A. Jerrold Perenchio | 1931 |
| Abdulla bin Ahmad Al Ghurair***1955 | Abdulla bin Ahmad Al Ghurair | 1955 |
| Abdullah bin Sulaiman Al Rajhi***1929 | Abdullah bin Sulaiman Al Rajhi | 1929 |
| Abdulsamad Rabiu***1960 | Abdulsamad Rabiu | 1960 |
| Abhay Soi***1973 | Abhay Soi | 1973 |
| Abhay Vakil***1952 | Abhay Vakil | 1952 |
| Abigail Johnson***1962 | Abigail Johnson | 1962 |
| Abilio dos Santos Diniz***1937 | Abilio dos Santos Diniz | 1937 |
| Achal Bakeri***1961 | Achal Bakeri | 1961 |
| Acharya Balakrishna***1972 | Acharya Balakrishna | 1972 |
preparation for tidyr::separate
Regular Expressions (“_regex_es”)
the stringr library (or base R grep & Co.)
regex tutorial https://www.regexone.com/
tidyr::separatestringr::str_split or base::strsplit on strings\tidyr::separate [,1] [,2] [,3]
[1,] "I" "like" "dogs"
[,1] [,2] [,3]
[1,] "I" "like" "dogs"
set.seed(23)
bil_separate %>% slice_sample(n = 10) %>%
separate_wider_delim(col = name.x, cols_remove = FALSE,
names = c("firstnames", "middlenames", "lastnames" ),
names_repair = "minimal",
delim = " ",
too_few = "align_end",
too_many = "merge"
)# A tibble: 10 × 5
firstnames middlenames lastnames name.x person
<chr> <chr> <chr> <chr> <chr>
1 <NA> Xiong Haitao Xiong Haitao xiong_haitao
2 <NA> Paul Gauselmann Paul Gauselmann paul_gauselmann
3 <NA> Duan Yongping Duan Yongping duan_yongping
4 <NA> Vladimir Gruzdev Vladimir Gruzdev vladimir_gruzdev
5 <NA> Dinesh Nandwana Dinesh Nandwana dinesh_nandwana
6 Forrest Mars, Jr. Forrest Mars, Jr. forrest_mars_jr
7 Wilbur Ross, Jr. Wilbur Ross, Jr. wilbur_ross_jr
8 <NA> Andreas Struengmann Andreas Struengmann andreas_struengmann
9 <NA> Huang Qiaolong Huang Qiaolong huang_qiaolong
10 <NA> Hasmukh Chudgar Hasmukh Chudgar hasmukh_chudgar
separate_wider_delimset.seed(1950)
bil_unite %>% select(!c(birth_comb)) %>%
slice_sample(n = 10) %>%
separate_wider_delim(col = name.x,
too_few = "debug", too_many = "debug",
names = c("first_name", "surname"), delim = " ") # A tibble: 10 × 6
first_name surname name.x name.x_ok name.x_pieces name.x_remainder
<chr> <chr> <chr> <lgl> <int> <chr>
1 Peter Spuhler Peter Spuhler TRUE 2 ""
2 G. M. G. M. Rao FALSE 3 " Rao"
3 Lorenzo Mendoza Lorenzo Mendoza TRUE 2 ""
4 Vadim Yakunin Vadim Yakunin TRUE 2 ""
5 Kutayba Alghanim Kutayba Alghanim TRUE 2 ""
6 Minoru Mori Minoru Mori TRUE 2 ""
7 Li Guangyu Li Guangyu TRUE 2 ""
8 Baokun Bai Baokun Bai TRUE 2 ""
9 Luiza Helena Luiza Helena Tr… FALSE 3 " Trajano"
10 Han Xiao Han Xiao TRUE 2 ""
separate_wider_regexset.seed(1950)
bil_unite %>% select(!c(birth_comb)) %>%
slice_sample(n = 10) %>%
separate_wider_regex(col = name.x,
too_few = "debug",
patterns = c(first_name = "(?:^[^\\s]+)",
middle = "(?:\\1? ?[^ ]+)?",
surname = "(?:.*\\s[^\\s]+$)"
))# A tibble: 10 × 7
first_name middle surname name.x name.x_ok name.x_matches name.x_remainder
<chr> <chr> <chr> <chr> <lgl> <int> <chr>
1 Peter "" " Spuh… Peter… TRUE 3 ""
2 G. " M." " Rao" G. M.… TRUE 3 ""
3 Lorenzo "" " Mend… Loren… TRUE 3 ""
4 Vadim "" " Yaku… Vadim… TRUE 3 ""
5 Kutayba "" " Algh… Kutay… TRUE 3 ""
6 Minoru "" " Mori" Minor… TRUE 3 ""
7 Li "" " Guan… Li Gu… TRUE 3 ""
8 Baokun "" " Bai" Baoku… TRUE 3 ""
9 Luiza " Helena" " Traj… Luiza… TRUE 3 ""
10 Han "" " Xiao" Han X… TRUE 3 ""
How would you
gapminder <- gapminder
gapminder %>% filter(country %in% c("Germany", "Czech Republic", "Poland")) %>%
slice_max(by = year, order_by = year, n = 6, with_ties = TRUE)# A tibble: 36 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Czech Republic Europe 1952 66.9 9125183 6876.
2 Germany Europe 1952 67.5 69145952 7144.
3 Poland Europe 1952 61.3 25730551 4029.
4 Czech Republic Europe 1957 69.0 9513758 8256.
5 Germany Europe 1957 69.1 71019069 10188.
6 Poland Europe 1957 65.8 28235346 4734.
7 Czech Republic Europe 1962 69.9 9620282 10137.
8 Germany Europe 1962 70.3 73739117 12902.
9 Poland Europe 1962 67.6 30329617 5339.
10 Czech Republic Europe 1967 70.4 9835109 11399.
# ℹ 26 more rows
tidyr::pivot_widertidyr::pivot_longergap_czger_gdp_wide %>% slice(1:10) %>%
pivot_longer(cols = c(`Czech Republic`, Germany), names_to = "COUNTRY", values_to = "GDPperCap") # A tibble: 20 × 3
year COUNTRY GDPperCap
<int> <chr> <dbl>
1 1952 Czech Republic 6876.
2 1952 Germany 7144.
3 1957 Czech Republic 8256.
4 1957 Germany 10188.
5 1962 Czech Republic 10137.
6 1962 Germany 12902.
7 1967 Czech Republic 11399.
8 1967 Germany 14746.
9 1972 Czech Republic 13108.
10 1972 Germany 18016.
11 1977 Czech Republic 14800.
12 1977 Germany 20513.
13 1982 Czech Republic 15377.
14 1982 Germany 22032.
15 1987 Czech Republic 16310.
16 1987 Germany 24639.
17 1992 Czech Republic 14297.
18 1992 Germany 26505.
19 1997 Czech Republic 16049.
20 1997 Germany 27789.
# A tibble: 6 × 6
person time countries industry income_groups world_6region
<chr> <dbl> <chr> <chr> <chr> <chr>
1 a_jayson_adair 2021 usa Automotive high_income america
2 a_jayson_adair 2022 usa Automotive high_income america
3 a_jerrold_perenchio 2002 usa Media & Enter… high_income america
4 a_jerrold_perenchio 2003 usa Media & Enter… high_income america
5 a_jerrold_perenchio 2004 usa Media & Enter… high_income america
6 a_jerrold_perenchio 2005 usa Media & Enter… high_income america
industry_terms <- billionaires_df %>%
select(c(person, time, countries, industry, income_groups, world_6region)) %>%
separate_longer_delim(cols = c("industry"), delim = ";")
set.seed(33)
industry_terms %>% slice_sample(n = 10)# A tibble: 10 × 6
person time countries industry income_groups world_6region
<chr> <dbl> <chr> <chr> <chr> <chr>
1 yang_huiyan 2022 chn "Real Estate, Edu… upper_middle… east_asia_pa…
2 horst_wortmann 2016 deu "Apparel & Textil… high_income europe_centr…
3 wang_wenxue 2018 chn "Real Estate" upper_middle… east_asia_pa…
4 b_wayne_hughes 2005 usa "Service" high_income america
5 daniela_herz 2013 deu "Investments" high_income europe_centr…
6 edward_lampert 2017 usa " Finance" high_income america
7 xu_jingren 2020 chn "Healthcare" upper_middle… east_asia_pa…
8 qiu_guanghe 2021 chn "Apparel" upper_middle… east_asia_pa…
9 elon_musk 2019 usa "Automotive" high_income america
10 ennio_doris 2009 ita " Finance" high_income europe_centr…
industry_terms %<>%
separate_longer_delim(cols = c("industry"), delim = "&") %>%
separate_longer_delim(cols = c("industry"), delim = "and") %>%
separate_longer_delim(cols = c("industry"), delim = ",")
set.seed(10)
industry_terms %>% slice_sample(n = 10)# A tibble: 10 × 6
person time countries industry income_groups world_6region
<chr> <dbl> <chr> <chr> <chr> <chr>
1 christy_walton 2013 usa "Fashion " high_income america
2 ty_warner 2004 usa "Hospital… high_income america
3 mustafa_rahmi_koc 2018 tur " Gas" upper_middle… europe_centr…
4 oleg_deripaska 2008 rus "Metals " upper_middle… europe_centr…
5 doris_fisher 2021 usa " Fashion… high_income america
6 gerald_ford 2017 usa "Finance " high_income america
7 gregorio_perez_companc 2005 arg " Food" upper_middle… america
8 robert_bass 2012 usa "Energy" high_income america
9 norma_lerner 2018 usa "Financia… high_income america
10 yuri_kovalchuk 2018 rus "Finance " upper_middle… europe_centr…
industry_tf_idf <- industry_terms %>% filter(nchar(industry) > 1) %>%
group_by(industry, countries, world_6region) %>%
count(name = "freq") %>% ungroup() %>%
tidytext::bind_tf_idf(term = industry,
document = countries,
n = freq) %>%
group_by(countries, world_6region) %>%
slice_max(order_by = tf_idf, n = 3) %>% ungroup()# Plot with ggraph
p <- ggraph(graph, layout = "auto") +
geom_edge_link(aes(edge_width = tf_idf), alpha = 0.6) +
geom_node_point(aes(color = type), size = 0.3) +
geom_node_text(aes(label = name, color = type), repel = TRUE, size = 2, max.overlaps = 100) +
scale_edge_width(range = c(0.2, 2)) +
theme_void() +
labs(title = "Bipartite Network: Countries and Industries")
p
Social network analysis
entities and relations
two types of entities: bipartite network
countries are connected through terms
terms are connected through countries