Libraries

library(readr)
library(dplyr)
library(tidyr)
library(magrittr)
library(gapminder)
library(ggplot2)
library(tidytext)

Data

billionaires_df <- read_tsv("datasets_ATRIUM/billionaires_combined.tsv", show_col_types = FALSE)
bil_unite <- billionaires_df %>% distinct(name.x, birth_comb) %>% drop_na(birth_comb)
bil_separate <- billionaires_df %>% distinct(name.x, person) %>% drop_na(name.x)
gap_cze_ger_gdp <- gapminder %>% 
  filter(country %in% c("Czech Republic", "Germany")) %>% 
  select(country, year, gdpPercap)

`tidyr`

lumps and splits column values into new columns
transforms several variable columns into categories of a new variable and the other way round
completes observations with missing values
manages nested columns (vector/list inside column!)

`tidyr::unite` (before)

bil_unite %>% slice(1:10)

# A tibble: 10 × 2
   name.x                         birth_comb
   <chr>                               <dbl>
 1 A. Jerrold Perenchio                 1931
 2 Abdulla bin Ahmad Al Ghurair         1955
 3 Abdullah bin Sulaiman Al Rajhi       1929
 4 Abdulsamad Rabiu                     1960
 5 Abhay Soi                            1973
 6 Abhay Vakil                          1952
 7 Abigail Johnson                      1962
 8 Abilio dos Santos Diniz              1937
 9 Achal Bakeri                         1961
10 Acharya Balakrishna                  1972

`tidyr::unite` (after)

bil_unite %>% slice(1:10) %>%
  unite(col = ID, name.x, birth_comb,                     sep = "***", remove = FALSE) %>% 
  kableExtra::kable()

ID	name.x	birth_comb
A. Jerrold Perenchio***1931	A. Jerrold Perenchio	1931
Abdulla bin Ahmad Al Ghurair***1955	Abdulla bin Ahmad Al Ghurair	1955
Abdullah bin Sulaiman Al Rajhi***1929	Abdullah bin Sulaiman Al Rajhi	1929
Abdulsamad Rabiu***1960	Abdulsamad Rabiu	1960
Abhay Soi***1973	Abhay Soi	1973
Abhay Vakil***1952	Abhay Vakil	1952
Abigail Johnson***1962	Abigail Johnson	1962
Abilio dos Santos Diniz***1937	Abilio dos Santos Diniz	1937
Achal Bakeri***1961	Achal Bakeri	1961
Acharya Balakrishna***1972	Acharya Balakrishna	1972

`tidyr::separate`

splits values into two or more new columns

bil_separate %>% slice(1:2) %>%
  separate(col = name.x, into = c("firstnames", "middlenames", "lastnames" ), sep = " ", fill = "left")

# A tibble: 2 × 4
  firstnames middlenames lastnames person             
  <chr>      <chr>       <chr>     <chr>              
1 A.         Jayson      Adair     a_jayson_adair     
2 A.         Jerrold     Perenchio a_jerrold_perenchio

set.seed(15)
bil_separate %>% slice_sample(n = 10) %>% 
  separate(col = name.x, 
           into = c("firstnames",  "lastnames" ), 
           sep = " ", fill = "left", remove = FALSE)

# A tibble: 10 × 4
   name.x                    firstnames lastnames         person                
   <chr>                     <chr>      <chr>             <chr>                 
 1 Xu Xiaoqun                Xu         Xiaoqun           xu_xiaoqun            
 2 Irwin Jacobs              Irwin      Jacobs            irwin_jacobs          
 3 Wang Chaobin              Wang       Chaobin           wang_chaobin          
 4 Anton Schlecker           Anton      Schlecker         anton_schlecker       
 5 Thomas James              Thomas     James             thomas_james          
 6 GSK Velu                  GSK        Velu              gsk_velu              
 7 Charoen Sirivadhanabhakdi Charoen    Sirivadhanabhakdi charoen_sirivadhanabh…
 8 Yuriy Kosiuk              Yuriy      Kosiuk            yuriy_kosiuk          
 9 Elaine Marshall           Elaine     Marshall          elaine_marshall       
10 Jaime Botin               Jaime      Botin             jaime_botin_1

`tidyr::pivot_wider`

gap_czger_gdp_wide <- gap_cze_ger_gdp %>% 
  pivot_wider(names_from = country, values_from = gdpPercap)
gap_czger_gdp_wide %>% slice(1:3) %>% kableExtra::kable()

year	Czech Republic	Germany
1952	6876.140	7144.114
1957	8256.344	10187.827
1962	10136.867	12902.463

gap_czger_gdp_wide %$% 
  cor(x = `Czech Republic`, y = Germany, method = "pearson")

[1] 0.946459

`tidyr::pivot_longer`

gap_czger_gdp_wide %>% slice(1:10) %>% 
  pivot_longer(cols = c(`Czech Republic`, Germany), names_to = "COUNTRY", values_to = "GDPperCap")

# A tibble: 20 × 3
    year COUNTRY        GDPperCap
   <int> <chr>              <dbl>
 1  1952 Czech Republic     6876.
 2  1952 Germany            7144.
 3  1957 Czech Republic     8256.
 4  1957 Germany           10188.
 5  1962 Czech Republic    10137.
 6  1962 Germany           12902.
 7  1967 Czech Republic    11399.
 8  1967 Germany           14746.
 9  1972 Czech Republic    13108.
10  1972 Germany           18016.
11  1977 Czech Republic    14800.
12  1977 Germany           20513.
13  1982 Czech Republic    15377.
14  1982 Germany           22032.
15  1987 Czech Republic    16310.
16  1987 Germany           24639.
17  1992 Czech Republic    14297.
18  1992 Germany           26505.
19  1997 Czech Republic    16049.
20  1997 Germany           27789.

Separating into rows

industry_terms <- billionaires_df %>% 
  select(c(person, time, countries, industry, income_groups, world_6region)) %>%
  separate_longer_delim(cols = c("industry"), delim = ";") 
set.seed(33)
industry_terms %>% slice_sample(n = 10)

# A tibble: 10 × 6
   person          time countries industry           income_groups world_6region
   <chr>          <dbl> <chr>     <chr>              <chr>         <chr>        
 1 yang_huiyan     2022 chn       "Real Estate, Edu… upper_middle… east_asia_pa…
 2 horst_wortmann  2016 deu       "Apparel & Textil… high_income   europe_centr…
 3 wang_wenxue     2018 chn       "Real Estate"      upper_middle… east_asia_pa…
 4 b_wayne_hughes  2005 usa       "Service"          high_income   america      
 5 daniela_herz    2013 deu       "Investments"      high_income   europe_centr…
 6 edward_lampert  2017 usa       " Finance"         high_income   america      
 7 xu_jingren      2020 chn       "Healthcare"       upper_middle… east_asia_pa…
 8 qiu_guanghe     2021 chn       "Apparel"          upper_middle… east_asia_pa…
 9 elon_musk       2019 usa       "Automotive"       high_income   america      
10 ennio_doris     2009 ita       " Finance"         high_income   europe_centr…

continuation

industry_terms %<>%  
  separate_longer_delim(cols = c("industry"), delim = "&") %>%
  separate_longer_delim(cols = c("industry"), delim = "and") %>%
  separate_longer_delim(cols = c("industry"), delim = ",")
set.seed(10)
industry_terms %>% slice_sample(n = 10)

# A tibble: 10 × 6
   person                  time countries industry   income_groups world_6region
   <chr>                  <dbl> <chr>     <chr>      <chr>         <chr>        
 1 christy_walton          2013 usa       "Fashion " high_income   america      
 2 ty_warner               2004 usa       "Hospital… high_income   america      
 3 mustafa_rahmi_koc       2018 tur       " Gas"     upper_middle… europe_centr…
 4 oleg_deripaska          2008 rus       "Metals "  upper_middle… europe_centr…
 5 doris_fisher            2021 usa       " Fashion… high_income   america      
 6 gerald_ford             2017 usa       "Finance " high_income   america      
 7 gregorio_perez_companc  2005 arg       " Food"    upper_middle… america      
 8 robert_bass             2012 usa       "Energy"   high_income   america      
 9 norma_lerner            2018 usa       "Financia… high_income   america      
10 yuri_kovalchuk          2018 rus       "Finance " upper_middle… europe_centr…

industry_tf_idf <- industry_terms %>% filter(nchar(industry) > 1) %>% 
  group_by(industry, countries, world_6region) %>% 
  count(name = "freq") %>% ungroup() %>%
  tidytext::bind_tf_idf(term = industry, 
                        document = countries, 
                        n = freq) %>% 
  group_by(countries, world_6region) %>% 
  slice_max(order_by = tf_idf, n = 3) %>% ungroup()

library(dplyr)
library(tidygraph)
library(ggraph)

# Prepare edge list
edges <- industry_tf_idf %>%
  select(countries, industry, tf_idf)

# Create node list
nodes <- tibble(name = unique(c(edges$countries, edges$industry))) %>%
  mutate(type = if_else(name %in% edges$countries, "country", "industry"))

# Create graph object
graph <- tbl_graph(nodes = nodes, edges = edges, directed = FALSE)
graph

# A tbl_graph: 176 nodes and 204 edges
#
# A bipartite simple graph with 11 components
#
# Node Data: 176 × 2 (active)
   name  type   
   <chr> <chr>  
 1 ago   country
 2 are   country
 3 arg   country
 4 aus   country
 5 aut   country
 6 bel   country
 7 bgd   country
 8 bgr   country
 9 bra   country
10 brb   country
# ℹ 166 more rows
#
# Edge Data: 204 × 3
   from    to tf_idf
  <int> <int>  <dbl>
1     1    74 0.168 
2     1    75 0.115 
3     1    76 0.0968
# ℹ 201 more rows

# Plot with ggraph
ggraph(graph, layout = "auto") +
  geom_edge_link(aes(edge_width = tf_idf), alpha = 0.6) +
  geom_node_point(aes(color = type), size = 0.3) +
  geom_node_text(aes(label = name, color = type), repel = TRUE, size = 2, max.overlaps = 100) +
  scale_edge_width(range = c(0.2, 2)) +
  theme_void() +
  labs(title = "Bipartite Network: Countries and Industries")

ggsave(filename = "my_output_files/billionairs_industry_country_ggraph.pdf", width = 7 * 2, height = 7 * 2.2)

tidyr and stringr

Libraries

Data

tidyr

tidyr::unite (before)

tidyr::unite (after)

tidyr::separate

tidyr::pivot_wider

tidyr::pivot_longer

Separating into rows

continuation

`tidyr` and `stringr`

`tidyr`

`tidyr::unite` (before)

`tidyr::unite` (after)

`tidyr::separate`

`tidyr::pivot_wider`

`tidyr::pivot_longer`