dplyr::inner_join(x = gapminder_europe, y = geo_europe, by =c("country"="name"))
# A tibble: 29 × 7
country lifeExp pop gdpPercap country.y main_religion_2008 income_3groups
<chr> <dbl> <int> <dbl> <chr> <chr> <chr>
1 Albania 76.4 3.60e6 5937. alb muslim middle_income
2 Austria 79.8 8.20e6 36126. aut christian high_income
3 Belgium 79.4 1.04e7 33693. bel christian high_income
4 Bosnia … 74.9 4.55e6 7446. bih <NA> middle_income
5 Bulgaria 73.0 7.32e6 10681. bgr christian middle_income
6 Croatia 75.7 4.49e6 14619. hrv christian high_income
7 Czech R… 76.5 1.02e7 22833. cze christian high_income
8 Denmark 78.3 5.47e6 35278. dnk christian high_income
9 Finland 79.3 5.24e6 33207. fin christian high_income
10 France 80.7 6.11e7 30470. fra christian high_income
# ℹ 19 more rows
Keep gapminder intact, no matter what.
One country gets NA in geo_europe columns.
dplyr::left_join(x = gapminder_europe, y = geo_europe,by =c("country"="name"))
# A tibble: 30 × 7
country lifeExp pop gdpPercap country.y main_religion_2008 income_3groups
<chr> <dbl> <int> <dbl> <chr> <chr> <chr>
1 Albania 76.4 3.60e6 5937. alb muslim middle_income
2 Austria 79.8 8.20e6 36126. aut christian high_income
3 Belgium 79.4 1.04e7 33693. bel christian high_income
4 Bosnia … 74.9 4.55e6 7446. bih <NA> middle_income
5 Bulgaria 73.0 7.32e6 10681. bgr christian middle_income
6 Croatia 75.7 4.49e6 14619. hrv christian high_income
7 Czech R… 76.5 1.02e7 22833. cze christian high_income
8 Denmark 78.3 5.47e6 35278. dnk christian high_income
9 Finland 79.3 5.24e6 33207. fin christian high_income
10 France 80.7 6.11e7 30470. fra christian high_income
# ℹ 20 more rows
Which is missing?
country mismatch \(\rightarrow\)is.na(country.y)
dplyr::left_join(x = gapminder_europe, y = geo_europe,by =c("country"="name")) %>%filter(is.na(country.y))
# A tibble: 1 × 7
country lifeExp pop gdpPercap country.y main_religion_2008 income_3groups
<chr> <dbl> <int> <dbl> <chr> <chr> <chr>
1 United K… 79.4 6.08e7 33203. <NA> <NA> <NA>
Focus on geo_europe
dplyr::right_join(x = gapminder_europe, y = geo_europe,by =c("country"="name"))
# A tibble: 73 × 7
country lifeExp pop gdpPercap country.y main_religion_2008 income_3groups
<chr> <dbl> <int> <dbl> <chr> <chr> <chr>
1 Albania 76.4 3.60e6 5937. alb muslim middle_income
2 Austria 79.8 8.20e6 36126. aut christian high_income
3 Belgium 79.4 1.04e7 33693. bel christian high_income
4 Bosnia … 74.9 4.55e6 7446. bih <NA> middle_income
5 Bulgaria 73.0 7.32e6 10681. bgr christian middle_income
6 Croatia 75.7 4.49e6 14619. hrv christian high_income
7 Czech R… 76.5 1.02e7 22833. cze christian high_income
8 Denmark 78.3 5.47e6 35278. dnk christian high_income
9 Finland 79.3 5.24e6 33207. fin christian high_income
10 France 80.7 6.11e7 30470. fra christian high_income
# ℹ 63 more rows
Mismatches in geo_europe
dplyr::right_join(x = gapminder_europe, y = geo_europe,by =c("country"="name")) %>%filter(is.na(lifeExp) |is.na(pop) |is.na(gdpPercap))
# A tibble: 44 × 7
country lifeExp pop gdpPercap country.y main_religion_2008 income_3groups
<chr> <dbl> <int> <dbl> <chr> <chr> <chr>
1 Abkhazia NA NA NA abkh <NA> <NA>
2 Akrotiri… NA NA NA akr_a_dhe <NA> <NA>
3 Åland NA NA NA ala <NA> <NA>
4 Andorra NA NA NA and christian high_income
5 Armenia NA NA NA arm christian middle_income
6 Antarcti… NA NA NA ata <NA> <NA>
7 Azerbaij… NA NA NA aze muslim middle_income
8 Belarus NA NA NA blr christian middle_income
9 Channel … NA NA NA chanisl christian high_income
10 Czechosl… NA NA NA cheslo <NA> <NA>
# ℹ 34 more rows
Get what you can
dplyr::full_join(x = gapminder_europe, y = geo_europe,by =c("country"="name"))
# A tibble: 74 × 7
country lifeExp pop gdpPercap country.y main_religion_2008 income_3groups
<chr> <dbl> <int> <dbl> <chr> <chr> <chr>
1 Albania 76.4 3.60e6 5937. alb muslim middle_income
2 Austria 79.8 8.20e6 36126. aut christian high_income
3 Belgium 79.4 1.04e7 33693. bel christian high_income
4 Bosnia … 74.9 4.55e6 7446. bih <NA> middle_income
5 Bulgaria 73.0 7.32e6 10681. bgr christian middle_income
6 Croatia 75.7 4.49e6 14619. hrv christian high_income
7 Czech R… 76.5 1.02e7 22833. cze christian high_income
8 Denmark 78.3 5.47e6 35278. dnk christian high_income
9 Finland 79.3 5.24e6 33207. fin christian high_income
10 France 80.7 6.11e7 30470. fra christian high_income
# ℹ 64 more rows
anti_join detects mismatches instantly
like setdiff in vectors
anti_join(x = gapminder_europe, y = geo_europe,by =c("country"="name"))
# A tibble: 1 × 4
country lifeExp pop gdpPercap
<fct> <dbl> <int> <dbl>
1 United Kingdom 79.4 60776238 33203.
anti_join the other way round
anti_join( x = geo_europe, y = gapminder_europe,by =c( "name"="country"))
# A tibble: 44 × 4
country name main_religion_2008 income_3groups
<chr> <chr> <chr> <chr>
1 abkh Abkhazia <NA> <NA>
2 akr_a_dhe Akrotiri and Dhekelia <NA> <NA>
3 ala Åland <NA> <NA>
4 and Andorra christian high_income
5 arm Armenia christian middle_income
6 ata Antarctica <NA> <NA>
7 aze Azerbaijan muslim middle_income
8 blr Belarus christian middle_income
9 chanisl Channel Islands christian high_income
10 cheslo Czechoslovakia <NA> <NA>
# ℹ 34 more rows
semi_join detects matches
semi_join( x = geo_europe, y = gapminder_europe,by =c( "name"="country"))
# A tibble: 29 × 4
country name main_religion_2008 income_3groups
<chr> <chr> <chr> <chr>
1 alb Albania muslim middle_income
2 aut Austria christian high_income
3 bel Belgium christian high_income
4 bgr Bulgaria christian middle_income
5 bih Bosnia and Herzegovina <NA> middle_income
6 che Switzerland christian high_income
7 cze Czech Republic christian high_income
8 deu Germany christian high_income
9 dnk Denmark christian high_income
10 esp Spain christian high_income
# ℹ 19 more rows
When your observations are not unique
The two tibbles: math
maths
# A tibble: 4 × 3
name birthplace math_test
<chr> <chr> <dbl>
1 John Smith Honolulu 72
2 Mary Brown Milan 40
3 John Smith Prague 25
4 Helene Field Beijing 91
The two tibbles: social sciences
social_sciences
# A tibble: 4 × 3
name birthplace soc_test
<chr> <chr> <dbl>
1 Mary Brown Milan 12
2 John Smith Honolulu 5
3 John Smith Prague 76
4 Helene Field Beijing 49
Possible rescue: unique by several columns
left_join(maths, social_sciences, by = (c("name", "birthplace")))
# A tibble: 4 × 4
name birthplace math_test soc_test
<chr> <chr> <dbl> <dbl>
1 John Smith Honolulu 72 5
2 Mary Brown Milan 40 12
3 John Smith Prague 25 76
4 Helene Field Beijing 91 49
No chance to join
If you cannot find anything that makes them unique.
maths2 <-select(maths, -birthplace)social_sciences2 <-select(social_sciences, !birthplace)left_join(maths2, social_sciences2, by ="name")
Warning in left_join(maths2, social_sciences2, by = "name"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 2 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
"many-to-many"` to silence this warning.
# A tibble: 6 × 3
name math_test soc_test
<chr> <dbl> <dbl>
1 John Smith 72 5
2 John Smith 72 76
3 Mary Brown 40 12
4 John Smith 25 5
5 John Smith 25 76
6 Helene Field 91 49
dplyr::join help
explore the arguments
relationship
multiple
unmatched
Data with typos in the key column(s)
libraries fuzzyjoin along with stringdist (used by fuzzyjoin)
DataCamp course Intermediate Regular Expressions in R > Similarities Between Strings
JRC Names
Steinberger Ralf, Bruno Pouliquen, Mijail Kabadjov, Jenya Belyaeva & Erik van der Goot (2011).JRC-Names: A freely available, highly multilingual named entity resource. Proceedings of the 8th International Conference Recent Advances in Natural Language Processing (RANLP). Hissar, Bulgaria, 12-14 September 2011.