Subsetting and Aggregations with dplyr

Silvie Cinková

2025-08-01

dplyr: Operations on data frames

  • on rows

  • on columns

Libraries

library(dplyr, warn.conflicts = FALSE, quietly = TRUE)
library(gapminder, warn.conflicts = FALSE, quietly = TRUE)
library(magrittr, warn.conflicts = FALSE, quietly = TRUE)
library(ggplot2, warn.conflicts = FALSE, quietly = TRUE) #for diamonds dataset

magrittr pipe in dplyr: %>%

René Magritte 1898-1967: Belgian surrealist fascinated by semiotics

https://commons.wikimedia.org/wiki/File:Duisburg,_Landschaftspark_Duisburg-Nord_--_2020_--_7824-6.jpg

… but the concept behind is rather the plumbing pipeline

How to use a pipe %>%

  • left is input to the right

  • . is its placeholder

c("h", "e", "l", "l", "o") %>% 
  toupper(x = .) 
[1] "H" "E" "L" "L" "O"

A series of pipes

paste0(c("H", "E", "L", "L", "O"),  collapse = "-" )
[1] "H-E-L-L-O"
c("h", "e", "l", "l", "o") %>% toupper(x = .) %>% 
  paste0(.,  collapse = "-") %>%
  paste0("Oh, ", ., "!")
[1] "Oh, H-E-L-L-O!"

gapminder (R dataset)

glimpse(gapminder)
Rows: 1,704
Columns: 6
$ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
$ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
$ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
$ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …

Subset rows with dplyr

  • filter() pick rows according to a condition

  • slice() pick rows according to their position

  • distinct() de-duplicate (pick the first/last occurring)

  • arrange() sort rows according to values of a variable

  • group_by() tell dplyr that you want the next operation to run separately on each group of rows with the same value of a categorical variable.

dplyr::filter

filter(.data = gapminder, year == 2007, country %in% c("Angola", "Bahrain")) 
# A tibble: 2 × 6
  country continent  year lifeExp      pop gdpPercap
  <fct>   <fct>     <int>   <dbl>    <int>     <dbl>
1 Angola  Africa     2007    42.7 12420476     4797.
2 Bahrain Asia       2007    75.6   708573    29796.
filter(.data = gapminder, is.na(year) | is.na(pop)) 
# A tibble: 0 × 6
# ℹ 6 variables: country <fct>, continent <fct>, year <int>, lifeExp <dbl>,
#   pop <int>, gdpPercap <dbl>

dplyr::slice

slice(.data = gapminder, c(2:3, seq(from = 1600, to = nrow(gapminder), by = 50)))
# A tibble: 5 × 6
  country        continent  year lifeExp      pop gdpPercap
  <fct>          <fct>     <int>   <dbl>    <int>     <dbl>
1 Afghanistan    Asia       1957    30.3  9240934      821.
2 Afghanistan    Asia       1962    32.0 10267083      853.
3 United Kingdom Europe     1967    71.4 54959000    14143.
4 Vietnam        Asia       1977    55.8 50533506      714.
5 Zimbabwe       Africa     1987    62.4  9216418      706.

slice with helper function n()

  • n() returns the number of rows.

  • dplyr helpers work only inside other functions

    • No arguments! n(gapminder) will not work
slice(gapminder, 1, 2, n(), n()-1 ) #n: last, 2nd last row 
# A tibble: 4 × 6
  country     continent  year lifeExp      pop gdpPercap
  <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
1 Afghanistan Asia       1952    28.8  8425333      779.
2 Afghanistan Asia       1957    30.3  9240934      821.
3 Zimbabwe    Africa     2007    43.5 12311143      470.
4 Zimbabwe    Africa     2002    40.0 11926563      672.

slice_head, slice_tail

slice_head(gapminder, n = 2)
# A tibble: 2 × 6
  country     continent  year lifeExp     pop gdpPercap
  <fct>       <fct>     <int>   <dbl>   <int>     <dbl>
1 Afghanistan Asia       1952    28.8 8425333      779.
2 Afghanistan Asia       1957    30.3 9240934      821.
dplyr::slice_tail(gapminder, n = 2)
# A tibble: 2 × 6
  country  continent  year lifeExp      pop gdpPercap
  <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
1 Zimbabwe Africa     2002    40.0 11926563      672.
2 Zimbabwe Africa     2007    43.5 12311143      470.

slice_sample

slice_sample(gapminder, n = 3)
# A tibble: 3 × 6
  country   continent  year lifeExp      pop gdpPercap
  <fct>     <fct>     <int>   <dbl>    <int>     <dbl>
1 Argentina Americas   1992    71.9 33958947     9308.
2 Taiwan    Asia       1992    74.3 20686918    15216.
3 Benin     Africa     1977    49.2  3168267     1029.
slice_sample(gapminder, n = 3)
# A tibble: 3 × 6
  country  continent  year lifeExp      pop gdpPercap
  <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
1 Iran     Asia       2002    69.5 66907826     9241.
2 Vietnam  Asia       1982    58.8 56142181      707.
3 Thailand Asia       1992    67.3 56667095     4617.

slice_max, slice_min

slice_max(gapminder, order_by = lifeExp, n = 2, with_ties = TRUE)
# A tibble: 2 × 6
  country          continent  year lifeExp       pop gdpPercap
  <fct>            <fct>     <int>   <dbl>     <int>     <dbl>
1 Japan            Asia       2007    82.6 127467972    31656.
2 Hong Kong, China Asia       2007    82.2   6980412    39725.
slice_min(gapminder, order_by = lifeExp, n = 2, with_ties = TRUE)
# A tibble: 2 × 6
  country     continent  year lifeExp     pop gdpPercap
  <fct>       <fct>     <int>   <dbl>   <int>     <dbl>
1 Rwanda      Africa     1992    23.6 7290203      737.
2 Afghanistan Asia       1952    28.8 8425333      779.

arrange

arrange(gapminder, year) %>% slice(., 1, 2, n(), n()-1 )
# A tibble: 4 × 6
  country     continent  year lifeExp      pop gdpPercap
  <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
1 Afghanistan Asia       1952    28.8  8425333      779.
2 Albania     Europe     1952    55.2  1282697     1601.
3 Zimbabwe    Africa     2007    43.5 12311143      470.
4 Zambia      Africa     2007    42.4 11746035     1271.
gapminder %>% arrange(.data = ., year) %>% slice(., 1, 2, n(), n()-1 )
# A tibble: 4 × 6
  country     continent  year lifeExp      pop gdpPercap
  <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
1 Afghanistan Asia       1952    28.8  8425333      779.
2 Albania     Europe     1952    55.2  1282697     1601.
3 Zimbabwe    Africa     2007    43.5 12311143      470.
4 Zambia      Africa     2007    42.4 11746035     1271.

distinct

gapminder %>% distinct(continent)
# A tibble: 5 × 1
  continent
  <fct>    
1 Asia     
2 Europe   
3 Africa   
4 Americas 
5 Oceania  

Add rows to data frame

  • Row is never a vector!
ad2002 <- gapminder %>% filter(country %in% c("Albania", "Denmark"), year == 2002)
ad2007 <- gapminder %>% filter(country %in% c("Albania", "Denmark"), year == 2007)
bind_rows(ad2002, ad2007) 
# A tibble: 4 × 6
  country continent  year lifeExp     pop gdpPercap
  <fct>   <fct>     <int>   <dbl>   <int>     <dbl>
1 Albania Europe     2002    75.7 3508512     4604.
2 Denmark Europe     2002    77.2 5374693    32167.
3 Albania Europe     2007    76.4 3600523     5937.
4 Denmark Europe     2007    78.3 5468120    35278.

Subset columns: select

gapminder %>% select(c(country, pop, 6)) %>% colnames()
[1] "country"   "pop"       "gdpPercap"
gapminder %>% select(!c(continent, lifeExp, gdpPercap)) %>% colnames()
[1] "country" "year"    "pop"    
gapminder %>% select(TIME = year, country, population = pop) %>% colnames()
[1] "TIME"       "country"    "population"

rename and relocate

  • keep all columns even if you do not mention them
gapminder %>% rename(COUNTRY = country) %>% colnames()
[1] "COUNTRY"   "continent" "year"      "lifeExp"   "pop"       "gdpPercap"
gapminder %>% relocate(gdpPercap, pop) %>% colnames()
[1] "gdpPercap" "pop"       "country"   "continent" "year"      "lifeExp"  

Helpers select columns by names

  • contains, matches, starts_with, ends_with
gapminder %>% select(contains("co")) %>% colnames()
[1] "country"   "continent"
gapminder %>% select(matches("ou?n")) %>% colnames()
[1] "country"   "continent"

Helpers select columns by position

  • everything, last_col
gapminder %>% select(c(pop, last_col(), everything())) %>% colnames()
[1] "pop"       "gdpPercap" "country"   "continent" "year"      "lifeExp"  

Helpers select columns by classes or values

  • where
gapminder %>% select(c(where(~ !is.numeric(.x)))) %>% colnames()
[1] "country"   "continent"
gapminder %>% slice_head(n = 2) %>% 
  select(where(~ is.numeric(.x))) %>%
  select(where(~ max(.x) > 100))
# A tibble: 2 × 3
   year     pop gdpPercap
  <int>   <int>     <dbl>
1  1952 8425333      779.
2  1957 9240934      821.

Function/formula notation

  • on column values - typically you check vector class

  • Formula notation: ~ somefunction(.x, …)

gapminder %>% slice_head(n = 2) %>% 
  select(where(is.numeric))
# A tibble: 2 × 4
   year lifeExp     pop gdpPercap
  <int>   <dbl>   <int>     <dbl>
1  1952    28.8 8425333      779.
2  1957    30.3 9240934      821.
# short for this formula notation
gapminder %>% slice_head(n = 2) %>% 
  select(where(~ is.numeric(.x)))
# A tibble: 2 × 4
   year lifeExp     pop gdpPercap
  <int>   <dbl>   <int>     <dbl>
1  1952    28.8 8425333      779.
2  1957    30.3 9240934      821.

rather just for fun: feed it almost any function that works for all columns (no error)

gapminder %>% slice_head(n = 2) %>% 
  select(where(~ is.numeric(.x))) %>%
  select(where(~ max(.x) > 100))
# A tibble: 2 × 3
   year     pop gdpPercap
  <int>   <int>     <dbl>
1  1952 8425333      779.
2  1957 9240934      821.

Add new columns with bind_cols

  • add vectors or data frames
new_df <- select(gapminder, c(new_pop = pop, new_lifeExp = lifeExp))
new_vec <- gapminder$country # base R column subset
gapminder %>% bind_cols(new_df, new_country = new_vec) %>% glimpse()
Rows: 1,704
Columns: 9
$ country     <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan"…
$ continent   <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia…
$ year        <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997…
$ lifeExp     <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40…
$ pop         <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, …
$ gdpPercap   <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134…
$ new_pop     <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, …
$ new_lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40…
$ new_country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan"…

pull : extract column to create a vector

  • pull extracts one column as vector/factor
gapminder %>% select(continent) %>% 
  pull() %>% str()
 Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
gapminder %>% distinct(continent) %>% pull() %>% levels()
[1] "Africa"   "Americas" "Asia"     "Europe"   "Oceania" 

Aggregation with summarize

  • categorical variables: count

  • numeric variables: mean, median, etc.

gapminder %>% filter(year == 2007, 
                     continent == "Europe") %>%
  summarize(mean_pop = mean(pop),  
            mean_lifeExp = round(mean(lifeExp)))
# A tibble: 1 × 2
   mean_pop mean_lifeExp
      <dbl>        <dbl>
1 19536618.           78

grouped aggregations (group_by)

gap2000 <- gapminder %>% filter(year > 1997) %>% 
  group_by(continent, year) %>% 
  summarize(mean_pop = mean(pop), 
            mean_lifeExp = mean(lifeExp) )
gap2000
# A tibble: 10 × 4
# Groups:   continent [5]
   continent  year   mean_pop mean_lifeExp
   <fct>     <int>      <dbl>        <dbl>
 1 Africa     2002  16033152.         53.3
 2 Africa     2007  17875763.         54.8
 3 Americas   2002  33990910.         72.4
 4 Americas   2007  35954847.         73.6
 5 Asia       2002 109145521.         69.2
 6 Asia       2007 115513752.         70.7
 7 Europe     2002  19274129.         76.7
 8 Europe     2007  19536618.         77.6
 9 Oceania    2002  11727414.         79.7
10 Oceania    2007  12274974.         80.7

grouped data frame

groups(gap2000)
[[1]]
continent

gap2000 <- gap2000 %>% ungroup()
groups(gap2000)
list()