library(ggplot2)
library(dplyr)
library(nycflights13)

These notes are based on the following introduction to dplyr vignette.

For a more thorough discussion, you can look at the Data transformation chapter of R for Data Science

The dplyr and tidyr cheatsheet is another fantastic reference.

Basics of dplyr

The dplyr package introduces 5 basic verbs that help to streamline the data manipulation process.

It also has several other functions such as slice(), rename(), transmute(), sample_n() and sample_frac(), all of which you may find useful.

Exploring the nycflights13 data

We’ll illustrate the basics of dplyr using the flights data. This dataset contains information on 336776 that departed from New York City in 2013.

head(flights)
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
summary(flights)
##       year          month             day           dep_time   
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400  
##                                                  NA's   :8255  
##  sched_dep_time   dep_delay          arr_time    sched_arr_time
##  Min.   : 106   Min.   : -43.00   Min.   :   1   Min.   :   1  
##  1st Qu.: 906   1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124  
##  Median :1359   Median :  -2.00   Median :1535   Median :1556  
##  Mean   :1344   Mean   :  12.64   Mean   :1502   Mean   :1536  
##  3rd Qu.:1729   3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945  
##  Max.   :2359   Max.   :1301.00   Max.   :2400   Max.   :2359  
##                 NA's   :8255      NA's   :8713                 
##    arr_delay          carrier              flight       tailnum         
##  Min.   : -86.000   Length:336776      Min.   :   1   Length:336776     
##  1st Qu.: -17.000   Class :character   1st Qu.: 553   Class :character  
##  Median :  -5.000   Mode  :character   Median :1496   Mode  :character  
##  Mean   :   6.895                      Mean   :1972                     
##  3rd Qu.:  14.000                      3rd Qu.:3465                     
##  Max.   :1272.000                      Max.   :8500                     
##  NA's   :9430                                                           
##     origin              dest              air_time        distance   
##  Length:336776      Length:336776      Min.   : 20.0   Min.   :  17  
##  Class :character   Class :character   1st Qu.: 82.0   1st Qu.: 502  
##  Mode  :character   Mode  :character   Median :129.0   Median : 872  
##                                        Mean   :150.7   Mean   :1040  
##                                        3rd Qu.:192.0   3rd Qu.:1389  
##                                        Max.   :695.0   Max.   :4983  
##                                        NA's   :9430                  
##       hour           minute        time_hour                  
##  Min.   : 1.00   Min.   : 0.00   Min.   :2013-01-01 05:00:00  
##  1st Qu.: 9.00   1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00  
##  Median :13.00   Median :29.00   Median :2013-07-03 10:00:00  
##  Mean   :13.18   Mean   :26.23   Mean   :2013-07-03 05:02:36  
##  3rd Qu.:17.00   3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00  
##  Max.   :23.00   Max.   :59.00   Max.   :2013-12-31 23:00:00  
## 

Data subsets with filter()

filter() allows you to select a subset of rows in a data frame. The first argument is the name of the data frame. The second and subsequent arguments are the expressions that filter the data frame:

Let’s look at all the flights that departed on January 1st and where the departure time was delayed by at least 15 minutes.

filter(flights, 
       month == 1, 
       day == 1,
       dep_delay >= 15)
## # A tibble: 163 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      632            608        24      740
## 2   2013     1     1      732            645        47     1011
## 3   2013     1     1      749            710        39      939
## 4   2013     1     1      811            630       101     1047
## 5   2013     1     1      826            715        71     1136
## 6   2013     1     1      848           1835       853     1001
## 7   2013     1     1      903            820        43     1045
## 8   2013     1     1      906            843        23     1134
## 9   2013     1     1      909            810        59     1331
## 10  2013     1     1      920            905        15     1039
## # ... with 153 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

How does this compare to other syntax we’ve learned about?

# This gets clunky fast...
flights[flights$month == 1 & flights$day == 1 & flights$dep_delay >= 15, ]
## # A tibble: 167 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      632            608        24      740
## 2   2013     1     1      732            645        47     1011
## 3   2013     1     1      749            710        39      939
## 4   2013     1     1      811            630       101     1047
## 5   2013     1     1      826            715        71     1136
## 6   2013     1     1      848           1835       853     1001
## 7   2013     1     1      903            820        43     1045
## 8   2013     1     1      906            843        23     1134
## 9   2013     1     1      909            810        59     1331
## 10  2013     1     1      920            905        15     1039
## # ... with 157 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

The subset() syntax is better.

subset(flights, month == 1 & day == 1 & dep_delay >= 15)
## # A tibble: 163 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      632            608        24      740
## 2   2013     1     1      732            645        47     1011
## 3   2013     1     1      749            710        39      939
## 4   2013     1     1      811            630       101     1047
## 5   2013     1     1      826            715        71     1136
## 6   2013     1     1      848           1835       853     1001
## 7   2013     1     1      903            820        43     1045
## 8   2013     1     1      906            843        23     1134
## 9   2013     1     1      909            810        59     1331
## 10  2013     1     1      920            905        15     1039
## # ... with 153 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

But the dplyr syntax is more flexible, allowing you to either use full Boolean expressions, or use , to indicate “and”.

filter(flights, month == 1 & day == 1 & dep_delay >= 15)
## # A tibble: 163 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      632            608        24      740
## 2   2013     1     1      732            645        47     1011
## 3   2013     1     1      749            710        39      939
## 4   2013     1     1      811            630       101     1047
## 5   2013     1     1      826            715        71     1136
## 6   2013     1     1      848           1835       853     1001
## 7   2013     1     1      903            820        43     1045
## 8   2013     1     1      906            843        23     1134
## 9   2013     1     1      909            810        59     1331
## 10  2013     1     1      920            905        15     1039
## # ... with 153 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Rearrange rows with arrange()

You can think of arrange() as a “sort by” operation. This function takes a data frame and a set of column names by which to order the data. Later columns are used to break ties (i.e., order within) earlier columns.

Here’s an example that arranges the data in order of departure date.

arrange(flights, year, month, day)
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      517            515         2      830
## 2   2013     1     1      533            529         4      850
## 3   2013     1     1      542            540         2      923
## 4   2013     1     1      544            545        -1     1004
## 5   2013     1     1      554            600        -6      812
## 6   2013     1     1      554            558        -4      740
## 7   2013     1     1      555            600        -5      913
## 8   2013     1     1      557            600        -3      709
## 9   2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

You can also add expressions to the arrange() command. For instance, if you wanted to sort the flights in descending order of departure delay, you could use the desc() command:

arrange(flights, desc(dep_delay))
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     9      641            900      1301     1242
## 2   2013     6    15     1432           1935      1137     1607
## 3   2013     1    10     1121           1635      1126     1239
## 4   2013     9    20     1139           1845      1014     1457
## 5   2013     7    22      845           1600      1005     1044
## 6   2013     4    10     1100           1900       960     1342
## 7   2013     3    17     2321            810       911      135
## 8   2013     6    27      959           1900       899     1236
## 9   2013     7    22     2257            759       898      121
## 10  2013    12     5      756           1700       896     1058
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Select columns with select()

The select() function can be thought of as a substitute for the select = argument in a subset() command. One notable difference is the more flexible syntax offered by select().

# Select columns by name
select(flights, year, month, day)
## # A tibble: 336,776 × 3
##     year month   day
##    <int> <int> <int>
## 1   2013     1     1
## 2   2013     1     1
## 3   2013     1     1
## 4   2013     1     1
## 5   2013     1     1
## 6   2013     1     1
## 7   2013     1     1
## 8   2013     1     1
## 9   2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows
# Select all columns between year and day (inclusive)
select(flights, year:day)
## # A tibble: 336,776 × 3
##     year month   day
##    <int> <int> <int>
## 1   2013     1     1
## 2   2013     1     1
## 3   2013     1     1
## 4   2013     1     1
## 5   2013     1     1
## 6   2013     1     1
## 7   2013     1     1
## 8   2013     1     1
## 9   2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows
# Select all columns except those from year to day (inclusive)
select(flights, -(year:day))
## # A tibble: 336,776 × 16
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
##       <int>          <int>     <dbl>    <int>          <int>     <dbl>
## 1       517            515         2      830            819        11
## 2       533            529         4      850            830        20
## 3       542            540         2      923            850        33
## 4       544            545        -1     1004           1022       -18
## 5       554            600        -6      812            837       -25
## 6       554            558        -4      740            728        12
## 7       555            600        -5      913            854        19
## 8       557            600        -3      709            723       -14
## 9       557            600        -3      838            846        -8
## 10      558            600        -2      753            745         8
## # ... with 336,766 more rows, and 10 more variables: carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

You can use helper functions such as starts_with(), ends_with(), matches() and contains() as part of your select call.

  • starts_with("abc"): matches names that begin with “abc”.

  • ends_with("xyz"): matches names that end with “xyz”.

  • contains("ijk"): matches names that contain “ijk”.

  • matches("(.)\\1"): selects variables that match a regular expression. This one matches any variables that contain repeated characters. You’ll learn more about regular expressions in strings.

  • num_range("x", 1:3) matches x1, x2 and x3.

# Pull all of the departure-related columns
select(flights, contains("dep"))
## # A tibble: 336,776 × 3
##    dep_time sched_dep_time dep_delay
##       <int>          <int>     <dbl>
## 1       517            515         2
## 2       533            529         4
## 3       542            540         2
## 4       544            545        -1
## 5       554            600        -6
## 6       554            558        -4
## 7       555            600        -5
## 8       557            600        -3
## 9       557            600        -3
## 10      558            600        -2
## # ... with 336,766 more rows
# Pull all of the arrival and departure related columns
select(flights, 
       contains("dep"),
       contains("arr"))
## # A tibble: 336,776 × 7
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
##       <int>          <int>     <dbl>    <int>          <int>     <dbl>
## 1       517            515         2      830            819        11
## 2       533            529         4      850            830        20
## 3       542            540         2      923            850        33
## 4       544            545        -1     1004           1022       -18
## 5       554            600        -6      812            837       -25
## 6       554            558        -4      740            728        12
## 7       555            600        -5      913            854        19
## 8       557            600        -3      709            723       -14
## 9       557            600        -3      838            846        -8
## 10      558            600        -2      753            745         8
## # ... with 336,766 more rows, and 1 more variables: carrier <chr>

Add new columns with mutate()

We already saw the mutate() command in action previously. This function replaces an order base-R function called transform().

# Calculate delay reduction in travel (gain) and average speed
mutate(flights,
  gain = arr_delay - dep_delay,
  speed = distance / air_time * 60)
## # A tibble: 336,776 × 21
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      517            515         2      830
## 2   2013     1     1      533            529         4      850
## 3   2013     1     1      542            540         2      923
## 4   2013     1     1      544            545        -1     1004
## 5   2013     1     1      554            600        -6      812
## 6   2013     1     1      554            558        -4      740
## 7   2013     1     1      555            600        -5      913
## 8   2013     1     1      557            600        -3      709
## 9   2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 14 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, gain <dbl>, speed <dbl>

An interesting thing that you can do with mutate() but not transform() is to create columns based on transformations of new columns that you just created within the same command. Here’s an example.

mutate(flights,
  gain = arr_delay - dep_delay,
  gain_per_hour = gain / (air_time / 60)
)
## # A tibble: 336,776 × 21
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      517            515         2      830
## 2   2013     1     1      533            529         4      850
## 3   2013     1     1      542            540         2      923
## 4   2013     1     1      544            545        -1     1004
## 5   2013     1     1      554            600        -6      812
## 6   2013     1     1      554            558        -4      740
## 7   2013     1     1      555            600        -5      913
## 8   2013     1     1      557            600        -3      709
## 9   2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 14 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, gain <dbl>, gain_per_hour <dbl>

Here’s what would happen if we tried doing the same thing with the transform() command:

transform(flights,
  gain = arr_delay - dep_delay,
  gain_per_hour = gain / (air_time / 60)
)
# Error in eval(expr, envir, enclos) : object 'gain' not found

transmute()

If all you want to keep from the mutate() are the newly formed variables, you can either chain together a mutate() with a select(), or you can directly use the transmute() command.

transmute(flights,
  gain = arr_delay - dep_delay,
  gain_per_hour = gain / (air_time / 60)
)
## # A tibble: 336,776 × 2
##     gain gain_per_hour
##    <dbl>         <dbl>
## 1      9      2.378855
## 2     16      4.229075
## 3     31     11.625000
## 4    -17     -5.573770
## 5    -19     -9.827586
## 6     16      6.400000
## 7     24      9.113924
## 8    -11    -12.452830
## 9     -5     -2.142857
## 10    10      4.347826
## # ... with 336,766 more rows

Summary tables with summarise()

You can think of summarise() as performing a similar operation to the plyr::ddply() function. On its own, summarise() just returns a 1-line summary data frame.

summarise(flights, 
          mean_dep_delay = mean(dep_delay, na.rm = TRUE), 
          mean_arr_delay = mean(arr_delay, na.rm = TRUE)
          )
## # A tibble: 1 × 2
##   mean_dep_delay mean_arr_delay
##            <dbl>          <dbl>
## 1       12.63907       6.895377

summarise_at()

You can also summarize multiple variables with multiple functions using the related summarize_at() function, which has the syntax:

summarise_at(data, vars(...), funs(...))

Here’s an example

summarise_at(flights,
             vars(dep_delay, arr_delay),
             funs(mean, median, sd),
             na.rm = TRUE)
## # A tibble: 1 × 6
##   dep_delay_mean arr_delay_mean dep_delay_median arr_delay_median
##            <dbl>          <dbl>            <dbl>            <dbl>
## 1       12.63907       6.895377               -2               -5
## # ... with 2 more variables: dep_delay_sd <dbl>, arr_delay_sd <dbl>
# Notice how the na.rm argument is supplied to summarise_at(),
# but it is actually used by the mean, median and sd functions.

Using group_by()

To obtain summaries within some grouping scheme, you can use the group_by() command followed by summarise().

Here we’ll illustrate how this approach can be used to better understand the association between arrival delays and distance traveled.

# Form a summary table showing the number of flights,
# average distance, and arrival delay for each airplane

by_tailnum <- group_by(flights, tailnum)
delay <- summarise(by_tailnum,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE))

# Subset the data to only include frequently flown planes
# and distances < 3000 
delay <- filter(delay, count > 20, dist < 3000)

# Plot
ggplot(delay, aes(dist, delay)) +
  geom_point(aes(size = count), alpha = 1/2) +
  geom_smooth() +
  scale_size_area()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

Handy summary functions

In addition to functions such as min(), max(), …, median() etc., you can also use the following, which are enabled by the dplyr library:

  • n(): the number of observations in the current group

  • n_distinct(x) :the number of unique values in x.

  • first(x), last(x) and nth(x, n) - these work similarly to x[1], x[length(x)], and x[n] but give you more control over the result if the value is missing.

You can use these functions to, for instance, count the number of planes and number of flights for each possible destination:

destinations <- group_by(flights, dest)

summarise(destinations,
  planes = n_distinct(tailnum),
  flights = n()
)
## # A tibble: 105 × 3
##     dest planes flights
##    <chr>  <int>   <int>
## 1    ABQ    108     254
## 2    ACK     58     265
## 3    ALB    172     439
## 4    ANC      6       8
## 5    ATL   1180   17215
## 6    AUS    993    2439
## 7    AVL    159     275
## 8    BDL    186     443
## 9    BGR     46     375
## 10   BHM     45     297
## # ... with 95 more rows

Successive summaries

When you group by multiple variables, each summary peels off one level of the grouping. That makes it easy to progressively roll-up a dataset:

daily <- group_by(flights, year, month, day)

# Tabulate number of flights on each day
per_day   <- summarise(daily, flights = n())
per_day
## Source: local data frame [365 x 4]
## Groups: year, month [?]
## 
##     year month   day flights
##    <int> <int> <int>   <int>
## 1   2013     1     1     842
## 2   2013     1     2     943
## 3   2013     1     3     914
## 4   2013     1     4     915
## 5   2013     1     5     720
## 6   2013     1     6     832
## 7   2013     1     7     933
## 8   2013     1     8     899
## 9   2013     1     9     902
## 10  2013     1    10     932
## # ... with 355 more rows
# Tabulate number of flights on each month
per_month <- summarise(per_day, flights = sum(flights))
per_month
## Source: local data frame [12 x 3]
## Groups: year [?]
## 
##     year month flights
##    <int> <int>   <int>
## 1   2013     1   27004
## 2   2013     2   24951
## 3   2013     3   28834
## 4   2013     4   28330
## 5   2013     5   28796
## 6   2013     6   28243
## 7   2013     7   29425
## 8   2013     8   29327
## 9   2013     9   27574
## 10  2013    10   28889
## 11  2013    11   27268
## 12  2013    12   28135
# Total number of flights that year
per_year  <- summarise(per_month, flights = sum(flights))
per_year
## # A tibble: 1 × 2
##    year flights
##   <int>   <int>
## 1  2013  336776

distinct()

distinct() allows you to identify the unique values of variables (or combinations of variables) in your data.

# How many different planes departed from NYC airports
# in 2013?
distinct(flights, tailnum)
## # A tibble: 4,044 × 1
##    tailnum
##      <chr>
## 1   N14228
## 2   N24211
## 3   N619AA
## 4   N804JB
## 5   N668DN
## 6   N39463
## 7   N516JB
## 8   N829AS
## 9   N593JB
## 10  N3ALAA
## # ... with 4,034 more rows
# How many distinct (origin, dest) pairs were there?
distinct(flights, origin, dest)
## # A tibble: 224 × 2
##    origin  dest
##     <chr> <chr>
## 1     EWR   IAH
## 2     LGA   IAH
## 3     JFK   MIA
## 4     JFK   BQN
## 5     LGA   ATL
## 6     EWR   ORD
## 7     EWR   FLL
## 8     LGA   IAD
## 9     JFK   MCO
## 10    LGA   ORD
## # ... with 214 more rows

rename()

We’ve done a lot of variable renaming in this class. In most of the cases we’ve renamed all of the columns all at once. If we want to change only a few column names, this can get frustrating. rename() addresses precisely this issue.

rename(flights, 
       yr = year,
       dep.time = dep_time)
## # A tibble: 336,776 × 19
##       yr month   day dep.time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      517            515         2      830
## 2   2013     1     1      533            529         4      850
## 3   2013     1     1      542            540         2      923
## 4   2013     1     1      544            545        -1     1004
## 5   2013     1     1      554            600        -6      812
## 6   2013     1     1      554            558        -4      740
## 7   2013     1     1      555            600        -5      913
## 8   2013     1     1      557            600        -3      709
## 9   2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Piping (chaining)

In this section we’ll introduce the %>% (“pipe”) command, which you’ll quickly find indispensible when chaining together multiple operations.

To illustrate a use case, suppose we wanted to do some grouping, sub-setting, summarizing, and then further filtering of the summary. For instance, we might be interested in identifying days in 2013 where the average arrival or departure delay was especially high.

Here’s one approach.

# Group by day of the year
a1 <- group_by(flights, year, month, day)

# Select just the arrival and departure delay columns
a2 <- select(a1, arr_delay, dep_delay)
## Adding missing grouping variables: `year`, `month`, `day`
# Calculate average delays
a3 <- summarise(a2,
  mean_arr_delay = mean(arr_delay, na.rm = TRUE),
  mean_dep_delay = mean(dep_delay, na.rm = TRUE))

# Filter to the days where the average delay was at least 30 mins
a4 <- filter(a3, mean_arr_delay > 30 | mean_dep_delay > 30)

Here’s another approach, which wraps all of the functions together to avoid having to create intermediate variables (a1, a2 and a3) during the computation.

filter(
  summarise(
    select(
      group_by(flights, year, month, day),
      arr_delay, dep_delay
    ),
    mean_arr_delay = mean(arr_delay, na.rm = TRUE),
    mean_dep_delay = mean(dep_delay, na.rm = TRUE)
  ),
  mean_arr_delay > 30 | mean_dep_delay > 30
)
## Adding missing grouping variables: `year`, `month`, `day`
## Source: local data frame [49 x 5]
## Groups: year, month [11]
## 
##     year month   day mean_arr_delay mean_dep_delay
##    <int> <int> <int>          <dbl>          <dbl>
## 1   2013     1    16       34.24736       24.61287
## 2   2013     1    31       32.60285       28.65836
## 3   2013     2    11       36.29009       39.07360
## 4   2013     2    27       31.25249       37.76327
## 5   2013     3     8       85.86216       83.53692
## 6   2013     3    18       41.29189       30.11796
## 7   2013     4    10       38.41231       33.02368
## 8   2013     4    12       36.04814       34.83843
## 9   2013     4    18       36.02848       34.91536
## 10  2013     4    19       47.91170       46.12783
## # ... with 39 more rows

While this performs the exact same operation, it’s nearly impossible to read. This is largely due to the fact that you have to parse the operation from the inside out, rather than left-to-right or top-to-bottom.

A much better approach is to use %>%, which is automatically loaded when you load dplyr. Essentially, given a function f(x, y), x %>% f(y) is interpreted as f(x, y). This allows us to chain operations together using much more readable syntax.

flights %>%
  group_by(year, month, day) %>%
  select(arr_delay, dep_delay) %>%
  summarise(
    mean_arr_delay = mean(arr_delay, na.rm = TRUE),
    mean_dep_delay = mean(dep_delay, na.rm = TRUE)
  ) %>%
  filter(mean_arr_delay > 30 | mean_dep_delay > 30)
## Adding missing grouping variables: `year`, `month`, `day`
## Source: local data frame [49 x 5]
## Groups: year, month [11]
## 
##     year month   day mean_arr_delay mean_dep_delay
##    <int> <int> <int>          <dbl>          <dbl>
## 1   2013     1    16       34.24736       24.61287
## 2   2013     1    31       32.60285       28.65836
## 3   2013     2    11       36.29009       39.07360
## 4   2013     2    27       31.25249       37.76327
## 5   2013     3     8       85.86216       83.53692
## 6   2013     3    18       41.29189       30.11796
## 7   2013     4    10       38.41231       33.02368
## 8   2013     4    12       36.04814       34.83843
## 9   2013     4    18       36.02848       34.91536
## 10  2013     4    19       47.91170       46.12783
## # ... with 39 more rows

Example: delay gain per hour

gain.df <- flights %>% 
  mutate(gain = dep_delay - arr_delay,
         gain_per_hour = gain / (air_time / 60)) %>%
  group_by(tailnum) %>%
  summarise(count = n(),
            av_gain = mean(gain_per_hour, na.rm = TRUE),
            av_dep_delay = mean(dep_delay, na.rm = TRUE),
            av_arr_delay = mean(arr_delay, na.rm = TRUE),
            av_dist = mean(distance)
  ) %>%
  filter(count > 10, av_dist < 3000)

ggplot(gain.df, aes(x = av_dist, y = av_gain, size = count)) +
  geom_point(alpha = 0.3) +
  scale_size_area() +
  geom_smooth(show.legend = FALSE)
## `geom_smooth()` using method = 'gam'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(gain.df, aes(x = av_dep_delay, y = av_gain, size = count)) +
  geom_point(alpha = 0.3) +
  scale_size_area() +
  geom_smooth(show.legend = FALSE)
## `geom_smooth()` using method = 'gam'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(gain.df, aes(x = av_arr_delay, y = av_gain, size = count)) +
  geom_point(alpha = 0.3) +
  scale_size_area() +
  geom_smooth(show.legend = FALSE)
## `geom_smooth()` using method = 'gam'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

Example: average delay time for each origin, destination

In this example we’ll pipe a summary table directly into a ggplot call.

flights %>% 
  group_by(origin) %>%
  summarise(av_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = origin, y = av_dep_delay)) + 
  geom_bar(stat = "identity") +
  ylab("Average departure delay") +
  xlab("Origin airport")

flights %>% 
  group_by(dest, origin) %>%
  summarise(av_dep_delay = mean(dep_delay, na.rm = TRUE),
            count = n()) %>%
  ungroup() %>%
  filter(origin == "EWR", count > 50) %>%
  mutate(dest = reorder(dest, av_dep_delay)) %>%
  ggplot(aes(x = dest, y = av_dep_delay, 
             size = count)) + 
  geom_point(alpha = 0.5) +
  scale_size_area() +
  ylab("Average departure delay") +
  xlab("Destination airport") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))