https://r4ds.had.co.nz/

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.6.1

## -- Attaching packages --------------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.3.0     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## Warning: package 'ggplot2' was built under R version 3.6.3

## Warning: package 'tibble' was built under R version 3.6.1

## Warning: package 'tidyr' was built under R version 3.6.1

## Warning: package 'readr' was built under R version 3.6.1

## Warning: package 'purrr' was built under R version 3.6.1

## Warning: package 'dplyr' was built under R version 3.6.1

## Warning: package 'stringr' was built under R version 3.6.1

## Warning: package 'forcats' was built under R version 3.6.1

## -- Conflicts ------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

#library(dplyr)

vroom package: Fast reading of delimited files

https://github.com/r-lib/vroom https://vroom.r-lib.org/articles/vroom.html

dplyr package

join

All columns from x or y will be kept. If multiple matches between x and y, all combinations of the matches are returned:

inner_join: return all rows from x where there are matching values in y
left_join: return all rows from x
right_join: return all rows from y
full_join: return all rows and all columns from both x and y

Filtering joins keep columns from x:

semi_join: return all rows from x where there are matching values in y. It will never duplicate rows of x even if one row matching multiple y (as cloumns in y will not be kept, no need to duplicate row in x).
anti_join: return all rows from x where there are not matching values in y

Nesting joins create a list column of for columns from y:

nest_join: return all rows and all columns from x. Adds a list column of tibbles. Each tibble contains all the rows from y that match that row of x.

filter row based on providing values and use order of values (like selecting by row.names in base R)

mtcars1=as_tibble(mtcars,rownames="car")
selectedCars=c("Lotus Europa" ,"Maserati Bora","Ferrari Dino" )

mtcars1 %>% filter(car %in% selectedCars) %>% arrange(match(car,(selectedCars)))

## # A tibble: 3 x 12
##   car       mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Lotus ~  30.4     4  95.1   113  3.77  1.51  16.9     1     1     5     2
## 2 Masera~  15       8 301     335  3.54  3.57  14.6     0     1     5     8
## 3 Ferrar~  19.7     6 145     175  3.62  2.77  15.5     0     1     5     6

#Or
enframe(selectedCars,name=NULL,value="car")%>% left_join(mtcars1)

## Joining, by = "car"

## # A tibble: 3 x 12
##   car       mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Lotus ~  30.4     4  95.1   113  3.77  1.51  16.9     1     1     5     2
## 2 Masera~  15       8 301     335  3.54  3.57  14.6     0     1     5     8
## 3 Ferrar~  19.7     6 145     175  3.62  2.77  15.5     0     1     5     6

data.frame(car=selectedCars,stringsAsFactors = FALSE) %>% left_join(mtcars1)

## Joining, by = "car"

##             car  mpg cyl  disp  hp drat    wt qsec vs am gear carb
## 1  Lotus Europa 30.4   4  95.1 113 3.77 1.513 16.9  1  1    5    2
## 2 Maserati Bora 15.0   8 301.0 335 3.54 3.570 14.6  0  1    5    8
## 3  Ferrari Dino 19.7   6 145.0 175 3.62 2.770 15.5  0  1    5    6

mtcars1 %>% semi_join(data.frame(car=selectedCars,stringsAsFactors = FALSE))

## Joining, by = "car"

## # A tibble: 3 x 12
##   car       mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Lotus ~  30.4     4  95.1   113  3.77  1.51  16.9     1     1     5     2
## 2 Ferrar~  19.7     6 145     175  3.62  2.77  15.5     0     1     5     6
## 3 Masera~  15       8 301     335  3.54  3.57  14.6     0     1     5     8

top_n, top_frac: first rows by a variable value, If n is positive, selects the top rows. If negative, selects the bottom rows. It takes all rows when having tie in top_n. can add %>% slice(1) if only need first one

slice: first rows by position

between

readr package

read_tsv, read_csv, read_delim

Functions

mutate: Make new variable

table1 %>% 
  mutate(rate = cases / population * 10000)

## # A tibble: 6 x 5
##   country      year  cases population  rate
##   <chr>       <int>  <int>      <int> <dbl>
## 1 Afghanistan  1999    745   19987071 0.373
## 2 Afghanistan  2000   2666   20595360 1.29 
## 3 Brazil       1999  37737  172006362 2.19 
## 4 Brazil       2000  80488  174504898 4.61 
## 5 China        1999 212258 1272915272 1.67 
## 6 China        2000 213766 1280428583 1.67

mutate+case_when

group_by: takes an existing tbl and converts it into a grouped tbl

by_cyl <- mtcars %>% group_by(cyl)

# grouping doesn't change how the data looks (apart from listing
# how it's grouped):
by_cyl

## # A tibble: 32 x 11
## # Groups:   cyl [3]
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##  * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
##  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
##  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
##  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
##  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
##  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
##  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
##  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
##  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
## 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
## # ... with 22 more rows

# It changes how it acts with the other dplyr verbs:
by_cyl %>% summarise(
  disp = mean(disp),
  hp = mean(hp)
)

## # A tibble: 3 x 3
##     cyl  disp    hp
##   <dbl> <dbl> <dbl>
## 1     4  105.  82.6
## 2     6  183. 122. 
## 3     8  353. 209.

count: Count number per factor

#sum (count) of manufacturer
mpg %>% 
  count(manufacturer)

## # A tibble: 15 x 2
##    manufacturer     n
##    <chr>        <int>
##  1 audi            18
##  2 chevrolet       19
##  3 dodge           37
##  4 ford            25
##  5 honda            9
##  6 hyundai         14
##  7 jeep             8
##  8 land rover       4
##  9 lincoln          3
## 10 mercury          4
## 11 nissan          13
## 12 pontiac          5
## 13 subaru          14
## 14 toyota          34
## 15 volkswagen      27

#sum of hwy by manufacturer
mpg %>% 
  count(manufacturer,wt=hwy)

## # A tibble: 15 x 2
##    manufacturer     n
##    <chr>        <int>
##  1 audi           476
##  2 chevrolet      416
##  3 dodge          664
##  4 ford           484
##  5 honda          293
##  6 hyundai        376
##  7 jeep           141
##  8 land rover      66
##  9 lincoln         51
## 10 mercury         72
## 11 nissan         320
## 12 pontiac        132
## 13 subaru         358
## 14 toyota         847
## 15 volkswagen     789

gather: takes multiple columns and collapses into key-value pairs

# From http://stackoverflow.com/questions/1181060
stocks <- tibble(
  time = as.Date('2009-01-01') + 0:9,
  X = rnorm(10, 0, 1),
  Y = rnorm(10, 0, 2),
  Z = rnorm(10, 0, 4)
)
head(stocks)

## # A tibble: 6 x 4
##   time            X      Y      Z
##   <date>      <dbl>  <dbl>  <dbl>
## 1 2009-01-01 -1.56  -0.254  0.198
## 2 2009-01-02  0.649 -1.50   7.08 
## 3 2009-01-03 -0.150 -1.41  -0.591
## 4 2009-01-04  1.07  -3.95   1.85 
## 5 2009-01-05 -0.213  0.695 -3.02 
## 6 2009-01-06  0.589 -0.372  3.20

#use everything except "time" as values. "stock" and "price" as new variable names for key and value
gather(stocks, "stock", "price", -time)

## # A tibble: 30 x 3
##    time       stock  price
##    <date>     <chr>  <dbl>
##  1 2009-01-01 X     -1.56 
##  2 2009-01-02 X      0.649
##  3 2009-01-03 X     -0.150
##  4 2009-01-04 X      1.07 
##  5 2009-01-05 X     -0.213
##  6 2009-01-06 X      0.589
##  7 2009-01-07 X     -0.840
##  8 2009-01-08 X      0.563
##  9 2009-01-09 X      0.168
## 10 2009-01-10 X      0.387
## # ... with 20 more rows

#use X, Y, Z as values, so don't need -time. "stock" and "price" as new variable names for key and value
stocks %>% gather("X", "Y","Z",key="stock",value="price")

## # A tibble: 30 x 3
##    time       stock  price
##    <date>     <chr>  <dbl>
##  1 2009-01-01 X     -1.56 
##  2 2009-01-02 X      0.649
##  3 2009-01-03 X     -0.150
##  4 2009-01-04 X      1.07 
##  5 2009-01-05 X     -0.213
##  6 2009-01-06 X      0.589
##  7 2009-01-07 X     -0.840
##  8 2009-01-08 X      0.563
##  9 2009-01-09 X      0.168
## 10 2009-01-10 X      0.387
## # ... with 20 more rows

spread: opposite of gathering. use it when an observation is scattered across multiple rows.

stocksm <- stocks %>% gather(stock, price, -time)
head(stocksm)

## # A tibble: 6 x 3
##   time       stock  price
##   <date>     <chr>  <dbl>
## 1 2009-01-01 X     -1.56 
## 2 2009-01-02 X      0.649
## 3 2009-01-03 X     -0.150
## 4 2009-01-04 X      1.07 
## 5 2009-01-05 X     -0.213
## 6 2009-01-06 X      0.589

stocksm %>% spread(stock, price)

## # A tibble: 10 x 4
##    time            X      Y       Z
##    <date>      <dbl>  <dbl>   <dbl>
##  1 2009-01-01 -1.56  -0.254  0.198 
##  2 2009-01-02  0.649 -1.50   7.08  
##  3 2009-01-03 -0.150 -1.41  -0.591 
##  4 2009-01-04  1.07  -3.95   1.85  
##  5 2009-01-05 -0.213  0.695 -3.02  
##  6 2009-01-06  0.589 -0.372  3.20  
##  7 2009-01-07 -0.840  2.43  -0.976 
##  8 2009-01-08  0.563  2.63  -0.743 
##  9 2009-01-09  0.168 -5.06   3.05  
## 10 2009-01-10  0.387 -0.735 -0.0689

stocksm %>% spread(time, price)

## # A tibble: 3 x 11
##   stock `2009-01-01` `2009-01-02` `2009-01-03` `2009-01-04` `2009-01-05`
##   <chr>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
## 1 X           -1.56         0.649       -0.150         1.07       -0.213
## 2 Y           -0.254       -1.50        -1.41         -3.95        0.695
## 3 Z            0.198        7.08        -0.591         1.85       -3.02 
## # ... with 5 more variables: `2009-01-06` <dbl>, `2009-01-07` <dbl>,
## #   `2009-01-08` <dbl>, `2009-01-09` <dbl>, `2009-01-10` <dbl>

#If duplicate row identifiers and want to keep all of them
#https://github.com/tidyverse/tidyr/issues/426
#https://www.r-bloggers.com/workaround-for-tidyrspread-with-duplicate-row-identifiers/
df=tibble(age=c(21,17,32,38,35),gender=c("Male","Female","Female","Male","Male"))
df <- df %>% 
  group_by(gender) %>% 
  mutate(grouped_id = row_number())
df

## # A tibble: 5 x 3
## # Groups:   gender [2]
##     age gender grouped_id
##   <dbl> <chr>       <int>
## 1    21 Male            1
## 2    17 Female          1
## 3    32 Female          2
## 4    38 Male            2
## 5    35 Male            3

df %>% 
  spread(gender, age) %>% 
  select(-grouped_id)

## # A tibble: 3 x 2
##   Female  Male
##    <dbl> <dbl>
## 1     17    21
## 2     32    38
## 3     NA    35

separate: pulls apart one column into multiple columns, by splitting wherever a separator character appears

df <- data.frame(x = c(NA, "a.b", "a.d", "b.c"))
df %>% separate(x, c("A", "B"))

##      A    B
## 1 <NA> <NA>
## 2    a    b
## 3    a    d
## 4    b    c

# If you just want the second variable:
df %>% separate(x, c(NA, "B"))

##      B
## 1 <NA>
## 2    b
## 3    d
## 4    c

#use extra and fill arguments to control what happens
df <- data.frame(x = c("a", "a b", "a b c", NA))
df %>% separate(x, c("a", "b"))

## Warning: Expected 2 pieces. Additional pieces discarded in 1 rows [3].

## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [1].

##      a    b
## 1    a <NA>
## 2    a    b
## 3    a    b
## 4 <NA> <NA>

# The same behaviour drops the c but no warnings
df %>% separate(x, c("a", "b"), extra = "drop", fill = "right")

##      a    b
## 1    a <NA>
## 2    a    b
## 3    a    b
## 4 <NA> <NA>

# If only want to split specified number of times use extra = "merge"
df <- data.frame(x = c("x: 123", "y: error: 7"))
df %>% separate(x, c("key", "value"), ": ", extra = "merge")

##   key    value
## 1   x      123
## 2   y error: 7

# convert = TRUE detects column classes
# Argument col can take quasiquotation to work with strings
var <- "x"
df %>% separate(!!var, c("key","value"), ":")

## Warning: Expected 2 pieces. Additional pieces discarded in 1 rows [2].

##   key  value
## 1   x    123
## 2   y  error

unite: is the inverse of separate(): it combines multiple columns into a single column.

unite(mtcars, "vs_am", c("vs","am"))

##                      mpg cyl  disp  hp drat    wt  qsec vs_am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46   0_1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02   0_1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61   1_1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44   1_0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02   0_0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22   1_0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84   0_0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00   1_0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90   1_0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30   1_0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90   1_0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40   0_0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60   0_0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00   0_0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98   0_0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82   0_0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42   0_0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47   1_1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52   1_1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90   1_1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01   1_0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87   0_0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30   0_0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41   0_0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05   0_0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90   1_1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70   0_1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90   1_1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50   0_1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50   0_1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60   0_1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60   1_1    4    2

#tidyr package

separate_rows: Separate a collapsed column into multiple rows

library(tidyr)
#If more than one column, should have the same number of rows after seprate
df <- data.frame(
  x = 1:3,
  y = c("a", "d,e,f", "g,h"),
  z = c("1", "2,3,4", "5,6"),
  stringsAsFactors = FALSE
)
separate_rows(df, y, z, convert = TRUE)

##   x y z
## 1 1 a 1
## 2 2 d 2
## 3 2 e 3
## 4 2 f 4
## 5 3 g 5
## 6 3 h 6

purrr package

reduce: combines the elements of a vector into a single value

Other notes

tidyeval

#sym() creates a symbol from a string and syms() creates a list of symbols from a character vector.

df <- data.frame(x = c("a", "a b", "a b c", NA))

var <- "x"
df %>% separate(!!var, c("key","value","note"), " ")

## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2 rows [1,
## 2].

##    key value note
## 1    a  <NA> <NA>
## 2    a     b <NA>
## 3    a     b    c
## 4 <NA>  <NA> <NA>

df %>% summarise(n=length(grep(" ",x))) #2

##   n
## 1 2

df %>% summarise(n=length(grep(" ",!!var))) #0, as it grep in value of x

##   n
## 1 0

df %>% summarise(n=length(grep(" ",!!sym(var)))) #2, correct

##   n
## 1 2

R for Data Science Notes

Shilin Zhao

8/27/2019