RforDataScienceNotes.Rmd
## Warning: package 'tidyverse' was built under R version 3.6.1
## -- Attaching packages --------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.3.0 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.1
## Warning: package 'tidyr' was built under R version 3.6.1
## Warning: package 'readr' was built under R version 3.6.1
## Warning: package 'purrr' was built under R version 3.6.1
## Warning: package 'dplyr' was built under R version 3.6.1
## Warning: package 'stringr' was built under R version 3.6.1
## Warning: package 'forcats' was built under R version 3.6.1
## -- Conflicts ------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
https://github.com/r-lib/vroom https://vroom.r-lib.org/articles/vroom.html
All columns from x or y will be kept. If multiple matches between x and y, all combinations of the matches are returned:
Filtering joins keep columns from x:
Nesting joins create a list column of for columns from y:
mtcars1=as_tibble(mtcars,rownames="car")
selectedCars=c("Lotus Europa" ,"Maserati Bora","Ferrari Dino" )
mtcars1 %>% filter(car %in% selectedCars) %>% arrange(match(car,(selectedCars)))
## # A tibble: 3 x 12
## car mpg cyl disp hp drat wt qsec vs am gear carb
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Lotus ~ 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2
## 2 Masera~ 15 8 301 335 3.54 3.57 14.6 0 1 5 8
## 3 Ferrar~ 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6
## Joining, by = "car"
## # A tibble: 3 x 12
## car mpg cyl disp hp drat wt qsec vs am gear carb
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Lotus ~ 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2
## 2 Masera~ 15 8 301 335 3.54 3.57 14.6 0 1 5 8
## 3 Ferrar~ 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6
## Joining, by = "car"
## car mpg cyl disp hp drat wt qsec vs am gear carb
## 1 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2
## 2 Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.6 0 1 5 8
## 3 Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.5 0 1 5 6
## Joining, by = "car"
## # A tibble: 3 x 12
## car mpg cyl disp hp drat wt qsec vs am gear carb
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Lotus ~ 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2
## 2 Ferrar~ 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6
## 3 Masera~ 15 8 301 335 3.54 3.57 14.6 0 1 5 8
## # A tibble: 6 x 5
## country year cases population rate
## <chr> <int> <int> <int> <dbl>
## 1 Afghanistan 1999 745 19987071 0.373
## 2 Afghanistan 2000 2666 20595360 1.29
## 3 Brazil 1999 37737 172006362 2.19
## 4 Brazil 2000 80488 174504898 4.61
## 5 China 1999 212258 1272915272 1.67
## 6 China 2000 213766 1280428583 1.67
by_cyl <- mtcars %>% group_by(cyl)
# grouping doesn't change how the data looks (apart from listing
# how it's grouped):
by_cyl
## # A tibble: 32 x 11
## # Groups: cyl [3]
## mpg cyl disp hp drat wt qsec vs am gear carb
## * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
## 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
## 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
## 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
## 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
## # ... with 22 more rows
# It changes how it acts with the other dplyr verbs:
by_cyl %>% summarise(
disp = mean(disp),
hp = mean(hp)
)
## # A tibble: 3 x 3
## cyl disp hp
## <dbl> <dbl> <dbl>
## 1 4 105. 82.6
## 2 6 183. 122.
## 3 8 353. 209.
## # A tibble: 15 x 2
## manufacturer n
## <chr> <int>
## 1 audi 18
## 2 chevrolet 19
## 3 dodge 37
## 4 ford 25
## 5 honda 9
## 6 hyundai 14
## 7 jeep 8
## 8 land rover 4
## 9 lincoln 3
## 10 mercury 4
## 11 nissan 13
## 12 pontiac 5
## 13 subaru 14
## 14 toyota 34
## 15 volkswagen 27
## # A tibble: 15 x 2
## manufacturer n
## <chr> <int>
## 1 audi 476
## 2 chevrolet 416
## 3 dodge 664
## 4 ford 484
## 5 honda 293
## 6 hyundai 376
## 7 jeep 141
## 8 land rover 66
## 9 lincoln 51
## 10 mercury 72
## 11 nissan 320
## 12 pontiac 132
## 13 subaru 358
## 14 toyota 847
## 15 volkswagen 789
# From http://stackoverflow.com/questions/1181060
stocks <- tibble(
time = as.Date('2009-01-01') + 0:9,
X = rnorm(10, 0, 1),
Y = rnorm(10, 0, 2),
Z = rnorm(10, 0, 4)
)
head(stocks)
## # A tibble: 6 x 4
## time X Y Z
## <date> <dbl> <dbl> <dbl>
## 1 2009-01-01 -1.56 -0.254 0.198
## 2 2009-01-02 0.649 -1.50 7.08
## 3 2009-01-03 -0.150 -1.41 -0.591
## 4 2009-01-04 1.07 -3.95 1.85
## 5 2009-01-05 -0.213 0.695 -3.02
## 6 2009-01-06 0.589 -0.372 3.20
#use everything except "time" as values. "stock" and "price" as new variable names for key and value
gather(stocks, "stock", "price", -time)
## # A tibble: 30 x 3
## time stock price
## <date> <chr> <dbl>
## 1 2009-01-01 X -1.56
## 2 2009-01-02 X 0.649
## 3 2009-01-03 X -0.150
## 4 2009-01-04 X 1.07
## 5 2009-01-05 X -0.213
## 6 2009-01-06 X 0.589
## 7 2009-01-07 X -0.840
## 8 2009-01-08 X 0.563
## 9 2009-01-09 X 0.168
## 10 2009-01-10 X 0.387
## # ... with 20 more rows
#use X, Y, Z as values, so don't need -time. "stock" and "price" as new variable names for key and value
stocks %>% gather("X", "Y","Z",key="stock",value="price")
## # A tibble: 30 x 3
## time stock price
## <date> <chr> <dbl>
## 1 2009-01-01 X -1.56
## 2 2009-01-02 X 0.649
## 3 2009-01-03 X -0.150
## 4 2009-01-04 X 1.07
## 5 2009-01-05 X -0.213
## 6 2009-01-06 X 0.589
## 7 2009-01-07 X -0.840
## 8 2009-01-08 X 0.563
## 9 2009-01-09 X 0.168
## 10 2009-01-10 X 0.387
## # ... with 20 more rows
## # A tibble: 6 x 3
## time stock price
## <date> <chr> <dbl>
## 1 2009-01-01 X -1.56
## 2 2009-01-02 X 0.649
## 3 2009-01-03 X -0.150
## 4 2009-01-04 X 1.07
## 5 2009-01-05 X -0.213
## 6 2009-01-06 X 0.589
## # A tibble: 10 x 4
## time X Y Z
## <date> <dbl> <dbl> <dbl>
## 1 2009-01-01 -1.56 -0.254 0.198
## 2 2009-01-02 0.649 -1.50 7.08
## 3 2009-01-03 -0.150 -1.41 -0.591
## 4 2009-01-04 1.07 -3.95 1.85
## 5 2009-01-05 -0.213 0.695 -3.02
## 6 2009-01-06 0.589 -0.372 3.20
## 7 2009-01-07 -0.840 2.43 -0.976
## 8 2009-01-08 0.563 2.63 -0.743
## 9 2009-01-09 0.168 -5.06 3.05
## 10 2009-01-10 0.387 -0.735 -0.0689
## # A tibble: 3 x 11
## stock `2009-01-01` `2009-01-02` `2009-01-03` `2009-01-04` `2009-01-05`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 X -1.56 0.649 -0.150 1.07 -0.213
## 2 Y -0.254 -1.50 -1.41 -3.95 0.695
## 3 Z 0.198 7.08 -0.591 1.85 -3.02
## # ... with 5 more variables: `2009-01-06` <dbl>, `2009-01-07` <dbl>,
## # `2009-01-08` <dbl>, `2009-01-09` <dbl>, `2009-01-10` <dbl>
#If duplicate row identifiers and want to keep all of them
#https://github.com/tidyverse/tidyr/issues/426
#https://www.r-bloggers.com/workaround-for-tidyrspread-with-duplicate-row-identifiers/
df=tibble(age=c(21,17,32,38,35),gender=c("Male","Female","Female","Male","Male"))
df <- df %>%
group_by(gender) %>%
mutate(grouped_id = row_number())
df
## # A tibble: 5 x 3
## # Groups: gender [2]
## age gender grouped_id
## <dbl> <chr> <int>
## 1 21 Male 1
## 2 17 Female 1
## 3 32 Female 2
## 4 38 Male 2
## 5 35 Male 3
## # A tibble: 3 x 2
## Female Male
## <dbl> <dbl>
## 1 17 21
## 2 32 38
## 3 NA 35
## A B
## 1 <NA> <NA>
## 2 a b
## 3 a d
## 4 b c
## B
## 1 <NA>
## 2 b
## 3 d
## 4 c
#use extra and fill arguments to control what happens
df <- data.frame(x = c("a", "a b", "a b c", NA))
df %>% separate(x, c("a", "b"))
## Warning: Expected 2 pieces. Additional pieces discarded in 1 rows [3].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [1].
## a b
## 1 a <NA>
## 2 a b
## 3 a b
## 4 <NA> <NA>
# The same behaviour drops the c but no warnings
df %>% separate(x, c("a", "b"), extra = "drop", fill = "right")
## a b
## 1 a <NA>
## 2 a b
## 3 a b
## 4 <NA> <NA>
# If only want to split specified number of times use extra = "merge"
df <- data.frame(x = c("x: 123", "y: error: 7"))
df %>% separate(x, c("key", "value"), ": ", extra = "merge")
## key value
## 1 x 123
## 2 y error: 7
# convert = TRUE detects column classes
# Argument col can take quasiquotation to work with strings
var <- "x"
df %>% separate(!!var, c("key","value"), ":")
## Warning: Expected 2 pieces. Additional pieces discarded in 1 rows [2].
## key value
## 1 x 123
## 2 y error
## mpg cyl disp hp drat wt qsec vs_am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0_1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0_1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1_1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1_0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0_0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1_0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0_0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1_0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1_0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1_0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1_0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0_0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0_0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0_0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0_0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0_0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0_0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1_1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1_1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1_1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1_0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0_0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0_0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0_0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0_0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1_1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0_1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1_1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0_1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0_1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0_1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1_1 4 2
#tidyr package
library(tidyr)
#If more than one column, should have the same number of rows after seprate
df <- data.frame(
x = 1:3,
y = c("a", "d,e,f", "g,h"),
z = c("1", "2,3,4", "5,6"),
stringsAsFactors = FALSE
)
separate_rows(df, y, z, convert = TRUE)
## x y z
## 1 1 a 1
## 2 2 d 2
## 3 2 e 3
## 4 2 f 4
## 5 3 g 5
## 6 3 h 6
#sym() creates a symbol from a string and syms() creates a list of symbols from a character vector.
df <- data.frame(x = c("a", "a b", "a b c", NA))
var <- "x"
df %>% separate(!!var, c("key","value","note"), " ")
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2 rows [1,
## 2].
## key value note
## 1 a <NA> <NA>
## 2 a b <NA>
## 3 a b c
## 4 <NA> <NA> <NA>
## n
## 1 2
## n
## 1 0
## n
## 1 2