Test 3

3. Data Visualization

library(ggplot2)
library(tibble)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(modelr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(nycflights13)
library(purrr)
## 
## Attaching package: 'purrr'
## The following objects are masked from 'package:dplyr':
## 
##     contains, order_by
library(broom)
## 
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
## 
##     bootstrap
library(gapminder)

options(na.action = na.warn)
mpg
## # A tibble: 234 x 11
##    manufacturer      model displ  year   cyl      trans   drv   cty   hwy
##           <chr>      <chr> <dbl> <int> <int>      <chr> <chr> <int> <int>
## 1          audi         a4   1.8  1999     4   auto(l5)     f    18    29
## 2          audi         a4   1.8  1999     4 manual(m5)     f    21    29
## 3          audi         a4   2.0  2008     4 manual(m6)     f    20    31
## 4          audi         a4   2.0  2008     4   auto(av)     f    21    30
## 5          audi         a4   2.8  1999     6   auto(l5)     f    16    26
## 6          audi         a4   2.8  1999     6 manual(m5)     f    18    26
## 7          audi         a4   3.1  2008     6   auto(av)     f    18    27
## 8          audi a4 quattro   1.8  1999     4 manual(m5)     4    18    26
## 9          audi a4 quattro   1.8  1999     4   auto(l5)     4    16    25
## 10         audi a4 quattro   2.0  2008     4 manual(m6)     4    20    28
## # ... with 224 more rows, and 2 more variables: fl <chr>, class <chr>
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
    geom_point(mapping = aes(color = class)) +
    geom_smooth(data = filter(mpg, class == "subcompact"), se = FALSE )

plot of chunk unnamed-chunk-3

ggplot(data = diamonds) +
    geom_bar(mapping = aes(x = cut))

plot of chunk unnamed-chunk-4

diamonds
## # A tibble: 53,940 x 10
##    carat       cut color clarity depth table price     x     y     z
##    <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1   0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2   0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
## 3   0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
## 4   0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
## 5   0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
## 6   0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
## 7   0.24 Very Good     I    VVS1  62.3    57   336  3.95  3.98  2.47
## 8   0.26 Very Good     H     SI1  61.9    55   337  4.07  4.11  2.53
## 9   0.22      Fair     E     VS2  65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good     H     VS1  59.4    61   338  4.00  4.05  2.39
## # ... with 53,930 more rows
ggplot(data = diamonds) + 
    geom_bar(mapping = aes(x = cut, y = ..prop.., group = 1))

plot of chunk unnamed-chunk-6

24. Model Building

diamonds2 <- diamonds %>% 
  filter(carat <= 2.5) %>% 
  mutate(lprice = log2(price), lcarat = log2(carat))
ggplot(diamonds2, aes(lcarat, lprice)) +
    geom_hex(bins = 50)

plot of chunk unnamed-chunk-8

mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)
grid <- diamonds2 %>% 
    modelr::data_grid(carat = seq_range(carat, 20)) %>%
    mutate(lcarat = log2(carat)) %>%
    add_predictions(mod_diamond, "lprice") %>%
    mutate(price = 2 ^ lprice)
ggplot(diamonds2, aes(carat, price)) +
    geom_hex(bins = 50) +
    geom_line(data = grid, colour = "red", size = 1)

plot of chunk unnamed-chunk-11

diamonds2 <- diamonds2 %>%
    add_residuals(mod_diamond, "lresid")

ggplot(diamonds2, aes(lcarat, lresid)) +
    geom_hex(bins = 50)

plot of chunk unnamed-chunk-12

mod_diamonds2 <- lm(lprice ~ lcarat + color + cut + clarity, data = diamonds2)
grid <- diamonds2 %>%
    data_grid(cut, .model = mod_diamonds2) %>%
    add_predictions(mod_diamonds2)
grid
## # A tibble: 5 x 5
##         cut     lcarat color clarity     pred
##       <ord>      <dbl> <chr>   <chr>    <dbl>
## 1      Fair -0.5145732     G     SI1 10.98985
## 2      Good -0.5145732     G     SI1 11.10479
## 3 Very Good -0.5145732     G     SI1 11.15824
## 4   Premium -0.5145732     G     SI1 11.19055
## 5     Ideal -0.5145732     G     SI1 11.22187
ggplot(grid, aes(cut, pred)) +
    geom_point()

plot of chunk unnamed-chunk-15

diamonds2 <- diamonds2 %>%
    add_residuals(mod_diamonds2, "lresid2")

ggplot(diamonds2, aes(lcarat, lresid2)) +
    geom_hex(bins = 50)

plot of chunk unnamed-chunk-16

diamonds2 %>%
    filter(abs(lresid2) > 1) %>%
    add_predictions(mod_diamonds2) %>%
    mutate(pred = round(2 ^ pred)) %>%
    select(price, pred, carat:table, x:z) %>%
    arrange(price)
## # A tibble: 16 x 11
##    price  pred carat       cut color clarity depth table     x     y     z
##    <int> <dbl> <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1   1013   264  0.25      Fair     F     SI2  54.4    64  4.30  4.23  2.32
## 2   1186   284  0.25   Premium     G     SI2  59.0    60  5.33  5.28  3.12
## 3   1186   284  0.25   Premium     G     SI2  58.8    60  5.33  5.28  3.12
## 4   1262  2644  1.03      Fair     E      I1  78.2    54  5.72  5.59  4.42
## 5   1415   639  0.35      Fair     G     VS2  65.9    54  5.57  5.53  3.66
## 6   1415   639  0.35      Fair     G     VS2  65.9    54  5.57  5.53  3.66
## 7   1715   576  0.32      Fair     F     VS2  59.6    60  4.42  4.34  2.61
## 8   1776   412  0.29      Fair     F     SI1  55.8    60  4.48  4.41  2.48
## 9   2160   314  0.34      Fair     F      I1  55.8    62  4.72  4.60  2.60
## 10  2366   774  0.30 Very Good     D    VVS2  60.6    58  4.33  4.35  2.63
## 11  3360  1373  0.51   Premium     F     SI1  62.7    62  5.09  4.96  3.15
## 12  3807  1540  0.61      Good     F     SI2  62.5    65  5.36  5.29  3.33
## 13  3920  1705  0.51      Fair     F    VVS2  65.4    60  4.98  4.90  3.23
## 14  4368  1705  0.51      Fair     F    VVS2  60.7    66  5.21  5.11  3.13
## 15 10011  4048  1.01      Fair     D     SI2  64.6    58  6.25  6.20  4.02
## 16 10470 23622  2.46   Premium     E     SI2  59.7    59  8.82  8.76  5.25
daily <- flights %>%
    mutate(date = make_datetime(year, month, day)) %>%
    group_by(date) %>%
    summarise(n = n())
daily
## # A tibble: 365 x 2
##          date     n
##        <time> <int>
## 1  2013-01-01   842
## 2  2013-01-02   943
## 3  2013-01-03   914
## 4  2013-01-04   915
## 5  2013-01-05   720
## 6  2013-01-06   832
## 7  2013-01-07   933
## 8  2013-01-08   899
## 9  2013-01-09   902
## 10 2013-01-10   932
## # ... with 355 more rows
ggplot(daily, aes(date, n)) +
    geom_line()

plot of chunk unnamed-chunk-18

daily <- daily %>%
    mutate(wday = wday(date, label = TRUE)) 

ggplot(daily, aes(wday, n)) +
    geom_boxplot()

plot of chunk unnamed-chunk-19

mod <- lm(n ~ wday, data = daily)

grid <- daily %>%
    data_grid(wday) %>%
    add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
    geom_boxplot() +
    geom_point(data = grid, colour = "red", size = 4)

plot of chunk unnamed-chunk-20

daily <- daily %>%
    add_residuals(mod)
daily %>%
    ggplot(aes(date, resid)) +
    geom_ref_line(h = 0) +
    geom_line()

plot of chunk unnamed-chunk-21

ggplot(daily, aes(date, resid, colour = wday)) +
    geom_ref_line(h = 0) +
    geom_line()

plot of chunk unnamed-chunk-22

daily %>%
    filter(resid < - 100)
## # A tibble: 11 x 4
##          date     n  wday     resid
##        <time> <int> <ord>     <dbl>
## 1  2013-01-01   842  Tues -109.3585
## 2  2013-01-20   786   Sun -105.4808
## 3  2013-05-26   729   Sun -162.4808
## 4  2013-07-04   737 Thurs -228.7500
## 5  2013-07-05   822   Fri -145.4615
## 6  2013-09-01   718   Sun -173.4808
## 7  2013-11-28   634 Thurs -331.7500
## 8  2013-11-29   661   Fri -306.4615
## 9  2013-12-24   761  Tues -190.3585
## 10 2013-12-25   719   Wed -243.6923
## 11 2013-12-31   776  Tues -175.3585
daily %>%
    ggplot(aes(date, resid)) +
    geom_ref_line(h = 0) +
    geom_line(colour = "grey50") +
    geom_smooth(se = FALSE, span = 0.20)

plot of chunk unnamed-chunk-24

daily %>%
    filter(wday == "Sat") %>%
    ggplot(aes(date, n)) +
    geom_point() +
    geom_line() +
    scale_x_datetime(NULL, date_breaks = "1 month", date_labels = "%b")

plot of chunk unnamed-chunk-25

term <- function(date) {
    cut(date, 
        breaks = as.POSIXct(ymd(20130101, 20130605, 20130825, 20140101)),
        labels = c("spring", "summer", "fall")
        )
}

daily <- daily %>% 
    mutate(term = term(date))

daily %>%
    filter(wday == "Sat") %>%
    ggplot(aes(date, n, colour = term)) +
    geom_point(alpha = 1/3) +
    geom_line() +
    scale_x_datetime(NULL, date_breaks = "1 month", date_labels = "%b")

plot of chunk unnamed-chunk-26

mod3 <- MASS::rlm(n ~ wday * term, data = daily) 

daily %>%
    add_residuals(mod3, "resid") %>%
    ggplot(aes(date, resid)) +
    geom_hline(yintercept = 0, size = 2, colour = "white") +
    geom_line()

plot of chunk unnamed-chunk-27

25. Many Models

gapminder
## # A tibble: 1,704 x 6
##        country continent  year lifeExp      pop gdpPercap
##         <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>
## 1  Afghanistan      Asia  1952  28.801  8425333  779.4453
## 2  Afghanistan      Asia  1957  30.332  9240934  820.8530
## 3  Afghanistan      Asia  1962  31.997 10267083  853.1007
## 4  Afghanistan      Asia  1967  34.020 11537966  836.1971
## 5  Afghanistan      Asia  1972  36.088 13079460  739.9811
## 6  Afghanistan      Asia  1977  38.438 14880372  786.1134
## 7  Afghanistan      Asia  1982  39.854 12881816  978.0114
## 8  Afghanistan      Asia  1987  40.822 13867957  852.3959
## 9  Afghanistan      Asia  1992  41.674 16317921  649.3414
## 10 Afghanistan      Asia  1997  41.763 22227415  635.3414
## # ... with 1,694 more rows
gapminder %>%
    ggplot(aes(year, lifeExp, group = country)) +
    geom_line(alpha = 1/3)

plot of chunk unnamed-chunk-29

by_country <- gapminder %>%
    group_by(country, continent) %>%
    nest()
country_model <- function(df) {
    lm(lifeExp ~ year, data = df)
}

models <- map(by_country$data, country_model)

by_country <- by_country %>% 
    mutate(model = map(data, country_model))
by_country
## # A tibble: 142 x 4
##        country continent              data    model
##         <fctr>    <fctr>            <list>   <list>
## 1  Afghanistan      Asia <tibble [12 x 4]> <S3: lm>
## 2      Albania    Europe <tibble [12 x 4]> <S3: lm>
## 3      Algeria    Africa <tibble [12 x 4]> <S3: lm>
## 4       Angola    Africa <tibble [12 x 4]> <S3: lm>
## 5    Argentina  Americas <tibble [12 x 4]> <S3: lm>
## 6    Australia   Oceania <tibble [12 x 4]> <S3: lm>
## 7      Austria    Europe <tibble [12 x 4]> <S3: lm>
## 8      Bahrain      Asia <tibble [12 x 4]> <S3: lm>
## 9   Bangladesh      Asia <tibble [12 x 4]> <S3: lm>
## 10     Belgium    Europe <tibble [12 x 4]> <S3: lm>
## # ... with 132 more rows
by_country <- by_country %>%
    mutate(
        resids = map2(data, model, add_residuals)
    )
by_country
## # A tibble: 142 x 5
##        country continent              data    model            resids
##         <fctr>    <fctr>            <list>   <list>            <list>
## 1  Afghanistan      Asia <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 2      Albania    Europe <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 3      Algeria    Africa <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 4       Angola    Africa <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 5    Argentina  Americas <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 6    Australia   Oceania <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 7      Austria    Europe <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 8      Bahrain      Asia <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 9   Bangladesh      Asia <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 10     Belgium    Europe <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## # ... with 132 more rows
resids <- unnest(by_country, resids)
resids
## # A tibble: 1,704 x 7
##        country continent  year lifeExp      pop gdpPercap       resid
##         <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>       <dbl>
## 1  Afghanistan      Asia  1952  28.801  8425333  779.4453 -1.10629487
## 2  Afghanistan      Asia  1957  30.332  9240934  820.8530 -0.95193823
## 3  Afghanistan      Asia  1962  31.997 10267083  853.1007 -0.66358159
## 4  Afghanistan      Asia  1967  34.020 11537966  836.1971 -0.01722494
## 5  Afghanistan      Asia  1972  36.088 13079460  739.9811  0.67413170
## 6  Afghanistan      Asia  1977  38.438 14880372  786.1134  1.64748834
## 7  Afghanistan      Asia  1982  39.854 12881816  978.0114  1.68684499
## 8  Afghanistan      Asia  1987  40.822 13867957  852.3959  1.27820163
## 9  Afghanistan      Asia  1992  41.674 16317921  649.3414  0.75355828
## 10 Afghanistan      Asia  1997  41.763 22227415  635.3414 -0.53408508
## # ... with 1,694 more rows
resids %>%
    ggplot(aes(year, resid)) +
    geom_line(aes(group = country), alpha = 1/3) +
    geom_smooth(se = FALSE)

plot of chunk unnamed-chunk-34

glance(nz_mod)
## Error in glance(nz_mod): object 'nz_mod' not found
by_country %>%
    mutate(glance = map(model, glance)) %>%
    unnest(glance)
## # A tibble: 142 x 16
##        country continent              data    model            resids
##         <fctr>    <fctr>            <list>   <list>            <list>
## 1  Afghanistan      Asia <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 2      Albania    Europe <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 3      Algeria    Africa <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 4       Angola    Africa <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 5    Argentina  Americas <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 6    Australia   Oceania <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 7      Austria    Europe <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 8      Bahrain      Asia <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 9   Bangladesh      Asia <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 10     Belgium    Europe <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## # ... with 132 more rows, and 11 more variables: r.squared <dbl>,
## #   adj.r.squared <dbl>, sigma <dbl>, statistic <dbl>, p.value <dbl>,
## #   df <int>, logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
## #   df.residual <int>
glance <- by_country %>%
    mutate(glance = map(model, glance)) %>%
    unnest(glance, .drop=TRUE)
glance %>% 
    ggplot(aes(continent, r.squared)) +
    geom_jitter(width = 0.5)

plot of chunk unnamed-chunk-38

gapminder %>%
    group_by(country, continent) %>%
    nest()
## # A tibble: 142 x 3
##        country continent              data
##         <fctr>    <fctr>            <list>
## 1  Afghanistan      Asia <tibble [12 x 4]>
## 2      Albania    Europe <tibble [12 x 4]>
## 3      Algeria    Africa <tibble [12 x 4]>
## 4       Angola    Africa <tibble [12 x 4]>
## 5    Argentina  Americas <tibble [12 x 4]>
## 6    Australia   Oceania <tibble [12 x 4]>
## 7      Austria    Europe <tibble [12 x 4]>
## 8      Bahrain      Asia <tibble [12 x 4]>
## 9   Bangladesh      Asia <tibble [12 x 4]>
## 10     Belgium    Europe <tibble [12 x 4]>
## # ... with 132 more rows
gapminder %>%
    nest(year:gdpPercap)
## # A tibble: 142 x 3
##        country continent              data
##         <fctr>    <fctr>            <list>
## 1  Afghanistan      Asia <tibble [12 x 4]>
## 2      Albania    Europe <tibble [12 x 4]>
## 3      Algeria    Africa <tibble [12 x 4]>
## 4       Angola    Africa <tibble [12 x 4]>
## 5    Argentina  Americas <tibble [12 x 4]>
## 6    Australia   Oceania <tibble [12 x 4]>
## 7      Austria    Europe <tibble [12 x 4]>
## 8      Bahrain      Asia <tibble [12 x 4]>
## 9   Bangladesh      Asia <tibble [12 x 4]>
## 10     Belgium    Europe <tibble [12 x 4]>
## # ... with 132 more rows
df <- tibble(x1 = c("a,b,c", "d,e,f,g"))

df %>%
    mutate(x2 = stringr::str_split(x1, ","))
## # A tibble: 2 x 2
##        x1        x2
##     <chr>    <list>
## 1   a,b,c <chr [3]>
## 2 d,e,f,g <chr [4]>
df %>%
    mutate(x2 = stringr::str_split(x1, ",")) %>%
    unnest()
## # A tibble: 7 x 2
##        x1    x2
##     <chr> <chr>
## 1   a,b,c     a
## 2   a,b,c     b
## 3   a,b,c     c
## 4 d,e,f,g     d
## 5 d,e,f,g     e
## 6 d,e,f,g     f
## 7 d,e,f,g     g
sim <- tibble::frame_data(
    ~f, ~params,
    "runif", list(min = -1, max = 1),
    "rnorm", list(sd = 5),
    "rpois", list(lambda = 10)
)
sim %>% 
    mutate(sims = invoke_map(f, params, n = 10))
## # A tibble: 3 x 3
##       f     params       sims
##   <chr>     <list>     <list>
## 1 runif <list [2]> <dbl [10]>
## 2 rnorm <list [1]> <dbl [10]>
## 3 rpois <list [1]> <int [10]>
mtcars %>%
    group_by(cyl) %>%
    summarise(q = list(quantile(mpg))) %>%
    unnest()
## # A tibble: 15 x 2
##      cyl     q
##    <dbl> <dbl>
## 1      4 21.40
## 2      4 22.80
## 3      4 26.00
## 4      4 30.40
## 5      4 33.90
## 6      6 17.80
## 7      6 18.65
## 8      6 19.70
## 9      6 21.00
## 10     6 21.40
## 11     8 10.40
## 12     8 14.40
## 13     8 15.20
## 14     8 16.25
## 15     8 19.20
x <- list(
    a = 1:5,
    b = 3:4,
    c = 5:6
)

df <- tibble::enframe(x)
df
## # A tibble: 3 x 2
##    name     value
##   <chr>    <list>
## 1     a <int [5]>
## 2     b <int [2]>
## 3     c <int [2]>
df %>% 
    mutate(
        smry = map2_chr(name, value, ~ stringr::str_c(.x, ": ", .y[1]))
    )
## # A tibble: 3 x 3
##    name     value  smry
##   <chr>    <list> <chr>
## 1     a <int [5]>  a: 1
## 2     b <int [2]>  b: 3
## 3     c <int [2]>  c: 5
df <- tibble(
    x = list(
        letters[1:5], 
        1:3,
        runif(5)
    )
)
df
## # A tibble: 3 x 1
##           x
##      <list>
## 1 <chr [5]>
## 2 <int [3]>
## 3 <dbl [5]>
df %>% mutate(
    type = map_char(x, typeof),
    length = map_int(x, length)
)
## Error in eval(expr, envir, enclos): could not find function "map_char"
df <- tibble(
    x = list(
        list(a = 1, b = 2),
        list(a = 2, c = 4)
    )
)

df %>% mutate(
    a = map_dbl(x, "a"),
    b = map_dbl(x, "b", .null = NA_real_)
)
## # A tibble: 2 x 3
##            x     a     b
##       <list> <dbl> <dbl>
## 1 <list [2]>     1     2
## 2 <list [2]>     2    NA
tibble(x = 1:2, y = list(1:4, 1)) %>%
    unnest(y)
## # A tibble: 5 x 2
##       x     y
##   <int> <dbl>
## 1     1     1
## 2     1     2
## 3     1     3
## 4     1     4
## 5     2     1

한글로 쓰는 데는 큰 문제가 없나?

답글 남기기

이메일은 공개되지 않습니다. 필수 입력창은 * 로 표시되어 있습니다.