7.1 基础介绍

# 用一个真实的数据集替换,让每一个操作都有实际含义和价值 mtcars
DT <- data.table(
  x = rep(c("b", "a", "c"), each = 3),
  v = c(1, 1, 1, 2, 2, 1, 1, 2, 2),
  y = c(1, 3, 6), a = 1:9, b = 9:1
)
DT
##    x v y a b
## 1: b 1 1 1 9
## 2: b 1 3 2 8
## 3: b 1 6 3 7
## 4: a 2 1 4 6
## 5: a 2 3 5 5
## 6: a 1 6 6 4
## 7: c 1 1 7 3
## 8: c 2 3 8 2
## 9: c 2 6 9 1
# 分组求和
DT[, sum(v), by = .(y %% 2)]
##    y V1
## 1: 1  9
## 2: 0  4
DT[, sum(v), by = .(bool = y %% 2)]
##    bool V1
## 1:    1  9
## 2:    0  4
DT[, .SD[2], by = x] # 每组第二行
##    x v y a b
## 1: b 1 3 2 8
## 2: a 2 3 5 5
## 3: c 2 3 8 2
DT[, tail(.SD, 2), by = x] # 每组最后两行
##    x v y a b
## 1: b 1 3 2 8
## 2: b 1 6 3 7
## 3: a 2 3 5 5
## 4: a 1 6 6 4
## 5: c 2 3 8 2
## 6: c 2 6 9 1
# 除了 x 列外,所有列都按 x 分组求和
DT[, lapply(.SD, sum), by = x]
##    x v  y  a  b
## 1: b 3 10  6 24
## 2: a 5 10 15 15
## 3: c 5 10 24  6
# 各个列都按 x 分组取最小
DT[, .SD[which.min(v)], by = x] # 分组嵌套查询
##    x v y a b
## 1: b 1 1 1 9
## 2: a 1 6 6 4
## 3: c 1 1 7 3
DT[, list(MySum = sum(v), MyMin = min(v), MyMax = max(v)), by = .(x, y %% 2)] # 表达式嵌套
##    x y MySum MyMin MyMax
## 1: b 1     2     1     1
## 2: b 0     1     1     1
## 3: a 1     4     2     2
## 4: a 0     1     1     1
## 5: c 1     3     1     2
## 6: c 0     2     2     2
DT[, .(a = .(a), b = .(b)), by = x] # 按 x 分组,将 a,b 两列的值列出来
##    x     a     b
## 1: b 1,2,3 9,8,7
## 2: a 4,5,6 6,5,4
## 3: c 7,8,9 3,2,1
DT[, .(seq = min(a):max(b)), by = x] # 列操作不仅仅是聚合
##     x seq
##  1: b   1
##  2: b   2
##  3: b   3
##  4: b   4
##  5: b   5
##  6: b   6
##  7: b   7
##  8: b   8
##  9: b   9
## 10: a   4
## 11: a   5
## 12: a   6
## 13: c   7
## 14: c   6
## 15: c   5
## 16: c   4
## 17: c   3
# 按 x 分组对 v 求和,然后过滤出和小于 20 的行
DT[, sum(v), by = x][V1 < 20] # 组合查询
##    x V1
## 1: b  3
## 2: a  5
## 3: c  5
DT[, sum(v), by = x][order(-V1)] # 对结果排序
##    x V1
## 1: a  5
## 2: c  5
## 3: b  3
DT[, c(.N, lapply(.SD, sum)), by = x] # 计算每一组的和,每一组的观测数
##    x N v  y  a  b
## 1: b 3 3 10  6 24
## 2: a 3 5 10 15 15
## 3: c 3 5 10 24  6
# 两个复杂的操作,还没弄清楚这个技术存在的意义
DT[,
  {
    tmp <- mean(y)
    .(a = a - tmp, b = b - tmp)
  },
  by = x
] # anonymous lambda in 'j', j accepts any valid
##    x          a          b
## 1: b -2.3333333  5.6666667
## 2: b -1.3333333  4.6666667
## 3: b -0.3333333  3.6666667
## 4: a  0.6666667  2.6666667
## 5: a  1.6666667  1.6666667
## 6: a  2.6666667  0.6666667
## 7: c  3.6666667 -0.3333333
## 8: c  4.6666667 -1.3333333
## 9: c  5.6666667 -2.3333333
# using rleid, get max(y) and min of all cols in .SDcols for each consecutive run of 'v'
DT[, c(.(y = max(y)), lapply(.SD, min)), by = rleid(v), .SDcols = v:b]
##    rleid y v y a b
## 1:     1 6 1 1 1 7
## 2:     2 3 2 1 4 5
## 3:     3 6 1 1 6 3
## 4:     4 6 2 3 8 1

7.1.1 过滤

mtcars_df <- as.data.table(mtcars)

过滤 cyl = 6 并且 gear = 4 的记录

mtcars_df[cyl == 6 & gear == 4]
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1: 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2: 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 3: 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 4: 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4

过滤操作是针对数据框的行(记录)

mtcars_df[cyl == 6 & gear == 4, .(mpg, disp)]
##     mpg  disp
## 1: 21.0 160.0
## 2: 21.0 160.0
## 3: 19.2 167.6
## 4: 17.8 167.6
subset(x = mtcars_df, subset = cyl == 6 & gear == 4, select = c(mpg, disp))
##     mpg  disp
## 1: 21.0 160.0
## 2: 21.0 160.0
## 3: 19.2 167.6
## 4: 17.8 167.6
mtcars |> 
  dplyr::filter(cyl == 6 & gear == 4) |> 
  dplyr::select(mpg, disp)
##                mpg  disp
## Mazda RX4     21.0 160.0
## Mazda RX4 Wag 21.0 160.0
## Merc 280      19.2 167.6
## Merc 280C     17.8 167.6

7.1.2 变换

根据已有的列生成新的列,或者修改已有的列,一次只能修改一列

mtcars_df[, mean_mpg := mean(mpg)][, mean_disp := mean(disp)]
mtcars_df[1:6, ]
##     mpg cyl disp  hp drat    wt  qsec vs am gear carb mean_mpg mean_disp
## 1: 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4 20.09062  230.7219
## 2: 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4 20.09062  230.7219
## 3: 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1 20.09062  230.7219
## 4: 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1 20.09062  230.7219
## 5: 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2 20.09062  230.7219
## 6: 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1 20.09062  230.7219
mtcars_df[, .(mean_mpg = mean(mpg), mean_disp = mean(disp))]
##    mean_mpg mean_disp
## 1: 20.09062  230.7219
# mtcars_df[, .(mean_mpg := mean(mpg), mean_disp := mean(disp))] # 报错
# 正确的姿势
mtcars_df[, `:=`(mean_mpg = mean(mpg), mean_disp = mean(disp))][, .(mpg, disp, mean_mpg, mean_disp)] |>  head()
##     mpg disp mean_mpg mean_disp
## 1: 21.0  160 20.09062  230.7219
## 2: 21.0  160 20.09062  230.7219
## 3: 22.8  108 20.09062  230.7219
## 4: 21.4  258 20.09062  230.7219
## 5: 18.7  360 20.09062  230.7219
## 6: 18.1  225 20.09062  230.7219
mtcars |> 
  dplyr::summarise(mean_mpg = mean(mpg), mean_disp = mean(disp))
##   mean_mpg mean_disp
## 1 20.09062  230.7219
mtcars |> 
  dplyr::mutate(mean_mpg = mean(mpg), mean_disp = mean(disp)) |> 
  dplyr::select(mpg, disp, mean_mpg, mean_disp) |> 
  head()
##                    mpg disp mean_mpg mean_disp
## Mazda RX4         21.0  160 20.09062  230.7219
## Mazda RX4 Wag     21.0  160 20.09062  230.7219
## Datsun 710        22.8  108 20.09062  230.7219
## Hornet 4 Drive    21.4  258 20.09062  230.7219
## Hornet Sportabout 18.7  360 20.09062  230.7219
## Valiant           18.1  225 20.09062  230.7219

7.1.3 聚合

分组统计 多个分组变量

dcast(mtcars_df, cyl ~ gear, value.var = "mpg", fun = mean)
##    cyl     3      4    5
## 1:   4 21.50 26.925 28.2
## 2:   6 19.75 19.750 19.7
## 3:   8 15.05    NaN 15.4
tapply(mtcars$mpg, list(mtcars$cyl, mtcars$gear), mean)
##       3      4    5
## 4 21.50 26.925 28.2
## 6 19.75 19.750 19.7
## 8 15.05     NA 15.4
mtcars_df[, .(mean_mpg = mean(mpg)), by = .(cyl, gear)]
##    cyl gear mean_mpg
## 1:   6    4   19.750
## 2:   4    4   26.925
## 3:   6    3   19.750
## 4:   8    3   15.050
## 5:   4    3   21.500
## 6:   4    5   28.200
## 7:   8    5   15.400
## 8:   6    5   19.700
aggregate(data = mtcars_df, mpg ~ cyl + gear, FUN = mean)
##   cyl gear    mpg
## 1   4    3 21.500
## 2   6    3 19.750
## 3   8    3 15.050
## 4   4    4 26.925
## 5   6    4 19.750
## 6   4    5 28.200
## 7   6    5 19.700
## 8   8    5 15.400
mtcars |> 
  dplyr::group_by(cyl, gear) |> 
  dplyr::summarise(mean_mpg = mean(mpg))
## # A tibble: 8 × 3
## # Groups:   cyl [3]
##     cyl  gear mean_mpg
##   <dbl> <dbl>    <dbl>
## 1     4     3     21.5
## 2     4     4     26.9
## 3     4     5     28.2
## 4     6     3     19.8
## 5     6     4     19.8
## 6     6     5     19.7
## 7     8     3     15.0
## 8     8     5     15.4

7.1.4 命名

修改列名,另存一份生效

sub_mtcars_df <- mtcars_df[, .(mean_mpg = mean(mpg)), by = .(cyl, gear)]
setNames(sub_mtcars_df, c("cyl", "gear", "ave_mpg"))
##    cyl gear ave_mpg
## 1:   6    4  19.750
## 2:   4    4  26.925
## 3:   6    3  19.750
## 4:   8    3  15.050
## 5:   4    3  21.500
## 6:   4    5  28.200
## 7:   8    5  15.400
## 8:   6    5  19.700
# 注意 sub_mtcars_df 并没有修改列名
sub_mtcars_df
##    cyl gear mean_mpg
## 1:   6    4   19.750
## 2:   4    4   26.925
## 3:   6    3   19.750
## 4:   8    3   15.050
## 5:   4    3   21.500
## 6:   4    5   28.200
## 7:   8    5   15.400
## 8:   6    5   19.700

修改列名并直接起作用,在原来的数据集上生效

setnames(sub_mtcars_df, old = c("mean_mpg"), new = c("ave_mpg"))
# sub_mtcars_df 已经修改了列名
sub_mtcars_df
##    cyl gear ave_mpg
## 1:   6    4  19.750
## 2:   4    4  26.925
## 3:   6    3  19.750
## 4:   8    3  15.050
## 5:   4    3  21.500
## 6:   4    5  28.200
## 7:   8    5  15.400
## 8:   6    5  19.700

修改列名最好使用 data.table 包的函数 setnames() 明确指出了要修改的列名,

7.1.5 排序

按照某(些)列从大到小或从小到大的顺序排列, 先按 cyl 升序,然后按 gear 降序

mtcars_df[, .(mpg, cyl, gear)][cyl == 4][order(cyl, -gear)]
##      mpg cyl gear
##  1: 26.0   4    5
##  2: 30.4   4    5
##  3: 22.8   4    4
##  4: 24.4   4    4
##  5: 22.8   4    4
##  6: 32.4   4    4
##  7: 30.4   4    4
##  8: 33.9   4    4
##  9: 27.3   4    4
## 10: 21.4   4    4
## 11: 21.5   4    3
mtcars |> 
  dplyr::select(mpg, cyl, gear) |> 
  dplyr::filter(cyl == 4) |> 
  dplyr::arrange(cyl, desc(gear))
##                 mpg cyl gear
## Porsche 914-2  26.0   4    5
## Lotus Europa   30.4   4    5
## Datsun 710     22.8   4    4
## Merc 240D      24.4   4    4
## Merc 230       22.8   4    4
## Fiat 128       32.4   4    4
## Honda Civic    30.4   4    4
## Toyota Corolla 33.9   4    4
## Fiat X1-9      27.3   4    4
## Volvo 142E     21.4   4    4
## Toyota Corona  21.5   4    3

7.1.6 变形

melt 宽的变长的

DT <- data.table(
  i_1 = c(1:5, NA),
  i_2 = c(NA, 6, 7, 8, 9, 10),
  f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)),
  f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered = TRUE),
  c_1 = sample(c(letters[1:3], NA), 6, TRUE),
  d_1 = as.Date(c(1:3, NA, 4:5), origin = "2013-09-01"),
  d_2 = as.Date(6:1, origin = "2012-01-01")
)
DT[, .(i_1, i_2, f_1, f_2)]
##    i_1 i_2 f_1 f_2
## 1:   1  NA   a   z
## 2:   2   6   a   a
## 3:   3   7   c   x
## 4:   4   8   c   c
## 5:   5   9   b   x
## 6:  NA  10   a   x
melt(DT, id = 1:2, measure = c("f_1", "f_2"))
##     i_1 i_2 variable value
##  1:   1  NA      f_1     a
##  2:   2   6      f_1     a
##  3:   3   7      f_1     c
##  4:   4   8      f_1     c
##  5:   5   9      f_1     b
##  6:  NA  10      f_1     a
##  7:   1  NA      f_2     z
##  8:   2   6      f_2     a
##  9:   3   7      f_2     x
## 10:   4   8      f_2     c
## 11:   5   9      f_2     x
## 12:  NA  10      f_2     x

dcast 长的变宽的

sleep <- as.data.table(sleep)
dcast(sleep, group ~ ID, value.var = "extra")
##    group   1    2    3    4    5   6   7   8   9  10
## 1:     1 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0.0 2.0
## 2:     2 1.9  0.8  1.1  0.1 -0.1 4.4 5.5 1.6 4.6 3.4
# 如果有多个值
dcast(mtcars_df, cyl ~ gear, value.var = "mpg")
##    cyl  3 4 5
## 1:   4  1 8 2
## 2:   6  2 4 1
## 3:   8 12 0 2
dcast(mtcars_df, cyl ~ gear, value.var = "mpg", fun = mean)
##    cyl     3      4    5
## 1:   4 21.50 26.925 28.2
## 2:   6 19.75 19.750 19.7
## 3:   8 15.05    NaN 15.4

tidyr 包提供数据变形的函数 tidyr::pivot_longer()tidyr::pivot_wider() 相比于 Base R 提供的 reshape() 和 data.table 提供的 melt()dcast() 更加形象的命名

tidyr::pivot_wider(data = sleep, names_from = "ID", values_from = "extra")
## # A tibble: 2 × 11
##   group   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`  `10`
##   <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1       0.7  -1.6  -0.2  -1.2  -0.1   3.4   3.7   0.8   0     2  
## 2 2       1.9   0.8   1.1   0.1  -0.1   4.4   5.5   1.6   4.6   3.4
reshape(data = sleep, v.names = "extra", idvar = "group", timevar = "ID", direction = "wide")
##    group extra.1 extra.2 extra.3 extra.4 extra.5 extra.6 extra.7 extra.8
## 1:     1     0.7    -1.6    -0.2    -1.2    -0.1     3.4     3.7     0.8
## 2:     2     1.9     0.8     1.1     0.1    -0.1     4.4     5.5     1.6
##    extra.9 extra.10
## 1:     0.0      2.0
## 2:     4.6      3.4
  • idvar 分组变量
  • timevar 组内编号
  • v.names 个体观察值
  • sep 新的列名是由参数 v.names (extra) 和参数值 timevar (ID) 拼接起来的,默认 sep = "." 推荐使用下划线来做分割 sep = "_"
head(ToothGrowth)
##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5
## 5  6.4   VC  0.5
## 6 10.0   VC  0.5
ToothGrowth$time <- rep(1:10, 6)
reshape(ToothGrowth,
  v.names = "len", idvar = c("supp", "dose"),
  timevar = "time", direction = "wide"
)
##    supp dose len.1 len.2 len.3 len.4 len.5 len.6 len.7 len.8 len.9 len.10
## 1    VC  0.5   4.2  11.5   7.3   5.8   6.4  10.0  11.2  11.2   5.2    7.0
## 11   VC  1.0  16.5  16.5  15.2  17.3  22.5  17.3  13.6  14.5  18.8   15.5
## 21   VC  2.0  23.6  18.5  33.9  25.5  26.4  32.5  26.7  21.5  23.3   29.5
## 31   OJ  0.5  15.2  21.5  17.6   9.7  14.5  10.0   8.2   9.4  16.5    9.7
## 41   OJ  1.0  19.7  23.3  23.6  26.4  20.0  25.2  25.8  21.2  14.5   27.3
## 51   OJ  2.0  25.5  26.4  22.4  24.5  24.8  30.9  26.4  27.3  29.4   23.0

以数据集 ToothGrowth 为例,变量 supp(大组),dose(小组) 和 time(组内个体编号) 一起决定唯一的一个数据 len,特别适合纵向数据的变形操作

7.1.7 分组

分组切片,取每组第一个和最后一个值

Loblolly |> 
  dplyr::group_by(Seed) |> 
  dplyr::arrange(height, age, Seed) |> 
  dplyr::slice(1, dplyr::n())
## # A tibble: 28 × 3
## # Groups:   Seed [14]
##    height   age Seed 
##     <dbl> <dbl> <ord>
##  1   3.93     3 329  
##  2  56.4     25 329  
##  3   4.12     3 327  
##  4  56.8     25 327  
##  5   4.38     3 325  
##  6  58.5     25 325  
##  7   3.91     3 307  
##  8  59.1     25 307  
##  9   3.46     3 331  
## 10  59.5     25 331  
## # ℹ 18 more rows

dplyr::slice() 和函数 slice.index() 有关系吗?

7.1.8 合并

合并操作对应于数据库中的连接操作, dplyr 包的哲学就来源于对数据库操作的进一步抽象, data.table 包的 merge 函数就对应为 dplyr 包的 join 函数

data.table::mergedplyr::join

给出一个表格,数据操作, data.table 实现, dplyr 实现

dt1 <- data.table(A = letters[1:10], X = 1:10, key = "A")
dt2 <- data.table(A = letters[5:14], Y = 1:10, key = "A")
merge(dt1, dt2) # 内连接
##    A  X Y
## 1: e  5 1
## 2: f  6 2
## 3: g  7 3
## 4: h  8 4
## 5: i  9 5
## 6: j 10 6

参数 key 的作用相当于建立一个索引,通过它实现更快的数据操作速度

key = c("x","y","z") 或者 key = "x,y,z" 其中 x,y,z 是列名

data(band_members, band_instruments, package = "dplyr")
band_members
## # A tibble: 3 × 2
##   name  band   
##   <chr> <chr>  
## 1 Mick  Stones 
## 2 John  Beatles
## 3 Paul  Beatles
band_instruments
## # A tibble: 3 × 2
##   name  plays 
##   <chr> <chr> 
## 1 John  guitar
## 2 Paul  bass  
## 3 Keith guitar
dplyr::inner_join(band_members, band_instruments)
## # A tibble: 2 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass

list 列表里每个元素都是 data.frame 时,最适合用 data.table::rbindlist 合并

# 合并列表 https://recology.info/2018/10/limiting-dependencies/
function(x) {
  tibble::as_tibble((x <- data.table::setDF(
    data.table::rbindlist(x, use.names = TRUE, fill = TRUE, idcol = "id")
  )
  ))
}
## function(x) {
##   tibble::as_tibble((x <- data.table::setDF(
##     data.table::rbindlist(x, use.names = TRUE, fill = TRUE, idcol = "id")
##   )
##   ))
## }