7.1 基础介绍
# 用一个真实的数据集替换,让每一个操作都有实际含义和价值 mtcars
<- data.table(
DT x = rep(c("b", "a", "c"), each = 3),
v = c(1, 1, 1, 2, 2, 1, 1, 2, 2),
y = c(1, 3, 6), a = 1:9, b = 9:1
) DT
## x v y a b
## 1: b 1 1 1 9
## 2: b 1 3 2 8
## 3: b 1 6 3 7
## 4: a 2 1 4 6
## 5: a 2 3 5 5
## 6: a 1 6 6 4
## 7: c 1 1 7 3
## 8: c 2 3 8 2
## 9: c 2 6 9 1
# 分组求和
sum(v), by = .(y %% 2)] DT[,
## y V1
## 1: 1 9
## 2: 0 4
sum(v), by = .(bool = y %% 2)] DT[,
## bool V1
## 1: 1 9
## 2: 0 4
2], by = x] # 每组第二行 DT[, .SD[
## x v y a b
## 1: b 1 3 2 8
## 2: a 2 3 5 5
## 3: c 2 3 8 2
tail(.SD, 2), by = x] # 每组最后两行 DT[,
## x v y a b
## 1: b 1 3 2 8
## 2: b 1 6 3 7
## 3: a 2 3 5 5
## 4: a 1 6 6 4
## 5: c 2 3 8 2
## 6: c 2 6 9 1
# 除了 x 列外,所有列都按 x 分组求和
lapply(.SD, sum), by = x] DT[,
## x v y a b
## 1: b 3 10 6 24
## 2: a 5 10 15 15
## 3: c 5 10 24 6
# 各个列都按 x 分组取最小
which.min(v)], by = x] # 分组嵌套查询 DT[, .SD[
## x v y a b
## 1: b 1 1 1 9
## 2: a 1 6 6 4
## 3: c 1 1 7 3
list(MySum = sum(v), MyMin = min(v), MyMax = max(v)), by = .(x, y %% 2)] # 表达式嵌套 DT[,
## x y MySum MyMin MyMax
## 1: b 1 2 1 1
## 2: b 0 1 1 1
## 3: a 1 4 2 2
## 4: a 0 1 1 1
## 5: c 1 3 1 2
## 6: c 0 2 2 2
a = .(a), b = .(b)), by = x] # 按 x 分组,将 a,b 两列的值列出来 DT[, .(
## x a b
## 1: b 1,2,3 9,8,7
## 2: a 4,5,6 6,5,4
## 3: c 7,8,9 3,2,1
seq = min(a):max(b)), by = x] # 列操作不仅仅是聚合 DT[, .(
## x seq
## 1: b 1
## 2: b 2
## 3: b 3
## 4: b 4
## 5: b 5
## 6: b 6
## 7: b 7
## 8: b 8
## 9: b 9
## 10: a 4
## 11: a 5
## 12: a 6
## 13: c 7
## 14: c 6
## 15: c 5
## 16: c 4
## 17: c 3
# 按 x 分组对 v 求和,然后过滤出和小于 20 的行
sum(v), by = x][V1 < 20] # 组合查询 DT[,
## x V1
## 1: b 3
## 2: a 5
## 3: c 5
sum(v), by = x][order(-V1)] # 对结果排序 DT[,
## x V1
## 1: a 5
## 2: c 5
## 3: b 3
c(.N, lapply(.SD, sum)), by = x] # 计算每一组的和,每一组的观测数 DT[,
## x N v y a b
## 1: b 3 3 10 6 24
## 2: a 3 5 10 15 15
## 3: c 3 5 10 24 6
# 两个复杂的操作,还没弄清楚这个技术存在的意义
DT[,
{<- mean(y)
tmp a = a - tmp, b = b - tmp)
.(
},= x
by # anonymous lambda in 'j', j accepts any valid ]
## x a b
## 1: b -2.3333333 5.6666667
## 2: b -1.3333333 4.6666667
## 3: b -0.3333333 3.6666667
## 4: a 0.6666667 2.6666667
## 5: a 1.6666667 1.6666667
## 6: a 2.6666667 0.6666667
## 7: c 3.6666667 -0.3333333
## 8: c 4.6666667 -1.3333333
## 9: c 5.6666667 -2.3333333
# using rleid, get max(y) and min of all cols in .SDcols for each consecutive run of 'v'
c(.(y = max(y)), lapply(.SD, min)), by = rleid(v), .SDcols = v:b] DT[,
## rleid y v y a b
## 1: 1 6 1 1 1 7
## 2: 2 3 2 1 4 5
## 3: 3 6 1 1 6 3
## 4: 4 6 2 3 8 1
7.1.1 过滤
<- as.data.table(mtcars) mtcars_df
过滤 cyl = 6 并且 gear = 4 的记录
== 6 & gear == 4] mtcars_df[cyl
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 4: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
过滤操作是针对数据框的行(记录)
== 6 & gear == 4, .(mpg, disp)] mtcars_df[cyl
## mpg disp
## 1: 21.0 160.0
## 2: 21.0 160.0
## 3: 19.2 167.6
## 4: 17.8 167.6
subset(x = mtcars_df, subset = cyl == 6 & gear == 4, select = c(mpg, disp))
## mpg disp
## 1: 21.0 160.0
## 2: 21.0 160.0
## 3: 19.2 167.6
## 4: 17.8 167.6
|>
mtcars ::filter(cyl == 6 & gear == 4) |>
dplyr::select(mpg, disp) dplyr
## mpg disp
## Mazda RX4 21.0 160.0
## Mazda RX4 Wag 21.0 160.0
## Merc 280 19.2 167.6
## Merc 280C 17.8 167.6
7.1.2 变换
根据已有的列生成新的列,或者修改已有的列,一次只能修改一列
:= mean(mpg)][, mean_disp := mean(disp)]
mtcars_df[, mean_mpg 1:6, ] mtcars_df[
## mpg cyl disp hp drat wt qsec vs am gear carb mean_mpg mean_disp
## 1: 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 20.09062 230.7219
## 2: 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 20.09062 230.7219
## 3: 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 20.09062 230.7219
## 4: 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 20.09062 230.7219
## 5: 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 20.09062 230.7219
## 6: 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 20.09062 230.7219
mean_mpg = mean(mpg), mean_disp = mean(disp))] mtcars_df[, .(
## mean_mpg mean_disp
## 1: 20.09062 230.7219
# mtcars_df[, .(mean_mpg := mean(mpg), mean_disp := mean(disp))] # 报错
# 正确的姿势
`:=`(mean_mpg = mean(mpg), mean_disp = mean(disp))][, .(mpg, disp, mean_mpg, mean_disp)] |> head() mtcars_df[,
## mpg disp mean_mpg mean_disp
## 1: 21.0 160 20.09062 230.7219
## 2: 21.0 160 20.09062 230.7219
## 3: 22.8 108 20.09062 230.7219
## 4: 21.4 258 20.09062 230.7219
## 5: 18.7 360 20.09062 230.7219
## 6: 18.1 225 20.09062 230.7219
|>
mtcars ::summarise(mean_mpg = mean(mpg), mean_disp = mean(disp)) dplyr
## mean_mpg mean_disp
## 1 20.09062 230.7219
|>
mtcars ::mutate(mean_mpg = mean(mpg), mean_disp = mean(disp)) |>
dplyr::select(mpg, disp, mean_mpg, mean_disp) |>
dplyrhead()
## mpg disp mean_mpg mean_disp
## Mazda RX4 21.0 160 20.09062 230.7219
## Mazda RX4 Wag 21.0 160 20.09062 230.7219
## Datsun 710 22.8 108 20.09062 230.7219
## Hornet 4 Drive 21.4 258 20.09062 230.7219
## Hornet Sportabout 18.7 360 20.09062 230.7219
## Valiant 18.1 225 20.09062 230.7219
7.1.3 聚合
分组统计 多个分组变量
dcast(mtcars_df, cyl ~ gear, value.var = "mpg", fun = mean)
## cyl 3 4 5
## 1: 4 21.50 26.925 28.2
## 2: 6 19.75 19.750 19.7
## 3: 8 15.05 NaN 15.4
tapply(mtcars$mpg, list(mtcars$cyl, mtcars$gear), mean)
## 3 4 5
## 4 21.50 26.925 28.2
## 6 19.75 19.750 19.7
## 8 15.05 NA 15.4
mean_mpg = mean(mpg)), by = .(cyl, gear)] mtcars_df[, .(
## cyl gear mean_mpg
## 1: 6 4 19.750
## 2: 4 4 26.925
## 3: 6 3 19.750
## 4: 8 3 15.050
## 5: 4 3 21.500
## 6: 4 5 28.200
## 7: 8 5 15.400
## 8: 6 5 19.700
aggregate(data = mtcars_df, mpg ~ cyl + gear, FUN = mean)
## cyl gear mpg
## 1 4 3 21.500
## 2 6 3 19.750
## 3 8 3 15.050
## 4 4 4 26.925
## 5 6 4 19.750
## 6 4 5 28.200
## 7 6 5 19.700
## 8 8 5 15.400
|>
mtcars ::group_by(cyl, gear) |>
dplyr::summarise(mean_mpg = mean(mpg)) dplyr
## # A tibble: 8 × 3
## # Groups: cyl [3]
## cyl gear mean_mpg
## <dbl> <dbl> <dbl>
## 1 4 3 21.5
## 2 4 4 26.9
## 3 4 5 28.2
## 4 6 3 19.8
## 5 6 4 19.8
## 6 6 5 19.7
## 7 8 3 15.0
## 8 8 5 15.4
7.1.4 命名
修改列名,另存一份生效
<- mtcars_df[, .(mean_mpg = mean(mpg)), by = .(cyl, gear)]
sub_mtcars_df setNames(sub_mtcars_df, c("cyl", "gear", "ave_mpg"))
## cyl gear ave_mpg
## 1: 6 4 19.750
## 2: 4 4 26.925
## 3: 6 3 19.750
## 4: 8 3 15.050
## 5: 4 3 21.500
## 6: 4 5 28.200
## 7: 8 5 15.400
## 8: 6 5 19.700
# 注意 sub_mtcars_df 并没有修改列名
sub_mtcars_df
## cyl gear mean_mpg
## 1: 6 4 19.750
## 2: 4 4 26.925
## 3: 6 3 19.750
## 4: 8 3 15.050
## 5: 4 3 21.500
## 6: 4 5 28.200
## 7: 8 5 15.400
## 8: 6 5 19.700
修改列名并直接起作用,在原来的数据集上生效
setnames(sub_mtcars_df, old = c("mean_mpg"), new = c("ave_mpg"))
# sub_mtcars_df 已经修改了列名
sub_mtcars_df
## cyl gear ave_mpg
## 1: 6 4 19.750
## 2: 4 4 26.925
## 3: 6 3 19.750
## 4: 8 3 15.050
## 5: 4 3 21.500
## 6: 4 5 28.200
## 7: 8 5 15.400
## 8: 6 5 19.700
修改列名最好使用 data.table 包的函数 setnames()
明确指出了要修改的列名,
7.1.5 排序
按照某(些)列从大到小或从小到大的顺序排列, 先按 cyl 升序,然后按 gear 降序
== 4][order(cyl, -gear)] mtcars_df[, .(mpg, cyl, gear)][cyl
## mpg cyl gear
## 1: 26.0 4 5
## 2: 30.4 4 5
## 3: 22.8 4 4
## 4: 24.4 4 4
## 5: 22.8 4 4
## 6: 32.4 4 4
## 7: 30.4 4 4
## 8: 33.9 4 4
## 9: 27.3 4 4
## 10: 21.4 4 4
## 11: 21.5 4 3
|>
mtcars ::select(mpg, cyl, gear) |>
dplyr::filter(cyl == 4) |>
dplyr::arrange(cyl, desc(gear)) dplyr
## mpg cyl gear
## Porsche 914-2 26.0 4 5
## Lotus Europa 30.4 4 5
## Datsun 710 22.8 4 4
## Merc 240D 24.4 4 4
## Merc 230 22.8 4 4
## Fiat 128 32.4 4 4
## Honda Civic 30.4 4 4
## Toyota Corolla 33.9 4 4
## Fiat X1-9 27.3 4 4
## Volvo 142E 21.4 4 4
## Toyota Corona 21.5 4 3
7.1.6 变形
melt 宽的变长的
<- data.table(
DT i_1 = c(1:5, NA),
i_2 = c(NA, 6, 7, 8, 9, 10),
f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)),
f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered = TRUE),
c_1 = sample(c(letters[1:3], NA), 6, TRUE),
d_1 = as.Date(c(1:3, NA, 4:5), origin = "2013-09-01"),
d_2 = as.Date(6:1, origin = "2012-01-01")
)
DT[, .(i_1, i_2, f_1, f_2)]
## i_1 i_2 f_1 f_2
## 1: 1 NA a z
## 2: 2 6 a a
## 3: 3 7 c x
## 4: 4 8 c c
## 5: 5 9 b x
## 6: NA 10 a x
melt(DT, id = 1:2, measure = c("f_1", "f_2"))
## i_1 i_2 variable value
## 1: 1 NA f_1 a
## 2: 2 6 f_1 a
## 3: 3 7 f_1 c
## 4: 4 8 f_1 c
## 5: 5 9 f_1 b
## 6: NA 10 f_1 a
## 7: 1 NA f_2 z
## 8: 2 6 f_2 a
## 9: 3 7 f_2 x
## 10: 4 8 f_2 c
## 11: 5 9 f_2 x
## 12: NA 10 f_2 x
dcast 长的变宽的
<- as.data.table(sleep)
sleep dcast(sleep, group ~ ID, value.var = "extra")
## group 1 2 3 4 5 6 7 8 9 10
## 1: 1 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0.0 2.0
## 2: 2 1.9 0.8 1.1 0.1 -0.1 4.4 5.5 1.6 4.6 3.4
# 如果有多个值
dcast(mtcars_df, cyl ~ gear, value.var = "mpg")
## cyl 3 4 5
## 1: 4 1 8 2
## 2: 6 2 4 1
## 3: 8 12 0 2
dcast(mtcars_df, cyl ~ gear, value.var = "mpg", fun = mean)
## cyl 3 4 5
## 1: 4 21.50 26.925 28.2
## 2: 6 19.75 19.750 19.7
## 3: 8 15.05 NaN 15.4
tidyr 包提供数据变形的函数 tidyr::pivot_longer()
和 tidyr::pivot_wider()
相比于 Base R 提供的 reshape()
和 data.table 提供的 melt()
和 dcast()
更加形象的命名
::pivot_wider(data = sleep, names_from = "ID", values_from = "extra") tidyr
## # A tibble: 2 × 11
## group `1` `2` `3` `4` `5` `6` `7` `8` `9` `10`
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0 2
## 2 2 1.9 0.8 1.1 0.1 -0.1 4.4 5.5 1.6 4.6 3.4
reshape(data = sleep, v.names = "extra", idvar = "group", timevar = "ID", direction = "wide")
## group extra.1 extra.2 extra.3 extra.4 extra.5 extra.6 extra.7 extra.8
## 1: 1 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8
## 2: 2 1.9 0.8 1.1 0.1 -0.1 4.4 5.5 1.6
## extra.9 extra.10
## 1: 0.0 2.0
## 2: 4.6 3.4
idvar
分组变量timevar
组内编号v.names
个体观察值sep
新的列名是由参数v.names
(extra) 和参数值timevar
(ID) 拼接起来的,默认sep = "."
推荐使用下划线来做分割sep = "_"
head(ToothGrowth)
## len supp dose
## 1 4.2 VC 0.5
## 2 11.5 VC 0.5
## 3 7.3 VC 0.5
## 4 5.8 VC 0.5
## 5 6.4 VC 0.5
## 6 10.0 VC 0.5
$time <- rep(1:10, 6)
ToothGrowthreshape(ToothGrowth,
v.names = "len", idvar = c("supp", "dose"),
timevar = "time", direction = "wide"
)
## supp dose len.1 len.2 len.3 len.4 len.5 len.6 len.7 len.8 len.9 len.10
## 1 VC 0.5 4.2 11.5 7.3 5.8 6.4 10.0 11.2 11.2 5.2 7.0
## 11 VC 1.0 16.5 16.5 15.2 17.3 22.5 17.3 13.6 14.5 18.8 15.5
## 21 VC 2.0 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5 23.3 29.5
## 31 OJ 0.5 15.2 21.5 17.6 9.7 14.5 10.0 8.2 9.4 16.5 9.7
## 41 OJ 1.0 19.7 23.3 23.6 26.4 20.0 25.2 25.8 21.2 14.5 27.3
## 51 OJ 2.0 25.5 26.4 22.4 24.5 24.8 30.9 26.4 27.3 29.4 23.0
以数据集 ToothGrowth 为例,变量 supp(大组),dose(小组) 和 time(组内个体编号) 一起决定唯一的一个数据 len,特别适合纵向数据的变形操作
7.1.7 分组
分组切片,取每组第一个和最后一个值
|>
Loblolly ::group_by(Seed) |>
dplyr::arrange(height, age, Seed) |>
dplyr::slice(1, dplyr::n()) dplyr
## # A tibble: 28 × 3
## # Groups: Seed [14]
## height age Seed
## <dbl> <dbl> <ord>
## 1 3.93 3 329
## 2 56.4 25 329
## 3 4.12 3 327
## 4 56.8 25 327
## 5 4.38 3 325
## 6 58.5 25 325
## 7 3.91 3 307
## 8 59.1 25 307
## 9 3.46 3 331
## 10 59.5 25 331
## # ℹ 18 more rows
dplyr::slice()
和函数 slice.index()
有关系吗?
7.1.8 合并
合并操作对应于数据库中的连接操作, dplyr 包的哲学就来源于对数据库操作的进一步抽象, data.table 包的 merge 函数就对应为 dplyr 包的 join 函数
data.table::merge
和 dplyr::join
给出一个表格,数据操作, data.table 实现, dplyr 实现
<- data.table(A = letters[1:10], X = 1:10, key = "A")
dt1 <- data.table(A = letters[5:14], Y = 1:10, key = "A")
dt2 merge(dt1, dt2) # 内连接
## A X Y
## 1: e 5 1
## 2: f 6 2
## 3: g 7 3
## 4: h 8 4
## 5: i 9 5
## 6: j 10 6
参数 key 的作用相当于建立一个索引,通过它实现更快的数据操作速度
key = c("x","y","z")
或者 key = "x,y,z"
其中 x,y,z 是列名
data(band_members, band_instruments, package = "dplyr")
band_members
## # A tibble: 3 × 2
## name band
## <chr> <chr>
## 1 Mick Stones
## 2 John Beatles
## 3 Paul Beatles
band_instruments
## # A tibble: 3 × 2
## name plays
## <chr> <chr>
## 1 John guitar
## 2 Paul bass
## 3 Keith guitar
::inner_join(band_members, band_instruments) dplyr
## # A tibble: 2 × 3
## name band plays
## <chr> <chr> <chr>
## 1 John Beatles guitar
## 2 Paul Beatles bass
list 列表里每个元素都是 data.frame 时,最适合用 data.table::rbindlist
合并
# 合并列表 https://recology.info/2018/10/limiting-dependencies/
function(x) {
::as_tibble((x <- data.table::setDF(
tibble::rbindlist(x, use.names = TRUE, fill = TRUE, idcol = "id")
data.table
)
)) }
## function(x) {
## tibble::as_tibble((x <- data.table::setDF(
## data.table::rbindlist(x, use.names = TRUE, fill = TRUE, idcol = "id")
## )
## ))
## }