6.23 高频数据操作
以数据集 dat 为例介绍常用的数据操作
# Build the example data set used throughout this section.
# The seed is fixed so the sampled group labels are reproducible.
set.seed(2020)
dat <- data.frame(
  num_a = rep(seq(4), each = 4),
  num_b = rep(seq(4), times = 4),
  group_a = sample(x = letters[1:3], size = 16, replace = TRUE),
  group_b = sample(x = LETTERS[1:3], size = 16, replace = TRUE)
)
# Convert to a data.table so the `dat[i, j, by]` syntax below is available.
dat <- as.data.table(dat)
dat
## num_a num_b group_a group_b
## 1: 1 1 c B
## 2: 1 2 b B
## 3: 1 3 a B
## 4: 1 4 a C
## 5: 2 1 b B
## 6: 2 2 b C
## 7: 2 3 a B
## 8: 2 4 a A
## 9: 3 1 b C
## 10: 3 2 b B
## 11: 3 3 b B
## 12: 3 4 a B
## 13: 4 1 b C
## 14: 4 2 c B
## 15: 4 3 b C
## 16: 4 4 a C
6.23.2 分组计数
# Count the rows in each group_a level.
# `dat[, .N, by = .(group_a)]` is the equivalent idiomatic form.
dat[, .(length(num_a)), by = .(group_a)]
## group_a V1
## 1: c 2
## 2: b 8
## 3: a 6
# Count the rows in each group_b level.
dat[, .(length(num_a)), by = .(group_b)]
## group_b V1
## 1: B 9
## 2: C 6
## 3: A 1
# Count the rows in each (group_a, group_b) combination.
dat[, .(length(num_a)), by = .(group_a, group_b)]
## group_a group_b V1
## 1: c B 2
## 2: b B 4
## 3: a B 3
## 4: a C 2
## 5: b C 4
## 6: a A 1
6.23.3 分组抽样
以 group_a 为组别,a、b、c 三组分别有 6、8、2 条记录。
# Sample 2 rows per group without replacement.
# Every group must contain at least 2 rows, otherwise sample() errors.
dt_sample_1 <- dat[,
  .SD[sample(x = .N, size = 2, replace = FALSE)],
  by = group_a
]
# Sample 3 rows per group with replacement.
dt_sample_2 <- dat[,
  .SD[sample(x = .N, size = 3, replace = TRUE)],
  by = group_a
]
各组样本量可能不平衡,有的组的样本量不足想要抽取的样本量。下面对每个组无放回地抽取 4 个样本;如果该组样本量不足 4,则抽取该组的全部样本。
# Sample up to 4 rows per group without replacement.
# min(4, .N) caps the sample size at the group size, so small groups
# return all of their rows instead of raising an error.
dat[, .SD[sample(x = .N, size = min(4, .N))], by = group_a]
## group_a num_a num_b group_b
## 1: c 1 1 B
## 2: c 4 2 B
## 3: b 3 2 B
## 4: b 2 2 C
## 5: b 2 1 B
## 6: b 3 3 B
## 7: a 1 3 B
## 8: a 2 3 B
## 9: a 2 4 A
## 10: a 1 4 C
还可以按照指定的比例抽取样本量(见脚注 19)。
6.23.4 分组排序
data.table 包的分组排序问题 https://d.cosx.org/d/421650-datatable/3
# Order rows by the per-group maximum of num_a (descending), breaking
# ties by num_a itself (descending). ave() gives each row its group's max.
dat[with(dat, order(-ave(num_a, group_a, FUN = max), -num_a)), ]
## num_a num_b group_a group_b
## 1: 4 1 b C
## 2: 4 2 c B
## 3: 4 3 b C
## 4: 4 4 a C
## 5: 3 1 b C
## 6: 3 2 b B
## 7: 3 3 b B
## 8: 3 4 a B
## 9: 2 1 b B
## 10: 2 2 b C
## 11: 2 3 a B
## 12: 2 4 a A
## 13: 1 1 c B
## 14: 1 2 b B
## 15: 1 3 a B
## 16: 1 4 a C
# Sort by num_a in descending order, then by group_a in ascending order.
dat[with(dat, order(-num_a, group_a)), ]
## num_a num_b group_a group_b
## 1: 4 4 a C
## 2: 4 1 b C
## 3: 4 3 b C
## 4: 4 2 c B
## 5: 3 4 a B
## 6: 3 1 b C
## 7: 3 2 b B
## 8: 3 3 b B
## 9: 2 3 a B
## 10: 2 4 a A
## 11: 2 1 b B
## 12: 2 2 b C
## 13: 1 3 a B
## 14: 1 4 a C
## 15: 1 2 b B
## 16: 1 1 c B
# Shorthand: inside `[.data.table` the columns are visible directly,
# so with(dat, ...) is not needed.
dat[order(-num_a, group_a)]
## num_a num_b group_a group_b
## 1: 4 4 a C
## 2: 4 1 b C
## 3: 4 3 b C
## 4: 4 2 c B
## 5: 3 4 a B
## 6: 3 1 b C
## 7: 3 2 b B
## 8: 3 3 b B
## 9: 2 3 a B
## 10: 2 4 a A
## 11: 2 1 b B
## 12: 2 2 b C
## 13: 1 3 a B
## 14: 1 4 a C
## 15: 1 2 b B
## 16: 1 1 c B
setorder() 函数直接修改原始数据记录的排序(按引用修改,不产生副本)。
# Reorder `dat` itself: num_a descending, then group_a ascending.
# Unlike dat[order(...)], this changes the original object.
setorder(dat, -num_a, group_a)
参考多个列分组排序(见脚注 20)。
如果数据集 dat 包含缺失值,考虑去掉缺失值
# Count the non-missing values of num_a in each group.
# sum() over the logical vector counts the TRUEs; length(!is.na(x)) would
# just return length(x) and never exclude missing values.
dat[, .(sum(!is.na(num_a))), by = .(group_a)]
## group_a V1
## 1: c 2
## 2: b 8
## 3: a 6
如果数据集 dat 包含重复值,考虑去掉重复值
# Count the distinct values of num_a in each group (duplicates removed).
dat[, .(length(unique(num_a))), by = .(group_a)]
## group_a V1
## 1: c 2
## 2: b 4
## 3: a 4
按 Species 分组,对 Sepal.Length 降序排列,取 Top 3
# Convert iris to a data.table (this masks the built-in data set).
iris <- as.data.table(iris)
# Sort by Sepal.Length descending, then keep the first 3 rows per Species.
iris[order(-Sepal.Length), .SD[1:3], by = "Species"]
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1: virginica 7.9 3.8 6.4 2.0
## 2: virginica 7.7 3.8 6.7 2.2
## 3: virginica 7.7 2.6 6.9 2.3
## 4: versicolor 7.0 3.2 4.7 1.4
## 5: versicolor 6.9 3.1 4.9 1.5
## 6: versicolor 6.8 2.8 4.8 1.4
## 7: setosa 5.8 4.0 1.2 0.2
## 8: setosa 5.7 4.4 1.5 0.4
## 9: setosa 5.7 3.8 1.7 0.3
对 iris 各个列排序
# Take the first six rows of iris, then sort them ascending by
# columns 5, 1, 2, 3 in that priority order.
dat <- head(iris)
# do.call() passes each selected column to order() as a separate sort key.
ind <- do.call(what = "order", args = dat[, c(5, 1, 2, 3)])
dat[ind, ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1: 4.6 3.1 1.5 0.2 setosa
## 2: 4.7 3.2 1.3 0.2 setosa
## 3: 4.9 3.0 1.4 0.2 setosa
## 4: 5.0 3.6 1.4 0.2 setosa
## 5: 5.1 3.5 1.4 0.2 setosa
## 6: 5.4 3.9 1.7 0.4 setosa
按 Species 分组,对 Sepal.Length 降序排列,取 Top 3
# Convert iris to a data.table (this masks the built-in data set).
iris <- as.data.table(iris)
# Sort by Sepal.Length descending, then keep the first 3 rows per Species.
iris[order(-Sepal.Length), .SD[1:3], by = "Species"]
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1: virginica 7.9 3.8 6.4 2.0
## 2: virginica 7.7 3.8 6.7 2.2
## 3: virginica 7.7 2.6 6.9 2.3
## 4: versicolor 7.0 3.2 4.7 1.4
## 5: versicolor 6.9 3.1 4.9 1.5
## 6: versicolor 6.8 2.8 4.8 1.4
## 7: setosa 5.8 4.0 1.2 0.2
## 8: setosa 5.7 4.4 1.5 0.4
## 9: setosa 5.7 3.8 1.7 0.3
对 iris 各个列排序,依次对第 5、1、2、3 列升序排列
# Sort all of iris ascending by columns 5, 1, 2, 3 in that priority order.
# do.call() passes each selected column to order() as a separate sort key.
ind <- do.call(what = "order", args = iris[, c(5, 1, 2, 3)])
head(iris[ind, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1: 4.3 3.0 1.1 0.1 setosa
## 2: 4.4 2.9 1.4 0.2 setosa
## 3: 4.4 3.0 1.3 0.2 setosa
## 4: 4.4 3.2 1.3 0.2 setosa
## 5: 4.5 2.3 1.3 0.3 setosa
## 6: 4.6 3.1 1.5 0.2 setosa
|
|