6.2 提取子集
subset(x, subset, select, drop = FALSE, ...):函数 subset 用于从数据框中提取部分数据,其中参数 subset 代表行操作(筛选行),参数 select 代表列操作(选择列)。
subset(iris, subset = Species == "virginica" & Sepal.Length > 7.5)## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 106 7.6 3.0 6.6 2.1 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 123 7.7 2.8 6.7 2.0 virginica
## 132 7.9 3.8 6.4 2.0 virginica
## 136 7.7 3.0 6.1 2.3 virginica
# summary(iris$Sepal.Length); mean(iris$Sepal.Length)
# 且的逻辑(注意:median(Sepal.Length) 基于整个 iris 数据集计算,其值为 5.8,而不是只在 virginica 子集内计算)
# subset(iris, Species == "virginica" & Sepal.Length > 5.8)
subset(iris, Species == "virginica" &
Sepal.Length == median(Sepal.Length))## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 102 5.8 2.7 5.1 1.9 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 143 5.8 2.7 5.1 1.9 virginica
# 在行的子集范围内
subset(iris, Species %in% c("virginica", "versicolor") &
Sepal.Length == median(Sepal.Length))## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 68 5.8 2.7 4.1 1.0 versicolor
## 83 5.8 2.7 3.9 1.2 versicolor
## 93 5.8 2.6 4.0 1.2 versicolor
## 102 5.8 2.7 5.1 1.9 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 143 5.8 2.7 5.1 1.9 virginica
# 同时筛选行和列:用参数 select 指定要保留的列
subset(iris, Sepal.Length == median(Sepal.Length),
select = c("Sepal.Length", "Species")
)## Sepal.Length Species
## 15 5.8 setosa
## 68 5.8 versicolor
## 83 5.8 versicolor
## 93 5.8 versicolor
## 102 5.8 virginica
## 115 5.8 virginica
## 143 5.8 virginica
高级操作:加入正则表达式筛选
## sometimes requiring a logical 'subset' argument is a nuisance
nm <- rownames(state.x77)
start_with_M <- nm %in% grep("^M", nm, value = TRUE)
subset(state.x77, start_with_M, Illiteracy:Murder)## Illiteracy Life Exp Murder
## Maine 0.7 70.39 2.7
## Maryland 0.9 70.22 8.5
## Massachusetts 1.1 71.83 3.3
## Michigan 0.9 70.63 11.1
## Minnesota 0.6 72.96 2.3
## Mississippi 2.4 68.09 12.5
## Missouri 0.8 70.69 9.3
## Montana 0.6 70.56 5.0
# 简化
subset(state.x77, subset = grepl("^M", rownames(state.x77)), select = Illiteracy:Murder)## Illiteracy Life Exp Murder
## Maine 0.7 70.39 2.7
## Maryland 0.9 70.22 8.5
## Massachusetts 1.1 71.83 3.3
## Michigan 0.9 70.63 11.1
## Minnesota 0.6 72.96 2.3
## Mississippi 2.4 68.09 12.5
## Missouri 0.8 70.69 9.3
## Montana 0.6 70.56 5.0
# 继续简化
subset(state.x77, grepl("^M", rownames(state.x77)), Illiteracy:Murder)## Illiteracy Life Exp Murder
## Maine 0.7 70.39 2.7
## Maryland 0.9 70.22 8.5
## Massachusetts 1.1 71.83 3.3
## Michigan 0.9 70.63 11.1
## Minnesota 0.6 72.96 2.3
## Mississippi 2.4 68.09 12.5
## Missouri 0.8 70.69 9.3
## Montana 0.6 70.56 5.0
警告:这是一个为了交互使用而打造的便捷函数。对于编程,最好使用标准的子集函数,如 [。特别地,参数 subset 的非标准计算(non-standard evaluation)[18] 可能带来意想不到的后果。
使用索引运算符 [ 完成同样的筛选:
iris[iris$Species == "virginica" & iris$Sepal.Length == 5.8, ]## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 102 5.8 2.7 5.1 1.9 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 143 5.8 2.7 5.1 1.9 virginica
iris[iris$Species == "virginica" &
iris$Sepal.Length == median(iris$Sepal.Length), ]## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 102 5.8 2.7 5.1 1.9 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 143 5.8 2.7 5.1 1.9 virginica
iris[
iris$Species == "virginica" &
iris$Sepal.Length == median(iris$Sepal.Length),
c("Sepal.Length", "Species")
]## Sepal.Length Species
## 102 5.8 virginica
## 115 5.8 virginica
## 143 5.8 virginica
iris[iris$Species == "setosa" & iris$Sepal.Length > 5.5, grepl("Sepal", colnames(iris))]## Sepal.Length Sepal.Width
## 15 5.8 4.0
## 16 5.7 4.4
## 19 5.7 3.8
subset(iris,
subset = Species == "setosa" & Sepal.Length > 5.5,
select = grepl("Sepal", colnames(iris))
)## Sepal.Length Sepal.Width
## 15 5.8 4.0
## 16 5.7 4.4
## 19 5.7 3.8
选择操作是针对数据框的列(变量/特征/字段)
library(data.table)
mtcars$cars <- rownames(mtcars)
mtcars_df <- as.data.table(mtcars)
mtcars_df[, .(mpg, disp)] |> head()## mpg disp
## 1: 21.0 160
## 2: 21.0 160
## 3: 22.8 108
## 4: 21.4 258
## 5: 18.7 360
## 6: 18.1 225
mtcars |>
dplyr::select(mpg, disp) |>
head()## mpg disp
## Mazda RX4 21.0 160
## Mazda RX4 Wag 21.0 160
## Datsun 710 22.8 108
## Hornet 4 Drive 21.4 258
## Hornet Sportabout 18.7 360
## Valiant 18.1 225
[18] Thomas Lumley (2003) Standard nonstandard evaluation rules. https://developer.r-project.org/nonstandard-eval.pdf