6.2 提取子集
subset(x, subset, select, drop = FALSE, ...)
参数 subset 代表行操作,参数 select 代表列操作,函数 subset 用于从数据框中提取部分数据。
# Keep the rows where Species is "virginica" and Sepal.Length exceeds 7.5.
subset(iris, subset = Species == "virginica" & Sepal.Length > 7.5)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 106 7.6 3.0 6.6 2.1 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 123 7.7 2.8 6.7 2.0 virginica
## 132 7.9 3.8 6.4 2.0 virginica
## 136 7.7 3.0 6.1 2.3 virginica
# summary(iris$Sepal.Length) mean(iris$Sepal.Length)
# 且的逻辑
# subset(iris, Species == "virginica" & Sepal.Length > 5.8)
# Rows where Species is "virginica" AND Sepal.Length equals the median.
# median(Sepal.Length) is evaluated on the full column of iris, not only on
# the virginica rows.
subset(iris, Species == "virginica" &
  Sepal.Length == median(Sepal.Length))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 102 5.8 2.7 5.1 1.9 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 143 5.8 2.7 5.1 1.9 virginica
# 在行的子集范围内
# Rows whose Species is one of the two listed levels AND whose Sepal.Length
# equals the median of the full Sepal.Length column.
subset(iris, Species %in% c("virginica", "versicolor") &
  Sepal.Length == median(Sepal.Length))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 68 5.8 2.7 4.1 1.0 versicolor
## 83 5.8 2.7 3.9 1.2 versicolor
## 93 5.8 2.6 4.0 1.2 versicolor
## 102 5.8 2.7 5.1 1.9 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 143 5.8 2.7 5.1 1.9 virginica
# 在列的子集内 先选中列
# Rows whose Sepal.Length equals the column median; `select` keeps only the
# two named columns.
subset(iris, Sepal.Length == median(Sepal.Length),
select = c("Sepal.Length", "Species")
)
## Sepal.Length Species
## 15 5.8 setosa
## 68 5.8 versicolor
## 83 5.8 versicolor
## 93 5.8 versicolor
## 102 5.8 virginica
## 115 5.8 virginica
## 143 5.8 virginica
高级操作:加入正则表达式筛选
## sometimes requiring a logical 'subset' argument is a nuisance
# Build the logical row filter outside the call: TRUE for every state whose
# name starts with "M".
nm <- rownames(state.x77)
start_with_M <- nm %in% grep("^M", nm, value = TRUE)
# Keep those rows and the columns from Illiteracy through Murder.
subset(state.x77, start_with_M, Illiteracy:Murder)
## Illiteracy Life Exp Murder
## Maine 0.7 70.39 2.7
## Maryland 0.9 70.22 8.5
## Massachusetts 1.1 71.83 3.3
## Michigan 0.9 70.63 11.1
## Minnesota 0.6 72.96 2.3
## Mississippi 2.4 68.09 12.5
## Missouri 0.8 70.69 9.3
## Montana 0.6 70.56 5.0
# 简化
# Rows whose name starts with "M", columns Illiteracy through Murder; grepl()
# returns the logical row filter directly, with both arguments named.
subset(state.x77, subset = grepl("^M", rownames(state.x77)), select = Illiteracy:Murder)
## Illiteracy Life Exp Murder
## Maine 0.7 70.39 2.7
## Maryland 0.9 70.22 8.5
## Massachusetts 1.1 71.83 3.3
## Michigan 0.9 70.63 11.1
## Minnesota 0.6 72.96 2.3
## Mississippi 2.4 68.09 12.5
## Missouri 0.8 70.69 9.3
## Montana 0.6 70.56 5.0
# 继续简化
# Rows whose name starts with "M", columns Illiteracy through Murder, with the
# row filter and column range passed by position.
subset(state.x77, grepl("^M", rownames(state.x77)), Illiteracy:Murder)
## Illiteracy Life Exp Murder
## Maine 0.7 70.39 2.7
## Maryland 0.9 70.22 8.5
## Massachusetts 1.1 71.83 3.3
## Michigan 0.9 70.63 11.1
## Minnesota 0.6 72.96 2.3
## Mississippi 2.4 68.09 12.5
## Missouri 0.8 70.69 9.3
## Montana 0.6 70.56 5.0
警告:这是一个为了交互使用打造的便捷函数。对于编程,最好使用标准的子集函数,如 `[`;特别地,参数 subset 的非标准计算(non-standard evaluation)[18] 可能带来意想不到的后果。
使用索引 [
# Standard `[` indexing: the row filter spells out iris$ for every column, and
# the empty column position keeps all columns.
iris[iris$Species == "virginica" & iris$Sepal.Length == 5.8, ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 102 5.8 2.7 5.1 1.9 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 143 5.8 2.7 5.1 1.9 virginica
# Same kind of row filter with `[`, comparing against the median of the full
# iris$Sepal.Length column; all columns are kept.
iris[iris$Species == "virginica" &
  iris$Sepal.Length == median(iris$Sepal.Length), ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 102 5.8 2.7 5.1 1.9 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 143 5.8 2.7 5.1 1.9 virginica
# `[` with both positions filled: the first argument filters rows, the second
# names the columns to keep.
iris[
  iris$Species == "virginica" &
    iris$Sepal.Length == median(iris$Sepal.Length),
  c("Sepal.Length", "Species")
]
## Sepal.Length Species
## 102 5.8 virginica
## 115 5.8 virginica
## 143 5.8 virginica
# Rows: setosa flowers with Sepal.Length above 5.5.
# Columns: a logical vector from grepl() keeps every column whose name
# contains "Sepal".
iris[
  iris$Species == "setosa" & iris$Sepal.Length > 5.5,
  grepl("Sepal", colnames(iris))
]
## Sepal.Length Sepal.Width
## 15 5.8 4.0
## 16 5.7 4.4
## 19 5.7 3.8
# subset() form: setosa rows with Sepal.Length above 5.5; the logical vector
# passed to `select` keeps the columns whose name contains "Sepal".
subset(iris,
subset = Species == "setosa" & Sepal.Length > 5.5,
select = grepl("Sepal", colnames(iris))
)
## Sepal.Length Sepal.Width
## 15 5.8 4.0
## 16 5.7 4.4
## 19 5.7 3.8
选择操作是针对数据框的列(变量/特征/字段)
library(data.table)
# Store the row names in a regular column before converting to a data.table.
mtcars$cars <- rownames(mtcars)
mtcars_df <- as.data.table(mtcars)
# Select the columns mpg and disp with data.table's .() and show the first rows.
mtcars_df[, .(mpg, disp)] |> head()
## mpg disp
## 1: 21.0 160
## 2: 21.0 160
## 3: 22.8 108
## 4: 21.4 258
## 5: 18.7 360
## 6: 18.1 225
# Same column selection with dplyr::select() on the data frame, piped into
# head() to show the first rows.
mtcars |>
  dplyr::select(mpg, disp) |>
  head()
## mpg disp
## Mazda RX4 21.0 160
## Mazda RX4 Wag 21.0 160
## Datsun 710 22.8 108
## Hornet 4 Drive 21.4 258
## Hornet Sportabout 18.7 360
## Valiant 18.1 225
[18] Thomas Lumley (2003). Standard nonstandard evaluation rules. https://developer.r-project.org/nonstandard-eval.pdf