6.10 数据聚合
分组求和 https://stackoverflow.com/questions/1660124
主要是分组统计
apropos("apply")
## [1] "apply" "dendrapply" "eapply" "frollapply" "kernapply"
## [6] "lapply" "mapply" "rapply" "sapply" "tapply"
## [11] "vapply"
# 分组求和 colSums colMeans max
unique(iris$Species)
## [1] setosa versicolor virginica
## Levels: setosa versicolor virginica
# 分类求和
# colSums(iris[iris$Species == "setosa", -5])
# colSums(iris[iris$Species == "virginica", -5])
colSums(iris[iris$Species == "versicolor", -5])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 296.8 138.5 213.0 66.3
# apply(iris[iris$Species == "setosa", -5], 2, sum)
# apply(iris[iris$Species == "setosa", -5], 2, mean)
# apply(iris[iris$Species == "setosa", -5], 2, min)
# apply(iris[iris$Species == "setosa", -5], 2, max)
apply(iris[iris$Species == "setosa", -5], 2, quantile)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0% 4.3 2.300 1.000 0.1
## 25% 4.8 3.200 1.400 0.2
## 50% 5.0 3.400 1.500 0.2
## 75% 5.2 3.675 1.575 0.3
## 100% 5.8 4.400 1.900 0.6
aggregate: Compute Summary Statistics of Data Subsets
# 按分类变量 Species 分组求和
# aggregate(subset(iris, select = -Species), by = list(iris[, "Species"]), FUN = sum)
aggregate(iris[, -5], list(iris[, 5]), sum)
## Group.1 Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 250.3 171.4 73.1 12.3
## 2 versicolor 296.8 138.5 213.0 66.3
## 3 virginica 329.4 148.7 277.6 101.3
# 先确定位置,假设有很多分类变量
<- which("Species" == colnames(iris))
ind # 分组统计
aggregate(iris[, -ind], list(iris[, ind]), sum)
## Group.1 Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 250.3 171.4 73.1 12.3
## 2 versicolor 296.8 138.5 213.0 66.3
## 3 virginica 329.4 148.7 277.6 101.3
按照 Species 划分的类别,分组计算,使用公式表示形式,右边一定是分类变量,否则会报错误或者警告,输出奇怪的结果,请读者尝试运行aggregate(Species ~ Sepal.Length, data = iris, mean)
。公式法表示分组计算,~
左手边可以做加 +
减 -
乘 *
除 /
取余 %%
等数学运算。下面以数据集 iris 为例,只对 Sepal.Length 按 Species 分组计算
aggregate(Sepal.Length ~ Species, data = iris, mean)
## Species Sepal.Length
## 1 setosa 5.006
## 2 versicolor 5.936
## 3 virginica 6.588
与上述分组统计结果一样的命令,在大数据集上, 与 aggregate 相比,tapply 要快很多,by 是 tapply 的包裹,处理速度差不多。读者可以构造伪随机数据集验证。
# tapply(iris$Sepal.Length, list(iris$Species), mean)
with(iris, tapply(Sepal.Length, Species, mean))
## setosa versicolor virginica
## 5.006 5.936 6.588
by(iris$Sepal.Length, iris$Species, mean)
## iris$Species: setosa
## [1] 5.006
## ------------------------------------------------------------
## iris$Species: versicolor
## [1] 5.936
## ------------------------------------------------------------
## iris$Species: virginica
## [1] 6.588
对所有变量按 Species 分组计算
aggregate(. ~ Species, data = iris, mean)
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 5.006 3.428 1.462 0.246
## 2 versicolor 5.936 2.770 4.260 1.326
## 3 virginica 6.588 2.974 5.552 2.026
对变量 Sepal.Length 和 Sepal.Width 求和后,按 Species 分组计算
aggregate(Sepal.Length + Sepal.Width ~ Species, data = iris, mean)
## Species Sepal.Length + Sepal.Width
## 1 setosa 8.434
## 2 versicolor 8.706
## 3 virginica 9.562
对多个分类变量做分组计算,在数据集 ChickWeight 中 Chick和Diet都是数字编码的分类变量,其中 Chick 是有序的因子变量,Diet 是无序的因子变量,而 Time 是数值型的变量,表示小鸡出生的天数。
# 查看数据
str(ChickWeight)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame': 578 obs. of 4 variables:
## $ weight: num 42 51 59 64 76 93 106 125 149 171 ...
## $ Time : num 0 2 4 6 8 10 12 14 16 18 ...
## $ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
## $ Diet : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "formula")=Class 'formula' language weight ~ Time | Chick
## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
## - attr(*, "outer")=Class 'formula' language ~Diet
## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
## - attr(*, "labels")=List of 2
## ..$ x: chr "Time"
## ..$ y: chr "Body weight"
## - attr(*, "units")=List of 2
## ..$ x: chr "(days)"
## ..$ y: chr "(gm)"
查看数据集ChickWeight的前几行
head(ChickWeight)
## weight Time Chick Diet
## 1 42 0 1 1
## 2 51 2 1 1
## 3 59 4 1 1
## 4 64 6 1 1
## 5 76 8 1 1
....
str(ChickWeight)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame': 578 obs. of 4 variables:
## $ weight: num 42 51 59 64 76 93 106 125 149 171 ...
## $ Time : num 0 2 4 6 8 10 12 14 16 18 ...
## $ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
## $ Diet : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "formula")=Class 'formula' language weight ~ Time | Chick
....
对于数据集ChickWeight中的有序变量Chick,aggregate 会按照既定顺序返回分组计算的结果
aggregate(weight ~ Chick, data = ChickWeight, mean)
## Chick weight
## 1 18 37.00000
## 2 16 49.71429
## 3 15 60.12500
## 4 13 67.83333
## 5 9 81.16667
....
aggregate(weight ~ Diet, data = ChickWeight, mean)
## Diet weight
## 1 1 102.6455
## 2 2 122.6167
## 3 3 142.9500
## 4 4 135.2627
分类变量没有用数字编码,以 CO2 数据集为例,该数据集描述草植对二氧化碳的吸收情况,Plant 是具有12个水平的有序的因子变量,Type表示植物的源头分别是魁北克(Quebec)和密西西比(Mississippi),Treatment表示冷却(chilled)和不冷却(nonchilled)两种处理方式,conc表示周围环境中二氧化碳的浓度,uptake表示植物吸收二氧化碳的速率。
# 查看数据集
head(CO2)
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16.0
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2
str(CO2)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame': 84 obs. of 5 variables:
## $ Plant : Ord.factor w/ 12 levels "Qn1"<"Qn2"<"Qn3"<..: 1 1 1 1 1 1 1 2 2 2 ...
## $ Type : Factor w/ 2 levels "Quebec","Mississippi": 1 1 1 1 1 1 1 1 1 1 ...
## $ Treatment: Factor w/ 2 levels "nonchilled","chilled": 1 1 1 1 1 1 1 1 1 1 ...
## $ conc : num 95 175 250 350 500 675 1000 95 175 250 ...
## $ uptake : num 16 30.4 34.8 37.2 35.3 39.2 39.7 13.6 27.3 37.1 ...
## - attr(*, "formula")=Class 'formula' language uptake ~ conc | Plant
## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
## - attr(*, "outer")=Class 'formula' language ~Treatment * Type
## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
## - attr(*, "labels")=List of 2
## ..$ x: chr "Ambient carbon dioxide concentration"
## ..$ y: chr "CO2 uptake rate"
## - attr(*, "units")=List of 2
## ..$ x: chr "(uL/L)"
## ..$ y: chr "(umol/m^2 s)"
对单个变量分组统计
aggregate(uptake ~ Plant, data = CO2, mean)
## Plant uptake
## 1 Qn1 33.22857
## 2 Qn2 35.15714
## 3 Qn3 37.61429
## 4 Qc1 29.97143
## 5 Qc3 32.58571
## 6 Qc2 32.70000
## 7 Mn3 24.11429
## 8 Mn2 27.34286
## 9 Mn1 26.40000
## 10 Mc2 12.14286
## 11 Mc3 17.30000
## 12 Mc1 18.00000
aggregate(uptake ~ Type, data = CO2, mean)
## Type uptake
## 1 Quebec 33.54286
## 2 Mississippi 20.88333
aggregate(uptake ~ Treatment, data = CO2, mean)
## Treatment uptake
## 1 nonchilled 30.64286
## 2 chilled 23.78333
对多个变量分组统计,查看二氧化碳吸收速率uptake随类型Type和处理方式Treatment
aggregate(uptake ~ Type + Treatment, data = CO2, mean)
## Type Treatment uptake
## 1 Quebec nonchilled 35.33333
## 2 Mississippi nonchilled 25.95238
## 3 Quebec chilled 31.75238
## 4 Mississippi chilled 15.81429
tapply(CO2$uptake, list(CO2$Type, CO2$Treatment), mean)
## nonchilled chilled
## Quebec 35.33333 31.75238
## Mississippi 25.95238 15.81429
by(CO2$uptake, list(CO2$Type, CO2$Treatment), mean)
## : Quebec
## : nonchilled
## [1] 35.33333
## ------------------------------------------------------------
## : Mississippi
## : nonchilled
## [1] 25.95238
## ------------------------------------------------------------
## : Quebec
## : chilled
## [1] 31.75238
## ------------------------------------------------------------
## : Mississippi
## : chilled
## [1] 15.81429
在这个例子中 tapply 和 by 的输出结果的表示形式不一样,aggregate 返回一个 data.frame 数据框,tapply 返回一个表格 table,by 返回特殊的数据类型 by。
Function by
is an object-oriented wrapper for tapply
applied to data frames.
# 分组求和
# by(iris[, 1], INDICES = list(iris$Species), FUN = sum)
# by(iris[, 2], INDICES = list(iris$Species), FUN = sum)
by(iris[, 3], INDICES = list(iris$Species), FUN = sum)
## : setosa
## [1] 73.1
## ------------------------------------------------------------
## : versicolor
## [1] 213
## ------------------------------------------------------------
## : virginica
## [1] 277.6
by(iris[1:3], INDICES = list(iris$Species), FUN = sum)
## : setosa
## [1] 494.8
## ------------------------------------------------------------
## : versicolor
## [1] 648.3
## ------------------------------------------------------------
## : virginica
## [1] 755.7
by(iris[1:3], INDICES = list(iris$Species), FUN = summary)
## : setosa
## Sepal.Length Sepal.Width Petal.Length
## Min. :4.300 Min. :2.300 Min. :1.000
## 1st Qu.:4.800 1st Qu.:3.200 1st Qu.:1.400
## Median :5.000 Median :3.400 Median :1.500
## Mean :5.006 Mean :3.428 Mean :1.462
## 3rd Qu.:5.200 3rd Qu.:3.675 3rd Qu.:1.575
## Max. :5.800 Max. :4.400 Max. :1.900
## ------------------------------------------------------------
## : versicolor
## Sepal.Length Sepal.Width Petal.Length
## Min. :4.900 Min. :2.000 Min. :3.00
## 1st Qu.:5.600 1st Qu.:2.525 1st Qu.:4.00
## Median :5.900 Median :2.800 Median :4.35
## Mean :5.936 Mean :2.770 Mean :4.26
## 3rd Qu.:6.300 3rd Qu.:3.000 3rd Qu.:4.60
## Max. :7.000 Max. :3.400 Max. :5.10
## ------------------------------------------------------------
## : virginica
## Sepal.Length Sepal.Width Petal.Length
## Min. :4.900 Min. :2.200 Min. :4.500
## 1st Qu.:6.225 1st Qu.:2.800 1st Qu.:5.100
## Median :6.500 Median :3.000 Median :5.550
## Mean :6.588 Mean :2.974 Mean :5.552
## 3rd Qu.:6.900 3rd Qu.:3.175 3rd Qu.:5.875
## Max. :7.900 Max. :3.800 Max. :6.900
by(iris, INDICES = list(iris$Species), FUN = summary)
## : setosa
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.300 Min. :1.000 Min. :0.100
## 1st Qu.:4.800 1st Qu.:3.200 1st Qu.:1.400 1st Qu.:0.200
## Median :5.000 Median :3.400 Median :1.500 Median :0.200
## Mean :5.006 Mean :3.428 Mean :1.462 Mean :0.246
## 3rd Qu.:5.200 3rd Qu.:3.675 3rd Qu.:1.575 3rd Qu.:0.300
## Max. :5.800 Max. :4.400 Max. :1.900 Max. :0.600
## Species
## setosa :50
## versicolor: 0
## virginica : 0
##
##
##
## ------------------------------------------------------------
## : versicolor
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## Min. :4.900 Min. :2.000 Min. :3.00 Min. :1.000 setosa : 0
## 1st Qu.:5.600 1st Qu.:2.525 1st Qu.:4.00 1st Qu.:1.200 versicolor:50
## Median :5.900 Median :2.800 Median :4.35 Median :1.300 virginica : 0
## Mean :5.936 Mean :2.770 Mean :4.26 Mean :1.326
## 3rd Qu.:6.300 3rd Qu.:3.000 3rd Qu.:4.60 3rd Qu.:1.500
## Max. :7.000 Max. :3.400 Max. :5.10 Max. :1.800
## ------------------------------------------------------------
## : virginica
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.900 Min. :2.200 Min. :4.500 Min. :1.400
## 1st Qu.:6.225 1st Qu.:2.800 1st Qu.:5.100 1st Qu.:1.800
## Median :6.500 Median :3.000 Median :5.550 Median :2.000
## Mean :6.588 Mean :2.974 Mean :5.552 Mean :2.026
## 3rd Qu.:6.900 3rd Qu.:3.175 3rd Qu.:5.875 3rd Qu.:2.300
## Max. :7.900 Max. :3.800 Max. :6.900 Max. :2.500
## Species
## setosa : 0
## versicolor: 0
## virginica :50
##
##
##
Group Averages Over Level Combinations of Factors 分组平均
str(warpbreaks)
## 'data.frame': 54 obs. of 3 variables:
## $ breaks : num 26 30 54 25 70 52 51 26 67 18 ...
## $ wool : Factor w/ 2 levels "A","B": 1 1 1 1 1 1 1 1 1 1 ...
## $ tension: Factor w/ 3 levels "L","M","H": 1 1 1 1 1 1 1 1 1 2 ...
head(warpbreaks)
## breaks wool tension
## 1 26 A L
## 2 30 A L
## 3 54 A L
## 4 25 A L
## 5 70 A L
## 6 52 A L
ave(warpbreaks$breaks, warpbreaks$wool)
## [1] 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704
## [9] 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704
## [17] 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704
## [25] 31.03704 31.03704 31.03704 25.25926 25.25926 25.25926 25.25926 25.25926
## [33] 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926
## [41] 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926
## [49] 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926
with(warpbreaks, ave(breaks, tension, FUN = function(x) mean(x, trim = 0.1)))
## [1] 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875
## [10] 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125
## [19] 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625
## [28] 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875
## [37] 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125
## [46] 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625
# 分组求和
with(warpbreaks, ave(breaks, tension, FUN = function(x) sum(x)))
## [1] 655 655 655 655 655 655 655 655 655 475 475 475 475 475 475 475 475 475 390
## [20] 390 390 390 390 390 390 390 390 655 655 655 655 655 655 655 655 655 475 475
## [39] 475 475 475 475 475 475 475 390 390 390 390 390 390 390 390 390
# 分组求和
with(iris, ave(Sepal.Length, Species, FUN = function(x) sum(x)))
## [1] 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3
## [13] 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3
## [25] 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3
## [37] 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3
## [49] 250.3 250.3 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8
## [61] 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8
## [73] 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8
## [85] 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8
## [97] 296.8 296.8 296.8 296.8 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4
## [109] 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4
## [121] 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4
## [133] 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4
## [145] 329.4 329.4 329.4 329.4 329.4 329.4