6.10 数据聚合

分组求和 https://stackoverflow.com/questions/1660124

主要是分组统计

apropos("apply")

##  [1] "apply"      "dendrapply" "eapply"     "frollapply" "kernapply" 
##  [6] "lapply"     "mapply"     "rapply"     "sapply"     "tapply"    
## [11] "vapply"

# 分组求和 colSums colMeans max
unique(iris$Species)

## [1] setosa     versicolor virginica 
## Levels: setosa versicolor virginica

# 分类求和
# colSums(iris[iris$Species == "setosa", -5])
# colSums(iris[iris$Species == "virginica", -5])
colSums(iris[iris$Species == "versicolor", -5])

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        296.8        138.5        213.0         66.3

# apply(iris[iris$Species == "setosa", -5], 2, sum)
# apply(iris[iris$Species == "setosa", -5], 2, mean)
# apply(iris[iris$Species == "setosa", -5], 2, min)
# apply(iris[iris$Species == "setosa", -5], 2, max)
apply(iris[iris$Species == "setosa", -5], 2, quantile)

##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0%            4.3       2.300        1.000         0.1
## 25%           4.8       3.200        1.400         0.2
## 50%           5.0       3.400        1.500         0.2
## 75%           5.2       3.675        1.575         0.3
## 100%          5.8       4.400        1.900         0.6

aggregate: Compute Summary Statistics of Data Subsets

# 按分类变量 Species 分组求和
# aggregate(subset(iris, select = -Species), by = list(iris[, "Species"]), FUN = sum)
aggregate(iris[, -5], list(iris[, 5]), sum)

##      Group.1 Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa        250.3       171.4         73.1        12.3
## 2 versicolor        296.8       138.5        213.0        66.3
## 3  virginica        329.4       148.7        277.6       101.3

# 先确定位置，假设有很多分类变量
ind <- which("Species" == colnames(iris))
# 分组统计
aggregate(iris[, -ind], list(iris[, ind]), sum)

##      Group.1 Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa        250.3       171.4         73.1        12.3
## 2 versicolor        296.8       138.5        213.0        66.3
## 3  virginica        329.4       148.7        277.6       101.3

按照 Species 划分的类别，分组计算，使用公式表示形式，右边一定是分类变量，否则会报错误或者警告，输出奇怪的结果，请读者尝试运行aggregate(Species ~ Sepal.Length, data = iris, mean)。公式法表示分组计算，~ 左手边可以做加 + 减 - 乘 * 除 / 取余 %% 等数学运算。下面以数据集 iris 为例，只对 Sepal.Length 按 Species 分组计算

aggregate(Sepal.Length ~ Species, data = iris, mean)

##      Species Sepal.Length
## 1     setosa        5.006
## 2 versicolor        5.936
## 3  virginica        6.588

与上述分组统计结果一样的命令，在大数据集上，与 aggregate 相比，tapply 要快很多，by 是 tapply 的包裹，处理速度差不多。读者可以构造伪随机数据集验证。

# tapply(iris$Sepal.Length, list(iris$Species), mean)
with(iris, tapply(Sepal.Length, Species, mean))

##     setosa versicolor  virginica 
##      5.006      5.936      6.588

by(iris$Sepal.Length, iris$Species, mean)

## iris$Species: setosa
## [1] 5.006
## ------------------------------------------------------------ 
## iris$Species: versicolor
## [1] 5.936
## ------------------------------------------------------------ 
## iris$Species: virginica
## [1] 6.588

对所有变量按 Species 分组计算

aggregate(. ~ Species, data = iris, mean)

##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa        5.006       3.428        1.462       0.246
## 2 versicolor        5.936       2.770        4.260       1.326
## 3  virginica        6.588       2.974        5.552       2.026

对变量 Sepal.Length 和 Sepal.Width 求和后，按 Species 分组计算

aggregate(Sepal.Length + Sepal.Width ~ Species, data = iris, mean)

##      Species Sepal.Length + Sepal.Width
## 1     setosa                      8.434
## 2 versicolor                      8.706
## 3  virginica                      9.562

对多个分类变量做分组计算，在数据集 ChickWeight 中 Chick和Diet都是数字编码的分类变量，其中 Chick 是有序的因子变量，Diet 是无序的因子变量，而 Time 是数值型的变量，表示小鸡出生的天数。

# 查看数据
str(ChickWeight)

## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':   578 obs. of  4 variables:
##  $ weight: num  42 51 59 64 76 93 106 125 149 171 ...
##  $ Time  : num  0 2 4 6 8 10 12 14 16 18 ...
##  $ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
##  $ Diet  : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "formula")=Class 'formula'  language weight ~ Time | Chick
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "outer")=Class 'formula'  language ~Diet
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "labels")=List of 2
##   ..$ x: chr "Time"
##   ..$ y: chr "Body weight"
##  - attr(*, "units")=List of 2
##   ..$ x: chr "(days)"
##   ..$ y: chr "(gm)"

查看数据集ChickWeight的前几行

head(ChickWeight)

##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
....

str(ChickWeight)

## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':   578 obs. of  4 variables:
##  $ weight: num  42 51 59 64 76 93 106 125 149 171 ...
##  $ Time  : num  0 2 4 6 8 10 12 14 16 18 ...
##  $ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
##  $ Diet  : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "formula")=Class 'formula'  language weight ~ Time | Chick
....

对于数据集ChickWeight中的有序变量Chick，aggregate 会按照既定顺序返回分组计算的结果

aggregate(weight ~ Chick, data = ChickWeight, mean)

##    Chick    weight
## 1     18  37.00000
## 2     16  49.71429
## 3     15  60.12500
## 4     13  67.83333
## 5      9  81.16667
....

aggregate(weight ~ Diet, data = ChickWeight, mean)

##   Diet   weight
## 1    1 102.6455
## 2    2 122.6167
## 3    3 142.9500
## 4    4 135.2627

分类变量没有用数字编码，以 CO2 数据集为例，该数据集描述草植对二氧化碳的吸收情况，Plant 是具有12个水平的有序的因子变量，Type表示植物的源头分别是魁北克(Quebec)和密西西比(Mississippi)，Treatment表示冷却(chilled)和不冷却(nonchilled)两种处理方式，conc表示周围环境中二氧化碳的浓度，uptake表示植物吸收二氧化碳的速率。

# 查看数据集
head(CO2)

##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled   95   16.0
## 2   Qn1 Quebec nonchilled  175   30.4
## 3   Qn1 Quebec nonchilled  250   34.8
## 4   Qn1 Quebec nonchilled  350   37.2
## 5   Qn1 Quebec nonchilled  500   35.3
## 6   Qn1 Quebec nonchilled  675   39.2

str(CO2)

## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':   84 obs. of  5 variables:
##  $ Plant    : Ord.factor w/ 12 levels "Qn1"<"Qn2"<"Qn3"<..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ Type     : Factor w/ 2 levels "Quebec","Mississippi": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Treatment: Factor w/ 2 levels "nonchilled","chilled": 1 1 1 1 1 1 1 1 1 1 ...
##  $ conc     : num  95 175 250 350 500 675 1000 95 175 250 ...
##  $ uptake   : num  16 30.4 34.8 37.2 35.3 39.2 39.7 13.6 27.3 37.1 ...
##  - attr(*, "formula")=Class 'formula'  language uptake ~ conc | Plant
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "outer")=Class 'formula'  language ~Treatment * Type
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "labels")=List of 2
##   ..$ x: chr "Ambient carbon dioxide concentration"
##   ..$ y: chr "CO2 uptake rate"
##  - attr(*, "units")=List of 2
##   ..$ x: chr "(uL/L)"
##   ..$ y: chr "(umol/m^2 s)"

对单个变量分组统计

aggregate(uptake ~ Plant, data = CO2, mean)

##    Plant   uptake
## 1    Qn1 33.22857
## 2    Qn2 35.15714
## 3    Qn3 37.61429
## 4    Qc1 29.97143
## 5    Qc3 32.58571
## 6    Qc2 32.70000
## 7    Mn3 24.11429
## 8    Mn2 27.34286
## 9    Mn1 26.40000
## 10   Mc2 12.14286
## 11   Mc3 17.30000
## 12   Mc1 18.00000

aggregate(uptake ~ Type, data = CO2, mean)

##          Type   uptake
## 1      Quebec 33.54286
## 2 Mississippi 20.88333

aggregate(uptake ~ Treatment, data = CO2, mean)

##    Treatment   uptake
## 1 nonchilled 30.64286
## 2    chilled 23.78333

对多个变量分组统计，查看二氧化碳吸收速率uptake随类型Type和处理方式Treatment

aggregate(uptake ~ Type + Treatment, data = CO2, mean)

##          Type  Treatment   uptake
## 1      Quebec nonchilled 35.33333
## 2 Mississippi nonchilled 25.95238
## 3      Quebec    chilled 31.75238
## 4 Mississippi    chilled 15.81429

tapply(CO2$uptake, list(CO2$Type, CO2$Treatment), mean)

##             nonchilled  chilled
## Quebec        35.33333 31.75238
## Mississippi   25.95238 15.81429

by(CO2$uptake, list(CO2$Type, CO2$Treatment), mean)

## : Quebec
## : nonchilled
## [1] 35.33333
## ------------------------------------------------------------ 
## : Mississippi
## : nonchilled
## [1] 25.95238
## ------------------------------------------------------------ 
## : Quebec
## : chilled
## [1] 31.75238
## ------------------------------------------------------------ 
## : Mississippi
## : chilled
## [1] 15.81429

在这个例子中 tapply 和 by 的输出结果的表示形式不一样，aggregate 返回一个 data.frame 数据框，tapply 返回一个表格 table，by 返回特殊的数据类型 by。

Function by is an object-oriented wrapper for tapply applied to data frames.

# 分组求和
# by(iris[, 1], INDICES = list(iris$Species), FUN = sum)
# by(iris[, 2], INDICES = list(iris$Species), FUN = sum)
by(iris[, 3], INDICES = list(iris$Species), FUN = sum)

## : setosa
## [1] 73.1
## ------------------------------------------------------------ 
## : versicolor
## [1] 213
## ------------------------------------------------------------ 
## : virginica
## [1] 277.6

by(iris[1:3], INDICES = list(iris$Species), FUN = sum)

## : setosa
## [1] 494.8
## ------------------------------------------------------------ 
## : versicolor
## [1] 648.3
## ------------------------------------------------------------ 
## : virginica
## [1] 755.7

by(iris[1:3], INDICES = list(iris$Species), FUN = summary)

## : setosa
##   Sepal.Length    Sepal.Width     Petal.Length  
##  Min.   :4.300   Min.   :2.300   Min.   :1.000  
##  1st Qu.:4.800   1st Qu.:3.200   1st Qu.:1.400  
##  Median :5.000   Median :3.400   Median :1.500  
##  Mean   :5.006   Mean   :3.428   Mean   :1.462  
##  3rd Qu.:5.200   3rd Qu.:3.675   3rd Qu.:1.575  
##  Max.   :5.800   Max.   :4.400   Max.   :1.900  
## ------------------------------------------------------------ 
## : versicolor
##   Sepal.Length    Sepal.Width     Petal.Length 
##  Min.   :4.900   Min.   :2.000   Min.   :3.00  
##  1st Qu.:5.600   1st Qu.:2.525   1st Qu.:4.00  
##  Median :5.900   Median :2.800   Median :4.35  
##  Mean   :5.936   Mean   :2.770   Mean   :4.26  
##  3rd Qu.:6.300   3rd Qu.:3.000   3rd Qu.:4.60  
##  Max.   :7.000   Max.   :3.400   Max.   :5.10  
## ------------------------------------------------------------ 
## : virginica
##   Sepal.Length    Sepal.Width     Petal.Length  
##  Min.   :4.900   Min.   :2.200   Min.   :4.500  
##  1st Qu.:6.225   1st Qu.:2.800   1st Qu.:5.100  
##  Median :6.500   Median :3.000   Median :5.550  
##  Mean   :6.588   Mean   :2.974   Mean   :5.552  
##  3rd Qu.:6.900   3rd Qu.:3.175   3rd Qu.:5.875  
##  Max.   :7.900   Max.   :3.800   Max.   :6.900

by(iris, INDICES = list(iris$Species), FUN = summary)

## : setosa
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.300   Min.   :1.000   Min.   :0.100  
##  1st Qu.:4.800   1st Qu.:3.200   1st Qu.:1.400   1st Qu.:0.200  
##  Median :5.000   Median :3.400   Median :1.500   Median :0.200  
##  Mean   :5.006   Mean   :3.428   Mean   :1.462   Mean   :0.246  
##  3rd Qu.:5.200   3rd Qu.:3.675   3rd Qu.:1.575   3rd Qu.:0.300  
##  Max.   :5.800   Max.   :4.400   Max.   :1.900   Max.   :0.600  
##        Species  
##  setosa    :50  
##  versicolor: 0  
##  virginica : 0  
##                 
##                 
##                 
## ------------------------------------------------------------ 
## : versicolor
##   Sepal.Length    Sepal.Width     Petal.Length   Petal.Width          Species  
##  Min.   :4.900   Min.   :2.000   Min.   :3.00   Min.   :1.000   setosa    : 0  
##  1st Qu.:5.600   1st Qu.:2.525   1st Qu.:4.00   1st Qu.:1.200   versicolor:50  
##  Median :5.900   Median :2.800   Median :4.35   Median :1.300   virginica : 0  
##  Mean   :5.936   Mean   :2.770   Mean   :4.26   Mean   :1.326                  
##  3rd Qu.:6.300   3rd Qu.:3.000   3rd Qu.:4.60   3rd Qu.:1.500                  
##  Max.   :7.000   Max.   :3.400   Max.   :5.10   Max.   :1.800                  
## ------------------------------------------------------------ 
## : virginica
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.900   Min.   :2.200   Min.   :4.500   Min.   :1.400  
##  1st Qu.:6.225   1st Qu.:2.800   1st Qu.:5.100   1st Qu.:1.800  
##  Median :6.500   Median :3.000   Median :5.550   Median :2.000  
##  Mean   :6.588   Mean   :2.974   Mean   :5.552   Mean   :2.026  
##  3rd Qu.:6.900   3rd Qu.:3.175   3rd Qu.:5.875   3rd Qu.:2.300  
##  Max.   :7.900   Max.   :3.800   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    : 0  
##  versicolor: 0  
##  virginica :50  
##                 
##                 
##

Group Averages Over Level Combinations of Factors 分组平均

str(warpbreaks)

## 'data.frame':    54 obs. of  3 variables:
##  $ breaks : num  26 30 54 25 70 52 51 26 67 18 ...
##  $ wool   : Factor w/ 2 levels "A","B": 1 1 1 1 1 1 1 1 1 1 ...
##  $ tension: Factor w/ 3 levels "L","M","H": 1 1 1 1 1 1 1 1 1 2 ...

head(warpbreaks)

##   breaks wool tension
## 1     26    A       L
## 2     30    A       L
## 3     54    A       L
## 4     25    A       L
## 5     70    A       L
## 6     52    A       L

ave(warpbreaks$breaks, warpbreaks$wool)

##  [1] 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704
##  [9] 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704
## [17] 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704 31.03704
## [25] 31.03704 31.03704 31.03704 25.25926 25.25926 25.25926 25.25926 25.25926
## [33] 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926
## [41] 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926
## [49] 25.25926 25.25926 25.25926 25.25926 25.25926 25.25926

with(warpbreaks, ave(breaks, tension, FUN = function(x) mean(x, trim = 0.1)))

##  [1] 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875
## [10] 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125
## [19] 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625
## [28] 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875 35.6875
## [37] 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125 26.3125
## [46] 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625 21.0625

# 分组求和
with(warpbreaks, ave(breaks, tension, FUN = function(x) sum(x)))

##  [1] 655 655 655 655 655 655 655 655 655 475 475 475 475 475 475 475 475 475 390
## [20] 390 390 390 390 390 390 390 390 655 655 655 655 655 655 655 655 655 475 475
## [39] 475 475 475 475 475 475 475 390 390 390 390 390 390 390 390 390

# 分组求和
with(iris, ave(Sepal.Length, Species, FUN = function(x) sum(x)))

##   [1] 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3
##  [13] 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3
##  [25] 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3
##  [37] 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3 250.3
##  [49] 250.3 250.3 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8
##  [61] 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8
##  [73] 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8
##  [85] 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8 296.8
##  [97] 296.8 296.8 296.8 296.8 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4
## [109] 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4
## [121] 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4
## [133] 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4 329.4
## [145] 329.4 329.4 329.4 329.4 329.4 329.4