7.2 高频操作

以面向问题的方式介绍 Base R 提供的数据操作,然后过渡到 data.table,它是加强版的 Base R。

表 7.1: 单表的操作
base dplyr
df[order(x), , drop = FALSE] arrange(df, x)
df[!duplicated(x), , drop = FALSE], unique() distinct(df, x)
df[x & !is.na(x), , drop = FALSE], subset() filter(df, x)
df$z <- df$x + df$y, transform() mutate(df, z = x + y)
df$x pull(df, x)
N/A rename(df, y = x)
df[c("x", "y")], subset() select(df, x, y)
df[grepl(names(df), "^x")] select(df, starts_with("x")
mean(df$x) summarise(df, mean(x))
df[c(1, 2, 5), , drop = FALSE] slice(df, c(1, 2, 5))
表 7.2: 两表的操作
base dplyr
merge(df1, df2) inner_join(df1, df2)
merge(df1, df2, all.x = TRUE) left_join(df1, df2)
merge(df1, df2, all.y = TRUE) right_join(df1, df2)
merge(df1, df2, all = TRUE) full_join(df1, df2)
df1[df1$x %in% df2$x, , drop = FALSE] semi_join(df1, df2)
df1[!df1$x %in% df2$x, , drop = FALSE] anti_join(df1, df2)
class(mtcars)
## [1] "data.frame"
library(data.table)
mtcars <- as.data.table(mtcars)
class(mtcars)
## [1] "data.table" "data.frame"

7.2.1 选择多列

# base
mtcars[, c("cyl", "gear")] |>  head(3)
##    cyl gear
## 1:   6    4
## 2:   6    4
## 3:   4    4
# data.table
mtcars[, c("cyl", "gear")] |>  head(3)
##    cyl gear
## 1:   6    4
## 2:   6    4
## 3:   4    4
# dplyr
dplyr::select(mtcars, cyl, gear) |>  head(3)
##    cyl gear
## 1:   6    4
## 2:   6    4
## 3:   4    4

反选多列,选择除了 cyl 和 gear 的列

## 或者 mtcars[, setdiff(names(mtcars), c("cyl", "gear"))]
mtcars[, !(names(mtcars) %in% c("cyl", "gear"))] |>  head(3)
## [1]  TRUE FALSE  TRUE
subset(mtcars, select = -c(cyl, gear)) |>  head(3)
##     mpg disp  hp drat    wt  qsec vs am carb
## 1: 21.0  160 110 3.90 2.620 16.46  0  1    4
## 2: 21.0  160 110 3.90 2.875 17.02  0  1    4
## 3: 22.8  108  93 3.85 2.320 18.61  1  1    1

7.2.2 过滤多行

# base
mtcars[mtcars$cyl == 6 & mtcars$gear == 4, ]
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1: 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2: 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 3: 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 4: 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
subset(mtcars, subset = cyl == 6 & gear == 4)
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1: 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2: 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 3: 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 4: 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
# data.table
mtcars[cyl == 6 & gear == 4, ]
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1: 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2: 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 3: 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 4: 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
# dplyr
dplyr::filter(mtcars, cyl == 6 & gear == 4)
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1: 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2: 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 3: 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 4: 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4

7.2.3 去重多行

# base
mtcars[!duplicated(mtcars[, c("cyl", "gear")])]
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1: 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2: 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 3: 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 4: 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 5: 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## 6: 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## 7: 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## 8: 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
# data.table
mtcars[!duplicated(mtcars, by = c("cyl", "gear")), ]
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1: 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2: 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 3: 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 4: 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 5: 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## 6: 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## 7: 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## 8: 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
unique(mtcars, by = c("cyl", "gear"))
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1: 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2: 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 3: 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 4: 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 5: 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## 6: 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## 7: 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## 8: 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
# dplyr
dplyr::distinct(mtcars, cyl, gear, .keep_all = TRUE)
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1: 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2: 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 3: 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 4: 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 5: 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## 6: 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## 7: 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## 8: 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6

7.2.4 合并操作

在数据库的操作中,合并又称为连接

7.2.4.1 左合并

# dplyr::inner_join()
# dplyr::left_join()
# dplyr::right_join()
# dplyr::full_join()

7.2.4.2 右合并

7.2.5 新添多列

mtcars[cyl == 6, `:=`(disp_mean = mean(disp), hp_mean = mean(hp))][cyl == 6, .(cyl, disp, hp, disp_mean, hp_mean)]
##    cyl  disp  hp disp_mean  hp_mean
## 1:   6 160.0 110  183.3143 122.2857
## 2:   6 160.0 110  183.3143 122.2857
## 3:   6 258.0 110  183.3143 122.2857
## 4:   6 225.0 105  183.3143 122.2857
## 5:   6 167.6 123  183.3143 122.2857
## 6:   6 167.6 123  183.3143 122.2857
## 7:   6 145.0 175  183.3143 122.2857

7.2.6 删除多列

删除列就是将该列的值清空,置为 NULL,下面将新添的两个列删除,根据列名的特点用正则表达式匹配

mtcars[, colnames(mtcars)[grep("_mean$", colnames(mtcars))] := NULL]

7.2.7 筛选多列

按照某一规则筛选多列

library(data.table)
iris <- as.data.table(iris)
iris[, head(.SD, 6), .SDcols = function(x) is.numeric(x)]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1:          5.1         3.5          1.4         0.2
## 2:          4.9         3.0          1.4         0.2
## 3:          4.7         3.2          1.3         0.2
## 4:          4.6         3.1          1.5         0.2
## 5:          5.0         3.6          1.4         0.2
## 6:          5.4         3.9          1.7         0.4

7.2.8 修改多列类型

mtcars[, (c("cyl", "disp")) := lapply(.SD, as.integer), .SDcols = c("cyl", "disp")]
str(mtcars)
## Classes 'data.table' and 'data.frame':   32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : int  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: int  160 160 108 258 360 225 360 146 140 167 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "index")= int(0)

7.2.9 取每组第一行

先将 mtcars 按 cyl 升序,gear 降序排列,然后按 cyl, gear 和 am 分组取第一行

mtcars[order(cyl, -gear)][, head(.SD, 1), by = list(cyl, gear, am)]
##     cyl gear am  mpg disp  hp drat    wt  qsec vs carb
##  1:   4    5  1 26.0  120  91 4.43 2.140 16.70  0    2
##  2:   4    4  1 22.8  108  93 3.85 2.320 18.61  1    1
##  3:   4    4  0 24.4  146  62 3.69 3.190 20.00  1    2
##  4:   4    3  0 21.5  120  97 3.70 2.465 20.01  1    1
##  5:   6    5  1 19.7  145 175 3.62 2.770 15.50  0    6
##  6:   6    4  1 21.0  160 110 3.90 2.620 16.46  0    4
##  7:   6    4  0 19.2  167 123 3.92 3.440 18.30  1    4
##  8:   6    3  0 21.4  258 110 3.08 3.215 19.44  1    1
##  9:   8    5  1 15.8  351 264 4.22 3.170 14.50  0    4
## 10:   8    3  0 18.7  360 175 3.15 3.440 17.02  0    2
# 或者
mtcars[order(cyl, -gear)][, .SD[1], by = list(cyl, gear, am)]
##     cyl gear am  mpg disp  hp drat    wt  qsec vs carb
##  1:   4    5  1 26.0  120  91 4.43 2.140 16.70  0    2
##  2:   4    4  1 22.8  108  93 3.85 2.320 18.61  1    1
##  3:   4    4  0 24.4  146  62 3.69 3.190 20.00  1    2
##  4:   4    3  0 21.5  120  97 3.70 2.465 20.01  1    1
##  5:   6    5  1 19.7  145 175 3.62 2.770 15.50  0    6
##  6:   6    4  1 21.0  160 110 3.90 2.620 16.46  0    4
##  7:   6    4  0 19.2  167 123 3.92 3.440 18.30  1    4
##  8:   6    3  0 21.4  258 110 3.08 3.215 19.44  1    1
##  9:   8    5  1 15.8  351 264 4.22 3.170 14.50  0    4
## 10:   8    3  0 18.7  360 175 3.15 3.440 17.02  0    2

7.2.10 计算环比同比

以数据集 AirPassengers 为例,重新整理后见表 7.3

library(magrittr)
dat <- data.frame(
  year = rep(1949:1960, each = 12),
  month = month.abb, num = AirPassengers
) %>%
  reshape(.,
    v.names = "num", idvar = "year", timevar = "month",
    direction = "wide", sep = ""
  ) %>%
  setNames(., gsub(pattern = "(num)", replacement = "", x = colnames(.)))

rownames(dat) <- subset(dat, select = year, drop = TRUE)
air_passengers <- subset(dat, select = -year)

knitr::kable(air_passengers,
  caption = "1949-1960年国际航班乘客数量变化",
  align = "c", row.names = TRUE
)
表 7.3: 1949-1960年国际航班乘客数量变化
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
1949 112 118 132 129 121 135 148 148 136 119 104 118
1950 115 126 141 135 125 149 170 170 158 133 114 140
1951 145 150 178 163 172 178 199 199 184 162 146 166
1952 171 180 193 181 183 218 230 242 209 191 172 194
1953 196 196 236 235 229 243 264 272 237 211 180 201
1954 204 188 235 227 234 264 302 293 259 229 203 229
1955 242 233 267 269 270 315 364 347 312 274 237 278
1956 284 277 317 313 318 374 413 405 355 306 271 306
1957 315 301 356 348 355 422 465 467 404 347 305 336
1958 340 318 362 348 363 435 491 505 404 359 310 337
1959 360 342 406 396 420 472 548 559 463 407 362 405
1960 417 391 419 461 472 535 622 606 508 461 390 432

横向计算环比,如1949年2月相比1月增长多少、3月相比2月增长多少,以此类推,就是计算环比?纵向计算同比,如1950年1月相比1949年1月增长多少、1951年相比1950年1月增长多少?

# 环比横向/同比纵向
mom <- function(x) diff(x, lag = 1) / x[-length(x)] # month to month
# 格式化输出
format_mom <- function(x) formatC(mom(x), format = "f", digits = 4)
library(formattable)
# 同比变化
air_passengers %>%
  apply(., 2, format_mom) %>%
  as.data.frame() %>%
  formattable(., list(
    Jan = color_tile("white", "pink"),
    Feb = color_tile("white", "springgreen4"),
    Mar = percent
  ))

library(DT)
datatable(air_passengers)

7.2.11 合并多个数据框

将所有列都保留,以 full_join() 方式合并

df1 <- iris[1:10, c(1, 5)]
df2 <- iris[11:15, c(1, 2, 5)]
df3 <- iris[16:30, c(1, 3, 5)]
all_dfs <- list(df1, df2, df3)
# base
Reduce(function(x, y, ...) merge(x, y, ..., all = TRUE), all_dfs)
##     Sepal.Length Species Sepal.Width Petal.Length
##  1:          4.3  setosa         3.0           NA
##  2:          4.4  setosa          NA           NA
##  3:          4.6  setosa          NA          1.0
##  4:          4.6  setosa          NA          1.0
##  5:          4.7  setosa          NA          1.6
##  6:          4.8  setosa         3.4          1.9
##  7:          4.8  setosa         3.0          1.9
##  8:          4.9  setosa          NA           NA
##  9:          4.9  setosa          NA           NA
## 10:          5.0  setosa          NA          1.6
## 11:          5.0  setosa          NA          1.6
## 12:          5.0  setosa          NA          1.6
## 13:          5.0  setosa          NA          1.6
## 14:          5.1  setosa          NA          1.4
## 15:          5.1  setosa          NA          1.5
## 16:          5.1  setosa          NA          1.5
## 17:          5.1  setosa          NA          1.7
## 18:          5.2  setosa          NA          1.5
## 19:          5.2  setosa          NA          1.4
## 20:          5.4  setosa         3.7          1.3
## 21:          5.4  setosa         3.7          1.7
## 22:          5.7  setosa          NA          1.5
## 23:          5.7  setosa          NA          1.7
## 24:          5.8  setosa         4.0           NA
##     Sepal.Length Species Sepal.Width Petal.Length
# dplyr
Reduce(function(x, y, ...) dplyr::full_join(x, y, ...), all_dfs)
## Warning in dplyr::full_join(x, y, ...): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 8 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
##     Sepal.Length Species Sepal.Width Petal.Length
##  1:          5.1  setosa          NA          1.4
##  2:          5.1  setosa          NA          1.5
##  3:          5.1  setosa          NA          1.5
##  4:          5.1  setosa          NA          1.7
##  5:          4.9  setosa          NA           NA
##  6:          4.7  setosa          NA          1.6
##  7:          4.6  setosa          NA          1.0
##  8:          5.0  setosa          NA          1.6
##  9:          5.0  setosa          NA          1.6
## 10:          5.4  setosa         3.7          1.3
## 11:          5.4  setosa         3.7          1.7
## 12:          4.6  setosa          NA          1.0
## 13:          5.0  setosa          NA          1.6
## 14:          5.0  setosa          NA          1.6
## 15:          4.4  setosa          NA           NA
## 16:          4.9  setosa          NA           NA
## 17:          4.8  setosa         3.4          1.9
## 18:          4.8  setosa         3.0          1.9
## 19:          4.3  setosa         3.0           NA
## 20:          5.8  setosa         4.0           NA
## 21:          5.7  setosa          NA          1.5
## 22:          5.7  setosa          NA          1.7
## 23:          5.2  setosa          NA          1.5
## 24:          5.2  setosa          NA          1.4
##     Sepal.Length Species Sepal.Width Petal.Length

合并完应该有30行,为啥只有24行?这是因为 merge() 函数对主键 key 相同的记录会合并,要想不合并,需要调用 rbindlist() 函数 https://d.cosx.org/d/421235

rbind() 列数相同的两个 data.frame 按行合并,cbind() 行数相同的两个 data.frame 按列合并,merge() 对行、列数没有要求

rbindlist(all_dfs, fill = TRUE)
##     Sepal.Length Species Sepal.Width Petal.Length
##  1:          5.1  setosa          NA           NA
##  2:          4.9  setosa          NA           NA
##  3:          4.7  setosa          NA           NA
##  4:          4.6  setosa          NA           NA
##  5:          5.0  setosa          NA           NA
##  6:          5.4  setosa          NA           NA
##  7:          4.6  setosa          NA           NA
##  8:          5.0  setosa          NA           NA
##  9:          4.4  setosa          NA           NA
## 10:          4.9  setosa          NA           NA
## 11:          5.4  setosa         3.7           NA
## 12:          4.8  setosa         3.4           NA
## 13:          4.8  setosa         3.0           NA
## 14:          4.3  setosa         3.0           NA
## 15:          5.8  setosa         4.0           NA
## 16:          5.7  setosa          NA          1.5
## 17:          5.4  setosa          NA          1.3
## 18:          5.1  setosa          NA          1.4
## 19:          5.7  setosa          NA          1.7
## 20:          5.1  setosa          NA          1.5
## 21:          5.4  setosa          NA          1.7
## 22:          5.1  setosa          NA          1.5
## 23:          4.6  setosa          NA          1.0
## 24:          5.1  setosa          NA          1.7
## 25:          4.8  setosa          NA          1.9
## 26:          5.0  setosa          NA          1.6
## 27:          5.0  setosa          NA          1.6
## 28:          5.2  setosa          NA          1.5
## 29:          5.2  setosa          NA          1.4
## 30:          4.7  setosa          NA          1.6
##     Sepal.Length Species Sepal.Width Petal.Length
# dplyr
dplyr::bind_rows(all_dfs)
##     Sepal.Length Species Sepal.Width Petal.Length
##  1:          5.1  setosa          NA           NA
##  2:          4.9  setosa          NA           NA
##  3:          4.7  setosa          NA           NA
##  4:          4.6  setosa          NA           NA
##  5:          5.0  setosa          NA           NA
##  6:          5.4  setosa          NA           NA
##  7:          4.6  setosa          NA           NA
##  8:          5.0  setosa          NA           NA
##  9:          4.4  setosa          NA           NA
## 10:          4.9  setosa          NA           NA
## 11:          5.4  setosa         3.7           NA
## 12:          4.8  setosa         3.4           NA
## 13:          4.8  setosa         3.0           NA
## 14:          4.3  setosa         3.0           NA
## 15:          5.8  setosa         4.0           NA
## 16:          5.7  setosa          NA          1.5
## 17:          5.4  setosa          NA          1.3
## 18:          5.1  setosa          NA          1.4
## 19:          5.7  setosa          NA          1.7
## 20:          5.1  setosa          NA          1.5
## 21:          5.4  setosa          NA          1.7
## 22:          5.1  setosa          NA          1.5
## 23:          4.6  setosa          NA          1.0
## 24:          5.1  setosa          NA          1.7
## 25:          4.8  setosa          NA          1.9
## 26:          5.0  setosa          NA          1.6
## 27:          5.0  setosa          NA          1.6
## 28:          5.2  setosa          NA          1.5
## 29:          5.2  setosa          NA          1.4
## 30:          4.7  setosa          NA          1.6
##     Sepal.Length Species Sepal.Width Petal.Length

7.2.12 分组聚合多个指标

https://stackoverflow.com/questions/24151602/calculate-multiple-aggregations-with-lapply-sd

# base
aggregate(
  data = mtcars, cbind(mpg, hp) ~ cyl,
  FUN = function(x) c(mean = mean(x), median = median(x))
)
##   cyl mpg.mean mpg.median   hp.mean hp.median
## 1   4 26.66364   26.00000  82.63636  91.00000
## 2   6 19.74286   19.70000 122.28571 110.00000
## 3   8 15.10000   15.20000 209.21429 192.50000
# 数据一致性 https://d.cosx.org/d/420763-base-r
with(
  aggregate(cbind(mpg, hp) ~ cyl, mtcars,
    FUN = function(x) c(mean = mean(x), median = median(x))
  ),
  cbind.data.frame(cyl, mpg, hp)
)
##   cyl     mean median      mean median
## 1   4 26.66364   26.0  82.63636   91.0
## 2   6 19.74286   19.7 122.28571  110.0
## 3   8 15.10000   15.2 209.21429  192.5
# data.table
mtcars[, as.list(unlist(lapply(.SD, function(x) {
  list(
    mean = mean(x),
    median = median(x)
  )
}))),
by = "cyl", .SDcols = c("mpg", "hp")
]
##    cyl mpg.mean mpg.median   hp.mean hp.median
## 1:   6 19.74286       19.7 122.28571     110.0
## 2:   4 26.66364       26.0  82.63636      91.0
## 3:   8 15.10000       15.2 209.21429     192.5
# dplyr
mtcars |> 
  dplyr::group_by(cyl) |> 
  dplyr::summarise(
    mean_mpg = mean(mpg), mean_hp = mean(hp),
    median_mpg = mean(mpg), median_hp = mean(hp)
  )
## # A tibble: 3 × 5
##     cyl mean_mpg mean_hp median_mpg median_hp
##   <int>    <dbl>   <dbl>      <dbl>     <dbl>
## 1     4     26.7    82.6       26.7      82.6
## 2     6     19.7   122.        19.7     122. 
## 3     8     15.1   209.        15.1     209.

7.2.13 重命名多个列

tmp <- aggregate(
  data = mtcars, cbind(mpg, hp) ~ cyl,
  FUN = median
)
tmp <- as.data.table(tmp)
setnames(tmp, old = c("mpg", "hp"), new = c("median_mpg", "median_hp"))
tmp
##    cyl median_mpg median_hp
## 1:   4       26.0      91.0
## 2:   6       19.7     110.0
## 3:   8       15.2     192.5

7.2.14 对多个列依次排序

https://stackoverflow.com/questions/1296646/how-to-sort-a-dataframe-by-multiple-columns

# base
tmp[order(median_mpg, -median_hp), ]
##    cyl median_mpg median_hp
## 1:   8       15.2     192.5
## 2:   6       19.7     110.0
## 3:   4       26.0      91.0
# data.table
setorder(tmp, median_mpg, -median_hp)
# dplyr
dplyr::arrange(tmp, median_mpg, desc(median_hp))
##    cyl median_mpg median_hp
## 1:   8       15.2     192.5
## 2:   6       19.7     110.0
## 3:   4       26.0      91.0

7.2.15 重排多个列的位置

# https://stackoverflow.com/questions/19619666/change-column-position-of-data-table
setcolorder(tmp, c("median_mpg", setdiff(names(tmp), "median_mpg")))
tmp
##    median_mpg cyl median_hp
## 1:       15.2   8     192.5
## 2:       19.7   6     110.0
## 3:       26.0   4      91.0
# dplyr
dplyr::select(tmp, "median_mpg", setdiff(names(tmp), "median_mpg"))
##    median_mpg cyl median_hp
## 1:       15.2   8     192.5
## 2:       19.7   6     110.0
## 3:       26.0   4      91.0

7.2.16 整理回归结果

dat <- split(iris, iris$Species)
mod <- lapply(dat, function(x) lm(Petal.Length ~ Sepal.Length, x))
mod <- lapply(mod, function(x) coef(summary(x)))
mod <- Map(function(x, y) {
  x <- as.data.frame(x)
  x$Species <- y
  x
}, mod, names(dat))
mod <- do.call(rbind, mod)
mod
##                          Estimate Std. Error    t value     Pr(>|t|)    Species
## setosa.(Intercept)      0.8030518 0.34387807  2.3352806 2.375647e-02     setosa
## setosa.Sepal.Length     0.1316317 0.06852690  1.9208760 6.069778e-02     setosa
## versicolor.(Intercept)  0.1851155 0.51421351  0.3599974 7.204283e-01 versicolor
## versicolor.Sepal.Length 0.6864698 0.08630708  7.9538056 2.586190e-10 versicolor
## virginica.(Intercept)   0.6104680 0.41710685  1.4635770 1.498279e-01  virginica
## virginica.Sepal.Length  0.7500808 0.06302606 11.9011203 6.297786e-16  virginica
# 管道操作
split(iris, iris$Species) %>%
  lapply(., function(x) coef(summary(lm(Petal.Length ~ Sepal.Length, x)))) %>%
  Map(function(x, y) {
    x <- as.data.frame(x)
    x$Species <- y
    x
  }, ., levels(iris$Species)) %>%
  do.call(rbind, .)
##                          Estimate Std. Error    t value     Pr(>|t|)    Species
## setosa.(Intercept)      0.8030518 0.34387807  2.3352806 2.375647e-02     setosa
## setosa.Sepal.Length     0.1316317 0.06852690  1.9208760 6.069778e-02     setosa
## versicolor.(Intercept)  0.1851155 0.51421351  0.3599974 7.204283e-01 versicolor
## versicolor.Sepal.Length 0.6864698 0.08630708  7.9538056 2.586190e-10 versicolor
## virginica.(Intercept)   0.6104680 0.41710685  1.4635770 1.498279e-01  virginica
## virginica.Sepal.Length  0.7500808 0.06302606 11.9011203 6.297786e-16  virginica
# dplyr 操作,需要 dplyr >= 1.0.0 
iris %>%
  dplyr::group_by(Species) %>%
  dplyr::summarise(broom::tidy(lm(Petal.Length ~ Sepal.Length)))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 6 × 6
## # Groups:   Species [3]
##   Species    term         estimate std.error statistic  p.value
##   <fct>      <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 setosa     (Intercept)     0.803    0.344      2.34  2.38e- 2
## 2 setosa     Sepal.Length    0.132    0.0685     1.92  6.07e- 2
## 3 versicolor (Intercept)     0.185    0.514      0.360 7.20e- 1
## 4 versicolor Sepal.Length    0.686    0.0863     7.95  2.59e-10
## 5 virginica  (Intercept)     0.610    0.417      1.46  1.50e- 1
## 6 virginica  Sepal.Length    0.750    0.0630    11.9   6.30e-16

7.2.17 :=.()

mtcars[, mpg_rate := round(mpg / sum(mpg) * 100, digits = 2), by = .(cyl, vs, am)]
mtcars[, .(mpg_rate, mpg, cyl, vs, am)]
##     mpg_rate  mpg cyl vs am
##  1:    34.04 21.0   6  0  1
##  2:    34.04 21.0   6  0  1
##  3:    11.48 22.8   4  1  1
##  4:    27.97 21.4   6  1  0
##  5:    10.35 18.7   8  0  0
##  6:    23.66 18.1   6  1  0
##  7:     7.92 14.3   8  0  0
##  8:    35.52 24.4   4  1  0
##  9:    33.19 22.8   4  1  0
## 10:    25.10 19.2   6  1  0
## 11:    23.27 17.8   6  1  0
## 12:     9.08 16.4   8  0  0
## 13:     9.58 17.3   8  0  0
## 14:     8.42 15.2   8  0  0
## 15:     5.76 10.4   8  0  0
## 16:     5.76 10.4   8  0  0
## 17:     8.14 14.7   8  0  0
## 18:    16.31 32.4   4  1  1
## 19:    15.31 30.4   4  1  1
## 20:    17.07 33.9   4  1  1
## 21:    31.30 21.5   4  1  0
## 22:     8.58 15.5   8  0  0
## 23:     8.42 15.2   8  0  0
## 24:     7.36 13.3   8  0  0
## 25:    10.63 19.2   8  0  0
## 26:    13.75 27.3   4  1  1
## 27:   100.00 26.0   4  0  1
## 28:    15.31 30.4   4  1  1
## 29:    51.30 15.8   8  0  1
## 30:    31.93 19.7   6  0  1
## 31:    48.70 15.0   8  0  1
## 32:    10.78 21.4   4  1  1
##     mpg_rate  mpg cyl vs am
mtcars[, .(mpg_rate = round(mpg / sum(mpg) * 100, digits = 2)), by = .(cyl, vs, am)]
##     cyl vs am mpg_rate
##  1:   6  0  1    34.04
##  2:   6  0  1    34.04
##  3:   6  0  1    31.93
##  4:   4  1  1    11.48
##  5:   4  1  1    16.31
##  6:   4  1  1    15.31
##  7:   4  1  1    17.07
##  8:   4  1  1    13.75
##  9:   4  1  1    15.31
## 10:   4  1  1    10.78
## 11:   6  1  0    27.97
## 12:   6  1  0    23.66
## 13:   6  1  0    25.10
## 14:   6  1  0    23.27
## 15:   8  0  0    10.35
## 16:   8  0  0     7.92
## 17:   8  0  0     9.08
## 18:   8  0  0     9.58
## 19:   8  0  0     8.42
## 20:   8  0  0     5.76
## 21:   8  0  0     5.76
## 22:   8  0  0     8.14
## 23:   8  0  0     8.58
## 24:   8  0  0     8.42
## 25:   8  0  0     7.36
## 26:   8  0  0    10.63
## 27:   4  1  0    35.52
## 28:   4  1  0    33.19
## 29:   4  1  0    31.30
## 30:   4  0  1   100.00
## 31:   8  0  1    51.30
## 32:   8  0  1    48.70
##     cyl vs am mpg_rate

7.2.18 去掉含有缺失值的记录

airquality[complete.cases(airquality), ] |>  head()
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 7    23     299  8.6   65     5   7
## 8    19      99 13.8   59     5   8
# 或着
airquality[!apply(airquality, 1, anyNA), ] |>  head()
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 7    23     299  8.6   65     5   7
## 8    19      99 13.8   59     5   8

7.2.19 集合操作

match 和 %in% https://d.cosx.org/d/421314

`%nin%` <- Negate("%in%")
# `%in%` <- function(x, table) match(x, table, nomatch = 0) > 0 # %in% 函数的定义
x <- letters[1:5]
y <- letters[3:8]

x %in% y
## [1] FALSE FALSE  TRUE  TRUE  TRUE
x %nin% y
## [1]  TRUE  TRUE FALSE FALSE FALSE

返回一个逻辑向量,x 中的元素匹配到了就返回 TRUE,否则 FALSE, %nin%%in% 的取反效果

match(x, y)
## [1] NA NA  1  2  3

x 在 y 中的匹配情况,匹配到了,就返回在 y 中匹配的位置,没有匹配到就返回 NA

setdiff(x, y)
## [1] "a" "b"
intersect(x, y)
## [1] "c" "d" "e"
union(x, y)
## [1] "a" "b" "c" "d" "e" "f" "g" "h"

7.2.20 对数值向量按既定分组计数

此数据处理过程陆续使用了 transform()cut()aggregate() 三个函数

# 对数值向量按既定分组计数
dat <- data.frame(y = 1:12)
dat <- transform(dat, x = cut(y, breaks = c(0, 6, 9, 15)))
dat <- aggregate(data = dat, y ~ x, FUN = length)
ggplot(data = dat, aes(x = x, y = y)) +
  geom_col()

data.frame(y = 1:12) %>%
  transform(x = cut(y, breaks = c(0, 6, 9, 15))) %>%
  aggregate(data = ., y ~ x, FUN = length) %>%
  ggplot(data = ., aes(x = x, y = y)) +
  geom_col()

对数值向量按分位数分组计数

dat <- data.frame(y = 1:12)
dat <- transform(dat, x = cut(
  x = y,
  breaks = quantile(y, prob = seq(0, 1, 0.25), na.rm = TRUE)
))

# dat <- transform(dat, x = cut(
#   x = y,
#   breaks = quantile(y, prob = seq(0, 1, 0.25)),
#   include.lowest = T
# ))

dat1 <- aggregate(data = dat, y ~ x, FUN = length)
ggplot(data = dat1, aes(x = x, y = y)) +
  geom_col()

7.2.21 分组排序

按变量 a 分组计算,之后按变量 b 降序排列

dat <- aggregate(data = iris, cbind(Sepal.Width, Sepal.Length) ~ Species, FUN = mean)
# 按 Species 降序排列
dat[order(dat$Species, decreasing = T), ]
##      Species Sepal.Width Sepal.Length
## 3  virginica       2.974        6.588
## 2 versicolor       2.770        5.936
## 1     setosa       3.428        5.006

7.2.22 分组获取 Top 值

分组按既定规律取数,比如按 Species 分组取 Top 6

# 分组取前6个
do.call("rbind.data.frame", lapply(base::split(x = iris, ~Species), head))
# 分组取 Top 6
do.call(rbind, lapply(split(iris, iris$Species),
  FUN = function(x) head(x[order(x$Sepal.Length, decreasing = T), ], 6)
))
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
##  1:          5.8         4.0          1.2         0.2     setosa
##  2:          5.7         4.4          1.5         0.4     setosa
##  3:          5.7         3.8          1.7         0.3     setosa
##  4:          5.5         4.2          1.4         0.2     setosa
##  5:          5.5         3.5          1.3         0.2     setosa
##  6:          5.4         3.9          1.7         0.4     setosa
##  7:          7.0         3.2          4.7         1.4 versicolor
##  8:          6.9         3.1          4.9         1.5 versicolor
##  9:          6.8         2.8          4.8         1.4 versicolor
## 10:          6.7         3.1          4.4         1.4 versicolor
## 11:          6.7         3.0          5.0         1.7 versicolor
## 12:          6.7         3.1          4.7         1.5 versicolor
## 13:          7.9         3.8          6.4         2.0  virginica
## 14:          7.7         3.8          6.7         2.2  virginica
## 15:          7.7         2.6          6.9         2.3  virginica
## 16:          7.7         2.8          6.7         2.0  virginica
## 17:          7.7         3.0          6.1         2.3  virginica
## 18:          7.6         3.0          6.6         2.1  virginica

7.2.23 分组抽样

# 分组抽样
do.call(rbind, lapply(split(iris, iris$Species),
  FUN = function(x) x[sample(1:nrow(x), size = 6), ]
))
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
##  1:          5.1         3.3          1.7         0.5     setosa
##  2:          5.2         3.4          1.4         0.2     setosa
##  3:          4.6         3.6          1.0         0.2     setosa
##  4:          4.9         3.1          1.5         0.2     setosa
##  5:          5.7         3.8          1.7         0.3     setosa
##  6:          5.4         3.7          1.5         0.2     setosa
##  7:          5.7         2.9          4.2         1.3 versicolor
##  8:          6.3         2.5          4.9         1.5 versicolor
##  9:          6.1         3.0          4.6         1.4 versicolor
## 10:          5.7         2.6          3.5         1.0 versicolor
## 11:          6.6         3.0          4.4         1.4 versicolor
## 12:          5.5         2.4          3.7         1.0 versicolor
## 13:          6.7         3.1          5.6         2.4  virginica
## 14:          7.2         3.2          6.0         1.8  virginica
## 15:          6.1         3.0          4.9         1.8  virginica
## 16:          6.3         2.7          4.9         1.8  virginica
## 17:          5.8         2.7          5.1         1.9  virginica
## 18:          6.0         3.0          4.8         1.8  virginica

7.2.24 分组计算分位数

# 分组计算分位数,如何分组呢
do.call(rbind, lapply(iris[, sapply(iris, class) == "numeric"], quantile))
##              0% 25% 50% 75% 100%
## Sepal.Length  1   1   1   1    1
## Sepal.Width   1   1   1   1    1
## Petal.Length  1   1   1   1    1
## Petal.Width   1   1   1   1    1
## Species       0   0   0   0    0
aggregate(data = iris, cbind(Sepal.Length, Sepal.Width) ~ Species, FUN = quantile)
##      Species Sepal.Length.0% Sepal.Length.25% Sepal.Length.50% Sepal.Length.75%
## 1     setosa           4.300            4.800            5.000            5.200
## 2 versicolor           4.900            5.600            5.900            6.300
## 3  virginica           4.900            6.225            6.500            6.900
##   Sepal.Length.100% Sepal.Width.0% Sepal.Width.25% Sepal.Width.50%
## 1             5.800          2.300           3.200           3.400
## 2             7.000          2.000           2.525           2.800
## 3             7.900          2.200           2.800           3.000
##   Sepal.Width.75% Sepal.Width.100%
## 1           3.675            4.400
## 2           3.000            3.400
## 3           3.175            3.800
# 对 Sepal.Length 按 Species 分组计算分位数
do.call("rbind", tapply(iris$Sepal.Length, iris$Species, quantile))
##             0%   25% 50% 75% 100%
## setosa     4.3 4.800 5.0 5.2  5.8
## versicolor 4.9 5.600 5.9 6.3  7.0
## virginica  4.9 6.225 6.5 6.9  7.9
# 分组取平均 mean /中位数 median
aggregate(data = iris, . ~ Species, FUN = mean)
##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa        5.006       3.428        1.462       0.246
## 2 versicolor        5.936       2.770        4.260       1.326
## 3  virginica        6.588       2.974        5.552       2.026

7.2.25 计算日粒度的 DoD/WoW/MoM/YoY

截止写作时间,data.table 提供的滑动窗口聚合统计函数 frollmean()frollsum()frollapply() 还处于实验阶段。 shift 提供漂移功能,向前前置 lead 或向后延迟 lag。

移动平均、求和和计算

dat <- data.frame(dt = seq(
  from = as.Date("2021-01-01"),
  to = Sys.Date(), by = "1 day"
))

dat <- within(dat, {
  uv = round(1000 * runif(n = nrow(dat)))
  uv_dod_d = ifelse(nrow(dat) <= 1, NA, c(NA, diff(uv, lag = 1)))
  uv_wow_d = ifelse(nrow(dat) <= 7, NA, c(rep(NA, 7), diff(uv, lag = 7)))
  uv_mom_d = ifelse(nrow(dat) <= 30, NA, c(rep(NA, 30), diff(uv, lag = 30)))
  uv_yoy_d = ifelse(nrow(dat) <= 365, NA, c(rep(NA, 365), diff(uv, lag = 365)))
})