7.2 高频操作
以面向问题的方式介绍 Base R 提供的数据操作,然后过渡到 data.table,它是加强版的 Base R。
| base | dplyr |
|---|---|
df[order(x), , drop = FALSE] |
arrange(df, x) |
df[!duplicated(x), , drop = FALSE], unique() |
distinct(df, x) |
df[x & !is.na(x), , drop = FALSE], subset() |
filter(df, x) |
df$z <- df$x + df$y, transform() |
mutate(df, z = x + y) |
df$x |
pull(df, x) |
| N/A | rename(df, y = x) |
df[c("x", "y")], subset() |
select(df, x, y) |
df[grepl(names(df), "^x")] |
select(df, starts_with("x") |
mean(df$x) |
summarise(df, mean(x)) |
df[c(1, 2, 5), , drop = FALSE] |
slice(df, c(1, 2, 5)) |
| base | dplyr |
|---|---|
merge(df1, df2) |
inner_join(df1, df2) |
merge(df1, df2, all.x = TRUE) |
left_join(df1, df2) |
merge(df1, df2, all.y = TRUE) |
right_join(df1, df2) |
merge(df1, df2, all = TRUE) |
full_join(df1, df2) |
df1[df1$x %in% df2$x, , drop = FALSE] |
semi_join(df1, df2) |
df1[!df1$x %in% df2$x, , drop = FALSE] |
anti_join(df1, df2) |
class(mtcars)## [1] "data.frame"
library(data.table)
mtcars <- as.data.table(mtcars)
class(mtcars)## [1] "data.table" "data.frame"
7.2.1 选择多列
# base
mtcars[, c("cyl", "gear")] |> head(3)## cyl gear
## 1: 6 4
## 2: 6 4
## 3: 4 4
# data.table
mtcars[, c("cyl", "gear")] |> head(3)## cyl gear
## 1: 6 4
## 2: 6 4
## 3: 4 4
# dplyr
dplyr::select(mtcars, cyl, gear) |> head(3)## cyl gear
## 1: 6 4
## 2: 6 4
## 3: 4 4
反选多列,选择除了 cyl 和 gear 的列
## 或者 mtcars[, setdiff(names(mtcars), c("cyl", "gear"))]
mtcars[, !(names(mtcars) %in% c("cyl", "gear"))] |> head(3)## [1] TRUE FALSE TRUE
subset(mtcars, select = -c(cyl, gear)) |> head(3)## mpg disp hp drat wt qsec vs am carb
## 1: 21.0 160 110 3.90 2.620 16.46 0 1 4
## 2: 21.0 160 110 3.90 2.875 17.02 0 1 4
## 3: 22.8 108 93 3.85 2.320 18.61 1 1 1
7.2.2 过滤多行
# base
mtcars[mtcars$cyl == 6 & mtcars$gear == 4, ]## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 4: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
subset(mtcars, subset = cyl == 6 & gear == 4)## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 4: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
# data.table
mtcars[cyl == 6 & gear == 4, ]## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 4: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
# dplyr
dplyr::filter(mtcars, cyl == 6 & gear == 4)## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 4: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
7.2.3 去重多行
# base
mtcars[!duplicated(mtcars[, c("cyl", "gear")])]## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 3: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 4: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 5: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 6: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 7: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 8: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
# data.table
mtcars[!duplicated(mtcars, by = c("cyl", "gear")), ]## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 3: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 4: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 5: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 6: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 7: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 8: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
unique(mtcars, by = c("cyl", "gear"))## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 3: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 4: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 5: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 6: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 7: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 8: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
# dplyr
dplyr::distinct(mtcars, cyl, gear, .keep_all = TRUE)## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 3: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 4: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 5: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 6: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 7: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 8: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
7.2.5 新添多列
mtcars[cyl == 6, `:=`(disp_mean = mean(disp), hp_mean = mean(hp))][cyl == 6, .(cyl, disp, hp, disp_mean, hp_mean)]## cyl disp hp disp_mean hp_mean
## 1: 6 160.0 110 183.3143 122.2857
## 2: 6 160.0 110 183.3143 122.2857
## 3: 6 258.0 110 183.3143 122.2857
## 4: 6 225.0 105 183.3143 122.2857
## 5: 6 167.6 123 183.3143 122.2857
## 6: 6 167.6 123 183.3143 122.2857
## 7: 6 145.0 175 183.3143 122.2857
7.2.6 删除多列
删除列就是将该列的值清空,置为 NULL,下面将新添的两个列删除,根据列名的特点用正则表达式匹配
mtcars[, colnames(mtcars)[grep("_mean$", colnames(mtcars))] := NULL]7.2.7 筛选多列
按照某一规则筛选多列
library(data.table)
iris <- as.data.table(iris)
iris[, head(.SD, 6), .SDcols = function(x) is.numeric(x)]## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1: 5.1 3.5 1.4 0.2
## 2: 4.9 3.0 1.4 0.2
## 3: 4.7 3.2 1.3 0.2
## 4: 4.6 3.1 1.5 0.2
## 5: 5.0 3.6 1.4 0.2
## 6: 5.4 3.9 1.7 0.4
7.2.8 修改多列类型
mtcars[, (c("cyl", "disp")) := lapply(.SD, as.integer), .SDcols = c("cyl", "disp")]
str(mtcars)## Classes 'data.table' and 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : int 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: int 160 160 108 258 360 225 360 146 140 167 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "index")= int(0)
7.2.9 取每组第一行
先将 mtcars 按 cyl 升序,gear 降序排列,然后按 cyl, gear 和 am 分组取第一行
mtcars[order(cyl, -gear)][, head(.SD, 1), by = list(cyl, gear, am)]## cyl gear am mpg disp hp drat wt qsec vs carb
## 1: 4 5 1 26.0 120 91 4.43 2.140 16.70 0 2
## 2: 4 4 1 22.8 108 93 3.85 2.320 18.61 1 1
## 3: 4 4 0 24.4 146 62 3.69 3.190 20.00 1 2
## 4: 4 3 0 21.5 120 97 3.70 2.465 20.01 1 1
## 5: 6 5 1 19.7 145 175 3.62 2.770 15.50 0 6
## 6: 6 4 1 21.0 160 110 3.90 2.620 16.46 0 4
## 7: 6 4 0 19.2 167 123 3.92 3.440 18.30 1 4
## 8: 6 3 0 21.4 258 110 3.08 3.215 19.44 1 1
## 9: 8 5 1 15.8 351 264 4.22 3.170 14.50 0 4
## 10: 8 3 0 18.7 360 175 3.15 3.440 17.02 0 2
# 或者
mtcars[order(cyl, -gear)][, .SD[1], by = list(cyl, gear, am)]## cyl gear am mpg disp hp drat wt qsec vs carb
## 1: 4 5 1 26.0 120 91 4.43 2.140 16.70 0 2
## 2: 4 4 1 22.8 108 93 3.85 2.320 18.61 1 1
## 3: 4 4 0 24.4 146 62 3.69 3.190 20.00 1 2
## 4: 4 3 0 21.5 120 97 3.70 2.465 20.01 1 1
## 5: 6 5 1 19.7 145 175 3.62 2.770 15.50 0 6
## 6: 6 4 1 21.0 160 110 3.90 2.620 16.46 0 4
## 7: 6 4 0 19.2 167 123 3.92 3.440 18.30 1 4
## 8: 6 3 0 21.4 258 110 3.08 3.215 19.44 1 1
## 9: 8 5 1 15.8 351 264 4.22 3.170 14.50 0 4
## 10: 8 3 0 18.7 360 175 3.15 3.440 17.02 0 2
7.2.10 计算环比同比
以数据集 AirPassengers 为例,重新整理后见表 7.3
library(magrittr)
dat <- data.frame(
year = rep(1949:1960, each = 12),
month = month.abb, num = AirPassengers
) %>%
reshape(.,
v.names = "num", idvar = "year", timevar = "month",
direction = "wide", sep = ""
) %>%
setNames(., gsub(pattern = "(num)", replacement = "", x = colnames(.)))
rownames(dat) <- subset(dat, select = year, drop = TRUE)
air_passengers <- subset(dat, select = -year)
knitr::kable(air_passengers,
caption = "1949-1960年国际航班乘客数量变化",
align = "c", row.names = TRUE
)| Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1949 | 112 | 118 | 132 | 129 | 121 | 135 | 148 | 148 | 136 | 119 | 104 | 118 |
| 1950 | 115 | 126 | 141 | 135 | 125 | 149 | 170 | 170 | 158 | 133 | 114 | 140 |
| 1951 | 145 | 150 | 178 | 163 | 172 | 178 | 199 | 199 | 184 | 162 | 146 | 166 |
| 1952 | 171 | 180 | 193 | 181 | 183 | 218 | 230 | 242 | 209 | 191 | 172 | 194 |
| 1953 | 196 | 196 | 236 | 235 | 229 | 243 | 264 | 272 | 237 | 211 | 180 | 201 |
| 1954 | 204 | 188 | 235 | 227 | 234 | 264 | 302 | 293 | 259 | 229 | 203 | 229 |
| 1955 | 242 | 233 | 267 | 269 | 270 | 315 | 364 | 347 | 312 | 274 | 237 | 278 |
| 1956 | 284 | 277 | 317 | 313 | 318 | 374 | 413 | 405 | 355 | 306 | 271 | 306 |
| 1957 | 315 | 301 | 356 | 348 | 355 | 422 | 465 | 467 | 404 | 347 | 305 | 336 |
| 1958 | 340 | 318 | 362 | 348 | 363 | 435 | 491 | 505 | 404 | 359 | 310 | 337 |
| 1959 | 360 | 342 | 406 | 396 | 420 | 472 | 548 | 559 | 463 | 407 | 362 | 405 |
| 1960 | 417 | 391 | 419 | 461 | 472 | 535 | 622 | 606 | 508 | 461 | 390 | 432 |
横向计算环比,如1949年2月相比1月增长多少、3月相比2月增长多少,以此类推,就是计算环比?纵向计算同比,如1950年1月相比1949年1月增长多少、1951年相比1950年1月增长多少?
# 环比横向/同比纵向
mom <- function(x) diff(x, lag = 1) / x[-length(x)] # month to month
# 格式化输出
format_mom <- function(x) formatC(mom(x), format = "f", digits = 4)library(formattable)
# 同比变化
air_passengers %>%
apply(., 2, format_mom) %>%
as.data.frame() %>%
formattable(., list(
Jan = color_tile("white", "pink"),
Feb = color_tile("white", "springgreen4"),
Mar = percent
))
library(DT)
datatable(air_passengers)7.2.11 合并多个数据框
将所有列都保留,以 full_join() 方式合并
df1 <- iris[1:10, c(1, 5)]
df2 <- iris[11:15, c(1, 2, 5)]
df3 <- iris[16:30, c(1, 3, 5)]
all_dfs <- list(df1, df2, df3)
# base
Reduce(function(x, y, ...) merge(x, y, ..., all = TRUE), all_dfs)## Sepal.Length Species Sepal.Width Petal.Length
## 1: 4.3 setosa 3.0 NA
## 2: 4.4 setosa NA NA
## 3: 4.6 setosa NA 1.0
## 4: 4.6 setosa NA 1.0
## 5: 4.7 setosa NA 1.6
## 6: 4.8 setosa 3.4 1.9
## 7: 4.8 setosa 3.0 1.9
## 8: 4.9 setosa NA NA
## 9: 4.9 setosa NA NA
## 10: 5.0 setosa NA 1.6
## 11: 5.0 setosa NA 1.6
## 12: 5.0 setosa NA 1.6
## 13: 5.0 setosa NA 1.6
## 14: 5.1 setosa NA 1.4
## 15: 5.1 setosa NA 1.5
## 16: 5.1 setosa NA 1.5
## 17: 5.1 setosa NA 1.7
## 18: 5.2 setosa NA 1.5
## 19: 5.2 setosa NA 1.4
## 20: 5.4 setosa 3.7 1.3
## 21: 5.4 setosa 3.7 1.7
## 22: 5.7 setosa NA 1.5
## 23: 5.7 setosa NA 1.7
## 24: 5.8 setosa 4.0 NA
## Sepal.Length Species Sepal.Width Petal.Length
# dplyr
Reduce(function(x, y, ...) dplyr::full_join(x, y, ...), all_dfs)## Warning in dplyr::full_join(x, y, ...): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 8 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## Sepal.Length Species Sepal.Width Petal.Length
## 1: 5.1 setosa NA 1.4
## 2: 5.1 setosa NA 1.5
## 3: 5.1 setosa NA 1.5
## 4: 5.1 setosa NA 1.7
## 5: 4.9 setosa NA NA
## 6: 4.7 setosa NA 1.6
## 7: 4.6 setosa NA 1.0
## 8: 5.0 setosa NA 1.6
## 9: 5.0 setosa NA 1.6
## 10: 5.4 setosa 3.7 1.3
## 11: 5.4 setosa 3.7 1.7
## 12: 4.6 setosa NA 1.0
## 13: 5.0 setosa NA 1.6
## 14: 5.0 setosa NA 1.6
## 15: 4.4 setosa NA NA
## 16: 4.9 setosa NA NA
## 17: 4.8 setosa 3.4 1.9
## 18: 4.8 setosa 3.0 1.9
## 19: 4.3 setosa 3.0 NA
## 20: 5.8 setosa 4.0 NA
## 21: 5.7 setosa NA 1.5
## 22: 5.7 setosa NA 1.7
## 23: 5.2 setosa NA 1.5
## 24: 5.2 setosa NA 1.4
## Sepal.Length Species Sepal.Width Petal.Length
合并完应该有30行,为啥只有24行?这是因为 merge() 函数对主键 key 相同的记录会合并,要想不合并,需要调用 rbindlist() 函数 https://d.cosx.org/d/421235
rbind() 列数相同的两个 data.frame 按行合并,cbind() 行数相同的两个 data.frame 按列合并,merge() 对行、列数没有要求
rbindlist(all_dfs, fill = TRUE)## Sepal.Length Species Sepal.Width Petal.Length
## 1: 5.1 setosa NA NA
## 2: 4.9 setosa NA NA
## 3: 4.7 setosa NA NA
## 4: 4.6 setosa NA NA
## 5: 5.0 setosa NA NA
## 6: 5.4 setosa NA NA
## 7: 4.6 setosa NA NA
## 8: 5.0 setosa NA NA
## 9: 4.4 setosa NA NA
## 10: 4.9 setosa NA NA
## 11: 5.4 setosa 3.7 NA
## 12: 4.8 setosa 3.4 NA
## 13: 4.8 setosa 3.0 NA
## 14: 4.3 setosa 3.0 NA
## 15: 5.8 setosa 4.0 NA
## 16: 5.7 setosa NA 1.5
## 17: 5.4 setosa NA 1.3
## 18: 5.1 setosa NA 1.4
## 19: 5.7 setosa NA 1.7
## 20: 5.1 setosa NA 1.5
## 21: 5.4 setosa NA 1.7
## 22: 5.1 setosa NA 1.5
## 23: 4.6 setosa NA 1.0
## 24: 5.1 setosa NA 1.7
## 25: 4.8 setosa NA 1.9
## 26: 5.0 setosa NA 1.6
## 27: 5.0 setosa NA 1.6
## 28: 5.2 setosa NA 1.5
## 29: 5.2 setosa NA 1.4
## 30: 4.7 setosa NA 1.6
## Sepal.Length Species Sepal.Width Petal.Length
# dplyr
dplyr::bind_rows(all_dfs)## Sepal.Length Species Sepal.Width Petal.Length
## 1: 5.1 setosa NA NA
## 2: 4.9 setosa NA NA
## 3: 4.7 setosa NA NA
## 4: 4.6 setosa NA NA
## 5: 5.0 setosa NA NA
## 6: 5.4 setosa NA NA
## 7: 4.6 setosa NA NA
## 8: 5.0 setosa NA NA
## 9: 4.4 setosa NA NA
## 10: 4.9 setosa NA NA
## 11: 5.4 setosa 3.7 NA
## 12: 4.8 setosa 3.4 NA
## 13: 4.8 setosa 3.0 NA
## 14: 4.3 setosa 3.0 NA
## 15: 5.8 setosa 4.0 NA
## 16: 5.7 setosa NA 1.5
## 17: 5.4 setosa NA 1.3
## 18: 5.1 setosa NA 1.4
## 19: 5.7 setosa NA 1.7
## 20: 5.1 setosa NA 1.5
## 21: 5.4 setosa NA 1.7
## 22: 5.1 setosa NA 1.5
## 23: 4.6 setosa NA 1.0
## 24: 5.1 setosa NA 1.7
## 25: 4.8 setosa NA 1.9
## 26: 5.0 setosa NA 1.6
## 27: 5.0 setosa NA 1.6
## 28: 5.2 setosa NA 1.5
## 29: 5.2 setosa NA 1.4
## 30: 4.7 setosa NA 1.6
## Sepal.Length Species Sepal.Width Petal.Length
7.2.12 分组聚合多个指标
https://stackoverflow.com/questions/24151602/calculate-multiple-aggregations-with-lapply-sd
# base
aggregate(
data = mtcars, cbind(mpg, hp) ~ cyl,
FUN = function(x) c(mean = mean(x), median = median(x))
)## cyl mpg.mean mpg.median hp.mean hp.median
## 1 4 26.66364 26.00000 82.63636 91.00000
## 2 6 19.74286 19.70000 122.28571 110.00000
## 3 8 15.10000 15.20000 209.21429 192.50000
# 数据一致性 https://d.cosx.org/d/420763-base-r
with(
aggregate(cbind(mpg, hp) ~ cyl, mtcars,
FUN = function(x) c(mean = mean(x), median = median(x))
),
cbind.data.frame(cyl, mpg, hp)
)## cyl mean median mean median
## 1 4 26.66364 26.0 82.63636 91.0
## 2 6 19.74286 19.7 122.28571 110.0
## 3 8 15.10000 15.2 209.21429 192.5
# data.table
mtcars[, as.list(unlist(lapply(.SD, function(x) {
list(
mean = mean(x),
median = median(x)
)
}))),
by = "cyl", .SDcols = c("mpg", "hp")
]## cyl mpg.mean mpg.median hp.mean hp.median
## 1: 6 19.74286 19.7 122.28571 110.0
## 2: 4 26.66364 26.0 82.63636 91.0
## 3: 8 15.10000 15.2 209.21429 192.5
# dplyr
mtcars |>
dplyr::group_by(cyl) |>
dplyr::summarise(
mean_mpg = mean(mpg), mean_hp = mean(hp),
median_mpg = mean(mpg), median_hp = mean(hp)
)## # A tibble: 3 × 5
## cyl mean_mpg mean_hp median_mpg median_hp
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 4 26.7 82.6 26.7 82.6
## 2 6 19.7 122. 19.7 122.
## 3 8 15.1 209. 15.1 209.
7.2.13 重命名多个列
tmp <- aggregate(
data = mtcars, cbind(mpg, hp) ~ cyl,
FUN = median
)
tmp <- as.data.table(tmp)
setnames(tmp, old = c("mpg", "hp"), new = c("median_mpg", "median_hp"))
tmp## cyl median_mpg median_hp
## 1: 4 26.0 91.0
## 2: 6 19.7 110.0
## 3: 8 15.2 192.5
7.2.14 对多个列依次排序
https://stackoverflow.com/questions/1296646/how-to-sort-a-dataframe-by-multiple-columns
# base
tmp[order(median_mpg, -median_hp), ]## cyl median_mpg median_hp
## 1: 8 15.2 192.5
## 2: 6 19.7 110.0
## 3: 4 26.0 91.0
# data.table
setorder(tmp, median_mpg, -median_hp)
# dplyr
dplyr::arrange(tmp, median_mpg, desc(median_hp))## cyl median_mpg median_hp
## 1: 8 15.2 192.5
## 2: 6 19.7 110.0
## 3: 4 26.0 91.0
7.2.15 重排多个列的位置
# https://stackoverflow.com/questions/19619666/change-column-position-of-data-table
setcolorder(tmp, c("median_mpg", setdiff(names(tmp), "median_mpg")))
tmp## median_mpg cyl median_hp
## 1: 15.2 8 192.5
## 2: 19.7 6 110.0
## 3: 26.0 4 91.0
# dplyr
dplyr::select(tmp, "median_mpg", setdiff(names(tmp), "median_mpg"))## median_mpg cyl median_hp
## 1: 15.2 8 192.5
## 2: 19.7 6 110.0
## 3: 26.0 4 91.0
7.2.16 整理回归结果
dat <- split(iris, iris$Species)
mod <- lapply(dat, function(x) lm(Petal.Length ~ Sepal.Length, x))
mod <- lapply(mod, function(x) coef(summary(x)))
mod <- Map(function(x, y) {
x <- as.data.frame(x)
x$Species <- y
x
}, mod, names(dat))
mod <- do.call(rbind, mod)
mod## Estimate Std. Error t value Pr(>|t|) Species
## setosa.(Intercept) 0.8030518 0.34387807 2.3352806 2.375647e-02 setosa
## setosa.Sepal.Length 0.1316317 0.06852690 1.9208760 6.069778e-02 setosa
## versicolor.(Intercept) 0.1851155 0.51421351 0.3599974 7.204283e-01 versicolor
## versicolor.Sepal.Length 0.6864698 0.08630708 7.9538056 2.586190e-10 versicolor
## virginica.(Intercept) 0.6104680 0.41710685 1.4635770 1.498279e-01 virginica
## virginica.Sepal.Length 0.7500808 0.06302606 11.9011203 6.297786e-16 virginica
# 管道操作
split(iris, iris$Species) %>%
lapply(., function(x) coef(summary(lm(Petal.Length ~ Sepal.Length, x)))) %>%
Map(function(x, y) {
x <- as.data.frame(x)
x$Species <- y
x
}, ., levels(iris$Species)) %>%
do.call(rbind, .)## Estimate Std. Error t value Pr(>|t|) Species
## setosa.(Intercept) 0.8030518 0.34387807 2.3352806 2.375647e-02 setosa
## setosa.Sepal.Length 0.1316317 0.06852690 1.9208760 6.069778e-02 setosa
## versicolor.(Intercept) 0.1851155 0.51421351 0.3599974 7.204283e-01 versicolor
## versicolor.Sepal.Length 0.6864698 0.08630708 7.9538056 2.586190e-10 versicolor
## virginica.(Intercept) 0.6104680 0.41710685 1.4635770 1.498279e-01 virginica
## virginica.Sepal.Length 0.7500808 0.06302606 11.9011203 6.297786e-16 virginica
# dplyr 操作,需要 dplyr >= 1.0.0
iris %>%
dplyr::group_by(Species) %>%
dplyr::summarise(broom::tidy(lm(Petal.Length ~ Sepal.Length)))## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 6 × 6
## # Groups: Species [3]
## Species term estimate std.error statistic p.value
## <fct> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 setosa (Intercept) 0.803 0.344 2.34 2.38e- 2
## 2 setosa Sepal.Length 0.132 0.0685 1.92 6.07e- 2
## 3 versicolor (Intercept) 0.185 0.514 0.360 7.20e- 1
## 4 versicolor Sepal.Length 0.686 0.0863 7.95 2.59e-10
## 5 virginica (Intercept) 0.610 0.417 1.46 1.50e- 1
## 6 virginica Sepal.Length 0.750 0.0630 11.9 6.30e-16
7.2.17 := 和 .()
mtcars[, mpg_rate := round(mpg / sum(mpg) * 100, digits = 2), by = .(cyl, vs, am)]
mtcars[, .(mpg_rate, mpg, cyl, vs, am)]## mpg_rate mpg cyl vs am
## 1: 34.04 21.0 6 0 1
## 2: 34.04 21.0 6 0 1
## 3: 11.48 22.8 4 1 1
## 4: 27.97 21.4 6 1 0
## 5: 10.35 18.7 8 0 0
## 6: 23.66 18.1 6 1 0
## 7: 7.92 14.3 8 0 0
## 8: 35.52 24.4 4 1 0
## 9: 33.19 22.8 4 1 0
## 10: 25.10 19.2 6 1 0
## 11: 23.27 17.8 6 1 0
## 12: 9.08 16.4 8 0 0
## 13: 9.58 17.3 8 0 0
## 14: 8.42 15.2 8 0 0
## 15: 5.76 10.4 8 0 0
## 16: 5.76 10.4 8 0 0
## 17: 8.14 14.7 8 0 0
## 18: 16.31 32.4 4 1 1
## 19: 15.31 30.4 4 1 1
## 20: 17.07 33.9 4 1 1
## 21: 31.30 21.5 4 1 0
## 22: 8.58 15.5 8 0 0
## 23: 8.42 15.2 8 0 0
## 24: 7.36 13.3 8 0 0
## 25: 10.63 19.2 8 0 0
## 26: 13.75 27.3 4 1 1
## 27: 100.00 26.0 4 0 1
## 28: 15.31 30.4 4 1 1
## 29: 51.30 15.8 8 0 1
## 30: 31.93 19.7 6 0 1
## 31: 48.70 15.0 8 0 1
## 32: 10.78 21.4 4 1 1
## mpg_rate mpg cyl vs am
mtcars[, .(mpg_rate = round(mpg / sum(mpg) * 100, digits = 2)), by = .(cyl, vs, am)]## cyl vs am mpg_rate
## 1: 6 0 1 34.04
## 2: 6 0 1 34.04
## 3: 6 0 1 31.93
## 4: 4 1 1 11.48
## 5: 4 1 1 16.31
## 6: 4 1 1 15.31
## 7: 4 1 1 17.07
## 8: 4 1 1 13.75
## 9: 4 1 1 15.31
## 10: 4 1 1 10.78
## 11: 6 1 0 27.97
## 12: 6 1 0 23.66
## 13: 6 1 0 25.10
## 14: 6 1 0 23.27
## 15: 8 0 0 10.35
## 16: 8 0 0 7.92
## 17: 8 0 0 9.08
## 18: 8 0 0 9.58
## 19: 8 0 0 8.42
## 20: 8 0 0 5.76
## 21: 8 0 0 5.76
## 22: 8 0 0 8.14
## 23: 8 0 0 8.58
## 24: 8 0 0 8.42
## 25: 8 0 0 7.36
## 26: 8 0 0 10.63
## 27: 4 1 0 35.52
## 28: 4 1 0 33.19
## 29: 4 1 0 31.30
## 30: 4 0 1 100.00
## 31: 8 0 1 51.30
## 32: 8 0 1 48.70
## cyl vs am mpg_rate
7.2.18 去掉含有缺失值的记录
airquality[complete.cases(airquality), ] |> head()## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 7 23 299 8.6 65 5 7
## 8 19 99 13.8 59 5 8
# 或着
airquality[!apply(airquality, 1, anyNA), ] |> head()## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 7 23 299 8.6 65 5 7
## 8 19 99 13.8 59 5 8
7.2.19 集合操作
match 和 %in%
https://d.cosx.org/d/421314
`%nin%` <- Negate("%in%")
# `%in%` <- function(x, table) match(x, table, nomatch = 0) > 0 # %in% 函数的定义
x <- letters[1:5]
y <- letters[3:8]
x %in% y## [1] FALSE FALSE TRUE TRUE TRUE
x %nin% y## [1] TRUE TRUE FALSE FALSE FALSE
返回一个逻辑向量,x 中的元素匹配到了就返回 TRUE,否则 FALSE, %nin% 是 %in% 的取反效果
match(x, y)## [1] NA NA 1 2 3
x 在 y 中的匹配情况,匹配到了,就返回在 y 中匹配的位置,没有匹配到就返回 NA
setdiff(x, y)## [1] "a" "b"
intersect(x, y)## [1] "c" "d" "e"
union(x, y)## [1] "a" "b" "c" "d" "e" "f" "g" "h"
7.2.20 对数值向量按既定分组计数
此数据处理过程陆续使用了 transform()、cut() 和 aggregate() 三个函数
# 对数值向量按既定分组计数
dat <- data.frame(y = 1:12)
dat <- transform(dat, x = cut(y, breaks = c(0, 6, 9, 15)))
dat <- aggregate(data = dat, y ~ x, FUN = length)ggplot(data = dat, aes(x = x, y = y)) +
geom_col()
data.frame(y = 1:12) %>%
transform(x = cut(y, breaks = c(0, 6, 9, 15))) %>%
aggregate(data = ., y ~ x, FUN = length) %>%
ggplot(data = ., aes(x = x, y = y)) +
geom_col()对数值向量按分位数分组计数
dat <- data.frame(y = 1:12)
dat <- transform(dat, x = cut(
x = y,
breaks = quantile(y, prob = seq(0, 1, 0.25), na.rm = TRUE)
))
# dat <- transform(dat, x = cut(
# x = y,
# breaks = quantile(y, prob = seq(0, 1, 0.25)),
# include.lowest = T
# ))
dat1 <- aggregate(data = dat, y ~ x, FUN = length)ggplot(data = dat1, aes(x = x, y = y)) +
geom_col()7.2.21 分组排序
按变量 a 分组计算,之后按变量 b 降序排列
dat <- aggregate(data = iris, cbind(Sepal.Width, Sepal.Length) ~ Species, FUN = mean)
# 按 Species 降序排列
dat[order(dat$Species, decreasing = T), ]## Species Sepal.Width Sepal.Length
## 3 virginica 2.974 6.588
## 2 versicolor 2.770 5.936
## 1 setosa 3.428 5.006
7.2.22 分组获取 Top 值
分组按既定规律取数,比如按 Species 分组取 Top 6
# 分组取前6个
do.call("rbind.data.frame", lapply(base::split(x = iris, ~Species), head))# 分组取 Top 6
do.call(rbind, lapply(split(iris, iris$Species),
FUN = function(x) head(x[order(x$Sepal.Length, decreasing = T), ], 6)
))## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1: 5.8 4.0 1.2 0.2 setosa
## 2: 5.7 4.4 1.5 0.4 setosa
## 3: 5.7 3.8 1.7 0.3 setosa
## 4: 5.5 4.2 1.4 0.2 setosa
## 5: 5.5 3.5 1.3 0.2 setosa
## 6: 5.4 3.9 1.7 0.4 setosa
## 7: 7.0 3.2 4.7 1.4 versicolor
## 8: 6.9 3.1 4.9 1.5 versicolor
## 9: 6.8 2.8 4.8 1.4 versicolor
## 10: 6.7 3.1 4.4 1.4 versicolor
## 11: 6.7 3.0 5.0 1.7 versicolor
## 12: 6.7 3.1 4.7 1.5 versicolor
## 13: 7.9 3.8 6.4 2.0 virginica
## 14: 7.7 3.8 6.7 2.2 virginica
## 15: 7.7 2.6 6.9 2.3 virginica
## 16: 7.7 2.8 6.7 2.0 virginica
## 17: 7.7 3.0 6.1 2.3 virginica
## 18: 7.6 3.0 6.6 2.1 virginica
7.2.23 分组抽样
# 分组抽样
do.call(rbind, lapply(split(iris, iris$Species),
FUN = function(x) x[sample(1:nrow(x), size = 6), ]
))## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1: 5.1 3.3 1.7 0.5 setosa
## 2: 5.2 3.4 1.4 0.2 setosa
## 3: 4.6 3.6 1.0 0.2 setosa
## 4: 4.9 3.1 1.5 0.2 setosa
## 5: 5.7 3.8 1.7 0.3 setosa
## 6: 5.4 3.7 1.5 0.2 setosa
## 7: 5.7 2.9 4.2 1.3 versicolor
## 8: 6.3 2.5 4.9 1.5 versicolor
## 9: 6.1 3.0 4.6 1.4 versicolor
## 10: 5.7 2.6 3.5 1.0 versicolor
## 11: 6.6 3.0 4.4 1.4 versicolor
## 12: 5.5 2.4 3.7 1.0 versicolor
## 13: 6.7 3.1 5.6 2.4 virginica
## 14: 7.2 3.2 6.0 1.8 virginica
## 15: 6.1 3.0 4.9 1.8 virginica
## 16: 6.3 2.7 4.9 1.8 virginica
## 17: 5.8 2.7 5.1 1.9 virginica
## 18: 6.0 3.0 4.8 1.8 virginica
7.2.24 分组计算分位数
# 分组计算分位数,如何分组呢
do.call(rbind, lapply(iris[, sapply(iris, class) == "numeric"], quantile))## 0% 25% 50% 75% 100%
## Sepal.Length 1 1 1 1 1
## Sepal.Width 1 1 1 1 1
## Petal.Length 1 1 1 1 1
## Petal.Width 1 1 1 1 1
## Species 0 0 0 0 0
aggregate(data = iris, cbind(Sepal.Length, Sepal.Width) ~ Species, FUN = quantile)## Species Sepal.Length.0% Sepal.Length.25% Sepal.Length.50% Sepal.Length.75%
## 1 setosa 4.300 4.800 5.000 5.200
## 2 versicolor 4.900 5.600 5.900 6.300
## 3 virginica 4.900 6.225 6.500 6.900
## Sepal.Length.100% Sepal.Width.0% Sepal.Width.25% Sepal.Width.50%
## 1 5.800 2.300 3.200 3.400
## 2 7.000 2.000 2.525 2.800
## 3 7.900 2.200 2.800 3.000
## Sepal.Width.75% Sepal.Width.100%
## 1 3.675 4.400
## 2 3.000 3.400
## 3 3.175 3.800
# 对 Sepal.Length 按 Species 分组计算分位数
do.call("rbind", tapply(iris$Sepal.Length, iris$Species, quantile))## 0% 25% 50% 75% 100%
## setosa 4.3 4.800 5.0 5.2 5.8
## versicolor 4.9 5.600 5.9 6.3 7.0
## virginica 4.9 6.225 6.5 6.9 7.9
# 分组取平均 mean /中位数 median
aggregate(data = iris, . ~ Species, FUN = mean)## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 5.006 3.428 1.462 0.246
## 2 versicolor 5.936 2.770 4.260 1.326
## 3 virginica 6.588 2.974 5.552 2.026
7.2.25 计算日粒度的 DoD/WoW/MoM/YoY
截止写作时间,data.table 提供的滑动窗口聚合统计函数 frollmean()、frollsum() 和 frollapply() 还处于实验阶段。 shift 提供漂移功能,向前前置 lead 或向后延迟 lag。
dat <- data.frame(dt = seq(
from = as.Date("2021-01-01"),
to = Sys.Date(), by = "1 day"
))
dat <- within(dat, {
uv = round(1000 * runif(n = nrow(dat)))
uv_dod_d = ifelse(nrow(dat) <= 1, NA, c(NA, diff(uv, lag = 1)))
uv_wow_d = ifelse(nrow(dat) <= 7, NA, c(rep(NA, 7), diff(uv, lag = 7)))
uv_mom_d = ifelse(nrow(dat) <= 30, NA, c(rep(NA, 30), diff(uv, lag = 30)))
uv_yoy_d = ifelse(nrow(dat) <= 365, NA, c(rep(NA, 365), diff(uv, lag = 365)))
})