6.3 数据重塑
重复测量数据的变形(Reshape Grouped Data):将宽格式(wide)的数据框变为长格式(long),反之亦可。reshape 在拆分变量名时还支持正则表达式(见 split 参数)。
# Inspect the built-in Indometh data set. It is in long format: one row per
# (Subject, time) pair with the measured concentration in `conc`.
str(Indometh)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame': 66 obs. of 3 variables:
## $ Subject: Ord.factor w/ 6 levels "1"<"4"<"2"<"5"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : num 0.25 0.5 0.75 1 1.25 2 3 4 5 6 ...
## $ conc : num 1.5 0.94 0.78 0.48 0.37 0.19 0.12 0.11 0.08 0.07 ...
## - attr(*, "formula")=Class 'formula' language conc ~ time | Subject
## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
## - attr(*, "labels")=List of 2
## ..$ x: chr "Time since drug administration"
## ..$ y: chr "Indomethacin concentration"
## - attr(*, "units")=List of 2
## ..$ x: chr "(hr)"
## ..$ y: chr "(mcg/ml)"
summary(Indometh)
## Subject time conc
## 1:11 Min. :0.250 Min. :0.0500
## 4:11 1st Qu.:0.750 1st Qu.:0.1100
## 2:11 Median :2.000 Median :0.3400
## 5:11 Mean :2.886 Mean :0.5918
## 6:11 3rd Qu.:5.000 3rd Qu.:0.8325
## 3:11 Max. :8.000 Max. :2.7200
# Long to wide: one row per Subject and one `conc.<time>` column per distinct
# value of `time`.
wide <- reshape(Indometh,
  v.names = "conc", idvar = "Subject",
  timevar = "time", direction = "wide"
)
# Print only the first six columns to keep the output narrow.
wide[, 1:6]
## Subject conc.0.25 conc.0.5 conc.0.75 conc.1 conc.1.25
## 1 1 1.50 0.94 0.78 0.48 0.37
## 12 2 2.03 1.63 0.71 0.70 0.64
## 23 3 2.72 1.49 1.16 0.80 0.80
## 34 4 1.85 1.39 1.02 0.89 0.59
## 45 5 2.05 1.04 0.81 0.39 0.30
## ....
# Wide to long: `wide` came from a previous reshape() call, which records the
# reshaping specification on the result, so only the direction is needed here.
reshape(wide, direction = "long")
## Subject time conc
## 1.0.25 1 0.25 1.50
## 2.0.25 2 0.25 2.03
## 3.0.25 3 0.25 2.72
## 4.0.25 4 0.25 1.85
## 5.0.25 5 0.25 2.05
## ....
宽格式变成长格式的讨论见 https://stackoverflow.com/questions/2185252 ,长格式变成宽格式的讨论见 https://stackoverflow.com/questions/5890584/ 。下面是一个长格式变宽格式的例子:
# Fix the RNG seed so the simulated `value` column is reproducible.
set.seed(45)
# Long-format toy data: two fruits, each measured at four numbered occasions.
dat <- data.frame(
  name = rep(c("Orange", "Apple"), each = 4),
  numbers = rep(1:4, 2),
  value = rnorm(8)
)
dat
## name numbers value
## 1 Orange 1 0.3407997
## 2 Orange 2 -0.7033403
## 3 Orange 3 -0.3795377
## 4 Orange 4 -0.7460474
## 5 Apple 1 -0.8981073
## 6 Apple 2 -0.3347941
## 7 Apple 3 -0.5013782
## 8 Apple 4 -0.1745357
# Long to wide: one row per `name`, one `value.<numbers>` column per occasion.
reshape(dat, idvar = "name", timevar = "numbers", direction = "wide")
## name value.1 value.2 value.3 value.4
## 1 Orange 0.3407997 -0.7033403 -0.3795377 -0.7460474
## 5 Apple -0.8981073 -0.3347941 -0.5013782 -0.1745357
## times need not be numeric
# `visit` is wrapped in I() so it is stored as-is; `x` and `y` have length 4
# and are recycled across the 8 rows, which is why rows 1-4 repeat in rows 5-8.
df <- data.frame(
  id = rep(1:4, rep(2, 4)),
  visit = I(rep(c("Before", "After"), 4)),
  x = rnorm(4), y = runif(4)
)
df
## id visit x y
## 1 1 Before 1.8090374 0.89106978
## 2 1 After -0.2301050 0.06920426
## 3 2 Before -1.1304182 0.94623103
## 4 2 After 0.2159889 0.74850150
## 5 3 Before 1.8090374 0.89106978
## 6 3 After -0.2301050 0.06920426
## 7 4 Before -1.1304182 0.94623103
## 8 4 After 0.2159889 0.74850150
# Long to wide using the character-valued `visit` as the time variable; both
# `x` and `y` are spread into `<name>.<visit>` columns.
reshape(df, timevar = "visit", idvar = "id", direction = "wide")
## id x.Before y.Before x.After y.After
## 1 1 1.809037 0.8910698 -0.2301050 0.06920426
## 3 2 -1.130418 0.9462310 0.2159889 0.74850150
## 5 3 1.809037 0.8910698 -0.2301050 0.06920426
## 7 4 -1.130418 0.9462310 0.2159889 0.74850150
## warns that y is really varying
# Only `x` is declared as time-varying here, so `y` is treated as constant
# within each id and reshape() warns because it is not.
reshape(df, timevar = "visit", idvar = "id", direction = "wide", v.names = "x")
## Warning in reshapeWide(data, idvar = idvar, timevar = timevar, varying =
## varying, : some constant variables (y) are really varying
## id y x.Before x.After
## 1 1 0.8910698 1.809037 -0.2301050
## 3 2 0.9462310 -1.130418 0.2159889
## 5 3 0.8910698 1.809037 -0.2301050
## 7 4 0.9462310 -1.130418 0.2159889
更加复杂的例子:gambia 数据集。重塑的效果是将个体水平的长格式数据汇总为村庄水平的宽格式数据。
# data(gambia, package = "geoR")
# Download the dataset online instead of loading it from the geoR package.
# The URL is assembled from the fetch endpoint and the file identifier, and
# the first line of the file is read as column names.
gambia <- read.table(
  file = paste("http://www.leg.ufpr.br/lib/exe/fetch.php",
    "pessoais:paulojus:mbgbook:datasets:gambia.txt",
    sep = "/"
  ),
  header = TRUE
)
head(gambia)
# Building a "village-level" data frame
# Key each row by the values in columns 1 and 2 of `gambia`
# (presumably the village x/y coordinates -- confirm against the data set).
ind <- paste0("x", gambia[, 1], "y", gambia[, 2])
# Keep the first row of every key, restricted to columns 1:2 and 7:8.
village <- gambia[!duplicated(ind), c(1:2, 7:8)]
# Mean of `pos` per key. tapply() returns groups in factor-level order, so the
# levels are fixed to first-appearance order to line up with the rows of
# `village` (a plain character key would be sorted alphabetically instead).
village$prev <- as.vector(
  tapply(gambia$pos, factor(ind, levels = unique(ind)), mean)
)
head(village)