我想统计 data.frame 中每种行组合(各列取值的组合)出现的次数。
数据看起来像这样
    9 10 11 12
1   1  1  1  1
2   0  0  0  0
3   0  0  0  0
4   1  1  1  1
5   1  1  1  1
6   0  0  0  0
7   1  0  0  1
8   1  0  0  1
9   1  1  1  1
10  1  1  1  1
我想要的输出很简单
comb     n
1 1 1 1  5
0 0 0 0  3
1 0 0 1  2
你知道有什么简单的函数可以做到这一点吗?
谢谢
# Example data: ten rows of 0/1 values in four columns named "9" to "12".
dt <- structure(
  list(
    `9`  = c(1, 0, 0, 1, 1, 0, 1, 1, 1, 1),
    `10` = c(1, 0, 0, 1, 1, 0, 0, 0, 1, 1),
    `11` = c(1, 0, 0, 1, 1, 0, 0, 0, 1, 1),
    `12` = c(1, 0, 0, 1, 1, 0, 1, 1, 1, 1)
  ),
  class = "data.frame",
  row.names = c(NA, -10L)
)
Cath.. 11
使用 aggregate 的基础 R 解决方案:
# Base R: count how often each distinct row of `dt` occurs. The left-hand
# side only supplies one value per row, so length() gives the group size.
aggregate(
  seq(nrow(dt)) ~ .,
  data = dt,
  FUN = length
)
#   9 10 11 12 seq(nrow(dt))
# 1 0  0  0  0             3
# 2 1  0  0  1             2
# 3 1  1  1  1             5
编辑
要使colnames更符合您的输出,您可以执行以下操作:
# Same count as a plain aggregate() call, with the result columns renamed
# to c, o, m, b, n.
setNames(
  aggregate(seq(nrow(dt)) ~ ., data = dt, FUN = length),
  c("c", "o", "m", "b", "n")
)
#   c o m b n
# 1 0 0 0 0 3
# 2 1 0 0 1 2
# 3 1 1 1 1 5
或者,更短:
# Shorter variant: cbind(n = ...) names the count column "n" directly.
# seq_len() is used instead of 1:nrow(dt), which would yield c(1, 0) for a
# data frame with zero rows.
aggregate(cbind(n = seq_len(nrow(dt))) ~ ., data = dt, FUN = length)
#   9 10 11 12 n
# 1 0  0  0  0 3
# 2 1  0  0  1 2
# 3 1  1  1  1 5
akrun.. 10
我们可以使用 data.table 或 dplyr,这两者都非常高效。先将 'data.frame' 转换为 'data.table'(setDT(dt)),按 'dt' 的所有列(names(dt))分组,再把每组的行数(.N)作为 'Count' 列。
library(data.table)

# Convert `dt` to a data.table, group by every column, and store the
# number of rows in each group (.N) as `Count`.
setDT(dt)[, .(Count = .N), by = names(dt)]
或者我们可以用 dplyr 采用类似的方法。
library(dplyr)

# Make the column names syntactic ("9" becomes "X9", and so on).
names(dt) <- make.names(names(dt))

# Group by every column and count the rows in each group.
# group_by_(.dots = ) is deprecated; across(all_of()) is the supported way
# to group by a character vector of column names.
dt %>%
  group_by(across(all_of(names(dt)))) %>%
  summarise(count = n())
如果有人想看一些基准测试数据(同时也是为了佐证我之前所说的“高效”):
# Benchmark setup: one million rows of random 0/1 values in six columns.
set.seed(24)
df1 <- as.data.frame(matrix(sample(0:1, 1e6 * 6, replace = TRUE), ncol = 6))

# data.table: group by every column and count the rows per group.
akrunDT <- function() {
  as.data.table(df1)[, list(Count = .N), names(df1)]
}

# dplyr: group by every column and count the rows per group.
# across(all_of()) replaces the deprecated group_by_(.dots = ).
akrunDplyr <- function() {
  df1 %>%
    group_by(across(all_of(names(df1)))) %>%
    summarise(count = n())
}

# Base R aggregate(): length() of a per-row index gives the group size.
# seq_len() avoids the c(1, 0) that 1:nrow() yields for zero rows.
cathG <- function() {
  aggregate(cbind(n = seq_len(nrow(df1))) ~ ., df1, length)
}

# Base R: paste the columns into one key per row and tabulate the keys.
docendoD <- function() {
  as.data.frame(table(comb = do.call(paste, df1)))
}

# Base R: build a comma-separated key per row with apply() and tabulate.
deena <- function() {
  table(apply(df1, 1, paste, collapse = ","))
}
下面是 microbenchmark 的结果:
library(microbenchmark)

# Time the five approaches, 20 runs each, reported relative to the fastest.
microbenchmark(
  akrunDT(),
  akrunDplyr(),
  cathG(),
  docendoD(),
  deena(),
  unit = 'relative',
  times = 20L
)
# Unit: relative
#          expr       min        lq      mean   median        uq        max neval cld
#     akrunDT()  1.000000  1.000000  1.000000  1.00000  1.000000  1.0000000    20 a
#  akrunDplyr()  1.512354  1.523357  1.307724  1.45907  1.365928  0.7539773    20 a
#       cathG() 43.893946 43.592062 37.008677 42.10787 38.556726 17.9834245    20 c
#    docendoD() 18.778534 19.843255 16.560827 18.85707 17.296812  8.2688541    20 b
#       deena() 90.391417 89.449547 74.607662 85.16295 77.316143 34.6962954    20 d
talat.. 7
您可以仅使用基本R尝试以下方法:
# Base R alternative: paste each row into a single key and tabulate the keys.
as.data.frame(
  table(comb = do.call(paste, dt))
)
#      comb Freq
# 1 0 0 0 0    3
# 2 1 0 0 1    2
# 3 1 1 1 1    5
小智.. 5
也许可以这样: table(apply(dt, 1, paste, collapse = ","))
使用 aggregate 的基础 R 解决方案:
# Base R: count how often each distinct row of `dt` occurs. The left-hand
# side only supplies one value per row, so length() gives the group size.
aggregate(
  seq(nrow(dt)) ~ .,
  data = dt,
  FUN = length
)
#   9 10 11 12 seq(nrow(dt))
# 1 0  0  0  0             3
# 2 1  0  0  1             2
# 3 1  1  1  1             5
编辑
要使colnames更符合您的输出,您可以执行以下操作:
# Same count as a plain aggregate() call, with the result columns renamed
# to c, o, m, b, n.
setNames(
  aggregate(seq(nrow(dt)) ~ ., data = dt, FUN = length),
  c("c", "o", "m", "b", "n")
)
#   c o m b n
# 1 0 0 0 0 3
# 2 1 0 0 1 2
# 3 1 1 1 1 5
或者,更短:
# Shorter variant: cbind(n = ...) names the count column "n" directly.
# seq_len() is used instead of 1:nrow(dt), which would yield c(1, 0) for a
# data frame with zero rows.
aggregate(cbind(n = seq_len(nrow(dt))) ~ ., data = dt, FUN = length)
#   9 10 11 12 n
# 1 0  0  0  0 3
# 2 1  0  0  1 2
# 3 1  1  1  1 5
我们可以使用 data.table 或 dplyr,这两者都非常高效。先将 'data.frame' 转换为 'data.table'(setDT(dt)),按 'dt' 的所有列(names(dt))分组,再把每组的行数(.N)作为 'Count' 列。
library(data.table)

# Convert `dt` to a data.table, group by every column, and store the
# number of rows in each group (.N) as `Count`.
setDT(dt)[, .(Count = .N), by = names(dt)]
或者我们可以用 dplyr 采用类似的方法。
library(dplyr)

# Make the column names syntactic ("9" becomes "X9", and so on).
names(dt) <- make.names(names(dt))

# Group by every column and count the rows in each group.
# group_by_(.dots = ) is deprecated; across(all_of()) is the supported way
# to group by a character vector of column names.
dt %>%
  group_by(across(all_of(names(dt)))) %>%
  summarise(count = n())
如果有人想看一些基准测试数据(同时也是为了佐证我之前所说的“高效”):
# Benchmark setup: one million rows of random 0/1 values in six columns.
set.seed(24)
df1 <- as.data.frame(matrix(sample(0:1, 1e6 * 6, replace = TRUE), ncol = 6))

# data.table: group by every column and count the rows per group.
akrunDT <- function() {
  as.data.table(df1)[, list(Count = .N), names(df1)]
}

# dplyr: group by every column and count the rows per group.
# across(all_of()) replaces the deprecated group_by_(.dots = ).
akrunDplyr <- function() {
  df1 %>%
    group_by(across(all_of(names(df1)))) %>%
    summarise(count = n())
}

# Base R aggregate(): length() of a per-row index gives the group size.
# seq_len() avoids the c(1, 0) that 1:nrow() yields for zero rows.
cathG <- function() {
  aggregate(cbind(n = seq_len(nrow(df1))) ~ ., df1, length)
}

# Base R: paste the columns into one key per row and tabulate the keys.
docendoD <- function() {
  as.data.frame(table(comb = do.call(paste, df1)))
}

# Base R: build a comma-separated key per row with apply() and tabulate.
deena <- function() {
  table(apply(df1, 1, paste, collapse = ","))
}
下面是 microbenchmark 的结果:
library(microbenchmark)

# Time the five approaches, 20 runs each, reported relative to the fastest.
microbenchmark(
  akrunDT(),
  akrunDplyr(),
  cathG(),
  docendoD(),
  deena(),
  unit = 'relative',
  times = 20L
)
# Unit: relative
#          expr       min        lq      mean   median        uq        max neval cld
#     akrunDT()  1.000000  1.000000  1.000000  1.00000  1.000000  1.0000000    20 a
#  akrunDplyr()  1.512354  1.523357  1.307724  1.45907  1.365928  0.7539773    20 a
#       cathG() 43.893946 43.592062 37.008677 42.10787 38.556726 17.9834245    20 c
#    docendoD() 18.778534 19.843255 16.560827 18.85707 17.296812  8.2688541    20 b
#       deena() 90.391417 89.449547 74.607662 85.16295 77.316143 34.6962954    20 d
您可以仅使用基本R尝试以下方法:
# Base R alternative: paste each row into a single key and tabulate the keys.
as.data.frame(
  table(comb = do.call(paste, dt))
)
#      comb Freq
# 1 0 0 0 0    3
# 2 1 0 0 1    2
# 3 1 1 1 1    5
也许可以这样: table(apply(dt, 1, paste, collapse = ","))