问题
I have a chunk of sentences and I want to build the undirected edge list of word co-occurrence and see the frequency of every edge. I took a look at the tm
package but didn't find similar functions. Is there some package/script I can use? Thanks a lot!
Note: A word doesn't co-occur with itself. A word which appears twice or more co-occurs with other words for only once in the same sentence.
DF:
sentence_id text
1 a b c d e
2 a b b e
3 b c d
4 a e
5 a
6 a a a
OUTPUT
word1 word2 freq
a b 2
a c 1
a d 1
a e 3
b c 2
b d 2
b e 2
c d 2
c e 1
d e 1
回答1:
It's convoluted so there's got to be a better approach:
dat <- read.csv(text="sentence_id, text
1, a b c d e
2, a b b e
3, b c d
4, a e", header=TRUE)
library(qdapTools); library(tidyr)
x <- t(mtabulate(with(dat, by(text, sentence_id, bag_o_words))) > 0)
out <- x %*% t(x)
out[upper.tri(out, diag=TRUE)] <- NA
out2 <- matrix2df(out, "word1") %>%
gather(word2, freq, -word1) %>%
na.omit()
rownames(out2) <- NULL
out2
## word1 word2 freq
## 1 b a 2
## 2 c a 1
## 3 d a 1
## 4 e a 3
## 5 c b 2
## 6 d b 2
## 7 e b 2
## 8 d c 2
## 9 e c 1
## 10 e d 1
Base only solution
out <- lapply(with(dat, split(text, sentence_id)), function(x) {
strsplit(gsub("^\\s+|\\s+$", "", as.character(x)), "\\s+")[[1]]
})
nms <- sort(unique(unlist(out)))
out2 <- lapply(out, function(x) {
as.data.frame(table(x), stringsAsFactors = FALSE)
})
dat2 <- data.frame(x = nms)
for(i in seq_along(out2)) {
m <- merge(dat2, out2[[i]], all.x = TRUE)
names(m)[i + 1] <- dat[["sentence_id"]][i]
dat2 <- m
}
dat2[is.na(dat2)] <- 0
x <- as.matrix(dat2[, -1]) > 0
out3 <- x %*% t(x)
out3[upper.tri(out3, diag=TRUE)] <- NA
dimnames(out3) <- list(dat2[[1]], dat2[[1]])
out4 <- na.omit(data.frame(
word1 = rep(rownames(out3), ncol(out3)),
word2 = rep(colnames(out3), each = nrow(out3)),
freq = c(unlist(out3)),
stringsAsFactors = FALSE)
)
row.names(out4) <- NULL
out4
回答2:
This is very closely related to @TylerRinker's answer, but using different tools.
library(splitstackshape)
library(reshape2)
temp <- crossprod(
as.matrix(
cSplit_e(d, "text", " ", type = "character",
fill = 0, drop = TRUE)[-1]))
temp[upper.tri(temp, diag = TRUE)] <- NA
melt(temp, na.rm = TRUE)
# Var1 Var2 value
# 2 text_b text_a 2
# 3 text_c text_a 1
# 4 text_d text_a 1
# 5 text_e text_a 3
# 8 text_c text_b 2
# 9 text_d text_b 2
# 10 text_e text_b 2
# 14 text_d text_c 2
# 15 text_e text_c 1
# 20 text_e text_d 1
The "text_" parts of "Var1" and "Var2" can be stripped easily with sub
or gsub
.
回答3:
Here's a base R way:
d <- read.table(text='sentence_id text
1 "a b c d e"
2 "a b b e"
3 "b c d"
4 "a e"', header=TRUE, as.is=TRUE)
result.vec <- table(unlist(lapply(d$text, function(text) {
pairs <- combn(unique(scan(text=text, what='', sep=' ')), m=2)
interaction(pairs[1,], pairs[2,])
})))
# a.b b.b c.b d.b a.c b.c c.c d.c a.d b.d c.d d.d a.e b.e c.e d.e
# 2 0 0 0 1 2 0 0 1 2 2 0 3 2 1 1
result <- subset(data.frame(do.call(rbind, strsplit(names(result.vec), '\\.')), freq=as.vector(result.vec)), freq > 0)
with(result, result[order(X1, X2),])
# X1 X2 freq
# 1 a b 2
# 5 a c 1
# 9 a d 1
# 13 a e 3
# 6 b c 2
# 10 b d 2
# 14 b e 2
# 11 c d 2
# 15 c e 1
# 16 d e 1
来源:https://stackoverflow.com/questions/27153320/build-word-co-occurence-edge-list-in-r