I'm a rookie in R and currently working with collaboration data in the form of an edge list with 32 columns and around 200.000 rows. I want to create a (co-)occurrence matr
Here is a way using the dplyr and tidyr packages. The idea is to build a data frame holding the row-wise occurrence count of each country and then join it to itself.
library(dplyr)

# Create a sample data frame.
df <- data.frame(
  ID = c(1, 2, 3, 4),
  V1 = c("England", "England", "China", "England"),
  V2 = c("Greece", "England", "Greece", "England"),
  V32 = c("USA", "China", "Greece", "England"),
  stringsAsFactors = FALSE
)

# Count how often each country occurs within every row (ID).
# pivot_longer() replaces the superseded gather(); count(ID, country) returns
# an ungrouped tibble, so no stray grouping leaks into the join below.
row_occurance <-
  df %>%
  tidyr::pivot_longer(-ID, names_to = "identifier", values_to = "country") %>%
  count(ID, country)

row_occurance %>%
  # Join row_occurance to itself by ID to enumerate every pair of countries
  # that share a row (this includes each country paired with itself).
  # The many-to-many match is intentional; dplyr >= 1.1 warns about it unless
  # relationship = "many-to-many" is passed.
  left_join(row_occurance, by = "ID") %>%
  # Take the larger of the two per-row counts; this handles a country name
  # being repeated within the same row.
  mutate(Occurance = pmax(n.x, n.y)) %>%
  # Group by the pair of countries.
  group_by(country.x, country.y) %>%
  # Sum the occurrences of each pair over all rows. Keep the result grouped by
  # country.x (as the implicit default does) but state it explicitly so that
  # summarise() does not emit a regrouping message.
  summarise(Occurance = sum(Occurance), .groups = "drop_last") %>%
  # Pivot to matrix layout; pivot_wider() replaces the superseded spread().
  # names_sort = TRUE keeps the columns in sorted order, as spread() did.
  tidyr::pivot_wider(
    names_from = country.y,
    values_from = Occurance,
    values_fill = 0,
    names_sort = TRUE
  )
# # A tibble: 4 x 5
# # Groups: country.x [4]
# country.x China England Greece USA
# <chr> <dbl> <dbl> <dbl> <dbl>
# China 2 2 2 0
# England 2 6 1 1
# Greece 2 1 3 1
# USA 0 1 1 1
An option using base::table:
# Sample data frame.
df <- data.frame(
  ID = c(1, 2, 3, 4),
  V1 = c("England", "England", "China", "England"),
  V2 = c("Greece", "England", "Greece", "England"),
  V3 = c("USA", "China", "Greece", "England"),
  stringsAsFactors = FALSE
)

# All country values, and the sorted set of distinct countries. The set is
# used as the shared factor levels below so that the table is always square
# and a country that never pairs with a *different* country still gets its
# row/column (with only the diagonal filled).
all_vals <- as.character(unlist(df[-1L], use.names = FALSE))
countries <- sort(unique(all_vals))

# Get all within-ID pairs of countries as a two-column character matrix.
# unlist() flattens the row(s) of an ID into a plain vector (safe for factor
# columns and repeated IDs); NA cells are skipped, and IDs with fewer than two
# countries contribute no pairs because combn() would fail on them.
pair_mat <- do.call(rbind, lapply(split(df[-1L], df$ID), function(x) {
  vals <- as.character(unlist(x, use.names = FALSE))
  vals <- vals[!is.na(vals)]
  if (length(vals) < 2L) {
    return(NULL)
  }
  t(combn(vals, 2L))
}))
if (is.null(pair_mat)) {
  pair_mat <- matrix(character(0), nrow = 0L, ncol = 2L)
}
pairs <- as.data.frame(pair_mat, stringsAsFactors = FALSE)

# Remove pairs made of the same country; the diagonal is filled in separately.
pairs <- pairs[pairs$V1 != pairs$V2, ]

# Repeat the data frame with the columns swapped so that the upper and lower
# triangles hold the same numbers.
pairs <- rbind(
  pairs,
  data.frame(V1 = pairs$V2, V2 = pairs$V1, stringsAsFactors = FALSE)
)

# Tabulate the pairs over the full set of countries.
pairs$V1 <- factor(pairs$V1, levels = countries)
pairs$V2 <- factor(pairs$V2, levels = countries)
tab <- table(pairs)

# Set the diagonal to the total count of each country; c() drops the table
# class and leaves a named integer vector.
cnt <- c(table(factor(all_vals, levels = countries)))
diag(tab) <- cnt[countries]
tab
output:
V2
V1 China England Greece USA
China 2 2 2 0
England 2 6 1 1
Greece 2 1 3 1
USA 0 1 1 1
There may be better ways to do this, but try:
library(tidyverse)

# Country-by-country cross-product of the ID x Country count table.
# Off-diagonal cells sum, over IDs, the product of the two countries' counts.
df1 <- local({
  long <- pivot_longer(df, -ID, names_to = "Category", values_to = "Country")
  id_by_country <- xtabs(~ID + Country, data = long, sparse = FALSE)
  crossprod(id_by_country, id_by_country)
})

# Diagonal of the Country x Country table of each value against a copy of
# itself, i.e. the overall count of every country.
df_diag <- local({
  long <- pivot_longer(df, -ID, names_to = "Category", values_to = "Country")
  long <- mutate(long, Country2 = Country)
  diag(xtabs(~Country + Country2, data = long, sparse = FALSE))
})

# Overwrite the cross-product's diagonal with the overall counts.
diag(df1) <- df_diag
df1
Country China England Greece USA
China 2 2 2 0
England 2 6 1 1
Greece 2 1 3 1
USA 0 1 1 1