How to calculate a (co-)occurrence matrix from a data frame with several columns using R?

后端 未结 3 1372
时光取名叫无心
时光取名叫无心 2021-01-12 13:57

I'm a rookie in R and currently working with collaboration data in the form of an edge list with 32 columns and around 200,000 rows. I want to create a (co-)occurrence matr

相关标签:
3条回答
  • 2021-01-12 14:32

    Here is a way using the dplyr and tidyr packages. The idea is to build a data frame holding the row-wise occurrence count of each country and then join it to itself.

    library(dplyr)
    
    # Sample data: one record per row (ID), one country per V* column.
    df <- data.frame(ID = c(1, 2, 3, 4),
                     V1 = c("England", "England", "China", "England"),
                     V2 = c("Greece", "England", "Greece", "England"),
                     V32 = c("USA", "China", "Greece", "England"),
                     stringsAsFactors = FALSE)
    
    # Count how often each country occurs within each row (ID).
    # pivot_longer() replaces the superseded gather(); count(ID, country)
    # returns an ungrouped tibble with the count in column `n`.
    row_occurrence <-
      df %>%
      tidyr::pivot_longer(cols = -ID, names_to = "identifier",
                          values_to = "country") %>%
      count(ID, country)
    
    row_occurrence %>%
      # Join row_occurrence to itself on ID: this yields every ordered pair
      # of countries that share a row (including each country with itself).
      left_join(row_occurrence, by = "ID") %>%
      # Take the larger of the two within-row counts, so that a country
      # repeated within the same row is handled.
      mutate(occurrence = pmax(n.x, n.y)) %>%
      # Sum the per-row values for each pair of countries; drop the grouping
      # so the result is a plain (ungrouped) tibble.
      group_by(country.x, country.y) %>%
      summarise(occurrence = sum(occurrence), .groups = "drop") %>%
      # Spread into matrix layout (pivot_wider() replaces the superseded
      # spread()); pairs that never share a row are filled with 0 and the
      # country columns are sorted by name.
      tidyr::pivot_wider(names_from = country.y, values_from = occurrence,
                         values_fill = 0, names_sort = TRUE)
    
    # Values of the resulting 4 x 5 tibble (one row per country.x):
    # country.x China England Greece   USA
    # China         2       2      2     0
    # England       2       6      1     1
    # Greece        2       1      3     1
    # USA           0       1      1     1
    
    0 讨论(0)
  • 2021-01-12 14:38

    An option using base::table:

    # Sample data: one record per row (ID), one country per V* column.
    # stringsAsFactors = FALSE keeps the countries as character on every R
    # version, so as.character() below returns names, not factor codes.
    df <- data.frame(ID = c(1, 2, 3, 4),
        V1 = c("England", "England", "China", "England"),
        V2 = c("Greece", "England", "Greece", "England"),
        V3 = c("USA", "China", "Greece", "England"),
        stringsAsFactors = FALSE)
    
    # every country present in the data, sorted; used as the factor levels so
    # that a country which never shares a row with a different country still
    # gets a row and a column in the final table
    countries <- sort(unique(unlist(df[-1L], use.names = FALSE)))
    
    # get paired combinations per ID and remove those from the same country
    pairs <- as.data.frame(do.call(rbind,
        by(df, df$ID, function(x) t(combn(as.character(x[-1L]), 2L)))),
        stringsAsFactors = FALSE)
    pairs <- pairs[pairs$V1 != pairs$V2, ]
    
    # repeat the data frame with columns swapped so that
    # upper and lower triangles have the same numbers
    pairs <- rbind(pairs,
        data.frame(V1 = pairs$V2, V2 = pairs$V1, stringsAsFactors = FALSE))
    
    # tabulate pairs over the full set of countries
    pairs[] <- lapply(pairs, factor, levels = countries)
    tab <- table(pairs)
    
    # set diagonals to be the total count of each country
    cnt <- c(table(unlist(df[-1L])))
    diag(tab) <- cnt[rownames(tab)]
    
    tab
    

    output:

             V2
    V1        China England Greece USA
      China       2       2      2   0
      England     2       6      1   1
      Greece      2       1      3   1
      USA         0       1      1   1
    
    0 讨论(0)
  • 2021-01-12 14:39

    There may be better ways to do this, but try:

    library(tidyverse)
    
    # Long format: one row per (ID, source column, country) combination.
    # `df` is the sample data frame with an ID column plus country columns.
    long <- pivot_longer(df, -ID, names_to = "Category", values_to = "Country")
    
    # ID x Country contingency table; crossprod(m, m) is t(m) %*% m, giving a
    # Country x Country matrix.
    id_by_country <- xtabs(~ID + Country, data = long, sparse = FALSE)
    df1 <- crossprod(id_by_country, id_by_country)
    
    # Cross-tabulating Country against a copy of itself puts the total count
    # of each country on the diagonal; extract that diagonal.
    long_paired <- mutate(long, Country2 = Country)
    country_by_country <- xtabs(~Country + Country2, data = long_paired,
                                sparse = FALSE)
    df_diag <- diag(country_by_country)
    
    # Overwrite the diagonal of the cross-product with the total counts.
    diag(df1) <- df_diag
    
    df1
    
    Country   China England Greece USA
      China       2       2      2   0
      England     2       6      1   1
      Greece      2       1      3   1
      USA         0       1      1   1
    
    0 讨论(0)
提交回复
热议问题