Convert an object of class “dist” into a data frame in R

前端 未结 4 1474
傲寒
傲寒 2021-02-04 14:43

If it is possible to convert a data frame to an object of class "dist", is it possible to do just the opposite, i.e. convert an object of class "dist" to a data frame? For example:

< dist(ha

相关标签:
4条回答
  • 2021-02-04 15:04

    I would do something like this ..

    library(reshape)
    set.seed(123)

    # 5 x 5 matrix of random normal draws, filled row by row.
    # `TRUE` is spelled out because `T` is an ordinary variable that can be
    # reassigned, whereas `TRUE` is a reserved word.
    mat <- matrix(rnorm(25), ncol = 5, nrow = 5, byrow = TRUE)
    d   <- dist(mat)

    # melt() on the full symmetric matrix yields one row per cell, listed
    # column by column (the first index varies fastest).
    df  <- melt(as.matrix(d))

    # Keep only the cells strictly below the diagonal. lower.tri() walks the
    # matrix in the same column-major order as the melted rows, so the mask
    # lines up row for row; this drops the self distances and the mirrored
    # duplicates in one step, without sorting or pasting index pairs.
    df  <- df[as.vector(lower.tri(as.matrix(d))), ]

    df
    X1 X2    value
    2  1 3.812287
    3  1 2.311832
    4  1 4.385048
    5  1 2.854179
    3  2 1.916895
    4  2 1.557744
    5  2 2.880357
    4  3 2.509214
    5  3 2.886526
    5  4 3.408147
    
    0 讨论(0)
  • 2021-02-04 15:04

    Here's another approach that avoids melt() and as.matrix() ... it'd be nice to avoid subset() too, but I'll leave that as a homework exercise

    # Convert a "dist" object into a long data frame with one row per stored
    # pairwise distance.
    #
    # d: an object carrying a "Size" attribute, as produced by dist().
    #
    # Returns a data frame with integer columns `row` and `col` (the indices of
    # the two observations, always row > col) and a numeric column `distance`,
    # in the order the distances are stored in `d`. For fewer than two
    # observations the result has zero rows.
    dist.to.df <- function(d){
      size <- attr(d, "Size")
      # seq_len() is used instead of 2:size and 1:(size-1): with size < 2 the
      # colon operator counts backwards (e.g. 2:1), which produced index pairs
      # for distances that do not exist and made data.frame() fail.
      rows <- seq_len(size)[-1]
      cols <- seq_len(max(size - 1L, 0L))
      data.frame(
        # expand.grid() varies `row` fastest, so after keeping row > col the
        # pairs are ordered column by column, matching the storage order of d.
        subset(expand.grid(row=rows, col=cols), row > col),
        distance=as.numeric(d),
        row.names = NULL
      )
    }
    

    which gives...

    # Demo: pairwise distances between the coordinates of the first rows of
    # the us.cities data set shipped with the maps package.
    library(maps)
    data(us.cities)
    first_cities <- head(us.cities[, c("lat", "long")])
    d <- dist(first_cities)
    dist.to.df(d)
    ##    row col  distance
    ## 1    2   1 20.160489
    ## 2    3   1 23.139853
    ## 3    4   1 15.584303
    ## 4    5   1 27.880674
    ## 5    6   1 26.331187
    ## 6    3   2 40.874243
    ## 7    4   2  9.865374
    ## 8    5   2  7.882037
    ## 9    6   2 41.720457
    ## 10   4   3 38.579820
    ## 11   5   3 48.707100
    ## 12   6   3  6.900101
    ## 13   5   4 15.189882
    ## 14   6   4 41.036931
    ## 15   6   5 49.328558
    
    0 讨论(0)
  • 2021-02-04 15:05
    # Packages: maps supplies the us.cities data, reshape2 supplies melt().
    library(maps)
    library(reshape2)
    data(us.cities)

    # Pairwise distances between the lat/long coordinates of the first rows.
    coords <- head(us.cities[c("lat", "long")])
    d <- dist(coords)

    ##           1         2         3         4         5
    ## 2 20.160489
    ## 3 23.139853 40.874243
    ## 4 15.584303  9.865374 38.579820
    ## 5 27.880674  7.882037 48.707100 15.189882
    ## 6 26.331187 41.720457  6.900101 41.036931 49.328558

    # Expand to the full symmetric matrix and melt it to one row per cell.
    df <- melt(as.matrix(d), varnames = c("row", "col"))

    # Keep the cells below the diagonal, i.e. each pair exactly once.
    subset(df, row > col)
    ##    row col     value
    ## 2    2   1 20.160489
    ## 3    3   1 23.139853
    ## 4    4   1 15.584303
    ## 5    5   1 27.880674
    ## 6    6   1 26.331187
    ## 9    3   2 40.874243
    ## 10   4   2  9.865374
    ## 11   5   2  7.882037
    ## 12   6   2 41.720457
    ## 16   4   3 38.579820
    ## 17   5   3 48.707100
    ## 18   6   3  6.900101
    ## 23   5   4 15.189882
    ## 24   6   4 41.036931
    ## 30   6   5 49.328558
    
    0 讨论(0)
  • 2021-02-04 15:09

    I would actually write a function something like this:

    # Convert a "dist" object into a long data frame with one row per stored
    # pairwise distance.
    #
    # inDist: an object inheriting from class "dist".
    #
    # Returns a data frame with columns `row` and `col` (the "Labels" attribute
    # of the two observations, or their integer positions when there are no
    # labels) and `value` (the distance), in the order the distances are stored.
    # Stops with "wrong input type" when `inDist` is not a dist object.
    myFun <- function(inDist) {
      # inherits() also accepts objects that carry "dist" alongside other
      # classes; comparing class() with != yields a vector-valued condition
      # for such objects.
      if (!inherits(inDist, "dist")) stop("wrong input type")
      A <- attr(inDist, "Size")
      B <- if (is.null(attr(inDist, "Labels"))) seq_len(A) else attr(inDist, "Labels")
      # The "Diag" and "Upper" attributes are not touched: as.vector() below
      # drops every attribute, so they cannot influence the result.
      #
      # Column j of the lower triangle holds A - j distances (rows j+1 .. A).
      # max(..., 0L) keeps a dist object with fewer than two observations from
      # producing a negative or reversed sequence; it yields a zero-row result.
      per_col <- rev(seq_len(max(A - 1L, 0L)))
      col_idx <- rep(seq_along(per_col), times = per_col)
      # sequence() restarts at 1 for every column; adding the column index
      # shifts it to the actual row position below the diagonal.
      row_idx <- sequence(per_col) + col_idx
      data.frame(
        row = B[row_idx],
        col = B[col_idx],
        value = as.vector(inDist))
    }
    

    Now, imagine we are starting with (note the non-numeric row and column names):

    # Turn the correlations among the first five USJudgeRatings variables into
    # dissimilarities: (1 - r) / 2 is 0 for r = 1 and 1 for r = -1.
    judge_cor <- cor(USJudgeRatings)[1:5, 1:5]
    dd <- as.dist((1 - judge_cor) / 2)
    #            CONT       INTG       DMNR       DILG
    # INTG 0.56659545                                 
    # DMNR 0.57684427 0.01769236                      
    # DILG 0.49380400 0.06424445 0.08157452           
    # CFMG 0.43154385 0.09295712 0.09332092 0.02060062
    

    We can change it with a simple:

    # Flatten the labelled dist object into one row per pair of variables.
    myFun(dd)
    #     row  col      value
    # 1  INTG CONT 0.56659545
    # 2  DMNR CONT 0.57684427
    # 3  DILG CONT 0.49380400
    # 4  CFMG CONT 0.43154385
    # 5  DMNR INTG 0.01769236
    # 6  DILG INTG 0.06424445
    # 7  CFMG INTG 0.09295712
    # 8  DILG DMNR 0.08157452
    # 9  CFMG DMNR 0.09332092
    # 10 CFMG DILG 0.02060062
    

    A quick performance comparison:

    # Benchmark input: a 1000 x 1000 matrix of seeded normal draws and the
    # pairwise distances between its rows.
    set.seed(1)
    n_obs <- 1000
    x <- matrix(rnorm(n_obs * n_obs), nrow = n_obs)
    dd <- dist(x)
    
    ## Jake's function
    # Melt the full distance matrix to long form, then keep only the cells
    # whose row index is greater than the column index.
    fun2 <- function(inDist) {
      long <- melt(as.matrix(inDist), varnames = c("row", "col"))
      below_diag <- as.numeric(long$row) > as.numeric(long$col)
      long[below_diag, ]
    }
    
    # Check that both implementations return the same values, cell by cell.
    all(fun2(dd) == myFun(dd))
    # [1] TRUE
    # Time the melt-based version on the 1000-observation dist object ...
    system.time(fun2(dd))
    #    user  system elapsed 
    #   0.346   0.002   0.349 
    # ... and the version that builds the indices directly from the attributes.
    system.time(myFun(dd))
    #    user  system elapsed 
    #   0.012   0.000   0.015
    
    0 讨论(0)
提交回复
热议问题