Dataframe processing

前端 未结 3 1598
情话喂你
情话喂你 2020-12-12 05:44

I have a dataframe, which I read with Match <- read.table("Match.txt", sep="", fill =T, stringsAsFactors = FALSE, quote = "", header = F), and it looks like

相关标签:
3条回答
  • 2020-12-12 06:20

    Maybe not the best use of stringr or tidyr, but this can be done in the hadleyverse in a somewhat readable manner...

    The logic flow is:

    • Determine the group: keep the row name only on rows where V1 == "Inspecting" (ifelse(V1 == "Inspecting", rowname, NA)), then carry it down with tidyr::fill.
    • Mutate the fields to what you wanted
    • Use reshape2 (dcast) to get the format that you want.

    library(dplyr)
    library(tidyr)
    library(reshape2)
    library(stringr)
    
    # Aggregation function for dcast(): returns "B" when at least one value
    # falls into a cell and "U" when the cell is empty.
    #
    # v1part: the vector of values that dcast() collected for one cell.
    #
    # The zero-length case matters: reshape2 fills structurally missing cells
    # with the result of calling fun.aggregate on a zero-length vector, which is
    # where the "U" entries in the output come from.
    is_in <- function(v1part) {
      # Scalar decision, so plain if/else rather than the vectorised ifelse().
      if (length(v1part) > 0) "B" else "U"
    }
    
    # Reshape the raw match table `ab` into one row per chr/start/stop region
    # with one column per V1 name part, each cell being is_in()'s "B" or "U".
    #
    # NOTE(review): add_rownames() is deprecated in dplyr in favour of
    # tibble::rownames_to_column(); it is kept because tibble is not attached
    # in this snippet.
    ab1 <- ab %>%
      add_rownames() %>%
      mutate(
        # Keep the row name only on the "Inspecting" rows; it becomes the group
        # id once it has been filled downwards.
        rowname = ifelse(V1 == "Inspecting", rowname, NA),
        # Strand markers carry no coordinates, so blank them out.
        V4a = ifelse(V4 %in% c("(-)", "(+)"), NA, V4),

        # str_extract() returns NA when there is no match, so no
        # empty-string-to-NA step is needed. Use the piped column V4 rather
        # than reaching back into `ab`.
        chr = str_extract(V4, "^chr[^:]+"),

        # "chr:start-stop" split on ':' or '-': piece 2 is start, piece 3 stop.
        start = str_split_fixed(V4a, ":|-", 3)[, 2],
        start = ifelse(start == "", NA, start),

        stop = str_split_fixed(V4a, ":|-", 3)[, 3],
        stop = ifelse(stop == "", NA, stop),

        # Second piece of V1 when split on '$' or '_'.
        V1part = str_split_fixed(V1, "\\$|_", 3)[, 2]
      ) %>%
      fill(rowname, .direction = "down") %>%
      group_by(rowname) %>%
      # Propagate the region coordinates to every row of the same group.
      fill(chr, start, stop, .direction = "down") %>%
      ungroup() %>%
      dcast(chr + start + stop ~ V1part, fun.aggregate = is_in)
    
    > ab1
       chr     start      stop Var.4 ATF3 CEBPB YY1
    1 chr1 173244300 173244500     B    B     B   B
    2 chr1 173244350 173244550     B    B     B   U
    
    0 讨论(0)
  • 2020-12-12 06:30

    Not elegant, but it should work (Your data has a column with "|"... I named it df):

    # Build one row of chr/start/stop per header row of `df`.
    # Header rows are the rows whose second field is not "|".
    cond <- which(df$V2 != "|")
    n_groups <- length(cond)
    new_df <- data.frame(
      chr = character(n_groups),
      start = character(n_groups),
      stop = character(n_groups),
      stringsAsFactors = FALSE # keep the columns as character on R < 4.0
    )
    # Last row of each group: the row before the next header, or the final
    # row of df for the last group (cond[i + 1] would be NA there).
    group_end <- c(cond[-1] - 1L, nrow(df))

    # seq_along() makes the loop a no-op when there are no header rows.
    for (i in seq_along(cond)) {
      line <- df[cond[i], ]
      # V4 has the form "chr:start-stop".
      var <- unlist(strsplit(line$V4, split = ":"))
      var2 <- unlist(strsplit(var[2], split = "-"))
      new_df$chr[i] <- var[1]
      new_df$start[i] <- var2[1]
      new_df$stop[i] <- var2[2]
      # k runs over the rows after header cond[i] up to group_end[i]; the
      # range is empty when a header has no rows below it.
      for (k in seq_len(group_end[i] - cond[i]) + cond[i]) {
        # Your code using name <- df$V1 (Use strsplit again)
        # df[i, name] <- ...
      }
    }
    
    0 讨论(0)
  • 2020-12-12 06:33

    given your input file in this question as /c/tmp.txt

    And this awk script saved as SO-38563400.awk:

    # Collects, for every "Inspecting sequence ID" line, the chr/start/end of
    # the sequence and the set of names seen on the following V$NAME_... lines,
    # then prints one tab-separated row per sequence with "B" (seen) or
    # "U" (not seen) for each name.
    BEGIN {
      OFS = "\t" # Set the output separator
      i = 0      # Number of sequences seen so far; the first one gets index 1
    }

    /Inspecting sequence ID/ { # Changing sequence, initialize new entry with start and end
      split($4, arr, "[:-]") # Split the 4th field on : and -
      i++                    # Advance first so chr, start and end share one index
      seq[i, "chr"] = arr[1]   # Save the chr part
      seq[i, "start"] = arr[2] # Save the start position
      seq[i, "end"] = arr[3]   # Save the end position
    }

    /V[$][^_]+_.*/ { # V line type
      split($1, arr, "[$_]") # Split on $ and underscore
      seq[i, arr[2]] = "B"   # This has been seen, setting to B
      seq[i, "print"] = 1    # Only sequences with at least one V line are printed
      names[arr[2]]++        # Save the name for output
      # (and count occurrences, just for fun, well mainly because an int is cheaper to store)
      # Main reason is that it allows quicker access to the array keys in the END block
    }

    END {
      # Header line; "char" is kept as the name of the first column.
      head = sprintf("char%sstart%sstop", OFS, OFS)
      for (h in names) {
        head = sprintf("%s%s%s", head, OFS, h)
      }
      print(head)
      for (l = 1; l <= i; l++) { # Loop over each sequence, including the last one
        line = sprintf("%s%s%s%s%s", seq[l, "chr"], OFS, seq[l, "start"], OFS, seq[l, "end"])
        for (h in names) {
          if (seq[l, h] == "B") line = sprintf("%s%s%s", line, OFS, "B")
          else line = sprintf("%s%s%s", line, OFS, "U")
        }
        if (seq[l, "print"]) print line
      }
    }
    

    Passing this command:

    awk -f SO-38563400.awk /c/tmp.txt > /c/Rtable.txt
    

    Gives:

    $ cat /c/Rtable.txt
    char    start   stop    STAT3   ATF3    TEAD4   GATA3   JUND    HNF4A   FOXA2   MAX     CEBPB   SPI1    GABPA   CMYC    P300    E2F1    CTCF    ATF2
    chr22   16049850        16050050        B       B       U       B       U       B       B       U       U       U       U       U       B       B       U       B
    chr22   16049900        16050100        B       B       B       B       B       B       B       B       B       B       B       B       B       B       B       B
    

    And then reading it in R:

    > x <- read.table("/c/Rtable.txt", sep="\t",  stringsAsFactors = FALSE, header=T)
    > x
    char    start     stop STAT3 ATF3 TEAD4 GATA3 JUND HNF4A FOXA2 MAX CEBPB SPI1 GABPA CMYC P300 E2F1 CTCF ATF2
    1 chr22 16049850 16050050     B    B     U     B    U     B     B   U     U    U     U    U    B    B    U    B
    2 chr22 16049900 16050100     B    B     B     B    B     B     B   B     B    B     B    B    B    B    B    B
    

    Please disregard the setup with /c/ paths; this works on Windows or Linux (there are ports of awk for Windows). I suggest using Linux for large files because of the operating system's capacities for file streaming.

    We could save far more memory by not reading the whole file before printing results, but this needs a fixed set of "names". Since you've been too lazy to extract the names yourself and just sent me a bunch of entries, adapting it is left to you as an exercise: build the list in the BEGIN block, use it as the entries for each seq, and on each new seq print the previous result before processing.

    I hope next time you'll take some time to write a proper question, and that you'll understand you have to make some effort for others to help you, especially after a flow of comments asking you to improve your question.

    0 讨论(0)
提交回复
热议问题