Dataframe processing

前端未结

关注

 3  1598

I have a data frame, which I read with Match <- read.table("Match.txt", sep="", fill =T, stringsAsFactors = FALSE, quote = "", header = F), and it looks like this:

相关标签:

3条回答

不思量自难忘°

2020-12-12 06:20

Maybe not the best use of stringr or tidyr, but this can be done in the hadleyverse in a somewhat readable manner...

The logic flow is:

Determine the group: mark each "Inspecting" row with its own row identifier (NA for every other row), then propagate that identifier downwards with tidyr::fill.
Mutate the fields into the form you want.
Use reshape2 (dcast) to get the format that you want.

library(dplyr)
library(tidyr)
library(reshape2)
library(stringr)

#' Presence flag used as the aggregation function in `dcast()`.
#'
#' @param v1part Vector of values that fell into one output cell; may be
#'   empty (or `NULL`) when nothing landed in that cell.
#' @return `"B"` if `v1part` has at least one element, otherwise `"U"`.
is_in <- function(v1part) {
  # The condition is a scalar, so plain if/else is used, not ifelse().
  if (length(v1part) > 0) "B" else "U"
}

# Reshape `ab` into one row per chr/start/stop combination, with one column
# per V1part value holding "B" (at least one row present) or "U" (none).
ab1 <- ab %>%
  mutate(
    # Tag every "Inspecting" row with its row number; all other rows get NA
    # so that fill() below can carry the tag down to the rows that follow.
    rowname = ifelse(V1 == "Inspecting", row_number(), NA),
    # "(-)" / "(+)" in V4 are blanked so they are not split below.
    V4a = ifelse(V4 %in% c("(-)", "(+)"), NA, V4),

    # Leading "chr..." token up to the first ":"; NA when V4 has none.
    chr = str_extract(V4, "^chr[^:]+"),

    # Second and third pieces of V4a when split on ":" or "-".
    start = str_split_fixed(V4a, ":|-", 3)[, 2],
    start = ifelse(start == "", NA, start),

    stop = str_split_fixed(V4a, ":|-", 3)[, 3],
    stop = ifelse(stop == "", NA, stop),

    # Second piece of V1 when split on "$" or "_".
    V1part = str_split_fixed(V1, "\\$|_", 3)[, 2]
  ) %>%
  fill(rowname, .direction = "down") %>%
  group_by(rowname) %>%
  # Within each tagged group, copy chr/start/stop down from the tagged row.
  fill(chr, start, stop, .direction = "down") %>%
  ungroup() %>%
  # value.var is spelled out: V1part is the last column, which is what
  # dcast() would otherwise guess (with a message).
  dcast(
    chr + start + stop ~ V1part,
    fun.aggregate = is_in,
    value.var = "V1part"
  )

> ab1
   chr     start      stop Var.4 ATF3 CEBPB YY1
1 chr1 173244300 173244500     B    B     B   B
2 chr1 173244350 173244550     B    B     B   U

0 讨论(0)

日久生厌

2020-12-12 06:30

Not elegant, but it should work (your data has a column containing "|"; I named the data frame df):

# Row indices of `df` whose V2 is not "|"; V4 of each such row is parsed
# below as chr:start-stop.
cond <- which(df$V2 != "|")
n_regions <- length(cond)
new_df <- data.frame(
  chr = character(n_regions),
  start = character(n_regions),
  stop = character(n_regions),
  stringsAsFactors = FALSE  # keep columns character on R < 4.0 as well
)

# Last row of each block: the row before the next header row, or the final
# row of `df` for the last block.
block_end <- c(cond[-1], nrow(df) + 1L) - 1L

for (i in seq_along(cond)) {
  line <- df[cond[i], ]
  var <- unlist(strsplit(line$V4, split = ":"))
  var2 <- unlist(strsplit(var[2], split = "-"))
  new_df$chr[i] <- var[1]
  new_df$start[i] <- var2[1]
  new_df$stop[i] <- var2[2]
  # Rows of `df` that follow header row cond[i] up to the end of its block;
  # seq_len() yields an empty range when the block has no such rows.
  for (k in cond[i] + seq_len(block_end[i] - cond[i])) {
    # Your code here: derive `name` from df$V1[k] (use strsplit again)
    # new_df[i, name] <- ...
  }
}

0 讨论(0)

-上瘾入骨i

2020-12-12 06:33

Given the input file from your question, saved as /c/tmp.txt,

and this awk script, saved as SO-38563400.awk:

# Builds a tab-separated table with one row per "Inspecting sequence ID"
# entry: chr, start, stop, then one column per name seen on the V lines,
# holding "B" if that name was seen for the sequence and "U" otherwise.
BEGIN {
  OFS = "\t" # Set the output separator
  i = 0      # Sequence counter; the first sequence gets index 1
}

/Inspecting sequence ID/ { # New sequence: initialise its entry
  split($4, arr, "[:-]")   # Split field 4 on ":" and "-"
  ++i                      # Advance first, so all three parts share an index
  seq[i, "chr"] = arr[1]   # Save the chr part
  seq[i, "start"] = arr[2] # Save the start position
  seq[i, "end"] = arr[3]   # Save the end position
}

/V[$][^_]+_.*/ { # V line type
  split($1, arr, "[$_]") # Split on $ and underscore
  seq[i, arr[2]] = "B"   # This name has been seen for the current sequence
  seq[i, "print"] = 1    # Only sequences with at least one V line are printed
  names[arr[2]]++        # Remember the name for the output columns
  # (the count itself is not used; the array keys are what the END block
  # iterates over)
}

END {
  # Header line. "char" is kept as the first column name so the output
  # matches the sample shown below.
  head = sprintf("char%sstart%sstop", OFS, OFS)
  for (h in names) {
    head = head OFS h
  }
  print head

  # Sequences are numbered 1..i, so the last one (l == i) is included.
  for (l = 1; l <= i; l++) {
    if (!seq[l, "print"]) continue
    line = seq[l, "chr"] OFS seq[l, "start"] OFS seq[l, "end"]
    # `names` is not modified between the two loops, so the columns come
    # out in the same order as in the header.
    for (h in names) {
      if (seq[l, h] == "B") line = line OFS "B"
      else line = line OFS "U"
    }
    print line
  }
}

Passing this command:

# Run the awk script on /c/tmp.txt and redirect its output to /c/Rtable.txt
awk -f SO-38563400.awk /c/tmp.txt > /c/Rtable.txt

Gives:

$ cat /c/Rtable.txt
char    start   stop    STAT3   ATF3    TEAD4   GATA3   JUND    HNF4A   FOXA2   MAX     CEBPB   SPI1    GABPA   CMYC    P300    E2F1    CTCF    ATF2
chr22   16049850        16050050        B       B       U       B       U       B       B       U       U       U       U       U       B       B       U       B
chr22   16049900        16050100        B       B       B       B       B       B       B       B       B       B       B       B       B       B       B       B

And then reading it in R:

> # Read the tab-separated table back in; the first line is the header
> x <- read.table("/c/Rtable.txt", sep = "\t", stringsAsFactors = FALSE, header = TRUE)
> x
char    start     stop STAT3 ATF3 TEAD4 GATA3 JUND HNF4A FOXA2 MAX CEBPB SPI1 GABPA CMYC P300 E2F1 CTCF ATF2
1 chr22 16049850 16050050     B    B     U     B    U     B     B   U     U    U     U    U    B    B    U    B
2 chr22 16049900 16050100     B    B     B     B    B     B     B   B     B    B     B    B    B    B    B    B

Please disregard the setup with /c/ paths; this can work on Windows or Linux, as there are ports of awk for Windows. I suggest using Linux for large files because of the operating system's capabilities for file streaming.

We could save far more memory by not reading the whole file before printing results, but this needs a fixed set of "names". Since you were too lazy to extract the names yourself and just sent me a bunch of entries, adapting the script is left to you as an exercise: build the list of names in the BEGIN block, use it as the set of entries for each seq, and on each new seq print the previous result before processing the new one.

I hope that next time you'll take some time to write a proper question, and that you'll understand you have to make some effort for others to help you, especially after a flow of comments asking you to improve your question.

0 讨论(0)