I have a data frame, which I read with Match <- read.table("Match.txt", sep="", fill =T, stringsAsFactors = FALSE, quote = "", header = F)
and it looks like this:
Maybe not the best use of stringr or tidyr, but this can be done in the hadleyverse in a somewhat readable manner...
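The logic flow is:
1. Mark the start of each block by keeping the row name only on the "Inspecting" rows, i.e. ifelse(V1 == "Inspecting", rowname, NA), and carry it down with tidyr::fill.
2. Extract chr, start and stop from V4 and fill them down within each block.
3. Reshape (dcast) to get the format that you want.

library(dplyr)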
library(tidyr)
library(reshape2)
library(stringr)
# Aggregation helper passed to dcast() as fun.aggregate.
#
# v1part: the values that fall into one cell of the cast table.
# Returns the single string "B" when the cell received at least one value
# and "U" when it received none (presumably "bound" / "unbound" -- confirm
# against the question's wording).
#
# The condition is a scalar, so a plain if/else is used rather than the
# vectorised ifelse().
is_in <- function(v1part) {
  if (length(v1part) > 0) "B" else "U"
}
# Reshape the raw match table `ab` into one row per chr/start/stop region with
# one B/U column per name extracted from V1.
#
# Steps:
#   1. Keep the row name only on rows where V1 is "Inspecting", then fill it
#      downward so every row carries the id of the block it belongs to.
#   2. Pull chr/start/stop out of V4 (strand markers "(-)" / "(+)" are blanked
#      first) and fill them downward within each block.
#   3. Take the part of V1 between "$" and "_" as the column name and cast to
#      wide form, using is_in() to turn "present / absent" into "B" / "U".
#
# NOTE(review): add_rownames() is deprecated in dplyr in favour of
# tibble::rownames_to_column(); it is kept here so no new package is required.
ab1 <- ab %>%
  add_rownames() %>%
  mutate(
    rowname = ifelse(V1 == "Inspecting", rowname, NA),
    V4a = ifelse(V4 %in% c("(-)", "(+)"), NA, V4),
    # str_extract() yields NA for non-matching rows directly and, unlike
    # str_extract_all(...)[, 1], does not fail when no row matches at all.
    chr = str_extract(V4, "^chr[^:]+"),
    start = str_split_fixed(V4a, ":|-", 3)[, 2],
    start = ifelse(start == "", NA, start),
    stop = str_split_fixed(V4a, ":|-", 3)[, 3],
    stop = ifelse(stop == "", NA, stop),
    V1part = str_split_fixed(V1, "\\$|_", 3)[, 2]
  ) %>%
  fill(rowname, .direction = "down") %>%
  group_by(rowname) %>%
  fill(chr, start, stop, .direction = "down") %>%
  # Drop the grouping before handing the data to reshape2.
  ungroup() %>%
  # value.var is spelled out; it is the column dcast() would otherwise guess
  # (the last one), so the result is unchanged but the guess message is gone.
  dcast(chr + start + stop ~ V1part, fun.aggregate = is_in, value.var = "V1part")
> ab1
chr start stop Var.4 ATF3 CEBPB YY1
1 chr1 173244300 173244500 B B B B
2 chr1 173244350 173244550 B B B U
Not elegant, but it should work (Your data has a column with "|"... I named it df):
# Build `new_df` with one row per header row of `df`, where a header row is any
# row whose V2 is not "|". The chr/start/stop values are parsed from that row's
# V4, which is split on ":" and then on "-".
cond <- which(df$V2 != "|")
n_blocks <- length(cond)
new_df <- data.frame(
  chr = character(n_blocks),
  start = character(n_blocks),
  stop = character(n_blocks),
  stringsAsFactors = FALSE
)

# Last row of df belonging to each block: the row just before the next header,
# or the final row of df for the last block (cond[i + 1] would be NA there).
block_end <- c(cond[-1] - 1L, nrow(df))

for (i in seq_along(cond)) {
  line <- df[cond[i], ]
  var <- unlist(strsplit(line$V4, split = ":"))
  var2 <- unlist(strsplit(var[2], split = "-"))
  new_df$chr[i] <- var[1]
  new_df$start[i] <- var2[1]
  new_df$stop[i] <- var2[2]
  # k walks the rows of df that follow header i, as row numbers of df (not
  # positions in cond). seq_len() yields an empty range when two header rows
  # are adjacent, instead of counting backwards.
  for (k in cond[i] + seq_len(block_end[i] - cond[i])) {
    # Your code using name <- df$V1 (Use strsplit again)
    # df[i, name] <- ...
  }
}
Given the input file from this question saved as /c/tmp.txt, and this awk script saved as SO-38563400.awk:
# Builds a tab-separated presence table: one output row per
# "Inspecting sequence ID" record, with its chr/start/end taken from field 4,
# followed by one B/U column per name seen on the V$<name>_... lines.
# Everything is held in memory and printed in the END block.
BEGIN {
    OFS = "\t"   # Set the output separator
    i = 0        # Number of sequences seen so far; the first one gets index 1
}
{
    #print $0    # Debugging aid: uncomment to echo every input line
}
/Inspecting sequence ID/ {       # New sequence: start a new entry
    split($4, arr, "[:-]")       # Split field 4 on ":" and "-"
    i++                          # Advance first so chr/start/end share one index
    seq[i, "chr"] = arr[1]       # Save the chr part
    seq[i, "start"] = arr[2]     # Save the start position
    seq[i, "end"] = arr[3]       # Save the end position
}
/V[$][^_]+_.*/ {                 # V line type
    split($1, arr, "[$_]")       # Split field 1 on "$" and underscore
    seq[i, arr[2]] = "B"         # This name has been seen for the current sequence
    seq[i, "print"] = 1          # Only sequences with at least one V line are printed
    names[arr[2]]++              # Remember the name for the header and count occurrences;
                                 # the keys of this array drive the column loops in END
}
END {
    # Header line: the three fixed columns, then one column per collected name.
    head = sprintf("char%sstart%sstop", OFS, OFS)
    for (h in names) {
        head = sprintf("%s%s%s", head, OFS, h)
    }
    print(head)
    # One line per sequence, including the last one (indices run from 1 to i).
    for (l = 1; l <= i; l++) {
        line = sprintf("%s%s%s%s%s", seq[l, "chr"], OFS, seq[l, "start"], OFS, seq[l, "end"])
        # names is not modified here, so the columns come out in the same
        # order as in the header loop above.
        for (h in names) {
            if (seq[l, h] == "B") line = sprintf("%s%s%s", line, OFS, "B")
            else line = sprintf("%s%s%s", line, OFS, "U")
        }
        if (seq[l, "print"]) print line
    }
}
Running this command:
awk -f SO-38563400.awk /c/tmp.txt > /c/Rtable.txt
Gives:
$ cat /c/Rtable.txt
char start stop STAT3 ATF3 TEAD4 GATA3 JUND HNF4A FOXA2 MAX CEBPB SPI1 GABPA CMYC P300 E2F1 CTCF ATF2
chr22 16049850 16050050 B B U B U B B U U U U U B B U B
chr22 16049900 16050100 B B B B B B B B B B B B B B B B
And then reading in r:
> x <- read.table("/c/Rtable.txt", sep="\t", stringsAsFactors = FALSE, header=T)
> x
char start stop STAT3 ATF3 TEAD4 GATA3 JUND HNF4A FOXA2 MAX CEBPB SPI1 GABPA CMYC P300 E2F1 CTCF ATF2
1 chr22 16049850 16050050 B B U B U B B U U U U U B B U B
2 chr22 16049900 16050100 B B B B B B B B B B B B B B B B
Please disregard the setup with /c/
paths; this works on Windows or Linux, since there are ports of awk
for Windows. I suggest using Linux for large files because of the operating system's file-streaming capabilities.
We could save far more memory by not reading the whole file before printing results, but this needs a fixed set of "names". Since you've been too lazy to extract the names yourself and just sent me a bunch of entries, the exercise of adapting the script is left to you: build the list in the BEGIN block, use it as the entries for each seq, and on each new seq print the previous result before processing the next one.
I hope that next time you'll take some time to write a proper question, and that you'll understand you have to make some effort for others to help you, especially after a stream of comments asking you to improve your question.