I have a huge .csv file like this:
Transcript Id Gene Id(name) Mirna Name miTG score
ENST00000286800 ENSG00000156273 (BACH1) hsa-let-7a-5p 1
You could try to structure the CSV using regular expressions:
# Example input: every transcript header line (starting with "ENST") is
# followed by zero or more "UTR3 <position> <score>" site lines.
textfile <- "ENST00000286800 ENSG00000156273 (BACH1) hsa-let-7a-5p 1
UTR3 21:30717114-30717142 0.05994568
UTR3 21:30717414-30717442 0.13591267
ENST00000345080 ENSG00000187772 (LIN28B) hsa-let-7a-5p 1
UTR3 6:105526681-105526709 0.133514751"

# Read through an explicit connection so it can be closed again.
con <- textConnection(textfile)
txt <- readLines(con)
close(con)

# Blank lines carry no record; drop them so they are not taken for site lines.
txt <- txt[nzchar(trimws(txt))]

# TRUE for transcript header lines, FALSE for the site lines below them.
sepr <- startsWith(txt, "ENST")

# For every line, the index of the header it belongs to. Unlike run lengths
# from rle(), this stays correct when a header has no site lines at all.
owner <- cumsum(sepr)
if (any(owner == 0L)) {
  stop("site line found before the first transcript header", call. = FALSE)
}

# Extract the first `n_groups` capture groups of `pattern` from each element
# of `lines`. Returns a data frame of character columns with one row per
# line; lines that do not match the pattern yield a row of NA.
extract_groups <- function(lines, pattern, n_groups) {
  hits <- regmatches(lines, regexec(pattern, lines))
  fields <- vapply(
    hits,
    # Element 1 is the full match; the capture groups start at element 2.
    function(hit) hit[seq_len(n_groups) + 1L],
    character(n_groups)
  )
  as.data.frame(
    matrix(fields, ncol = n_groups, byrow = TRUE),
    stringsAsFactors = FALSE
  )
}

# Header fields: transcript id, "gene id (gene name)", miRNA name, miTG score.
# The gene id and its parenthesised name form ONE field, and the score may be
# a decimal, so it is captured as a whole token rather than as digits only.
header_rx <- "(\\S+)\\s+(\\S+\\s+\\([^)]+\\))\\s+(\\S+)\\s+(\\S+)"
m1 <- extract_groups(txt[sepr], header_rx, 4L)

# Site fields: skip the leading "UTR3" label, keep position and MRE score.
site_rx <- "\\S+\\s+(\\S+)\\s+(\\S+)"
m2 <- extract_groups(txt[!sepr], site_rx, 2L)

# One output row per site line, prefixed with the fields of its header.
# Headers without any site line contribute no rows.
df <- cbind(m1[owner[!sepr], , drop = FALSE], m2)
names(df) <- c(
  "Transcript Id", "Gene Id(name)", "Mirna Name", "miTG score",
  "UTR3", "MRE_score"
)
rownames(df) <- NULL
df
# Transcript Id Gene Id(name) Mirna Name miTG score UTR3 MRE_score
# 1 ENST00000286800 ENSG00000156273 (BACH1) hsa-let-7a-5p 1 21:30717114-30717142 0.05994568
# 2 ENST00000286800 ENSG00000156273 (BACH1) hsa-let-7a-5p 1 21:30717414-30717442 0.13591267
# 3 ENST00000345080 ENSG00000187772 (LIN28B) hsa-let-7a-5p 1 6:105526681-105526709 0.133514751