问题
I need to split a column that contains information into several columns.
I'd use tstrsplit
but the same kind of information is not in the same order among the rows and I need to extract the name of the new column within the variable. Important to know: there can be many pieces of information (fields to become new variables) and I don't know all of them, so I don't want a "field by field" solution.
Below is an example of what I have:
library(data.table)
myDT <- structure(list(chr = c("chr1", "chr2", "chr4"), pos = c(123L,
435L, 120L), info = c("type=3;end=4", "end=6", "end=5;pos=TRUE;type=2"
)), class = c("data.table", "data.frame"), row.names = c(NA,-3L))
# chr pos info
#1: chr1 123 type=3;end=4
#2: chr2 435 end=6
#3: chr4 120 end=5;pos=TRUE;type=2
And I'd like to get:
# chr pos end pos type
#1: chr1 123 4 <NA> 3
#2: chr2 435 6 <NA> <NA>
#3: chr4 120 5 TRUE 2
A most straightforward way to get that would be much appreciated! (Note: I'm not willing to go with a dplyr/tidyr way)
回答1:
Using regex
and the stringi
packages:
setDT(myDT) # After creating data.table from structure()
library(stringi)
fields <- unique(unlist(stri_extract_all(regex = "[a-z]+(?==)", myDT$info)))
patterns <- sprintf("(?<=%s=)[^;]+", fields)
myDT[, (fields) := lapply(patterns, function(x) stri_extract(regex = x, info))]
myDT[, !"info"]
chr pos type end
1: chr1 <NA> 3 4
2: chr2 <NA> <NA> 6
3: chr4 TRUE 2 5
Edit: To get the correct type it seems (?) type.convert()
can be used:
myDT[, (fields) := lapply(patterns, function(x) type.convert(stri_extract(regex = x, info), as.is = TRUE))]
回答2:
I am guessing your data is coming from a VCF file, if so there is a dedicated tool for such problems - bcftools.
Let's create example VCF file for testing:
# subset some data from 1000genomes data
tabix -h ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz 17:1471000-1472000 > myFile.vcf
# zip it and index:
bgzip -c myFile.vcf > myFile.vcf.gz
tabix -p vcf myFile.vcf.gz
Now we can use bcftools. Here as an example we are subsetting AF and DP from INFO column:
bcftools query -f '%CHROM %POS %INFO/AF %INFO/DP \n' myFile.vcf.gz
17 1471199 1916 0.088
17 1471538 2445 0.016
17 1471611 2733 0.239
17 1471623 2815 0.003
17 1471946 1608 0.007
17 1471959 1612 0.014
17 1471975 1610 0.179
See the manual for more query options.
回答3:
We could split on ";"
then reshape wide-to-long, then split again on "="
, then reshape back to long-to-wide:
dcast(
melt(dt[, paste0("col", 1:3) := tstrsplit(info, split = ";") ],
id.vars = c("chr", "pos", "info"))[, -c("info", "variable")][
,c("x1", "x2") := tstrsplit(value, split = "=")][
,value := NULL][ !is.na(x1), ],
chr + pos ~ x1, value.var = "x2")
# chr pos end pos type
# 1: chr1 123 4 <NA> 3
# 2: chr2 435 6 <NA> <NA>
# 3: chr4 120 5 TRUE 2
An improved / more readible version:
dt[, paste0("col", 1:3) := tstrsplit(info, split = ";")
][, melt(.SD, id.vars = c("chr", "pos", "info"), na.rm = TRUE)
][, -c("info", "variable")
][, c("x1", "x2") := tstrsplit(value, split = "=")
][, dcast(.SD, chr + pos ~ x1, value.var = "x2")]
回答4:
For now, I managed to get what I want with the following code:
newDT <- reshape(splitstackshape::cSplit(myDT, "info", sep=";", "long")[,
c(.SD, tstrsplit(info, "="))],
idvar=c("chr", "pos"), direction="wide", timevar="V4", drop="info")
setnames(newDT, sub("V5\\.", "", names(newDT)))
newDT
# chr pos type end pos
#1: chr1 123 3 4 <NA>
#2: chr2 435 <NA> 6 <NA>
#3: chr4 120 2 5 TRUE
Two options to improve the lines above, thanks to @A5C1D2H2I1M1N2O1R2T1 (who gave them in comments) :
. with a double cSplit
prior to dcast
:
cSplit(cSplit(myDT, "info", ";", "long"), "info", "=")[, dcast(.SD, chr + pos ~ info_1, value.var = "info_2")]
. with cSplit
/trstrplit
and dcast
instead of reshape
:
cSplit(myDT, "info", ";", "long")[, c("t1", "t2") := tstrsplit(info, "=", fixed = TRUE)][, dcast(.SD, chr + pos ~ t1, value.var = "t2")]
回答5:
Here's how I'd do it :
library(data.table)
myDT <- structure(list(chr = c("chr1", "chr2", "chr4"), pos = c(123L,
435L, 120L), info = c("type=3;end=4", "end=6", "end=5;pos=TRUE;type=2"
)), class = c("data.table", "data.frame"), row.names = c(NA,-3L))
R_strings <- paste0("list(", chartr(";", ",", myDT$info),")")
lists <- lapply(parse(text=R_strings),eval)
myDT[,info:=NULL]
myDT <- cbind(myDT,rbindlist(lists, fill = TRUE))
myDT
#> chr pos type end pos
#> 1: chr1 123 3 4 NA
#> 2: chr2 435 NA 6 NA
#> 3: chr4 120 2 5 TRUE
Created on 2019-11-29 by the reprex package (v0.3.0)
回答6:
You could use separate calls to sub
for each desired extracted field, e.g. for type
:
myDT$type <- sub("^.*\\btype=([^;]+)\\b.*$", "\\1", myDT$info)
来源:https://stackoverflow.com/questions/59103539/split-character-columns-and-get-names-of-field-in-string