问题
names(score)
[1] "(Intercept)" "aado2_calc(20,180]" "aado2_calc(360,460]"
[4] "aado2_calc(460,629]" "albumin[1,1.8]" "albumin(1.8,2.2]"
[7] "albumin(2.2,2.8]" "aniongap(15,18]" "aniongap(18,20]"
[10] "aniongap(20,22]" "aniongap(22,25]" "aniongap(25,49]"
I want to extract the two numbers within parenthesis (numbers outside the parenthesis are not needed) and there are "(" or "[". the first number will be assigned to an object "low" and the second to "high".
回答1:
scorenames <- c(
"(Intercept)" ,"aado2_calc(20,180]" ,"aado2_calc(360,460]"
,"aado2_calc(460,629]" ,"albumin[1,1.8]" ,"albumin(1.8,2.2]"
,"albumin(2.2,2.8]" ,"aniongap(15,18]" ,"aniongap(18,20]"
,"aniongap(20,22]" ,"aniongap(22,25]" ,"aniongap(25,49]"
)
The first step might be to extract everything within the "parens"-delimiters (to include ()
, []
, and the comma ,
).
mat <- regmatches(scorenames,
gregexpr("(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])", scorenames, perl = TRUE))
str(mat)
# List of 12
# $ : chr(0)
# $ : chr [1:2] "20" "180"
# $ : chr [1:2] "360" "460"
# $ : chr [1:2] "460" "629"
# $ : chr [1:2] "1" "1.8"
# $ : chr [1:2] "1.8" "2.2"
# $ : chr [1:2] "2.2" "2.8"
# $ : chr [1:2] "15" "18"
# $ : chr [1:2] "18" "20"
# $ : chr [1:2] "20" "22"
# $ : chr [1:2] "22" "25"
# $ : chr [1:2] "25" "49"
From here, we can see that (1) the first one is problematic (no surprise, you need to figure out what you want here), and (2) the rest look about right.
Here's one rough way to process this list. This is very trusting and naïve ... you should probably add checks to ensure the list is of length 2, that everything converts correctly (perhaps in a tryCatch
), etc.
newnames <- lapply(mat, function(m) {
if (! length(m)) return(list(low = NA, high = NA))
setNames(as.list(as.numeric(m)), nm = c("low", "high"))
})
str(newnames)
# List of 12
# $ :List of 2
# ..$ low : logi NA
# ..$ high: logi NA
# $ :List of 2
# ..$ low : num 20
# ..$ high: num 180
# $ :List of 2
# ..$ low : num 360
# ..$ high: num 460
# ...snip...
You can turn this into a data.frame with:
head(do.call(rbind.data.frame, newnames))
# low high
# 1 NA NA
# 2 20.0 180.0
# 3 360.0 460.0
# 4 460.0 629.0
# 5 1.0 1.8
# 6 1.8 2.2
回答2:
You can use the readr
package and the function parse_number
for ease of use. For more power you'd want to use something like the base regular expression functions in r, or a package like stringi
回答3:
Just like @jake-kaupp said - use stringi
:) As you can see, stringi solution is shorter, easier to understand and much faster - up to 30 times!
Short answer:
arr <- stri_extract_all_regex(x, "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])", simplify = NA)
data.frame(low = as.numeric(arr[,1]), high = as.numeric(arr[,2]))
Long answer:
require(stringi)
require(microbenchmark)
grepFun <- function(x){
mat <- regmatches(x,
gregexpr("(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])", x, perl = TRUE))
newnames <- lapply(mat, function(m) {
if (! length(m)) return(list(low = NA, high = NA))
setNames(as.list(as.numeric(m)), nm = c("low", "high"))
})
do.call(rbind.data.frame, newnames)
}
striFun <- function(x){
arr <- stri_extract_all_regex(x, "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])", simplify = NA)
data.frame(low = as.numeric(arr[,1]), high = as.numeric(arr[,2]))
}
# both functions work the same grepFun(scorenames) low high 1 NA NA 2 20.0 180.0 3 360.0 460.0 4 460.0 629.0 ... 12 25.0 49.0 striFun(scorenames) low high 1 NA NA 2 20.0 180.0 3 360.0 460.0 4 460.0 629.0 ... 12 25.0 49.0
# generating more complicated vector
n <- 10000
x <- stri_paste(stri_rand_strings(n, length = 1:10), sample(c("(","["),n,TRUE),
sample(1000,n,TRUE), ",", sample(1000,n,TRUE), sample(c(")","]"), n, TRUE))
head(x) # check first elements
[1] "O[68,434]" "Ql[783,151)" "Zk0(773,60)" "ETfV(446,518]" "Xixbr(576,855)" "G6QnHu(92,955)"
#short test using new data grepFun(x[1:6]) low high 1 68 434 2 783 151 3 773 60 4 446 518 5 576 855 6 92 955 striFun(x[1:6]) low high 1 68 434 2 783 151 3 773 60 4 446 518 5 576 855 6 92 955 #and some benchmark to prove performance microbenchmark(grepFun(x), striFun(x)) Unit: milliseconds expr min lq mean median uq max neval grepFun(x) 330.27733 366.09306 416.56330 406.08914 465.29829 568.15250 100 striFun(x) 11.57449 11.97825 13.38157 12.46927 13.67699 25.97455 100
来源:https://stackoverflow.com/questions/42078763/how-can-i-extract-numbers-from-a-string-in-r