How can I extract numbers from a string in R?

问题

names(score)
 [1] "(Intercept)"              "aado2_calc(20,180]"       "aado2_calc(360,460]"     
 [4] "aado2_calc(460,629]"      "albumin[1,1.8]"           "albumin(1.8,2.2]"        
 [7] "albumin(2.2,2.8]"         "aniongap(15,18]"          "aniongap(18,20]"         
[10] "aniongap(20,22]"          "aniongap(22,25]"          "aniongap(25,49]"

I want to extract the two numbers inside the brackets (numbers outside the brackets are not needed); the opening delimiter may be either "(" or "[". The first number should be assigned to an object "low" and the second to "high".

回答1:

# Example input: the coefficient names from the question, one per line.
# The first entry has no interval; the others end in "(low,high]" or
# "[low,high]".
scorenames <- c(
  "(Intercept)",
  "aado2_calc(20,180]",
  "aado2_calc(360,460]",
  "aado2_calc(460,629]",
  "albumin[1,1.8]",
  "albumin(1.8,2.2]",
  "albumin(2.2,2.8]",
  "aniongap(15,18]",
  "aniongap(18,20]",
  "aniongap(20,22]",
  "aniongap(22,25]",
  "aniongap(25,49]"
)

The first step might be to extract everything within the "parens"-delimiters (to include (), [], and the comma ,).

# For every name, collect each run of digits/dots that is immediately
# preceded by "[", "(" or "," and immediately followed by "]", ")" or ",".
# The look-behind/look-ahead assertions require perl = TRUE.
mat <- regmatches(
  scorenames,
  m = gregexpr(
    pattern = "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])",
    text = scorenames,
    perl = TRUE
  )
)
str(mat)
# List of 12
#  $ : chr(0) 
#  $ : chr [1:2] "20" "180"
#  $ : chr [1:2] "360" "460"
#  $ : chr [1:2] "460" "629"
#  $ : chr [1:2] "1" "1.8"
#  $ : chr [1:2] "1.8" "2.2"
#  $ : chr [1:2] "2.2" "2.8"
#  $ : chr [1:2] "15" "18"
#  $ : chr [1:2] "18" "20"
#  $ : chr [1:2] "20" "22"
#  $ : chr [1:2] "22" "25"
#  $ : chr [1:2] "25" "49"

From here, we can see that (1) the first one is problematic (no surprise, you need to figure out what you want here), and (2) the rest look about right.

Here's one rough way to process this list. This is very trusting and naïve ... you should probably add checks to ensure the list is of length 2, that everything converts correctly (perhaps in a tryCatch), etc.

# Convert each character vector of matches into a named list(low, high).
# Entries with no match at all become list(low = NA, high = NA).
newnames <- lapply(mat, function(found) {
  if (length(found) > 0L) {
    bounds <- as.list(as.numeric(found))
    names(bounds) <- c("low", "high")
    bounds
  } else {
    list(low = NA, high = NA)
  }
})
str(newnames)
# List of 12
#  $ :List of 2
#   ..$ low : logi NA
#   ..$ high: logi NA
#  $ :List of 2
#   ..$ low : num 20
#   ..$ high: num 180
#  $ :List of 2
#   ..$ low : num 360
#   ..$ high: num 460
# ...snip...

You can turn this into a data.frame with:

# Stack the per-name lists row-wise into one data.frame; show the first 6 rows.
head(do.call(rbind.data.frame, newnames), n = 6L)
#     low  high
# 1    NA    NA
# 2  20.0 180.0
# 3 360.0 460.0
# 4 460.0 629.0
# 5   1.0   1.8
# 6   1.8   2.2

回答2:

You can use the readr package and its function parse_number for ease of use. For more power, you'd want to use something like the base regular expression functions in R, or a package like stringi.

回答3:

Just like @jake-kaupp said - use stringi :) As you can see, the stringi solution is shorter, easier to understand and much faster - up to 30 times!

Short answer:

# simplify = NA returns a character matrix (one row per element of x),
# so the first and second matches can be read off as columns.
arr <- stri_extract_all_regex(
  x,
  "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])",
  simplify = NA
)
data.frame(low = as.numeric(arr[, 1L]), high = as.numeric(arr[, 2L]))

Long answer:

require(stringi)
require(microbenchmark)

# Extract the two interval bounds from each string using base R regex.
#
# x: character vector such as "albumin(1.8,2.2]" or "aniongap[15,18)".
#
# Returns a data.frame with numeric columns `low` and `high`, one row per
# element of `x`. An element that does not contain exactly two bracketed
# numbers (e.g. "(Intercept)") yields NA in both columns instead of raising
# an error or producing a mis-shaped row.
grepFun <- function(x) {
  # A run of digits/dots directly after "[", "(" or "," and directly before
  # "]", ")" or ","; the look-arounds require perl = TRUE.
  pattern <- "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])"
  matches <- regmatches(x, gregexpr(pattern, x, perl = TRUE))

  # vapply() enforces a fixed numeric(2) shape per element and returns a
  # 2 x length(x) matrix, avoiding the slow row-by-row rbind.data.frame().
  bounds <- vapply(matches, function(m) {
    if (length(m) != 2L) {
      return(c(NA_real_, NA_real_))
    }
    as.numeric(m)
  }, numeric(2))

  data.frame(low = bounds[1L, ], high = bounds[2L, ])
}

# Extract the two interval bounds from each string using stringi.
#
# x: character vector such as "albumin(1.8,2.2]" or "aniongap[15,18)".
#
# Returns a data.frame with numeric columns `low` and `high`, one row per
# element of `x`; positions with no match are NA.
striFun <- function(x) {
  arr <- stri_extract_all_regex(
    x,
    "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])",
    simplify = NA
  )
  # With simplify = NA the matrix is only as wide as the largest number of
  # matches found in any element. If no element has two matches (for example
  # every input looks like "(Intercept)"), arr[, 2] would fail with
  # "subscript out of bounds", so pad with NA columns up to two.
  missing_cols <- 2L - ncol(arr)
  if (missing_cols > 0L) {
    arr <- cbind(
      arr,
      matrix(NA_character_, nrow = nrow(arr), ncol = missing_cols)
    )
  }
  data.frame(low = as.numeric(arr[, 1L]), high = as.numeric(arr[, 2L]))
}

# both functions work the same
grepFun(scorenames)
     low  high
1     NA    NA
2   20.0 180.0
3  360.0 460.0
4  460.0 629.0
...
12  25.0  49.0
striFun(scorenames)
     low  high
1     NA    NA
2   20.0 180.0
3  360.0 460.0
4  460.0 629.0
...
12  25.0  49.0

# Generate a larger, more varied test vector: a random prefix, a random
# opening bracket, two random integers separated by a comma, and a random
# closing bracket.
n <- 10000
x <- stri_paste(
  stri_rand_strings(n, length = 1:10),
  sample(c("(", "["), size = n, replace = TRUE),
  sample(1000, size = n, replace = TRUE),
  ",",
  sample(1000, size = n, replace = TRUE),
  sample(c(")", "]"), size = n, replace = TRUE)
)
head(x) # check first elements
[1] "O[68,434]"      "Ql[783,151)"    "Zk0(773,60)"    "ETfV(446,518]"  "Xixbr(576,855)" "G6QnHu(92,955)"

# short test using the new data
grepFun(x[1:6])
  low high
1  68  434
2 783  151
3 773   60
4 446  518
5 576  855
6  92  955
striFun(x[1:6])
  low high
1  68  434
2 783  151
3 773   60
4 446  518
5 576  855
6  92  955

# and a benchmark to demonstrate the performance difference
microbenchmark(grepFun(x), striFun(x))
Unit: milliseconds
       expr       min        lq      mean    median        uq       max neval
 grepFun(x) 330.27733 366.09306 416.56330 406.08914 465.29829 568.15250   100
 striFun(x)  11.57449  11.97825  13.38157  12.46927  13.67699  25.97455   100

来源：https://stackoverflow.com/questions/42078763/how-can-i-extract-numbers-from-a-string-in-r

标签

string

numbers

stringr