Extract a sample of words around a particular word using stringr in R

*爱你&永不变心* 提交于 2019-12-04 06:22:12

Try this:

# Extract "Verulam" together with the three whitespace-delimited words on each side.
#   ([^\\s]+\\s){3} : three runs of non-whitespace, each followed by one whitespace character
#   (\\s[^\\s]+){3} : three runs of non-whitespace, each preceded by one whitespace character
# NOTE(review): `text` is not defined in this snippet; presumably it holds the passage to search.
stringr::str_extract(text, "([^\\s]+\\s){3}Verulam(\\s[^\\s]+){3}")
# alternately, if you like " " more than \\s:
# stringr::str_extract(text, "(?:[^ ]+ ){3}Verulam(?: [^ ]+){3}")

#[1] "and created Baron Verulam in 1618[4] and"

Change the number inside the {} to suit your needs.

You can use non-capture (?:) groups, too, though I'm not sure yet whether that will improve speed.

# Same pattern as above, but the repeated groups are written as non-capturing (?:...)
# because their contents are never referenced.
stringr::str_extract(text, "(?:[^\\s]+\\s){3}Verulam(?:\\s[^\\s]+){3}")

I'd use unlist(strsplit) and then index the resulting vector. You could make it a function so that the number of words to fetch pre and post is a flexible parameter:

# Return the words surrounding every occurrence of `look_for` in `text`.
#
# Arguments:
#   text     character; split into words on single whitespace characters.
#   look_for the exact word to search for (compared with `==`).
#   pre      number of words to return before each match (default 3).
#   post     number of words to return after each match (defaults to `pre`).
#
# Value:
#   A list with elements `before` and `after`. Each holds one column per
#   match and one row per context word (a plain character vector when only
#   one word is requested). Positions that fall before the first word or
#   after the last word are NA.
#   If there is no match, a warning 'No matches' is raised instead.
getContext <- function(text, look_for, pre = 3, post = pre) {
  # create vector of words (anything separated by a whitespace character)
  t_vec <- unlist(strsplit(text, '\\s'), use.names = FALSE)

  # find position of matches
  matches <- which(t_vec == look_for)

  # return words before & after if any matches
  if (length(matches) > 0) {
    before <- vapply(matches, function(m) {
      idx <- m - rev(seq_len(pre))
      # Zero/negative indices would drop or exclude elements; map them to NA
      # so every match yields exactly `pre` entries.
      idx[idx < 1] <- NA
      t_vec[idx]
    }, character(pre))

    # Indexing past the end of t_vec already yields NA; seq_len() (unlike
    # `(m + 1):(m + post)`) also behaves correctly when post is 0.
    after <- vapply(matches, function(m) t_vec[m + seq_len(post)],
                    character(post))

    list(before = before, after = after)
  } else {
    warning('No matches')
  }
}

Works for a single match

getContext(text, 'Verulam')

# $before
#      [,1]     
# [1,] "and"    
# [2,] "created"
# [3,] "Baron"  
# 
# $after
#      [,1]     
# [1,] "in"     
# [2,] "1618[4]"
# [3,] "and"   

Also works if there's more than one match

getContext(text, 'he')

# $before
#      [,1]     [,2]           [,3]          [,4]     
# [1,] "After"  "nature."      "in"          "John"   
# [2,] "his"    "Most"         "1621;[3][b]" "Aubrey" 
# [3,] "death," "importantly," "as"          "stating"
# 
# $after
#      [,1]          [,2]     [,3]      [,4]        
# [1,] "remained"    "argued" "died"    "contracted"
# [2,] "extremely"   "this"   "without" "the"       
# [3,] "influential" "could"  "heirs,"  "condition" 

# When no word equals the search term, getContext() takes its else branch
# and only raises a warning.
getContext(text, 'fruitloops')
# Warning message:
#   In getContext(text, "fruitloops") : No matches

If you do not mind triplicating the data, you can build a data.frame, which is normally the most convenient structure to work with in R.

# Build a lookup table of every word in `text` with its neighbours.
#
# Arguments:
#   text  character; only its first element is used. It is split on the
#         literal single-space character.
#
# Value:
#   A data.frame with one row per word and three character columns:
#     words  - the word itself
#     before - the preceding word ('' for the first word)
#     after  - the following word ('' for the last word)
#
# Side effect: prints the vector of split words.
context <- function(text){
  # `fixed = TRUE` spelled out: the shorthand `T` is an ordinary variable
  # that can be reassigned, and the positional form hides which argument it is.
  splittedText <- strsplit(text, ' ', fixed = TRUE)[[1]]
  print(splittedText)

  data.frame(
    words  = splittedText,
    # shift right by one: each row sees the previous word
    before = head(c('', splittedText), -1),
    # shift left by one: each row sees the next word
    after  = tail(c(splittedText, ''), -1),
    # keep columns as character on every R version (the default since R 4.0)
    stringsAsFactors = FALSE
  )
}

Much cleaner IMO:

# Build the words/before/after table once, then query it with subset().
info <- context(text)

# rows whose word is exactly 'Verulam'
print(subset(info, words == 'Verulam'))

# rows whose preceding word is exactly 'Lord'
print(subset(info, before == 'Lord'))

# rows whose word contains at least one digit
print(subset(info, grepl('[[:digit:]]', words)))

#       words before after
# 161 Verulam  Baron    in
#        words before after
# 9 Chancellor   Lord    of
#             words before after
# 43  empiricism.[6]     of   His
# 157           1603     in   and
# 163        1618[4]     in   and
# 169    1621;[3][b]     in    as
# 187          1626,     in  with
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!