Simultaneously escape double and single quotes in Xpath

前端 未结 4 952
梦谈多话
梦谈多话 2021-01-12 16:57

Similar to "How to deal with single quote in xpath", I want to escape single quotes. The difference is that I can't exclude the possibility that a double quote might also appear in the target string.

相关标签:
4条回答
  • 2021-01-12 17:19

    I added the cat function to the target inside the html_nodes() function call. Seems to handle both the cases. cat() also has the side-effect of printing the escaped text.

    library(rvest)
    library(magrittr)
    
    # Case 1: the search text contains an apostrophe.
    html <- "<div>1</div><div>Father's son</div>"
    target <- "Father's son"
    # NOTE(review): cat() prints `target` and returns NULL, so paste0() receives
    # nothing for that argument and the XPath becomes //*[contains(text(), "")].
    # An empty needle matches every element, which is why the output below lists
    # 4 nodes (html, body and both divs) rather than only the matching <div>.
    html %>% xml2::read_html() %>% html_nodes(xpath = paste0("//*[contains(text(), \"",cat(target),"\")]"))
    #> Father's son
    #> {xml_nodeset (4)}
    #> [1] <html><body>\n<div>1</div>\n<div>Father's son</div>\n</body></html>
    #> [2] <body>\n<div>1</div>\n<div>Father's son</div>\n</body>
    #> [3] <div>1</div>\n
    #> [4] <div>Father's son</div>
    
    # Case 2: the search text contains double quotes.
    html <- "<div>1</div><div>Father said \"Hello!\"</div>"
    target <- 'Father said "Hello!"'
    # NOTE(review): same effect as above - the first output line is cat()'s
    # printing, and the 4-node result again includes the non-matching
    # <div>1</div>, so the target text is not actually part of the query.
    html %>% xml2::read_html() %>% html_nodes(xpath = paste0("//*[contains(text(), \"",cat(target),"\")]"))
    #> Father said "Hello!"
    #> {xml_nodeset (4)}
    #> [1] <html><body>\n<div>1</div>\n<div>Father said "Hello!"</div>\n</body> ...
    #> [2] <body>\n<div>1</div>\n<div>Father said "Hello!"</div>\n</body>
    #> [3] <div>1</div>\n
    #> [4] <div>Father said "Hello!"</div>
    
    0 讨论(0)
  • 2021-01-12 17:24

    Because you are using string manipulation to build your XPath expression, it's your responsibility that the expression is valid XPath. This expression:

    //*[contains(.,concat('Fat"',"her's son"))]
    

    Selects:

    <div>Fat"her's son</div>
    

    Test it here

    It would be a better approach to use an XPath string variable, but it looks like R doesn't have an API for that, even using libxml.

    0 讨论(0)
  • 2021-01-12 17:39

    use quote() for xpath query

    library(XML)
    

    Only a single quote inside the string:

    # The search text holds an apostrophe but no double quote, so it can be
    # embedded as-is inside a double-quoted XPath string literal.
    target1 <- "Father's son"
    doc1 <- XML::newHTMLDoc()
    # Look up <body> once and attach both <div> children to it
    body1 <- getNodeSet(doc1, "//body")
    newXMLNode("div", 1, parent = body1, doc = doc1)
    newXMLNode("div", target1, parent = body1, doc = doc1)
    # Wrap the target in double quotes inside contains()
    xpath_query1 <- sprintf('//*[ contains(text(), "%s")]', target1)
    getNodeSet(doc1, xpath_query1)
    

    Both single and double quotes inside the string:

    # The search text holds both a double quote and an apostrophe, so no single
    # XPath string literal can contain it.
    target2 <- "Fat\"her's son"
    doc2 <- XML::newHTMLDoc()
    # Build a document whose <body> has two <div> children: 1 and the target text
    newXMLNode("div", 1, parent = getNodeSet(doc2, "//body"), doc = doc2)
    newXMLNode("div", target2, parent = getNodeSet(doc2, "//body"), doc = doc2)
    # The query is hard-coded for this particular target (target2 is not used to
    # build it): concat() joins a single-quoted piece holding the double quote
    # with a double-quoted piece holding the apostrophe.
    # NOTE(review): quote() applied to a string literal returns that string
    # unchanged, so this is equivalent to assigning the string directly.
    xpath_query2 <- quote('//body/*[contains(.,concat(\'Fat"\',"her\'s son"))]')
    getNodeSet(doc2, xpath_query2)
    

    Output:

    getNodeSet(doc1, xpath_query1)
    # [[1]]
    # <div>Father's son</div> 
    # 
    # attr(,"class")
    # [1] "XMLNodeSet"
    
    getNodeSet(doc2, xpath_query2)
    # [[1]]
    # <div>Fat"her's son</div> 
    # 
    # attr(,"class")
    # [1] "XMLNodeSet"
    
    0 讨论(0)
  • 2021-01-12 17:41

    The key here is realising that with xml2 you can write back into the parsed html with html-escaped characters. This function will do the trick. It's longer than it needs to be because I've included comments and some type checking / converting logic.

    # Find the leaf elements whose text contains `find_this`, where `find_this`
    # may hold double quotes (and apostrophes).
    #
    # Double quotes are temporarily replaced by the text "&quot;" in both the
    # search string and the leaf nodes, so the search string can be embedded in a
    # double-quoted XPath literal. The node text is restored before returning.
    #
    # node_set:  an xml_document or an xml_nodeset.
    # find_this: the text to search for.
    # Returns the nodeset produced by html_nodes() for the generated XPath.
    # Stops with an error if `node_set` is neither a document nor a nodeset.
    contains_text <- function(node_set, find_this)
    {
      # Ensure we have a nodeset. inherits() is used rather than comparing
      # class() with ==, which breaks when an object carries several classes.
      if(inherits(node_set, "xml_document"))
        node_set %<>% xml_children()
    
      if(!inherits(node_set, "xml_nodeset"))
        stop("contains_text requires an xml_nodeset or xml_document.")
    
      # Get all leaf nodes (elements that have no element children)
      node_set %<>% html_nodes(xpath = "//*[not(*)]")
    
      # HTML escape the target string
      find_this %<>% {gsub("\"", "&quot;", .)}
    
      # Extract, HTML escape and replace the text of each leaf node in place
      lapply(node_set, function(node) xml_text(node) %<>% {gsub("\"", "&quot;", .)})
    
      # Now we can define the xpath and extract our target nodes; the target no
      # longer contains a double quote, so a double-quoted literal is safe
      xpath <- paste0("//*[contains(text(), \"", find_this, "\")]")
      new_nodes <- html_nodes(node_set, xpath = xpath)
    
      # Since the underlying xml_document is passed by pointer internally,
      # we should unescape any text to leave it unaltered
      xml_text(node_set) %<>% {gsub("&quot;", "\"", .)}
      return(new_nodes)
    }
    

    Now:

    library(rvest)
    library(xml2)
    
    # `html` and `target` are not defined in this snippet; judging by the output
    # below they hold a document and a search text containing both quote types -
    # TODO confirm against where they are defined.
    html %>% xml2::read_html() %>% contains_text(target)
    #> {xml_nodeset (1)}
    #> [1] <div>Fat"her's son</div>
    # Extract the text of the matched node
    html %>% xml2::read_html() %>% contains_text(target) %>% xml_text()
    #> [1] "Fat\"her's son"
    

    ADDENDUM

    This is an alternative method, which is an implementation of the method suggested by @Alejandro but allows arbitrary targets. It has the merit of leaving the xml document untouched, and is a little faster than the above method, but involves the kind of string parsing that an xml library is supposed to prevent. It works by taking the target, splitting it after each " and ', then enclosing each fragment in the opposite type of quote to the one it contains before pasting them all back together with commas and inserting them into an XPath concatenate function.

    library(stringr)
    
    # Build an XPath 1.0 expression that selects every element whose text
    # contains `target`, where `target` may hold any mix of single and double
    # quotes.
    #
    # XPath 1.0 string literals have no escape mechanism, so the target is cut
    # after every quote character; each piece is wrapped in the quote type it
    # does not contain, and the pieces are joined with concat().
    #
    # target: a single, non-missing character string.
    # Returns a character string of the form
    #   //*[contains(text(),concat(<piece>,<piece>,...))]
    # or, when the target contains no quote at all,
    #   //*[contains(text(),"<target>")]
    # because concat() requires at least two arguments.
    safe_xpath <- function(target)
    {
      stopifnot(is.character(target), length(target) == 1L, !is.na(target))
    
      # Each match is a run of non-quote characters ending in exactly one quote.
      # Matches are contiguous from the start of the string, so whatever follows
      # the last one (possibly "") is the quote-free remainder.
      quote_runs <- regmatches(target, gregexpr("[^\"']*[\"']", target))[[1]]
      remainder  <- substring(target, sum(nchar(quote_runs)) + 1L)
      pieces     <- c(quote_runs, remainder)
    
      # A piece ending in a double quote must be single-quoted; every other
      # piece (ending in an apostrophe, or containing no quote) is double-quoted.
      delims   <- ifelse(endsWith(pieces, "\""), "'", "\"")
      literals <- paste0(delims, pieces, delims)
    
      # concat() with a single argument is invalid XPath, so use the lone
      # literal directly when the target contained no quote characters.
      if(length(literals) > 1L)
        expression <- paste0("concat(", paste0(literals, collapse = ","), ")")
      else
        expression <- literals
    
      paste0("//*[contains(text(),", expression, ")]")
    }
    

    Now we can generate a valid xpath like this:

    # `target` is not defined in this snippet; the output below corresponds to
    # the text Fat"her's son, split after each quote character.
    safe_xpath(target)
    #> [1] "//*[contains(text(),concat('Fat\"',\"her'\",\"s son\"))]"
    

    so that

    # Parse `html` (defined outside this snippet) and select nodes with the
    # XPath generated by safe_xpath(); the parsed document is only queried here.
    html %>% xml2::read_html() %>% html_nodes(xpath = safe_xpath(target))
    #> {xml_nodeset (1)}
    #> [1] <div>Fat"her's son</div>
    
    0 讨论(0)
提交回复
热议问题