How to replace words with span tag using jsoup?

前端 未结 3 1699
耶瑟儿~
耶瑟儿~ 2020-12-10 07:07

Assume I have the following html:





    
I am
相关标签:
3条回答
  • 2020-12-10 07:25

    I think you need to traverse the tree. The result of text() on an Element will be all of the Element's text including text within child elements. Hopefully something like the following code will be helpful to you:

    import java.io.File;
    import java.io.IOException;
    import java.util.StringTokenizer;
    import org.apache.commons.io.FileUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.nodes.Node;
    import org.jsoup.nodes.TextNode;
    
    /**
     * Reads {@code test.html}, re-serialises its body by walking the node tree and
     * wraps every word longer than three characters in a {@code <span>} element.
     * The resulting markup is printed to standard output.
     */
    public class ScreenScrape {

        public static void main(String[] args) throws IOException {
            // Name the encoding explicitly rather than depending on the platform default charset.
            String content = FileUtils.readFileToString(new File("test.html"), "UTF-8");
            Document doc = Jsoup.parse(content);
            Element body = doc.body();

            StringBuilder sb = new StringBuilder();
            traverse(body, sb);

            System.out.println(sb.toString());
        }

        /**
         * Appends the markup for {@code n} and all of its descendants to {@code sb}.
         * Elements are written as an opening tag (name plus attributes), their
         * children, then a closing tag; non-blank text nodes are written through
         * {@link #spanifyText(String)}. Blank text nodes are skipped.
         *
         * @param n  the node to serialise
         * @param sb the buffer receiving the markup
         */
        private static void traverse(Node n, StringBuilder sb) {
            if (n instanceof Element) {
                sb.append('<').append(n.nodeName());
                if (n.attributes().size() > 0) {
                    sb.append(n.attributes().toString());
                }
                sb.append('>');
            }
            if (n instanceof TextNode) {
                TextNode tn = (TextNode) n;
                if (!tn.isBlank()) {
                    // StringTokenizer discards leading and trailing whitespace, which would
                    // glue a word to the neighbouring tag ("going<a>by"). Re-emit a single
                    // space wherever the source text had whitespace at its edges.
                    String whole = tn.getWholeText();
                    if (Character.isWhitespace(whole.charAt(0))) {
                        sb.append(' ');
                    }
                    sb.append(spanifyText(tn.text()));
                    if (Character.isWhitespace(whole.charAt(whole.length() - 1))) {
                        sb.append(' ');
                    }
                }
            }
            for (Node c : n.childNodes()) {
                traverse(c, sb);
            }
            if (n instanceof Element) {
                sb.append("</").append(n.nodeName()).append('>');
            }
        }

        /**
         * Splits {@code text} on whitespace and joins the tokens with single spaces,
         * wrapping each token longer than three characters in {@code <span>} tags.
         *
         * @param text the text to process; may be empty or whitespace only
         * @return the processed text, or an empty string when {@code text} has no tokens
         */
        private static String spanifyText(String text) {
            // NOTE(review): tokens are appended as-is, with no HTML escaping — confirm
            // that is acceptable for the documents this is run on.
            StringBuilder sb = new StringBuilder();
            StringTokenizer st = new StringTokenizer(text);
            while (st.hasMoreTokens()) {
                String token = st.nextToken();
                // Separator goes before every token but the first, so there is no
                // trailing space to trim (trimming threw on input without tokens).
                if (sb.length() > 0) {
                    sb.append(' ');
                }
                if (token.length() > 3) {
                    sb.append("<span>").append(token).append("</span>");
                } else {
                    sb.append(token);
                }
            }
            return sb.toString();
        }

    }
    

    UPDATE

    Using Jonathan's new jsoup method `List<TextNode> Element.textNodes()` and combining it with MarcoS's suggested NodeTraversor/NodeVisitor technique, I came up with the following (although I am modifying the tree whilst traversing it, which is probably a bad idea):

    Document doc = Jsoup.parse(content);
    Element body = doc.body();
    // Visits every node under the body; the work happens in tail(), i.e. after a
    // node's children have been visited. Note that the tree is modified while it
    // is being traversed.
    NodeTraversor nd = new NodeTraversor(new NodeVisitor() {

        /**
         * For each direct text node of an element, replaces the text node with a
         * sequence of text nodes and {@code <span>} elements in which every word
         * longer than three characters is wrapped in its own span. Text nodes
         * without such a word are left untouched.
         */
        @Override
        public void tail(Node node, int depth) {
            if (!(node instanceof Element)) {
                return;
            }
            Element elem = (Element) node;
            for (TextNode tn : elem.textNodes()) {
                String whole = tn.getWholeText();
                ArrayList<Node> changedNodes = new ArrayList<Node>();
                // Collects short words and separators until the next span starts.
                StringBuilder pending = new StringBuilder();
                boolean foundLongWord = false;

                // StringTokenizer drops edge whitespace; keep one space so the first
                // word is not glued to a preceding sibling.
                if (whole.length() > 0 && Character.isWhitespace(whole.charAt(0))) {
                    pending.append(' ');
                }
                StringTokenizer st = new StringTokenizer(tn.text());
                while (st.hasMoreTokens()) {
                    String token = st.nextToken();
                    if (token.length() > 3) {
                        foundLongWord = true;
                        flushText(pending, changedNodes, elem.baseUri());
                        Element span = new Element(Tag.valueOf("span"), elem.baseUri());
                        span.appendText(token);
                        changedNodes.add(span);
                    } else {
                        pending.append(token);
                    }
                    // Separate every pair of adjacent words, whether or not either
                    // of them ended up inside a span.
                    if (st.hasMoreTokens()) {
                        pending.append(' ');
                    }
                }
                if (!foundLongWord) {
                    continue;
                }
                if (Character.isWhitespace(whole.charAt(whole.length() - 1))) {
                    pending.append(' ');
                }
                flushText(pending, changedNodes, elem.baseUri());

                // Swap the first replacement in for the text node, then chain the
                // remaining ones after it in order.
                Node currentNode = changedNodes.remove(0);
                tn.replaceWith(currentNode);
                for (Node n : changedNodes) {
                    currentNode.after(n);
                    currentNode = n;
                }
            }
        }

        /** Moves any buffered plain text into {@code out} as a text node and clears the buffer. */
        private void flushText(StringBuilder pending, ArrayList<Node> out, String baseUri) {
            if (pending.length() > 0) {
                out.add(new TextNode(pending.toString(), baseUri));
                pending.setLength(0);
            }
        }

        @Override
        public void head(Node node, int depth) {
            // Nothing to do on the way down.
        }
    });
    nd.traverse(body);
    System.out.println(body.toString());
    
    0 讨论(0)
  • 2020-12-10 07:26

    I am replacing the word "hello" with the same word wrapped in a span tag:

    Document doc = Jsoup.parse(content);
    Element test = doc.body();
    String word = "hello";
    // Work on individual text nodes rather than calling html(text()) on whole
    // elements: doing that on an ancestor such as <body> replaces all of its
    // child markup with flat text.
    Elements elements = test.getAllElements();
    for (Element element : elements) {
        for (TextNode textNode : element.textNodes()) {
            TextNode remaining = textNode;
            while (remaining != null) {
                int at = remaining.getWholeText().indexOf(word);
                if (at < 0) {
                    break;
                }
                // Isolate the match in a text node of its own ...
                TextNode match = (at == 0) ? remaining : remaining.splitText(at);
                // ... keep whatever follows it for the next round (splitText needs
                // an offset inside the text, so skip it when nothing follows) ...
                remaining = (match.getWholeText().length() > word.length())
                        ? match.splitText(word.length())
                        : null;
                // ... and wrap only the match in the highlighting span.
                match.wrap("<span style=\"color:blue\"></span>");
            }
        }
    }
    
    0 讨论(0)
  • 2020-12-10 07:31

    In this case you must traverse your document as suggested by this answer. Here's a way of doing it using Jsoup APIs:

    • NodeTraversor and NodeVisitor allow you to traverse the DOM
    • Node.replaceWith(...) allows for replacing a node in the DOM

    Here's the code:

    /**
     * Demonstrates wrapping every word longer than {@value #LONG_WORD_THRESHOLD}
     * characters in a {@code <span>} element: the text nodes that need changing
     * are collected during traversal and replaced afterwards.
     */
    public class JsoupReplacer {

      /** A word must be longer than this many characters to be wrapped. */
      private static final int LONG_WORD_THRESHOLD = 4;

      public static void main(String[] args) {
        so6527876();
      }

      /**
       * Parses a fixed sample document, replaces each text node containing a long
       * word with a node carrying the span-wrapped markup, and prints the document.
       */
      public static void so6527876() {
        String html = 
        "<html>" +
        "<head>" +
        "</head>" +
        "<body>" +
        "    <div id=\"wrapper\" >" +
        "         <div class=\"s2\">I am going <a title=\"some title\" href=\"\">by flying</a>" +
        "           <p>mr tt</p>" +
        "         </div> " +
        "    </div>" +
        "</body>    " +
        "</html>";
        Document doc = Jsoup.parse(html);

        // Collected first and replaced after the traversal, so the tree is not
        // modified while it is being walked.
        final List<TextNode> nodesToChange = new ArrayList<TextNode>();

        NodeTraversor nd  = new NodeTraversor(new NodeVisitor() {

          @Override
          public void tail(Node node, int depth) {
            if (node instanceof TextNode) {
              TextNode textNode = (TextNode) node;
              if (containsLongWord(textNode.getWholeText())) {
                nodesToChange.add(textNode);
              }
            }
          }

          @Override
          public void head(Node node, int depth) {
            // Nothing to do on the way down.
          }
        });

        nd.traverse(doc.body());

        for (TextNode textNode : nodesToChange) {
          Node newNode = buildElementForText(textNode);
          textNode.replaceWith(newNode);
        }

        System.out.println("result: ");
        System.out.println();
        System.out.println(doc);
      }

      /**
       * Tells whether {@code text} contains a run of non-whitespace characters
       * longer than {@link #LONG_WORD_THRESHOLD}.
       */
      private static boolean containsLongWord(String text) {
        int run = 0;
        for (int i = 0; i < text.length(); i++) {
          run = Character.isWhitespace(text.charAt(i)) ? 0 : run + 1;
          if (run > LONG_WORD_THRESHOLD) {
            return true;
          }
        }
        return false;
      }

      /**
       * Builds the replacement for {@code textNode}: the same text, whitespace
       * preserved, with every long word wrapped in {@code <span>} tags.
       *
       * @param textNode the text node to convert
       * @return a {@code DataNode} holding the generated markup
       */
      private static Node buildElementForText(TextNode textNode) {
        String text = textNode.getWholeText();
        StringBuilder markup = new StringBuilder(text.length() + 16);
        int pos = 0;
        // Walk the text as alternating runs of whitespace and non-whitespace, so
        // each word is wrapped exactly once and only as a whole word (a global
        // replaceAll would treat the word as a regex and also hit substrings).
        while (pos < text.length()) {
          int start = pos;
          boolean whitespace = Character.isWhitespace(text.charAt(pos));
          while (pos < text.length()
              && Character.isWhitespace(text.charAt(pos)) == whitespace) {
            pos++;
          }
          boolean longWord = !whitespace && (pos - start) > LONG_WORD_THRESHOLD;
          if (longWord) {
            markup.append("<span>");
          }
          appendEscaped(markup, text, start, pos);
          if (longWord) {
            markup.append("</span>");
          }
        }
        // A DataNode is used so the span tags are emitted as markup; everything
        // else therefore has to be escaped by hand above.
        return new DataNode(markup.toString(), textNode.baseUri());
      }

      /** Appends {@code text[from, to)} to {@code out}, escaping {@code &}, {@code <} and {@code >}. */
      private static void appendEscaped(StringBuilder out, String text, int from, int to) {
        for (int i = from; i < to; i++) {
          char c = text.charAt(i);
          switch (c) {
            case '&':
              out.append("&amp;");
              break;
            case '<':
              out.append("&lt;");
              break;
            case '>':
              out.append("&gt;");
              break;
            default:
              out.append(c);
          }
        }
      }

    }
    
    0 讨论(0)
提交回复
热议问题