Assume I have the following html:
I am
-
In this case you must traverse your document as suggested by this answer. Here's a way of doing it using Jsoup APIs:
- `NodeTraversor` and `NodeVisitor` allow you to traverse the DOM.
- `Node.replaceWith(...)` allows you to replace a node in the DOM.
Here's the code:
public class JsoupReplacer {
public static void main(String[] args) {
so6527876();
}
public static void so6527876() {
String html =
"" +
"" +
"" +
"" +
" " +
" I am going by flying" +
" mr tt
" +
" " +
" " +
" " +
"";
Document doc = Jsoup.parse(html);
final List nodesToChange = new ArrayList();
NodeTraversor nd = new NodeTraversor(new NodeVisitor() {
@Override
public void tail(Node node, int depth) {
if (node instanceof TextNode) {
TextNode textNode = (TextNode) node;
String text = textNode.getWholeText();
String[] words = text.trim().split(" ");
for (String word : words) {
if (word.length() > 4) {
nodesToChange.add(textNode);
break;
}
}
}
}
@Override
public void head(Node node, int depth) {
}
});
nd.traverse(doc.body());
for (TextNode textNode : nodesToChange) {
Node newNode = buildElementForText(textNode);
textNode.replaceWith(newNode);
}
System.out.println("result: ");
System.out.println();
System.out.println(doc);
}
private static Node buildElementForText(TextNode textNode) {
String text = textNode.getWholeText();
String[] words = text.trim().split(" ");
Set longWords = new HashSet();
for (String word : words) {
if (word.length() > 4) {
longWords.add(word);
}
}
String newText = text;
for (String longWord : longWords) {
newText = newText.replaceAll(longWord,
"" + longWord + "");
}
return new DataNode(newText, textNode.baseUri());
}
}