Remove HTML tags from a String

后端 未结 30 3240
误落风尘
误落风尘 2020-11-21 07:35

Is there a good way to remove HTML from a Java string? A simple regex like

replaceAll("\\<.*?>", "")


        
30条回答
  •  野趣味
    野趣味 (楼主)
    2020-11-21 07:52

    Here's a slightly more fleshed-out update that tries to handle some formatting for line breaks and lists. I used Amaya's output as a guide.

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;
    import java.util.Stack;
    import java.util.logging.Logger;
    
    import javax.swing.text.MutableAttributeSet;
    import javax.swing.text.html.HTML;
    import javax.swing.text.html.HTMLEditorKit;
    import javax.swing.text.html.parser.ParserDelegator;
    
    /**
     * Converts HTML markup into indented plain text using the Swing HTML parser.
     *
     * <p>Paragraphs and line breaks become newlines, list items are prefixed with
     * either a running number (ordered lists) or {@code "*   "} (everything else),
     * and each open list, list item or definition description adds one level of
     * indentation to subsequent lines.
     *
     * <p>An instance accumulates text across callbacks, so use a fresh instance
     * (or {@link #convert(String)}) for every document.
     */
    public class HTML2Text extends HTMLEditorKit.ParserCallback {
        private static final Logger log = Logger
                .getLogger(Logger.GLOBAL_LOGGER_NAME);

        /** One level of indentation; list-item markers are padded to this width. */
        private static final String INDENT = "    ";

        /** Plain text produced so far. */
        private final StringBuilder text;

        /** Currently open indenting elements (ol, ul, li, dd), innermost on top. */
        private final Stack<IndexType> indentStack;

        /** An open indenting element together with its ordered-list item counter. */
        public static class IndexType {
            public String type;
            public int counter; // used for ordered lists

            public IndexType(String type) {
                this.type = type;
                counter = 0;
            }
        }

        public HTML2Text() {
            text = new StringBuilder();
            indentStack = new Stack<IndexType>();
        }

        /**
         * Converts an HTML string to plain text.
         *
         * @param html the markup to convert; {@code null} is treated as empty input
         * @return the extracted text; if parsing fails, whatever text was produced
         *         before the failure (the failure itself is logged)
         */
        public static String convert(String html) {
            if (html == null) {
                return "";
            }
            HTML2Text parser = new HTML2Text();
            StringReader in = new StringReader(html);
            try {
                parser.parse(in);
            } catch (Exception e) {
                // Best effort: keep the partial text, but log the full exception
                // (not just its message, which may be null) so the cause is visible.
                log.log(java.util.logging.Level.SEVERE,
                        "Failed to convert HTML to text", e);
            } finally {
                in.close();
            }
            return parser.getText();
        }

        /**
         * Parses HTML from the given reader, feeding this instance's callbacks.
         *
         * @param in source of the markup; not closed by this method
         * @throws IOException if reading from {@code in} fails
         */
        public void parse(Reader in) throws IOException {
            ParserDelegator delegator = new ParserDelegator();
            // the third parameter is TRUE to ignore charset directive
            delegator.parse(in, this, Boolean.TRUE);
        }

        @Override
        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            String name = t.toString();
            log.fine("StartTag:" + name);
            if (name.equals("p")) {
                // Separate the paragraph from preceding text with a blank line.
                if (text.length() > 0
                        && text.charAt(text.length() - 1) != '\n') {
                    newLine();
                }
                newLine();
            } else if (name.equals("ol") || name.equals("ul")) {
                indentStack.push(new IndexType(name));
                newLine();
            } else if (name.equals("li")) {
                // A list item outside any list has no parent; render it as a bullet
                // instead of failing on an empty stack.
                IndexType parent = indentStack.isEmpty() ? null : indentStack.peek();
                if (parent != null && parent.type.equals("ol")) {
                    String numberString = (++parent.counter) + ".";
                    text.append(numberString);
                    // Pad the marker so item text lines up with the indentation.
                    for (int i = numberString.length(); i < INDENT.length(); i++) {
                        text.append(' ');
                    }
                } else {
                    text.append("*   ");
                }
                indentStack.push(new IndexType("li"));
            } else if (name.equals("dl") || name.equals("dt")) {
                newLine();
            } else if (name.equals("dd")) {
                indentStack.push(new IndexType("dd"));
                newLine();
            }
        }

        /** Starts a new output line indented by one level per open element. */
        private void newLine() {
            text.append("\n");
            for (int i = 0; i < indentStack.size(); i++) {
                text.append(INDENT);
            }
        }

        /** Closes the innermost indenting element; a stray end tag is ignored. */
        private void popIndent() {
            if (!indentStack.isEmpty()) {
                indentStack.pop();
            }
        }

        @Override
        public void handleEndTag(HTML.Tag t, int pos) {
            String name = t.toString();
            log.fine("EndTag:" + name);
            if (name.equals("p")) {
                newLine();
            } else if (name.equals("ol") || name.equals("ul")
                    || name.equals("li")) {
                popIndent();
                newLine();
            } else if (name.equals("dd")) {
                popIndent();
            }
        }

        @Override
        public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            String name = t.toString();
            log.fine("SimpleTag:" + name);
            if (name.equals("br")) {
                newLine();
            }
        }

        @Override
        public void handleText(char[] text, int pos) {
            log.fine("Text:" + new String(text));
            this.text.append(text);
        }

        /**
         * @return the plain text accumulated so far
         */
        public String getText() {
            return text.toString();
        }

        /** Converts a sample document and prints the result to standard output. */
        public static void main(String[] args) {
            // Sample markup covering paragraphs, line breaks, nested ordered and
            // unordered lists, and definition lists.
            String html = "<html><body>"
                    + "<p>paragraph at start</p>"
                    + "hello<br />What is happening?"
                    + "<p>this is a<br />mutiline paragraph</p>"
                    + "<ol>"
                    + "<li>This</li><li>is</li><li>an</li><li>ordered</li>"
                    + "<li>list<p>with</p>"
                    + "<ul>"
                    + "<li>another</li>"
                    + "<li>list"
                    + "<dl><dt>This</dt><dt>is</dt><dd>sdasd</dd><dd>sdasda</dd>"
                    + "<dd>asda<p>aasdas</p></dd><dd>sdada</dd>"
                    + "<dt>fsdfsdfsd</dt></dl>"
                    + "<dl><dt>vbcvcvbcvb</dt><dt>cvbcvbc</dt>"
                    + "<dd>vbcbcvbcvb</dd><dt>cvbcv</dt></dl>"
                    + "</li>"
                    + "<li>cool</li>"
                    + "</ul>"
                    + "<p>stuff</p></li>"
                    + "<li>cool</li>"
                    + "</ol>"
                    + "</body></html>";
            System.out.println(convert(html));
        }
    }

提交回复
热议问题