Remove HTML tags from a String

后端 未结 30 3197
误落风尘
误落风尘 2020-11-21 07:35

Is there a good way to remove HTML from a Java string? A simple regex like

replaceAll("\\<.*?>", "") …


        
相关标签:
30条回答
  • 2020-11-21 07:48

    This should work -

    use this

      text.replaceAll("<.*?>", " ") -> This replaces every HTML tag with a space.
    

    and this

      text.replaceAll("&.*?;", "") -> This removes every character entity that starts with "&" and ends with ";", such as &nbsp;, &amp;, &gt;, etc.
    
    0 讨论(0)
  • 2020-11-21 07:48

    One way to retain new-line information with JSoup is to precede every line-breaking tag with a dummy marker string, run JSoup, and then replace the marker with "\n".

    // Sample input: two paragraphs followed by loose text and a <br/> break.
    String html = "<p>Line one</p><p>Line two</p>Line three<br/>etc.";
    // Marker text that is assumed never to appear in real content.
    String NEW_LINE_MARK = "NEWLINESTART1234567890NEWLINEEND";
    // Put the marker in front of every tag that should end a line.
    // String.replace matches literally, so only these exact spellings are
    // handled (for example "<br>" or "<br />" would not be matched).
    for (String tag: new String[]{"</p>","<br/>","</h1>","</h2>","</h3>","</h4>","</h5>","</h6>","</li>"}) {
        html = html.replace(tag, NEW_LINE_MARK+tag);
    }
    
    // Let Jsoup strip the markup; the markers are expected to remain in the
    // result as ordinary text.
    String text = Jsoup.parse(html).text();
    
    // Replace "marker + space" first so the space that follows a marker is
    // removed together with it; the second call handles the remaining markers.
    // Each marker becomes two newline characters.
    text = text.replace(NEW_LINE_MARK + " ", "\n\n");
    text = text.replace(NEW_LINE_MARK, "\n\n");
    
    0 讨论(0)
  • 2020-11-21 07:49

    On Android, try this:

    // Android only: Html.fromHtml(...) processes the markup and toString()
    // is used to obtain the plain-text result.
    // NOTE(review): the single-argument fromHtml overload appears to be
    // deprecated on newer Android API levels -- confirm against the target SDK.
    String result = Html.fromHtml(html).toString();
    
    0 讨论(0)
  • 2020-11-21 07:52

    Here's a slightly more fleshed-out update that tries to handle some formatting for breaks and lists. I used Amaya's output as a guide.

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;
    import java.util.Stack;
    import java.util.logging.Logger;
    
    import javax.swing.text.MutableAttributeSet;
    import javax.swing.text.html.HTML;
    import javax.swing.text.html.HTMLEditorKit;
    import javax.swing.text.html.parser.ParserDelegator;
    
    /**
     * Converts HTML to plain text while approximating the layout of
     * paragraphs, line breaks, ordered/unordered lists and definition lists.
     *
     * <p>The class is a Swing {@link HTMLEditorKit.ParserCallback}: the parser
     * reports tags and text, and the callbacks append to an internal buffer.
     * Each open {@code ol}, {@code ul}, {@code li} and {@code dd} element adds
     * one level of four-space indentation after every line break.
     *
     * <p>An instance keeps accumulating output across calls to
     * {@link #parse(Reader)}; {@link #convert(String)} uses a fresh instance
     * for every call.
     */
    public class HTML2Text extends HTMLEditorKit.ParserCallback {
        private static final Logger log = Logger
                .getLogger(Logger.GLOBAL_LOGGER_NAME);

        /** Indentation written once per open nesting level after a line break. */
        private static final String INDENT = "    ";

        /** Width, in characters, that a list-item marker is padded to. */
        private static final int MARKER_WIDTH = 4;

        /** Plain-text output collected so far. */
        private final StringBuilder out;

        /** Currently open elements that contribute one indentation level each. */
        private final Stack<IndexType> indentStack;

        /** Entry of the indentation stack: the element name plus an item counter. */
        public static class IndexType {
            public String type;
            public int counter; // used for ordered lists

            public IndexType(String type) {
                this.type = type;
                counter = 0;
            }
        }

        /** Creates a converter with an empty output buffer. */
        public HTML2Text() {
            out = new StringBuilder();
            indentStack = new Stack<IndexType>();
        }

        /**
         * Converts an HTML string to plain text.
         *
         * <p>Parsing failures are logged and whatever text was produced before
         * the failure is returned.
         *
         * @param html the HTML to convert; {@code null} is treated as empty input
         * @return the extracted text, or an empty string for {@code null} input
         */
        public static String convert(String html) {
            if (html == null) {
                return "";
            }
            HTML2Text parser = new HTML2Text();
            Reader in = new StringReader(html);
            try {
                parser.parse(in);
            } catch (Exception e) {
                // Log the exception itself: getMessage() may be null and would
                // lose the stack trace.
                log.log(java.util.logging.Level.SEVERE,
                        "Failed to convert HTML to text", e);
            } finally {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Closing a StringReader is not expected to fail.
                }
            }
            return parser.getText();
        }

        /**
         * Parses HTML from the given reader and appends its text to this
         * converter's buffer.
         *
         * @param in source of the HTML; not closed by this method
         * @throws IOException if reading from {@code in} fails
         */
        public void parse(Reader in) throws IOException {
            ParserDelegator delegator = new ParserDelegator();
            // the third parameter is TRUE to ignore charset directive
            delegator.parse(in, this, Boolean.TRUE);
        }

        /**
         * Emits the line breaks, indentation and list markers that belong to an
         * opening tag.
         */
        @Override
        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            String name = t.toString();
            // Per-token tracing is debug output, so it is logged at FINE.
            log.fine("StartTag:" + name);
            if (name.equals("p")) {
                // Make sure a paragraph starts on a line of its own.
                if (out.length() > 0 && out.charAt(out.length() - 1) != '\n') {
                    newLine();
                }
                newLine();
            } else if (name.equals("ol") || name.equals("ul")) {
                indentStack.push(new IndexType(name));
                newLine();
            } else if (name.equals("li")) {
                appendListMarker();
                indentStack.push(new IndexType("li"));
            } else if (name.equals("dl") || name.equals("dt")) {
                newLine();
            } else if (name.equals("dd")) {
                indentStack.push(new IndexType("dd"));
                newLine();
            }
        }

        /**
         * Appends the marker for a list item: "N." padded to four characters
         * inside an ordered list, otherwise a bullet. An item reported without
         * any enclosing element on the stack is treated as a bullet item
         * instead of failing with an EmptyStackException.
         */
        private void appendListMarker() {
            IndexType parent = indentStack.isEmpty() ? null : indentStack.peek();
            if (parent != null && parent.type.equals("ol")) {
                String number = (++parent.counter) + ".";
                out.append(number);
                for (int i = number.length(); i < MARKER_WIDTH; i++) {
                    out.append(' ');
                }
            } else {
                out.append("*   ");
            }
        }

        /** Starts a new line and indents it once per open nesting level. */
        private void newLine() {
            out.append("\n");
            for (int i = 0; i < indentStack.size(); i++) {
                out.append(INDENT);
            }
        }

        /** Drops the innermost nesting level; does nothing if none is open. */
        private void popIndent() {
            if (!indentStack.isEmpty()) {
                indentStack.pop();
            }
        }

        /**
         * Closes the nesting level opened by the matching start tag and emits
         * the trailing line break where one applies.
         */
        @Override
        public void handleEndTag(HTML.Tag t, int pos) {
            String name = t.toString();
            log.fine("EndTag:" + name);
            if (name.equals("p")) {
                newLine();
            } else if (name.equals("ol") || name.equals("ul")
                    || name.equals("li")) {
                popIndent();
                newLine();
            } else if (name.equals("dd")) {
                popIndent();
            }
        }

        /** Turns a {@code br} tag into a line break; other simple tags are ignored. */
        @Override
        public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            String name = t.toString();
            log.fine("SimpleTag:" + name);
            if (name.equals("br")) {
                newLine();
            }
        }

        /** Appends a run of document text to the output unchanged. */
        @Override
        public void handleText(char[] text, int pos) {
            log.fine("Text:" + new String(text));
            out.append(text);
        }

        /**
         * Returns the text collected so far.
         *
         * @return the accumulated plain text; never {@code null}
         */
        public String getText() {
            return out.toString();
        }

        /** Demo entry point: converts a sample document and prints the result. */
        public static void main(String args[]) {
            String html = "<html><body><p>paragraph at start</p>hello<br />What is happening?<p>this is a<br />mutiline paragraph</p><ol>  <li>This</li>  <li>is</li>  <li>an</li>  <li>ordered</li>  <li>list    <p>with</p>    <ul>      <li>another</li>      <li>list        <dl>          <dt>This</dt>          <dt>is</dt>            <dd>sdasd</dd>            <dd>sdasda</dd>            <dd>asda              <p>aasdas</p>            </dd>            <dd>sdada</dd>          <dt>fsdfsdfsd</dt>        </dl>        <dl>          <dt>vbcvcvbcvb</dt>          <dt>cvbcvbc</dt>            <dd>vbcbcvbcvb</dd>          <dt>cvbcv</dt>          <dt></dt>        </dl>        <dl>          <dt></dt>        </dl></li>      <li>cool</li>    </ul>    <p>stuff</p>  </li>  <li>cool</li></ol><p></p></body></html>";
            System.out.println(convert(html));
        }
    }
    
    0 讨论(0)
  • 2020-11-21 07:52

    Worth noting that if you're trying to accomplish this in a ServiceStack project, it's already available as a built-in string extension:

    using ServiceStack.Text;
    // ...
    // StripHtml() is called as an extension method on the string literal;
    // presumably it returns the text with the tags removed -- the result is
    // discarded here because the line is only an illustration.
    "The <b>quick</b> brown <p> fox </p> jumps over the lazy dog".StripHtml();
    
    0 讨论(0)
  • 2020-11-21 07:53

    The accepted answer did not work for me for the test case I indicated: the result of "a < b or b > c" is "a b or b > c".

    So, I used TagSoup instead. Here's a shot that worked for my test case (and a couple of others):

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.logging.Logger;
    
    import org.ccil.cowan.tagsoup.Parser;
    import org.xml.sax.Attributes;
    import org.xml.sax.ContentHandler;
    import org.xml.sax.InputSource;
    import org.xml.sax.Locator;
    import org.xml.sax.SAXException;
    import org.xml.sax.XMLReader;
    
    /**
     * Take HTML and give back the text part while dropping the HTML tags.
     *
     * <p>The class registers itself as the SAX {@link ContentHandler} of a
     * TagSoup parser and collects every character run the parser reports; all
     * other SAX events are ignored.
     *
     * <p>There is some risk that using TagSoup means non-HTML text gets
     * altered. However, it seems to work the best so far in test cases.
     *
     * @author dan
     * @see <a href="http://home.ccil.org/~cowan/XML/tagsoup/">TagSoup</a>
     */
    public class Html2Text2 implements ContentHandler {
        /** Text collected during the most recent parse; empty before the first one. */
        private StringBuilder sb = new StringBuilder();

        public Html2Text2() {
        }

        /**
         * Parses the given HTML and replaces any previously collected text
         * with the text of this document.
         *
         * @param str the HTML to parse
         * @throws IOException if the parser fails to read the input
         * @throws SAXException if the parser reports a SAX error
         */
        public void parse(String str) throws IOException, SAXException {
            XMLReader reader = new Parser();
            reader.setContentHandler(this);
            // Start from an empty buffer so repeated parses do not accumulate.
            sb = new StringBuilder();
            reader.parse(new InputSource(new StringReader(str)));
        }

        /**
         * Returns the text collected by the last call to {@link #parse(String)}.
         *
         * @return the collected text; an empty string if nothing was parsed yet
         */
        public String getText() {
            return sb.toString();
        }

        /** Appends the reported character range to the collected text. */
        @Override
        public void characters(char[] ch, int start, int length)
                throws SAXException {
            sb.append(ch, start, length);
        }

        /**
         * Appends the reported whitespace range to the collected text. Only
         * {@code length} characters starting at {@code start} belong to the
         * event, so the rest of the array is not copied.
         */
        @Override
        public void ignorableWhitespace(char[] ch, int start, int length)
                throws SAXException {
            sb.append(ch, start, length);
        }

        // The methods below do not contribute to the text
        @Override
        public void endDocument() throws SAXException {
        }

        @Override
        public void endElement(String uri, String localName, String qName)
                throws SAXException {
        }

        @Override
        public void endPrefixMapping(String prefix) throws SAXException {
        }

        @Override
        public void processingInstruction(String target, String data)
                throws SAXException {
        }

        @Override
        public void setDocumentLocator(Locator locator) {
        }

        @Override
        public void skippedEntity(String name) throws SAXException {
        }

        @Override
        public void startDocument() throws SAXException {
        }

        @Override
        public void startElement(String uri, String localName, String qName,
                Attributes atts) throws SAXException {
        }

        @Override
        public void startPrefixMapping(String prefix, String uri)
                throws SAXException {
        }
    }
    
    0 讨论(0)
提交回复
热议问题