问题
I have an HTML string like this:
<html><body><p>foo <a href='http://www.example.com'>bar</a> baz</p></body></html>
I wish to strip all html tags so that the resulting string becomes:
foo bar baz
From another post here at SO I've come up with this function (which uses the Html Agility Pack):
' Attempts to strip markup from an HTML string: selects every html, body, p and a
' element, removes each one from its parent, and returns what is left of the document.
' This is the version the question reports as producing "bazbarfoo" instead of "foo bar baz".
Public Shared Function stripTags(ByVal html As String) As String
' NOTE(review): plain is assigned but never read in this function.
Dim plain As String = String.Empty
Dim htmldoc As New HtmlAgilityPack.HtmlDocument
htmldoc.LoadHtml(html)
' XPath union: match html, body, p and a elements anywhere in the document.
Dim invalidNodes As HtmlAgilityPack.HtmlNodeCollection = htmldoc.DocumentNode.SelectNodes("//html|//body|//p|//a")
' NOTE(review): htmldoc was created with New above, so this condition is always true.
' Presumably invalidNodes was meant to be checked here - confirm whether SelectNodes
' can return Nothing when no element matches.
If Not htmldoc Is Nothing Then
For Each node In invalidNodes
' The second argument (True) is presumably "keep the removed node's children" - TODO confirm
' against the Html Agility Pack documentation for RemoveChild.
node.ParentNode.RemoveChild(node, True)
Next
End If
' Serialises the content of the document node after the removals.
Return htmldoc.DocumentNode.WriteContentTo
End Function
Unfortunately this does not return what I expect, instead it gives:
bazbarfoo
Where am I going wrong, and is this the best approach?
Regards and happy coding!
UPDATE: Based on the answer below I came up with this function, which might be useful to others:
''' <summary>
''' Returns the text content of an HTML string with all markup removed.
''' A blank line is inserted after every closing p tag and a line break in place of
''' every "&lt;br/&gt;" so that paragraph structure survives as plain-text line breaks.
''' </summary>
''' <param name="html">The HTML markup to convert. Nothing or empty yields an empty string.</param>
''' <returns>The inner text of the parsed document.</returns>
Public Shared Function stripTags(ByVal html As String) As String
    If String.IsNullOrEmpty(html) Then
        Return String.Empty
    End If

    ' Two full newline sequences. New String(Environment.NewLine, 2) would bind to the
    ' String(Char, Integer) constructor and repeat only the first character of the
    ' newline sequence (or fail to compile under Option Strict On).
    Dim paragraphBreak As String = Environment.NewLine & Environment.NewLine

    ' Only the exact spellings "</p>" and "<br/>" are recognised, as in the original.
    Dim prepared As String = html.Replace("</p>", "</p>" & paragraphBreak).Replace("<br/>", Environment.NewLine)

    Dim htmldoc As New HtmlAgilityPack.HtmlDocument
    htmldoc.LoadHtml(prepared)
    Return htmldoc.DocumentNode.InnerText
End Function
回答1:
Why not just return htmldoc.DocumentNode.InnerText
instead of removing all the non-text nodes? It should give you what you want.
回答2:
This class removes the tags and attributes that are not found in the whitelist.
''' <summary>
''' Whitelist-based HTML sanitizer built on Html Agility Pack.
''' Elements whose name is not a key of the whitelist are unwrapped (the tag is removed,
''' its children are kept); attributes not listed for a whitelisted element are removed.
''' </summary>
Public NotInheritable Class HtmlSanitizer

    ' Name assigned to every non-whitelisted element so that a single XPath query can
    ' find them all in the second pass.
    Private Const RemovableNodeName As String = "removeableNode"

    ' Element name -> attribute names allowed on that element (Nothing = no attributes allowed).
    Private Shared ReadOnly Whitelist As IDictionary(Of String, String())

    ' Not instantiable: the class only exposes shared members.
    Private Sub New()
    End Sub

    Shared Sub New()
        Whitelist = New Dictionary(Of String, String())() From { _
            {"a", New String() {"href"}}, _
            {"strong", Nothing}, _
            {"em", Nothing}, _
            {"blockquote", Nothing}, _
            {"b", Nothing}, _
            {"p", Nothing}, _
            {"ul", Nothing}, _
            {"ol", Nothing}, _
            {"li", Nothing}, _
            {"div", New String() {"align"}}, _
            {"strike", Nothing}, _
            {"u", Nothing}, _
            {"sub", Nothing}, _
            {"sup", Nothing}, _
            {"table", Nothing}, _
            {"tr", Nothing}, _
            {"td", Nothing}, _
            {"th", Nothing} _
        }
    End Sub

    ''' <summary>
    ''' Sanitizes an HTML fragment against the whitelist.
    ''' </summary>
    ''' <param name="input">HTML markup. Nothing, empty or whitespace-only yields an empty string.</param>
    ''' <returns>The markup with non-whitelisted tags unwrapped and non-whitelisted attributes removed.</returns>
    Public Shared Function Sanitize(input As String) As String
        If String.IsNullOrWhiteSpace(input) Then
            Return String.Empty
        End If

        Dim document As New HtmlDocument()
        document.LoadHtml(input)

        ' Collected per call: a shared list would grow without bound and leak state from
        ' one call into the next.
        Dim removableNames As New HashSet(Of String)()
        SanitizeNode(document.DocumentNode, removableNames)

        Dim xPath As String = CreateXPath(removableNames)
        Return StripHtml(document.DocumentNode.WriteTo().Trim(), xPath)
    End Function

    ' Sanitizes every child of parentNode, walking the child list from last to first.
    Private Shared Sub SanitizeChildren(parentNode As HtmlNode, removableNames As HashSet(Of String))
        For i As Integer = parentNode.ChildNodes.Count - 1 To 0 Step -1
            SanitizeNode(parentNode.ChildNodes(i), removableNames)
        Next
    End Sub

    ' Marks a non-whitelisted element for removal, or strips disallowed attributes from a
    ' whitelisted one, then recurses into the node's children.
    Private Shared Sub SanitizeNode(node As HtmlNode, removableNames As HashSet(Of String))
        If node.NodeType = HtmlNodeType.Element Then
            If Not Whitelist.ContainsKey(node.Name) Then
                node.Name = RemovableNodeName
                ' Read the name back from the node instead of using the constant, so the
                ' XPath is built from whatever form the node itself reports.
                removableNames.Add(node.Name)
            Else
                RemoveDisallowedAttributes(node)
            End If
        End If

        If node.HasChildNodes Then
            SanitizeChildren(node, removableNames)
        End If
    End Sub

    ' Removes every attribute of a whitelisted element that is not listed for that element.
    Private Shared Sub RemoveDisallowedAttributes(node As HtmlNode)
        If Not node.HasAttributes Then
            Return
        End If

        Dim allowedAttributes As String() = Whitelist(node.Name)

        ' Iterate backwards because attributes are removed while iterating.
        For i As Integer = node.Attributes.Count - 1 To 0 Step -1
            Dim currentAttribute As HtmlAttribute = node.Attributes(i)
            If allowedAttributes Is Nothing OrElse Array.IndexOf(allowedAttributes, currentAttribute.Name) < 0 Then
                node.Attributes.Remove(currentAttribute)
            End If
        Next
    End Sub

    ' Re-parses html and unwraps every node matched by xPath (the node is removed, its
    ' children are kept), then returns the serialised content.
    Private Shared Function StripHtml(html As String, xPath As String) As String
        Dim htmlDoc As New HtmlDocument()
        htmlDoc.LoadHtml(html)

        If xPath.Length > 0 Then
            Dim invalidNodes As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes(xPath)
            ' Guard against a Nothing result (no matching node) before enumerating.
            If invalidNodes IsNot Nothing Then
                For Each node As HtmlNode In invalidNodes
                    node.ParentNode.RemoveChild(node, True)
                Next
            End If
        End If

        Return htmlDoc.DocumentNode.WriteContentTo()
    End Function

    ' Builds an XPath union ("//a|//b|...") that matches every element with one of the
    ' given names; returns an empty string when there are no names.
    Private Shared Function CreateXPath(nodeNames As IEnumerable(Of String)) As String
        Dim parts As New List(Of String)()
        For Each nodeName As String In nodeNames
            parts.Add("//" & nodeName)
        Next
        Return String.Join("|", parts.ToArray())
    End Function

End Class
回答3:
You seem to assume that For Each traverses the document from start to finish. If you want to make sure it does, use a regular For loop. You can't even be sure the nodes are picked up in the order you expect with the XPath selector, although you might be right on this occasion.
regards, Brunis
回答4:
Edit the few lines marked below and you will get the result you want:
''' <summary>
''' Parses html and unwraps every node matched by xPath (the node is removed, its
''' children are kept), then returns the serialised content of the document.
''' </summary>
''' <param name="html">The markup to process.</param>
''' <param name="xPath">XPath selecting the nodes to unwrap; an empty string skips removal.</param>
''' <returns>The remaining markup.</returns>
Private Shared Function StripHtml(html As String, xPath As String) As String
    Dim htmlDoc As New HtmlAgilityPack.HtmlDocument()
    htmlDoc.LoadHtml(html)
    If xPath.Length > 0 Then
        Dim invalidNodes As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes(xPath)
        '------- edit this line -------------------
        'For Each node As HtmlNode In invalidNodes
        'node.ParentNode.RemoveChild(node, True)
        'Next
        '
        ' result-> bazbarfoo
        '
        '------- modify line ----------------------
        ' Guard against a Nothing result (no matching node) before reading Count.
        If invalidNodes IsNot Nothing Then
            ' Walk the matches from last to first.
            For i As Integer = invalidNodes.Count - 1 To 0 Step -1
                Dim Node As HtmlNode = invalidNodes.Item(i)
                Node.ParentNode.RemoveChild(Node, True)
            Next
        End If
        '
        ' result-> foo bar baz
        '
    End If
    Return htmlDoc.DocumentNode.WriteContentTo()
End Function
回答5:
You can use the following code.
/// <summary>
/// Removes everything that looks like a tag ("&lt;" up to the next "&gt;") from the input.
/// This is a purely textual filter, not an HTML parser: a "&gt;" inside an attribute
/// value or a comment ends the match early.
/// </summary>
/// <param name="source">Text that may contain markup; null or empty yields an empty string.</param>
/// <returns>The input with every tag-like span removed.</returns>
public string RemoveHTMLTags(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return string.Empty;
    }

    // Singleline lets '.' match line breaks, so a tag whose attributes are split
    // across several lines is removed as well.
    const string expn = "<.*?>";
    return Regex.Replace(source, expn, string.Empty, RegexOptions.Singleline);
}
来源:https://stackoverflow.com/questions/3140919/stripping-all-html-tags-with-html-agility-pack