问题
I can easily remove the element just by calling node.Remove(), like this:
// Load the theme template and delete every <removeMe> element, children included.
HtmlDocument html = new HtmlDocument();
html.Load(Server.MapPath(@"~\Site\themes\default\index.cshtml"));
// SelectNodes returns null (not an empty collection) when nothing matches,
// so guard before iterating to avoid a NullReferenceException.
var matches = html.DocumentNode.SelectNodes("//removeMe");
if (matches != null)
{
    foreach (var item in matches)
    {
        // Remove() detaches the element together with its whole subtree.
        item.Remove();
    }
}
But that removes the innerHtml as well. What if I only want to remove the tag and keep the innerHtml?
Example:
<ul>
<removeMe>
<li>
<a href="#">Keep me</a>
</li>
</removeMe>
</ul>
Any help would be appreciated :)
回答1:
// Parse the markup held in the string `html`.
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);
// SelectSingleNode returns null when no element matches; skip the removal in that case.
var node = doc.DocumentNode.SelectSingleNode("//removeme");
if (node != null)
{
    // Second argument (keepGrandChildren) = true: drop the tag itself but keep its children under the parent.
    node.ParentNode.RemoveChild(node, true);
}
回答2:
This should work:
// Unwrap every <removeMe> element: move its children into the parent at the
// element's own position, then delete the now-empty wrapper.
var wrappers = doc.DocumentNode.SelectNodes("//removeMe");
if (wrappers != null) // SelectNodes yields null when nothing matches
{
    foreach (var item in wrappers)
    {
        // Read the parent at processing time; an earlier unwrap may have re-parented this node.
        var parent = item.ParentNode;

        // Inserting each child directly before the wrapper keeps document order and
        // works whether or not the wrapper has a previous sibling. Rebuilding
        // parent.InnerHtml from strings is avoided on purpose: the parent's markup
        // still contains the wrapper, so the content would end up duplicated.
        foreach (HtmlNode node in item.ChildNodes)
        {
            parent.InsertBefore(node, item);
        }

        item.Remove();
    }
}
回答3:
There is a problem with the bool KeepGrandChildren implementation for people who might have text within the element they are trying to remove. If the removeme tag has text directly inside it, that text will be removed as well. For example, <removeme>text<p>more text</p></removeme>
will become <p>more text</p>
Try this:
/// <summary>
/// Removes <paramref name="node"/> from its parent while keeping all of its child
/// nodes (text nodes included) at the position the element occupied.
/// </summary>
/// <param name="node">Element to unwrap; it must currently be attached to a parent node.</param>
private static void RemoveElementKeepText(HtmlNode node)
{
    // node.ParentNode.RemoveChild(node, true) is deliberately not used here: it is
    // reported to drop text that sits directly inside the removed element.
    HtmlNode parent = node.ParentNode;

    // Anchor every insertion on the element being removed. Inserting after a fixed
    // previous sibling would place each later child ahead of the earlier ones and
    // reverse their order; inserting before the element itself keeps them in sequence
    // and needs no special cases for first/last/only child.
    foreach (HtmlNode child in node.ChildNodes)
    {
        parent.InsertBefore(child, node);
    }

    node.Remove();
}
回答4:
There is a simple way:
// Swap each <br> for the "{1}" placeholder so the line breaks survive the
// InnerText extraction, then turn the placeholders back into <br> tags.
string markedUp = element.InnerHtml.Replace("<br>", "{1}");
element.InnerHtml = markedUp;
string plainText = element.InnerText;
var innerTextWithBR = plainText.Replace("{1}", "<br>");
回答5:
Adding my two cents because none of these approaches handled what I wanted (to remove a set of given tags like p and div, and handle nesting properly while preserving inner tags).
Here's what I came up with; it passes all my unit tests, which cover what I would consider most of the cases I need to deal with:
// Strips every element whose name appears in tagNames from the markup in `html`,
// keeping each stripped element's children in the document.
var htmlDoc = new HtmlDocument();
// Parse the input markup.
htmlDoc.LoadHtml(html);
// Select the descendants whose name is listed in tagNames and visit them in the
// reverse of the order Descendants() yields them.
var tags = (from tag in htmlDoc.DocumentNode.Descendants()
where tagNames.Contains(tag.Name)
select tag).Reverse();
// Unwrap each selected tag: hoist its children into the parent, then drop the tag.
foreach (var item in tags)
{
if (item.PreviousSibling == null)
{
// No previous sibling: prepend the children to the parent. They are walked in
// reverse so that repeated prepending leaves them in their original order.
foreach (HtmlNode node in item.ChildNodes.Reverse())
{
item.ParentNode.PrependChild(node);
}
}
else
{
// Insert the children after the tag's previous sibling.
// NOTE(review): item.PreviousSibling is re-read on every iteration; keeping the
// children in order presumably relies on it pointing at the node just inserted —
// confirm against the library before caching it in a local.
foreach (HtmlNode node in item.ChildNodes)
{
item.ParentNode.InsertAfter(node, item.PreviousSibling);
}
}
// Detach the now-redundant tag from the tree.
item.Remove();
}
// Serialize the transformed document's content back into `html`, trimmed of outer whitespace.
html = htmlDoc.DocumentNode.WriteContentTo().Trim();
Here are the cases I used to test:
/// <summary>A single wrapping tag is stripped, leaving only its text.</summary>
[TestMethod]
public void StripTags_CanStripSingleTag()
{
    var stripped = HtmlUtilities.StripTags("<p>tag</p>", "p");

    Assert.AreEqual("tag", stripped);
}
/// <summary>A tag nested inside another tag of the same name is stripped along with the outer one.</summary>
[TestMethod]
public void StripTags_CanStripNestedTag()
{
    var stripped = HtmlUtilities.StripTags("<p>tag <p>inner</p></p>", "p");

    Assert.AreEqual("tag inner", stripped);
}
/// <summary>Two sibling top-level tags with different names are both stripped.</summary>
[TestMethod]
public void StripTags_CanStripTwoTopLevelTags()
{
    var stripped = HtmlUtilities.StripTags("<p>tag</p> <div>block</div>", "p", "div");

    Assert.AreEqual("tag block", stripped);
}
/// <summary>Different tag names nested two levels deep are all stripped.</summary>
[TestMethod]
public void StripTags_CanStripMultipleNestedTags_2LevelsDeep()
{
    var stripped = HtmlUtilities.StripTags("<p>tag <div>inner</div></p>", "p", "div");

    Assert.AreEqual("tag inner", stripped);
}
/// <summary>Different tag names nested three levels deep are all stripped.</summary>
[TestMethod]
public void StripTags_CanStripMultipleNestedTags_3LevelsDeep()
{
    var stripped = HtmlUtilities.StripTags("<p>tag <div>inner <p>superinner</p></div></p>", "p", "div");

    Assert.AreEqual("tag inner superinner", stripped);
}
/// <summary>Two top-level trees, each containing nested tags, are stripped with text order preserved.</summary>
[TestMethod]
public void StripTags_CanStripTwoTopLevelMultipleNestedTags_3LevelsDeep()
{
    var stripped = HtmlUtilities.StripTags(
        "<p>tag <div>inner <p>superinner</p></div></p> <div><p>inner</p> toplevel</div>",
        "p",
        "div");

    Assert.AreEqual("tag inner superinner inner toplevel", stripped);
}
/// <summary>
/// Tags that are not listed for removal are left untouched, both when nested
/// inside stripped tags and when wrapping them.
/// </summary>
[TestMethod]
public void StripTags_IgnoresTagsThatArentSpecified()
{
    // Unlisted tag nested inside stripped tags.
    var keptInner = HtmlUtilities.StripTags("<p>tag <div>inner <a>superinner</a></div></p>", "p", "div");
    Assert.AreEqual("tag inner <a>superinner</a>", keptInner);

    // Unlisted tag wrapping stripped tags.
    var keptOuter = HtmlUtilities.StripTags("<wrapper><p>tag <div>inner</div></p></wrapper>", "p", "div");
    Assert.AreEqual("<wrapper>tag inner</wrapper>", keptOuter);
}
/// <summary>Both the unclosed (&lt;br&gt;) and self-closing (&lt;br/&gt;) forms of a void tag are stripped.</summary>
[TestMethod]
public void StripTags_CanStripSelfClosingAndUnclosedTagsLikeBr()
{
    var stripped = HtmlUtilities.StripTags("<p>tag</p><br><br/>", "p", "br");

    Assert.AreEqual("tag", stripped);
}
It probably doesn't handle every case, but it works for my needs.
回答6:
Perhaps this might be what you're looking for?
// Unwrap each <removeme>: keep its children where the tag was, then drop the tag.
var targets = html.DocumentNode.SelectNodes("//removeme");
if (targets != null) // SelectNodes yields null when nothing matches
{
    foreach (HtmlNode node in targets)
    {
        HtmlNode parent = node.ParentNode; // <removeme>'s parent at the time it is processed

        // Insert the children before the tag instead of appending them to the parent,
        // so any siblings that follow <removeme> still come after its content.
        foreach (HtmlNode child in node.ChildNodes)
        {
            parent.InsertBefore(child, node);
        }

        node.Remove(); // remove the now-redundant <removeme>
    }
}
Edit: L.B's answer is much cleaner. Go with his!
回答7:
How about this?
// Replace each <removeme> element with a text node that carries its inner markup.
// SelectNodes is called through DocumentNode because `document` must be the
// HtmlDocument itself (it is the object providing CreateTextNode below).
var removedNodes = document.DocumentNode.SelectNodes("//removeme");
if (removedNodes != null) // null when nothing matches
{
    foreach (var rn in removedNodes)
    {
        // NOTE(review): the inner markup is stored as text, so the former children are
        // presumably not element nodes in the DOM until the output is re-parsed — confirm.
        HtmlTextNode innernodes = document.CreateTextNode(rn.InnerHtml);
        rn.ParentNode.ReplaceChild(innernodes, rn);
    }
}
回答8:
Normally the correct expression would be node.ParentNode.RemoveChild(node, true).
Due to an ordering bug in HtmlNode.RemoveChild()
(http://htmlagilitypack.codeplex.com/discussions/79587), I have created a method that is similar. Sorry it's in VB. If anyone wants a translation I'll write one.
'The HTML Agility Pack (1.4.9) includes the HtmlNode.RemoveChild() method but it has an ordering bug with preserving child nodes.
'The below implementation orders children correctly.
''' <summary>
''' Removes <paramref name="node"/> from its parent. When <paramref name="keepChildren"/>
''' is True, the node's children are first re-inserted into the parent right after the node.
''' </summary>
''' <param name="node">The node to remove.</param>
''' <param name="keepChildren">True to keep the node's children in the parent; False to drop them with the node.</param>
Private Shared Sub RemoveNode(node As HtmlAgilityPack.HtmlNode, keepChildren As Boolean)
    Dim container = node.ParentNode
    If keepChildren Then
        ' Walk the children last-to-first: each one is inserted directly after the node,
        ' so the last child goes in first and the first child ends up closest to the node.
        Dim index = node.ChildNodes.Count - 1
        While index >= 0
            container.InsertAfter(node.ChildNodes(index), node)
            index -= 1
        End While
    End If
    node.Remove()
End Sub
I have tested this code with the following test markup:
<removeme>
outertextbegin
<p>innertext1</p>
<p>innertext2</p>
outertextend
</removeme>
The output is:
outertextbegin
<p>innertext1</p>
<p>innertext2</p>
outertextend
回答9:
Do you need to do this with HtmlAgilityPack, or would a regex do? With a regex:
// Strip the <removeMe> wrapper with plain regex replacements, leaving its contents in place.
// The quotes around # are escaped; unescaped, they end the string literal early and the line does not compile.
string html = "<ul><removeMe><li><a href=\"#\">Keep me</a></li></removeMe></ul>";
// Opening tag, with or without attributes. \b keeps longer names such as <removeMeToo>
// from matching, and [^>]* (unlike .*?) also spans attributes broken across lines.
html = Regex.Replace(html, @"<removeMe\b[^>]*>", "", RegexOptions.Compiled);
// Closing tag.
html = Regex.Replace(html, "</removeMe>", "", RegexOptions.Compiled);
来源:https://stackoverflow.com/questions/12092575/html-agility-pack-remove-element-but-not-innerhtml