I am attempting to replace this god-awful collection of regular expressions that is currently used to clean up blocks of poorly formed HTML and stumbled upon the HTML Agility Pack.
On HtmlNode, the method RemoveChild has this overload:
public HtmlNode RemoveChild(HtmlNode oldChild, bool keepGrandChildren);
So this is how you would do it:
// Load the document and strip every <font> element while keeping its content.
HtmlDocument doc = new HtmlDocument();
doc.Load("yourfile.htm");

// SelectNodes returns null (not an empty collection) when the XPath matches
// nothing, so guard before iterating to avoid a NullReferenceException.
var fonts = doc.DocumentNode.SelectNodes("//font");
if (fonts != null)
{
    foreach (HtmlNode font in fonts)
    {
        // Second argument is keepGrandChildren: the children of <font> are meant to stay in the tree.
        font.ParentNode.RemoveChild(font, true);
    }
}
EDIT: It looks like the RemoveChild overload with the keepGrandChildren option does not work as expected, so here is an alternative implementation:
/// <summary>
/// Removes <paramref name="oldChild"/> from <paramref name="parent"/>, optionally
/// re-inserting the removed node's children into <paramref name="parent"/> at the
/// position the removed node occupied.
/// </summary>
/// <param name="parent">The node that currently contains <paramref name="oldChild"/>.</param>
/// <param name="oldChild">The node to remove.</param>
/// <param name="keepGrandChildren">
/// When true and <paramref name="oldChild"/> has child nodes, those children are
/// inserted into <paramref name="parent"/> before <paramref name="oldChild"/> is removed.
/// </param>
/// <returns>The removed node, <paramref name="oldChild"/>.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="parent"/> or <paramref name="oldChild"/> is null.
/// </exception>
public static HtmlNode RemoveChild(HtmlNode parent, HtmlNode oldChild, bool keepGrandChildren)
{
    if (parent == null)
    {
        throw new ArgumentNullException("parent");
    }

    if (oldChild == null)
    {
        throw new ArgumentNullException("oldChild");
    }

    if (oldChild.HasChildNodes && keepGrandChildren)
    {
        // NOTE(review): prev is null when oldChild is the first child; this relies on
        // InsertAfter accepting a null reference node - confirm against the HAP version in use.
        HtmlNode prev = oldChild.PreviousSibling;

        // Snapshot the children into a separate list before moving them, then order
        // them by descending stream position.
        List<HtmlNode> nodes = new List<HtmlNode>(oldChild.ChildNodes.Cast<HtmlNode>());
        nodes.Sort(new StreamPositionComparer());

        // Every grandchild is inserted directly after the same node (prev), so
        // inserting them last-to-first leaves them in their original first-to-last order.
        foreach (HtmlNode grandchild in nodes)
        {
            parent.InsertAfter(grandchild, prev);
        }
    }

    parent.RemoveChild(oldChild);
    return oldChild;
}

/// <summary>
/// Orders nodes by their <c>StreamPosition</c> in descending order
/// (the node with the larger position sorts first).
/// </summary>
private class StreamPositionComparer : IComparer<HtmlNode>
{
    int IComparer<HtmlNode>.Compare(HtmlNode x, HtmlNode y)
    {
        // Operands are deliberately swapped (y vs. x) to get a descending sort.
        return y.StreamPosition.CompareTo(x.StreamPosition);
    }
}