I am attempting to replace this god awful collection of regular expressions that is currently used to clean up blocks of poorly formed HTML and stumbled upon the HTML Agility Pa
Once you find the
element use the InnerText method to get the text, Then do the remove and then insert the text.
You could try using AngleSharp instead.
var parser = new HtmlParser();
var document = parser.Parse(html);
using (var writer = new StringWriter())
{
document.ToHtml(writer, new PrettyMarkupFormatter());
return writer.ToString();
}
On HtmlNode, the method RemoveChild has this overload:
public HtmlNode RemoveChild(HtmlNode oldChild, bool keepGrandChildren);
So this is how you would do it:
HtmlDocument doc = new HtmlDocument();
doc.Load("yourfile.htm");
foreach (HtmlNode font in doc.DocumentNode.SelectNodes("//font"))
{
font.ParentNode.RemoveChild(font, true);
}
EDIT: It looks like the Replace w/ keepGrandChildren option is not working as expected, so here is an alternate implementation:
public static HtmlNode RemoveChild(HtmlNode parent, HtmlNode oldChild, bool keepGrandChildren)
{
if (oldChild == null)
throw new ArgumentNullException("oldChild");
if (oldChild.HasChildNodes && keepGrandChildren)
{
HtmlNode prev = oldChild.PreviousSibling;
List<HtmlNode> nodes = new List<HtmlNode>(oldChild.ChildNodes.Cast<HtmlNode>());
nodes.Sort(new StreamPositionComparer());
foreach (HtmlNode grandchild in nodes)
{
parent.InsertAfter(grandchild, prev);
}
}
parent.RemoveChild(oldChild);
return oldChild;
}
// this helper class allows to sort nodes using their position in the file.
private class StreamPositionComparer : IComparer<HtmlNode>
{
int IComparer<HtmlNode>.Compare(HtmlNode x, HtmlNode y)
{
return y.StreamPosition.CompareTo(x.StreamPosition);
}
}