I need to parse HTML code. More specifically, I need to parse each cell of every row in all tables. Each row represents a single object, and each cell represents a different property of that object.
What I meant in my comment was that you're doing in code (the nested loops) what the right XPath expression can do for you. Using LINQ to XML can make this even simpler to write. But now that we see how you want your XML file formatted, we can offer our own answers. I'd write the ParseHtml()
method like so:
/// <summary>
/// Parses the HTML held in <c>htmlCode</c> and saves the stat cells as XML to <c>filepath</c>.
/// One <c>Player</c> element is written per matched table row; the first cell of a row
/// becomes the <c>Rank</c> attribute and the following cells become child elements.
/// </summary>
public void ParseHtml()
{
    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(htmlCode);

    var elementNames = new[] { "Name", "Team", "Pos", "GP", "G", "A", "PlusMinus", "PIM", "PP", "SH", "GW", "OT", "Shots", "ShotPctg", "TOIPerGame", "ShiftsPerGame", "FOWinPctg", "UnknownField" };

    var statsRoot = new XElement("Stats", new XAttribute("Date", DateTime.Now.ToShortDateString()));

    // use the right XPath rather than looping over every node manually;
    // SelectNodes returns null (not an empty collection) when nothing matches
    var rows = htmlDoc.DocumentNode.SelectNodes(@"//tr/tr");
    if (rows != null)
    {
        foreach (var row in rows)
        {
            var cellNodes = row.SelectNodes(@"./td[@class='statBox']");
            if (cellNodes == null)
            {
                // row without stat cells (e.g. a header row): nothing to emit
                continue;
            }

            var cells = cellNodes
                .Select(node => node.InnerText.Trim())
                .ToList();

            // Handling each row separately keeps one row's cells from spilling into
            // another player and keeps rows after the first from being dropped by Zip.
            statsRoot.Add(
                new XElement("Player", new XAttribute("Rank", cells[0]),
                    // generate the elements based on the parsed cells
                    cells.Skip(1)
                         .Zip(elementNames, (value, name) => new XElement(name, value))
                         .Where(element => !String.IsNullOrEmpty(element.Value))
                )
            );
        }
    }

    // save to your file
    statsRoot.Save(filepath);
}
Produces the output:
<?xml version="1.0" encoding="utf-8"?>
<Stats Date="1/3/2011">
<Player Rank="1">
<Name>Sidney Crosby</Name>
<Team>PIT</Team>
<Pos>C</Pos>
<GP>39</GP>
<G>32</G>
<A>33</A>
<PlusMinus>20</PlusMinus>
<PIM>29</PIM>
<PP>10</PP>
<SH>1</SH>
<GW>3</GW>
<Shots>0</Shots>
<ShotPctg>154</ShotPctg>
<TOIPerGame>20.8</TOIPerGame>
<ShiftsPerGame>21:54</ShiftsPerGame>
<FOWinPctg>22.6</FOWinPctg>
<UnknownField>55.7</UnknownField>
</Player>
</Stats>
After looking around MSDN, I finally found a working solution to my problem:
using System;
using HtmlAgilityPack;
using System.Xml;
namespace HockeyStats
{
    /// <summary>
    /// Converts an HTML stats table into an XML file containing one
    /// <c>Player</c> element per table row.
    /// </summary>
    class StatsParser
    {
        // Element names for cells 1..17 of a stat row, in column order.
        // Cell 0 is written as the Rank attribute of the Player element.
        private static readonly string[] elementNames =
        {
            "Name", "Team", "Pos", "GP", "G", "A", "PlusMinus", "PIM", "PP",
            "SH", "GW", "OT", "Shots", "ShotPctg", "TOIPerGame", "ShiftsPerGame", "FOWinPctg"
        };

        private string htmlCode;

        // A fixed yyyy-MM-dd format is used because ToShortDateString() yields
        // values such as "1/3/2011" in many cultures, and '/' is a path separator.
        private static string fileName = "[" + DateTime.Now.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture) + " NHL Stats].xml";

        /// <summary>
        /// Stores the HTML to parse and immediately runs <see cref="ParseHtml"/>.
        /// </summary>
        /// <param name="htmlCode">Raw HTML containing the stats table.</param>
        public StatsParser(string htmlCode)
        {
            this.htmlCode = htmlCode;
            this.ParseHtml();
        }

        /// <summary>
        /// Parses the stored HTML and writes the XML file two directories above
        /// the working directory. Rows without any <c>statBox</c> cells are skipped.
        /// </summary>
        public void ParseHtml()
        {
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(htmlCode);

            // Create an XmlWriterSettings object with the correct options.
            XmlWriterSettings settings = new XmlWriterSettings();
            settings.Indent = true;
            settings.IndentChars = (" ");
            settings.OmitXmlDeclaration = false;

            // 'using' closes the writer even when parsing throws.
            using (XmlWriter writer = XmlWriter.Create(@"..\..\" + fileName, settings))
            {
                writer.WriteStartElement("Stats");
                writer.WriteAttributeString("Date", DateTime.Now.ToShortDateString());

                // Iterate all rows nested within another row.
                // SelectNodes returns null when nothing matches.
                HtmlNodeCollection rows = doc.DocumentNode.SelectNodes(".//tr/tr");
                if (rows != null)
                {
                    foreach (HtmlNode row in rows)
                    {
                        HtmlNodeCollection cols = row.SelectNodes(".//td[@class='statBox']");
                        if (cols == null || cols.Count == 0)
                        {
                            continue;
                        }

                        WritePlayer(writer, cols);
                    }
                }

                writer.WriteEndElement();
                writer.Flush();
            }
        }

        // Writes one complete Player element (start tag, Rank attribute, stat children, end tag).
        private static void WritePlayer(XmlWriter writer, HtmlNodeCollection cols)
        {
            writer.WriteStartElement("Player");
            writer.WriteAttributeString("Rank", cols[0].InnerText.Trim());

            // Never index past the cells actually present in this row.
            int statCount = Math.Min(cols.Count - 1, elementNames.Length);
            for (int i = 0; i < statCount; ++i)
            {
                writer.WriteElementString(elementNames[i], cols[i + 1].InnerText.Trim());
            }

            writer.WriteEndElement();
        }
    }
}
which gives the following XML file as an output:
<?xml version="1.0" encoding="utf-8" ?>
<Stats Date="2011-01-01">
<Player Rank="1">
<Name>Sidney Crosby</Name>
<Team>PIT</Team>
<Pos>C</Pos>
<GP>39</GP>
<G>32</G>
<A>33</A>
<PlusMinus>20</PlusMinus>
<PIM>29</PIM>
<PP>10</PP>
<SH>1</SH>
<GW>3</GW>
<Shots>0</Shots>
<ShotPctg>154</ShotPctg>
<TOIPerGame>20.8</TOIPerGame>
<ShiftsPerGame>21:54</ShiftsPerGame>
<FOWinPctg>22.6</FOWinPctg>
</Player>
</Stats>