You could try to strip the HTML tags, but this might not work properly if there is other content outside the tags that you don't want to keep (that would require more specific filtering, e.g. checking the name of the surrounding tag).
Untested but should work:
char *html = ...; // html being a pointer to the document's contents
int ip = 0; // the input position
int op = 0; // the ouput position
int in_tag = 0; // are we inside a html tag?
char c; // current character
while(c = html[ip++])
{
if(c == '<')
in_tag = 1;
else if(c == '>')
in_tag = 0;
else if(c == '\n' || c == '\r') // strip line breaks
;
else if(!in_tag)
html[op++] = c;
}
html[op] = '\0';