问题
How can I strip the HTML from a document, removing everything between and including the <...> tags, using C? My current program uses curl to fetch the contents of the web page and writes it to a text file; it then reads from the text file and removes the '<' and '>' characters, but I am unsure how to remove everything between those tags.
#include <curl/curl.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#define WEBPAGE_URL "http://homepages.paradise.net.nz/adrianfu/index.html"
#define DESTINATION_FILE "/home/user/data.txt"
/*
 * libcurl write callback: forwards the received chunk to the stdio stream
 * passed through CURLOPT_WRITEDATA.
 *
 * ptr    - start of the received data
 * size   - size of one item
 * nmeb   - number of items
 * stream - the FILE * registered as the write target
 *
 * Returns whatever fwrite() reports, i.e. the number of items written.
 */
size_t write_data( void *ptr, size_t size, size_t nmeb, void *stream)
{
    FILE *sink = stream;
    size_t items_written = fwrite(ptr, size, nmeb, sink);

    return items_written;
}
int main()
{
int in_tag = 0;
char * buffer;
char c;
long lSize;
size_t result;
FILE * file = fopen(DESTINATION_FILE,"w+");
if (file==NULL) {
fputs ("File error",stderr);
exit (1);
}
CURL *handle = curl_easy_init();
curl_easy_setopt(handle,CURLOPT_URL,WEBPAGE_URL); /*Using the http protocol*/
curl_easy_setopt(handle,CURLOPT_WRITEFUNCTION, write_data);
curl_easy_setopt(handle,CURLOPT_WRITEDATA, file);
curl_easy_perform(handle);
curl_easy_cleanup(handle);
int i, nRead, fd;
int source;
char buf[1024];
if((fd = open("data.txt", O_RDONLY)) == -1)
{
printf("Cannot open the file");
}
else
{
nRead = read(fd, buf, 1024);
printf("Original String ");
for(i=0; i<nRead; i++)
{
printf("%c", buf[i]);
}
printf("\nReplaced String ");
for(i=0; i<nRead; i++)
{
if(buf[i]=='<' || buf[i]=='>'){
buf[i]=' ';
}
printf("%c", buf[i]);
}
}
close(source);
return 0;
}
回答1:
Here is just the code that removes everything between the '<' and '>' characters, tags included. It assumes you are dealing with well-formed HTML, meaning that one tag is never nested inside the declaration of another, as in <html < body> >. I am only changing a small portion of your code. I also remove the tags from the buf variable itself, instead of replacing the undesired characters with spaces, because I think this will be more useful to you (correct me if I am wrong).
int idx = 0;
int opened = 0; // false
for(i=0; i<nRead; i++)
{
if(buf[i]=='<') {
opened = 1; // true
} else if (buf[i] == '>') {
opened = 0; // false
} else if (!opened) {
buf[idx++] = buf[i];
}
}
buf[idx] = '\0';
printf("%s\n", buf);
回答2:
This version also handles <script> and <style> elements, HTML comments, and character entities such as &nbsp;:
/**
 * Strips HTML markup from a buffer in place.
 *
 * - Ordinary text is kept.
 * - Each <...> tag is replaced by a single space.
 * - Each character entity (from '&' up to the next ';', e.g. &nbsp;) is
 *   replaced by a single space.
 * - A tag whose name starts with "script" or "style", and a comment opened
 *   with "<!--", is removed together with everything up to and including
 *   "</script>", "</style>" or "-->" respectively; no space is emitted.
 *
 * Matching is case-sensitive. A '&' with no following ';' swallows the rest
 * of the input, as does an unterminated tag, script, style or comment.
 *
 * @param sToClean  buffer to clean; must have room for size + 1 bytes because
 *                  a terminating NUL is always written after the result.
 *                  NULL is accepted and yields 0.
 * @param size      number of input bytes to process
 * @return length of the cleaned text, excluding the terminating NUL
 */
int stripHTMLTags(char *sToClean,size_t size)
{
    enum { IN_TEXT, IN_TAG, IN_ENTITY, IN_RAW } state = IN_TEXT;
    const char *terminator = "";  /* closing sequence awaited in IN_RAW */
    size_t terminator_len = 0;
    size_t mark = 0;              /* first input index of the current tag/raw region */
    size_t out = 0;               /* next write position; never exceeds the read index */

    if (sToClean == NULL) {
        return 0;
    }

    for (size_t i = 0; i < size; i++) {
        const char c = sToClean[i];

        switch (state) {
        case IN_TEXT:
            if (c == '<') {
                state = IN_TAG;
                mark = i + 1;
            } else if (c == '&') {
                state = IN_ENTITY;
            } else {
                sToClean[out++] = c;
            }
            break;

        case IN_TAG: {
            /* Nothing is written while inside a tag, so the bytes from mark
             * to i are still the original input and can be compared directly;
             * no scratch buffer is needed. */
            const size_t seen = i - mark + 1;
            const char *raw_end = NULL;

            if (seen == 6 && memcmp(sToClean + mark, "script", 6) == 0) {
                raw_end = "</script>";
            } else if (seen == 5 && memcmp(sToClean + mark, "style", 5) == 0) {
                raw_end = "</style>";
            } else if (seen == 3 && memcmp(sToClean + mark, "!--", 3) == 0) {
                raw_end = "-->";
            }

            if (raw_end != NULL) {
                state = IN_RAW;
                terminator = raw_end;
                terminator_len = strlen(raw_end);
                mark = i + 1;
            } else if (c == '>') {
                sToClean[out++] = ' ';
                state = IN_TEXT;
            }
            break;
        }

        case IN_ENTITY:
            if (c == ';') {
                sToClean[out++] = ' ';
                state = IN_TEXT;
            }
            break;

        case IN_RAW:
            /* Compare only once enough bytes have been seen, so the window
             * never starts before the raw region. */
            if (i - mark + 1 >= terminator_len &&
                memcmp(sToClean + i + 1 - terminator_len, terminator,
                       terminator_len) == 0) {
                state = IN_TEXT;
            }
            break;

        default:
            break;
        }
    }

    sToClean[out] = '\0';
    return (int)out;
}
来源:https://stackoverflow.com/questions/9444200/c-strip-html-between