网页格式化排版代码，专用于信息采集后的内容整理

天涯浪子 提交于 2020-02-01 09:00:28
        /// <summary>
        /// Reduces scraped HTML to clean body markup made of bare paragraphs.
        /// Only <c>p</c>, <c>img</c> and <c>strong</c> tags survive: <c>p</c> tags lose all
        /// attributes, <c>img</c> tags are rewritten to <c>&lt;img src='...' /&gt;</c>,
        /// <c>strong</c> tags are kept verbatim, <c>br</c> tags and raw line breaks/tabs become
        /// paragraph boundaries, and every other tag (plus comments and declarations) is removed
        /// while its inner text is kept. Leading whitespace/<c>&amp;nbsp;</c> of each paragraph is
        /// stripped, empty paragraphs are dropped, and each paragraph is indented with two spaces.
        /// </summary>
        /// <param name="content">HTML fragment to clean. May be null or empty.</param>
        /// <returns>The cleaned markup; an empty string when <paramref name="content"/> is null or empty.</returns>
        public static string ClearHtml(string content) {
            if (string.IsNullOrEmpty(content)) {
                return string.Empty;
            }

            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;

            // Handle every tag in ONE pass, deciding by its exact name. Building one regex per
            // discovered tag name matched by prefix ("<i" also hit "<img", "<b" hit "<br",
            // "<s" hit "<strong") and broke on text such as "1 < 2" or "<(".
            // A '<' that is not followed by a letter, '/', '!' or '?' is plain text and is left alone.
            content = Regex.Replace(
                content,
                @"<!--.*?-->|<[!?][^>]*>|<(?<close>/)?(?<name>[a-z][a-z0-9:_-]*)[^>]*>",
                RewriteHtmlTag,
                options);

            // Every run of line breaks or tabs (including the ones produced from <br>) ends the
            // current paragraph and opens a new one.
            content = Regex.Replace(content, @"[\r\n\t]+", "</p><p>").Trim();

            // A break at the very start or end leaves half of a paragraph pair dangling; drop it.
            if (content.StartsWith("</p>", System.StringComparison.Ordinal)) {
                content = content.Substring(4);
            }
            if (content.EndsWith("<p>", System.StringComparison.Ordinal)) {
                content = content.Remove(content.Length - 3);
            }

            // Strip any mix of whitespace and &nbsp; at the start of a paragraph.
            content = Regex.Replace(content, @"<p>(?:\s|&nbsp;)+", "<p>", options);

            // Collapse directly nested openers and closers into a single tag.
            content = Regex.Replace(content, @"(?:<p>\s*)+<p>", "<p>", options);
            content = Regex.Replace(content, @"(?:</p>\s*)+</p>", "</p>", options);

            // Drop paragraphs that contain nothing but whitespace / &nbsp;.
            content = Regex.Replace(content, @"<p>(?:\s|&nbsp;)*</p>", string.Empty, options);

            // Indent the first line of every paragraph.
            return content.Replace("<p>", "<p>  ");
        }

        // Maps one matched tag to its replacement for ClearHtml: normalized <p>/</p>, a
        // src-only <img>, an untouched <strong>, a line break for <br>, or nothing at all.
        private static string RewriteHtmlTag(Match tag) {
            bool closing = tag.Groups["close"].Success;

            // Invariant lowering: culture-sensitive ToLower() turns "IMG" into "ımg" under a
            // Turkish locale, which would make the keep-list comparison fail.
            // Comments and declarations have no "name" group and fall through to default.
            switch (tag.Groups["name"].Value.ToLowerInvariant()) {
                case "p":
                    return closing ? "</p>" : "<p>";

                case "strong":
                    return tag.Value;

                case "br":
                    // Turned into a paragraph boundary by the caller's line-break pass.
                    return "\r\n";

                case "img":
                    if (closing) {
                        return string.Empty;
                    }
                    // Look for a real src attribute inside THIS tag only (not data-src, and never
                    // across tag boundaries); accept double-quoted, single-quoted or bare values.
                    Match src = Regex.Match(
                        tag.Value,
                        @"\ssrc\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s'"">]+))",
                        RegexOptions.IgnoreCase);
                    if (!src.Success) {
                        // No usable src: leave the tag exactly as it was.
                        return tag.Value;
                    }
                    // The output is single-quoted, so an apostrophe in the URL must be escaped.
                    return "<img src='" + src.Groups["url"].Value.Replace("'", "&#39;") + "' />";

                default:
                    return string.Empty;
            }
        }

 

 

剔除了除p、img、strong之外的其他标签,对p、img的各种属性也进行了清除,专门用于生成干净的网页正文,可用于信息采集后的内容整理和格式化排版。自用代码,算法效率可能不高,但是足以满足目前需求了。

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!