C#除去HTML标记

2013-09-30

C#去掉HTML标记：使用 System.Text.RegularExpressions 命名空间中的正则表达式，去除字符串中的HTML标记。

C#去掉HTML标记

using System.Text.RegularExpressions;

/// <summary>
/// Strips HTML markup from a string and decodes a small set of common entities.
/// </summary>
/// <remarks>
/// The input is passed through an ordered table of regular-expression
/// replacements (script blocks, tags, line-break runs, entities, comments),
/// all matched case-insensitively. Afterwards any remaining angle brackets
/// and CR+LF pairs are removed, so angle brackets produced by decoding
/// the lt/gt entities are dropped as well.
/// Numeric character references not listed in the table are deleted, not decoded.
/// </remarks>
/// <param name="strHtml">Source text that may contain HTML; may be null or empty.</param>
/// <returns>The text with markup removed; an empty string for null or empty input.</returns>
public static string StripHTML(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // aryReg[i] is replaced by aryRep[i]; the order of the entries matters.
    // NOTE(review): the published snippet had plain spaces where line-break
    // escapes are expected ("([ ])[\s]+", "<!--.* ", and the replacement for
    // "-->"). They are restored here as \r\n / \n, which is what makes the
    // last two entries work together - confirm against the intended original.
    string[] aryReg =
    {
        // (?s) lets '.' cross line breaks so multi-line script bodies are removed.
        @"(?s)<script[^>]*?>.*?</script>",
        // Opening/closing tags, optionally namespaced, with quoted or bare attributes.
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
        // A line break followed by further whitespace.
        @"([\r\n])[\s]+",
        @"&(quot|#34);",
        @"&(amp|#38);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",
        // Any other numeric character reference is dropped.
        @"&#(\d+);",
        // Terminate each comment with a line break ...
        @"-->",
        // ... so that the comment can be removed up to that line break.
        @"<!--.*\n"
    };

    string[] aryRep =
    {
        "",
        "",
        "",
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\u00A1", // inverted exclamation mark, chr(161)
        "\u00A2", // cent sign, chr(162)
        "\u00A3", // pound sign, chr(163)
        "\u00A9", // copyright sign, chr(169)
        "",
        "\r\n",
        ""
    };

    string strOutput = strHtml;
    for (int i = 0; i < aryReg.Length; i++)
    {
        // The static overload reuses the framework's regex cache instead of
        // constructing a new Regex object for every pattern on every call.
        strOutput = Regex.Replace(strOutput, aryReg[i], aryRep[i], RegexOptions.IgnoreCase);
    }

    // string.Replace returns a new string; the result has to be assigned back.
    strOutput = strOutput.Replace("<", "");
    strOutput = strOutput.Replace(">", "");
    strOutput = strOutput.Replace("\r\n", "");

    return strOutput;
}

热点排行

CSS

C#除去HTML标记