如何只读取文字,不读取样式
现在读取出的效果是
<meta name="description" content="诚聘多名毛绒玩具缝纫机工,&lt;p&gt;
&lt;span style=&quot;font-family: 黑体&quot;&gt;&lt;span style=&quot;font-size: 18px&quot;&gt; 本&lt;strong&gt;&lt…玩具批发,毛绒玩具批发,玩具生产,玩具订做厂家,www.yztoy.com">
我想让读取出来的内容变成纯文字,而不是包含类似&lt;p&gt;这样的标签代码,请问该怎么办?
[解决办法]
/// <summary>
/// Removes script blocks, style blocks and every remaining HTML tag from a string,
/// then decodes the &amp;lt; and &amp;gt; entities and trims surrounding whitespace.
/// </summary>
/// <param name="inStr">The text to clean. May be null or empty.</param>
/// <returns>
/// The text with markup removed, or an empty string when <paramref name="inStr"/>
/// is null or empty.
/// </returns>
public static string RemoveHTML(string inStr)
{
    if (string.IsNullOrEmpty(inStr))
    {
        return string.Empty;
    }

    // Flags must be combined with '|'. Combining distinct flags with '&' yields
    // RegexOptions.None, which silently disabled case-insensitive matching.
    // The static Regex.Replace overloads reuse the runtime's regex cache, so the
    // patterns are not re-parsed on every call.
    const RegexOptions options = RegexOptions.IgnoreCase;

    string strOutput = inStr;

    // Drop script and style elements together with their content. The attribute
    // part is "[^>]*": excluding '.' as well would skip tags such as
    // <script src="a.js">.
    // NOTE(review): the literal is kept split as in the original, presumably so the
    // source never contains a complete script tag when embedded in a page - confirm.
    strOutput = Regex.Replace(strOutput, "<scr" + "ipt[^>]*>[\\s\\S]*?</scr" + "ipt>", "", options);
    strOutput = Regex.Replace(strOutput, "<style[^>]*>[\\s\\S]*?</style>", "", options);

    // Strip every remaining tag, including tags that span several lines. After this
    // pass no '<' is followed by a '>' two or more characters later, so the former
    // second pass with "<[^>]+>" could never match and has been removed.
    strOutput = Regex.Replace(strOutput, "<[\\s\\S]+?>", "", options);

    // Decode the escaped angle brackets that are left in the plain text.
    strOutput = strOutput.Replace("&lt;", "<");
    strOutput = strOutput.Replace("&gt;", ">");

    return strOutput.Trim();
}
[解决办法]
/// <summary>
/// Converts an HTML fragment to plain text: removes script elements, comments and
/// tags, collapses line breaks that are followed by whitespace, decodes a small set
/// of character entities and finally removes every CR/LF pair.
/// </summary>
/// <param name="strHtml">The HTML text to clean. May be null or empty.</param>
/// <returns>
/// The plain text, or an empty string when <paramref name="strHtml"/> is null or empty.
/// Numeric character references other than the ones listed in
/// <c>DecodeHtmlEntity</c> are removed rather than decoded.
/// </returns>
public static string StripHTML(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // Structural rules, applied in order. aryRep[i] is the replacement for aryReg[i].
    string[] aryReg =
    {
        // Script elements with their content. "[\s\S]" is used because '.' does not
        // match '\n', which let multi-line scripts through.
        @"<script[^>]*?>[\s\S]*?</script>",
        // Complete comments, including multi-line ones.
        @"<!--[\s\S]*?-->",
        // Opening, closing and declaration tags. Quoted attribute values may contain
        // '>'. Every character is consumed by exactly one alternative, which avoids
        // the exponential backtracking of a nested "(\w+ ...)*?" attribute matcher
        // and also covers hyphenated or unquoted attributes.
        @"<(?:/\s*)?!?(?:\w+:)?\w+(?:""[^""]*""|'[^']*'|[^'"">])*>",
        // A line break followed by further whitespace.
        @"([\r\n])[\s]+",
        // Fallbacks for unbalanced comment markers: a stray closer becomes a line
        // break, and an unclosed opener is removed up to the end of its line.
        @"-->",
        @"<!--.*\n"
    };
    string[] aryRep = { "", "", "", "", "\r\n", "" };

    string strOutput = strHtml;
    for (int i = 0; i < aryReg.Length; i++)
    {
        strOutput = System.Text.RegularExpressions.Regex.Replace(
            strOutput,
            aryReg[i],
            aryRep[i],
            System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    }

    // Decode entities in ONE pass, after all markup has been removed. Decoding
    // "&amp;" in a separate earlier pass turned "&amp;lt;" into "<" (double decoding),
    // and decoding before the comment fallbacks made text such as "--&gt;" look
    // like markup.
    strOutput = System.Text.RegularExpressions.Regex.Replace(
        strOutput,
        @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
        DecodeHtmlEntity,
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);

    strOutput = strOutput.Replace("\r\n", "");
    return strOutput;
}

/// <summary>
/// Maps one matched entity (group 1 holds the name or "#number") to its replacement
/// text. Unknown numeric references map to an empty string.
/// </summary>
private static string DecodeHtmlEntity(System.Text.RegularExpressions.Match match)
{
    switch (match.Groups[1].Value.ToLowerInvariant())
    {
        case "quot":
        case "#34":
            return "\"";
        case "amp":
        case "#38":
            return "&";
        case "lt":
        case "#60":
            return "<";
        case "gt":
        case "#62":
            return ">";
        case "nbsp":
        case "#160":
            return " ";
        case "iexcl":
        case "#161":
            return "\u00A1";
        case "cent":
        case "#162":
            return "\u00A2";
        case "pound":
        case "#163":
            return "\u00A3";
        case "copy":
        case "#169":
            return "\u00A9";
        default:
            return string.Empty;
    }
}