Click here to Skip to main content
65,938 articles
CodeProject is changing. Read more.
Articles / web / HTML

Strip HTML Tags from Text

3.67/5 (2 votes)
25 Mar 2010CPOL 1  
How to strip HTML tags from text

Have you ever needed to display text on a web form with its HTML tags stripped out, while still accepting and saving the HTML that users enter? Sounds confusing?

To make it clear, here is an example. Let's say you have a form with a Rich Text Box (FTB or FCK) that lets users cut and paste content containing HTML tags, so that it can be displayed properly, for instance as a blog article. In some places, however, you want that content shown without its HTML tags, for instance when displaying a summary in a grid. My solution is to strip the HTML tags just before displaying the text wherever that is needed.

So when you copy this...

Hello, World! 

...it won't show on your grid as this:

HTML
<html>
 <head>
 <title>
 Hello World
 </title>
 </head>
 <body>
 <font size ="4" color="blue">
 Hello, World!
 </font>
 </body>
</html>

but as this:

"Hello, World!"

Now, with a mix of Replace and Regular Expressions, I created a class to handle that, and here it is:

C#
/// <summary>
/// Converts HTML to plain text: line breaks and tabs in the input are
/// normalised first, then every rule returned by <c>GetTableDefinition</c>
/// is applied in ascending <c>iID</c> order, and finally long runs of line
/// breaks and tabs are capped.
/// </summary>
/// <param name="sInputString">The HTML text to clean. May be null or empty.</param>
/// <returns>
/// The cleaned text. Null or empty input is returned unchanged. If any step
/// throws, the original input is returned unchanged (best effort).
/// </returns>
public string StripHTML(string sInputString)
{
    // Nothing to clean. Handling this up front means null input no longer
    // depends on the catch block absorbing a NullReferenceException.
    if (string.IsNullOrEmpty(sInputString))
    {
        return sInputString;
    }

    try
    {
        // Initial cleaning step: source line breaks become spaces and source
        // tabs are dropped, so that any '\r' or '\t' present afterwards was
        // produced by a rule (e.g. <br> or <td>) rather than by the markup layout.
        string sOutputString = sInputString
            .Replace("\r", " ")
            .Replace("\n", " ")
            .Replace("\t", string.Empty);

        // Tag removal. The rules are order-dependent, so request them sorted
        // by iID explicitly; setting DefaultView.Sort has no effect on Rows.
        DataTable myDataTable = GetTableDefinition();
        foreach (DataRow drCleaningItem in myDataTable.Select(string.Empty, "iID ASC"))
        {
            string sOriginalString = drCleaningItem["sOriginalString"].ToString();
            string sReplacementString = drCleaningItem["sReplacementString"].ToString();
            sOutputString = Regex.Replace(
                sOutputString, sOriginalString, sReplacementString, RegexOptions.IgnoreCase);
        }

        // Cap runs of line breaks at two and runs of tabs at four. A single
        // quantified replacement handles a run of any length in one pass.
        sOutputString = Regex.Replace(sOutputString, "\r{3,}", "\r\r");
        sOutputString = Regex.Replace(sOutputString, "\t{5,}", "\t\t\t\t");

        return sOutputString;
    }
    catch
    {
        // Deliberate best-effort fallback kept from the original contract:
        // on any failure the caller gets the input back untouched.
        return sInputString;
    }
}

 /// <summary>
 /// Builds the ordered list of cleaning rules used to turn HTML into plain text.
 /// </summary>
 /// <returns>
 /// A table with three columns: <c>iID</c> (the order in which the rule must be
 /// applied), <c>sOriginalString</c> (a regular-expression pattern) and
 /// <c>sReplacementString</c> (the replacement text). Rows are added in
 /// ascending <c>iID</c> order.
 /// </returns>
 /// <remarks>
 /// Tag names in the patterns are lower case; the caller in this file applies
 /// them with <c>RegexOptions.IgnoreCase</c>.
 /// </remarks>
 private DataTable GetTableDefinition()
 {
     DataTable dtCleaningCollection = new DataTable();
     dtCleaningCollection.Columns.Add("iID", typeof(int));
     dtCleaningCollection.Columns.Add("sOriginalString", typeof(string));
     dtCleaningCollection.Columns.Add("sReplacementString", typeof(string));

     // Replace repeating spaces with a single space
     dtCleaningCollection.Rows.Add(1, @"( )+", " ");

     // Normalise the head tags, then drop the head section.
     // \b keeps <header> from being treated as <head>; the lazy .*? stops at
     // the first closing tag instead of the last one.
     dtCleaningCollection.Rows.Add(2, @"<( )*head\b([^>])*>", "<head>");
     dtCleaningCollection.Rows.Add(3, @"(<( )*(/)( )*head( )*>)", "</head>");
     dtCleaningCollection.Rows.Add(4, "(<head>).*?(</head>)", string.Empty);

     // Normalise the script tags, then drop each script block.
     // Lazy matching keeps the visible text that sits between two script blocks.
     dtCleaningCollection.Rows.Add(5, @"<( )*script([^>])*>", "<script>");
     dtCleaningCollection.Rows.Add(6, @"(<( )*(/)( )*script( )*>)", "</script>");
     dtCleaningCollection.Rows.Add(7, @"(<script>).*?(</script>)", string.Empty);

     // Normalise the style tags, then drop each style block (lazy, as above)
     dtCleaningCollection.Rows.Add(8, @"<( )*style([^>])*>", "<style>");
     dtCleaningCollection.Rows.Add(9, @"(<( )*(/)( )*style( )*>)", "</style>");
     dtCleaningCollection.Rows.Add(10, "(<style>).*?(</style>)", string.Empty);

     // Replace <td> with a tab
     dtCleaningCollection.Rows.Add(11, @"<( )*td([^>])*>", "\t");

     // Replace <br> and <li> with a line break. Attributes and the
     // self-closing form (<br/>, <br />) are accepted; \b keeps <link> out.
     dtCleaningCollection.Rows.Add(12, @"<( )*br\b([^>])*>", "\r");
     dtCleaningCollection.Rows.Add(13, @"<( )*li\b([^>])*>", "\r");

     // Replace <p>, <div> and <tr> with a double line break. \b restricts each
     // rule to that exact tag name (so <pre>, <param>, <track> are not matched).
     dtCleaningCollection.Rows.Add(14, @"<( )*div\b([^>])*>", "\r\r");
     dtCleaningCollection.Rows.Add(15, @"<( )*tr\b([^>])*>", "\r\r");
     dtCleaningCollection.Rows.Add(16, @"<( )*p\b([^>])*>", "\r\r");

     // Remove any remaining tags enclosed in < >
     dtCleaningCollection.Rows.Add(17, @"<[^>]*>", string.Empty);

     // Replace special characters.
     // Non-breaking space, as an entity or as the literal U+00A0 character,
     // becomes a normal space so that adjacent words are not glued together.
     dtCleaningCollection.Rows.Add(18, @"(&nbsp;|\u00A0)", " ");
     dtCleaningCollection.Rows.Add(19, @"&bull;", " * ");
     dtCleaningCollection.Rows.Add(20, @"&lsaquo;", "<");
     dtCleaningCollection.Rows.Add(21, @"&rsaquo;", ">");
     dtCleaningCollection.Rows.Add(22, @"&trade;", "(tm)");
     dtCleaningCollection.Rows.Add(23, @"&frasl;", "/");
     dtCleaningCollection.Rows.Add(24, @"&lt;", "<");
     dtCleaningCollection.Rows.Add(25, @"&gt;", ">");
     dtCleaningCollection.Rows.Add(26, @"&copy;", "(c)");
     dtCleaningCollection.Rows.Add(27, @"&reg;", "(r)");
     dtCleaningCollection.Rows.Add(28, @"&frac14;", "1/4");
     dtCleaningCollection.Rows.Add(29, @"&frac12;", "1/2");
     dtCleaningCollection.Rows.Add(30, @"&frac34;", "3/4");
     dtCleaningCollection.Rows.Add(31, @"&lsquo;", "'");
     dtCleaningCollection.Rows.Add(32, @"&rsquo;", "'");
     dtCleaningCollection.Rows.Add(33, @"&ldquo;", "\"");
     dtCleaningCollection.Rows.Add(34, @"&rdquo;", "\"");

     // Remove every remaining character entity that has no replacement above
     // (such entities, e.g. &amp;, are deleted rather than decoded). Only real
     // entity syntax is matched (named, decimal or hexadecimal) so that plain
     // text such as "R&D now;" is left alone.
     dtCleaningCollection.Rows.Add(
         35, @"&(#[0-9]+|#[xX][0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]{1,7});", string.Empty);

     // Remove spaces and tabs stranded between line breaks and tabs
     dtCleaningCollection.Rows.Add(36, "(\r)( )+(\r)", "\r\r");
     dtCleaningCollection.Rows.Add(37, "(\t)( )+(\t)", "\t\t");
     dtCleaningCollection.Rows.Add(38, "(\t)( )+(\r)", "\t\r");
     dtCleaningCollection.Rows.Add(39, "(\r)( )+(\t)", "\r\t");
     dtCleaningCollection.Rows.Add(40, "(\r)(\t)+(\r)", "\r\r");
     dtCleaningCollection.Rows.Add(41, "(\r)(\t)+", "\r\t");

     return dtCleaningCollection;
 }

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)