Geeks With Blogs

News My Blog has been MOVED to https://mfreidge.wordpress.com
Michael Freidgeim's OLD Blog My Blog has been MOVED to https://mfreidge.wordpress.com

I found an article “C# Validate XHTML” with source code and decided to use it as a starting point for my HTML fragment validation. Unfortunately, there were quite a few things in the original code that didn’t work as I expected or wanted, so I had to spend much more time changing it than I originally thought. Thanks to Sam Allen for his very responsive answers.

 

#region Summary
///////////////////////////////////////////////////////////////////////////////
/*/ $History:  * Sam Allen  http://www.dotnetperls.com/xhtml						Created  * Michael Freidgeim http://geekswithblogs.net/mnf/  31-May-2011	Enhanced /*/
///////////////////////////////////////////////////////////////////////////////
#endregion
#region Namespace Imports
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;
 
#endregion

	/// <summary>
	/// Copied and modified from http://www.dotnetperls.com/xhtml
	/// </summary>
	public class HtmlValidator
	{
		#region Enums
 
		[Flags]
		protected enum FeaturesToValidate
		{
			/// <summary>
			/// "Quoted" attributes: attributes are expected to be double-quoted and unquoted ones
			/// are flagged. NOTE(review): CheckHtml in this file never reads this flag.
			/// </summary>
			QuotedAttributes = 1 << 0,

			/// <summary>
			/// Invalid characters: an ampersand (&amp;) is only allowed as part of an entity.
			/// CheckHtml reports an ampersand that is followed by white space or another ampersand.
			/// </summary>
			AmpersandInCode = 1 << 1,

			/// <summary>
			/// Lowercase markup: all tag names must be lowercase.
			/// </summary>
			LowercaseMarkup = 1 << 2,

			/// <summary>
			/// Solo tags: tags such as img and br must be closed in the same tag in which
			/// they are declared.
			/// </summary>
			SoloTags = 1 << 3,

			/// <summary>
			/// Nested tags: child elements must be closed before their parents.
			/// NOTE(review): CheckHtml in this file never reads this flag.
			/// </summary>
			NestedTags = 1 << 4,

			/// <summary>
			/// Close tags: CheckHtml reports closing tags that match no open tag, tags left
			/// open at the end of the input, and an unequal number of '(' and ')'.
			/// </summary>
			CloseTags = 1 << 5,

			/// <summary>
			/// Tab characters: CheckHtml reports each tab character it meets.
			/// </summary>
			Tab = 1 << 6,
		}
		[Flags]
		private enum TagType
		{
			/// <summary>No slash was classified; the tag is treated as a regular start tag.</summary>
			Unknown = 0,

			/// <summary>The slash directly follows the opening bracket, e.g. &lt;/p&gt;.</summary>
			CloseTag = 1 << 0,

			/// <summary>The slash directly precedes the closing bracket, e.g. &lt;br/&gt;.</summary>
			SoloTag = 1 << 1
		}
 
		#endregion //Enums
		#region Properties
 
		/// <summary>
		/// Solo-tag table used by this instance: tag name mapped to whether the closing slash is required.
		/// Assigned in the constructor.
		/// </summary>
		private Dictionary<string, bool> _soloTags;
		#endregion //Properties
		#region Constructors
		/// <summary>
		/// Creates a validator whose solo-tag table is a copy of the static default table.
		/// </summary>
		public HtmlValidator()
		{
			Dictionary<string, bool> soloTagsCopy = new Dictionary<string, bool>(s_defaultSoloTags);
			_soloTags = soloTagsCopy;
		}
		#endregion //Constructors
		#region Static helpers
 
		/// <summary>
		/// Builds the text of a batch file that runs HTML Tidy (http://www.w3.org/People/Raggett/tidy/)
		/// once for every file in <paramref name="path"/> that matches <paramref name="pattern"/>.
		/// Each generated line appends Tidy's error output to errs.txt.
		/// </summary>
		/// <param name="path">Directory whose files are listed (only that directory, not its subdirectories).</param>
		/// <param name="pattern">Search pattern passed to <see cref="Directory.GetFiles(string, string)"/>.</param>
		/// <param name="tidyPath">Folder containing the tidy executable; empty to rely on the PATH.
		/// A trailing backslash is added when missing.</param>
		/// <param name="tidyOptions">Possible options are listed at http://manpages.ubuntu.com/manpages/dapper/man1/tidy.1.html
		/// Default is
		///  -modify or -m  modify the original input files
		///  -indent or -i  indent element content
		///  -bare   or -b  strip out smart quotes and em dashes, etc.
		///  -quiet  or -q  suppress nonessential output
		///  -wrap &lt;column&gt; wrap text at the specified &lt;column&gt; (default is 68)
		/// </param>
		/// <param name="fileToSave">When not empty, the batch (prefixed with a REM line naming this class)
		/// is also written to this file.</param>
		/// <returns>The generated batch commands, one line per matching file.</returns>
		public static string GenerateTidyBatch(string path, string pattern = "*.htm?", string tidyPath = "", string tidyOptions = "-m -i -b -q -wrap 150", string fileToSave = "")
		{
			string[] files = Directory.GetFiles(path, pattern);
			Debug.WriteLine("The number of files in path {0} {1} is {2}.", path, pattern, files.Length);

			// Normalise the tidy folder once; it does not depend on the file being processed.
			if (!String.IsNullOrEmpty(tidyPath))
			{
				tidyPath = StringHelper.EnsureEndsWith(tidyPath, @"\");
			}

			// Everything in a command line that precedes the file name is the same for all files.
			string commandPrefix = "\"" + tidyPath + "tidy\"  " + tidyOptions + "  \"";

			// The file contents are not needed to build the command line, so the files are not read.
			StringBuilder batch = new StringBuilder();
			foreach (string fileName in files)
			{
				batch.Append(commandPrefix)
					.Append(fileName)
					.Append("\" 2>>errs.txt")
					.Append(Environment.NewLine);
			}

			string batchContent = batch.ToString();
			if (!String.IsNullOrEmpty(fileToSave))
			{
				string fileContent = "REM the file has been generated by " + MethodBase.GetCurrentMethod().DeclaringType.FullName + Environment.NewLine + batchContent;
				StreamHelper.SaveStringToFile(fileContent, fileToSave);
			}
			return batchContent;
		}
		#endregion //Static helpers
		/// <summary>
		/// Runs <see cref="CheckHtmlStructure"/> on every file in <paramref name="path"/> that matches
		/// <paramref name="pattern"/> and collects the findings of the files that have any.
		/// </summary>
		/// <param name="path">Directory whose files are checked.</param>
		/// <param name="pattern">Search pattern passed to <see cref="Directory.GetFiles(string, string)"/>.</param>
		/// <returns>One section per failing file (file name followed by its errors);
		/// an empty string when no file produced errors.</returns>
		public string ValidateHtmlFiles(string path,string pattern="*.htm?")
		{
			string[] matchedFiles = Directory.GetFiles(path, pattern);
			Debug.WriteLine("The number of files in path {0} {1} is {2}.", path, pattern, matchedFiles.Length);

			StringBuilder report = new StringBuilder();
			for (int index = 0; index < matchedFiles.Length; index++)
			{
				string currentFile = matchedFiles[index];
				string content = StreamHelper.FileToString(currentFile);
				string findings = CheckHtmlStructure(content);
				if (string.IsNullOrEmpty(findings))
				{
					// Nothing to report for this file.
					continue;
				}

				report.Append(Environment.NewLine);
				report.Append("file ");
				report.Append(currentFile);
				report.Append(" has errors ");
				report.Append(Environment.NewLine);
				report.Append(findings);
			}
			return report.ToString();
		}
		/// <summary>
		/// Checks <paramref name="sHtml"/> with the CloseTags and NestedTags features enabled.
		/// </summary>
		/// <param name="sHtml">Markup to check.</param>
		/// <returns>The error text produced by CheckHtml; empty when nothing was reported.</returns>
		public string CheckHtmlStructure(string sHtml)
		{
			const FeaturesToValidate structuralChecks = FeaturesToValidate.NestedTags | FeaturesToValidate.CloseTags;
			return CheckHtml(sHtml, structuralChecks);
		}
		/// <summary>
		/// Scans <paramref name="html"/> character by character and describes every problem found
		/// for the checks enabled in <paramref name="flagsToValidate"/>. Scanning continues after a
		/// problem is found, so the result may contain several messages.
		/// </summary>
		/// <param name="html">Markup to check. Null or empty input yields an empty result.</param>
		/// <param name="flagsToValidate">Checks whose findings are reported.</param>
		/// <returns>The concatenated error messages; an empty string when nothing was reported.</returns>
		private string CheckHtml(string html, FeaturesToValidate flagsToValidate)
		{
			// Nothing to scan; also avoids a NullReferenceException on null input.
			if (string.IsNullOrEmpty(html))
			{
				return string.Empty;
			}

			// Names of the start tags that have not been closed yet, innermost on top.
			Stack<string> tags = new Stack<string>();
			StringBuilder error = new StringBuilder();

			// Counts of ')' and '(' seen outside of tag starts.
			int parenthesisR = 0;
			int parenthesisL = 0;

			for (int i = 0; i < html.Length; i++)
			{
				char c = html[i];
				if (c == '<')
				{
					// Look ahead to collect the tag name and whether it is a close or solo tag.
					TagType tagType;
					string tag = LookAheadForTagName(html, i, out tagType);
					bool isClose = tagType.HasFlag(TagType.CloseTag);
					bool isSolo = tagType.HasFlag(TagType.SoloTag);

					// Tag names are expected to be lowercase; the invariant culture keeps the
					// comparison independent of the machine's regional settings.
					if (tag.ToLowerInvariant() != tag && flagsToValidate.HasFlag(FeaturesToValidate.LowercaseMarkup))
					{
						error.Append("upper: ").Append(tag);
					}

					// Single lookup: is the tag in the solo table and, if so, must it carry a closing slash?
					// Some solo tags do not require the slash, e.g. !DOCTYPE.
					bool closingSlashRequired;
					bool isKnownSolo = _soloTags.TryGetValue(tag, out closingSlashRequired);

					if (isKnownSolo && closingSlashRequired)
					{
						// Solo tags that need the slash must be written as solo tags.
						if (!isSolo && flagsToValidate.HasFlag(FeaturesToValidate.SoloTags))
						{
							error.Append("!solo: ").Append(tag);
						}
					}
					else if (isClose)
					{
						if (tags.Count == 0)
						{
							// A tag that is not on the stack cannot be closed. Peek() must not be
							// called here: it throws InvalidOperationException on an empty stack.
							if (flagsToValidate.HasFlag(FeaturesToValidate.CloseTags))
							{
								error.Append("!closing: ").Append(tag);
							}
						}
						else if (tags.Peek() == tag)
						{
							// The closing tag matches the innermost open tag.
							tags.Pop();
						}
						else if (flagsToValidate.HasFlag(FeaturesToValidate.CloseTags))
						{
							// Mismatched closing tag; the stack is left untouched.
							error.Append("!match: ").Append(tag);
						}
					}
					else if (!isKnownSolo)
					{
						// Regular start tag. Solo tags without a required slash are ignored instead.
						tags.Push(tag);
					}

					// Skip ahead by the length of the tag name.
					i += tag.Length;
				}
				else if (c == '&')
				{
					// & must never be followed by white space or another &.
					if ((i + 1) < html.Length)
					{
						char next = html[i + 1];
						if ((char.IsWhiteSpace(next) || next == '&')
							&& flagsToValidate.HasFlag(FeaturesToValidate.AmpersandInCode))
						{
							error.Append("ampersand ");
						}
					}
				}
				else if (c == '\t')
				{
					if (flagsToValidate.HasFlag(FeaturesToValidate.Tab))
					{
						error.Append("tab");
					}
				}
				else if (c == '(')
				{
					parenthesisL++;
				}
				else if (c == ')')
				{
					parenthesisR++;
				}
			}

			if (flagsToValidate.HasFlag(FeaturesToValidate.CloseTags))
			{
				// Anything still on the stack was never closed.
				foreach (string tagName in tags)
				{
					if (tagName != "!DOCTYPE") // allowed to be not closed
					{
						error.Append("extra:").Append(tagName).Append(" ");
					}
				}

				// Require the same number of opening and closing parentheses.
				if (parenthesisL != parenthesisR)
				{
					error.Append("!even parenthesisList");
				}
			}
			return error.ToString();
		}
 
//Look ahead method. The method shown above will call the following method, which deals with the tags themselves. This method deals with 'solo' or self-closing tags. It does this by recording the position of the slash. It also handles quotes. After the method there is a static Dictionary that is used to tell if a tag is a solo tag.
 
/// <summary>
/// Called at the start of an html tag. Scans forward to the closing bracket, collecting the
/// tag name and recording whether the tag is a start tag, a close tag or a solo tag.
/// </summary>
/// <param name="html">Markup being scanned.</param>
/// <param name="start">Index in <paramref name="html"/> at which the tag starts (the opening bracket).</param>
/// <param name="tagType">Receives CloseTag when the slash directly follows the opening bracket,
/// SoloTag when it directly precedes the closing bracket, and Unknown otherwise.</param>
/// <returns>Tag name, or "x" when the markup ends before the closing bracket is found.</returns>
private static string LookAheadForTagName(string html, int start, out TagType tagType)
{
    tagType = TagType.Unknown;
    StringBuilder tagName = new StringBuilder();

    // Offset (relative to start) of the last slash seen outside a quoted area; -1 if none.
    int slashPos = -1;

    // Set once white space has been seen; the tag name ends there.
    bool space = false;

    // Quote character that opened the current quoted area, or '\0' when outside quotes.
    // Both " and ' delimit attribute values, and a quote of the other kind inside a
    // quoted area is ordinary text.
    char openQuote = '\0';

    int i;
    for (i = 0; ; i++)
    {
        int pos = start + i;

        // Don't go outside the html: the tag is unterminated, a placeholder name is returned.
        if (pos >= html.Length)
        {
            return "x";
        }

        char c = html[pos];

        if (char.IsWhiteSpace(c))
        {
            space = true;
        }

        // Collect the name: everything before the first white space except brackets and slashes.
        if (!space && c != '<' && c != '>' && c != '/')
        {
            tagName.Append(c);
        }

        // A slash inside a quoted attribute value (e.g. a URL) is not part of the tag syntax.
        if (c == '/' && openQuote == '\0')
        {
            slashPos = i;
        }

        // End at the > bracket; i is left at its offset.
        if (c == '>')
        {
            break;
        }

        // Enter a quoted area on either quote character; leave it only on the same character.
        if (openQuote == '\0')
        {
            if (c == '\"' || c == '\'')
            {
                openQuote = c;
            }
        }
        else if (c == openQuote)
        {
            openQuote = '\0';
        }
    }

    // Determine whether this is a solo or a closing tag from the position of the slash.
    if (slashPos != -1)
    {
        if (slashPos + 1 == i)
        {
            // Slash directly before '>'.
            tagType |= TagType.SoloTag;
        }
        else if (slashPos == 1)
        {
            // Slash directly after '<'.
            tagType |= TagType.CloseTag;
        }
        else
        {
            Debug.Assert(false, "unexpected in the middle of element");
        }
    }

    return tagName.ToString();
}
 
/// <summary>
/// Default solo-tag table, keyed by tag name. The value tells whether the closing slash is
/// required: tags mapped to true are reported by the SoloTags check when they are not written
/// as solo tags; tags mapped to false are ignored when they open, so they are never expected
/// to be closed.
/// </summary>
private static readonly Dictionary<string, bool> s_defaultSoloTags = new Dictionary<string, bool>()
{
    {"img", true},
    {"br", true},
    {"meta", true},
    // todo: use as special TagType the DOCTYPE. Like at the start at every document just check it exists
    // and then start scanning after it. I don't think it's worth it trying to treat it as a solo tag.
    {"!DOCTYPE", false},
    // todo: use as special TagType
    {"!--", false}
};
	}
Posted on Wednesday, June 1, 2011 6:28 AM | Back to top


Comments on this post: HtmlValidator class

No comments posted yet.
Your comment:
 (will show your gravatar)


Copyright © Michael Freidgeim | Powered by: GeeksWithBlogs.net