C# HTML Parser
CodeKeep C# Feed Gennaio 28th, 2008
Description: Basic HTML parsing. Handles HTML that is not in XML format. No explicit handling for empty tags, non-standards-compliant text, etc. Link: http://www.codekeep.net/snippets/5e05b0e4-572c-4c52-87a8-6f3bf4bbc0b3.aspx
//================================================================================
//ContentsRetrievedEventArgs.cs
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using net.wattenbarger.Utilities.Regex;
namespace net.wattenbarger.Utilities {
/// <summary>
/// Event data that carries one piece of retrieved text to event subscribers.
/// </summary>
public class ContentsRetrievedEventArgs : EventArgs {
    // Set once by the constructor and only ever read through Contents.
    private readonly string _text;

    /// <summary>
    /// Gets the text that was supplied when this instance was created.
    /// </summary>
    public string Contents {
        get { return this._text; }
    }

    /// <summary>
    /// Creates the event data. The constructor is not public, so instances can
    /// only be created from this assembly or from derived types.
    /// </summary>
    /// <param name="contents">Text to expose through <see cref="Contents"/>; stored as given.</param>
    protected internal ContentsRetrievedEventArgs ( string contents ) {
        this._text = contents;
    }
}
}
//================================================================================
//HTMLParser.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using net.wattenbarger.Utilities.Regex;
namespace net.wattenbarger.Utilities {
/// <summary>
/// Singleton, regex-based scanner for tagged text. It walks a string, raises
/// <see cref="TagProcessed"/> for every tag it matches and raises
/// <see cref="ContentsRetrieved"/> for the non-blank text that follows each tag.
/// </summary>
public class HtmlParser {
    // Built by the type initializer, which the runtime executes at most once,
    // so Instance needs neither a null check nor a lock.
    private static readonly HtmlParser _instance = new HtmlParser ( );

    /// <summary>Raised once for every tag matched by <see cref="ParseTaggedText"/>.</summary>
    public event EventHandler<TagProcessedEventArgs> TagProcessed;

    /// <summary>
    /// Raised for the trimmed, non-empty text that follows a tag, up to the
    /// next tag or the end of the input.
    /// </summary>
    public event EventHandler<ContentsRetrievedEventArgs> ContentsRetrieved;

    private HtmlParser ( ) { }

    /// <summary>Gets the single shared parser instance.</summary>
    public static HtmlParser Instance {
        get { return _instance; }
    }

    /// <summary>
    /// Rewrites every <c>AttributeRegex</c> match in <paramref name="fragment"/>
    /// as capture group 1 followed by capture group 2 wrapped in double quotes.
    /// </summary>
    /// <param name="fragment">Text to rewrite.</param>
    /// <returns>The rewritten text.</returns>
    public string ConvertAttributes ( string fragment ) {
        AttributeRegex re = new AttributeRegex ( );
        return re.Replace ( fragment, "${1}\"${2}\"" );
    }

    /// <summary>
    /// Scans <paramref name="fragment"/> for tags in a single pass. For each
    /// tag, <see cref="TagProcessed"/> is raised first, then
    /// <see cref="ContentsRetrieved"/> with the text that follows the tag
    /// (skipped when that text is empty after trimming).
    /// </summary>
    /// <remarks>
    /// Text that follows the last tag is reported like any other contents.
    /// Text that precedes the first tag has no owning tag and is not reported.
    /// </remarks>
    /// <param name="fragment">Tagged text to scan.</param>
    public void ParseTaggedText ( string fragment ) {
        TagRegex reTag = new TagRegex ( );
        Match previous = null;
        foreach ( Match current in reTag.Matches ( fragment ) ) {
            // The contents of the previous tag end where the current tag begins.
            // Tracking the previous match avoids calling Match.NextMatch(), which
            // would run the regex a second time for every tag.
            if ( previous != null ) {
                this.OnContentsRetrieved (
                    GetContents ( fragment, previous, current.Index ) );
            }
            this.OnTagProcessed ( current );
            previous = current;
        }
        // The last tag's contents run to the end of the input.
        if ( previous != null ) {
            this.OnContentsRetrieved (
                GetContents ( fragment, previous, fragment.Length ) );
        }
    }

    /// <summary>
    /// Returns the trimmed text between the end of tag <paramref name="m"/> and
    /// <paramref name="endIndex"/>.
    /// </summary>
    private static string GetContents ( string fragment, Match m, int endIndex ) {
        int startContents = m.Index + m.Length;
        return fragment.Substring ( startContents,
            endIndex - startContents ).Trim ( );
    }

    /// <summary>Raises <see cref="ContentsRetrieved"/> if anyone is subscribed.</summary>
    /// <param name="e">Event data to pass to subscribers.</param>
    protected virtual void OnContentsRetrieved (
        ContentsRetrievedEventArgs e ) {
        // Copy to a local so a concurrent unsubscribe cannot null the delegate
        // between the check and the call.
        EventHandler<ContentsRetrievedEventArgs> handler = this.ContentsRetrieved;
        if ( handler != null ) {
            handler ( this, e );
        }
    }

    // Wraps non-empty contents in event data and raises ContentsRetrieved.
    private void OnContentsRetrieved ( string contents ) {
        if ( contents.Length > 0 ) {
            this.OnContentsRetrieved (
                new ContentsRetrievedEventArgs ( contents ) );
        }
    }

    /// <summary>Raises <see cref="TagProcessed"/> if anyone is subscribed.</summary>
    /// <param name="e">Event data to pass to subscribers.</param>
    protected virtual void OnTagProcessed (
        TagProcessedEventArgs e ) {
        // Local copy for the same reason as in OnContentsRetrieved.
        EventHandler<TagProcessedEventArgs> handler = this.TagProcessed;
        if ( handler != null ) {
            handler ( this, e );
        }
    }

    // Wraps a tag match in event data and raises TagProcessed.
    private void OnTagProcessed ( Match m ) {
        this.OnTagProcessed ( new TagProcessedEventArgs ( m ) );
    }
}
}
//================================================================================
//TagProcessedEventArgs.cs
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using net.wattenbarger.Utilities.Regex;
namespace net.wattenbarger.Utilities {
/// <summary>
/// Event data describing one matched tag: its full inner text, its name and
/// whether it is an end tag.
/// </summary>
public class TagProcessedEventArgs : EventArgs {
    private readonly string _name;
    private readonly string _text;
    private readonly bool _closing;

    /// <summary>
    /// Builds the event data from a successful match. Capture group 1 becomes
    /// <see cref="TagText"/> and capture group 2 becomes <see cref="TagName"/>.
    /// </summary>
    /// <param name="m">A match whose groups 1 and 2 hold the tag text and tag name.</param>
    /// <exception cref="ApplicationException">
    /// <paramref name="m"/> is not a successful match.
    /// </exception>
    protected internal TagProcessedEventArgs ( Match m ) {
        // Reject failed matches up front so the rest reads straight through.
        if ( !m.Success ) {
            throw new ApplicationException ( "Invalid tag." );
        }

        string text = m.Result ( "$1" );
        _text = text;
        _name = m.Result ( "$2" );
        // A tag is an end tag when its text begins with a slash.
        _closing = text [ 0 ] == '/';
    }

    /// <summary>Gets the value of capture group 2 of the match.</summary>
    public string TagName {
        get { return _name; }
    }

    /// <summary>Gets the value of capture group 1 of the match.</summary>
    public string TagText {
        get { return _text; }
    }

    /// <summary>Gets whether <see cref="TagText"/> starts with '/'.</summary>
    public bool IsEndTag {
        get { return _closing; }
    }
}
}
//================================================================================
//RegularExpressionsDefinition.cs
//Compile this file by itself and run it to generate the HtmlParser.RegularExpressions
//assembly, then reference that assembly from the build containing the three classes above.
#define DEBUG
#define TRACE
using System;
using System.Collections.Generic;
using System.Reflection;
using System.Reflection.Emit;
using System.Text;
using System.Text.RegularExpressions;
[assembly:AssemblyVersion ( "1.0.0.0" ) ]
[assembly:AssemblyProduct ( "Regular Expressions for HTML Parser" ) ]
namespace net.wattenbarger.Utilities.RegularExpressions {
/// <summary>
/// Build-time tool that defines the named regular expressions used by the HTML
/// parser and compiles them into a separate assembly via
/// <see cref="Regex.CompileToAssembly(RegexCompilationInfo[], AssemblyName, CustomAttributeBuilder[])"/>.
/// </summary>
/// <remarks>
/// NOTE: per the .NET documentation, Regex.CompileToAssembly is only supported
/// on .NET Framework; on .NET Core / .NET 5+ it throws
/// PlatformNotSupportedException.
/// </remarks>
public class RegularExpressionsDefinition {
    // Regex type name -> pattern text.
    private readonly Dictionary<string, string> _patternDictionary;
    // Regex type name -> compilation settings built from the pattern.
    private readonly Dictionary<string, RegexCompilationInfo> _rciDictionary;

    /// <summary>Creates a definition with no patterns loaded yet.</summary>
    public RegularExpressionsDefinition ( ) {
        _patternDictionary
            = new Dictionary<string, string> ( );
        _rciDictionary
            = new Dictionary<string, RegexCompilationInfo> ( );
    }

    /// <summary>
    /// Registers the three built-in patterns (AttributeRegex, TagRegex,
    /// NewLineRegex). Safe to call more than once.
    /// </summary>
    public void LoadPatterns ( ) {
        // Indexer assignment rather than Add: LoadRegularExpressions and
        // CreateRegexAssembly both end up here, so a caller combining the public
        // methods would otherwise get an ArgumentException for a duplicate key.
        _patternDictionary [ "AttributeRegex" ]
            = @"(\w+\=)((\w|/)+)";
        _patternDictionary [ "TagRegex" ]
            = "(?s:\\<((/?\\w+).*?)\\>)";
        _patternDictionary [ "NewLineRegex" ]
            = @"(\r\n)|\r|\n";
    }

    /// <summary>
    /// Loads the patterns and builds one public, compiled
    /// <see cref="RegexCompilationInfo"/> per pattern in the
    /// <c>net.wattenbarger.Utilities.Regex</c> namespace. Safe to call more than once.
    /// </summary>
    public void LoadRegularExpressions ( ) {
        this.LoadPatterns ( );
        foreach ( KeyValuePair<string, string> kvp
            in _patternDictionary ) {
            RegexCompilationInfo rci
                = new RegexCompilationInfo ( kvp.Value,
                    RegexOptions.Compiled,
                    kvp.Key, "net.wattenbarger.Utilities.Regex", true );
            // Overwrite instead of Add so repeated loading stays idempotent.
            _rciDictionary [ kvp.Key ] = rci;
        }
    }

    // Builds the assembly-level attributes (version and product name) that are
    // stamped onto the generated regex assembly.
    private CustomAttributeBuilder [ ] CreateAttributes ( ) {
        List<CustomAttributeBuilder> attributeBuilders
            = new List<CustomAttributeBuilder> ( );
        CustomAttributeBuilder cab;
        cab = new CustomAttributeBuilder (
            typeof ( AssemblyVersionAttribute ).GetConstructor (
                new Type [ ] { typeof ( string ) } ),
            new object [ ] { "1.0.0.0" } );
        attributeBuilders.Add ( cab );
        cab = new CustomAttributeBuilder (
            typeof ( AssemblyProductAttribute ).GetConstructor (
                new Type [ ] { typeof ( string ) } ),
            new object [ ] { "HtmlParser Regular Expressions" } );
        attributeBuilders.Add ( cab );
        return attributeBuilders.ToArray ( );
    }

    /// <summary>
    /// Loads all regular expressions and compiles them into an assembly named
    /// <c>HtmlParser.RegularExpressions</c>.
    /// </summary>
    public void CreateRegexAssembly ( ) {
        this.LoadRegularExpressions ( );
        List<RegexCompilationInfo> rciList
            = new List<RegexCompilationInfo> ( _rciDictionary.Values );
        RegexCompilationInfo [ ] compilationInfos
            = rciList.ToArray ( );
        AssemblyName an
            = new AssemblyName ( "HtmlParser.RegularExpressions" );
        Regex.CompileToAssembly ( compilationInfos, an,
            this.CreateAttributes ( ) );
    }

    /// <summary>Entry point: generates the regex assembly.</summary>
    public static void Main ( ) {
        RegularExpressionsDefinition red
            = new RegularExpressionsDefinition ( );
        red.CreateRegexAssembly ( );
    }
}
}