WPF Sample Application using Regular Expressions to Parse Anchor Tags from HTML files.

Here's a sample application that I wrote that combines three technologies:

  • Windows Presentation Foundation (WPF)
  • Regular Expressions to parse HTML
  • Reading HTML with HttpWebRequest and HttpWebResponse

To run the sample, press the Go! button. The lower text box fills with the anchor tags found on the web page whose address is in the url box.

To build the sample, start with a new .NET Framework 3.0 Windows Application (WPF) project.  You'll have to rename the main window to GlideWindow and do a few other obvious tweaks to get the code to compile.

Put this in the GlideWindow XAML file:

<Window x:Class="Glide.GlideWindow"
    xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
    xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
    Title="Glide" Height="300" Width="700"
    >
  <!-- The two xmlns values above are fixed identifiers, not addresses to
       fetch: they must be spelled with the http scheme exactly as shown,
       otherwise the XAML compiler cannot resolve Window, Grid, x:Name, etc. -->
  <Grid x:Name="_Grid">
    <Grid.RowDefinitions>
      <!-- Row 0: fixed-height control strip; row 1: results, takes the rest. -->
      <RowDefinition Height="36" />
      <RowDefinition />
    </Grid.RowDefinitions>
    <!-- Control strip: label, url entry box and the Go! button. -->
    <Canvas Grid.Row="0" Name="_ControlCanvas">
      <TextBlock Canvas.Left="15" Canvas.Top="8" Width="25" Height="20"
         HorizontalAlignment="Left" Text="url:" />
      <TextBox   Canvas.Left="40" Canvas.Top="8" Width="600" Height="20"
         x:Name="_UrlTextBox" >https://msn.com</TextBox>
      <Button    Canvas.Left="650" Canvas.Top="8" Width="28" Height="20"
         HorizontalAlignment="Right" x:Name="_GoButton" Content="Go!">
      </Button>
    </Canvas>
    <!-- Results box: the code-behind writes one anchor tag per line here. -->
    <TextBox Grid.Row="1" Margin="10,4,0,0" Name="textBox1"
         VerticalAlignment="Top" HorizontalAlignment="Left" VerticalScrollBarVisibility="Auto">
    </TextBox>
  </Grid>
</Window>

And here's the GlideWindow code-behind file. 

// First step in creating a tree-driven personal search engine.

// by Martin J. Tracy

using System;

using System.Collections.Generic;

using System.Text;

using System.Windows;

using System.IO;

using System.Net;

using System.Diagnostics;

using System.Text.RegularExpressions;

namespace Glide

{

    // Main window.

    public partial class GlideWindow : System.Windows.Window

    {

        #region Constructor

        public GlideWindow()

        {

            InitializeComponent();

            this._GoButton.Click += new RoutedEventHandler(_GoButton_Click);

        }

        #endregion

        #region Go button event handler

        // Read and parse the HTML and display anchor tags in the text box.

        void _GoButton_Click(object sender, RoutedEventArgs e)

        {

            string url = _UrlTextBox.Text;

            string line = GetHtmlAsLine(url);

            List<string> anchors = GetAnchorsFromLine(line);

            // Format anchor tags and display them in a text box.

            StringBuilder sbuild = new StringBuilder();

            foreach (string anchor in anchors)

            {

                sbuild.Append(anchor + "\r\n");

            }

            textBox1.Text = sbuild.ToString();

        }

        #endregion

        #region Read HTML from web page as one long line

        // Read web page at url and return contents as one long line.

        string GetHtmlAsLine(string url)

        {

            string line = "";

            HttpWebRequest httpWReq = null;

            HttpWebResponse httpWResp = null;

            StreamReader webstream = null;

            try

            {

                httpWReq = (HttpWebRequest)WebRequest.Create(url);

                httpWResp = (HttpWebResponse)httpWReq.GetResponse();

                webstream = new StreamReader(httpWResp.GetResponseStream(), Encoding.ASCII);

                line = webstream.ReadToEnd();

            }

            catch (Exception ex)

            {

                Debug.WriteLine(ex.Message);

            }

            finally

            {

                if (webstream != null) webstream.Close();

                if (httpWResp != null) httpWResp.Close();

            }

            return line;

        }

        #endregion

        #region Parse anchors from HTML using regular expressions

        // Return a list of anchor tags.

        List<string> GetAnchorsFromLine(string line)

        {

            List<string> anchors = new List<string>();

           string anchorPattern =

                @"(?<anchor><a(\s+\w+\s*[=]\s*([""].*?[""]|['].*?[']))*\s*>)(?<inner>.*?)</a>";

           

            MatchCollection matches =

                Regex.Matches(line, anchorPattern,

                    RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled);

            foreach (Match m in matches)

            {

                string anchor = m.Groups["anchor"].ToString();

                anchors.Add(anchor);

            }

            return anchors;

        }

        #endregion

    }

}