Regex based Lexer with F#

Article
01/19/2010

This lexer allows you to define your regular expression based rules in a very declarative way using F# computation expressions.

 open Lexer

let definitions =

    lexerDefinitions {

        do! addNextlineDefinition "NEWLINE" @"(\n\r)|\n|\r"

        do! addIgnoreDefinition "WS"        @"\s"

        do! addDefinition "LET"             "let"

        do! addDefinition "ID"              "(?i)[a-z][a-z0-9]*"

        do! addDefinition "FLOAT"           @"[0-9]+\.[0-9]+"

        do! addDefinition "INT"             "[0-9]+"

        do! addDefinition "OPERATOR"      @"[+*=!/&|<>\^\-]+"

With those defined you can execute the lexer with:

 open Lexer

let lex input =

try

        let y = Lexer.tokenize definitions input

        printfn "%A" y

    with e -> printf "%s" e.Message

lex "let a = 5"

Which will result in:

seq [{name = "LET";
      text = "let";
      pos = 0;
      column = 0;
      line = 0;}; {name = "ID";
                   text = "a";
                   pos = 4;
                   column = 4;
                   line = 0;}; {name = "OPERATOR";
                                text = "=";
                                pos = 6;
                                column = 6;
                                line = 0;}; {name = "INT";
                                             text = "5";
                                             pos = 8;
                                             column = 8;
                                             line = 0;}]

The lexer’s code is structured in three parts. The first part is a state monad using the F# computation expressions. This enables the declarative approach (seen above) to setup your lexer rules.

 module StateMonad

type State<'s,'a> = State of ('s -> ('a *'s))

let runState (State f) = f

type StateBuilder() =

    member b.Return(x) = State (fun s -> (x,s))

    member b.Delay(f) = f() : State<'s,'a>

    member b.Zero() = State (fun s -> ((),s))

    member b.Bind(State p,rest) = State (fun s -> let v,s2 = p s in  (runState (rest v)) s2)

    member b.Get () = State (fun s -> (s,s))

    member b.Put s = State (fun _ -> ((),s))

The second part are the combinators that are used to define your lexer rules. There are three main combinators: AddDefinition which lets you define a name / regex pair, AddIgnoreDefinition which lets you define characters which the lexer should ignore and AddNextlineDefinition which lets you define what characters determine a new line.

 type LexDefinitions =

  {regexes : string list;

   names : string list;

   nextlines : bool list;

   ignores : bool list; }

let buildDefinition name pattern nextLine ignore =

    state {

        let! x = state.Get()

        do! state.Put { regexes = x.regexes @  [sprintf @"(?<%s>%s)" name pattern];

                        names = x.names @ [name];

                        nextlines  = x.nextlines @ [nextLine];

                        ignores = x.ignores @ [ignore]}

let addDefinition name pattern = buildDefinition name pattern false false

let addIgnoreDefinition name pattern = buildDefinition name pattern false true

let addNextlineDefinition name pattern = buildDefinition name pattern true true

And the final part is the code that performs the tokenizing. It uses the Seq.unfold method to create the list of tokens. Unfold is a function which takes a single item and generates a list of new items from it. It is the opposite of Seq.fold which takes a list of items and turns it into a single item. The tokenize function used Seq.unfold to generate each token while keeping track of the current line number, position in that line and position in the input string.

 type Token =

    { name : string;

      text: string;

      pos :int;

      column: int;

      line: int }

let createLexDefs pb =  (runState pb) {regexes = []; names = []; nextlines = []; ignores = []} |> snd

let tokenize lexerBuilder (str:string) =

    let patterns = createLexDefs lexerBuilder

    let combinedRegex =  Regex(List.fold (fun acc reg -> acc + "|" + reg) (List.head patterns.regexes) (List.tail patterns.regexes))

    let nextlineMap = List.zip patterns.names patterns.nextlines |> Map.ofList

    let ignoreMap = List.zip patterns.names patterns.ignores |> Map.ofList

    let tokenizeStep (pos,line,lineStart) =

        if pos >= str.Length then

            None

        else

            let getMatchedGroupName (grps:GroupCollection) names = List.find (fun (name:string) -> grps.[name].Length > 0) names

            match combinedRegex.Match(str, pos) with

                | mt when mt.Success && pos = mt.Index  ->

                    let groupName = getMatchedGroupName mt.Groups patterns.names

                    let column = mt.Index - lineStart

                    let nextPos = pos + mt.Length

                    let (nextLine, nextLineStart) = if nextlineMap.Item groupName then (line + 1, nextPos) else (line,lineStart)

                    let token = if ignoreMap.Item groupName

                                then None

                                else Some {

                                        name = groupName;

                                        text = mt.Value;

                                        pos = pos;

                                        line = line;

                                        column = column; }

                    Some(token, (nextPos, nextLine, nextLineStart))

                | otherwise ->

                    let textAroundError = str.Substring(pos, min (pos + 5) str.Length)

                    raise (ArgumentException (sprintf "Lexing error occured at line:%d and column:%d near the text:%s" line (pos - lineStart) textAroundError))

    Seq.unfold tokenizeStep (0, 0, 0) |> Seq.filter (fun x -> x.IsSome) |> Seq.map (fun x -> x.Value)

Lastly, here are the unit tests written using XUnit.Net:

 module LexerFacts

open Xunit

open Lexer

open System.Linq

let simpleDefs =

    state {

        do! addNextlineDefinition "NextLine"           "/"

        do! addIgnoreDefinition "IgnoredSymbol"        "=+"

        do! addDefinition "String"                     "[a-zA-Z]+"

        do! addDefinition "Number"                     "\d+"

        do! addDefinition "Name"                       "Matt"

[<Fact>]

let Will_return_no_tokens_for_empty_string() =

    let tokens = Lexer.tokenize simpleDefs ""

    Assert.Equal(0, tokens.Count())

[<Fact>]

let Will_throw_exception_for_invalid_token() =

    let tokens = Lexer.tokenize simpleDefs "-"

    let ex = Assert.ThrowsDelegateWithReturn(fun () -> upcast tokens.Count()) |> Record.Exception

    Assert.NotNull(ex)

    Assert.True(ex :? System.ArgumentException)

[<Fact>]

let Will_ignore_symbols_defined_as_ignore_symbols() =

    let tokens = Lexer.tokenize simpleDefs "========="

    Assert.Equal(0, tokens.Count())

[<Fact>]

let Will_get_token_with_correct_position_and_type() =

    let tokens = Lexer.tokenize simpleDefs "1one=2=two"

    Assert.Equal("Number",tokens.ElementAt(2).name)

    Assert.Equal("2",tokens.ElementAt(2).text)

    Assert.Equal(5,tokens.ElementAt(2).pos)

    Assert.Equal(5,tokens.ElementAt(2).column)

    Assert.Equal(0,tokens.ElementAt(2).line)

[<Fact>]

let Will_tokenize_string_with_alernating_numbers_and_strings() =

    let tokens = Lexer.tokenize simpleDefs "1one2two"

    Assert.Equal("1",tokens.ElementAt(0).text)

    Assert.Equal("one",tokens.ElementAt(1).text)

    Assert.Equal("2",tokens.ElementAt(2).text)

    Assert.Equal("two",tokens.ElementAt(3).text)

[<Fact>]

let Will_increment_line_with_newline_symbol() =

    let tokens = Lexer.tokenize simpleDefs "1one/2two"

    Assert.Equal("Number",tokens.ElementAt(2).name)

    Assert.Equal("2",tokens.ElementAt(2).text)

    Assert.Equal(5,tokens.ElementAt(2).pos)

    Assert.Equal(0,tokens.ElementAt(2).column)

    Assert.Equal(1,tokens.ElementAt(2).line)

[<Fact>]

let Will_give_priority_to_lexer_definitions_defined_earlier() =

    let tokens = Lexer.tokenize simpleDefs "Matt"

    Assert.Equal("String",tokens.ElementAt(0).name)

I attached a zip (Lexer.zip) containing all the code mentioned above.

Lexer.zip

Regex based Lexer with F#

Additional resources