Regular expression matching in <100 lines of code

6 12 2010

The recent discussion about the Yacc is dead paper on Lambda the Ultimate sparked my interest in regular expression derivatives. The original idea goes back to the paper Derivatives of Regular Expressions published in 1964 (!) by Janusz A. Brzozowski. For a more modern treatment of the topic see Regular-expression derivatives reexamined.

The derivative of a set of strings with respect to a character, is the set of strings which results from removing the first character from all the strings in the set which start with that character. Let for example S = \{foo, bar, baz\} then \partial_b S = \{ar, az\} . It turns out that regular languages are closed under derivatives. That is, any derivative of a regular language is again a regular language. Furthermore, it is possible to extend the notion of derivatives to regular expression such that given a regular expression r which generates the language \mathcal{L}(r) and a character c , one can derive a regular expression \partial_c r such that \mathcal{L}(\partial_c r) = \partial_c(\mathcal{L}(r)).

This is a key ingredient for a very elegant regular expression matching algorithm: to match a string against a regular expression repeatedly calculate the derivative of the regular expression for each characters in the string. When no character is left, check whether the last derivative accepts the empty string. If so we have a match and otherwise not.

The exact algorithm for finding whether a regular expression is nullable (i.e. accepts the empty string) is given in Regular-expression derivatives reexamined as is the algorithm for calculating derivatives of regular expressions. Below is a direct implementation of that algorithm in Scala (with a slight modification to allow for strings instead of individual characters).

trait RegExp {
  def nullable: Boolean
  def derive(c: Char): RegExp
}

case object Empty extends RegExp {
  def nullable = false
  def derive(c: Char) = Empty
}

case object Eps extends RegExp {
  def nullable = true
  def derive(c: Char) = Empty
}

case class Str(s: String) extends RegExp {
  def nullable = s.isEmpty
  def derive(c: Char) =
    if (s.isEmpty || s.head != c) Empty
    else Str(s.tail)
}

case class Cat(r: RegExp, s: RegExp) extends RegExp {
  def nullable = r.nullable && s.nullable
  def derive(c: Char) =
    if (r.nullable) Or(Cat(r.derive(c), s), s.derive(c)) 
    else Cat(r.derive(c), s)
}

case class Star(r: RegExp) extends RegExp {
  def nullable = true
  def derive(c: Char) = Cat(r.derive(c), this)
}

case class Or(r: RegExp, s: RegExp) extends RegExp {
  def nullable = r.nullable || s.nullable
  def derive(c: Char) = Or(r.derive(c), s.derive(c))
}

case class And(r: RegExp, s: RegExp) extends RegExp {
  def nullable = r.nullable && s.nullable
  def derive(c: Char) = And(r.derive(c), s.derive(c))
}

case class Not(r: RegExp) extends RegExp {
  def nullable = !r.nullable
  def derive(c: Char) = Not(r.derive(c))
}

Having these constructors we need a way to match strings against regular expressions.

object Matcher {
  def matches(r: RegExp, s: String): Boolean = {
    if (s.isEmpty) r.nullable
    else matches(r.derive(s.head), s.tail)
  }
}

Here are some pimps to make usage of the regular expression constructors more convenient.

object Pimps {
  implicit def string2RegExp(s: String) = Str(s)

  implicit def regExpOps(r: RegExp) = new {
    def | (s: RegExp) = Or(r, s)
    def & (s: RegExp) = And(r, s)
    def % = Star(r)
    def %(n: Int) = rep(r, n)
    def ? = Or(Eps, r)
    def ! = Not(r)
    def ++ (s: RegExp) = Cat(r, s)
    def ~ (s: String) = Matcher.matches(r, s)
  }

  implicit def stringOps(s: String) = new {
    def | (r: RegExp) = Or(s, r)
    def | (r: String) = Or(s, r)
    def & (r: RegExp) = And(s, r)
    def & (r: String) = And(s, r)
    def % = Star(s)
    def % (n: Int) = rep(Str(s), n)
    def ? = Or(Eps, s)
    def ! = Not(s)
    def ++ (r: RegExp) = Cat(s, r)
    def ++ (r: String) = Cat(s, r)
    def ~ (t: String) = Matcher.matches(s, t)
  }

  def rep(r: RegExp, n: Int): RegExp =
    if (n <= 0) Star(r)
    else Cat(r, rep(r, n - 1))
}

And finally here is how to use it:

object Test {
  import Pimps._

  val digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
  val int = ("+" | "-").? ++ digit.%(1)
  val real = ("+" | "-").? ++ digit.%(1) ++ ("." ++ digit.%(1)).? ++ (("e" | "E") ++ ("+" | "-").? ++ digit.%(1)).?

  def main(args: Array[String]) {
    val ints = List("0", "-4534", "+049", "99")
    val reals = List("0.9", "-12.8", "+91.0", "9e12", "+9.21E-12", "-512E+01")
    val errs = List("", "-", "+", "+-1", "-+2", "2-")

    ints.foreach(s => assert(int ~ s))
    reals.foreach(s => assert(!(int ~ s)))
    errs.foreach(s => assert(!(int ~ s)))

    ints.foreach(s => assert(real ~ s))
    reals.foreach(s => assert(real ~ s))
    errs.foreach(s => assert(!(real ~ s)))
}

Now that’s 48 + 6 + 32 = 86 lines of code for a regular expression matching library!