regex.scala // Jump To …

From Interpreter to Compiler

Outline:

A staged interpreter is a compiler. This is useful, because an interpreter is usually much easier to implement than a compiler. In this section, we illustrate how to turn a vanilla interpreter into a compiler, using lightweight modular staging (LMS). The gist is to let LMS generate code for the interpreter specialized to a particular program – the program is fixed at staging time, while the input(s) to the program may vary in the generated code. Hence, staging an interpreter should be as simple as wrapping the types of expressions that vary in Rep[_] while leaving the types of expressions we want specialized as is.

As a case study, we stage a simple regular expression matcher. Our vanilla regular expression matcher is invoked on a regular expression string and an input string. The staged regular expression matcher is invoked on a static regular expression constant string and a dynamic input string of type Rep[String], and generates code specialized to match any input string against the constant regular expression pattern.

We could further optimize the generated code by additional staged transformations, but here, we only illustrate the basic process of staging an interpreter. This process is widely applicable. For example, we used the same process to stage a bytecode interpreter into a bytecode compiler.

package scala.lms.tutorial

import org.scalatest.FunSuite


Regular Expression Matcher

We start with a small regular expression matcher, ported to Scala from a C version, written by Rob Pike and Brian Kernighan.

trait RegexpMatcher {

  /* search for regexp anywhere in text */
  def matchsearch(regexp: String, text: String): Boolean = {
    if (regexp(0) == '^')
      matchhere(regexp, 1, text, 0)
    else {
      var start = -1
      var found = false
      while (!found && start < text.length) {
        start += 1
        found = matchhere(regexp, 0, text, start)
      }
      found
    }
  }


  /* search for restart of regexp at start of text */
  def matchhere(regexp: String, restart: Int, text: String, start: Int): Boolean = {
    if (restart==regexp.length)
      true
    else if (regexp(restart)=='$' && restart+1==regexp.length)
      start==text.length
    else if (restart+1 < regexp.length && regexp(restart+1)=='*')
      matchstar(regexp(restart), regexp, restart+2, text, start)
    else if (start < text.length && matchchar(regexp(restart), text(start)))
      matchhere(regexp, restart+1, text, start+1)
    else false
  }

  /* search for c* followed by restart of regexp at start of text */
  def matchstar(c: Char, regexp: String, restart: Int, text: String, start: Int): Boolean = {
    var sstart = start
    var found = matchhere(regexp, restart, text, sstart)
    var failed = false
    while (!failed && !found && sstart < text.length) {
      failed = !matchchar(c, text(sstart))
      sstart += 1
      found = matchhere(regexp, restart, text, sstart)
    }
    !failed && found
  }

  def matchchar(c: Char, t: Char): Boolean = {
    c == '.' || c == t
  }
}

trait RegexpMatcherTestCases {
  def testmatch(regexp: String, text: String, expected: Boolean): Unit

  testmatch("^hello$", "hello", true)
  testmatch("^hello$", "hell", false)
  testmatch("hell", "hello", true);
  testmatch("hell", "hell", true);
  testmatch("hel*", "he", true);
  testmatch("hel*$", "hello", false);
  testmatch("hel*", "yo hello", true);
  testmatch("ab", "hello ab hello", true);
  testmatch("^ab", "hello ab hello", false);
  testmatch("a*b", "hello aab hello", true);
  testmatch("^ab*", "abcd", true);
  testmatch("^ab*", "a", true);
  testmatch("^ab*", "ac", true);
  testmatch("^ab*", "bac", false);
  testmatch("^ab*$", "ac", false);
}

class RegexpMatcherTest extends FunSuite with RegexpMatcher with RegexpMatcherTestCases {
  override def testmatch(regexp: String, text: String, expected: Boolean) {
    test(s"""matchsearch("$regexp", "$text") == $expected""") {
      assertResult(expected){matchsearch(regexp, text)}
    }
  }
}

Staged Interpreter

The staged interpreter simply consist in wrapping the variable parameters in Rep[_] types. Otherwise, the code is the same.

trait StagedRegexpMatcher extends Dsl {

  /* search for regexp anywhere in text */
  def matchsearch(regexp: String, text: Rep[String]): Rep[Boolean] = {
    if (regexp(0) == '^')
      matchhere(regexp, 1, text, 0)
    else {
      var start = -1
      var found = false
      while (!found && start < text.length) {
        start += 1
        found = matchhere(regexp, 0, text, start)
      }
      found
    }
  }

  /* search for restart of regexp at start of text */
  def matchhere(regexp: String, restart: Int, text: Rep[String], start: Rep[Int]): Rep[Boolean] = {
    if (restart==regexp.length)
      true
    else if (regexp(restart)=='$' && restart+1==regexp.length)
      start==text.length
    else if (restart+1 < regexp.length && regexp(restart+1)=='*')
      matchstar(regexp(restart), regexp, restart+2, text, start)
    else if (start < text.length && matchchar(regexp(restart), text(start)))
      matchhere(regexp, restart+1, text, start+1)
    else false
  }

  /* search for c* followed by restart of regexp at start of text */
  def matchstar(c: Char, regexp: String, restart: Int, text: Rep[String], start: Rep[Int]): Rep[Boolean] = {
    var sstart = start
    var found = matchhere(regexp, restart, text, sstart)
    var failed = false
    while (!failed && !found && sstart < text.length) {
      failed = !matchchar(c, text(sstart))
      sstart += 1
      found = matchhere(regexp, restart, text, sstart)
    }
    !failed && found
  }

  def matchchar(c: Char, t: Rep[Char]): Rep[Boolean] = {
    c == '.' || c == t
  }
}

class StagedRegexpMatcherTest extends TutorialFunSuite with RegexpMatcherTestCases {
  val under = "sre"
  var m = Map.empty[String, DslDriver[String,Boolean]]
  def cache(k: String, b: => DslDriver[String,Boolean]): DslDriver[String,Boolean] = {
    m.get(k) match {
      case Some(v) => v
      case None =>
        m = m.updated(k, b)
        m(k)
    }
  }
  override def testmatch(regexp: String, text: String, expected: Boolean) {
    test(s"""matchsearch("$regexp", "$text") == $expected""") {
      val snippet = cache(regexp,
        new DslDriver[String,Boolean] with StagedRegexpMatcher {
          def snippet(x: Rep[String]) = matchsearch(regexp, x)
        })
      check("_"+regexp.replace("^", "_b").replace("*", "_s").replace("$", "_e"),
            snippet.code)
      assertResult(expected){snippet.eval(text)}
    }
  }
}

Generated Code

As an example, here is the code generated for ^ab.

/*****************************************
Emitting Generated Code
*******************************************/
class Snippet extends ((java.lang.String)=>(Boolean)) {
  def apply(x0:java.lang.String): Boolean = {
    val x1 = x0.length
    val x2 = 0 < x1
    val x5 = if (x2) {
      val x3 = x0(0)
      val x4 = 'a' == x3
      x4
    } else {
      false
    }
    val x11 = if (x5) {
      val x6 = 1 < x1
      val x9 = if (x6) {
        val x7 = x0(1)
        val x8 = 'b' == x7
        x8
      } else {
        false
      }
      val x10 = x9
      x10
    } else {
      false
    }
    x11
  }
}
/*****************************************
End of Generated Code
*******************************************/

What's next?

Go back to the tutorial index or continue with the Ackermann's Function.

Comments? Suggestions for improvement? View this file on GitHub.