YAML lexer: proof of concept in Go


This is a scanner/lexer for YAML, implemented in Go, based on the Scanning chapter in Crafting Interpreters by Robert Nystrom.

Scanner

The scanner and its primitive operations:

type scanner struct {
    source  []byte  // raw input being scanned
    tokens  []token // tokens emitted so far
    start   int     // index of the first byte of the lexeme being scanned
    current int     // index of the byte currently under consideration
    line    int     // current line number, for error reporting
}

func newScanner(source string) scanner {
    return scanner{
        source:  []byte(source),
        tokens:  make([]token, 0),
        start:   0,
        current: 0,
        line:    1}
}

func (s *scanner) isAtEnd() bool {
    return s.current >= len(s.source)
}

func (s *scanner) peek() byte {
    if s.isAtEnd() {
        return 0
    }
    return s.source[s.current]
}

func (s *scanner) advance() byte {
    c := s.source[s.current]
    s.current++
    return c
}

func (s *scanner) match(c byte) bool {
    if s.isAtEnd() || s.source[s.current] != c {
        return false
    }
    s.current++
    return true
}
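
To make the cursor semantics concrete, here is a small sketch (not part of the lexer; it assumes the definitions above) of how these primitives move through a short input:

// exampleCursor is a sketch of the primitive operations on the input "- a".
func exampleCursor() {
    s := newScanner("- a")
    c := s.advance()   // returns '-' and moves current from 0 to 1
    ok := s.match(' ') // the next byte is ' ', so it is consumed and true is returned
    next := s.peek()   // returns 'a' without consuming it
    fmt.Printf("%c %v %c\n", c, ok, next) // prints: - true a
}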

We’ll define a scanTokens() method below.

Token

First, the definition of the token struct:

type token struct {
    type_   tokenType
    lexeme  []byte
    literal literal
    line    int
}
type tokenType int
type literal []byte

We use a slice of bytes for the literal field so that it can simply alias a subslice of the source buffer, instead of allocating a new string and copying into it.
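
As a rough illustration of what this buys us, here is a hypothetical snippet (not part of the lexer) showing a literal aliasing a slice of the source:

// exampleAliasing shows that a literal points into the source buffer;
// no new string is allocated and no bytes are copied.
func exampleAliasing() {
    src := []byte("key: value")
    lit := literal(src[5:10])       // the bytes of "value"
    fmt.Println(string(lit))        // value
    fmt.Println(&lit[0] == &src[5]) // true: same backing array
}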

The recognised tokens are as follows:

const (
    COLON tokenType = iota
    COMMA
    DASH
    LEFT_BRACE
    RIGHT_BRACE
    LEFT_BRACKET
    RIGHT_BRACKET
    STRING
    INDENT
    EOL
    EOF
)

For testing and debugging, let’s add a String() method on token, backed by a lookup table that maps each tokenType to a name:

var typeNames = []string{
    "colon",
    "comma",
    "dash",
    "left_brace",
    "right_brace",
    "left_bracket",
    "right_bracket",
    "string",
    "indent",
    "eol",
    "eof",
}

func (t *token) String() string {
    if t.type_ == STRING {
        return fmt.Sprintf("string(%s)", t.literal)
    }
    if t.type_ == INDENT {
        return fmt.Sprintf("indent(%d)", len(t.literal))
    }
    return typeNames[t.type_]
}
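
For example, a throwaway demonstration like the following (hypothetical; it just constructs a few tokens by hand) shows the output format:

// exampleString constructs a few tokens by hand and prints them.
func exampleString() {
    str := token{type_: STRING, literal: literal("hello"), line: 1}
    ind := token{type_: INDENT, literal: literal("    "), line: 2}
    col := token{type_: COLON, line: 2}
    fmt.Println(str.String(), ind.String(), col.String()) // string(hello) indent(4) colon
}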

Scanning implementation

Now we can scan the tokens in a loop until we reach the end of the source string:

func (s *scanner) scanTokens() []token {
    for !s.isAtEnd() {
        s.start = s.current
        s.scanToken()
    }
    s.tokens = append(s.tokens, token{type_: EOF, line: s.line})
    return s.tokens
}

func (s *scanner) addToken(type_ tokenType, literal literal) {
    lexeme := s.source[s.start:s.current]
    tok := token{type_, lexeme, literal, s.line}
    s.tokens = append(s.tokens, tok)
}

func (s *scanner) scanToken() {
    c := s.advance()
    switch c {
    case ':':
        s.addToken(COLON, nil)
    case ',':
        s.addToken(COMMA, nil)
    case '{':
        s.addToken(LEFT_BRACE, nil)
    case '}':
        s.addToken(RIGHT_BRACE, nil)
    case '[':
        s.addToken(LEFT_BRACKET, nil)
    case ']':
        s.addToken(RIGHT_BRACKET, nil)
    case '-':
        if s.match(' ') {
            s.addToken(DASH, nil)
        } else {
            s.string()
        }
    case '\n':
        s.addToken(EOL, nil)
        s.line++
        if s.peek() == ' ' {
            s.indent()
        }
    case ' ', '\t', '\r':
        return
    case '"':
        s.dQuotedString()
    default:
        s.string()
    }
}

Single-character tokens are generally stored as is. Special cases:

- A dash only becomes a DASH token if it is immediately followed by a space (a block-sequence entry); otherwise it is treated as the start of a plain scalar, as in -0.2.
- A newline emits an EOL token and increments the line counter; if the next character is a space, the scanner hands off to indent() to record the indentation.
- Other spaces, tabs, and carriage returns are skipped without emitting a token.
- A double quote starts a double-quoted string (dQuotedString()); anything else starts a plain scalar (string()).

The following methods are defined for scanning more complicated spans of the source string:

func (s *scanner) indent() {
    // NB: Only call this function when s.start points at the newline just consumed.
    indentStart := s.start + 1
    for s.peek() == ' ' {
        s.advance()
    }
    s.addToken(INDENT, s.source[indentStart:s.current])
}

func (s *scanner) dQuotedString() {
    // NB: Only call this function when s.start is at opening ".
    for s.peek() != '"' && !s.isAtEnd() {
        if s.peek() == '\n' {
            s.line++
        }
        s.advance()
    }
    if s.isAtEnd() {
        log.Printf("%d | unterminated string\n", s.line)
        return
    }
    s.advance()
    s.addToken(STRING, s.source[s.start+1:s.current-1])
}

func (s *scanner) string() {
    for !bytes.ContainsAny([]byte{s.peek()}, ":,}]\n") && !s.isAtEnd() {
        s.advance()
    }
    s.addToken(STRING, bytes.TrimSpace(s.source[s.start:s.current]))
}

Note that we don’t return any errors from scanTokens at all; we only log syntax errors (in dQuotedString) and leave it to the subsequent parser to error out if a sequence of tokens is invalid.
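
For instance, sketching with the scanner above, an unterminated string is logged and dropped, but scanning still runs to completion and emits the EOF token:

// exampleUnterminated: the broken string is logged and dropped,
// but the scan still ends with an EOF token.
func exampleUnterminated() {
    sc := newScanner(`key: "oops`)
    for _, tok := range sc.scanTokens() { // logs: 1 | unterminated string
        fmt.Println(tok.String()) // string(key), then colon, then eof
    }
}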

The string() scanner reads a plain scalar up to (but not including) the next delimiter: a colon, comma, closing brace or bracket, newline, or end of input. dQuotedString(), on the other hand, reads up to the closing quote, so double-quoted strings may span multiple lines.
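
A quick sketch of the difference, using the same scanner:

// exampleDelimiters contrasts a plain scalar with a double-quoted one.
func exampleDelimiters() {
    for _, src := range []string{"a b: c\n", "\"a b: c\"\n"} {
        sc := newScanner(src)
        for _, tok := range sc.scanTokens() {
            fmt.Printf("%s ", tok.String())
        }
        fmt.Println()
    }
    // prints: string(a b) colon string(c) eol eof
    //         string(a b: c) eol eof
}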

Testing

A fairly complete happy-path test case:

var text = `key0: -0.2
key1: "multiline
value: x"
"key 2": val: 2
key3:
    key31: x
    key32: y
key4: [ val4.1, val4.2 ]
key5: {key51: [val5.1, val5.2]}
key6:
  - val61
  - val62
`
var want = []string{
    "string(key0)", "colon", "string(-0.2)", "eol",
    "string(key1)", "colon", "string(multiline\nvalue: x)", "eol",
    "string(key 2)", "colon", "string(val)", "colon", "string(2)", "eol",
    "string(key3)", "colon", "eol",
    "indent(4)", "string(key31)", "colon", "string(x)", "eol",
    "indent(4)", "string(key32)", "colon", "string(y)", "eol",
    "string(key4)", "colon", "left_bracket", "string(val4.1)", "comma", "string(val4.2)", "right_bracket", "eol",
    "string(key5)", "colon", "left_brace", "string(key51)", "colon", "left_bracket", "string(val5.1)", "comma", "string(val5.2)", "right_bracket", "right_brace", "eol",
    "string(key6)", "colon", "eol",
    "indent(2)", "dash", "string(val61)", "eol",
    "indent(2)", "dash", "string(val62)", "eol",
    "eof",
}

func TestScanner(t *testing.T) {
    sc := newScanner(text)
    tokens := sc.scanTokens()
    if len(tokens) != len(want) {
        // Fatalf, not Errorf: slicing tokens[:len(want)] below would panic
        // if the scanner produced fewer tokens than expected.
        t.Fatalf("want %d tokens; got %d", len(want), len(tokens))
    }
    for i, tok := range tokens[:len(want)] {
        if want[i] != tok.String() {
            t.Errorf("%d | want: %q; got: %q", i, want[i], tok.String())
        }
    }
}
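
Putting it all together, a hypothetical driver (not part of the library) could lex a file and print one token per line:

// lexFile reads a file and prints its token stream, one token per line.
func lexFile(path string) error {
    data, err := os.ReadFile(path) // os.ReadFile requires Go 1.16+
    if err != nil {
        return err
    }
    sc := newScanner(string(data))
    for _, tok := range sc.scanTokens() {
        fmt.Println(tok.String())
    }
    return nil
}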

Update

I’ve published an experimental Go library incorporating the above lexer, with additional testing and a parser to process the tokens into a tree: javiljoen/miniyaml