YAML lexer: proof of concept in Go
This is a scanner/lexer for YAML, implemented in Go, based on the Scanning chapter of Crafting Interpreters by Robert Nystrom.
Scanner
The scanner and its primitive operations:
type scanner struct {
    source  []byte  // raw input being scanned
    tokens  []token // tokens produced so far
    start   int     // offset of the first byte of the current lexeme
    current int     // offset of the byte currently being considered
    line    int     // current line number, for error reporting
}
func newScanner(source string) scanner {
    return scanner{
        source:  []byte(source),
        tokens:  make([]token, 0),
        start:   0,
        current: 0,
        line:    1,
    }
}
// isAtEnd reports whether all of the source has been consumed.
func (s *scanner) isAtEnd() bool {
    return s.current >= len(s.source)
}

// peek returns the current byte without consuming it (0 at end of input).
func (s *scanner) peek() byte {
    if s.isAtEnd() {
        return 0
    }
    return s.source[s.current]
}

// advance consumes and returns the current byte.
func (s *scanner) advance() byte {
    c := s.source[s.current]
    s.current++
    return c
}

// match consumes the current byte only if it equals c.
func (s *scanner) match(c byte) bool {
    if s.isAtEnd() || s.source[s.current] != c {
        return false
    }
    s.current++
    return true
}
We’ll define a scanTokens() method below.
Token
First, the definition of the token struct:
type token struct {
    type_   tokenType
    lexeme  []byte  // the raw bytes spanned by the token in the source
    literal literal // the token's value, e.g. the contents of a string
    line    int
}

type tokenType int
type literal []byte
We use a slice of bytes for the literal field
to avoid allocating a string and copying into it.
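For illustration (a hypothetical sketch, not part of the scanner itself), a literal can simply alias a range of the source buffer:
// Hypothetical sketch: the literal shares the source's backing array,
// so constructing it copies no bytes.
func literalDemo() {
    src := []byte("key: value")
    lit := literal(src[5:])  // subslice of src; no allocation, no copy
    fmt.Println(string(lit)) // "value"; a string is allocated only here, for printing
}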
The recognised tokens are as follows:
const (
    COLON tokenType = iota
    COMMA
    DASH
    LEFT_BRACE
    RIGHT_BRACE
    LEFT_BRACKET
    RIGHT_BRACKET
    STRING
    INDENT
    EOL
    EOF
)
For testing and debugging, let’s add a String() method
that renders each token (and its tokenType) as a readable string:
// typeNames maps each tokenType to a printable name;
// the order must match the constants above.
var typeNames = []string{
    "colon",
    "comma",
    "dash",
    "left_brace",
    "right_brace",
    "left_bracket",
    "right_bracket",
    "string",
    "indent",
    "eol",
    "eof",
}
func (t *token) String() string {
    if t.type_ == STRING {
        return fmt.Sprintf("string(%s)", t.literal)
    }
    if t.type_ == INDENT {
        return fmt.Sprintf("indent(%d)", len(t.literal))
    }
    return typeNames[t.type_]
}
Scanning implementation
Now we can scan the tokens in a loop until we reach the end of the source string:
func (s *scanner) scanTokens() []token {
    for !s.isAtEnd() {
        s.start = s.current
        s.scanToken()
    }
    s.tokens = append(s.tokens, token{type_: EOF, line: s.line})
    return s.tokens
}

// addToken appends a token covering the source from s.start to s.current.
func (s *scanner) addToken(type_ tokenType, literal literal) {
    lexeme := s.source[s.start:s.current]
    tok := token{type_, lexeme, literal, s.line}
    s.tokens = append(s.tokens, tok)
}
func (s *scanner) scanToken() {
    c := s.advance()
    switch c {
    case ':':
        s.addToken(COLON, nil)
    case ',':
        s.addToken(COMMA, nil)
    case '{':
        s.addToken(LEFT_BRACE, nil)
    case '}':
        s.addToken(RIGHT_BRACE, nil)
    case '[':
        s.addToken(LEFT_BRACKET, nil)
    case ']':
        s.addToken(RIGHT_BRACKET, nil)
    case '-':
        // A dash only becomes a DASH token if followed by a space;
        // otherwise it starts a plain scalar such as "-0.2".
        if s.match(' ') {
            s.addToken(DASH, nil)
        } else {
            s.string()
        }
    case '\n':
        s.addToken(EOL, nil)
        s.line++
        if s.peek() == ' ' {
            s.indent()
        }
    case ' ', '\t', '\r':
        // Whitespace outside a string is ignored.
        return
    case '"':
        s.dQuotedString()
    default:
        s.string()
    }
}
Single-character tokens are generally stored as is. Special cases:
- Whitespace outside a string is ignored.
- Indents only occur after an EOL, so we scan them in that case block.
- Dashes/hyphens can occur at the start of a string/number; we only emit a DASH token if it is followed by a space (see the sketch after this list).
- Single-character tokens that occur within a string (and don’t terminate it) are included in the string.
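To sanity-check the dash rule, here is a small hypothetical helper (not part of the original code) that prints the token stream for an input, relying on the scanner methods defined in this post:
// Hypothetical helper: print the token stream for a given input.
func printTokens(src string) {
    sc := newScanner(src)
    for _, tok := range sc.scanTokens() {
        fmt.Printf("%s ", tok.String())
    }
    fmt.Println()
}

// printTokens("- item") // dash string(item) eof   (dash followed by a space)
// printTokens("-3")     // string(-3) eof          (dash starts a plain scalar)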
The following methods are defined for scanning more complicated spans of the source string:
func (s *scanner) indent() {
    // NB: Only call this function when s.start is at an EOL.
    indentStart := s.start + 1
    for s.peek() == ' ' {
        s.advance()
    }
    s.addToken(INDENT, s.source[indentStart:s.current])
}

func (s *scanner) dQuotedString() {
    // NB: Only call this function when s.start is at the opening ".
    for s.peek() != '"' && !s.isAtEnd() {
        if s.peek() == '\n' {
            s.line++
        }
        s.advance()
    }
    if s.isAtEnd() {
        log.Printf("%d | unterminated string\n", s.line)
        return
    }
    s.advance() // consume the closing "
    s.addToken(STRING, s.source[s.start+1:s.current-1])
}

// string scans a plain (unquoted) scalar up to the next terminator.
func (s *scanner) string() {
    for !bytes.ContainsAny([]byte{s.peek()}, ":,}]\n") && !s.isAtEnd() {
        s.advance()
    }
    s.addToken(STRING, bytes.TrimSpace(s.source[s.start:s.current]))
}
Note that we don’t return any errors from scanTokens() at all;
we only log the syntax errors (in dQuotedString())
and leave it to the subsequent parser to report an error
if any sequence of tokens is invalid.
The string() scanner reads
until it encounters one of the terminating characters : , } ] or an EOL or EOF.
dQuotedString(), on the other hand, stops only at the closing quote,
so it allows multiline strings.
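To make the contrast concrete, a hypothetical example (my own, reusing the printTokens helper sketched earlier):
// Hypothetical examples contrasting the two string scanners.
func demoStrings() {
    // The plain scalar stops at ':' and ',', so they become separate tokens.
    printTokens("a: b, c") // string(a) colon string(b) comma string(c) eof

    // The double-quoted form keeps ':' and ',' inside the literal,
    // and would also keep newlines up to the closing quote.
    printTokens(`"a: b, c"`) // string(a: b, c) eof
}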
Testing
A fairly complete happy-path test case:
var text = `key0: -0.2
key1: "multiline
value: x"
"key 2": val: 2
key3:
    key31: x
    key32: y
key4: [ val4.1, val4.2 ]
key5: {key51: [val5.1, val5.2]}
key6:
  - val61
  - val62
`
var want = []string{
    "string(key0)", "colon", "string(-0.2)", "eol",
    "string(key1)", "colon", "string(multiline\nvalue: x)", "eol",
    "string(key 2)", "colon", "string(val)", "colon", "string(2)", "eol",
    "string(key3)", "colon", "eol",
    "indent(4)", "string(key31)", "colon", "string(x)", "eol",
    "indent(4)", "string(key32)", "colon", "string(y)", "eol",
    "string(key4)", "colon", "left_bracket", "string(val4.1)", "comma", "string(val4.2)", "right_bracket", "eol",
    "string(key5)", "colon", "left_brace", "string(key51)", "colon", "left_bracket", "string(val5.1)", "comma", "string(val5.2)", "right_bracket", "right_brace", "eol",
    "string(key6)", "colon", "eol",
    "indent(2)", "dash", "string(val61)", "eol",
    "indent(2)", "dash", "string(val62)", "eol",
    "eof",
}
func TestScanner(t *testing.T) {
    sc := newScanner(text)
    tokens := sc.scanTokens()
    if len(tokens) != len(want) {
        t.Errorf("want %d tokens; got %d", len(want), len(tokens))
    }
    // Compare only as many tokens as both slices hold,
    // so a length mismatch reports errors instead of panicking.
    n := len(want)
    if len(tokens) < n {
        n = len(tokens)
    }
    for i, tok := range tokens[:n] {
        if want[i] != tok.String() {
            t.Errorf("%d | want: %q; got: %q", i, want[i], tok.String())
        }
    }
}
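Outside the test suite, a hypothetical driver (my own sketch, assuming the scanner code above lives in a main package) could print each token along with its line number:
func main() {
    sc := newScanner("key: value\n")
    for _, tok := range sc.scanTokens() {
        fmt.Printf("%d: %s\n", tok.line, tok.String())
    }
    // Expected output:
    // 1: string(key)
    // 1: colon
    // 1: string(value)
    // 1: eol
    // 2: eof
}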
Update
I’ve published an experimental Go library incorporating the above lexer, with additional testing and a parser to process the tokens into a tree: javiljoen/miniyaml