categraf/inputs/mtail/internal/runtime/compiler/parser/lexer.go

628 lines
11 KiB
Go

// Copyright 2011 Google Inc. All Rights Reserved.
// This file is available under the Apache license.
package parser
import (
"bufio"
"errors"
"fmt"
"io"
"log"
"sort"
"strings"
"unicode"
// "github.com/golang/glog"
"flashcat.cloud/categraf/inputs/mtail/internal/runtime/compiler/position"
)
// List of keywords. Keep this list sorted!
var keywords = map[string]Kind{
"after": AFTER,
"as": AS,
"buckets": BUCKETS,
"by": BY,
"const": CONST,
"counter": COUNTER,
"def": DEF,
"del": DEL,
"else": ELSE,
"gauge": GAUGE,
"hidden": HIDDEN,
"histogram": HISTOGRAM,
"limit": LIMIT,
"next": NEXT,
"otherwise": OTHERWISE,
"stop": STOP,
"text": TEXT,
"timer": TIMER,
}
// List of builtin functions. Keep this list sorted!
var builtins = []string{
"bool",
"float",
"getfilename",
"int",
"len",
"settime",
"string",
"strptime",
"strtol",
"subst",
"timestamp",
"tolower",
}
// Dictionary returns a list of all keywords and builtins of the language.
func Dictionary() (r []string) {
for k := range keywords {
r = append(r, k)
}
r = append(r, builtins...)
return
}
// A stateFn represents each state the scanner can be in.
type stateFn func(*Lexer) stateFn
// A lexer holds the state of the scanner.
type Lexer struct {
name string // Name of program.
input *bufio.Reader // Source program
state stateFn // Current state function of the lexer.
// The "read cursor" in the input.
rune rune // The current rune.
width int // Width in bytes.
line int // The line position of the current rune.
col int // The column position of the current rune.
InRegex bool // Context aware flag from parser to say we're in a regex
// The currently being lexed token.
startcol int // Starting column of the current token.
text strings.Builder // the text of the current token
tokens chan Token // Output channel for tokens emitted.
}
// NewLexer creates a new scanner type that reads the input provided.
func NewLexer(name string, input io.Reader) *Lexer {
l := &Lexer{
name: name,
input: bufio.NewReader(input),
state: lexProg,
tokens: make(chan Token, 2),
}
return l
}
// NextToken returns the next token in the input. When no token is available
// to be returned it executes the next action in the state machine.
func (l *Lexer) NextToken() Token {
for {
select {
case tok := <-l.tokens:
return tok
default:
l.state = l.state(l)
}
}
}
// emit passes a token to the client.
func (l *Lexer) emit(kind Kind) {
pos := position.Position{l.name, l.line, l.startcol, l.col - 1}
// glog.V(2).Infof("Emitting %v spelled %q at %v", kind, l.text.String(), pos)
l.tokens <- Token{kind, l.text.String(), pos}
// Reset the current token
l.text.Reset()
l.startcol = l.col
}
// Internal end of file value.
const eof rune = -1
// next returns the next rune in the input.
func (l *Lexer) next() rune {
var err error
l.rune, l.width, err = l.input.ReadRune()
if errors.Is(err, io.EOF) {
l.width = 1
l.rune = eof
}
if l.rune == '␤' {
l.rune = eof
}
return l.rune
}
// backup indicates that we haven't yet dealt with the next rune. Use when
// terminating tokens on unknown runes.
func (l *Lexer) backup() {
l.width = 0
if l.rune == eof {
return
}
if err := l.input.UnreadRune(); err != nil {
log.Println(err)
}
}
// stepCursor moves the read cursor.
func (l *Lexer) stepCursor() {
if l.rune == '\n' {
l.line++
l.col = 0
} else {
l.col += l.width
}
}
// accept accepts the current rune and its position into the current token.
func (l *Lexer) accept() {
l.text.WriteRune(l.rune)
l.stepCursor()
}
// skip does not accept the current rune into the current token's text, but
// does accept its position into the token. Use only at the start or end of a
// token.
func (l *Lexer) skip() {
l.stepCursor()
}
// ignore skips over the current rune, removing it from the text of the token,
// and resetting the start position of the current token. Use only between
// tokens.
func (l *Lexer) ignore() {
l.stepCursor()
l.startcol = l.col
}
// errorf returns an error token and resets the scanner.
func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
pos := position.Position{
Filename: l.name,
Line: l.line,
Startcol: l.startcol,
Endcol: l.col - 1,
}
l.tokens <- Token{
Kind: INVALID,
Spelling: fmt.Sprintf(format, args...),
Pos: pos,
}
// Reset the current token
l.text.Reset()
l.startcol = l.col
return lexProg
}
// State functions.
// lexProg starts lexing a program.
func lexProg(l *Lexer) stateFn {
if l.InRegex {
return lexRegex
}
switch r := l.next(); {
case r == '\n':
l.accept()
l.emit(NL)
case r == '#':
return lexComment
case isSpace(r):
l.ignore()
case r == '{':
l.accept()
l.emit(LCURLY)
case r == '}':
l.accept()
l.emit(RCURLY)
case r == '(':
l.accept()
l.emit(LPAREN)
case r == ')':
l.accept()
l.emit(RPAREN)
case r == '[':
l.accept()
l.emit(LSQUARE)
case r == ']':
l.accept()
l.emit(RSQUARE)
case r == ',':
l.accept()
l.emit(COMMA)
case r == '-':
l.accept()
switch r = l.next(); {
case r == '-':
l.accept()
l.emit(DEC)
case isDigit(r):
l.backup()
return lexNumeric
default:
l.backup()
l.emit(MINUS)
}
case r == '+':
l.accept()
switch l.next() {
case '+':
l.accept()
l.emit(INC)
case '=':
l.accept()
l.emit(ADD_ASSIGN)
default:
l.backup()
l.emit(PLUS)
}
case r == '*':
l.accept()
switch l.next() {
case '*':
l.accept()
l.emit(POW)
default:
l.backup()
l.emit(MUL)
}
case r == '=':
l.accept()
switch l.next() {
case '=':
l.accept()
l.emit(EQ)
case '~':
l.accept()
l.emit(MATCH)
default:
l.backup()
l.emit(ASSIGN)
}
case r == '<':
l.accept()
switch l.next() {
case '=':
l.accept()
l.emit(LE)
case '<':
l.accept()
l.emit(SHL)
default:
l.backup()
l.emit(LT)
}
case r == '>':
l.accept()
switch l.next() {
case '=':
l.accept()
l.emit(GE)
case '>':
l.accept()
l.emit(SHR)
default:
l.backup()
l.emit(GT)
}
case r == '!':
l.accept()
switch l.next() {
case '=':
l.accept()
l.emit(NE)
case '~':
l.accept()
l.emit(NOT_MATCH)
default:
l.backup()
return l.errorf("Unexpected input: %q", r)
}
case r == '/':
l.accept()
l.emit(DIV)
case r == '%':
l.accept()
l.emit(MOD)
case r == '&':
l.accept()
switch l.next() {
case '&':
l.accept()
l.emit(AND)
default:
l.backup()
l.emit(BITAND)
}
case r == '|':
l.accept()
switch l.next() {
case '|':
l.accept()
l.emit(OR)
default:
l.backup()
l.emit(BITOR)
}
case r == '^':
l.accept()
l.emit(XOR)
case r == '~':
l.accept()
l.emit(NOT)
case r == '"':
return lexQuotedString
case r == '$':
return lexCapref
case r == '@':
return lexDecorator
case isDigit(r):
l.backup()
return lexNumeric
case isAlpha(r):
return lexIdentifier
case r == eof:
l.skip()
l.emit(EOF)
// Stop the machine, we're done.
return nil
case r == '.':
l.backup()
return lexNumeric
default:
l.accept()
return l.errorf("Unexpected input: %q", r)
}
return lexProg
}
// Lex a comment.
func lexComment(l *Lexer) stateFn {
l.ignore()
Loop:
for {
switch l.next() {
case '\n':
l.skip()
fallthrough
case eof:
break Loop
default:
l.ignore()
}
}
return lexProg
}
// Lex a numerical constant.
func lexNumeric(l *Lexer) stateFn {
r := l.next()
for isDigit(r) {
l.accept()
r = l.next()
}
if r != '.' && r != 'E' && r != 'e' && !isDurationSuffix(r) {
l.backup()
l.emit(Kind(INTLITERAL))
return lexProg
}
if r == '.' {
l.accept()
r = l.next()
for isDigit(r) {
l.accept()
r = l.next()
}
}
if r == 'e' || r == 'E' {
l.accept()
r = l.next()
if r == '+' || r == '-' {
l.accept()
r = l.next()
}
for isDigit(r) {
l.accept()
r = l.next()
}
}
if isDurationSuffix(r) {
l.accept()
return lexDuration
}
l.backup()
l.emit(Kind(FLOATLITERAL))
return lexProg
}
func isDurationSuffix(r rune) bool {
switch r {
case 's', 'm', 'h', 'd':
return true
}
return false
}
func lexDuration(l *Lexer) stateFn {
Loop:
for {
switch r := l.next(); {
case isDigit(r):
l.accept()
case r == '.':
l.accept()
case r == '-':
l.accept()
case r == '+':
l.accept()
case isDurationSuffix(r):
l.accept()
default:
l.backup()
break Loop
}
}
l.emit(DURATIONLITERAL)
return lexProg
}
// Lex a quoted string. The text of a quoted string does not include the '"' quotes.
func lexQuotedString(l *Lexer) stateFn {
l.skip() // Skip leading quote
Loop:
for {
switch l.next() {
case '\\':
l.skip()
if r := l.next(); r != eof && r != '\n' {
if r != '"' {
l.text.WriteRune('\\')
}
l.accept()
break
}
fallthrough
case eof, '\n':
return l.errorf("Unterminated quoted string: \"\\\"%s\"", l.text.String())
case '"':
l.skip() // Skip trailing quote.
break Loop
default:
l.accept()
}
}
l.emit(STRING)
return lexProg
}
// Lex a capture group reference. These are local variable references to
// capture groups in the preceding regular expression.
func lexCapref(l *Lexer) stateFn {
l.skip() // Skip the leading $
named := false
Loop:
for {
switch r := l.next(); {
case isAlnum(r) || r == '_':
l.accept()
if !isDigit(r) {
named = true
}
default:
l.backup()
break Loop
}
}
if named {
l.emit(CAPREF_NAMED)
} else {
l.emit(CAPREF)
}
return lexProg
}
// Lex an identifier, or builtin keyword.
func lexIdentifier(l *Lexer) stateFn {
l.accept()
Loop:
for {
switch r := l.next(); {
case isAlnum(r) || r == '_':
l.accept()
default:
l.backup()
break Loop
}
}
if r, ok := keywords[l.text.String()]; ok {
l.emit(r)
} else if r := sort.SearchStrings(builtins, l.text.String()); r >= 0 && r < len(builtins) && builtins[r] == l.text.String() {
l.emit(BUILTIN)
} else {
l.emit(ID)
}
return lexProg
}
// Lex a regular expression pattern. The text of the regular expression does
// not include the '/' quotes.
func lexRegex(l *Lexer) stateFn {
// Exit regex mode when leaving this function.
defer func() {
// glog.V(2).Info("Exiting regex")
// glog.V(2).Infof("Regex at line %d, startcol %d, col %d", l.line, l.startcol, l.col)
l.InRegex = false
}()
Loop:
for {
switch l.next() {
case '\\':
l.skip()
if r := l.next(); r != eof && r != '\n' {
if r != '/' {
l.text.WriteRune('\\')
}
l.accept()
break
}
fallthrough
case eof, '\n':
return l.errorf("Unterminated regular expression: \"/%s\"", l.text.String())
case '/':
l.backup() // Backup trailing slash on successful parse
break Loop
default:
l.accept()
}
}
l.emit(REGEX)
return lexProg
}
// Lex a decorator name. These are functiony templatey wrappers around blocks
// of rules.
func lexDecorator(l *Lexer) stateFn {
l.skip() // Skip the leading @
Loop:
for {
switch r := l.next(); {
case isAlnum(r) || r == '_':
l.accept()
default:
l.backup()
break Loop
}
}
l.emit(DECO)
return lexProg
}
// Helper predicates.
// isAlpha reports whether r is an alphabetical rune.
func isAlpha(r rune) bool {
return unicode.IsLetter(r)
}
// isAlnum reports whether r is an alphanumeric rune.
func isAlnum(r rune) bool {
return isAlpha(r) || isDigit(r)
}
// isDigit reports whether r is a numerical rune.
func isDigit(r rune) bool {
return unicode.IsDigit(r)
}
// isSpace reports whether r is whitespace.
func isSpace(r rune) bool {
return unicode.IsSpace(r)
}