forked from flashcat/categraf
628 lines
11 KiB
Go
628 lines
11 KiB
Go
// Copyright 2011 Google Inc. All Rights Reserved.
|
|
// This file is available under the Apache license.
|
|
|
|
package parser
|
|
|
|
import (
|
|
"bufio"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"sort"
|
|
"strings"
|
|
"unicode"
|
|
|
|
// "github.com/golang/glog"
|
|
"flashcat.cloud/categraf/inputs/mtail/internal/runtime/compiler/position"
|
|
)
|
|
|
|
// List of keywords. Keep this list sorted!
|
|
var keywords = map[string]Kind{
|
|
"after": AFTER,
|
|
"as": AS,
|
|
"buckets": BUCKETS,
|
|
"by": BY,
|
|
"const": CONST,
|
|
"counter": COUNTER,
|
|
"def": DEF,
|
|
"del": DEL,
|
|
"else": ELSE,
|
|
"gauge": GAUGE,
|
|
"hidden": HIDDEN,
|
|
"histogram": HISTOGRAM,
|
|
"limit": LIMIT,
|
|
"next": NEXT,
|
|
"otherwise": OTHERWISE,
|
|
"stop": STOP,
|
|
"text": TEXT,
|
|
"timer": TIMER,
|
|
}
|
|
|
|
// List of builtin functions. Keep this list sorted!
|
|
var builtins = []string{
|
|
"bool",
|
|
"float",
|
|
"getfilename",
|
|
"int",
|
|
"len",
|
|
"settime",
|
|
"string",
|
|
"strptime",
|
|
"strtol",
|
|
"subst",
|
|
"timestamp",
|
|
"tolower",
|
|
}
|
|
|
|
// Dictionary returns a list of all keywords and builtins of the language.
|
|
func Dictionary() (r []string) {
|
|
for k := range keywords {
|
|
r = append(r, k)
|
|
}
|
|
r = append(r, builtins...)
|
|
return
|
|
}
|
|
|
|
// A stateFn represents each state the scanner can be in.
|
|
type stateFn func(*Lexer) stateFn
|
|
|
|
// A lexer holds the state of the scanner.
|
|
type Lexer struct {
|
|
name string // Name of program.
|
|
input *bufio.Reader // Source program
|
|
state stateFn // Current state function of the lexer.
|
|
|
|
// The "read cursor" in the input.
|
|
rune rune // The current rune.
|
|
width int // Width in bytes.
|
|
line int // The line position of the current rune.
|
|
col int // The column position of the current rune.
|
|
|
|
InRegex bool // Context aware flag from parser to say we're in a regex
|
|
|
|
// The currently being lexed token.
|
|
startcol int // Starting column of the current token.
|
|
text strings.Builder // the text of the current token
|
|
|
|
tokens chan Token // Output channel for tokens emitted.
|
|
}
|
|
|
|
// NewLexer creates a new scanner type that reads the input provided.
|
|
func NewLexer(name string, input io.Reader) *Lexer {
|
|
l := &Lexer{
|
|
name: name,
|
|
input: bufio.NewReader(input),
|
|
state: lexProg,
|
|
tokens: make(chan Token, 2),
|
|
}
|
|
return l
|
|
}
|
|
|
|
// NextToken returns the next token in the input. When no token is available
|
|
// to be returned it executes the next action in the state machine.
|
|
func (l *Lexer) NextToken() Token {
|
|
for {
|
|
select {
|
|
case tok := <-l.tokens:
|
|
return tok
|
|
default:
|
|
l.state = l.state(l)
|
|
}
|
|
}
|
|
}
|
|
|
|
// emit passes a token to the client.
|
|
func (l *Lexer) emit(kind Kind) {
|
|
pos := position.Position{l.name, l.line, l.startcol, l.col - 1}
|
|
// glog.V(2).Infof("Emitting %v spelled %q at %v", kind, l.text.String(), pos)
|
|
l.tokens <- Token{kind, l.text.String(), pos}
|
|
// Reset the current token
|
|
l.text.Reset()
|
|
l.startcol = l.col
|
|
}
|
|
|
|
// Internal end of file value.
|
|
const eof rune = -1
|
|
|
|
// next returns the next rune in the input.
|
|
func (l *Lexer) next() rune {
|
|
var err error
|
|
l.rune, l.width, err = l.input.ReadRune()
|
|
if errors.Is(err, io.EOF) {
|
|
l.width = 1
|
|
l.rune = eof
|
|
}
|
|
if l.rune == '' {
|
|
l.rune = eof
|
|
}
|
|
return l.rune
|
|
}
|
|
|
|
// backup indicates that we haven't yet dealt with the next rune. Use when
|
|
// terminating tokens on unknown runes.
|
|
func (l *Lexer) backup() {
|
|
l.width = 0
|
|
if l.rune == eof {
|
|
return
|
|
}
|
|
if err := l.input.UnreadRune(); err != nil {
|
|
log.Println(err)
|
|
}
|
|
}
|
|
|
|
// stepCursor moves the read cursor.
|
|
func (l *Lexer) stepCursor() {
|
|
if l.rune == '\n' {
|
|
l.line++
|
|
l.col = 0
|
|
} else {
|
|
l.col += l.width
|
|
}
|
|
}
|
|
|
|
// accept accepts the current rune and its position into the current token.
|
|
func (l *Lexer) accept() {
|
|
l.text.WriteRune(l.rune)
|
|
l.stepCursor()
|
|
}
|
|
|
|
// skip does not accept the current rune into the current token's text, but
|
|
// does accept its position into the token. Use only at the start or end of a
|
|
// token.
|
|
func (l *Lexer) skip() {
|
|
l.stepCursor()
|
|
}
|
|
|
|
// ignore skips over the current rune, removing it from the text of the token,
|
|
// and resetting the start position of the current token. Use only between
|
|
// tokens.
|
|
func (l *Lexer) ignore() {
|
|
l.stepCursor()
|
|
l.startcol = l.col
|
|
}
|
|
|
|
// errorf returns an error token and resets the scanner.
|
|
func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
|
|
pos := position.Position{
|
|
Filename: l.name,
|
|
Line: l.line,
|
|
Startcol: l.startcol,
|
|
Endcol: l.col - 1,
|
|
}
|
|
l.tokens <- Token{
|
|
Kind: INVALID,
|
|
Spelling: fmt.Sprintf(format, args...),
|
|
Pos: pos,
|
|
}
|
|
// Reset the current token
|
|
l.text.Reset()
|
|
l.startcol = l.col
|
|
return lexProg
|
|
}
|
|
|
|
// State functions.
|
|
|
|
// lexProg starts lexing a program.
|
|
func lexProg(l *Lexer) stateFn {
|
|
if l.InRegex {
|
|
return lexRegex
|
|
}
|
|
switch r := l.next(); {
|
|
case r == '\n':
|
|
l.accept()
|
|
l.emit(NL)
|
|
case r == '#':
|
|
return lexComment
|
|
case isSpace(r):
|
|
l.ignore()
|
|
case r == '{':
|
|
l.accept()
|
|
l.emit(LCURLY)
|
|
case r == '}':
|
|
l.accept()
|
|
l.emit(RCURLY)
|
|
case r == '(':
|
|
l.accept()
|
|
l.emit(LPAREN)
|
|
case r == ')':
|
|
l.accept()
|
|
l.emit(RPAREN)
|
|
case r == '[':
|
|
l.accept()
|
|
l.emit(LSQUARE)
|
|
case r == ']':
|
|
l.accept()
|
|
l.emit(RSQUARE)
|
|
case r == ',':
|
|
l.accept()
|
|
l.emit(COMMA)
|
|
case r == '-':
|
|
l.accept()
|
|
switch r = l.next(); {
|
|
case r == '-':
|
|
l.accept()
|
|
l.emit(DEC)
|
|
case isDigit(r):
|
|
l.backup()
|
|
return lexNumeric
|
|
default:
|
|
l.backup()
|
|
l.emit(MINUS)
|
|
}
|
|
case r == '+':
|
|
l.accept()
|
|
switch l.next() {
|
|
case '+':
|
|
l.accept()
|
|
l.emit(INC)
|
|
case '=':
|
|
l.accept()
|
|
l.emit(ADD_ASSIGN)
|
|
default:
|
|
l.backup()
|
|
l.emit(PLUS)
|
|
}
|
|
case r == '*':
|
|
l.accept()
|
|
switch l.next() {
|
|
case '*':
|
|
l.accept()
|
|
l.emit(POW)
|
|
default:
|
|
l.backup()
|
|
l.emit(MUL)
|
|
}
|
|
case r == '=':
|
|
l.accept()
|
|
switch l.next() {
|
|
case '=':
|
|
l.accept()
|
|
l.emit(EQ)
|
|
case '~':
|
|
l.accept()
|
|
l.emit(MATCH)
|
|
default:
|
|
l.backup()
|
|
l.emit(ASSIGN)
|
|
}
|
|
case r == '<':
|
|
l.accept()
|
|
switch l.next() {
|
|
case '=':
|
|
l.accept()
|
|
l.emit(LE)
|
|
case '<':
|
|
l.accept()
|
|
l.emit(SHL)
|
|
default:
|
|
l.backup()
|
|
l.emit(LT)
|
|
}
|
|
case r == '>':
|
|
l.accept()
|
|
switch l.next() {
|
|
case '=':
|
|
l.accept()
|
|
l.emit(GE)
|
|
case '>':
|
|
l.accept()
|
|
l.emit(SHR)
|
|
default:
|
|
l.backup()
|
|
l.emit(GT)
|
|
}
|
|
case r == '!':
|
|
l.accept()
|
|
switch l.next() {
|
|
case '=':
|
|
l.accept()
|
|
l.emit(NE)
|
|
case '~':
|
|
l.accept()
|
|
l.emit(NOT_MATCH)
|
|
default:
|
|
l.backup()
|
|
return l.errorf("Unexpected input: %q", r)
|
|
}
|
|
case r == '/':
|
|
l.accept()
|
|
l.emit(DIV)
|
|
case r == '%':
|
|
l.accept()
|
|
l.emit(MOD)
|
|
case r == '&':
|
|
l.accept()
|
|
switch l.next() {
|
|
case '&':
|
|
l.accept()
|
|
l.emit(AND)
|
|
default:
|
|
l.backup()
|
|
l.emit(BITAND)
|
|
}
|
|
case r == '|':
|
|
l.accept()
|
|
switch l.next() {
|
|
case '|':
|
|
l.accept()
|
|
l.emit(OR)
|
|
default:
|
|
l.backup()
|
|
l.emit(BITOR)
|
|
}
|
|
case r == '^':
|
|
l.accept()
|
|
l.emit(XOR)
|
|
case r == '~':
|
|
l.accept()
|
|
l.emit(NOT)
|
|
case r == '"':
|
|
return lexQuotedString
|
|
case r == '$':
|
|
return lexCapref
|
|
case r == '@':
|
|
return lexDecorator
|
|
case isDigit(r):
|
|
l.backup()
|
|
return lexNumeric
|
|
case isAlpha(r):
|
|
return lexIdentifier
|
|
case r == eof:
|
|
l.skip()
|
|
l.emit(EOF)
|
|
// Stop the machine, we're done.
|
|
return nil
|
|
case r == '.':
|
|
l.backup()
|
|
return lexNumeric
|
|
default:
|
|
l.accept()
|
|
return l.errorf("Unexpected input: %q", r)
|
|
}
|
|
return lexProg
|
|
}
|
|
|
|
// Lex a comment.
|
|
func lexComment(l *Lexer) stateFn {
|
|
l.ignore()
|
|
Loop:
|
|
for {
|
|
switch l.next() {
|
|
case '\n':
|
|
l.skip()
|
|
fallthrough
|
|
case eof:
|
|
break Loop
|
|
default:
|
|
l.ignore()
|
|
}
|
|
}
|
|
return lexProg
|
|
}
|
|
|
|
// Lex a numerical constant.
|
|
func lexNumeric(l *Lexer) stateFn {
|
|
r := l.next()
|
|
for isDigit(r) {
|
|
l.accept()
|
|
r = l.next()
|
|
}
|
|
if r != '.' && r != 'E' && r != 'e' && !isDurationSuffix(r) {
|
|
l.backup()
|
|
l.emit(Kind(INTLITERAL))
|
|
return lexProg
|
|
}
|
|
if r == '.' {
|
|
l.accept()
|
|
r = l.next()
|
|
for isDigit(r) {
|
|
l.accept()
|
|
r = l.next()
|
|
}
|
|
}
|
|
if r == 'e' || r == 'E' {
|
|
l.accept()
|
|
r = l.next()
|
|
if r == '+' || r == '-' {
|
|
l.accept()
|
|
r = l.next()
|
|
}
|
|
for isDigit(r) {
|
|
l.accept()
|
|
r = l.next()
|
|
}
|
|
}
|
|
if isDurationSuffix(r) {
|
|
l.accept()
|
|
return lexDuration
|
|
}
|
|
l.backup()
|
|
l.emit(Kind(FLOATLITERAL))
|
|
return lexProg
|
|
}
|
|
|
|
func isDurationSuffix(r rune) bool {
|
|
switch r {
|
|
case 's', 'm', 'h', 'd':
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func lexDuration(l *Lexer) stateFn {
|
|
Loop:
|
|
for {
|
|
switch r := l.next(); {
|
|
case isDigit(r):
|
|
l.accept()
|
|
case r == '.':
|
|
l.accept()
|
|
case r == '-':
|
|
l.accept()
|
|
case r == '+':
|
|
l.accept()
|
|
case isDurationSuffix(r):
|
|
l.accept()
|
|
default:
|
|
l.backup()
|
|
break Loop
|
|
}
|
|
}
|
|
l.emit(DURATIONLITERAL)
|
|
return lexProg
|
|
}
|
|
|
|
// Lex a quoted string. The text of a quoted string does not include the '"' quotes.
|
|
func lexQuotedString(l *Lexer) stateFn {
|
|
l.skip() // Skip leading quote
|
|
Loop:
|
|
for {
|
|
switch l.next() {
|
|
case '\\':
|
|
l.skip()
|
|
if r := l.next(); r != eof && r != '\n' {
|
|
if r != '"' {
|
|
l.text.WriteRune('\\')
|
|
}
|
|
l.accept()
|
|
break
|
|
}
|
|
fallthrough
|
|
case eof, '\n':
|
|
return l.errorf("Unterminated quoted string: \"\\\"%s\"", l.text.String())
|
|
case '"':
|
|
l.skip() // Skip trailing quote.
|
|
break Loop
|
|
default:
|
|
l.accept()
|
|
}
|
|
}
|
|
l.emit(STRING)
|
|
return lexProg
|
|
}
|
|
|
|
// Lex a capture group reference. These are local variable references to
|
|
// capture groups in the preceding regular expression.
|
|
func lexCapref(l *Lexer) stateFn {
|
|
l.skip() // Skip the leading $
|
|
named := false
|
|
Loop:
|
|
for {
|
|
switch r := l.next(); {
|
|
case isAlnum(r) || r == '_':
|
|
l.accept()
|
|
if !isDigit(r) {
|
|
named = true
|
|
}
|
|
default:
|
|
l.backup()
|
|
break Loop
|
|
}
|
|
}
|
|
if named {
|
|
l.emit(CAPREF_NAMED)
|
|
} else {
|
|
l.emit(CAPREF)
|
|
}
|
|
return lexProg
|
|
}
|
|
|
|
// Lex an identifier, or builtin keyword.
|
|
func lexIdentifier(l *Lexer) stateFn {
|
|
l.accept()
|
|
Loop:
|
|
for {
|
|
switch r := l.next(); {
|
|
case isAlnum(r) || r == '_':
|
|
l.accept()
|
|
default:
|
|
l.backup()
|
|
break Loop
|
|
}
|
|
}
|
|
if r, ok := keywords[l.text.String()]; ok {
|
|
l.emit(r)
|
|
} else if r := sort.SearchStrings(builtins, l.text.String()); r >= 0 && r < len(builtins) && builtins[r] == l.text.String() {
|
|
l.emit(BUILTIN)
|
|
} else {
|
|
l.emit(ID)
|
|
}
|
|
return lexProg
|
|
}
|
|
|
|
// Lex a regular expression pattern. The text of the regular expression does
|
|
// not include the '/' quotes.
|
|
func lexRegex(l *Lexer) stateFn {
|
|
// Exit regex mode when leaving this function.
|
|
defer func() {
|
|
// glog.V(2).Info("Exiting regex")
|
|
// glog.V(2).Infof("Regex at line %d, startcol %d, col %d", l.line, l.startcol, l.col)
|
|
l.InRegex = false
|
|
}()
|
|
Loop:
|
|
for {
|
|
switch l.next() {
|
|
case '\\':
|
|
l.skip()
|
|
if r := l.next(); r != eof && r != '\n' {
|
|
if r != '/' {
|
|
l.text.WriteRune('\\')
|
|
}
|
|
l.accept()
|
|
break
|
|
}
|
|
fallthrough
|
|
case eof, '\n':
|
|
return l.errorf("Unterminated regular expression: \"/%s\"", l.text.String())
|
|
case '/':
|
|
l.backup() // Backup trailing slash on successful parse
|
|
break Loop
|
|
default:
|
|
l.accept()
|
|
}
|
|
}
|
|
l.emit(REGEX)
|
|
return lexProg
|
|
}
|
|
|
|
// Lex a decorator name. These are functiony templatey wrappers around blocks
|
|
// of rules.
|
|
func lexDecorator(l *Lexer) stateFn {
|
|
l.skip() // Skip the leading @
|
|
Loop:
|
|
for {
|
|
switch r := l.next(); {
|
|
case isAlnum(r) || r == '_':
|
|
l.accept()
|
|
default:
|
|
l.backup()
|
|
break Loop
|
|
}
|
|
}
|
|
l.emit(DECO)
|
|
return lexProg
|
|
}
|
|
|
|
// Helper predicates.
|
|
|
|
// isAlpha reports whether r is an alphabetical rune.
|
|
func isAlpha(r rune) bool {
|
|
return unicode.IsLetter(r)
|
|
}
|
|
|
|
// isAlnum reports whether r is an alphanumeric rune.
|
|
func isAlnum(r rune) bool {
|
|
return isAlpha(r) || isDigit(r)
|
|
}
|
|
|
|
// isDigit reports whether r is a numerical rune.
|
|
func isDigit(r rune) bool {
|
|
return unicode.IsDigit(r)
|
|
}
|
|
|
|
// isSpace reports whether r is whitespace.
|
|
func isSpace(r rune) bool {
|
|
return unicode.IsSpace(r)
|
|
}
|