421 lines
10 KiB
Go
421 lines
10 KiB
Go
package jmespath
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"fmt"
|
|
"strconv"
|
|
"strings"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
type token struct {
|
|
tokenType tokType
|
|
value string
|
|
position int
|
|
length int
|
|
}
|
|
|
|
// tokType identifies the kind of a lexed token; its values are the t*
// constants declared in this file.
type tokType int
|
|
|
|
// eof is the sentinel rune value returned by Lexer.next once the whole
// expression has been consumed.
const eof = -1
|
|
|
|
// Lexer holds the state used while tokenizing a single expression.
type Lexer struct {
	// expression is the expression provided by the user.
	expression string
	// currentPos is the current byte position in expression.
	currentPos int
	// lastWidth is the width in bytes of the rune most recently read by
	// next; it is 0 when the last read hit the end of the expression.
	lastWidth int
	// buf is an internal buffer used for building up values.
	buf bytes.Buffer
}
|
|
|
|
// SyntaxError is the main error used whenever a lexing or parsing error occurs.
type SyntaxError struct {
	msg        string // Error message displayed to user
	Expression string // Expression that generated a SyntaxError
	Offset     int    // The byte location in Expression where the error occurred
}

// Error implements the error interface. It returns the message prefixed
// with "SyntaxError: "; use HighlightLocation to show where the error is.
func (e SyntaxError) Error() string {
	return "SyntaxError: " + e.msg
}

// HighlightLocation will show where the syntax error occurred.
// It will place a "^" character on a line below the expression
// at the point where the syntax error occurred.
//
// Offset is a byte offset, so the caret is indented by the number of runes
// that precede it rather than by the number of bytes; this keeps the caret
// aligned when the expression contains multi-byte characters. A negative
// Offset is treated as 0 instead of panicking, and an Offset past the end
// of Expression keeps indenting one column per extra byte.
func (e SyntaxError) HighlightLocation() string {
	offset := e.Offset
	if offset < 0 {
		// strings.Repeat panics on a negative count.
		offset = 0
	}
	var pad int
	if offset < len(e.Expression) {
		pad = utf8.RuneCountInString(e.Expression[:offset])
	} else {
		pad = utf8.RuneCountInString(e.Expression) + offset - len(e.Expression)
	}
	return e.Expression + "\n" + strings.Repeat(" ", pad) + "^"
}
|
|
|
|
//go:generate stringer -type=tokType
|
|
const (
|
|
tUnknown tokType = iota
|
|
tStar
|
|
tDot
|
|
tFilter
|
|
tFlatten
|
|
tLparen
|
|
tRparen
|
|
tLbracket
|
|
tRbracket
|
|
tLbrace
|
|
tRbrace
|
|
tOr
|
|
tPipe
|
|
tNumber
|
|
tUnquotedIdentifier
|
|
tQuotedIdentifier
|
|
tComma
|
|
tColon
|
|
tLT
|
|
tLTE
|
|
tGT
|
|
tGTE
|
|
tEQ
|
|
tNE
|
|
tJSONLiteral
|
|
tStringLiteral
|
|
tCurrent
|
|
tExpref
|
|
tAnd
|
|
tNot
|
|
tEOF
|
|
)
|
|
|
|
var basicTokens = map[rune]tokType{
|
|
'.': tDot,
|
|
'*': tStar,
|
|
',': tComma,
|
|
':': tColon,
|
|
'{': tLbrace,
|
|
'}': tRbrace,
|
|
']': tRbracket, // tLbracket not included because it could be "[]"
|
|
'(': tLparen,
|
|
')': tRparen,
|
|
'@': tCurrent,
|
|
}
|
|
|
|
// Bit mask for [a-zA-Z_] shifted down 64 bits to fit in a single uint64.
|
|
// When using this bitmask just be sure to shift the rune down 64 bits
|
|
// before checking against identifierStartBits.
|
|
// In hex the set bits are easy to read: 1-26 are 'A'-'Z', 31 is '_' and
// 33-58 are 'a'-'z' (bit n stands for rune n+64).
const identifierStartBits uint64 = 0x07FFFFFE87FFFFFE
|
|
|
|
// Bit mask for [a-zA-Z0-9_], 128 bits -> 2 uint64s.
|
|
// Word 0 covers runes 0-63 (only the digits '0'-'9' are set); word 1 covers
// runes 64-127 (letters and '_').
var identifierTrailingBits = [2]uint64{0x03FF000000000000, 0x07FFFFFE87FFFFFE}
|
|
|
|
// whiteSpace is the set of runes the lexer skips between tokens.
// Membership is what matters; every present key maps to true.
var whiteSpace = map[rune]bool{
	' ':  true,
	'\t': true,
	'\n': true,
	'\r': true,
}
|
|
|
|
func (t token) String() string {
|
|
return fmt.Sprintf("Token{%+v, %s, %d, %d}",
|
|
t.tokenType, t.value, t.position, t.length)
|
|
}
|
|
|
|
// NewLexer creates a new JMESPath lexer.
|
|
func NewLexer() *Lexer {
|
|
lexer := Lexer{}
|
|
return &lexer
|
|
}
|
|
|
|
func (lexer *Lexer) next() rune {
|
|
if lexer.currentPos >= len(lexer.expression) {
|
|
lexer.lastWidth = 0
|
|
return eof
|
|
}
|
|
r, w := utf8.DecodeRuneInString(lexer.expression[lexer.currentPos:])
|
|
lexer.lastWidth = w
|
|
lexer.currentPos += w
|
|
return r
|
|
}
|
|
|
|
func (lexer *Lexer) back() {
|
|
lexer.currentPos -= lexer.lastWidth
|
|
}
|
|
|
|
func (lexer *Lexer) peek() rune {
|
|
t := lexer.next()
|
|
lexer.back()
|
|
return t
|
|
}
|
|
|
|
// tokenize takes an expression and returns corresponding tokens.
|
|
func (lexer *Lexer) tokenize(expression string) ([]token, error) {
|
|
var tokens []token
|
|
lexer.expression = expression
|
|
lexer.currentPos = 0
|
|
lexer.lastWidth = 0
|
|
loop:
|
|
for {
|
|
r := lexer.next()
|
|
if identifierStartBits&(1<<(uint64(r)-64)) > 0 {
|
|
t := lexer.consumeUnquotedIdentifier()
|
|
tokens = append(tokens, t)
|
|
} else if val, ok := basicTokens[r]; ok {
|
|
// Basic single char token.
|
|
t := token{
|
|
tokenType: val,
|
|
value: string(r),
|
|
position: lexer.currentPos - lexer.lastWidth,
|
|
length: 1,
|
|
}
|
|
tokens = append(tokens, t)
|
|
} else if r == '-' || (r >= '0' && r <= '9') {
|
|
t := lexer.consumeNumber()
|
|
tokens = append(tokens, t)
|
|
} else if r == '[' {
|
|
t := lexer.consumeLBracket()
|
|
tokens = append(tokens, t)
|
|
} else if r == '"' {
|
|
t, err := lexer.consumeQuotedIdentifier()
|
|
if err != nil {
|
|
return tokens, err
|
|
}
|
|
tokens = append(tokens, t)
|
|
} else if r == '\'' {
|
|
t, err := lexer.consumeRawStringLiteral()
|
|
if err != nil {
|
|
return tokens, err
|
|
}
|
|
tokens = append(tokens, t)
|
|
} else if r == '`' {
|
|
t, err := lexer.consumeLiteral()
|
|
if err != nil {
|
|
return tokens, err
|
|
}
|
|
tokens = append(tokens, t)
|
|
} else if r == '|' {
|
|
t := lexer.matchOrElse(r, '|', tOr, tPipe)
|
|
tokens = append(tokens, t)
|
|
} else if r == '<' {
|
|
t := lexer.matchOrElse(r, '=', tLTE, tLT)
|
|
tokens = append(tokens, t)
|
|
} else if r == '>' {
|
|
t := lexer.matchOrElse(r, '=', tGTE, tGT)
|
|
tokens = append(tokens, t)
|
|
} else if r == '!' {
|
|
t := lexer.matchOrElse(r, '=', tNE, tNot)
|
|
tokens = append(tokens, t)
|
|
} else if r == '=' {
|
|
t := lexer.matchOrElse(r, '=', tEQ, tUnknown)
|
|
tokens = append(tokens, t)
|
|
} else if r == '&' {
|
|
t := lexer.matchOrElse(r, '&', tAnd, tExpref)
|
|
tokens = append(tokens, t)
|
|
} else if r == eof {
|
|
break loop
|
|
} else if _, ok := whiteSpace[r]; ok {
|
|
// Ignore whitespace
|
|
} else {
|
|
return tokens, lexer.syntaxError(fmt.Sprintf("Unknown char: %s", strconv.QuoteRuneToASCII(r)))
|
|
}
|
|
}
|
|
tokens = append(tokens, token{tEOF, "", len(lexer.expression), 0})
|
|
return tokens, nil
|
|
}
|
|
|
|
// Consume characters until the ending rune "r" is reached.
|
|
// If the end of the expression is reached before seeing the
|
|
// terminating rune "r", then an error is returned.
|
|
// If no error occurs then the matching substring is returned.
|
|
// The returned string will not include the ending rune.
|
|
func (lexer *Lexer) consumeUntil(end rune) (string, error) {
|
|
start := lexer.currentPos
|
|
current := lexer.next()
|
|
for current != end && current != eof {
|
|
if current == '\\' && lexer.peek() != eof {
|
|
lexer.next()
|
|
}
|
|
current = lexer.next()
|
|
}
|
|
if lexer.lastWidth == 0 {
|
|
// Then we hit an EOF so we never reached the closing
|
|
// delimiter.
|
|
return "", SyntaxError{
|
|
msg: "Unclosed delimiter: " + string(end),
|
|
Expression: lexer.expression,
|
|
Offset: len(lexer.expression),
|
|
}
|
|
}
|
|
return lexer.expression[start : lexer.currentPos-lexer.lastWidth], nil
|
|
}
|
|
|
|
func (lexer *Lexer) consumeLiteral() (token, error) {
|
|
start := lexer.currentPos
|
|
value, err := lexer.consumeUntil('`')
|
|
if err != nil {
|
|
return token{}, err
|
|
}
|
|
value = strings.Replace(value, "\\`", "`", -1)
|
|
return token{
|
|
tokenType: tJSONLiteral,
|
|
value: value,
|
|
position: start,
|
|
length: len(value),
|
|
}, nil
|
|
}
|
|
|
|
func (lexer *Lexer) consumeRawStringLiteral() (token, error) {
|
|
start := lexer.currentPos
|
|
currentIndex := start
|
|
current := lexer.next()
|
|
for current != '\'' && lexer.peek() != eof {
|
|
if current == '\\' && lexer.peek() == '\'' {
|
|
chunk := lexer.expression[currentIndex : lexer.currentPos-1]
|
|
lexer.buf.WriteString(chunk)
|
|
lexer.buf.WriteString("'")
|
|
lexer.next()
|
|
currentIndex = lexer.currentPos
|
|
}
|
|
current = lexer.next()
|
|
}
|
|
if lexer.lastWidth == 0 {
|
|
// Then we hit an EOF so we never reached the closing
|
|
// delimiter.
|
|
return token{}, SyntaxError{
|
|
msg: "Unclosed delimiter: '",
|
|
Expression: lexer.expression,
|
|
Offset: len(lexer.expression),
|
|
}
|
|
}
|
|
if currentIndex < lexer.currentPos {
|
|
lexer.buf.WriteString(lexer.expression[currentIndex : lexer.currentPos-1])
|
|
}
|
|
value := lexer.buf.String()
|
|
// Reset the buffer so it can reused again.
|
|
lexer.buf.Reset()
|
|
return token{
|
|
tokenType: tStringLiteral,
|
|
value: value,
|
|
position: start,
|
|
length: len(value),
|
|
}, nil
|
|
}
|
|
|
|
func (lexer *Lexer) syntaxError(msg string) SyntaxError {
|
|
return SyntaxError{
|
|
msg: msg,
|
|
Expression: lexer.expression,
|
|
Offset: lexer.currentPos - 1,
|
|
}
|
|
}
|
|
|
|
// Checks for a two char token, otherwise matches a single character
|
|
// token. This is used whenever a two char token overlaps a single
|
|
// char token, e.g. "||" -> tPipe, "|" -> tOr.
|
|
func (lexer *Lexer) matchOrElse(first rune, second rune, matchedType tokType, singleCharType tokType) token {
|
|
start := lexer.currentPos - lexer.lastWidth
|
|
nextRune := lexer.next()
|
|
var t token
|
|
if nextRune == second {
|
|
t = token{
|
|
tokenType: matchedType,
|
|
value: string(first) + string(second),
|
|
position: start,
|
|
length: 2,
|
|
}
|
|
} else {
|
|
lexer.back()
|
|
t = token{
|
|
tokenType: singleCharType,
|
|
value: string(first),
|
|
position: start,
|
|
length: 1,
|
|
}
|
|
}
|
|
return t
|
|
}
|
|
|
|
func (lexer *Lexer) consumeLBracket() token {
|
|
// There's three options here:
|
|
// 1. A filter expression "[?"
|
|
// 2. A flatten operator "[]"
|
|
// 3. A bare rbracket "["
|
|
start := lexer.currentPos - lexer.lastWidth
|
|
nextRune := lexer.next()
|
|
var t token
|
|
if nextRune == '?' {
|
|
t = token{
|
|
tokenType: tFilter,
|
|
value: "[?",
|
|
position: start,
|
|
length: 2,
|
|
}
|
|
} else if nextRune == ']' {
|
|
t = token{
|
|
tokenType: tFlatten,
|
|
value: "[]",
|
|
position: start,
|
|
length: 2,
|
|
}
|
|
} else {
|
|
t = token{
|
|
tokenType: tLbracket,
|
|
value: "[",
|
|
position: start,
|
|
length: 1,
|
|
}
|
|
lexer.back()
|
|
}
|
|
return t
|
|
}
|
|
|
|
func (lexer *Lexer) consumeQuotedIdentifier() (token, error) {
|
|
start := lexer.currentPos
|
|
value, err := lexer.consumeUntil('"')
|
|
if err != nil {
|
|
return token{}, err
|
|
}
|
|
var decoded string
|
|
asJSON := []byte("\"" + value + "\"")
|
|
if err := json.Unmarshal([]byte(asJSON), &decoded); err != nil {
|
|
return token{}, err
|
|
}
|
|
return token{
|
|
tokenType: tQuotedIdentifier,
|
|
value: decoded,
|
|
position: start - 1,
|
|
length: len(decoded),
|
|
}, nil
|
|
}
|
|
|
|
func (lexer *Lexer) consumeUnquotedIdentifier() token {
|
|
// Consume runes until we reach the end of an unquoted
|
|
// identifier.
|
|
start := lexer.currentPos - lexer.lastWidth
|
|
for {
|
|
r := lexer.next()
|
|
if r < 0 || r > 128 || identifierTrailingBits[uint64(r)/64]&(1<<(uint64(r)%64)) == 0 {
|
|
lexer.back()
|
|
break
|
|
}
|
|
}
|
|
value := lexer.expression[start:lexer.currentPos]
|
|
return token{
|
|
tokenType: tUnquotedIdentifier,
|
|
value: value,
|
|
position: start,
|
|
length: lexer.currentPos - start,
|
|
}
|
|
}
|
|
|
|
func (lexer *Lexer) consumeNumber() token {
|
|
// Consume runes until we reach something that's not a number.
|
|
start := lexer.currentPos - lexer.lastWidth
|
|
for {
|
|
r := lexer.next()
|
|
if r < '0' || r > '9' {
|
|
lexer.back()
|
|
break
|
|
}
|
|
}
|
|
value := lexer.expression[start:lexer.currentPos]
|
|
return token{
|
|
tokenType: tNumber,
|
|
value: value,
|
|
position: start,
|
|
length: lexer.currentPos - start,
|
|
}
|
|
}
|