458 lines
10 KiB
Go
458 lines
10 KiB
Go
/*
|
|
Copyright 2012 Google Inc. All Rights Reserved.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package shlex
|
|
|
|
/*
|
|
Package shlex implements a simple lexer which splits input in to tokens using
|
|
shell-style rules for quoting and commenting.
|
|
*/
|
|
import (
|
|
"bufio"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"strings"
|
|
)
|
|
|
|
/*
|
|
A TokenType is a top-level token; a word, space, comment, unknown.
|
|
*/
|
|
type TokenType int
|
|
|
|
/*
|
|
A RuneTokenType is the type of a UTF-8 character; a character, quote, space, escape.
|
|
*/
|
|
type RuneTokenType int
|
|
|
|
type lexerState int
|
|
|
|
type Token struct {
|
|
tokenType TokenType
|
|
value string
|
|
}
|
|
|
|
/*
|
|
Two tokens are equal if both their types and values are equal. A nil token can
|
|
never equal another token.
|
|
*/
|
|
func (a *Token) Equal(b *Token) bool {
|
|
if a == nil || b == nil {
|
|
return false
|
|
}
|
|
if a.tokenType != b.tokenType {
|
|
return false
|
|
}
|
|
return a.value == b.value
|
|
}
|
|
|
|
const (
|
|
RUNE_CHAR string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._-,/@$*()+=><:;&^%~|!?[]{}"
|
|
RUNE_SPACE string = " \t\r\n"
|
|
RUNE_ESCAPING_QUOTE string = "\""
|
|
RUNE_NONESCAPING_QUOTE string = "'"
|
|
RUNE_ESCAPE = "\\"
|
|
RUNE_COMMENT = "#"
|
|
|
|
RUNETOKEN_UNKNOWN RuneTokenType = 0
|
|
RUNETOKEN_CHAR RuneTokenType = 1
|
|
RUNETOKEN_SPACE RuneTokenType = 2
|
|
RUNETOKEN_ESCAPING_QUOTE RuneTokenType = 3
|
|
RUNETOKEN_NONESCAPING_QUOTE RuneTokenType = 4
|
|
RUNETOKEN_ESCAPE RuneTokenType = 5
|
|
RUNETOKEN_COMMENT RuneTokenType = 6
|
|
RUNETOKEN_EOF RuneTokenType = 7
|
|
|
|
TOKEN_UNKNOWN TokenType = 0
|
|
TOKEN_WORD TokenType = 1
|
|
TOKEN_SPACE TokenType = 2
|
|
TOKEN_COMMENT TokenType = 3
|
|
|
|
STATE_START lexerState = 0
|
|
STATE_INWORD lexerState = 1
|
|
STATE_ESCAPING lexerState = 2
|
|
STATE_ESCAPING_QUOTED lexerState = 3
|
|
STATE_QUOTED_ESCAPING lexerState = 4
|
|
STATE_QUOTED lexerState = 5
|
|
STATE_COMMENT lexerState = 6
|
|
|
|
INITIAL_TOKEN_CAPACITY int = 100
|
|
)
|
|
|
|
/*
|
|
A type for classifying characters. This allows for different sorts of
|
|
classifiers - those accepting extended non-ascii chars, or strict posix
|
|
compatibility, for example.
|
|
*/
|
|
type TokenClassifier struct {
|
|
typeMap map[int32]RuneTokenType
|
|
}
|
|
|
|
func addRuneClass(typeMap *map[int32]RuneTokenType, runes string, tokenType RuneTokenType) {
|
|
for _, rune := range runes {
|
|
(*typeMap)[int32(rune)] = tokenType
|
|
}
|
|
}
|
|
|
|
/*
|
|
Create a new classifier for basic ASCII characters.
|
|
*/
|
|
func NewDefaultClassifier() *TokenClassifier {
|
|
typeMap := map[int32]RuneTokenType{}
|
|
addRuneClass(&typeMap, RUNE_CHAR, RUNETOKEN_CHAR)
|
|
addRuneClass(&typeMap, RUNE_SPACE, RUNETOKEN_SPACE)
|
|
addRuneClass(&typeMap, RUNE_ESCAPING_QUOTE, RUNETOKEN_ESCAPING_QUOTE)
|
|
addRuneClass(&typeMap, RUNE_NONESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE)
|
|
addRuneClass(&typeMap, RUNE_ESCAPE, RUNETOKEN_ESCAPE)
|
|
addRuneClass(&typeMap, RUNE_COMMENT, RUNETOKEN_COMMENT)
|
|
return &TokenClassifier{
|
|
typeMap: typeMap}
|
|
}
|
|
|
|
func (classifier *TokenClassifier) ClassifyRune(rune int32) RuneTokenType {
|
|
return classifier.typeMap[rune]
|
|
}
|
|
|
|
/*
|
|
A type for turning an input stream in to a sequence of strings. Whitespace and
|
|
comments are skipped.
|
|
*/
|
|
type Lexer struct {
|
|
tokenizer *Tokenizer
|
|
}
|
|
|
|
/*
|
|
Create a new lexer.
|
|
*/
|
|
func NewLexer(r io.Reader) (*Lexer, error) {
|
|
|
|
tokenizer, err := NewTokenizer(r)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
lexer := &Lexer{tokenizer: tokenizer}
|
|
return lexer, nil
|
|
}
|
|
|
|
/*
|
|
Return the next word, and an error value. If there are no more words, the error
|
|
will be io.EOF.
|
|
*/
|
|
func (l *Lexer) NextWord() (string, error) {
|
|
var token *Token
|
|
var err error
|
|
for {
|
|
token, err = l.tokenizer.NextToken()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
switch token.tokenType {
|
|
case TOKEN_WORD:
|
|
{
|
|
return token.value, nil
|
|
}
|
|
case TOKEN_COMMENT:
|
|
{
|
|
// skip comments
|
|
}
|
|
default:
|
|
{
|
|
panic(fmt.Sprintf("Unknown token type: %v", token.tokenType))
|
|
}
|
|
}
|
|
}
|
|
return "", io.EOF
|
|
}
|
|
|
|
/*
|
|
A type for turning an input stream in to a sequence of typed tokens.
|
|
*/
|
|
type Tokenizer struct {
|
|
input *bufio.Reader
|
|
classifier *TokenClassifier
|
|
}
|
|
|
|
/*
|
|
Create a new tokenizer.
|
|
*/
|
|
func NewTokenizer(r io.Reader) (*Tokenizer, error) {
|
|
input := bufio.NewReader(r)
|
|
classifier := NewDefaultClassifier()
|
|
tokenizer := &Tokenizer{
|
|
input: input,
|
|
classifier: classifier}
|
|
return tokenizer, nil
|
|
}
|
|
|
|
/*
|
|
Scan the stream for the next token.
|
|
|
|
This uses an internal state machine. It will panic if it encounters a character
|
|
which it does not know how to handle.
|
|
*/
|
|
func (t *Tokenizer) scanStream() (*Token, error) {
|
|
state := STATE_START
|
|
var tokenType TokenType
|
|
value := make([]int32, 0, INITIAL_TOKEN_CAPACITY)
|
|
var (
|
|
nextRune int32
|
|
nextRuneType RuneTokenType
|
|
err error
|
|
)
|
|
SCAN:
|
|
for {
|
|
nextRune, _, err = t.input.ReadRune()
|
|
nextRuneType = t.classifier.ClassifyRune(nextRune)
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
nextRuneType = RUNETOKEN_EOF
|
|
err = nil
|
|
} else {
|
|
return nil, err
|
|
}
|
|
}
|
|
switch state {
|
|
case STATE_START: // no runes read yet
|
|
{
|
|
switch nextRuneType {
|
|
case RUNETOKEN_EOF:
|
|
{
|
|
return nil, io.EOF
|
|
}
|
|
case RUNETOKEN_CHAR:
|
|
{
|
|
tokenType = TOKEN_WORD
|
|
value = append(value, nextRune)
|
|
state = STATE_INWORD
|
|
}
|
|
case RUNETOKEN_SPACE:
|
|
{
|
|
}
|
|
case RUNETOKEN_ESCAPING_QUOTE:
|
|
{
|
|
tokenType = TOKEN_WORD
|
|
state = STATE_QUOTED_ESCAPING
|
|
}
|
|
case RUNETOKEN_NONESCAPING_QUOTE:
|
|
{
|
|
tokenType = TOKEN_WORD
|
|
state = STATE_QUOTED
|
|
}
|
|
case RUNETOKEN_ESCAPE:
|
|
{
|
|
tokenType = TOKEN_WORD
|
|
state = STATE_ESCAPING
|
|
}
|
|
case RUNETOKEN_COMMENT:
|
|
{
|
|
tokenType = TOKEN_COMMENT
|
|
state = STATE_COMMENT
|
|
}
|
|
default:
|
|
{
|
|
return nil, errors.New(fmt.Sprintf("Unknown rune: %v", nextRune))
|
|
}
|
|
}
|
|
}
|
|
case STATE_INWORD: // in a regular word
|
|
{
|
|
switch nextRuneType {
|
|
case RUNETOKEN_EOF:
|
|
{
|
|
break SCAN
|
|
}
|
|
case RUNETOKEN_CHAR, RUNETOKEN_COMMENT:
|
|
{
|
|
value = append(value, nextRune)
|
|
}
|
|
case RUNETOKEN_SPACE:
|
|
{
|
|
t.input.UnreadRune()
|
|
break SCAN
|
|
}
|
|
case RUNETOKEN_ESCAPING_QUOTE:
|
|
{
|
|
state = STATE_QUOTED_ESCAPING
|
|
}
|
|
case RUNETOKEN_NONESCAPING_QUOTE:
|
|
{
|
|
state = STATE_QUOTED
|
|
}
|
|
case RUNETOKEN_ESCAPE:
|
|
{
|
|
state = STATE_ESCAPING
|
|
}
|
|
default:
|
|
{
|
|
return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
|
|
}
|
|
}
|
|
}
|
|
case STATE_ESCAPING: // the next rune after an escape character
|
|
{
|
|
switch nextRuneType {
|
|
case RUNETOKEN_EOF:
|
|
{
|
|
err = errors.New("EOF found after escape character")
|
|
break SCAN
|
|
}
|
|
case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
|
|
{
|
|
state = STATE_INWORD
|
|
value = append(value, nextRune)
|
|
}
|
|
default:
|
|
{
|
|
return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
|
|
}
|
|
}
|
|
}
|
|
case STATE_ESCAPING_QUOTED: // the next rune after an escape character, in double quotes
|
|
{
|
|
switch nextRuneType {
|
|
case RUNETOKEN_EOF:
|
|
{
|
|
err = errors.New("EOF found after escape character")
|
|
break SCAN
|
|
}
|
|
case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
|
|
{
|
|
state = STATE_QUOTED_ESCAPING
|
|
value = append(value, nextRune)
|
|
}
|
|
default:
|
|
{
|
|
return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
|
|
}
|
|
}
|
|
}
|
|
case STATE_QUOTED_ESCAPING: // in escaping double quotes
|
|
{
|
|
switch nextRuneType {
|
|
case RUNETOKEN_EOF:
|
|
{
|
|
err = errors.New("EOF found when expecting closing quote.")
|
|
break SCAN
|
|
}
|
|
case RUNETOKEN_CHAR, RUNETOKEN_UNKNOWN, RUNETOKEN_SPACE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_COMMENT:
|
|
{
|
|
value = append(value, nextRune)
|
|
}
|
|
case RUNETOKEN_ESCAPING_QUOTE:
|
|
{
|
|
state = STATE_INWORD
|
|
}
|
|
case RUNETOKEN_ESCAPE:
|
|
{
|
|
state = STATE_ESCAPING_QUOTED
|
|
}
|
|
default:
|
|
{
|
|
return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
|
|
}
|
|
}
|
|
}
|
|
case STATE_QUOTED: // in non-escaping single quotes
|
|
{
|
|
switch nextRuneType {
|
|
case RUNETOKEN_EOF:
|
|
{
|
|
err = errors.New("EOF found when expecting closing quote.")
|
|
break SCAN
|
|
}
|
|
case RUNETOKEN_CHAR, RUNETOKEN_UNKNOWN, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
|
|
{
|
|
value = append(value, nextRune)
|
|
}
|
|
case RUNETOKEN_NONESCAPING_QUOTE:
|
|
{
|
|
state = STATE_INWORD
|
|
}
|
|
default:
|
|
{
|
|
return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
|
|
}
|
|
}
|
|
}
|
|
case STATE_COMMENT:
|
|
{
|
|
switch nextRuneType {
|
|
case RUNETOKEN_EOF:
|
|
{
|
|
break SCAN
|
|
}
|
|
case RUNETOKEN_CHAR, RUNETOKEN_UNKNOWN, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT, RUNETOKEN_NONESCAPING_QUOTE:
|
|
{
|
|
value = append(value, nextRune)
|
|
}
|
|
case RUNETOKEN_SPACE:
|
|
{
|
|
if nextRune == '\n' {
|
|
state = STATE_START
|
|
break SCAN
|
|
} else {
|
|
value = append(value, nextRune)
|
|
}
|
|
}
|
|
default:
|
|
{
|
|
return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
|
|
}
|
|
}
|
|
}
|
|
default:
|
|
{
|
|
panic(fmt.Sprintf("Unexpected state: %v", state))
|
|
}
|
|
}
|
|
}
|
|
token := &Token{
|
|
tokenType: tokenType,
|
|
value: string(value)}
|
|
return token, err
|
|
}
|
|
|
|
/*
|
|
Return the next token in the stream, and an error value. If there are no more
|
|
tokens available, the error value will be io.EOF.
|
|
*/
|
|
func (t *Tokenizer) NextToken() (*Token, error) {
|
|
return t.scanStream()
|
|
}
|
|
|
|
/*
|
|
Split a string in to a slice of strings, based upon shell-style rules for
|
|
quoting, escaping, and spaces.
|
|
*/
|
|
func Split(s string) ([]string, error) {
|
|
l, err := NewLexer(strings.NewReader(s))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
subStrings := []string{}
|
|
for {
|
|
word, err := l.NextWord()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
return subStrings, nil
|
|
}
|
|
return subStrings, err
|
|
}
|
|
subStrings = append(subStrings, word)
|
|
}
|
|
return subStrings, nil
|
|
}
|