lexer.go

package jmespath

import (
	"bytes"
	"encoding/json"
	"fmt"
	"strconv"
	"strings"
	"unicode/utf8"
)

type token struct {
	tokenType tokType
	value     string
	position  int
	length    int
}

type tokType int

const eof = -1

// Lexer contains information about the expression being tokenized.
type Lexer struct {
	expression string       // The expression provided by the user.
	currentPos int          // The current position in the string.
	lastWidth  int          // The width of the most recently read rune.
	buf        bytes.Buffer // Internal buffer used for building up values.
}

// SyntaxError is the main error used whenever a lexing or parsing error occurs.
type SyntaxError struct {
	msg        string // Error message displayed to user
	Expression string // Expression that generated a SyntaxError
	Offset     int    // The location in the string where the error occurred
}

func (e SyntaxError) Error() string {
	// In the future, it would be good to underline the specific
	// location where the error occurred.
	return "SyntaxError: " + e.msg
}

// HighlightLocation will show where the syntax error occurred.
// It will place a "^" character on a line below the expression
// at the point where the syntax error occurred.
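//
// For example, with Expression "foo.1bar" and Offset 4, the result is:
//
//	foo.1bar
//	    ^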
func (e SyntaxError) HighlightLocation() string {
	return e.Expression + "\n" + strings.Repeat(" ", e.Offset) + "^"
}

//go:generate stringer -type=tokType
const (
	tUnknown tokType = iota
	tStar
	tDot
	tFilter
	tFlatten
	tLparen
	tRparen
	tLbracket
	tRbracket
	tLbrace
	tRbrace
	tOr
	tPipe
	tNumber
	tUnquotedIdentifier
	tQuotedIdentifier
	tComma
	tColon
	tLT
	tLTE
	tGT
	tGTE
	tEQ
	tNE
	tJSONLiteral
	tStringLiteral
	tCurrent
	tExpref
	tAnd
	tNot
	tEOF
)

var basicTokens = map[rune]tokType{
	'.': tDot,
	'*': tStar,
	',': tComma,
	':': tColon,
	'{': tLbrace,
	'}': tRbrace,
	']': tRbracket, // tLbracket not included because it could be "[]"
	'(': tLparen,
	')': tRparen,
	'@': tCurrent,
}

// Bit mask for [a-zA-Z_] shifted down 64 bits to fit in a single uint64.
// When using this bitmask just be sure to shift the rune down 64 bits
// before checking against identifierStartBits.
const identifierStartBits uint64 = 576460745995190270

// Bit mask for [0-9a-zA-Z_], 128 bits -> 2 uint64s.
var identifierTrailingBits = [2]uint64{287948901175001088, 576460745995190270}
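
// For example, 'a' (97) maps to bit 97-64 = 33 of identifierStartBits,
// which is set, while '0' (48) is looked up as bit 48 of
// identifierTrailingBits[0].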

var whiteSpace = map[rune]bool{
	' ': true, '\t': true, '\n': true, '\r': true,
}

func (t token) String() string {
	return fmt.Sprintf("Token{%+v, %s, %d, %d}",
		t.tokenType, t.value, t.position, t.length)
}

// NewLexer creates a new JMESPath lexer.
func NewLexer() *Lexer {
	lexer := Lexer{}
	return &lexer
}
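
// next returns the next rune in the expression, advancing currentPos by
// the rune's width, or eof once the end of the input is reached.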
func (lexer *Lexer) next() rune {
	if lexer.currentPos >= len(lexer.expression) {
		lexer.lastWidth = 0
		return eof
	}
	r, w := utf8.DecodeRuneInString(lexer.expression[lexer.currentPos:])
	lexer.lastWidth = w
	lexer.currentPos += w
	return r
}
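
// back rewinds the lexer by the width of the most recently read rune.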
func (lexer *Lexer) back() {
	lexer.currentPos -= lexer.lastWidth
}
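
// peek returns the next rune without consuming it.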
func (lexer *Lexer) peek() rune {
	t := lexer.next()
	lexer.back()
	return t
}

// tokenize takes an expression and returns corresponding tokens.
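// For example, "foo.bar" tokenizes to tUnquotedIdentifier ("foo"), tDot,
// tUnquotedIdentifier ("bar"), and a trailing tEOF token.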
func (lexer *Lexer) tokenize(expression string) ([]token, error) {
	var tokens []token
	lexer.expression = expression
	lexer.currentPos = 0
	lexer.lastWidth = 0
loop:
	for {
		r := lexer.next()
		if identifierStartBits&(1<<(uint64(r)-64)) > 0 {
			t := lexer.consumeUnquotedIdentifier()
			tokens = append(tokens, t)
		} else if val, ok := basicTokens[r]; ok {
			// Basic single char token.
			t := token{
				tokenType: val,
				value:     string(r),
				position:  lexer.currentPos - lexer.lastWidth,
				length:    1,
			}
			tokens = append(tokens, t)
		} else if r == '-' || (r >= '0' && r <= '9') {
			t := lexer.consumeNumber()
			tokens = append(tokens, t)
		} else if r == '[' {
			t := lexer.consumeLBracket()
			tokens = append(tokens, t)
		} else if r == '"' {
			t, err := lexer.consumeQuotedIdentifier()
			if err != nil {
				return tokens, err
			}
			tokens = append(tokens, t)
		} else if r == '\'' {
			t, err := lexer.consumeRawStringLiteral()
			if err != nil {
				return tokens, err
			}
			tokens = append(tokens, t)
		} else if r == '`' {
			t, err := lexer.consumeLiteral()
			if err != nil {
				return tokens, err
			}
			tokens = append(tokens, t)
		} else if r == '|' {
			t := lexer.matchOrElse(r, '|', tOr, tPipe)
			tokens = append(tokens, t)
		} else if r == '<' {
			t := lexer.matchOrElse(r, '=', tLTE, tLT)
			tokens = append(tokens, t)
		} else if r == '>' {
			t := lexer.matchOrElse(r, '=', tGTE, tGT)
			tokens = append(tokens, t)
		} else if r == '!' {
			t := lexer.matchOrElse(r, '=', tNE, tNot)
			tokens = append(tokens, t)
		} else if r == '=' {
			t := lexer.matchOrElse(r, '=', tEQ, tUnknown)
			tokens = append(tokens, t)
		} else if r == '&' {
			t := lexer.matchOrElse(r, '&', tAnd, tExpref)
			tokens = append(tokens, t)
		} else if r == eof {
			break loop
		} else if _, ok := whiteSpace[r]; ok {
			// Ignore whitespace
		} else {
			return tokens, lexer.syntaxError(fmt.Sprintf("Unknown char: %s", strconv.QuoteRuneToASCII(r)))
		}
	}
	tokens = append(tokens, token{tEOF, "", len(lexer.expression), 0})
	return tokens, nil
}

// Consume characters until the ending rune "end" is reached.
// If the end of the expression is reached before seeing the
// terminating rune "end", then an error is returned.
// If no error occurs then the matching substring is returned.
// The returned string will not include the ending rune.
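//
// For example, with the lexer positioned just past the opening backtick
// in `foo\`bar`, consumeUntil('`') returns foo\`bar with the backslash
// still in place; consumeLiteral is what rewrites \` to a plain backtick.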
func (lexer *Lexer) consumeUntil(end rune) (string, error) {
	start := lexer.currentPos
	current := lexer.next()
	for current != end && current != eof {
		if current == '\\' && lexer.peek() != eof {
			lexer.next()
		}
		current = lexer.next()
	}
	if lexer.lastWidth == 0 {
		// Then we hit an EOF so we never reached the closing
		// delimiter.
		return "", SyntaxError{
			msg:        "Unclosed delimiter: " + string(end),
			Expression: lexer.expression,
			Offset:     len(lexer.expression),
		}
	}
	return lexer.expression[start : lexer.currentPos-lexer.lastWidth], nil
}
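
// consumeLiteral consumes a backtick-delimited JSON literal such as
// `{"a": 1}`, rewriting any escaped \` back to a plain backtick.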
func (lexer *Lexer) consumeLiteral() (token, error) {
	start := lexer.currentPos
	value, err := lexer.consumeUntil('`')
	if err != nil {
		return token{}, err
	}
	value = strings.Replace(value, "\\`", "`", -1)
	return token{
		tokenType: tJSONLiteral,
		value:     value,
		position:  start,
		length:    len(value),
	}, nil
}
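
// consumeRawStringLiteral consumes a single-quoted raw string literal,
// e.g. the 'foo' in "[?name == 'foo']", unescaping \' sequences as it goes.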
func (lexer *Lexer) consumeRawStringLiteral() (token, error) {
	start := lexer.currentPos
	currentIndex := start
	current := lexer.next()
	for current != '\'' && lexer.peek() != eof {
		if current == '\\' && lexer.peek() == '\'' {
			chunk := lexer.expression[currentIndex : lexer.currentPos-1]
			lexer.buf.WriteString(chunk)
			lexer.buf.WriteString("'")
			lexer.next()
			currentIndex = lexer.currentPos
		}
		current = lexer.next()
	}
	if lexer.lastWidth == 0 {
		// Then we hit an EOF so we never reached the closing
		// delimiter.
		return token{}, SyntaxError{
			msg:        "Unclosed delimiter: '",
			Expression: lexer.expression,
			Offset:     len(lexer.expression),
		}
	}
	if currentIndex < lexer.currentPos {
		lexer.buf.WriteString(lexer.expression[currentIndex : lexer.currentPos-1])
	}
	value := lexer.buf.String()
	// Reset the buffer so it can be reused again.
	lexer.buf.Reset()
	return token{
		tokenType: tStringLiteral,
		value:     value,
		position:  start,
		length:    len(value),
	}, nil
}
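
// syntaxError builds a SyntaxError pointing at the rune the lexer has
// just consumed.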
func (lexer *Lexer) syntaxError(msg string) SyntaxError {
	return SyntaxError{
		msg:        msg,
		Expression: lexer.expression,
		Offset:     lexer.currentPos - 1,
	}
}

// Checks for a two char token, otherwise matches a single character
// token. This is used whenever a two char token overlaps a single
// char token, e.g. "||" -> tOr, "|" -> tPipe.
func (lexer *Lexer) matchOrElse(first rune, second rune, matchedType tokType, singleCharType tokType) token {
	start := lexer.currentPos - lexer.lastWidth
	nextRune := lexer.next()
	var t token
	if nextRune == second {
		t = token{
			tokenType: matchedType,
			value:     string(first) + string(second),
			position:  start,
			length:    2,
		}
	} else {
		lexer.back()
		t = token{
			tokenType: singleCharType,
			value:     string(first),
			position:  start,
			length:    1,
		}
	}
	return t
}

func (lexer *Lexer) consumeLBracket() token {
	// There are three options here:
	// 1. A filter expression "[?"
	// 2. A flatten operator "[]"
	// 3. A bare lbracket "["
	start := lexer.currentPos - lexer.lastWidth
	nextRune := lexer.next()
	var t token
	if nextRune == '?' {
		t = token{
			tokenType: tFilter,
			value:     "[?",
			position:  start,
			length:    2,
		}
	} else if nextRune == ']' {
		t = token{
			tokenType: tFlatten,
			value:     "[]",
			position:  start,
			length:    2,
		}
	} else {
		t = token{
			tokenType: tLbracket,
			value:     "[",
			position:  start,
			length:    1,
		}
		lexer.back()
	}
	return t
}
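
// consumeQuotedIdentifier consumes a double-quoted identifier, decoding
// any JSON escape sequences (e.g. "foo\"bar") via json.Unmarshal.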
func (lexer *Lexer) consumeQuotedIdentifier() (token, error) {
	start := lexer.currentPos
	value, err := lexer.consumeUntil('"')
	if err != nil {
		return token{}, err
	}
	var decoded string
	asJSON := []byte("\"" + value + "\"")
	if err := json.Unmarshal(asJSON, &decoded); err != nil {
		return token{}, err
	}
	return token{
		tokenType: tQuotedIdentifier,
		value:     decoded,
		position:  start - 1,
		length:    len(decoded),
	}, nil
}
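
// consumeUnquotedIdentifier consumes an identifier of the form
// [a-zA-Z_][0-9a-zA-Z_]*, e.g. the "foo" in "foo.bar".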
func (lexer *Lexer) consumeUnquotedIdentifier() token {
	// Consume runes until we reach the end of an unquoted
	// identifier.
	start := lexer.currentPos - lexer.lastWidth
	for {
		r := lexer.next()
		if r < 0 || r >= 128 || identifierTrailingBits[uint64(r)/64]&(1<<(uint64(r)%64)) == 0 {
			lexer.back()
			break
		}
	}
	value := lexer.expression[start:lexer.currentPos]
	return token{
		tokenType: tUnquotedIdentifier,
		value:     value,
		position:  start,
		length:    lexer.currentPos - start,
	}
}

func (lexer *Lexer) consumeNumber() token {
	// Consume runes until we reach something that's not a number.
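	// Note: start backs up by lastWidth to include the rune (a digit or
	// a leading '-') that tokenize consumed before calling this method.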
	start := lexer.currentPos - lexer.lastWidth
	for {
		r := lexer.next()
		if r < '0' || r > '9' {
			lexer.back()
			break
		}
	}
	value := lexer.expression[start:lexer.currentPos]
	return token{
		tokenType: tNumber,
		value:     value,
		position:  start,
		length:    lexer.currentPos - start,
	}
}