decode.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665
  1. // Copyright 2018 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package text
  5. import (
  6. "bytes"
  7. "fmt"
  8. "io"
  9. "regexp"
  10. "strconv"
  11. "unicode/utf8"
  12. "google.golang.org/protobuf/internal/errors"
  13. )
  14. // Decoder is a token-based textproto decoder.
  15. type Decoder struct {
  16. // lastCall is last method called, either readCall or peekCall.
  17. // Initial value is readCall.
  18. lastCall call
  19. // lastToken contains the last read token.
  20. lastToken Token
  21. // lastErr contains the last read error.
  22. lastErr error
  23. // openStack is a stack containing the byte characters for MessageOpen and
  24. // ListOpen kinds. The top of stack represents the message or the list that
  25. // the current token is nested in. An empty stack means the current token is
  26. // at the top level message. The characters '{' and '<' both represent the
  27. // MessageOpen kind.
  28. openStack []byte
  29. // orig is used in reporting line and column.
  30. orig []byte
  31. // in contains the unconsumed input.
  32. in []byte
  33. }
  34. // NewDecoder returns a Decoder to read the given []byte.
  35. func NewDecoder(b []byte) *Decoder {
  36. return &Decoder{orig: b, in: b}
  37. }
  38. // ErrUnexpectedEOF means that EOF was encountered in the middle of the input.
  39. var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF)
  40. // call specifies which Decoder method was invoked.
  41. type call uint8
  42. const (
  43. readCall call = iota
  44. peekCall
  45. )
  46. // Peek looks ahead and returns the next token and error without advancing a read.
  47. func (d *Decoder) Peek() (Token, error) {
  48. defer func() { d.lastCall = peekCall }()
  49. if d.lastCall == readCall {
  50. d.lastToken, d.lastErr = d.Read()
  51. }
  52. return d.lastToken, d.lastErr
  53. }
  54. // Read returns the next token.
  55. // It will return an error if there is no valid token.
  56. func (d *Decoder) Read() (Token, error) {
  57. defer func() { d.lastCall = readCall }()
  58. if d.lastCall == peekCall {
  59. return d.lastToken, d.lastErr
  60. }
  61. tok, err := d.parseNext(d.lastToken.kind)
  62. if err != nil {
  63. return Token{}, err
  64. }
  65. switch tok.kind {
  66. case comma, semicolon:
  67. tok, err = d.parseNext(tok.kind)
  68. if err != nil {
  69. return Token{}, err
  70. }
  71. }
  72. d.lastToken = tok
  73. return tok, nil
  74. }
  75. const (
  76. mismatchedFmt = "mismatched close character %q"
  77. unexpectedFmt = "unexpected character %q"
  78. )
  79. // parseNext parses the next Token based on given last kind.
  80. func (d *Decoder) parseNext(lastKind Kind) (Token, error) {
  81. // Trim leading spaces.
  82. d.consume(0)
  83. isEOF := false
  84. if len(d.in) == 0 {
  85. isEOF = true
  86. }
  87. switch lastKind {
  88. case EOF:
  89. return d.consumeToken(EOF, 0, 0), nil
  90. case bof:
  91. // Start of top level message. Next token can be EOF or Name.
  92. if isEOF {
  93. return d.consumeToken(EOF, 0, 0), nil
  94. }
  95. return d.parseFieldName()
  96. case Name:
  97. // Next token can be MessageOpen, ListOpen or Scalar.
  98. if isEOF {
  99. return Token{}, ErrUnexpectedEOF
  100. }
  101. switch ch := d.in[0]; ch {
  102. case '{', '<':
  103. d.pushOpenStack(ch)
  104. return d.consumeToken(MessageOpen, 1, 0), nil
  105. case '[':
  106. d.pushOpenStack(ch)
  107. return d.consumeToken(ListOpen, 1, 0), nil
  108. default:
  109. return d.parseScalar()
  110. }
  111. case Scalar:
  112. openKind, closeCh := d.currentOpenKind()
  113. switch openKind {
  114. case bof:
  115. // Top level message.
  116. // Next token can be EOF, comma, semicolon or Name.
  117. if isEOF {
  118. return d.consumeToken(EOF, 0, 0), nil
  119. }
  120. switch d.in[0] {
  121. case ',':
  122. return d.consumeToken(comma, 1, 0), nil
  123. case ';':
  124. return d.consumeToken(semicolon, 1, 0), nil
  125. default:
  126. return d.parseFieldName()
  127. }
  128. case MessageOpen:
  129. // Next token can be MessageClose, comma, semicolon or Name.
  130. if isEOF {
  131. return Token{}, ErrUnexpectedEOF
  132. }
  133. switch ch := d.in[0]; ch {
  134. case closeCh:
  135. d.popOpenStack()
  136. return d.consumeToken(MessageClose, 1, 0), nil
  137. case otherCloseChar[closeCh]:
  138. return Token{}, d.newSyntaxError(mismatchedFmt, ch)
  139. case ',':
  140. return d.consumeToken(comma, 1, 0), nil
  141. case ';':
  142. return d.consumeToken(semicolon, 1, 0), nil
  143. default:
  144. return d.parseFieldName()
  145. }
  146. case ListOpen:
  147. // Next token can be ListClose or comma.
  148. if isEOF {
  149. return Token{}, ErrUnexpectedEOF
  150. }
  151. switch ch := d.in[0]; ch {
  152. case ']':
  153. d.popOpenStack()
  154. return d.consumeToken(ListClose, 1, 0), nil
  155. case ',':
  156. return d.consumeToken(comma, 1, 0), nil
  157. default:
  158. return Token{}, d.newSyntaxError(unexpectedFmt, ch)
  159. }
  160. }
  161. case MessageOpen:
  162. // Next token can be MessageClose or Name.
  163. if isEOF {
  164. return Token{}, ErrUnexpectedEOF
  165. }
  166. _, closeCh := d.currentOpenKind()
  167. switch ch := d.in[0]; ch {
  168. case closeCh:
  169. d.popOpenStack()
  170. return d.consumeToken(MessageClose, 1, 0), nil
  171. case otherCloseChar[closeCh]:
  172. return Token{}, d.newSyntaxError(mismatchedFmt, ch)
  173. default:
  174. return d.parseFieldName()
  175. }
  176. case MessageClose:
  177. openKind, closeCh := d.currentOpenKind()
  178. switch openKind {
  179. case bof:
  180. // Top level message.
  181. // Next token can be EOF, comma, semicolon or Name.
  182. if isEOF {
  183. return d.consumeToken(EOF, 0, 0), nil
  184. }
  185. switch ch := d.in[0]; ch {
  186. case ',':
  187. return d.consumeToken(comma, 1, 0), nil
  188. case ';':
  189. return d.consumeToken(semicolon, 1, 0), nil
  190. default:
  191. return d.parseFieldName()
  192. }
  193. case MessageOpen:
  194. // Next token can be MessageClose, comma, semicolon or Name.
  195. if isEOF {
  196. return Token{}, ErrUnexpectedEOF
  197. }
  198. switch ch := d.in[0]; ch {
  199. case closeCh:
  200. d.popOpenStack()
  201. return d.consumeToken(MessageClose, 1, 0), nil
  202. case otherCloseChar[closeCh]:
  203. return Token{}, d.newSyntaxError(mismatchedFmt, ch)
  204. case ',':
  205. return d.consumeToken(comma, 1, 0), nil
  206. case ';':
  207. return d.consumeToken(semicolon, 1, 0), nil
  208. default:
  209. return d.parseFieldName()
  210. }
  211. case ListOpen:
  212. // Next token can be ListClose or comma
  213. if isEOF {
  214. return Token{}, ErrUnexpectedEOF
  215. }
  216. switch ch := d.in[0]; ch {
  217. case closeCh:
  218. d.popOpenStack()
  219. return d.consumeToken(ListClose, 1, 0), nil
  220. case ',':
  221. return d.consumeToken(comma, 1, 0), nil
  222. default:
  223. return Token{}, d.newSyntaxError(unexpectedFmt, ch)
  224. }
  225. }
  226. case ListOpen:
  227. // Next token can be ListClose, MessageStart or Scalar.
  228. if isEOF {
  229. return Token{}, ErrUnexpectedEOF
  230. }
  231. switch ch := d.in[0]; ch {
  232. case ']':
  233. d.popOpenStack()
  234. return d.consumeToken(ListClose, 1, 0), nil
  235. case '{', '<':
  236. d.pushOpenStack(ch)
  237. return d.consumeToken(MessageOpen, 1, 0), nil
  238. default:
  239. return d.parseScalar()
  240. }
  241. case ListClose:
  242. openKind, closeCh := d.currentOpenKind()
  243. switch openKind {
  244. case bof:
  245. // Top level message.
  246. // Next token can be EOF, comma, semicolon or Name.
  247. if isEOF {
  248. return d.consumeToken(EOF, 0, 0), nil
  249. }
  250. switch ch := d.in[0]; ch {
  251. case ',':
  252. return d.consumeToken(comma, 1, 0), nil
  253. case ';':
  254. return d.consumeToken(semicolon, 1, 0), nil
  255. default:
  256. return d.parseFieldName()
  257. }
  258. case MessageOpen:
  259. // Next token can be MessageClose, comma, semicolon or Name.
  260. if isEOF {
  261. return Token{}, ErrUnexpectedEOF
  262. }
  263. switch ch := d.in[0]; ch {
  264. case closeCh:
  265. d.popOpenStack()
  266. return d.consumeToken(MessageClose, 1, 0), nil
  267. case otherCloseChar[closeCh]:
  268. return Token{}, d.newSyntaxError(mismatchedFmt, ch)
  269. case ',':
  270. return d.consumeToken(comma, 1, 0), nil
  271. case ';':
  272. return d.consumeToken(semicolon, 1, 0), nil
  273. default:
  274. return d.parseFieldName()
  275. }
  276. default:
  277. // It is not possible to have this case. Let it panic below.
  278. }
  279. case comma, semicolon:
  280. openKind, closeCh := d.currentOpenKind()
  281. switch openKind {
  282. case bof:
  283. // Top level message. Next token can be EOF or Name.
  284. if isEOF {
  285. return d.consumeToken(EOF, 0, 0), nil
  286. }
  287. return d.parseFieldName()
  288. case MessageOpen:
  289. // Next token can be MessageClose or Name.
  290. if isEOF {
  291. return Token{}, ErrUnexpectedEOF
  292. }
  293. switch ch := d.in[0]; ch {
  294. case closeCh:
  295. d.popOpenStack()
  296. return d.consumeToken(MessageClose, 1, 0), nil
  297. case otherCloseChar[closeCh]:
  298. return Token{}, d.newSyntaxError(mismatchedFmt, ch)
  299. default:
  300. return d.parseFieldName()
  301. }
  302. case ListOpen:
  303. if lastKind == semicolon {
  304. // It is not be possible to have this case as logic here
  305. // should not have produced a semicolon Token when inside a
  306. // list. Let it panic below.
  307. break
  308. }
  309. // Next token can be MessageOpen or Scalar.
  310. if isEOF {
  311. return Token{}, ErrUnexpectedEOF
  312. }
  313. switch ch := d.in[0]; ch {
  314. case '{', '<':
  315. d.pushOpenStack(ch)
  316. return d.consumeToken(MessageOpen, 1, 0), nil
  317. default:
  318. return d.parseScalar()
  319. }
  320. }
  321. }
  322. line, column := d.Position(len(d.orig) - len(d.in))
  323. panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind))
  324. }
  325. var otherCloseChar = map[byte]byte{
  326. '}': '>',
  327. '>': '}',
  328. }
  329. // currentOpenKind indicates whether current position is inside a message, list
  330. // or top-level message by returning MessageOpen, ListOpen or bof respectively.
  331. // If the returned kind is either a MessageOpen or ListOpen, it also returns the
  332. // corresponding closing character.
  333. func (d *Decoder) currentOpenKind() (Kind, byte) {
  334. if len(d.openStack) == 0 {
  335. return bof, 0
  336. }
  337. openCh := d.openStack[len(d.openStack)-1]
  338. switch openCh {
  339. case '{':
  340. return MessageOpen, '}'
  341. case '<':
  342. return MessageOpen, '>'
  343. case '[':
  344. return ListOpen, ']'
  345. }
  346. panic(fmt.Sprintf("Decoder: openStack contains invalid byte %s", string(openCh)))
  347. }
  348. func (d *Decoder) pushOpenStack(ch byte) {
  349. d.openStack = append(d.openStack, ch)
  350. }
  351. func (d *Decoder) popOpenStack() {
  352. d.openStack = d.openStack[:len(d.openStack)-1]
  353. }
  354. // parseFieldName parses field name and separator.
  355. func (d *Decoder) parseFieldName() (tok Token, err error) {
  356. defer func() {
  357. if err == nil && d.tryConsumeChar(':') {
  358. tok.attrs |= hasSeparator
  359. }
  360. }()
  361. // Extension or Any type URL.
  362. if d.in[0] == '[' {
  363. return d.parseTypeName()
  364. }
  365. // Identifier.
  366. if size := parseIdent(d.in, false); size > 0 {
  367. return d.consumeToken(Name, size, uint8(IdentName)), nil
  368. }
  369. // Field number. Identify if input is a valid number that is not negative
  370. // and is decimal integer within 32-bit range.
  371. if num := parseNumber(d.in); num.size > 0 {
  372. if !num.neg && num.kind == numDec {
  373. if _, err := strconv.ParseInt(string(d.in[:num.size]), 10, 32); err == nil {
  374. return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil
  375. }
  376. }
  377. return Token{}, d.newSyntaxError("invalid field number: %s", d.in[:num.size])
  378. }
  379. return Token{}, d.newSyntaxError("invalid field name: %s", errRegexp.Find(d.in))
  380. }
  381. // parseTypeName parses Any type URL or extension field name. The name is
  382. // enclosed in [ and ] characters. The C++ parser does not handle many legal URL
  383. // strings. This implementation is more liberal and allows for the pattern
  384. // ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`). Whitespaces and comments are allowed
  385. // in between [ ], '.', '/' and the sub names.
  386. func (d *Decoder) parseTypeName() (Token, error) {
  387. startPos := len(d.orig) - len(d.in)
  388. // Use alias s to advance first in order to use d.in for error handling.
  389. // Caller already checks for [ as first character.
  390. s := consume(d.in[1:], 0)
  391. if len(s) == 0 {
  392. return Token{}, ErrUnexpectedEOF
  393. }
  394. var name []byte
  395. for len(s) > 0 && isTypeNameChar(s[0]) {
  396. name = append(name, s[0])
  397. s = s[1:]
  398. }
  399. s = consume(s, 0)
  400. var closed bool
  401. for len(s) > 0 && !closed {
  402. switch {
  403. case s[0] == ']':
  404. s = s[1:]
  405. closed = true
  406. case s[0] == '/', s[0] == '.':
  407. if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') {
  408. return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
  409. d.orig[startPos:len(d.orig)-len(s)+1])
  410. }
  411. name = append(name, s[0])
  412. s = s[1:]
  413. s = consume(s, 0)
  414. for len(s) > 0 && isTypeNameChar(s[0]) {
  415. name = append(name, s[0])
  416. s = s[1:]
  417. }
  418. s = consume(s, 0)
  419. default:
  420. return Token{}, d.newSyntaxError(
  421. "invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1])
  422. }
  423. }
  424. if !closed {
  425. return Token{}, ErrUnexpectedEOF
  426. }
  427. // First character cannot be '.'. Last character cannot be '.' or '/'.
  428. size := len(name)
  429. if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' {
  430. return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
  431. d.orig[startPos:len(d.orig)-len(s)])
  432. }
  433. d.in = s
  434. endPos := len(d.orig) - len(d.in)
  435. d.consume(0)
  436. return Token{
  437. kind: Name,
  438. attrs: uint8(TypeName),
  439. pos: startPos,
  440. raw: d.orig[startPos:endPos],
  441. str: string(name),
  442. }, nil
  443. }
  444. func isTypeNameChar(b byte) bool {
  445. return (b == '-' || b == '_' ||
  446. ('0' <= b && b <= '9') ||
  447. ('a' <= b && b <= 'z') ||
  448. ('A' <= b && b <= 'Z'))
  449. }
  450. func isWhiteSpace(b byte) bool {
  451. switch b {
  452. case ' ', '\n', '\r', '\t':
  453. return true
  454. default:
  455. return false
  456. }
  457. }
  458. // parseIdent parses an unquoted proto identifier and returns size.
  459. // If allowNeg is true, it allows '-' to be the first character in the
  460. // identifier. This is used when parsing literal values like -infinity, etc.
  461. // Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
  462. func parseIdent(input []byte, allowNeg bool) int {
  463. var size int
  464. s := input
  465. if len(s) == 0 {
  466. return 0
  467. }
  468. if allowNeg && s[0] == '-' {
  469. s = s[1:]
  470. size++
  471. if len(s) == 0 {
  472. return 0
  473. }
  474. }
  475. switch {
  476. case s[0] == '_',
  477. 'a' <= s[0] && s[0] <= 'z',
  478. 'A' <= s[0] && s[0] <= 'Z':
  479. s = s[1:]
  480. size++
  481. default:
  482. return 0
  483. }
  484. for len(s) > 0 && (s[0] == '_' ||
  485. 'a' <= s[0] && s[0] <= 'z' ||
  486. 'A' <= s[0] && s[0] <= 'Z' ||
  487. '0' <= s[0] && s[0] <= '9') {
  488. s = s[1:]
  489. size++
  490. }
  491. if len(s) > 0 && !isDelim(s[0]) {
  492. return 0
  493. }
  494. return size
  495. }
  496. // parseScalar parses for a string, literal or number value.
  497. func (d *Decoder) parseScalar() (Token, error) {
  498. if d.in[0] == '"' || d.in[0] == '\'' {
  499. return d.parseStringValue()
  500. }
  501. if tok, ok := d.parseLiteralValue(); ok {
  502. return tok, nil
  503. }
  504. if tok, ok := d.parseNumberValue(); ok {
  505. return tok, nil
  506. }
  507. return Token{}, d.newSyntaxError("invalid scalar value: %s", errRegexp.Find(d.in))
  508. }
  509. // parseLiteralValue parses a literal value. A literal value is used for
  510. // bools, special floats and enums. This function simply identifies that the
  511. // field value is a literal.
  512. func (d *Decoder) parseLiteralValue() (Token, bool) {
  513. size := parseIdent(d.in, true)
  514. if size == 0 {
  515. return Token{}, false
  516. }
  517. return d.consumeToken(Scalar, size, literalValue), true
  518. }
  519. // consumeToken constructs a Token for given Kind from d.in and consumes given
  520. // size-length from it.
  521. func (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token {
  522. // Important to compute raw and pos before consuming.
  523. tok := Token{
  524. kind: kind,
  525. attrs: attrs,
  526. pos: len(d.orig) - len(d.in),
  527. raw: d.in[:size],
  528. }
  529. d.consume(size)
  530. return tok
  531. }
  532. // newSyntaxError returns a syntax error with line and column information for
  533. // current position.
  534. func (d *Decoder) newSyntaxError(f string, x ...interface{}) error {
  535. e := errors.New(f, x...)
  536. line, column := d.Position(len(d.orig) - len(d.in))
  537. return errors.New("syntax error (line %d:%d): %v", line, column, e)
  538. }
  539. // Position returns line and column number of given index of the original input.
  540. // It will panic if index is out of range.
  541. func (d *Decoder) Position(idx int) (line int, column int) {
  542. b := d.orig[:idx]
  543. line = bytes.Count(b, []byte("\n")) + 1
  544. if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
  545. b = b[i+1:]
  546. }
  547. column = utf8.RuneCount(b) + 1 // ignore multi-rune characters
  548. return line, column
  549. }
  550. func (d *Decoder) tryConsumeChar(c byte) bool {
  551. if len(d.in) > 0 && d.in[0] == c {
  552. d.consume(1)
  553. return true
  554. }
  555. return false
  556. }
  557. // consume consumes n bytes of input and any subsequent whitespace or comments.
  558. func (d *Decoder) consume(n int) {
  559. d.in = consume(d.in, n)
  560. return
  561. }
  562. // consume consumes n bytes of input and any subsequent whitespace or comments.
  563. func consume(b []byte, n int) []byte {
  564. b = b[n:]
  565. for len(b) > 0 {
  566. switch b[0] {
  567. case ' ', '\n', '\r', '\t':
  568. b = b[1:]
  569. case '#':
  570. if i := bytes.IndexByte(b, '\n'); i >= 0 {
  571. b = b[i+len("\n"):]
  572. } else {
  573. b = nil
  574. }
  575. default:
  576. return b
  577. }
  578. }
  579. return b
  580. }
  581. // Any sequence that looks like a non-delimiter (for error reporting).
  582. var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)
  583. // isDelim returns true if given byte is a delimiter character.
  584. func isDelim(c byte) bool {
  585. return !(c == '-' || c == '+' || c == '.' || c == '_' ||
  586. ('a' <= c && c <= 'z') ||
  587. ('A' <= c && c <= 'Z') ||
  588. ('0' <= c && c <= '9'))
  589. }