decode_string.go 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. // Copyright 2018 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package text
  5. import (
  6. "bytes"
  7. "strconv"
  8. "strings"
  9. "unicode"
  10. "unicode/utf16"
  11. "unicode/utf8"
  12. "google.golang.org/protobuf/internal/strs"
  13. )
  14. // parseStringValue parses string field token.
  15. // This differs from parseString since the text format allows
  16. // multiple back-to-back string literals where they are semantically treated
  17. // as a single large string with all values concatenated.
  18. //
  19. // E.g., `"foo" "bar" "baz"` => "foobarbaz"
  20. func (d *Decoder) parseStringValue() (Token, error) {
  21. // Note that the ending quote is sufficient to unambiguously mark the end
  22. // of a string. Thus, the text grammar does not require intervening
  23. // whitespace or control characters in-between strings.
  24. // Thus, the following is valid:
  25. // `"foo"'bar'"baz"` => "foobarbaz"
  26. in0 := d.in
  27. var ss []string
  28. for len(d.in) > 0 && (d.in[0] == '"' || d.in[0] == '\'') {
  29. s, err := d.parseString()
  30. if err != nil {
  31. return Token{}, err
  32. }
  33. ss = append(ss, s)
  34. }
  35. // d.in already points to the end of the value at this point.
  36. return Token{
  37. kind: Scalar,
  38. attrs: stringValue,
  39. pos: len(d.orig) - len(in0),
  40. raw: in0[:len(in0)-len(d.in)],
  41. str: strings.Join(ss, ""),
  42. }, nil
  43. }
  44. // parseString parses a string value enclosed in " or '.
  45. func (d *Decoder) parseString() (string, error) {
  46. in := d.in
  47. if len(in) == 0 {
  48. return "", ErrUnexpectedEOF
  49. }
  50. quote := in[0]
  51. in = in[1:]
  52. i := indexNeedEscapeInBytes(in)
  53. in, out := in[i:], in[:i:i] // set cap to prevent mutations
  54. for len(in) > 0 {
  55. switch r, n := utf8.DecodeRune(in); {
  56. case r == utf8.RuneError && n == 1:
  57. return "", d.newSyntaxError("invalid UTF-8 detected")
  58. case r == 0 || r == '\n':
  59. return "", d.newSyntaxError("invalid character %q in string", r)
  60. case r == rune(quote):
  61. in = in[1:]
  62. d.consume(len(d.in) - len(in))
  63. return string(out), nil
  64. case r == '\\':
  65. if len(in) < 2 {
  66. return "", ErrUnexpectedEOF
  67. }
  68. switch r := in[1]; r {
  69. case '"', '\'', '\\', '?':
  70. in, out = in[2:], append(out, r)
  71. case 'a':
  72. in, out = in[2:], append(out, '\a')
  73. case 'b':
  74. in, out = in[2:], append(out, '\b')
  75. case 'n':
  76. in, out = in[2:], append(out, '\n')
  77. case 'r':
  78. in, out = in[2:], append(out, '\r')
  79. case 't':
  80. in, out = in[2:], append(out, '\t')
  81. case 'v':
  82. in, out = in[2:], append(out, '\v')
  83. case 'f':
  84. in, out = in[2:], append(out, '\f')
  85. case '0', '1', '2', '3', '4', '5', '6', '7':
  86. // One, two, or three octal characters.
  87. n := len(in[1:]) - len(bytes.TrimLeft(in[1:], "01234567"))
  88. if n > 3 {
  89. n = 3
  90. }
  91. v, err := strconv.ParseUint(string(in[1:1+n]), 8, 8)
  92. if err != nil {
  93. return "", d.newSyntaxError("invalid octal escape code %q in string", in[:1+n])
  94. }
  95. in, out = in[1+n:], append(out, byte(v))
  96. case 'x':
  97. // One or two hexadecimal characters.
  98. n := len(in[2:]) - len(bytes.TrimLeft(in[2:], "0123456789abcdefABCDEF"))
  99. if n > 2 {
  100. n = 2
  101. }
  102. v, err := strconv.ParseUint(string(in[2:2+n]), 16, 8)
  103. if err != nil {
  104. return "", d.newSyntaxError("invalid hex escape code %q in string", in[:2+n])
  105. }
  106. in, out = in[2+n:], append(out, byte(v))
  107. case 'u', 'U':
  108. // Four or eight hexadecimal characters
  109. n := 6
  110. if r == 'U' {
  111. n = 10
  112. }
  113. if len(in) < n {
  114. return "", ErrUnexpectedEOF
  115. }
  116. v, err := strconv.ParseUint(string(in[2:n]), 16, 32)
  117. if utf8.MaxRune < v || err != nil {
  118. return "", d.newSyntaxError("invalid Unicode escape code %q in string", in[:n])
  119. }
  120. in = in[n:]
  121. r := rune(v)
  122. if utf16.IsSurrogate(r) {
  123. if len(in) < 6 {
  124. return "", ErrUnexpectedEOF
  125. }
  126. v, err := strconv.ParseUint(string(in[2:6]), 16, 16)
  127. r = utf16.DecodeRune(r, rune(v))
  128. if in[0] != '\\' || in[1] != 'u' || r == unicode.ReplacementChar || err != nil {
  129. return "", d.newSyntaxError("invalid Unicode escape code %q in string", in[:6])
  130. }
  131. in = in[6:]
  132. }
  133. out = append(out, string(r)...)
  134. default:
  135. return "", d.newSyntaxError("invalid escape code %q in string", in[:2])
  136. }
  137. default:
  138. i := indexNeedEscapeInBytes(in[n:])
  139. in, out = in[n+i:], append(out, in[:n+i]...)
  140. }
  141. }
  142. return "", ErrUnexpectedEOF
  143. }
  144. // indexNeedEscapeInString returns the index of the character that needs
  145. // escaping. If no characters need escaping, this returns the input length.
  146. func indexNeedEscapeInBytes(b []byte) int { return indexNeedEscapeInString(strs.UnsafeString(b)) }
  147. // UnmarshalString returns an unescaped string given a textproto string value.
  148. // String value needs to contain single or double quotes. This is only used by
  149. // internal/encoding/defval package for unmarshaling bytes.
  150. func UnmarshalString(s string) (string, error) {
  151. d := NewDecoder([]byte(s))
  152. return d.parseString()
  153. }