escape.go 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. package jsonparser
  2. import (
  3. "bytes"
  4. "unicode/utf8"
  5. )
  6. // JSON Unicode stuff: see https://tools.ietf.org/html/rfc7159#section-7
  7. const supplementalPlanesOffset = 0x10000
  8. const highSurrogateOffset = 0xD800
  9. const lowSurrogateOffset = 0xDC00
  10. const basicMultilingualPlaneReservedOffset = 0xDFFF
  11. const basicMultilingualPlaneOffset = 0xFFFF
  12. func combineUTF16Surrogates(high, low rune) rune {
  13. return supplementalPlanesOffset + (high-highSurrogateOffset)<<10 + (low - lowSurrogateOffset)
  14. }
  15. const badHex = -1
  16. func h2I(c byte) int {
  17. switch {
  18. case c >= '0' && c <= '9':
  19. return int(c - '0')
  20. case c >= 'A' && c <= 'F':
  21. return int(c - 'A' + 10)
  22. case c >= 'a' && c <= 'f':
  23. return int(c - 'a' + 10)
  24. }
  25. return badHex
  26. }
  27. // decodeSingleUnicodeEscape decodes a single \uXXXX escape sequence. The prefix \u is assumed to be present and
  28. // is not checked.
  29. // In JSON, these escapes can either come alone or as part of "UTF16 surrogate pairs" that must be handled together.
  30. // This function only handles one; decodeUnicodeEscape handles this more complex case.
  31. func decodeSingleUnicodeEscape(in []byte) (rune, bool) {
  32. // We need at least 6 characters total
  33. if len(in) < 6 {
  34. return utf8.RuneError, false
  35. }
  36. // Convert hex to decimal
  37. h1, h2, h3, h4 := h2I(in[2]), h2I(in[3]), h2I(in[4]), h2I(in[5])
  38. if h1 == badHex || h2 == badHex || h3 == badHex || h4 == badHex {
  39. return utf8.RuneError, false
  40. }
  41. // Compose the hex digits
  42. return rune(h1<<12 + h2<<8 + h3<<4 + h4), true
  43. }
  44. // isUTF16EncodedRune checks if a rune is in the range for non-BMP characters,
  45. // which is used to describe UTF16 chars.
  46. // Source: https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
  47. func isUTF16EncodedRune(r rune) bool {
  48. return highSurrogateOffset <= r && r <= basicMultilingualPlaneReservedOffset
  49. }
  50. func decodeUnicodeEscape(in []byte) (rune, int) {
  51. if r, ok := decodeSingleUnicodeEscape(in); !ok {
  52. // Invalid Unicode escape
  53. return utf8.RuneError, -1
  54. } else if r <= basicMultilingualPlaneOffset && !isUTF16EncodedRune(r) {
  55. // Valid Unicode escape in Basic Multilingual Plane
  56. return r, 6
  57. } else if r2, ok := decodeSingleUnicodeEscape(in[6:]); !ok { // Note: previous decodeSingleUnicodeEscape success guarantees at least 6 bytes remain
  58. // UTF16 "high surrogate" without manditory valid following Unicode escape for the "low surrogate"
  59. return utf8.RuneError, -1
  60. } else if r2 < lowSurrogateOffset {
  61. // Invalid UTF16 "low surrogate"
  62. return utf8.RuneError, -1
  63. } else {
  64. // Valid UTF16 surrogate pair
  65. return combineUTF16Surrogates(r, r2), 12
  66. }
  67. }
  68. // backslashCharEscapeTable: when '\X' is found for some byte X, it is to be replaced with backslashCharEscapeTable[X]
  69. var backslashCharEscapeTable = [...]byte{
  70. '"': '"',
  71. '\\': '\\',
  72. '/': '/',
  73. 'b': '\b',
  74. 'f': '\f',
  75. 'n': '\n',
  76. 'r': '\r',
  77. 't': '\t',
  78. }
  79. // unescapeToUTF8 unescapes the single escape sequence starting at 'in' into 'out' and returns
  80. // how many characters were consumed from 'in' and emitted into 'out'.
  81. // If a valid escape sequence does not appear as a prefix of 'in', (-1, -1) to signal the error.
  82. func unescapeToUTF8(in, out []byte) (inLen int, outLen int) {
  83. if len(in) < 2 || in[0] != '\\' {
  84. // Invalid escape due to insufficient characters for any escape or no initial backslash
  85. return -1, -1
  86. }
  87. // https://tools.ietf.org/html/rfc7159#section-7
  88. switch e := in[1]; e {
  89. case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':
  90. // Valid basic 2-character escapes (use lookup table)
  91. out[0] = backslashCharEscapeTable[e]
  92. return 2, 1
  93. case 'u':
  94. // Unicode escape
  95. if r, inLen := decodeUnicodeEscape(in); inLen == -1 {
  96. // Invalid Unicode escape
  97. return -1, -1
  98. } else {
  99. // Valid Unicode escape; re-encode as UTF8
  100. outLen := utf8.EncodeRune(out, r)
  101. return inLen, outLen
  102. }
  103. }
  104. return -1, -1
  105. }
  106. // unescape unescapes the string contained in 'in' and returns it as a slice.
  107. // If 'in' contains no escaped characters:
  108. // Returns 'in'.
  109. // Else, if 'out' is of sufficient capacity (guaranteed if cap(out) >= len(in)):
  110. // 'out' is used to build the unescaped string and is returned with no extra allocation
  111. // Else:
  112. // A new slice is allocated and returned.
  113. func Unescape(in, out []byte) ([]byte, error) {
  114. firstBackslash := bytes.IndexByte(in, '\\')
  115. if firstBackslash == -1 {
  116. return in, nil
  117. }
  118. // Get a buffer of sufficient size (allocate if needed)
  119. if cap(out) < len(in) {
  120. out = make([]byte, len(in))
  121. } else {
  122. out = out[0:len(in)]
  123. }
  124. // Copy the first sequence of unescaped bytes to the output and obtain a buffer pointer (subslice)
  125. copy(out, in[:firstBackslash])
  126. in = in[firstBackslash:]
  127. buf := out[firstBackslash:]
  128. for len(in) > 0 {
  129. // Unescape the next escaped character
  130. inLen, bufLen := unescapeToUTF8(in, buf)
  131. if inLen == -1 {
  132. return nil, MalformedStringEscapeError
  133. }
  134. in = in[inLen:]
  135. buf = buf[bufLen:]
  136. // Copy everything up until the next backslash
  137. nextBackslash := bytes.IndexByte(in, '\\')
  138. if nextBackslash == -1 {
  139. copy(buf, in)
  140. buf = buf[len(in):]
  141. break
  142. } else {
  143. copy(buf, in[:nextBackslash])
  144. buf = buf[nextBackslash:]
  145. in = in[nextBackslash:]
  146. }
  147. }
  148. // Trim the out buffer to the amount that was actually emitted
  149. return out[:len(out)-len(buf)], nil
  150. }