123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173 |
- package jsonparser
- import (
- "bytes"
- "unicode/utf8"
- )
// Code-point boundaries used when decoding JSON \uXXXX escapes and UTF-16
// surrogate pairs. See https://tools.ietf.org/html/rfc7159#section-7
const (
	// supplementalPlanesOffset is the first code point above the Basic Multilingual Plane.
	supplementalPlanesOffset = 0x10000
	// highSurrogateOffset is the first UTF-16 high (leading) surrogate.
	highSurrogateOffset = 0xD800
	// lowSurrogateOffset is the first UTF-16 low (trailing) surrogate.
	lowSurrogateOffset = 0xDC00
	// basicMultilingualPlaneReservedOffset is the last code point reserved for UTF-16 surrogates.
	basicMultilingualPlaneReservedOffset = 0xDFFF
	// basicMultilingualPlaneOffset is the last code point of the Basic Multilingual Plane.
	basicMultilingualPlaneOffset = 0xFFFF
)
- func combineUTF16Surrogates(high, low rune) rune {
- return supplementalPlanesOffset + (high-highSurrogateOffset)<<10 + (low - lowSurrogateOffset)
- }
// badHex is the sentinel h2I returns for a byte that is not a hexadecimal digit.
const badHex = -1

// h2I converts one ASCII hexadecimal digit (0-9, a-f, A-F) to its numeric
// value in the range 0..15. Any other byte yields badHex.
func h2I(c byte) int {
	if '0' <= c && c <= '9' {
		return int(c - '0')
	}
	if 'a' <= c && c <= 'f' {
		return int(c-'a') + 10
	}
	if 'A' <= c && c <= 'F' {
		return int(c-'A') + 10
	}
	return badHex
}
- // decodeSingleUnicodeEscape decodes a single \uXXXX escape sequence. The prefix \u is assumed to be present and
- // is not checked.
- // In JSON, these escapes can either come alone or as part of "UTF16 surrogate pairs" that must be handled together.
- // This function only handles one; decodeUnicodeEscape handles this more complex case.
- func decodeSingleUnicodeEscape(in []byte) (rune, bool) {
- // We need at least 6 characters total
- if len(in) < 6 {
- return utf8.RuneError, false
- }
- // Convert hex to decimal
- h1, h2, h3, h4 := h2I(in[2]), h2I(in[3]), h2I(in[4]), h2I(in[5])
- if h1 == badHex || h2 == badHex || h3 == badHex || h4 == badHex {
- return utf8.RuneError, false
- }
- // Compose the hex digits
- return rune(h1<<12 + h2<<8 + h3<<4 + h4), true
- }
- // isUTF16EncodedRune checks if a rune is in the range for non-BMP characters,
- // which is used to describe UTF16 chars.
- // Source: https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
- func isUTF16EncodedRune(r rune) bool {
- return highSurrogateOffset <= r && r <= basicMultilingualPlaneReservedOffset
- }
- func decodeUnicodeEscape(in []byte) (rune, int) {
- if r, ok := decodeSingleUnicodeEscape(in); !ok {
- // Invalid Unicode escape
- return utf8.RuneError, -1
- } else if r <= basicMultilingualPlaneOffset && !isUTF16EncodedRune(r) {
- // Valid Unicode escape in Basic Multilingual Plane
- return r, 6
- } else if r2, ok := decodeSingleUnicodeEscape(in[6:]); !ok { // Note: previous decodeSingleUnicodeEscape success guarantees at least 6 bytes remain
- // UTF16 "high surrogate" without manditory valid following Unicode escape for the "low surrogate"
- return utf8.RuneError, -1
- } else if r2 < lowSurrogateOffset {
- // Invalid UTF16 "low surrogate"
- return utf8.RuneError, -1
- } else {
- // Valid UTF16 surrogate pair
- return combineUTF16Surrogates(r, r2), 12
- }
- }
- // backslashCharEscapeTable: when '\X' is found for some byte X, it is to be replaced with backslashCharEscapeTable[X]
- var backslashCharEscapeTable = [...]byte{
- '"': '"',
- '\\': '\\',
- '/': '/',
- 'b': '\b',
- 'f': '\f',
- 'n': '\n',
- 'r': '\r',
- 't': '\t',
- }
- // unescapeToUTF8 unescapes the single escape sequence starting at 'in' into 'out' and returns
- // how many characters were consumed from 'in' and emitted into 'out'.
- // If a valid escape sequence does not appear as a prefix of 'in', (-1, -1) to signal the error.
- func unescapeToUTF8(in, out []byte) (inLen int, outLen int) {
- if len(in) < 2 || in[0] != '\\' {
- // Invalid escape due to insufficient characters for any escape or no initial backslash
- return -1, -1
- }
- // https://tools.ietf.org/html/rfc7159#section-7
- switch e := in[1]; e {
- case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':
- // Valid basic 2-character escapes (use lookup table)
- out[0] = backslashCharEscapeTable[e]
- return 2, 1
- case 'u':
- // Unicode escape
- if r, inLen := decodeUnicodeEscape(in); inLen == -1 {
- // Invalid Unicode escape
- return -1, -1
- } else {
- // Valid Unicode escape; re-encode as UTF8
- outLen := utf8.EncodeRune(out, r)
- return inLen, outLen
- }
- }
- return -1, -1
- }
- // unescape unescapes the string contained in 'in' and returns it as a slice.
- // If 'in' contains no escaped characters:
- // Returns 'in'.
- // Else, if 'out' is of sufficient capacity (guaranteed if cap(out) >= len(in)):
- // 'out' is used to build the unescaped string and is returned with no extra allocation
- // Else:
- // A new slice is allocated and returned.
- func Unescape(in, out []byte) ([]byte, error) {
- firstBackslash := bytes.IndexByte(in, '\\')
- if firstBackslash == -1 {
- return in, nil
- }
- // Get a buffer of sufficient size (allocate if needed)
- if cap(out) < len(in) {
- out = make([]byte, len(in))
- } else {
- out = out[0:len(in)]
- }
- // Copy the first sequence of unescaped bytes to the output and obtain a buffer pointer (subslice)
- copy(out, in[:firstBackslash])
- in = in[firstBackslash:]
- buf := out[firstBackslash:]
- for len(in) > 0 {
- // Unescape the next escaped character
- inLen, bufLen := unescapeToUTF8(in, buf)
- if inLen == -1 {
- return nil, MalformedStringEscapeError
- }
- in = in[inLen:]
- buf = buf[bufLen:]
- // Copy everything up until the next backslash
- nextBackslash := bytes.IndexByte(in, '\\')
- if nextBackslash == -1 {
- copy(buf, in)
- buf = buf[len(in):]
- break
- } else {
- copy(buf, in[:nextBackslash])
- buf = buf[nextBackslash:]
- in = in[nextBackslash:]
- }
- }
- // Trim the out buffer to the amount that was actually emitted
- return out[:len(out)-len(buf)], nil
- }
|