unmarshal_text.go 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
  1. package decoder
  2. import (
  3. "bytes"
  4. "encoding"
  5. "unicode"
  6. "unicode/utf16"
  7. "unicode/utf8"
  8. "unsafe"
  9. "github.com/goccy/go-json/internal/errors"
  10. "github.com/goccy/go-json/internal/runtime"
  11. )
  12. type unmarshalTextDecoder struct {
  13. typ *runtime.Type
  14. structName string
  15. fieldName string
  16. }
  17. func newUnmarshalTextDecoder(typ *runtime.Type, structName, fieldName string) *unmarshalTextDecoder {
  18. return &unmarshalTextDecoder{
  19. typ: typ,
  20. structName: structName,
  21. fieldName: fieldName,
  22. }
  23. }
  24. func (d *unmarshalTextDecoder) annotateError(cursor int64, err error) {
  25. switch e := err.(type) {
  26. case *errors.UnmarshalTypeError:
  27. e.Struct = d.structName
  28. e.Field = d.fieldName
  29. case *errors.SyntaxError:
  30. e.Offset = cursor
  31. }
  32. }
  33. var (
  34. nullbytes = []byte(`null`)
  35. )
  36. func (d *unmarshalTextDecoder) DecodeStream(s *Stream, depth int64, p unsafe.Pointer) error {
  37. s.skipWhiteSpace()
  38. start := s.cursor
  39. if err := s.skipValue(depth); err != nil {
  40. return err
  41. }
  42. src := s.buf[start:s.cursor]
  43. if len(src) > 0 {
  44. switch src[0] {
  45. case '[':
  46. return &errors.UnmarshalTypeError{
  47. Value: "array",
  48. Type: runtime.RType2Type(d.typ),
  49. Offset: s.totalOffset(),
  50. }
  51. case '{':
  52. return &errors.UnmarshalTypeError{
  53. Value: "object",
  54. Type: runtime.RType2Type(d.typ),
  55. Offset: s.totalOffset(),
  56. }
  57. case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
  58. return &errors.UnmarshalTypeError{
  59. Value: "number",
  60. Type: runtime.RType2Type(d.typ),
  61. Offset: s.totalOffset(),
  62. }
  63. case 'n':
  64. if bytes.Equal(src, nullbytes) {
  65. *(*unsafe.Pointer)(p) = nil
  66. return nil
  67. }
  68. }
  69. }
  70. dst := make([]byte, len(src))
  71. copy(dst, src)
  72. if b, ok := unquoteBytes(dst); ok {
  73. dst = b
  74. }
  75. v := *(*interface{})(unsafe.Pointer(&emptyInterface{
  76. typ: d.typ,
  77. ptr: p,
  78. }))
  79. if err := v.(encoding.TextUnmarshaler).UnmarshalText(dst); err != nil {
  80. d.annotateError(s.cursor, err)
  81. return err
  82. }
  83. return nil
  84. }
  85. func (d *unmarshalTextDecoder) Decode(ctx *RuntimeContext, cursor, depth int64, p unsafe.Pointer) (int64, error) {
  86. buf := ctx.Buf
  87. cursor = skipWhiteSpace(buf, cursor)
  88. start := cursor
  89. end, err := skipValue(buf, cursor, depth)
  90. if err != nil {
  91. return 0, err
  92. }
  93. src := buf[start:end]
  94. if len(src) > 0 {
  95. switch src[0] {
  96. case '[':
  97. return 0, &errors.UnmarshalTypeError{
  98. Value: "array",
  99. Type: runtime.RType2Type(d.typ),
  100. Offset: start,
  101. }
  102. case '{':
  103. return 0, &errors.UnmarshalTypeError{
  104. Value: "object",
  105. Type: runtime.RType2Type(d.typ),
  106. Offset: start,
  107. }
  108. case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
  109. return 0, &errors.UnmarshalTypeError{
  110. Value: "number",
  111. Type: runtime.RType2Type(d.typ),
  112. Offset: start,
  113. }
  114. case 'n':
  115. if bytes.Equal(src, nullbytes) {
  116. *(*unsafe.Pointer)(p) = nil
  117. return end, nil
  118. }
  119. }
  120. }
  121. if s, ok := unquoteBytes(src); ok {
  122. src = s
  123. }
  124. v := *(*interface{})(unsafe.Pointer(&emptyInterface{
  125. typ: d.typ,
  126. ptr: *(*unsafe.Pointer)(unsafe.Pointer(&p)),
  127. }))
  128. if err := v.(encoding.TextUnmarshaler).UnmarshalText(src); err != nil {
  129. d.annotateError(cursor, err)
  130. return 0, err
  131. }
  132. return end, nil
  133. }
  134. func unquoteBytes(s []byte) (t []byte, ok bool) {
  135. length := len(s)
  136. if length < 2 || s[0] != '"' || s[length-1] != '"' {
  137. return
  138. }
  139. s = s[1 : length-1]
  140. length -= 2
  141. // Check for unusual characters. If there are none,
  142. // then no unquoting is needed, so return a slice of the
  143. // original bytes.
  144. r := 0
  145. for r < length {
  146. c := s[r]
  147. if c == '\\' || c == '"' || c < ' ' {
  148. break
  149. }
  150. if c < utf8.RuneSelf {
  151. r++
  152. continue
  153. }
  154. rr, size := utf8.DecodeRune(s[r:])
  155. if rr == utf8.RuneError && size == 1 {
  156. break
  157. }
  158. r += size
  159. }
  160. if r == length {
  161. return s, true
  162. }
  163. b := make([]byte, length+2*utf8.UTFMax)
  164. w := copy(b, s[0:r])
  165. for r < length {
  166. // Out of room? Can only happen if s is full of
  167. // malformed UTF-8 and we're replacing each
  168. // byte with RuneError.
  169. if w >= len(b)-2*utf8.UTFMax {
  170. nb := make([]byte, (len(b)+utf8.UTFMax)*2)
  171. copy(nb, b[0:w])
  172. b = nb
  173. }
  174. switch c := s[r]; {
  175. case c == '\\':
  176. r++
  177. if r >= length {
  178. return
  179. }
  180. switch s[r] {
  181. default:
  182. return
  183. case '"', '\\', '/', '\'':
  184. b[w] = s[r]
  185. r++
  186. w++
  187. case 'b':
  188. b[w] = '\b'
  189. r++
  190. w++
  191. case 'f':
  192. b[w] = '\f'
  193. r++
  194. w++
  195. case 'n':
  196. b[w] = '\n'
  197. r++
  198. w++
  199. case 'r':
  200. b[w] = '\r'
  201. r++
  202. w++
  203. case 't':
  204. b[w] = '\t'
  205. r++
  206. w++
  207. case 'u':
  208. r--
  209. rr := getu4(s[r:])
  210. if rr < 0 {
  211. return
  212. }
  213. r += 6
  214. if utf16.IsSurrogate(rr) {
  215. rr1 := getu4(s[r:])
  216. if dec := utf16.DecodeRune(rr, rr1); dec != unicode.ReplacementChar {
  217. // A valid pair; consume.
  218. r += 6
  219. w += utf8.EncodeRune(b[w:], dec)
  220. break
  221. }
  222. // Invalid surrogate; fall back to replacement rune.
  223. rr = unicode.ReplacementChar
  224. }
  225. w += utf8.EncodeRune(b[w:], rr)
  226. }
  227. // Quote, control characters are invalid.
  228. case c == '"', c < ' ':
  229. return
  230. // ASCII
  231. case c < utf8.RuneSelf:
  232. b[w] = c
  233. r++
  234. w++
  235. // Coerce to well-formed UTF-8.
  236. default:
  237. rr, size := utf8.DecodeRune(s[r:])
  238. r += size
  239. w += utf8.EncodeRune(b[w:], rr)
  240. }
  241. }
  242. return b[0:w], true
  243. }
  244. func getu4(s []byte) rune {
  245. if len(s) < 6 || s[0] != '\\' || s[1] != 'u' {
  246. return -1
  247. }
  248. var r rune
  249. for _, c := range s[2:6] {
  250. switch {
  251. case '0' <= c && c <= '9':
  252. c = c - '0'
  253. case 'a' <= c && c <= 'f':
  254. c = c - 'a' + 10
  255. case 'A' <= c && c <= 'F':
  256. c = c - 'A' + 10
  257. default:
  258. return -1
  259. }
  260. r = r*16 + rune(c)
  261. }
  262. return r
  263. }