string.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459
  1. package encoder
  2. import (
  3. "math/bits"
  4. "reflect"
  5. "unsafe"
  6. )
  7. const (
  8. lsb = 0x0101010101010101
  9. msb = 0x8080808080808080
  10. )
  11. var hex = "0123456789abcdef"
  12. //nolint:govet
  13. func stringToUint64Slice(s string) []uint64 {
  14. return *(*[]uint64)(unsafe.Pointer(&reflect.SliceHeader{
  15. Data: ((*reflect.StringHeader)(unsafe.Pointer(&s))).Data,
  16. Len: len(s) / 8,
  17. Cap: len(s) / 8,
  18. }))
  19. }
  20. func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte {
  21. if ctx.Option.Flag&HTMLEscapeOption != 0 {
  22. if ctx.Option.Flag&NormalizeUTF8Option != 0 {
  23. return appendNormalizedHTMLString(buf, s)
  24. }
  25. return appendHTMLString(buf, s)
  26. }
  27. if ctx.Option.Flag&NormalizeUTF8Option != 0 {
  28. return appendNormalizedString(buf, s)
  29. }
  30. return appendString(buf, s)
  31. }
  32. func appendNormalizedHTMLString(buf []byte, s string) []byte {
  33. valLen := len(s)
  34. if valLen == 0 {
  35. return append(buf, `""`...)
  36. }
  37. buf = append(buf, '"')
  38. var (
  39. i, j int
  40. )
  41. if valLen >= 8 {
  42. chunks := stringToUint64Slice(s)
  43. for _, n := range chunks {
  44. // combine masks before checking for the MSB of each byte. We include
  45. // `n` in the mask to check whether any of the *input* byte MSBs were
  46. // set (i.e. the byte was outside the ASCII range).
  47. mask := n | (n - (lsb * 0x20)) |
  48. ((n ^ (lsb * '"')) - lsb) |
  49. ((n ^ (lsb * '\\')) - lsb) |
  50. ((n ^ (lsb * '<')) - lsb) |
  51. ((n ^ (lsb * '>')) - lsb) |
  52. ((n ^ (lsb * '&')) - lsb)
  53. if (mask & msb) != 0 {
  54. j = bits.TrailingZeros64(mask&msb) / 8
  55. goto ESCAPE_END
  56. }
  57. }
  58. for i := len(chunks) * 8; i < valLen; i++ {
  59. if needEscapeHTMLNormalizeUTF8[s[i]] {
  60. j = i
  61. goto ESCAPE_END
  62. }
  63. }
  64. // no found any escape characters.
  65. return append(append(buf, s...), '"')
  66. }
  67. ESCAPE_END:
  68. for j < valLen {
  69. c := s[j]
  70. if !needEscapeHTMLNormalizeUTF8[c] {
  71. // fast path: most of the time, printable ascii characters are used
  72. j++
  73. continue
  74. }
  75. switch c {
  76. case '\\', '"':
  77. buf = append(buf, s[i:j]...)
  78. buf = append(buf, '\\', c)
  79. i = j + 1
  80. j = j + 1
  81. continue
  82. case '\n':
  83. buf = append(buf, s[i:j]...)
  84. buf = append(buf, '\\', 'n')
  85. i = j + 1
  86. j = j + 1
  87. continue
  88. case '\r':
  89. buf = append(buf, s[i:j]...)
  90. buf = append(buf, '\\', 'r')
  91. i = j + 1
  92. j = j + 1
  93. continue
  94. case '\t':
  95. buf = append(buf, s[i:j]...)
  96. buf = append(buf, '\\', 't')
  97. i = j + 1
  98. j = j + 1
  99. continue
  100. case '<', '>', '&':
  101. buf = append(buf, s[i:j]...)
  102. buf = append(buf, `\u00`...)
  103. buf = append(buf, hex[c>>4], hex[c&0xF])
  104. i = j + 1
  105. j = j + 1
  106. continue
  107. case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
  108. 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
  109. buf = append(buf, s[i:j]...)
  110. buf = append(buf, `\u00`...)
  111. buf = append(buf, hex[c>>4], hex[c&0xF])
  112. i = j + 1
  113. j = j + 1
  114. continue
  115. }
  116. state, size := decodeRuneInString(s[j:])
  117. switch state {
  118. case runeErrorState:
  119. buf = append(buf, s[i:j]...)
  120. buf = append(buf, `\ufffd`...)
  121. i = j + 1
  122. j = j + 1
  123. continue
  124. // U+2028 is LINE SEPARATOR.
  125. // U+2029 is PARAGRAPH SEPARATOR.
  126. // They are both technically valid characters in JSON strings,
  127. // but don't work in JSONP, which has to be evaluated as JavaScript,
  128. // and can lead to security holes there. It is valid JSON to
  129. // escape them, so we do so unconditionally.
  130. // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
  131. case lineSepState:
  132. buf = append(buf, s[i:j]...)
  133. buf = append(buf, `\u2028`...)
  134. i = j + 3
  135. j = j + 3
  136. continue
  137. case paragraphSepState:
  138. buf = append(buf, s[i:j]...)
  139. buf = append(buf, `\u2029`...)
  140. i = j + 3
  141. j = j + 3
  142. continue
  143. }
  144. j += size
  145. }
  146. return append(append(buf, s[i:]...), '"')
  147. }
  148. func appendHTMLString(buf []byte, s string) []byte {
  149. valLen := len(s)
  150. if valLen == 0 {
  151. return append(buf, `""`...)
  152. }
  153. buf = append(buf, '"')
  154. var (
  155. i, j int
  156. )
  157. if valLen >= 8 {
  158. chunks := stringToUint64Slice(s)
  159. for _, n := range chunks {
  160. // combine masks before checking for the MSB of each byte. We include
  161. // `n` in the mask to check whether any of the *input* byte MSBs were
  162. // set (i.e. the byte was outside the ASCII range).
  163. mask := n | (n - (lsb * 0x20)) |
  164. ((n ^ (lsb * '"')) - lsb) |
  165. ((n ^ (lsb * '\\')) - lsb) |
  166. ((n ^ (lsb * '<')) - lsb) |
  167. ((n ^ (lsb * '>')) - lsb) |
  168. ((n ^ (lsb * '&')) - lsb)
  169. if (mask & msb) != 0 {
  170. j = bits.TrailingZeros64(mask&msb) / 8
  171. goto ESCAPE_END
  172. }
  173. }
  174. for i := len(chunks) * 8; i < valLen; i++ {
  175. if needEscapeHTML[s[i]] {
  176. j = i
  177. goto ESCAPE_END
  178. }
  179. }
  180. // no found any escape characters.
  181. return append(append(buf, s...), '"')
  182. }
  183. ESCAPE_END:
  184. for j < valLen {
  185. c := s[j]
  186. if !needEscapeHTML[c] {
  187. // fast path: most of the time, printable ascii characters are used
  188. j++
  189. continue
  190. }
  191. switch c {
  192. case '\\', '"':
  193. buf = append(buf, s[i:j]...)
  194. buf = append(buf, '\\', c)
  195. i = j + 1
  196. j = j + 1
  197. continue
  198. case '\n':
  199. buf = append(buf, s[i:j]...)
  200. buf = append(buf, '\\', 'n')
  201. i = j + 1
  202. j = j + 1
  203. continue
  204. case '\r':
  205. buf = append(buf, s[i:j]...)
  206. buf = append(buf, '\\', 'r')
  207. i = j + 1
  208. j = j + 1
  209. continue
  210. case '\t':
  211. buf = append(buf, s[i:j]...)
  212. buf = append(buf, '\\', 't')
  213. i = j + 1
  214. j = j + 1
  215. continue
  216. case '<', '>', '&':
  217. buf = append(buf, s[i:j]...)
  218. buf = append(buf, `\u00`...)
  219. buf = append(buf, hex[c>>4], hex[c&0xF])
  220. i = j + 1
  221. j = j + 1
  222. continue
  223. case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
  224. 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
  225. buf = append(buf, s[i:j]...)
  226. buf = append(buf, `\u00`...)
  227. buf = append(buf, hex[c>>4], hex[c&0xF])
  228. i = j + 1
  229. j = j + 1
  230. continue
  231. }
  232. j++
  233. }
  234. return append(append(buf, s[i:]...), '"')
  235. }
  236. func appendNormalizedString(buf []byte, s string) []byte {
  237. valLen := len(s)
  238. if valLen == 0 {
  239. return append(buf, `""`...)
  240. }
  241. buf = append(buf, '"')
  242. var (
  243. i, j int
  244. )
  245. if valLen >= 8 {
  246. chunks := stringToUint64Slice(s)
  247. for _, n := range chunks {
  248. // combine masks before checking for the MSB of each byte. We include
  249. // `n` in the mask to check whether any of the *input* byte MSBs were
  250. // set (i.e. the byte was outside the ASCII range).
  251. mask := n | (n - (lsb * 0x20)) |
  252. ((n ^ (lsb * '"')) - lsb) |
  253. ((n ^ (lsb * '\\')) - lsb)
  254. if (mask & msb) != 0 {
  255. j = bits.TrailingZeros64(mask&msb) / 8
  256. goto ESCAPE_END
  257. }
  258. }
  259. valLen := len(s)
  260. for i := len(chunks) * 8; i < valLen; i++ {
  261. if needEscapeNormalizeUTF8[s[i]] {
  262. j = i
  263. goto ESCAPE_END
  264. }
  265. }
  266. return append(append(buf, s...), '"')
  267. }
  268. ESCAPE_END:
  269. for j < valLen {
  270. c := s[j]
  271. if !needEscapeNormalizeUTF8[c] {
  272. // fast path: most of the time, printable ascii characters are used
  273. j++
  274. continue
  275. }
  276. switch c {
  277. case '\\', '"':
  278. buf = append(buf, s[i:j]...)
  279. buf = append(buf, '\\', c)
  280. i = j + 1
  281. j = j + 1
  282. continue
  283. case '\n':
  284. buf = append(buf, s[i:j]...)
  285. buf = append(buf, '\\', 'n')
  286. i = j + 1
  287. j = j + 1
  288. continue
  289. case '\r':
  290. buf = append(buf, s[i:j]...)
  291. buf = append(buf, '\\', 'r')
  292. i = j + 1
  293. j = j + 1
  294. continue
  295. case '\t':
  296. buf = append(buf, s[i:j]...)
  297. buf = append(buf, '\\', 't')
  298. i = j + 1
  299. j = j + 1
  300. continue
  301. case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
  302. 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
  303. buf = append(buf, s[i:j]...)
  304. buf = append(buf, `\u00`...)
  305. buf = append(buf, hex[c>>4], hex[c&0xF])
  306. i = j + 1
  307. j = j + 1
  308. continue
  309. }
  310. state, size := decodeRuneInString(s[j:])
  311. switch state {
  312. case runeErrorState:
  313. buf = append(buf, s[i:j]...)
  314. buf = append(buf, `\ufffd`...)
  315. i = j + 1
  316. j = j + 1
  317. continue
  318. // U+2028 is LINE SEPARATOR.
  319. // U+2029 is PARAGRAPH SEPARATOR.
  320. // They are both technically valid characters in JSON strings,
  321. // but don't work in JSONP, which has to be evaluated as JavaScript,
  322. // and can lead to security holes there. It is valid JSON to
  323. // escape them, so we do so unconditionally.
  324. // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
  325. case lineSepState:
  326. buf = append(buf, s[i:j]...)
  327. buf = append(buf, `\u2028`...)
  328. i = j + 3
  329. j = j + 3
  330. continue
  331. case paragraphSepState:
  332. buf = append(buf, s[i:j]...)
  333. buf = append(buf, `\u2029`...)
  334. i = j + 3
  335. j = j + 3
  336. continue
  337. }
  338. j += size
  339. }
  340. return append(append(buf, s[i:]...), '"')
  341. }
  342. func appendString(buf []byte, s string) []byte {
  343. valLen := len(s)
  344. if valLen == 0 {
  345. return append(buf, `""`...)
  346. }
  347. buf = append(buf, '"')
  348. var (
  349. i, j int
  350. )
  351. if valLen >= 8 {
  352. chunks := stringToUint64Slice(s)
  353. for _, n := range chunks {
  354. // combine masks before checking for the MSB of each byte. We include
  355. // `n` in the mask to check whether any of the *input* byte MSBs were
  356. // set (i.e. the byte was outside the ASCII range).
  357. mask := n | (n - (lsb * 0x20)) |
  358. ((n ^ (lsb * '"')) - lsb) |
  359. ((n ^ (lsb * '\\')) - lsb)
  360. if (mask & msb) != 0 {
  361. j = bits.TrailingZeros64(mask&msb) / 8
  362. goto ESCAPE_END
  363. }
  364. }
  365. valLen := len(s)
  366. for i := len(chunks) * 8; i < valLen; i++ {
  367. if needEscape[s[i]] {
  368. j = i
  369. goto ESCAPE_END
  370. }
  371. }
  372. return append(append(buf, s...), '"')
  373. }
  374. ESCAPE_END:
  375. for j < valLen {
  376. c := s[j]
  377. if !needEscape[c] {
  378. // fast path: most of the time, printable ascii characters are used
  379. j++
  380. continue
  381. }
  382. switch c {
  383. case '\\', '"':
  384. buf = append(buf, s[i:j]...)
  385. buf = append(buf, '\\', c)
  386. i = j + 1
  387. j = j + 1
  388. continue
  389. case '\n':
  390. buf = append(buf, s[i:j]...)
  391. buf = append(buf, '\\', 'n')
  392. i = j + 1
  393. j = j + 1
  394. continue
  395. case '\r':
  396. buf = append(buf, s[i:j]...)
  397. buf = append(buf, '\\', 'r')
  398. i = j + 1
  399. j = j + 1
  400. continue
  401. case '\t':
  402. buf = append(buf, s[i:j]...)
  403. buf = append(buf, '\\', 't')
  404. i = j + 1
  405. j = j + 1
  406. continue
  407. case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
  408. 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
  409. buf = append(buf, s[i:j]...)
  410. buf = append(buf, `\u00`...)
  411. buf = append(buf, hex[c>>4], hex[c&0xF])
  412. i = j + 1
  413. j = j + 1
  414. continue
  415. }
  416. j++
  417. }
  418. return append(append(buf, s[i:]...), '"')
  419. }