utf8.go 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. package toml
  2. import (
  3. "unicode/utf8"
  4. )
  5. type utf8Err struct {
  6. Index int
  7. Size int
  8. }
  9. func (u utf8Err) Zero() bool {
  10. return u.Size == 0
  11. }
  12. // Verified that a given string is only made of valid UTF-8 characters allowed
  13. // by the TOML spec:
  14. //
  15. // Any Unicode character may be used except those that must be escaped:
  16. // quotation mark, backslash, and the control characters other than tab (U+0000
  17. // to U+0008, U+000A to U+001F, U+007F).
  18. //
  19. // It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early
  20. // when a character is not allowed.
  21. //
  22. // The returned utf8Err is Zero() if the string is valid, or contains the byte
  23. // index and size of the invalid character.
  24. //
  25. // quotation mark => already checked
  26. // backslash => already checked
  27. // 0-0x8 => invalid
  28. // 0x9 => tab, ok
  29. // 0xA - 0x1F => invalid
  30. // 0x7F => invalid
  31. func utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) {
  32. // Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
  33. offset := 0
  34. for len(p) >= 8 {
  35. // Combining two 32 bit loads allows the same code to be used
  36. // for 32 and 64 bit platforms.
  37. // The compiler can generate a 32bit load for first32 and second32
  38. // on many platforms. See test/codegen/memcombine.go.
  39. first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
  40. second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
  41. if (first32|second32)&0x80808080 != 0 {
  42. // Found a non ASCII byte (>= RuneSelf).
  43. break
  44. }
  45. for i, b := range p[:8] {
  46. if invalidAscii(b) {
  47. err.Index = offset + i
  48. err.Size = 1
  49. return
  50. }
  51. }
  52. p = p[8:]
  53. offset += 8
  54. }
  55. n := len(p)
  56. for i := 0; i < n; {
  57. pi := p[i]
  58. if pi < utf8.RuneSelf {
  59. if invalidAscii(pi) {
  60. err.Index = offset + i
  61. err.Size = 1
  62. return
  63. }
  64. i++
  65. continue
  66. }
  67. x := first[pi]
  68. if x == xx {
  69. // Illegal starter byte.
  70. err.Index = offset + i
  71. err.Size = 1
  72. return
  73. }
  74. size := int(x & 7)
  75. if i+size > n {
  76. // Short or invalid.
  77. err.Index = offset + i
  78. err.Size = n - i
  79. return
  80. }
  81. accept := acceptRanges[x>>4]
  82. if c := p[i+1]; c < accept.lo || accept.hi < c {
  83. err.Index = offset + i
  84. err.Size = 2
  85. return
  86. } else if size == 2 {
  87. } else if c := p[i+2]; c < locb || hicb < c {
  88. err.Index = offset + i
  89. err.Size = 3
  90. return
  91. } else if size == 3 {
  92. } else if c := p[i+3]; c < locb || hicb < c {
  93. err.Index = offset + i
  94. err.Size = 4
  95. return
  96. }
  97. i += size
  98. }
  99. return
  100. }
  101. // Return the size of the next rune if valid, 0 otherwise.
  102. func utf8ValidNext(p []byte) int {
  103. c := p[0]
  104. if c < utf8.RuneSelf {
  105. if invalidAscii(c) {
  106. return 0
  107. }
  108. return 1
  109. }
  110. x := first[c]
  111. if x == xx {
  112. // Illegal starter byte.
  113. return 0
  114. }
  115. size := int(x & 7)
  116. if size > len(p) {
  117. // Short or invalid.
  118. return 0
  119. }
  120. accept := acceptRanges[x>>4]
  121. if c := p[1]; c < accept.lo || accept.hi < c {
  122. return 0
  123. } else if size == 2 {
  124. } else if c := p[2]; c < locb || hicb < c {
  125. return 0
  126. } else if size == 3 {
  127. } else if c := p[3]; c < locb || hicb < c {
  128. return 0
  129. }
  130. return size
  131. }
  132. var invalidAsciiTable = [256]bool{
  133. 0x00: true,
  134. 0x01: true,
  135. 0x02: true,
  136. 0x03: true,
  137. 0x04: true,
  138. 0x05: true,
  139. 0x06: true,
  140. 0x07: true,
  141. 0x08: true,
  142. // 0x09 TAB
  143. // 0x0A LF
  144. 0x0B: true,
  145. 0x0C: true,
  146. // 0x0D CR
  147. 0x0E: true,
  148. 0x0F: true,
  149. 0x10: true,
  150. 0x11: true,
  151. 0x12: true,
  152. 0x13: true,
  153. 0x14: true,
  154. 0x15: true,
  155. 0x16: true,
  156. 0x17: true,
  157. 0x18: true,
  158. 0x19: true,
  159. 0x1A: true,
  160. 0x1B: true,
  161. 0x1C: true,
  162. 0x1D: true,
  163. 0x1E: true,
  164. 0x1F: true,
  165. // 0x20 - 0x7E Printable ASCII characters
  166. 0x7F: true,
  167. }
  168. func invalidAscii(b byte) bool {
  169. return invalidAsciiTable[b]
  170. }
  171. // acceptRange gives the range of valid values for the second byte in a UTF-8
  172. // sequence.
  173. type acceptRange struct {
  174. lo uint8 // lowest value for second byte.
  175. hi uint8 // highest value for second byte.
  176. }
  177. // acceptRanges has size 16 to avoid bounds checks in the code that uses it.
  178. var acceptRanges = [16]acceptRange{
  179. 0: {locb, hicb},
  180. 1: {0xA0, hicb},
  181. 2: {locb, 0x9F},
  182. 3: {0x90, hicb},
  183. 4: {locb, 0x8F},
  184. }
// first is information about the first byte in a UTF-8 sequence. It is
// indexed by the value of that byte. The code that reads it uses the high
// nibble of an entry as an index into acceptRanges and the low three bits as
// the size of the sequence in bytes; see the as, xx and s1 to s7 constants
// for the individual encodings.
var first = [256]uint8{
	// Columns give the low nibble of the byte (0 to F, the 0 column being the
	// leftmost one); the trailing comment on each row gives the byte range.
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}
  206. const (
  207. // The default lowest and highest continuation byte.
  208. locb = 0b10000000
  209. hicb = 0b10111111
  210. // These names of these constants are chosen to give nice alignment in the
  211. // table below. The first nibble is an index into acceptRanges or F for
  212. // special one-byte cases. The second nibble is the Rune length or the
  213. // Status for the special one-byte case.
  214. xx = 0xF1 // invalid: size 1
  215. as = 0xF0 // ASCII: size 1
  216. s1 = 0x02 // accept 0, size 2
  217. s2 = 0x13 // accept 1, size 3
  218. s3 = 0x03 // accept 0, size 3
  219. s4 = 0x23 // accept 2, size 3
  220. s5 = 0x34 // accept 3, size 4
  221. s6 = 0x04 // accept 0, size 4
  222. s7 = 0x44 // accept 4, size 4
  223. )