123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240 |
- package toml
- import (
- "unicode/utf8"
- )
// utf8Err pinpoints the first character that failed validation.
//
// Index is the byte offset of that character in the validated input and Size
// is the number of bytes reported for it. A Size of 0 means that no invalid
// character was found.
type utf8Err struct {
	// Index is the byte offset of the offending character.
	Index int
	// Size is the number of bytes reported for the offending character.
	Size int
}

// Zero reports whether e describes no error at all, which is the case exactly
// when its Size is 0.
func (e utf8Err) Zero() bool {
	return e.Size == 0
}
- // Verified that a given string is only made of valid UTF-8 characters allowed
- // by the TOML spec:
- //
- // Any Unicode character may be used except those that must be escaped:
- // quotation mark, backslash, and the control characters other than tab (U+0000
- // to U+0008, U+000A to U+001F, U+007F).
- //
- // It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early
- // when a character is not allowed.
- //
- // The returned utf8Err is Zero() if the string is valid, or contains the byte
- // index and size of the invalid character.
- //
- // quotation mark => already checked
- // backslash => already checked
- // 0-0x8 => invalid
- // 0x9 => tab, ok
- // 0xA - 0x1F => invalid
- // 0x7F => invalid
- func utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) {
- // Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
- offset := 0
- for len(p) >= 8 {
- // Combining two 32 bit loads allows the same code to be used
- // for 32 and 64 bit platforms.
- // The compiler can generate a 32bit load for first32 and second32
- // on many platforms. See test/codegen/memcombine.go.
- first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
- second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
- if (first32|second32)&0x80808080 != 0 {
- // Found a non ASCII byte (>= RuneSelf).
- break
- }
- for i, b := range p[:8] {
- if invalidAscii(b) {
- err.Index = offset + i
- err.Size = 1
- return
- }
- }
- p = p[8:]
- offset += 8
- }
- n := len(p)
- for i := 0; i < n; {
- pi := p[i]
- if pi < utf8.RuneSelf {
- if invalidAscii(pi) {
- err.Index = offset + i
- err.Size = 1
- return
- }
- i++
- continue
- }
- x := first[pi]
- if x == xx {
- // Illegal starter byte.
- err.Index = offset + i
- err.Size = 1
- return
- }
- size := int(x & 7)
- if i+size > n {
- // Short or invalid.
- err.Index = offset + i
- err.Size = n - i
- return
- }
- accept := acceptRanges[x>>4]
- if c := p[i+1]; c < accept.lo || accept.hi < c {
- err.Index = offset + i
- err.Size = 2
- return
- } else if size == 2 {
- } else if c := p[i+2]; c < locb || hicb < c {
- err.Index = offset + i
- err.Size = 3
- return
- } else if size == 3 {
- } else if c := p[i+3]; c < locb || hicb < c {
- err.Index = offset + i
- err.Size = 4
- return
- }
- i += size
- }
- return
- }
- // Return the size of the next rune if valid, 0 otherwise.
- func utf8ValidNext(p []byte) int {
- c := p[0]
- if c < utf8.RuneSelf {
- if invalidAscii(c) {
- return 0
- }
- return 1
- }
- x := first[c]
- if x == xx {
- // Illegal starter byte.
- return 0
- }
- size := int(x & 7)
- if size > len(p) {
- // Short or invalid.
- return 0
- }
- accept := acceptRanges[x>>4]
- if c := p[1]; c < accept.lo || accept.hi < c {
- return 0
- } else if size == 2 {
- } else if c := p[2]; c < locb || hicb < c {
- return 0
- } else if size == 3 {
- } else if c := p[3]; c < locb || hicb < c {
- return 0
- }
- return size
- }
// invalidAsciiTable is indexed by byte value; an entry is true when that byte
// is rejected. Entries that are not listed keep the zero value false.
var invalidAsciiTable = [256]bool{
	// 0x00 - 0x08: control characters below TAB.
	0x00: true, 0x01: true, 0x02: true, 0x03: true, 0x04: true,
	0x05: true, 0x06: true, 0x07: true, 0x08: true,

	// 0x09 TAB and 0x0A LF are left unset.
	0x0B: true, 0x0C: true,

	// 0x0D CR is left unset.
	0x0E: true, 0x0F: true,

	// 0x10 - 0x1F: remaining control characters.
	0x10: true, 0x11: true, 0x12: true, 0x13: true,
	0x14: true, 0x15: true, 0x16: true, 0x17: true,
	0x18: true, 0x19: true, 0x1A: true, 0x1B: true,
	0x1C: true, 0x1D: true, 0x1E: true, 0x1F: true,

	// 0x20 - 0x7E: printable ASCII characters, left unset.
	0x7F: true,
}
- func invalidAscii(b byte) bool {
- return invalidAsciiTable[b]
- }
// acceptRange is the inclusive range of values permitted for the second byte
// of a UTF-8 sequence.
type acceptRange struct {
	// lo is the lowest value allowed for the second byte.
	lo uint8
	// hi is the highest value allowed for the second byte.
	hi uint8
}
- // acceptRanges has size 16 to avoid bounds checks in the code that uses it.
- var acceptRanges = [16]acceptRange{
- 0: {locb, hicb},
- 1: {0xA0, hicb},
- 2: {locb, 0x9F},
- 3: {0x90, hicb},
- 4: {locb, 0x8F},
- }
- // first is information about the first byte in a UTF-8 sequence.
- var first = [256]uint8{
- // 1 2 3 4 5 6 7 8 9 A B C D E F
- as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
- as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
- as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
- as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
- as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
- as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
- as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
- as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
- // 1 2 3 4 5 6 7 8 9 A B C D E F
- xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
- xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
- xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
- xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
- xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
- s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
- s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
- s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
- }
const (
	// Default lowest and highest continuation byte.
	locb = 0x80 // 0b10000000
	hicb = 0xBF // 0b10111111

	// The names of these constants are chosen to give nice alignment in the
	// first table. The first nibble is an index into acceptRanges or F for
	// special one-byte cases. The second nibble is the Rune length or the
	// Status for the special one-byte case.
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)
|