parse.go 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package language
  5. import (
  6. "errors"
  7. "strconv"
  8. "strings"
  9. "golang.org/x/text/internal/language"
  10. )
  // ValueError is the error reported by the parsing functions for input that
  // is syntactically well-formed but contains a subtag whose value is not
  // recognized as valid.
  type ValueError interface {
  	// Subtag reports the subtag that triggered the error.
  	Subtag() string

  	error
  }
  19. // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
  20. // failed it returns an error and any part of the tag that could be parsed.
  21. // If parsing succeeded but an unknown value was found, it returns
  22. // ValueError. The Tag returned in this case is just stripped of the unknown
  23. // value. All other values are preserved. It accepts tags in the BCP 47 format
  24. // and extensions to this standard defined in
  25. // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  26. // The resulting tag is canonicalized using the default canonicalization type.
  27. func Parse(s string) (t Tag, err error) {
  28. return Default.Parse(s)
  29. }
  30. // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
  31. // failed it returns an error and any part of the tag that could be parsed.
  32. // If parsing succeeded but an unknown value was found, it returns
  33. // ValueError. The Tag returned in this case is just stripped of the unknown
  34. // value. All other values are preserved. It accepts tags in the BCP 47 format
  35. // and extensions to this standard defined in
  36. // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  37. // The resulting tag is canonicalized using the canonicalization type c.
  38. func (c CanonType) Parse(s string) (t Tag, err error) {
  39. defer func() {
  40. if recover() != nil {
  41. t = Tag{}
  42. err = language.ErrSyntax
  43. }
  44. }()
  45. tt, err := language.Parse(s)
  46. if err != nil {
  47. return makeTag(tt), err
  48. }
  49. tt, changed := canonicalize(c, tt)
  50. if changed {
  51. tt.RemakeString()
  52. }
  53. return makeTag(tt), err
  54. }
  55. // Compose creates a Tag from individual parts, which may be of type Tag, Base,
  56. // Script, Region, Variant, []Variant, Extension, []Extension or error. If a
  57. // Base, Script or Region or slice of type Variant or Extension is passed more
  58. // than once, the latter will overwrite the former. Variants and Extensions are
  59. // accumulated, but if two extensions of the same type are passed, the latter
  60. // will replace the former. For -u extensions, though, the key-type pairs are
  61. // added, where later values overwrite older ones. A Tag overwrites all former
  62. // values and typically only makes sense as the first argument. The resulting
  63. // tag is returned after canonicalizing using the Default CanonType. If one or
  64. // more errors are encountered, one of the errors is returned.
  65. func Compose(part ...interface{}) (t Tag, err error) {
  66. return Default.Compose(part...)
  67. }
  68. // Compose creates a Tag from individual parts, which may be of type Tag, Base,
  69. // Script, Region, Variant, []Variant, Extension, []Extension or error. If a
  70. // Base, Script or Region or slice of type Variant or Extension is passed more
  71. // than once, the latter will overwrite the former. Variants and Extensions are
  72. // accumulated, but if two extensions of the same type are passed, the latter
  73. // will replace the former. For -u extensions, though, the key-type pairs are
  74. // added, where later values overwrite older ones. A Tag overwrites all former
  75. // values and typically only makes sense as the first argument. The resulting
  76. // tag is returned after canonicalizing using CanonType c. If one or more errors
  77. // are encountered, one of the errors is returned.
  78. func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
  79. defer func() {
  80. if recover() != nil {
  81. t = Tag{}
  82. err = language.ErrSyntax
  83. }
  84. }()
  85. var b language.Builder
  86. if err = update(&b, part...); err != nil {
  87. return und, err
  88. }
  89. b.Tag, _ = canonicalize(c, b.Tag)
  90. return makeTag(b.Make()), err
  91. }
  92. var errInvalidArgument = errors.New("invalid Extension or Variant")
  93. func update(b *language.Builder, part ...interface{}) (err error) {
  94. for _, x := range part {
  95. switch v := x.(type) {
  96. case Tag:
  97. b.SetTag(v.tag())
  98. case Base:
  99. b.Tag.LangID = v.langID
  100. case Script:
  101. b.Tag.ScriptID = v.scriptID
  102. case Region:
  103. b.Tag.RegionID = v.regionID
  104. case Variant:
  105. if v.variant == "" {
  106. err = errInvalidArgument
  107. break
  108. }
  109. b.AddVariant(v.variant)
  110. case Extension:
  111. if v.s == "" {
  112. err = errInvalidArgument
  113. break
  114. }
  115. b.SetExt(v.s)
  116. case []Variant:
  117. b.ClearVariants()
  118. for _, v := range v {
  119. b.AddVariant(v.variant)
  120. }
  121. case []Extension:
  122. b.ClearExtensions()
  123. for _, e := range v {
  124. b.SetExt(e.s)
  125. }
  126. // TODO: support parsing of raw strings based on morphology or just extensions?
  127. case error:
  128. if v != nil {
  129. err = v
  130. }
  131. }
  132. }
  133. return
  134. }
  135. var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
  136. // ParseAcceptLanguage parses the contents of an Accept-Language header as
  137. // defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and
  138. // a list of corresponding quality weights. It is more permissive than RFC 2616
  139. // and may return non-nil slices even if the input is not valid.
  140. // The Tags will be sorted by highest weight first and then by first occurrence.
  141. // Tags with a weight of zero will be dropped. An error will be returned if the
  142. // input could not be parsed.
  143. func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
  144. defer func() {
  145. if recover() != nil {
  146. tag = nil
  147. q = nil
  148. err = language.ErrSyntax
  149. }
  150. }()
  151. var entry string
  152. for s != "" {
  153. if entry, s = split(s, ','); entry == "" {
  154. continue
  155. }
  156. entry, weight := split(entry, ';')
  157. // Scan the language.
  158. t, err := Parse(entry)
  159. if err != nil {
  160. id, ok := acceptFallback[entry]
  161. if !ok {
  162. return nil, nil, err
  163. }
  164. t = makeTag(language.Tag{LangID: id})
  165. }
  166. // Scan the optional weight.
  167. w := 1.0
  168. if weight != "" {
  169. weight = consume(weight, 'q')
  170. weight = consume(weight, '=')
  171. // consume returns the empty string when a token could not be
  172. // consumed, resulting in an error for ParseFloat.
  173. if w, err = strconv.ParseFloat(weight, 32); err != nil {
  174. return nil, nil, errInvalidWeight
  175. }
  176. // Drop tags with a quality weight of 0.
  177. if w <= 0 {
  178. continue
  179. }
  180. }
  181. tag = append(tag, t)
  182. q = append(q, float32(w))
  183. }
  184. sortStable(&tagSort{tag, q})
  185. return tag, q, nil
  186. }
  // consume strips the leading byte c from s and returns what follows with
  // surrounding white space trimmed. It returns the empty string when s is
  // empty or does not start with c.
  func consume(s string, c byte) string {
  	if len(s) > 0 && s[0] == c {
  		return strings.TrimSpace(s[1:])
  	}
  	return ""
  }
  // split cuts s at the first occurrence of the byte c. head is the text
  // before c and tail the text after it, both with surrounding white space
  // trimmed. If c does not occur in s, head is the trimmed s and tail is empty.
  func split(s string, c byte) (head, tail string) {
  	idx := strings.IndexByte(s, c)
  	if idx < 0 {
  		return strings.TrimSpace(s), ""
  	}
  	head = strings.TrimSpace(s[:idx])
  	tail = strings.TrimSpace(s[idx+1:])
  	return head, tail
  }
  201. // Add hack mapping to deal with a small number of cases that occur
  202. // in Accept-Language (with reasonable frequency).
  203. var acceptFallback = map[string]language.Language{
  204. "english": _en,
  205. "deutsch": _de,
  206. "italian": _it,
  207. "french": _fr,
  208. "*": _mul, // defined in the spec to match all languages.
  209. }
  210. type tagSort struct {
  211. tag []Tag
  212. q []float32
  213. }
  214. func (s *tagSort) Len() int {
  215. return len(s.q)
  216. }
  217. func (s *tagSort) Less(i, j int) bool {
  218. return s.q[i] > s.q[j]
  219. }
  220. func (s *tagSort) Swap(i, j int) {
  221. s.tag[i], s.tag[j] = s.tag[j], s.tag[i]
  222. s.q[i], s.q[j] = s.q[j], s.q[i]
  223. }