language.go 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:generate go run gen.go gen_common.go -output tables.go
  5. //go:generate go run gen_index.go
  6. package language
  7. // TODO: Remove above NOTE after:
  8. // - verifying that tables are dropped correctly (most notably matcher tables).
  9. import (
  10. "errors"
  11. "fmt"
  12. "strings"
  13. )
  14. const (
  15. // maxCoreSize is the maximum size of a BCP 47 tag without variants and
  16. // extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
  17. maxCoreSize = 12
  18. // max99thPercentileSize is a somewhat arbitrary buffer size that presumably
  19. // is large enough to hold at least 99% of the BCP 47 tags.
  20. max99thPercentileSize = 32
  21. // maxSimpleUExtensionSize is the maximum size of a -u extension with one
  22. // key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
  23. maxSimpleUExtensionSize = 14
  24. )
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
// specific language or locale. All language tag values are guaranteed to be
// well-formed.
//
// The base language, script and region are stored as numeric IDs; a zero ID
// is treated by the accessor methods as "not specified". Variants and
// extensions are only kept in textual form, inside str.
type Tag struct {
	// lang identifies the base language subtag.
	lang langID
	// region identifies the region subtag.
	region regionID
	// TODO: we will soon run out of positions for script. Idea: instead of
	// storing lang, region, and script codes, store only the compact index and
	// have a lookup table from this code to its expansion. This greatly speeds
	// up table lookup, speed up common variant cases.
	// This will also immediately free up 3 extra bytes. Also, the pVariant
	// field can now be moved to the lookup table, as the compact index uniquely
	// determines the offset of a possible variant.

	// script identifies the script subtag.
	script   scriptID
	pVariant byte   // offset in str, includes preceding '-'
	pExt     uint16 // offset of first extension, includes preceding '-'
	// str is the string representation of the Tag. It will only be used if the
	// tag has variants or extensions.
	str string
}
  45. // Make is a convenience wrapper for Parse that omits the error.
  46. // In case of an error, a sensible default is returned.
  47. func Make(s string) Tag {
  48. return Default.Make(s)
  49. }
  50. // Make is a convenience wrapper for c.Parse that omits the error.
  51. // In case of an error, a sensible default is returned.
  52. func (c CanonType) Make(s string) Tag {
  53. t, _ := c.Parse(s)
  54. return t
  55. }
  56. // Raw returns the raw base language, script and region, without making an
  57. // attempt to infer their values.
  58. func (t Tag) Raw() (b Base, s Script, r Region) {
  59. return Base{t.lang}, Script{t.script}, Region{t.region}
  60. }
  61. // equalTags compares language, script and region subtags only.
  62. func (t Tag) equalTags(a Tag) bool {
  63. return t.lang == a.lang && t.script == a.script && t.region == a.region
  64. }
  65. // IsRoot returns true if t is equal to language "und".
  66. func (t Tag) IsRoot() bool {
  67. if int(t.pVariant) < len(t.str) {
  68. return false
  69. }
  70. return t.equalTags(und)
  71. }
  72. // private reports whether the Tag consists solely of a private use tag.
  73. func (t Tag) private() bool {
  74. return t.str != "" && t.pVariant == 0
  75. }
  76. // CanonType can be used to enable or disable various types of canonicalization.
  77. type CanonType int
  78. const (
  79. // Replace deprecated base languages with their preferred replacements.
  80. DeprecatedBase CanonType = 1 << iota
  81. // Replace deprecated scripts with their preferred replacements.
  82. DeprecatedScript
  83. // Replace deprecated regions with their preferred replacements.
  84. DeprecatedRegion
  85. // Remove redundant scripts.
  86. SuppressScript
  87. // Normalize legacy encodings. This includes legacy languages defined in
  88. // CLDR as well as bibliographic codes defined in ISO-639.
  89. Legacy
  90. // Map the dominant language of a macro language group to the macro language
  91. // subtag. For example cmn -> zh.
  92. Macro
  93. // The CLDR flag should be used if full compatibility with CLDR is required.
  94. // There are a few cases where language.Tag may differ from CLDR. To follow all
  95. // of CLDR's suggestions, use All|CLDR.
  96. CLDR
  97. // Raw can be used to Compose or Parse without Canonicalization.
  98. Raw CanonType = 0
  99. // Replace all deprecated tags with their preferred replacements.
  100. Deprecated = DeprecatedBase | DeprecatedScript | DeprecatedRegion
  101. // All canonicalizations recommended by BCP 47.
  102. BCP47 = Deprecated | SuppressScript
  103. // All canonicalizations.
  104. All = BCP47 | Legacy | Macro
  105. // Default is the canonicalization used by Parse, Make and Compose. To
  106. // preserve as much information as possible, canonicalizations that remove
  107. // potentially valuable information are not included. The Matcher is
  108. // designed to recognize similar tags that would be the same if
  109. // they were canonicalized using All.
  110. Default = Deprecated | Legacy
  111. canonLang = DeprecatedBase | Legacy | Macro
  112. // TODO: LikelyScript, LikelyRegion: suppress similar to ICU.
  113. )
// canonicalize returns the canonicalized equivalent of the tag and
// whether there was any change.
//
// t is a value receiver, so all modifications are made to a copy that is
// returned to the caller. Only the lang, script and region fields are
// updated here; the string form is not touched.
func (t Tag) canonicalize(c CanonType) (Tag, bool) {
	if c == Raw {
		return t, false
	}
	changed := false
	if c&SuppressScript != 0 {
		// NOTE(review): this comparison also holds when both the script and
		// the table entry are zero, in which case changed is set although
		// nothing was modified.
		if t.lang < langNoIndexOffset && uint8(t.script) == suppressScript[t.lang] {
			t.script = 0
			changed = true
		}
	}
	if c&canonLang != 0 {
		// The loop body runs again only after a deprecated language has been
		// replaced (see the continue below); every other path ends at the
		// break at the bottom.
		for {
			if l, aliasType := normLang(t.lang); l != t.lang {
				switch aliasType {
				case langLegacy:
					if c&Legacy != 0 {
						// "sh" without an explicit script gets Latn before
						// the language is replaced.
						if t.lang == _sh && t.script == 0 {
							t.script = _Latn
						}
						t.lang = l
						changed = true
					}
				case langMacro:
					if c&Macro != 0 {
						// We deviate here from CLDR. The mapping "nb" -> "no"
						// qualifies as a typical Macro language mapping. However,
						// for legacy reasons, CLDR maps "no", the macro language
						// code for Norwegian, to the dominant variant "nb". This
						// change is currently under consideration for CLDR as well.
						// See http://unicode.org/cldr/trac/ticket/2698 and also
						// http://unicode.org/cldr/trac/ticket/1790 for some of the
						// practical implications. TODO: this check could be removed
						// if CLDR adopts this change.
						if c&CLDR == 0 || t.lang != _nb {
							changed = true
							t.lang = l
						}
					}
				case langDeprecated:
					if c&DeprecatedBase != 0 {
						// "mo" without an explicit region gets MD before the
						// language is replaced.
						if t.lang == _mo && t.region == 0 {
							t.region = _MD
						}
						t.lang = l
						changed = true
						// Other canonicalization types may still apply.
						continue
					}
				}
			} else if c&Legacy != 0 && t.lang == _no && c&CLDR != 0 {
				// With both Legacy and CLDR set, "no" is mapped to "nb".
				t.lang = _nb
				changed = true
			}
			break
		}
	}
	if c&DeprecatedScript != 0 {
		if t.script == _Qaai {
			changed = true
			t.script = _Zinh
		}
	}
	if c&DeprecatedRegion != 0 {
		// A zero result from normRegion is treated as "no replacement".
		if r := normRegion(t.region); r != 0 {
			changed = true
			t.region = r
		}
	}
	return t, changed
}
  187. // Canonicalize returns the canonicalized equivalent of the tag.
  188. func (c CanonType) Canonicalize(t Tag) (Tag, error) {
  189. t, changed := t.canonicalize(c)
  190. if changed {
  191. t.remakeString()
  192. }
  193. return t, nil
  194. }
  195. // Confidence indicates the level of certainty for a given return value.
  196. // For example, Serbian may be written in Cyrillic or Latin script.
  197. // The confidence level indicates whether a value was explicitly specified,
  198. // whether it is typically the only possible value, or whether there is
  199. // an ambiguity.
  200. type Confidence int
  201. const (
  202. No Confidence = iota // full confidence that there was no match
  203. Low // most likely value picked out of a set of alternatives
  204. High // value is generally assumed to be the correct match
  205. Exact // exact match or explicitly specified value
  206. )
  207. var confName = []string{"No", "Low", "High", "Exact"}
  208. func (c Confidence) String() string {
  209. return confName[c]
  210. }
  211. // remakeString is used to update t.str in case lang, script or region changed.
  212. // It is assumed that pExt and pVariant still point to the start of the
  213. // respective parts.
  214. func (t *Tag) remakeString() {
  215. if t.str == "" {
  216. return
  217. }
  218. extra := t.str[t.pVariant:]
  219. if t.pVariant > 0 {
  220. extra = extra[1:]
  221. }
  222. if t.equalTags(und) && strings.HasPrefix(extra, "x-") {
  223. t.str = extra
  224. t.pVariant = 0
  225. t.pExt = 0
  226. return
  227. }
  228. var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.
  229. b := buf[:t.genCoreBytes(buf[:])]
  230. if extra != "" {
  231. diff := len(b) - int(t.pVariant)
  232. b = append(b, '-')
  233. b = append(b, extra...)
  234. t.pVariant = uint8(int(t.pVariant) + diff)
  235. t.pExt = uint16(int(t.pExt) + diff)
  236. } else {
  237. t.pVariant = uint8(len(b))
  238. t.pExt = uint16(len(b))
  239. }
  240. t.str = string(b)
  241. }
  242. // genCoreBytes writes a string for the base languages, script and region tags
  243. // to the given buffer and returns the number of bytes written. It will never
  244. // write more than maxCoreSize bytes.
  245. func (t *Tag) genCoreBytes(buf []byte) int {
  246. n := t.lang.stringToBuf(buf[:])
  247. if t.script != 0 {
  248. n += copy(buf[n:], "-")
  249. n += copy(buf[n:], t.script.String())
  250. }
  251. if t.region != 0 {
  252. n += copy(buf[n:], "-")
  253. n += copy(buf[n:], t.region.String())
  254. }
  255. return n
  256. }
  257. // String returns the canonical string representation of the language tag.
  258. func (t Tag) String() string {
  259. if t.str != "" {
  260. return t.str
  261. }
  262. if t.script == 0 && t.region == 0 {
  263. return t.lang.String()
  264. }
  265. buf := [maxCoreSize]byte{}
  266. return string(buf[:t.genCoreBytes(buf[:])])
  267. }
  268. // MarshalText implements encoding.TextMarshaler.
  269. func (t Tag) MarshalText() (text []byte, err error) {
  270. if t.str != "" {
  271. text = append(text, t.str...)
  272. } else if t.script == 0 && t.region == 0 {
  273. text = append(text, t.lang.String()...)
  274. } else {
  275. buf := [maxCoreSize]byte{}
  276. text = buf[:t.genCoreBytes(buf[:])]
  277. }
  278. return text, nil
  279. }
  280. // UnmarshalText implements encoding.TextUnmarshaler.
  281. func (t *Tag) UnmarshalText(text []byte) error {
  282. tag, err := Raw.Parse(string(text))
  283. *t = tag
  284. return err
  285. }
  286. // Base returns the base language of the language tag. If the base language is
  287. // unspecified, an attempt will be made to infer it from the context.
  288. // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
  289. func (t Tag) Base() (Base, Confidence) {
  290. if t.lang != 0 {
  291. return Base{t.lang}, Exact
  292. }
  293. c := High
  294. if t.script == 0 && !(Region{t.region}).IsCountry() {
  295. c = Low
  296. }
  297. if tag, err := addTags(t); err == nil && tag.lang != 0 {
  298. return Base{tag.lang}, c
  299. }
  300. return Base{0}, No
  301. }
  302. // Script infers the script for the language tag. If it was not explicitly given, it will infer
  303. // a most likely candidate.
  304. // If more than one script is commonly used for a language, the most likely one
  305. // is returned with a low confidence indication. For example, it returns (Cyrl, Low)
  306. // for Serbian.
  307. // If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined)
  308. // as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks
  309. // common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts.
  310. // See http://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for
  311. // unknown value in CLDR. (Zzzz, Exact) is returned if Zzzz was explicitly specified.
  312. // Note that an inferred script is never guaranteed to be the correct one. Latin is
  313. // almost exclusively used for Afrikaans, but Arabic has been used for some texts
  314. // in the past. Also, the script that is commonly used may change over time.
  315. // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
  316. func (t Tag) Script() (Script, Confidence) {
  317. if t.script != 0 {
  318. return Script{t.script}, Exact
  319. }
  320. sc, c := scriptID(_Zzzz), No
  321. if t.lang < langNoIndexOffset {
  322. if scr := scriptID(suppressScript[t.lang]); scr != 0 {
  323. // Note: it is not always the case that a language with a suppress
  324. // script value is only written in one script (e.g. kk, ms, pa).
  325. if t.region == 0 {
  326. return Script{scriptID(scr)}, High
  327. }
  328. sc, c = scr, High
  329. }
  330. }
  331. if tag, err := addTags(t); err == nil {
  332. if tag.script != sc {
  333. sc, c = tag.script, Low
  334. }
  335. } else {
  336. t, _ = (Deprecated | Macro).Canonicalize(t)
  337. if tag, err := addTags(t); err == nil && tag.script != sc {
  338. sc, c = tag.script, Low
  339. }
  340. }
  341. return Script{sc}, c
  342. }
  343. // Region returns the region for the language tag. If it was not explicitly given, it will
  344. // infer a most likely candidate from the context.
  345. // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
  346. func (t Tag) Region() (Region, Confidence) {
  347. if t.region != 0 {
  348. return Region{t.region}, Exact
  349. }
  350. if t, err := addTags(t); err == nil {
  351. return Region{t.region}, Low // TODO: differentiate between high and low.
  352. }
  353. t, _ = (Deprecated | Macro).Canonicalize(t)
  354. if tag, err := addTags(t); err == nil {
  355. return Region{tag.region}, Low
  356. }
  357. return Region{_ZZ}, No // TODO: return world instead of undetermined?
  358. }
  359. // Variant returns the variants specified explicitly for this language tag.
  360. // or nil if no variant was specified.
  361. func (t Tag) Variants() []Variant {
  362. v := []Variant{}
  363. if int(t.pVariant) < int(t.pExt) {
  364. for x, str := "", t.str[t.pVariant:t.pExt]; str != ""; {
  365. x, str = nextToken(str)
  366. v = append(v, Variant{x})
  367. }
  368. }
  369. return v
  370. }
  371. // Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
  372. // specific language are substituted with fields from the parent language.
  373. // The parent for a language may change for newer versions of CLDR.
  374. func (t Tag) Parent() Tag {
  375. if t.str != "" {
  376. // Strip the variants and extensions.
  377. t, _ = Raw.Compose(t.Raw())
  378. if t.region == 0 && t.script != 0 && t.lang != 0 {
  379. base, _ := addTags(Tag{lang: t.lang})
  380. if base.script == t.script {
  381. return Tag{lang: t.lang}
  382. }
  383. }
  384. return t
  385. }
  386. if t.lang != 0 {
  387. if t.region != 0 {
  388. maxScript := t.script
  389. if maxScript == 0 {
  390. max, _ := addTags(t)
  391. maxScript = max.script
  392. }
  393. for i := range parents {
  394. if langID(parents[i].lang) == t.lang && scriptID(parents[i].maxScript) == maxScript {
  395. for _, r := range parents[i].fromRegion {
  396. if regionID(r) == t.region {
  397. return Tag{
  398. lang: t.lang,
  399. script: scriptID(parents[i].script),
  400. region: regionID(parents[i].toRegion),
  401. }
  402. }
  403. }
  404. }
  405. }
  406. // Strip the script if it is the default one.
  407. base, _ := addTags(Tag{lang: t.lang})
  408. if base.script != maxScript {
  409. return Tag{lang: t.lang, script: maxScript}
  410. }
  411. return Tag{lang: t.lang}
  412. } else if t.script != 0 {
  413. // The parent for an base-script pair with a non-default script is
  414. // "und" instead of the base language.
  415. base, _ := addTags(Tag{lang: t.lang})
  416. if base.script != t.script {
  417. return und
  418. }
  419. return Tag{lang: t.lang}
  420. }
  421. }
  422. return und
  423. }
  424. // returns token t and the rest of the string.
  425. func nextToken(s string) (t, tail string) {
  426. p := strings.Index(s[1:], "-")
  427. if p == -1 {
  428. return s[1:], ""
  429. }
  430. p++
  431. return s[1:p], s[p:]
  432. }
  433. // Extension is a single BCP 47 extension.
  434. type Extension struct {
  435. s string
  436. }
  437. // String returns the string representation of the extension, including the
  438. // type tag.
  439. func (e Extension) String() string {
  440. return e.s
  441. }
  442. // ParseExtension parses s as an extension and returns it on success.
  443. func ParseExtension(s string) (e Extension, err error) {
  444. scan := makeScannerString(s)
  445. var end int
  446. if n := len(scan.token); n != 1 {
  447. return Extension{}, errSyntax
  448. }
  449. scan.toLower(0, len(scan.b))
  450. end = parseExtension(&scan)
  451. if end != len(s) {
  452. return Extension{}, errSyntax
  453. }
  454. return Extension{string(scan.b)}, nil
  455. }
  456. // Type returns the one-byte extension type of e. It returns 0 for the zero
  457. // exception.
  458. func (e Extension) Type() byte {
  459. if e.s == "" {
  460. return 0
  461. }
  462. return e.s[0]
  463. }
  464. // Tokens returns the list of tokens of e.
  465. func (e Extension) Tokens() []string {
  466. return strings.Split(e.s, "-")
  467. }
  468. // Extension returns the extension of type x for tag t. It will return
  469. // false for ok if t does not have the requested extension. The returned
  470. // extension will be invalid in this case.
  471. func (t Tag) Extension(x byte) (ext Extension, ok bool) {
  472. for i := int(t.pExt); i < len(t.str)-1; {
  473. var ext string
  474. i, ext = getExtension(t.str, i)
  475. if ext[0] == x {
  476. return Extension{ext}, true
  477. }
  478. }
  479. return Extension{}, false
  480. }
  481. // Extensions returns all extensions of t.
  482. func (t Tag) Extensions() []Extension {
  483. e := []Extension{}
  484. for i := int(t.pExt); i < len(t.str)-1; {
  485. var ext string
  486. i, ext = getExtension(t.str, i)
  487. e = append(e, Extension{ext})
  488. }
  489. return e
  490. }
  491. // TypeForKey returns the type associated with the given key, where key and type
  492. // are of the allowed values defined for the Unicode locale extension ('u') in
  493. // http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  494. // TypeForKey will traverse the inheritance chain to get the correct value.
  495. func (t Tag) TypeForKey(key string) string {
  496. if start, end, _ := t.findTypeForKey(key); end != start {
  497. return t.str[start:end]
  498. }
  499. return ""
  500. }
  501. var (
  502. errPrivateUse = errors.New("cannot set a key on a private use tag")
  503. errInvalidArguments = errors.New("invalid key or type")
  504. )
// SetTypeForKey returns a new Tag with the key set to type, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// An empty value removes an existing pair with the same key.
//
// t is a value receiver, so the receiver itself is never modified. On error
// the unmodified t is returned together with errPrivateUse (t is a private
// use tag), errInvalidArguments (key is not 2 bytes long, or a non-empty
// value is not 3 to 8 bytes long), or the error reported by the scanner while
// verifying the new key-type pair.
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
	if t.private() {
		return t, errPrivateUse
	}
	if len(key) != 2 {
		return t, errInvalidArguments
	}
	// Remove the setting if value is "".
	if value == "" {
		start, end, _ := t.findTypeForKey(key)
		if start != end {
			// Remove key tag and leading '-'.
			start -= 4
			// Remove a possible empty extension.
			if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
				start -= 2
			}
			if start == int(t.pVariant) && end == len(t.str) {
				// Nothing but the core tag is left, so the string form is
				// dropped altogether.
				t.str = ""
				t.pVariant, t.pExt = 0, 0
			} else {
				t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
			}
		}
		return t, nil
	}
	if len(value) < 3 || len(value) > 8 {
		return t, errInvalidArguments
	}
	var (
		// buf has room for a core tag followed by one simple -u extension.
		buf    [maxCoreSize + maxSimpleUExtensionSize]byte
		uStart int // start of the -u extension.
	)
	// Generate the tag string if needed.
	if t.str == "" {
		uStart = t.genCoreBytes(buf[:])
		buf[uStart] = '-'
		uStart++
	}
	// Create new key-type pair and parse it to verify.
	// b has the form "u-<key>-<value>".
	b := buf[uStart:]
	copy(b, "u-")
	copy(b[2:], key)
	b[4] = '-'
	b = b[:5+copy(b[5:], value)]
	scan := makeScanner(b)
	if parseExtensions(&scan); scan.err != nil {
		return t, scan.err
	}
	// Assemble the replacement string.
	if t.str == "" {
		// The tag had no variants or extensions: both offsets point at the
		// '-' that precedes the new extension.
		t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
		t.str = string(buf[:uStart+len(b)])
	} else {
		s := t.str
		start, end, hasExt := t.findTypeForKey(key)
		if start == end {
			// The key is not present yet: insert the pair at the reported
			// position, without the "u-" prefix if a -u extension already
			// exists.
			if hasExt {
				b = b[2:]
			}
			t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
		} else {
			// The key is present: replace only its type.
			t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
		}
	}
	return t, nil
}
// findTypeForKey returns the start and end position for the type corresponding
// to key or the point at which to insert the key-value pair if the type
// wasn't found. The hasExt return value reports whether an -u extension was present.
// When the type was not found, start and end are equal.
// Note: the extensions are typically very small and are likely to contain
// only one key-type pair.
func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
	p := int(t.pExt)
	// Nothing to search for an invalid key or when pExt is zero or points at
	// the end of the string.
	if len(key) != 2 || p == len(t.str) || p == 0 {
		return p, p, false
	}
	s := t.str
	// Find the correct extension.
	for p++; s[p] != 'u'; p++ {
		// An extension letter beyond 'u' ends the search: the insertion point
		// is the hyphen in front of it.
		if s[p] > 'u' {
			p--
			return p, p, false
		}
		if p = nextExtension(s, p); p == len(s) {
			return len(s), len(s), false
		}
	}
	// Proceed to the hyphen following the extension name.
	p++
	// curKey is the key currently being processed.
	curKey := ""
	// Iterate over keys until we get the end of a section.
	for {
		// p points to the hyphen preceding the current token.
		if p3 := p + 3; s[p3] == '-' {
			// Found a key.
			// Check whether we just processed the key that was requested.
			if curKey == key {
				return start, p, true
			}
			// Set to the next key and continue scanning type tokens.
			curKey = s[p+1 : p3]
			// Stop at the first key that compares greater than the requested
			// one; the pair would be inserted in front of it.
			if curKey > key {
				return p, p, true
			}
			// Start of the type token sequence.
			start = p + 4
			// A type is at least 3 characters long.
			p += 7 // 4 + 3
		} else {
			// Attribute or type, which is at least 3 characters long.
			p += 4
		}
		// p points past the third character of a type or attribute.
		max := p + 5 // maximum length of token plus hyphen.
		if len(s) < max {
			max = len(s)
		}
		// Advance p to the next hyphen or to max, whichever comes first.
		for ; p < max && s[p] != '-'; p++ {
		}
		// Bail if we have exhausted all tokens or if the next token starts
		// a new extension.
		if p == len(s) || s[p+2] == '-' {
			if curKey == key {
				return start, p, true
			}
			return p, p, true
		}
	}
}
  640. // CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
  641. // for which data exists in the text repository. The index will change over time
  642. // and should not be stored in persistent storage. Extensions, except for the
  643. // 'va' type of the 'u' extension, are ignored. It will return 0, false if no
  644. // compact tag exists, where 0 is the index for the root language (Und).
  645. func CompactIndex(t Tag) (index int, ok bool) {
  646. // TODO: perhaps give more frequent tags a lower index.
  647. // TODO: we could make the indexes stable. This will excluded some
  648. // possibilities for optimization, so don't do this quite yet.
  649. b, s, r := t.Raw()
  650. if len(t.str) > 0 {
  651. if strings.HasPrefix(t.str, "x-") {
  652. // We have no entries for user-defined tags.
  653. return 0, false
  654. }
  655. if uint16(t.pVariant) != t.pExt {
  656. // There are no tags with variants and an u-va type.
  657. if t.TypeForKey("va") != "" {
  658. return 0, false
  659. }
  660. t, _ = Raw.Compose(b, s, r, t.Variants())
  661. } else if _, ok := t.Extension('u'); ok {
  662. // Strip all but the 'va' entry.
  663. variant := t.TypeForKey("va")
  664. t, _ = Raw.Compose(b, s, r)
  665. t, _ = t.SetTypeForKey("va", variant)
  666. }
  667. if len(t.str) > 0 {
  668. // We have some variants.
  669. for i, s := range specialTags {
  670. if s == t {
  671. return i + 1, true
  672. }
  673. }
  674. return 0, false
  675. }
  676. }
  677. // No variants specified: just compare core components.
  678. // The key has the form lllssrrr, where l, s, and r are nibbles for
  679. // respectively the langID, scriptID, and regionID.
  680. key := uint32(b.langID) << (8 + 12)
  681. key |= uint32(s.scriptID) << 12
  682. key |= uint32(r.regionID)
  683. x, ok := coreTags[key]
  684. return int(x), ok
  685. }
  686. // Base is an ISO 639 language code, used for encoding the base language
  687. // of a language tag.
  688. type Base struct {
  689. langID
  690. }
  691. // ParseBase parses a 2- or 3-letter ISO 639 code.
  692. // It returns a ValueError if s is a well-formed but unknown language identifier
  693. // or another error if another error occurred.
  694. func ParseBase(s string) (Base, error) {
  695. if n := len(s); n < 2 || 3 < n {
  696. return Base{}, errSyntax
  697. }
  698. var buf [3]byte
  699. l, err := getLangID(buf[:copy(buf[:], s)])
  700. return Base{l}, err
  701. }
  702. // Script is a 4-letter ISO 15924 code for representing scripts.
  703. // It is idiomatically represented in title case.
  704. type Script struct {
  705. scriptID
  706. }
  707. // ParseScript parses a 4-letter ISO 15924 code.
  708. // It returns a ValueError if s is a well-formed but unknown script identifier
  709. // or another error if another error occurred.
  710. func ParseScript(s string) (Script, error) {
  711. if len(s) != 4 {
  712. return Script{}, errSyntax
  713. }
  714. var buf [4]byte
  715. sc, err := getScriptID(script, buf[:copy(buf[:], s)])
  716. return Script{sc}, err
  717. }
  718. // Region is an ISO 3166-1 or UN M.49 code for representing countries and regions.
  719. type Region struct {
  720. regionID
  721. }
  722. // EncodeM49 returns the Region for the given UN M.49 code.
  723. // It returns an error if r is not a valid code.
  724. func EncodeM49(r int) (Region, error) {
  725. rid, err := getRegionM49(r)
  726. return Region{rid}, err
  727. }
  728. // ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
  729. // It returns a ValueError if s is a well-formed but unknown region identifier
  730. // or another error if another error occurred.
  731. func ParseRegion(s string) (Region, error) {
  732. if n := len(s); n < 2 || 3 < n {
  733. return Region{}, errSyntax
  734. }
  735. var buf [3]byte
  736. r, err := getRegionID(buf[:copy(buf[:], s)])
  737. return Region{r}, err
  738. }
  739. // IsCountry returns whether this region is a country or autonomous area. This
  740. // includes non-standard definitions from CLDR.
  741. func (r Region) IsCountry() bool {
  742. if r.regionID == 0 || r.IsGroup() || r.IsPrivateUse() && r.regionID != _XK {
  743. return false
  744. }
  745. return true
  746. }
  747. // IsGroup returns whether this region defines a collection of regions. This
  748. // includes non-standard definitions from CLDR.
  749. func (r Region) IsGroup() bool {
  750. if r.regionID == 0 {
  751. return false
  752. }
  753. return int(regionInclusion[r.regionID]) < len(regionContainment)
  754. }
  755. // Contains returns whether Region c is contained by Region r. It returns true
  756. // if c == r.
  757. func (r Region) Contains(c Region) bool {
  758. return r.regionID.contains(c.regionID)
  759. }
  760. func (r regionID) contains(c regionID) bool {
  761. if r == c {
  762. return true
  763. }
  764. g := regionInclusion[r]
  765. if g >= nRegionGroups {
  766. return false
  767. }
  768. m := regionContainment[g]
  769. d := regionInclusion[c]
  770. b := regionInclusionBits[d]
  771. // A contained country may belong to multiple disjoint groups. Matching any
  772. // of these indicates containment. If the contained region is a group, it
  773. // must strictly be a subset.
  774. if d >= nRegionGroups {
  775. return b&m != 0
  776. }
  777. return b&^m == 0
  778. }
  779. var errNoTLD = errors.New("language: region is not a valid ccTLD")
  780. // TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
  781. // In all other cases it returns either the region itself or an error.
  782. //
  783. // This method may return an error for a region for which there exists a
  784. // canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
  785. // region will already be canonicalized it was obtained from a Tag that was
  786. // obtained using any of the default methods.
  787. func (r Region) TLD() (Region, error) {
  788. // See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the
  789. // difference between ISO 3166-1 and IANA ccTLD.
  790. if r.regionID == _GB {
  791. r = Region{_UK}
  792. }
  793. if (r.typ() & ccTLD) == 0 {
  794. return Region{}, errNoTLD
  795. }
  796. return r, nil
  797. }
  798. // Canonicalize returns the region or a possible replacement if the region is
  799. // deprecated. It will not return a replacement for deprecated regions that
  800. // are split into multiple regions.
  801. func (r Region) Canonicalize() Region {
  802. if cr := normRegion(r.regionID); cr != 0 {
  803. return Region{cr}
  804. }
  805. return r
  806. }
  807. // Variant represents a registered variant of a language as defined by BCP 47.
  808. type Variant struct {
  809. variant string
  810. }
  811. // ParseVariant parses and returns a Variant. An error is returned if s is not
  812. // a valid variant.
  813. func ParseVariant(s string) (Variant, error) {
  814. s = strings.ToLower(s)
  815. if _, ok := variantIndex[s]; ok {
  816. return Variant{s}, nil
  817. }
  818. return Variant{}, mkErrInvalid([]byte(s))
  819. }
  820. // String returns the string representation of the variant.
  821. func (v Variant) String() string {
  822. return v.variant
  823. }