gen.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build ignore
  5. package main
  6. // This program generates table.go and table_test.go.
  7. // Invoke as:
  8. //
  9. // go run gen.go -version "xxx" >table.go
  10. // go run gen.go -version "xxx" -test >table_test.go
  11. //
  12. // The first of those two will take around 20 minutes to complete, as the final
  13. // table is optimized for size. When testing the code generation workflow, pass
  14. // -crush=false to skip this optimization step, although the results of such a
  15. // run should not be committed, as the generated table can be around 50% larger
  16. // and, more importantly, require a larger number of scarce node table bits.
  17. // You may need to increase nodesBitsTextOffset or other constants to generate
  18. // a table with -crush=false.
  19. //
  20. // Pass -v to print verbose progress information.
  21. //
  22. // The version is derived from information found at
  23. // https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat
  24. //
  25. // To fetch a particular git revision, such as 5c70ccd250, pass
  26. // -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat"
  27. import (
  28. "bufio"
  29. "bytes"
  30. "flag"
  31. "fmt"
  32. "go/format"
  33. "io"
  34. "net/http"
  35. "os"
  36. "regexp"
  37. "sort"
  38. "strings"
  39. "golang.org/x/net/idna"
  40. )
// Bit widths of the fields packed into each uint32 entry of the generated
// nodes and children tables. printReal copies these values into the generated
// file, and main1 refuses to run if either group needs more than 32 bits.
const (
	// The sum of these four values must be no greater than 32.
	nodesBitsChildren   = 9
	nodesBitsICANN      = 1
	nodesBitsTextOffset = 15
	nodesBitsTextLength = 6

	// The sum of these four values must be no greater than 32.
	childrenBitsWildcard = 1
	childrenBitsNodeType = 2
	childrenBitsHi       = 14
	childrenBitsLo       = 14
)
// Running maxima recorded while the tables are generated. printReal emits
// them as trailing comments next to the capacity that each bit width allows.
var (
	maxChildren   int    // largest children table length seen by assignIndexes
	maxTextOffset int    // largest label offset into the combined text (set in printReal)
	maxTextLength int    // longest label length (set in printReal)
	maxHi         uint32 // largest exclusive upper child index seen by assignIndexes
	maxLo         uint32 // largest inclusive lower child index seen by assignIndexes
)
  60. func max(a, b int) int {
  61. if a < b {
  62. return b
  63. }
  64. return a
  65. }
  66. func u32max(a, b uint32) uint32 {
  67. if a < b {
  68. return b
  69. }
  70. return a
  71. }
  72. const (
  73. nodeTypeNormal = 0
  74. nodeTypeException = 1
  75. nodeTypeParentOnly = 2
  76. numNodeType = 3
  77. )
  78. func nodeTypeStr(n int) string {
  79. switch n {
  80. case nodeTypeNormal:
  81. return "+"
  82. case nodeTypeException:
  83. return "!"
  84. case nodeTypeParentOnly:
  85. return "o"
  86. }
  87. panic("unreachable")
  88. }
// Package-level state shared by the generator, followed by its command-line
// flags.
var (
	// labelEncoding maps each label to its packed text offset and text
	// length; it is filled in by printReal.
	labelEncoding = map[string]uint32{}
	// labelsList is the sorted list of the distinct labels in labelsMap.
	labelsList = []string{}
	// labelsMap is the set of distinct labels seen while reading the rules.
	labelsMap = map[string]bool{}
	// rules holds every accepted rule, in input order, after Punycode
	// encoding.
	rules = []string{}

	// validSuffix is used to check that the entries in the public suffix list
	// are in canonical form (after Punycode encoding). Specifically, capital
	// letters are not allowed.
	validSuffix = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)

	crush  = flag.Bool("crush", true, "make the generated node text as small as possible")
	subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
	url    = flag.String("url",
		"https://publicsuffix.org/list/effective_tld_names.dat",
		"URL of the publicsuffix.org list. If empty, stdin is read instead")
	v       = flag.Bool("v", false, "verbose output (to stderr)")
	version = flag.String("version", "", "the effective_tld_names.dat version")
	test    = flag.Bool("test", false, "generate table_test.go")
)
  107. func main() {
  108. if err := main1(); err != nil {
  109. fmt.Fprintln(os.Stderr, err)
  110. os.Exit(1)
  111. }
  112. }
  113. func main1() error {
  114. flag.Parse()
  115. if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 {
  116. return fmt.Errorf("not enough bits to encode the nodes table")
  117. }
  118. if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 {
  119. return fmt.Errorf("not enough bits to encode the children table")
  120. }
  121. if *version == "" {
  122. return fmt.Errorf("-version was not specified")
  123. }
  124. var r io.Reader = os.Stdin
  125. if *url != "" {
  126. res, err := http.Get(*url)
  127. if err != nil {
  128. return err
  129. }
  130. if res.StatusCode != http.StatusOK {
  131. return fmt.Errorf("bad GET status for %s: %d", *url, res.Status)
  132. }
  133. r = res.Body
  134. defer res.Body.Close()
  135. }
  136. var root node
  137. icann := false
  138. buf := new(bytes.Buffer)
  139. br := bufio.NewReader(r)
  140. for {
  141. s, err := br.ReadString('\n')
  142. if err != nil {
  143. if err == io.EOF {
  144. break
  145. }
  146. return err
  147. }
  148. s = strings.TrimSpace(s)
  149. if strings.Contains(s, "BEGIN ICANN DOMAINS") {
  150. icann = true
  151. continue
  152. }
  153. if strings.Contains(s, "END ICANN DOMAINS") {
  154. icann = false
  155. continue
  156. }
  157. if s == "" || strings.HasPrefix(s, "//") {
  158. continue
  159. }
  160. s, err = idna.ToASCII(s)
  161. if err != nil {
  162. return err
  163. }
  164. if !validSuffix.MatchString(s) {
  165. return fmt.Errorf("bad publicsuffix.org list data: %q", s)
  166. }
  167. if *subset {
  168. switch {
  169. case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"):
  170. case s == "ak.us" || strings.HasSuffix(s, ".ak.us"):
  171. case s == "ao" || strings.HasSuffix(s, ".ao"):
  172. case s == "ar" || strings.HasSuffix(s, ".ar"):
  173. case s == "arpa" || strings.HasSuffix(s, ".arpa"):
  174. case s == "cy" || strings.HasSuffix(s, ".cy"):
  175. case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"):
  176. case s == "jp":
  177. case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
  178. case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
  179. case s == "om" || strings.HasSuffix(s, ".om"):
  180. case s == "uk" || strings.HasSuffix(s, ".uk"):
  181. case s == "uk.com" || strings.HasSuffix(s, ".uk.com"):
  182. case s == "tw" || strings.HasSuffix(s, ".tw"):
  183. case s == "zw" || strings.HasSuffix(s, ".zw"):
  184. case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
  185. // xn--p1ai is Russian-Cyrillic "рф".
  186. default:
  187. continue
  188. }
  189. }
  190. rules = append(rules, s)
  191. nt, wildcard := nodeTypeNormal, false
  192. switch {
  193. case strings.HasPrefix(s, "*."):
  194. s, nt = s[2:], nodeTypeParentOnly
  195. wildcard = true
  196. case strings.HasPrefix(s, "!"):
  197. s, nt = s[1:], nodeTypeException
  198. }
  199. labels := strings.Split(s, ".")
  200. for n, i := &root, len(labels)-1; i >= 0; i-- {
  201. label := labels[i]
  202. n = n.child(label)
  203. if i == 0 {
  204. if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
  205. n.nodeType = nt
  206. }
  207. n.icann = n.icann && icann
  208. n.wildcard = n.wildcard || wildcard
  209. }
  210. labelsMap[label] = true
  211. }
  212. }
  213. labelsList = make([]string, 0, len(labelsMap))
  214. for label := range labelsMap {
  215. labelsList = append(labelsList, label)
  216. }
  217. sort.Strings(labelsList)
  218. p := printReal
  219. if *test {
  220. p = printTest
  221. }
  222. if err := p(buf, &root); err != nil {
  223. return err
  224. }
  225. b, err := format.Source(buf.Bytes())
  226. if err != nil {
  227. return err
  228. }
  229. _, err = os.Stdout.Write(b)
  230. return err
  231. }
  232. func printTest(w io.Writer, n *node) error {
  233. fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
  234. fmt.Fprintf(w, "package publicsuffix\n\nvar rules = [...]string{\n")
  235. for _, rule := range rules {
  236. fmt.Fprintf(w, "%q,\n", rule)
  237. }
  238. fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
  239. if err := n.walk(w, printNodeLabel); err != nil {
  240. return err
  241. }
  242. fmt.Fprintf(w, "}\n")
  243. return nil
  244. }
  245. func printReal(w io.Writer, n *node) error {
  246. const header = `// generated by go run gen.go; DO NOT EDIT
  247. package publicsuffix
  248. const version = %q
  249. const (
  250. nodesBitsChildren = %d
  251. nodesBitsICANN = %d
  252. nodesBitsTextOffset = %d
  253. nodesBitsTextLength = %d
  254. childrenBitsWildcard = %d
  255. childrenBitsNodeType = %d
  256. childrenBitsHi = %d
  257. childrenBitsLo = %d
  258. )
  259. const (
  260. nodeTypeNormal = %d
  261. nodeTypeException = %d
  262. nodeTypeParentOnly = %d
  263. )
  264. // numTLD is the number of top level domains.
  265. const numTLD = %d
  266. `
  267. fmt.Fprintf(w, header, *version,
  268. nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
  269. childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
  270. nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
  271. text := makeText()
  272. if text == "" {
  273. return fmt.Errorf("internal error: makeText returned no text")
  274. }
  275. for _, label := range labelsList {
  276. offset, length := strings.Index(text, label), len(label)
  277. if offset < 0 {
  278. return fmt.Errorf("internal error: could not find %q in text %q", label, text)
  279. }
  280. maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
  281. if offset >= 1<<nodesBitsTextOffset {
  282. return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
  283. }
  284. if length >= 1<<nodesBitsTextLength {
  285. return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
  286. }
  287. labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length)
  288. }
  289. fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
  290. for len(text) > 0 {
  291. n, plus := len(text), ""
  292. if n > 64 {
  293. n, plus = 64, " +"
  294. }
  295. fmt.Fprintf(w, "%q%s\n", text[:n], plus)
  296. text = text[n:]
  297. }
  298. n.walk(w, assignIndexes)
  299. fmt.Fprintf(w, `
  300. // nodes is the list of nodes. Each node is represented as a uint32, which
  301. // encodes the node's children, wildcard bit and node type (as an index into
  302. // the children array), ICANN bit and text.
  303. //
  304. // In the //-comment after each node's data, the nodes indexes of the children
  305. // are formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
  306. // nodeType is printed as + for normal, ! for exception, and o for parent-only
  307. // nodes that have children but don't match a domain label in their own right.
  308. // An I denotes an ICANN domain.
  309. //
  310. // The layout within the uint32, from MSB to LSB, is:
  311. // [%2d bits] unused
  312. // [%2d bits] children index
  313. // [%2d bits] ICANN bit
  314. // [%2d bits] text index
  315. // [%2d bits] text length
  316. var nodes = [...]uint32{
  317. `,
  318. 32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
  319. nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
  320. if err := n.walk(w, printNode); err != nil {
  321. return err
  322. }
  323. fmt.Fprintf(w, `}
  324. // children is the list of nodes' children, the parent's wildcard bit and the
  325. // parent's node type. If a node has no children then their children index
  326. // will be in the range [0, 6), depending on the wildcard bit and node type.
  327. //
  328. // The layout within the uint32, from MSB to LSB, is:
  329. // [%2d bits] unused
  330. // [%2d bits] wildcard bit
  331. // [%2d bits] node type
  332. // [%2d bits] high nodes index (exclusive) of children
  333. // [%2d bits] low nodes index (inclusive) of children
  334. var children=[...]uint32{
  335. `,
  336. 32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
  337. childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
  338. for i, c := range childrenEncoding {
  339. s := "---------------"
  340. lo := c & (1<<childrenBitsLo - 1)
  341. hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1)
  342. if lo != hi {
  343. s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi)
  344. }
  345. nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1)
  346. wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0
  347. fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n",
  348. c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType))
  349. }
  350. fmt.Fprintf(w, "}\n\n")
  351. fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
  352. fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
  353. fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
  354. fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1)
  355. fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1)
  356. return nil
  357. }
  358. type node struct {
  359. label string
  360. nodeType int
  361. icann bool
  362. wildcard bool
  363. // nodesIndex and childrenIndex are the index of this node in the nodes
  364. // and the index of its children offset/length in the children arrays.
  365. nodesIndex, childrenIndex int
  366. // firstChild is the index of this node's first child, or zero if this
  367. // node has no children.
  368. firstChild int
  369. // children are the node's children, in strictly increasing node label order.
  370. children []*node
  371. }
  372. func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
  373. if err := f(w, n); err != nil {
  374. return err
  375. }
  376. for _, c := range n.children {
  377. if err := c.walk(w, f); err != nil {
  378. return err
  379. }
  380. }
  381. return nil
  382. }
  383. // child returns the child of n with the given label. The child is created if
  384. // it did not exist beforehand.
  385. func (n *node) child(label string) *node {
  386. for _, c := range n.children {
  387. if c.label == label {
  388. return c
  389. }
  390. }
  391. c := &node{
  392. label: label,
  393. nodeType: nodeTypeParentOnly,
  394. icann: true,
  395. }
  396. n.children = append(n.children, c)
  397. sort.Sort(byLabel(n.children))
  398. return c
  399. }
  400. type byLabel []*node
  401. func (b byLabel) Len() int { return len(b) }
  402. func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
  403. func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
// nextNodesIndex is the next unassigned index in the generated nodes array.
// assignIndexes advances it by one for every child that it numbers.
var nextNodesIndex int

// childrenEncoding are the encoded entries in the generated children array.
// All these pre-defined entries have no children. Their positions match the
// childrenIndex that assignIndexes gives to a childless node: the node type,
// plus numNodeType when the wildcard bit is set.
var childrenEncoding = []uint32{
	0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal.
	1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException.
	2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly.
	4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal.
	5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException.
	6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly.
}

// firstCallToAssignIndexes is true until assignIndexes has handled its first
// node that has children. That node gets no entry in childrenEncoding.
var firstCallToAssignIndexes = true
  416. func assignIndexes(w io.Writer, n *node) error {
  417. if len(n.children) != 0 {
  418. // Assign nodesIndex.
  419. n.firstChild = nextNodesIndex
  420. for _, c := range n.children {
  421. c.nodesIndex = nextNodesIndex
  422. nextNodesIndex++
  423. }
  424. // The root node's children is implicit.
  425. if firstCallToAssignIndexes {
  426. firstCallToAssignIndexes = false
  427. return nil
  428. }
  429. // Assign childrenIndex.
  430. maxChildren = max(maxChildren, len(childrenEncoding))
  431. if len(childrenEncoding) >= 1<<nodesBitsChildren {
  432. return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding))
  433. }
  434. n.childrenIndex = len(childrenEncoding)
  435. lo := uint32(n.firstChild)
  436. hi := lo + uint32(len(n.children))
  437. maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi)
  438. if lo >= 1<<childrenBitsLo {
  439. return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo)
  440. }
  441. if hi >= 1<<childrenBitsHi {
  442. return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi)
  443. }
  444. enc := hi<<childrenBitsLo | lo
  445. enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi)
  446. if n.wildcard {
  447. enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType)
  448. }
  449. childrenEncoding = append(childrenEncoding, enc)
  450. } else {
  451. n.childrenIndex = n.nodeType
  452. if n.wildcard {
  453. n.childrenIndex += numNodeType
  454. }
  455. }
  456. return nil
  457. }
  458. func printNode(w io.Writer, n *node) error {
  459. for _, c := range n.children {
  460. s := "---------------"
  461. if len(c.children) != 0 {
  462. s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children))
  463. }
  464. encoding := labelEncoding[c.label]
  465. if c.icann {
  466. encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
  467. }
  468. encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
  469. fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n",
  470. encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard),
  471. nodeTypeStr(c.nodeType), icannStr(c.icann), c.label,
  472. )
  473. }
  474. return nil
  475. }
  476. func printNodeLabel(w io.Writer, n *node) error {
  477. for _, c := range n.children {
  478. fmt.Fprintf(w, "%q,\n", c.label)
  479. }
  480. return nil
  481. }
  482. func icannStr(icann bool) string {
  483. if icann {
  484. return "I"
  485. }
  486. return " "
  487. }
  488. func wildcardStr(wildcard bool) string {
  489. if wildcard {
  490. return "*"
  491. }
  492. return " "
  493. }
// makeText combines all the strings in labelsList to form one giant string.
// If the crush flag is true, then overlapping strings will be merged: "arpa"
// and "parliament" could yield "arparliament".
//
// With crush disabled the result is simply the labels concatenated in
// labelsList order. With the v flag set, progress is reported on stderr.
// labelsList itself is not modified.
func makeText() string {
	if !*crush {
		return strings.Join(labelsList, "")
	}

	// beforeLength is the uncrushed size; it is only used for the verbose
	// report at the end.
	beforeLength := 0
	for _, s := range labelsList {
		beforeLength += len(s)
	}

	// Make a copy of labelsList.
	ss := append(make([]string, 0, len(labelsList)), labelsList...)

	// Remove strings that are substrings of other strings.
	// Removed entries are blanked rather than deleted, and the scan repeats
	// until a full pass blanks nothing.
	for changed := true; changed; {
		changed = false
		for i, s := range ss {
			if s == "" {
				continue
			}
			for j, t := range ss {
				if i != j && t != "" && strings.Contains(s, t) {
					changed = true
					ss[j] = ""
				}
			}
		}
	}

	// Remove the empty strings.
	// After sorting, the empty strings are all at the front of the slice.
	sort.Strings(ss)
	for len(ss) > 0 && ss[0] == "" {
		ss = ss[1:]
	}

	// Join strings where one suffix matches another prefix.
	// Each pass merges the single pair with the longest overlap; the loop
	// ends once no pair overlaps at all.
	for {
		// Find best i, j, k such that ss[i][len-k:] == ss[j][:k],
		// maximizing overlap length k.
		besti := -1
		bestj := -1
		bestk := 0
		for i, s := range ss {
			if s == "" {
				continue
			}
			for j, t := range ss {
				if i == j {
					continue
				}
				// Only overlaps strictly longer than the best so far are
				// tried, so on a tie the earliest pair found wins.
				for k := bestk + 1; k <= len(s) && k <= len(t); k++ {
					if s[len(s)-k:] == t[:k] {
						besti = i
						bestj = j
						bestk = k
					}
				}
			}
		}
		if bestk > 0 {
			if *v {
				fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d) out of (%4d,%4d): %q and %q\n",
					bestk, besti, bestj, len(ss), len(ss), ss[besti], ss[bestj])
			}
			// Append the non-overlapping tail of ss[bestj] to ss[besti] and
			// blank the entry that was absorbed.
			ss[besti] += ss[bestj][bestk:]
			ss[bestj] = ""
			continue
		}
		break
	}

	// Blanked entries contribute nothing to the joined result.
	text := strings.Join(ss, "")
	if *v {
		fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
	}
	return text
}
  567. }