links.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// This tool extracts the links from types.go and .md files, visits each link,
// and checks the status code of the response.
// Usage:
// $ linkcheck --root-dir=${ROOT}
package main
import (
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/mvdan/xurls"
	flag "github.com/spf13/pflag"
)
var (
	rootDir    = flag.String("root-dir", "", "Root directory containing documents to be processed.")
	fileSuffix = flag.StringSlice("file-suffix", []string{"types.go", ".md"}, "suffix of files to be checked")

	// URLs matching the patterns in regWhiteList won't be checked. Patterns of
	// dummy URLs should be added to the list to avoid false alerts. Patterns of
	// URLs that we don't care about can also be added here to improve efficiency.
	regWhiteList = []*regexp.Regexp{
		regexp.MustCompile(`https://kubernetes-site\.appspot\.com`),
		// Skip URLs that don't start with an English letter, e.g., URLs with IP addresses.
		regexp.MustCompile(`https?://[^A-Za-z].*`),
		regexp.MustCompile(`https?://localhost.*`),
	}

	// URLs listed in fullURLWhiteList won't be checked. This list is kept
	// separate from regWhiteList to improve efficiency. It includes dummy URLs
	// that are hard to generalize with a regex, and URLs that would cause false alerts.
	fullURLWhiteList = map[string]struct{}{
		"http://github.com/some/repo.git": {},
		// This URL returns 404 when visited by this tool, but it works fine if visited by a browser.
		"http://stackoverflow.com/questions/ask?tags=kubernetes": {},
		"https://github.com/$YOUR_GITHUB_USERNAME/kubernetes.git": {},
		"https://github.com/$YOUR_GITHUB_USERNAME/kubernetes": {},
		"http://storage.googleapis.com/kubernetes-release/release/v${K8S_VERSION}/bin/darwin/amd64/kubectl": {},
		// This server seems to expect a certain User-Agent value: it works fine with Chrome,
		// but returns 404 for a plain cURL request.
		"http://supervisord.org/": {},
		"http://kubernetes.io/vX.Y/docs":  {},
		"http://kubernetes.io/vX.Y/docs/": {},
		"http://kubernetes.io/vX.Y/":      {},
	}

	visitedURLs    = map[string]struct{}{}
	htmlpreviewReg = regexp.MustCompile(`https://htmlpreview\.github\.io/\?`)
	httpOrhttpsReg = regexp.MustCompile(`https?.*`)
)
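
// newWalkFunc returns a filepath.WalkFunc that scans every file matching one
// of the configured suffixes, extracts its URLs, and checks each of them with
// the shared client. Any broken link is reported through the invalidLink flag,
// which is captured by the closure so a single flag covers the whole walk.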
func newWalkFunc(invalidLink *bool, client *http.Client) filepath.WalkFunc {
	return func(filePath string, info os.FileInfo, err error) error {
		// Propagate walk errors and skip directories; only regular files are scanned.
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		hasSuffix := false
		for _, suffix := range *fileSuffix {
			hasSuffix = hasSuffix || strings.HasSuffix(info.Name(), suffix)
		}
		if !hasSuffix {
			return nil
		}
		fileBytes, err := ioutil.ReadFile(filePath)
		if err != nil {
			return err
		}
		foundInvalid := false
		allURLs := xurls.Strict.FindAll(fileBytes, -1)
		fmt.Fprintf(os.Stdout, "\nChecking file %s\n", filePath)
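		// The URL label marks the outer loop so the regex whitelist check
		// below can skip straight to the next URL once a pattern matches.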
	URL:
		for _, URL := range allURLs {
			// Don't check non-http/https URLs.
			if !httpOrhttpsReg.Match(URL) {
				continue
			}
			for _, whiteURL := range regWhiteList {
				if whiteURL.Match(URL) {
					continue URL
				}
			}
			if _, found := fullURLWhiteList[string(URL)]; found {
				continue
			}
			// Remove the htmlpreview prefix.
			processedURL := htmlpreviewReg.ReplaceAll(URL, []byte{})
			// Skip URLs that have already been visited.
			if _, found := visitedURLs[string(processedURL)]; found {
				continue
			}
			visitedURLs[string(processedURL)] = struct{}{}
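			// Check the URL with retries: a 429 (Too Many Requests) response is
			// retried with exponential backoff, honoring Retry-After when present.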
			retry := 0
			const maxRetry int = 3
			backoff := 100
			for retry < maxRetry {
				fmt.Fprintf(os.Stdout, "Visiting %s\n", string(processedURL))
				// Use the HEAD verb to increase efficiency. However, some servers
				// do not handle HEAD well, so we need to try a GET to avoid
				// false alerts.
				resp, err := client.Head(string(processedURL))
				// URLs with a mock host or mock port will cause an error. If we
				// reported the error here, people would need to add the mock URL
				// to the whitelist every time they add one, which would be a
				// maintenance nightmare. Hence, we only report 404 to catch the
				// cases where the host and port are legit but the path is not,
				// which is the most common mistake in our docs.
				if err != nil {
					break
				}
				if resp.StatusCode == 429 {
					retryAfter := resp.Header.Get("Retry-After")
					if seconds, err := strconv.Atoi(retryAfter); err == nil {
						backoff = seconds + 10
					}
					fmt.Fprintf(os.Stderr, "Got %d visiting %s, retry after %d seconds.\n", resp.StatusCode, string(URL), backoff)
					time.Sleep(time.Duration(backoff) * time.Second)
					backoff *= 2
					retry++
				} else if resp.StatusCode == 404 {
					// We only check for 404 for now; 401 and 403 are hard to handle.
					// We need to try a GET to avoid a false alert.
					resp, err = client.Get(string(processedURL))
					if err != nil {
						break
					}
					if resp.StatusCode != 404 {
						continue URL
					}
					foundInvalid = true
					fmt.Fprintf(os.Stderr, "Failed: in file %s, got %d visiting %s\n", filePath, resp.StatusCode, string(URL))
					break
				} else {
					break
				}
			}
			if retry == maxRetry {
				foundInvalid = true
				fmt.Fprintf(os.Stderr, "Failed: in file %s, still got 429 visiting %s after %d retries\n", filePath, string(URL), maxRetry)
			}
		}
		if foundInvalid {
			*invalidLink = true
		}
		return nil
	}
}
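
// main validates the --root-dir flag, builds an HTTP client with a short
// timeout, and walks the tree. The process exits 1 if any broken link was
// found and 2 on a usage or walk error.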
func main() {
	flag.Parse()
	if *rootDir == "" {
		flag.Usage()
		os.Exit(2)
	}
	client := http.Client{
		Timeout: 5 * time.Second,
	}
	invalidLink := false
	if err := filepath.Walk(*rootDir, newWalkFunc(&invalidLink, &client)); err != nil {
		fmt.Fprintf(os.Stderr, "Fail: %v.\n", err)
		os.Exit(2)
	}
	if invalidLink {
		os.Exit(1)
	}
}