123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375 |
- /*
- Package purell offers URL normalization as described on the wikipedia page:
- http://en.wikipedia.org/wiki/URL_normalization
- */
- package purell
- import (
- "bytes"
- "fmt"
- "net/url"
- "regexp"
- "sort"
- "strconv"
- "strings"
- "github.com/PuerkitoBio/urlesc"
- "golang.org/x/net/idna"
- "golang.org/x/text/secure/precis"
- "golang.org/x/text/unicode/norm"
- )
// NormalizationFlags is a bit set in which each bit selects one normalization
// to apply to a URL. Individual flags are combined with the bitwise OR
// operator to determine how a URL will be normalized.
type NormalizationFlags uint
// Individual normalization flags, one bit each (assigned through iota, so the
// declaration order fixes each flag's value), followed by convenience
// combinations of those flags.
const (
	// Safe normalizations
	FlagLowercaseScheme           NormalizationFlags = 1 << iota // HTTP://host -> http://host, applied by default in Go1.1
	FlagLowercaseHost                                            // http://HOST -> http://host
	FlagUppercaseEscapes                                         // http://host/t%ef -> http://host/t%EF
	FlagDecodeUnnecessaryEscapes                                 // http://host/t%41 -> http://host/tA
	FlagEncodeNecessaryEscapes                                   // http://host/!"#$ -> http://host/%21%22#$
	FlagRemoveDefaultPort                                        // http://host:80 -> http://host
	FlagRemoveEmptyQuerySeparator                                // http://host/path? -> http://host/path

	// Usually safe normalizations
	FlagRemoveTrailingSlash // http://host/path/ -> http://host/path
	FlagAddTrailingSlash    // http://host/path -> http://host/path/ (should choose only one of these add/remove trailing slash flags)
	FlagRemoveDotSegments   // http://host/path/./a/b/../c -> http://host/path/a/c

	// Unsafe normalizations
	FlagRemoveDirectoryIndex   // http://host/path/index.html -> http://host/path/
	FlagRemoveFragment         // http://host/path#fragment -> http://host/path
	FlagForceHTTP              // https://host -> http://host
	FlagRemoveDuplicateSlashes // http://host/path//a///b -> http://host/path/a/b
	FlagRemoveWWW              // http://www.host/ -> http://host/
	FlagAddWWW                 // http://host/ -> http://www.host/ (should choose only one of these add/remove WWW flags)
	FlagSortQuery              // http://host/path?c=3&b=2&a=1&b=1 -> http://host/path?a=1&b=1&b=2&c=3

	// Normalizations not in the Wikipedia article, required to cover test cases
	// submitted by jehiah
	FlagDecodeDWORDHost           // http://1113982867 -> http://66.102.7.147
	FlagDecodeOctalHost           // http://0102.0146.07.0223 -> http://66.102.7.147
	FlagDecodeHexHost             // http://0x42660793 -> http://66.102.7.147
	FlagRemoveUnnecessaryHostDots // http://.host../path -> http://host/path
	FlagRemoveEmptyPortSeparator  // http://host:/path -> http://host/path

	// Convenience set of safe normalizations
	FlagsSafe NormalizationFlags = FlagLowercaseHost | FlagLowercaseScheme | FlagUppercaseEscapes | FlagDecodeUnnecessaryEscapes | FlagEncodeNecessaryEscapes | FlagRemoveDefaultPort | FlagRemoveEmptyQuerySeparator

	// For convenience sets, "greedy" uses the "remove trailing slash" and "remove www. prefix" flags,
	// while "non-greedy" uses the "add (or keep) the trailing slash" and "add www. prefix".

	// Convenience set of usually safe normalizations (includes FlagsSafe)
	FlagsUsuallySafeGreedy    NormalizationFlags = FlagsSafe | FlagRemoveTrailingSlash | FlagRemoveDotSegments
	FlagsUsuallySafeNonGreedy NormalizationFlags = FlagsSafe | FlagAddTrailingSlash | FlagRemoveDotSegments

	// Convenience set of unsafe normalizations (includes FlagsUsuallySafe)
	FlagsUnsafeGreedy    NormalizationFlags = FlagsUsuallySafeGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagRemoveWWW | FlagSortQuery
	FlagsUnsafeNonGreedy NormalizationFlags = FlagsUsuallySafeNonGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagAddWWW | FlagSortQuery

	// Convenience set of all available flags (each extends the matching
	// "unsafe" set with the host-decoding and host-cleanup flags)
	FlagsAllGreedy    = FlagsUnsafeGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
	FlagsAllNonGreedy = FlagsUnsafeNonGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
)
// Default port suffixes, including the leading colon, that removeDefaultPort
// compares against the port found at the end of the host.
const (
	defaultHttpPort  = ":80"  // stripped when the scheme is "http"
	defaultHttpsPort = ":443" // stripped when the scheme is "https"
)
// Regular expressions used by the normalizations, compiled once at package
// initialization.
var (
	// rxPort matches a numeric port (with its colon) at the end of the host,
	// optionally followed by a single trailing slash.
	rxPort = regexp.MustCompile(`(:\d+)/?$`)

	// rxDirIndex matches a final path segment named "default" or "index" with
	// an extension of one to four word characters; group 1 is the preceding
	// slash (or the empty start of the path).
	rxDirIndex = regexp.MustCompile(`(^|/)((?:default|index)\.\w{1,4})$`)

	// rxDupSlashes matches any run of two or more consecutive slashes.
	rxDupSlashes = regexp.MustCompile(`/{2,}`)

	// rxDWORDHost matches an all-decimal host (group 1) followed by optional
	// trailing dots and an optional port (group 2).
	rxDWORDHost = regexp.MustCompile(`^(\d+)((?:\.+)?(?:\:\d*)?)$`)

	// rxOctalHost matches four dot-separated parts that each start with 0
	// (groups 1-4), followed by optional trailing dots and port (group 5).
	rxOctalHost = regexp.MustCompile(`^(0\d*)\.(0\d*)\.(0\d*)\.(0\d*)((?:\.+)?(?:\:\d*)?)$`)

	// rxHexHost matches a 0x-prefixed hexadecimal host (group 1 holds the
	// digits) followed by optional trailing dots and port (group 2).
	rxHexHost = regexp.MustCompile(`^0x([0-9A-Fa-f]+)((?:\.+)?(?:\:\d*)?)$`)

	// rxHostDots splits a host into everything before the port (group 1) and
	// an optional numeric port with its colon (group 2).
	rxHostDots = regexp.MustCompile(`^(.+?)(:\d+)?$`)

	// rxEmptyPort matches one or more colons at the very end of the host.
	rxEmptyPort = regexp.MustCompile(`:+$`)
)
// flagsOrder lists, in the order NormalizeURL applies them, the flags that
// have an implementation function. The functions themselves are kept in a
// map; since maps have undefined traversal order, this slice of ordered keys
// drives the iteration instead.
//
// FlagDecodeUnnecessaryEscapes has no action, since it is done automatically
// by parsing the string as an URL. Same for FlagUppercaseEscapes and
// FlagRemoveEmptyQuerySeparator.
var flagsOrder = []NormalizationFlags{
	FlagLowercaseScheme,
	FlagLowercaseHost,
	FlagRemoveDefaultPort,
	FlagRemoveDirectoryIndex,
	FlagRemoveDotSegments,
	FlagRemoveFragment,
	FlagForceHTTP, // Must be after remove default port (because https=443/http=80)
	FlagRemoveDuplicateSlashes,
	FlagRemoveWWW,
	FlagAddWWW,
	FlagSortQuery,
	FlagDecodeDWORDHost,
	FlagDecodeOctalHost,
	FlagDecodeHexHost,
	FlagRemoveUnnecessaryHostDots,
	FlagRemoveEmptyPortSeparator,
	FlagRemoveTrailingSlash, // These two (add/remove trailing slash) must be last
	FlagAddTrailingSlash,
}
// flags maps each flag to the function that implements it. Every function
// mutates the *url.URL it is given in place. NormalizeURL looks entries up by
// key while walking flagsOrder, so the order of entries here is unimportant.
var flags = map[NormalizationFlags]func(*url.URL){
	FlagLowercaseScheme:           lowercaseScheme,
	FlagLowercaseHost:             lowercaseHost,
	FlagRemoveDefaultPort:         removeDefaultPort,
	FlagRemoveDirectoryIndex:      removeDirectoryIndex,
	FlagRemoveDotSegments:         removeDotSegments,
	FlagRemoveFragment:            removeFragment,
	FlagForceHTTP:                 forceHTTP,
	FlagRemoveDuplicateSlashes:    removeDuplicateSlashes,
	FlagRemoveWWW:                 removeWWW,
	FlagAddWWW:                    addWWW,
	FlagSortQuery:                 sortQuery,
	FlagDecodeDWORDHost:           decodeDWORDHost,
	FlagDecodeOctalHost:           decodeOctalHost,
	FlagDecodeHexHost:             decodeHexHost,
	FlagRemoveUnnecessaryHostDots: removeUnncessaryHostDots,
	FlagRemoveEmptyPortSeparator:  removeEmptyPortSeparator,
	FlagRemoveTrailingSlash:       removeTrailingSlash,
	FlagAddTrailingSlash:          addTrailingSlash,
}
- // MustNormalizeURLString returns the normalized string, and panics if an error occurs.
- // It takes an URL string as input, as well as the normalization flags.
- func MustNormalizeURLString(u string, f NormalizationFlags) string {
- result, e := NormalizeURLString(u, f)
- if e != nil {
- panic(e)
- }
- return result
- }
- // NormalizeURLString returns the normalized string, or an error if it can't be parsed into an URL object.
- // It takes an URL string as input, as well as the normalization flags.
- func NormalizeURLString(u string, f NormalizationFlags) (string, error) {
- if parsed, e := url.Parse(u); e != nil {
- return "", e
- } else {
- options := make([]precis.Option, 1, 3)
- options[0] = precis.IgnoreCase
- if f&FlagLowercaseHost == FlagLowercaseHost {
- options = append(options, precis.FoldCase())
- }
- options = append(options, precis.Norm(norm.NFC))
- profile := precis.NewFreeform(options...)
- if parsed.Host, e = idna.ToASCII(profile.NewTransformer().String(parsed.Host)); e != nil {
- return "", e
- }
- return NormalizeURL(parsed, f), nil
- }
- panic("Unreachable code.")
- }
- // NormalizeURL returns the normalized string.
- // It takes a parsed URL object as input, as well as the normalization flags.
- func NormalizeURL(u *url.URL, f NormalizationFlags) string {
- for _, k := range flagsOrder {
- if f&k == k {
- flags[k](u)
- }
- }
- return urlesc.Escape(u)
- }
// lowercaseScheme converts a non-empty scheme of u to lower case, in place.
func lowercaseScheme(u *url.URL) {
	if u.Scheme == "" {
		return
	}
	u.Scheme = strings.ToLower(u.Scheme)
}
// lowercaseHost converts a non-empty host of u to lower case, in place.
func lowercaseHost(u *url.URL) {
	if u.Host == "" {
		return
	}
	u.Host = strings.ToLower(u.Host)
}
- func removeDefaultPort(u *url.URL) {
- if len(u.Host) > 0 {
- scheme := strings.ToLower(u.Scheme)
- u.Host = rxPort.ReplaceAllStringFunc(u.Host, func(val string) string {
- if (scheme == "http" && val == defaultHttpPort) || (scheme == "https" && val == defaultHttpsPort) {
- return ""
- }
- return val
- })
- }
- }
// removeTrailingSlash drops a single trailing slash from the path of u. When
// the path is empty, the host is examined instead and a single trailing slash
// is dropped from it.
func removeTrailingSlash(u *url.URL) {
	switch {
	case u.Path != "":
		u.Path = strings.TrimSuffix(u.Path, "/")
	case u.Host != "":
		u.Host = strings.TrimSuffix(u.Host, "/")
	}
}
// addTrailingSlash makes the path of u end with a slash. When the path is
// empty, the slash is appended to a non-empty host instead. Values that
// already end with a slash are left alone.
func addTrailingSlash(u *url.URL) {
	switch {
	case u.Path != "":
		if !strings.HasSuffix(u.Path, "/") {
			u.Path += "/"
		}
	case u.Host != "":
		if !strings.HasSuffix(u.Host, "/") {
			u.Host += "/"
		}
	}
}
// removeDotSegments resolves "." and ".." segments in the path of u. A "."
// segment is dropped; a ".." segment is dropped together with the segment
// kept just before it, if any. An empty path is left untouched.
func removeDotSegments(u *url.URL) {
	if u.Path == "" {
		return
	}

	var kept []string
	endsWithDot := false // whether the final segment was "." or ".."
	for _, segment := range strings.Split(u.Path, "/") {
		switch segment {
		case "..":
			if n := len(kept); n > 0 {
				kept = kept[:n-1]
			}
			endsWithDot = true
		case ".":
			endsWithDot = true
		default:
			kept = append(kept, segment)
			endsWithDot = false
		}
	}

	cleaned := strings.Join(kept, "/")
	// With a host that does not end in a slash, the path must start with one.
	if u.Host != "" && !strings.HasSuffix(u.Host, "/") && !strings.HasPrefix(cleaned, "/") {
		cleaned = "/" + cleaned
	}
	// A trailing dot segment referred to a directory, so keep a final slash.
	if endsWithDot && !strings.HasSuffix(cleaned, "/") {
		cleaned += "/"
	}
	u.Path = cleaned
}
- func removeDirectoryIndex(u *url.URL) {
- if len(u.Path) > 0 {
- u.Path = rxDirIndex.ReplaceAllString(u.Path, "$1")
- }
- }
// removeFragment clears the fragment of u, in place.
func removeFragment(u *url.URL) {
	u.Fragment = ""
}
// forceHTTP rewrites the scheme of u to "http" when it is "https" in any
// letter case. Every other scheme is left untouched.
func forceHTTP(u *url.URL) {
	if strings.ToLower(u.Scheme) != "https" {
		return
	}
	u.Scheme = "http"
}
- func removeDuplicateSlashes(u *url.URL) {
- if len(u.Path) > 0 {
- u.Path = rxDupSlashes.ReplaceAllString(u.Path, "/")
- }
- }
// removeWWW strips a leading "www." (matched in any letter case) from the
// host of u.
func removeWWW(u *url.URL) {
	host := u.Host
	if host == "" || !strings.HasPrefix(strings.ToLower(host), "www.") {
		return
	}
	u.Host = host[len("www."):]
}
// addWWW prefixes a non-empty host of u with "www." unless the host already
// starts with "www." in any letter case.
func addWWW(u *url.URL) {
	host := u.Host
	if host == "" || strings.HasPrefix(strings.ToLower(host), "www.") {
		return
	}
	u.Host = "www." + host
}
- func sortQuery(u *url.URL) {
- q := u.Query()
- if len(q) > 0 {
- arKeys := make([]string, len(q))
- i := 0
- for k, _ := range q {
- arKeys[i] = k
- i++
- }
- sort.Strings(arKeys)
- buf := new(bytes.Buffer)
- for _, k := range arKeys {
- sort.Strings(q[k])
- for _, v := range q[k] {
- if buf.Len() > 0 {
- buf.WriteRune('&')
- }
- buf.WriteString(fmt.Sprintf("%s=%s", k, urlesc.QueryEscape(v)))
- }
- }
- // Rebuild the raw query string
- u.RawQuery = buf.String()
- }
- }
- func decodeDWORDHost(u *url.URL) {
- if len(u.Host) > 0 {
- if matches := rxDWORDHost.FindStringSubmatch(u.Host); len(matches) > 2 {
- var parts [4]int64
- dword, _ := strconv.ParseInt(matches[1], 10, 0)
- for i, shift := range []uint{24, 16, 8, 0} {
- parts[i] = dword >> shift & 0xFF
- }
- u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[2])
- }
- }
- }
- func decodeOctalHost(u *url.URL) {
- if len(u.Host) > 0 {
- if matches := rxOctalHost.FindStringSubmatch(u.Host); len(matches) > 5 {
- var parts [4]int64
- for i := 1; i <= 4; i++ {
- parts[i-1], _ = strconv.ParseInt(matches[i], 8, 0)
- }
- u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[5])
- }
- }
- }
- func decodeHexHost(u *url.URL) {
- if len(u.Host) > 0 {
- if matches := rxHexHost.FindStringSubmatch(u.Host); len(matches) > 2 {
- // Conversion is safe because of regex validation
- parsed, _ := strconv.ParseInt(matches[1], 16, 0)
- // Set host as DWORD (base 10) encoded host
- u.Host = fmt.Sprintf("%d%s", parsed, matches[2])
- // The rest is the same as decoding a DWORD host
- decodeDWORDHost(u)
- }
- }
- }
- func removeUnncessaryHostDots(u *url.URL) {
- if len(u.Host) > 0 {
- if matches := rxHostDots.FindStringSubmatch(u.Host); len(matches) > 1 {
- // Trim the leading and trailing dots
- u.Host = strings.Trim(matches[1], ".")
- if len(matches) > 2 {
- u.Host += matches[2]
- }
- }
- }
- }
- func removeEmptyPortSeparator(u *url.URL) {
- if len(u.Host) > 0 {
- u.Host = rxEmptyPort.ReplaceAllString(u.Host, "")
- }
- }
|