scanner.go

// Copyright 2012 The Gorilla Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scanner

import (
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// tokenType identifies the type of lexical tokens.
type tokenType int

// String returns a string representation of the token type.
func (t tokenType) String() string {
	return tokenNames[t]
}

// Token represents a token and the corresponding string.
type Token struct {
	Type   tokenType
	Value  string
	Line   int
	Column int
}

// String returns a string representation of the token.
func (t *Token) String() string {
	if len(t.Value) > 10 {
		return fmt.Sprintf("%s (line: %d, column: %d): %.10q...",
			t.Type, t.Line, t.Column, t.Value)
	}
	return fmt.Sprintf("%s (line: %d, column: %d): %q",
		t.Type, t.Line, t.Column, t.Value)
}

// All tokens -----------------------------------------------------------------

// The complete list of tokens in CSS3.
const (
	// Scanner flags.
	TokenError tokenType = iota
	TokenEOF
	// From now on, only tokens from the CSS specification.
	TokenIdent
	TokenAtKeyword
	TokenString
	TokenHash
	TokenNumber
	TokenPercentage
	TokenDimension
	TokenURI
	TokenUnicodeRange
	TokenCDO
	TokenCDC
	TokenS
	TokenComment
	TokenFunction
	TokenIncludes
	TokenDashMatch
	TokenPrefixMatch
	TokenSuffixMatch
	TokenSubstringMatch
	TokenChar
	TokenBOM
)

// tokenNames maps tokenTypes to their names. Used for conversion to string.
var tokenNames = map[tokenType]string{
	TokenError:          "error",
	TokenEOF:            "EOF",
	TokenIdent:          "IDENT",
	TokenAtKeyword:      "ATKEYWORD",
	TokenString:         "STRING",
	TokenHash:           "HASH",
	TokenNumber:         "NUMBER",
	TokenPercentage:     "PERCENTAGE",
	TokenDimension:      "DIMENSION",
	TokenURI:            "URI",
	TokenUnicodeRange:   "UNICODE-RANGE",
	TokenCDO:            "CDO",
	TokenCDC:            "CDC",
	TokenS:              "S",
	TokenComment:        "COMMENT",
	TokenFunction:       "FUNCTION",
	TokenIncludes:       "INCLUDES",
	TokenDashMatch:      "DASHMATCH",
	TokenPrefixMatch:    "PREFIXMATCH",
	TokenSuffixMatch:    "SUFFIXMATCH",
	TokenSubstringMatch: "SUBSTRINGMATCH",
	TokenChar:           "CHAR",
	TokenBOM:            "BOM",
}

// Macros and productions -----------------------------------------------------
// http://www.w3.org/TR/css3-syntax/#tokenization

var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`)

// macros maps macro names to patterns to be expanded.
var macros = map[string]string{
	// must be escaped: `\.+*?()|[]{}^$`
	"ident":      `-?{nmstart}{nmchar}*`,
	"name":       `{nmchar}+`,
	"nmstart":    `[a-zA-Z_]|{nonascii}|{escape}`,
	"nonascii":   "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"unicode":    `\\[0-9a-fA-F]{1,6}{wc}?`,
	"escape":     "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"nmchar":     `[a-zA-Z0-9_-]|{nonascii}|{escape}`,
	"num":        `[0-9]*\.[0-9]+|[0-9]+`,
	"string":     `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`,
	"stringchar": `{urlchar}|[ ]|\\{nl}`,
	"nl":         `[\n\r\f]|\r\n`,
	"w":          `{wc}*`,
	"wc":         `[\t\n\f\r ]`,
	// urlchar should accept [(ascii characters minus those that need escaping)|{nonascii}|{escape}]
	// ASCII characters range = `[\u0020-\u007e]`
	// Skip space \u0020 = `[\u0021-\u007e]`
	// Skip quotation mark \u0022 = `[\u0021\u0023-\u007e]`
	// Skip apostrophe \u0027 = `[\u0021\u0023-\u0026\u0028-\u007e]`
	// Skip reverse solidus \u005c = `[\u0021\u0023-\u0026\u0028-\u005b\u005d-\u007e]`
	// Finally, the left square bracket (\u005b) and right (\u005d) need escaping themselves.
	"urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}",
}

// productions maps the list of tokens to patterns to be expanded.
var productions = map[tokenType]string{
	// Unused regexps (matched using other methods) are commented out.
	TokenIdent:        `{ident}`,
	TokenAtKeyword:    `@{ident}`,
	TokenString:       `{string}`,
	TokenHash:         `#{name}`,
	TokenNumber:       `{num}`,
	TokenPercentage:   `{num}%`,
	TokenDimension:    `{num}{ident}`,
	TokenURI:          `url\({w}(?:{string}|{urlchar}*?){w}\)`,
	TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`,
	//TokenCDO:        `<!--`,
	TokenCDC:      `-->`,
	TokenS:        `{wc}+`,
	TokenComment:  `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`,
	TokenFunction: `{ident}\(`,
	//TokenIncludes:       `~=`,
	//TokenDashMatch:      `\|=`,
	//TokenPrefixMatch:    `\^=`,
	//TokenSuffixMatch:    `\$=`,
	//TokenSubstringMatch: `\*=`,
	//TokenChar:           `[^"']`,
	//TokenBOM:            "\uFEFF",
}

// matchers maps the list of tokens to compiled regular expressions.
//
// The map is filled on init() using the macros and productions defined in
// the CSS specification.
var matchers = map[tokenType]*regexp.Regexp{}

// matchOrder is the order to test regexps when first-char shortcuts
// can't be used.
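// Longer and more specific productions come first so that, for example,
// `url(...)` is matched as URI before FUNCTION or IDENT, and `12px`/`12%`
// are matched as DIMENSION/PERCENTAGE before NUMBER.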
var matchOrder = []tokenType{
	TokenURI,
	TokenFunction,
	TokenUnicodeRange,
	TokenIdent,
	TokenDimension,
	TokenPercentage,
	TokenNumber,
	TokenCDC,
}

func init() {
	// replace macros and compile regexps for productions.
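	// For example, the TokenNumber production `{num}` expands to
	// `(?:[0-9]*\.[0-9]+|[0-9]+)` and is compiled anchored to the start of
	// the input as `^(?:(?:[0-9]*\.[0-9]+|[0-9]+))`.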
	replaceMacro := func(s string) string {
		return "(?:" + macros[s[1:len(s)-1]] + ")"
	}
	for t, s := range productions {
		for macroRegexp.MatchString(s) {
			s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro)
		}
		matchers[t] = regexp.MustCompile("^(?:" + s + ")")
	}
}

// Scanner --------------------------------------------------------------------

// New returns a new CSS scanner for the given input.
func New(input string) *Scanner {
	// Normalize newlines.
	input = strings.Replace(input, "\r\n", "\n", -1)
	return &Scanner{
		input: input,
		row:   1,
		col:   1,
	}
}

// Scanner scans an input and emits tokens following the CSS3 specification.
type Scanner struct {
	input string
	pos   int
	row   int
	col   int
	err   *Token
}

// Next returns the next token from the input.
//
// At the end of the input the token type is TokenEOF.
//
// If the input can't be tokenized the token type is TokenError. This occurs
// in case of unclosed quotation marks or comments.
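//
// A typical loop drains the scanner until TokenEOF or TokenError is returned
// (illustrative sketch):
//
//	s := New("a { color: #fff }")
//	for {
//		token := s.Next()
//		if token.Type == TokenEOF || token.Type == TokenError {
//			break
//		}
//		fmt.Println(token)
//	}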
func (s *Scanner) Next() *Token {
	if s.err != nil {
		return s.err
	}
	if s.pos >= len(s.input) {
		s.err = &Token{TokenEOF, "", s.row, s.col}
		return s.err
	}
	if s.pos == 0 {
		// Test BOM only once, at the beginning of the file.
		if strings.HasPrefix(s.input, "\uFEFF") {
			return s.emitSimple(TokenBOM, "\uFEFF")
		}
	}
	// There's a lot we can guess based on the first byte, so we'll take a
	// shortcut before testing multiple regexps.
	input := s.input[s.pos:]
	switch input[0] {
	case '\t', '\n', '\f', '\r', ' ':
		// Whitespace.
		return s.emitToken(TokenS, matchers[TokenS].FindString(input))
	case '.':
		// Dot is too common not to have a quick check: if it is not followed
		// by a digit it is a Char; otherwise it starts a number, percentage
		// or dimension, which will be matched by the regexps below.
		if len(input) > 1 && !unicode.IsDigit(rune(input[1])) {
			return s.emitSimple(TokenChar, ".")
		}
	case '#':
		// Another common one: Hash or Char.
		if match := matchers[TokenHash].FindString(input); match != "" {
			return s.emitToken(TokenHash, match)
		}
		return s.emitSimple(TokenChar, "#")
	case '@':
		// Another common one: AtKeyword or Char.
		if match := matchers[TokenAtKeyword].FindString(input); match != "" {
			return s.emitSimple(TokenAtKeyword, match)
		}
		return s.emitSimple(TokenChar, "@")
	case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}':
		// More common chars.
		return s.emitSimple(TokenChar, string(input[0]))
	case '"', '\'':
		// String or error.
		match := matchers[TokenString].FindString(input)
		if match != "" {
			return s.emitToken(TokenString, match)
		}
		s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col}
		return s.err
	case '/':
		// Comment, error or Char.
		if len(input) > 1 && input[1] == '*' {
			match := matchers[TokenComment].FindString(input)
			if match != "" {
				return s.emitToken(TokenComment, match)
			} else {
				s.err = &Token{TokenError, "unclosed comment", s.row, s.col}
				return s.err
			}
		}
		return s.emitSimple(TokenChar, "/")
	case '~':
		// Includes or Char.
		return s.emitPrefixOrChar(TokenIncludes, "~=")
	case '|':
		// DashMatch or Char.
		return s.emitPrefixOrChar(TokenDashMatch, "|=")
	case '^':
		// PrefixMatch or Char.
		return s.emitPrefixOrChar(TokenPrefixMatch, "^=")
	case '$':
		// SuffixMatch or Char.
		return s.emitPrefixOrChar(TokenSuffixMatch, "$=")
	case '*':
		// SubstringMatch or Char.
		return s.emitPrefixOrChar(TokenSubstringMatch, "*=")
	case '<':
		// CDO or Char.
		return s.emitPrefixOrChar(TokenCDO, "<!--")
	}
	// Test all regexps, in order.
	for _, token := range matchOrder {
		if match := matchers[token].FindString(input); match != "" {
			return s.emitToken(token, match)
		}
	}
	// We already handled unclosed quotation marks and comments,
	// so this can only be a Char.
	r, width := utf8.DecodeRuneInString(input)
	token := &Token{TokenChar, string(r), s.row, s.col}
	s.col += width
	s.pos += width
	return token
}

// updatePosition updates input coordinates based on the consumed text.
func (s *Scanner) updatePosition(text string) {
	width := utf8.RuneCountInString(text)
	lines := strings.Count(text, "\n")
	s.row += lines
	if lines == 0 {
		s.col += width
	} else {
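		// text[LastIndex(...):] still starts with the last "\n", so the rune
		// count is 1 + the runes after it, i.e. the 1-based column of the
		// next rune to be read.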
		s.col = utf8.RuneCountInString(text[strings.LastIndex(text, "\n"):])
	}
	s.pos += len(text) // while col is a rune index, pos is a byte index
}

// emitToken returns a Token for the string v and updates the scanner position.
func (s *Scanner) emitToken(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.updatePosition(v)
	return token
}

// emitSimple returns a Token for the string v and updates the scanner
// position in a simplified manner.
//
// The string is known to have only ASCII characters and to not have a newline.
func (s *Scanner) emitSimple(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.col += len(v)
	s.pos += len(v)
	return token
}

// emitPrefixOrChar returns a Token for type t if the current position
// matches the given prefix. Otherwise it returns a Char token using the
// first character from the prefix.
//
// The prefix is known to have only ASCII characters and to not have a newline.
func (s *Scanner) emitPrefixOrChar(t tokenType, prefix string) *Token {
	if strings.HasPrefix(s.input[s.pos:], prefix) {
		return s.emitSimple(t, prefix)
	}
	return s.emitSimple(TokenChar, string(prefix[0]))
}