From 7563929d2b95cf1530a4c6eaaf29678f784845e7 Mon Sep 17 00:00:00 2001 From: Kane York Date: Mon, 12 Mar 2018 17:03:37 -0700 Subject: [PATCH 01/33] update README to point to the spec --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e266555..c8eee22 100644 --- a/README.md +++ b/README.md @@ -2,4 +2,4 @@ css === [![GoDoc](https://godoc.org/github.com/gorilla/css?status.svg)](https://godoc.org/github.com/gorilla/css) [![Build Status](https://travis-ci.org/gorilla/css.png?branch=master)](https://travis-ci.org/gorilla/css) -A CSS3 tokenizer. +A CSS3 tokenizer based on https://www.w3.org/TR/css-syntax-3/#tokenizer-algorithms From 55f89738f4941e08660ed869c7033c459544fd08 Mon Sep 17 00:00:00 2001 From: Kane York Date: Mon, 12 Mar 2018 17:12:01 -0700 Subject: [PATCH 02/33] Export TokenType, add missing token types --- scanner/scanner.go | 44 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/scanner/scanner.go b/scanner/scanner.go index 23fa740..88d73bf 100644 --- a/scanner/scanner.go +++ b/scanner/scanner.go @@ -12,17 +12,17 @@ import ( "unicode/utf8" ) -// tokenType identifies the type of lexical tokens. -type tokenType int +// TokenType identifies the type of lexical tokens. +type TokenType int // String returns a string representation of the token type. -func (t tokenType) String() string { +func (t TokenType) String() string { return tokenNames[t] } // Token represents a token and the corresponding string. type Token struct { - Type tokenType + Type TokenType Value string Line int Column int @@ -57,7 +57,7 @@ const ( TokenUnicodeRange TokenCDO TokenCDC - TokenS + TokenS // whitespace-token TokenComment TokenFunction TokenIncludes @@ -65,12 +65,28 @@ const ( TokenPrefixMatch TokenSuffixMatch TokenSubstringMatch - TokenChar + TokenDelim TokenBOM + // Added later + TokenBadString + TokenBadURI + TokenColumn + TokenColon + TokenSemicolon + TokenComma + TokenOpenBracket + TokenCloseBracket + TokenOpenParen + TokenCloseParen + TokenOpenBrace + TokenCloseBrace ) +// backwards compatibility +const TokenChar = TokenDelim + // tokenNames maps tokenType's to their names. Used for conversion to string. 
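// A small illustrative sketch, not part of this patch: with TokenType exported
// and the single-character tokens split out of the old catch-all TokenChar, a
// consumer can switch on the token type directly. The helper name below is
// hypothetical; TokenChar remains as an alias for TokenDelim.
func describeToken(tok Token) string {
	switch tok.Type {
	case TokenOpenBrace, TokenCloseBrace, TokenOpenBracket, TokenCloseBracket,
		TokenOpenParen, TokenCloseParen:
		return "grouping: " + tok.Value
	case TokenDelim: // formerly TokenChar
		return "delim: " + tok.Value
	default:
		return tok.Type.String()
	}
}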
-var tokenNames = map[tokenType]string{ +var tokenNames = map[TokenType]string{ TokenError: "error", TokenEOF: "EOF", TokenIdent: "IDENT", @@ -92,8 +108,20 @@ var tokenNames = map[tokenType]string{ TokenPrefixMatch: "PREFIXMATCH", TokenSuffixMatch: "SUFFIXMATCH", TokenSubstringMatch: "SUBSTRINGMATCH", - TokenChar: "CHAR", + TokenDelim: "DELIM", TokenBOM: "BOM", + TokenBadString: "BAD-STRING", + TokenBadURI: "BAD-URI", + TokenColumn: "COLUMN", + TokenColon: "COLON", + TokenSemicolon: "SEMICOLON", + TokenComma: "COMMA", + TokenOpenBracket: "[", + TokenCloseBracket: "]", + TokenOpenParen: "(", + TokenCloseParen: ")", + TokenOpenBrace: "{", + TokenCloseBrace: "}", } // Macros and productions ----------------------------------------------------- From 63286005d87c505107be7eb61669fbd4bd70bba5 Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 13 Mar 2018 10:17:05 -0700 Subject: [PATCH 03/33] Add a text/transform preprocessor for input --- scanner/crlf.go | 62 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 scanner/crlf.go diff --git a/scanner/crlf.go b/scanner/crlf.go new file mode 100644 index 0000000..ba5c7dc --- /dev/null +++ b/scanner/crlf.go @@ -0,0 +1,62 @@ +package scanner + +// The crlf package helps in dealing with files that have DOS-style CR/LF line +// endings. +// +// Copyright (c) 2015 Andy Balholm. Licensed under 2-Clause BSD. +// +// package crlf + +import ( + "io" + + "golang.org/x/text/transform" +) + +// Normalize takes CRLF, CR, or LF line endings in src, and converts them +// to LF in dst. +// +// cssparse: Also replace null bytes with U+FFFD REPLACEMENT CHARACTER. +type normalize struct { + prev byte +} + +const replacementCharacter = "\uFFFD" + +func (n *normalize) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + for nDst < len(dst) && nSrc < len(src) { + c := src[nSrc] + switch c { + case '\r': + dst[nDst] = '\n' + case '\n': + if n.prev == '\r' { + nSrc++ + n.prev = c + continue + } + dst[nDst] = '\n' + case 0: + // nb: len(replacementCharacter) == 3 + if nDst+3 >= len(dst) { + err = transform.ErrShortDst + return + } + copy(dst[nDst:], replacementCharacter[:]) + nDst += 2 + default: + dst[nDst] = c + } + n.prev = c + nDst++ + nSrc++ + } + if nSrc < len(src) { + err = transform.ErrShortDst + } + return +} + +func (n *normalize) Reset() { + n.prev = 0 +} From c02b43fa8daf5e8d99d0b3cff09f2dad5da75f54 Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 13 Mar 2018 12:59:14 -0700 Subject: [PATCH 04/33] implement the 'consume a token' algorithm --- scanner/scanner.go | 111 +------------- scanner/token.go | 228 +++++++++++++++++++++++++++++ scanner/tokenizer.go | 335 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 564 insertions(+), 110 deletions(-) create mode 100644 scanner/token.go create mode 100644 scanner/tokenizer.go diff --git a/scanner/scanner.go b/scanner/scanner.go index 88d73bf..cbe5fdf 100644 --- a/scanner/scanner.go +++ b/scanner/scanner.go @@ -12,22 +12,6 @@ import ( "unicode/utf8" ) -// TokenType identifies the type of lexical tokens. -type TokenType int - -// String returns a string representation of the token type. -func (t TokenType) String() string { - return tokenNames[t] -} - -// Token represents a token and the corresponding string. -type Token struct { - Type TokenType - Value string - Line int - Column int -} - // String returns a string representation of the token. 
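// Usage sketch for the preprocessor introduced above (not part of the patch;
// assumes the normalize transformer, the "io" import, and
// golang.org/x/text/transform): wrapping an input stream this way gives the
// tokenizer LF-only, NUL-free text, which is how the tokenizer added in the
// next patch constructs its own reader.
func newNormalizedReader(r io.Reader) io.Reader {
	return transform.NewReader(r, new(normalize))
}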
func (t *Token) String() string { if len(t.Value) > 10 { @@ -40,90 +24,6 @@ func (t *Token) String() string { // All tokens ----------------------------------------------------------------- -// The complete list of tokens in CSS3. -const ( - // Scanner flags. - TokenError tokenType = iota - TokenEOF - // From now on, only tokens from the CSS specification. - TokenIdent - TokenAtKeyword - TokenString - TokenHash - TokenNumber - TokenPercentage - TokenDimension - TokenURI - TokenUnicodeRange - TokenCDO - TokenCDC - TokenS // whitespace-token - TokenComment - TokenFunction - TokenIncludes - TokenDashMatch - TokenPrefixMatch - TokenSuffixMatch - TokenSubstringMatch - TokenDelim - TokenBOM - // Added later - TokenBadString - TokenBadURI - TokenColumn - TokenColon - TokenSemicolon - TokenComma - TokenOpenBracket - TokenCloseBracket - TokenOpenParen - TokenCloseParen - TokenOpenBrace - TokenCloseBrace -) - -// backwards compatibility -const TokenChar = TokenDelim - -// tokenNames maps tokenType's to their names. Used for conversion to string. -var tokenNames = map[TokenType]string{ - TokenError: "error", - TokenEOF: "EOF", - TokenIdent: "IDENT", - TokenAtKeyword: "ATKEYWORD", - TokenString: "STRING", - TokenHash: "HASH", - TokenNumber: "NUMBER", - TokenPercentage: "PERCENTAGE", - TokenDimension: "DIMENSION", - TokenURI: "URI", - TokenUnicodeRange: "UNICODE-RANGE", - TokenCDO: "CDO", - TokenCDC: "CDC", - TokenS: "S", - TokenComment: "COMMENT", - TokenFunction: "FUNCTION", - TokenIncludes: "INCLUDES", - TokenDashMatch: "DASHMATCH", - TokenPrefixMatch: "PREFIXMATCH", - TokenSuffixMatch: "SUFFIXMATCH", - TokenSubstringMatch: "SUBSTRINGMATCH", - TokenDelim: "DELIM", - TokenBOM: "BOM", - TokenBadString: "BAD-STRING", - TokenBadURI: "BAD-URI", - TokenColumn: "COLUMN", - TokenColon: "COLON", - TokenSemicolon: "SEMICOLON", - TokenComma: "COMMA", - TokenOpenBracket: "[", - TokenCloseBracket: "]", - TokenOpenParen: "(", - TokenCloseParen: ")", - TokenOpenBrace: "{", - TokenCloseBrace: "}", -} - // Macros and productions ----------------------------------------------------- // http://www.w3.org/TR/css3-syntax/#tokenization @@ -217,7 +117,7 @@ func init() { // Scanner -------------------------------------------------------------------- // New returns a new CSS scanner for the given input. -func New(input string) *Scanner { +func New(r *bufio.Reader) *Scanner { // Normalize newlines. input = strings.Replace(input, "\r\n", "\n", -1) return &Scanner{ @@ -227,15 +127,6 @@ func New(input string) *Scanner { } } -// Scanner scans an input and emits tokens following the CSS3 specification. -type Scanner struct { - input string - pos int - row int - col int - err *Token -} - // Next returns the next token from the input. // // At the end of the input the token type is TokenEOF. diff --git a/scanner/token.go b/scanner/token.go new file mode 100644 index 0000000..3a7dd27 --- /dev/null +++ b/scanner/token.go @@ -0,0 +1,228 @@ +// Copyright 2018 Kane York. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package scanner + +// TokenType identifies the type of lexical tokens. +type TokenType int + +// String returns a string representation of the token type. +func (t TokenType) String() string { + return tokenNames[t] +} + +// Stop tokens are TokenError, TokenEOF, TokenBadEscape, +// TokenBadString, TokenBadURI. A consumer that does not want to tolerate +// parsing errors should stop parsing when this returns true. 
+func (t TokenType) StopToken() bool { + return t == TokenError || t == TokenEOF || t == TokenBadEscape || t == + TokenBadString || t == TokenBadURI +} + +// Simple tokens TODO figure out a useful definition for this. +func (t TokenType) SimpleToken() bool { + if t.StopToken() { + return false + } + if t == TokenHash || t == TokenNumber || t == TokenPercentage || t == TokenDimension || t == TokenUnicodeRange { + return false + } + return true +} + +// Token represents a token in the CSS syntax. +type Token struct { + Type TokenType + String string + // Extra data for the token beyond a simple string. + // Will always be a pointer to a "Token*Extra" type in this package. + Extra TokenExtra +} + +// The complete list of tokens in CSS Syntax Level 3. +const ( + // Scanner flags. + TokenError tokenType = iota + TokenEOF + // From now on, only tokens from the CSS specification. + TokenIdent + TokenAtKeyword + TokenString + TokenHash + TokenNumber + TokenPercentage + TokenDimension + TokenURI + TokenUnicodeRange + TokenCDO + TokenCDC + // Whitespace + TokenS + // CSS Syntax Level 3 removes comments from the token stream, but they are + // preserved here. + TokenComment + TokenFunction + TokenIncludes + TokenDashMatch + TokenPrefixMatch + TokenSuffixMatch + TokenSubstringMatch + TokenColumn + TokenDelim + // Error tokens + TokenBadString + TokenBadURI + TokenBadEscape // a '\' right before a newline + // Single-character tokens + TokenColon + TokenSemicolon + TokenComma + TokenOpenBracket + TokenCloseBracket + TokenOpenParen + TokenCloseParen + TokenOpenBrace + TokenCloseBrace +) + +// backwards compatibility +const TokenChar = TokenDelim + +// tokenNames maps tokenType's to their names. Used for conversion to string. +var tokenNames = map[TokenType]string{ + TokenError: "error", + TokenEOF: "EOF", + TokenIdent: "IDENT", + TokenAtKeyword: "ATKEYWORD", + TokenString: "STRING", + TokenHash: "HASH", + TokenNumber: "NUMBER", + TokenPercentage: "PERCENTAGE", + TokenDimension: "DIMENSION", + TokenURI: "URI", + TokenUnicodeRange: "UNICODE-RANGE", + TokenCDO: "CDO", + TokenCDC: "CDC", + TokenS: "S", + TokenComment: "COMMENT", + TokenFunction: "FUNCTION", + TokenIncludes: "INCLUDES", + TokenDashMatch: "DASHMATCH", + TokenPrefixMatch: "PREFIXMATCH", + TokenSuffixMatch: "SUFFIXMATCH", + TokenSubstringMatch: "SUBSTRINGMATCH", + TokenDelim: "DELIM", + TokenBOM: "BOM", + TokenBadString: "BAD-STRING", + TokenBadURI: "BAD-URI", + TokenBadEscape: "BAD-ESCAPE", + TokenColumn: "COLUMN", + TokenColon: "COLON", + TokenSemicolon: "SEMICOLON", + TokenComma: "COMMA", + TokenOpenBracket: "LEFT-BRACKET", // [] + TokenCloseBracket: "RIGHT-BRACKET", + TokenOpenParen: "LEFT-PAREN", // () + TokenCloseParen: "RIGHT-PAREN", + TokenOpenBrace: "LEFT-BRACE", // {} + TokenCloseBrace: "RIGHT-BRACE", +} + +// TokenExtra fills the .Extra field of a token. Consumers should perform a +// type cast to the proper type to inspect its data. +type TokenExtra interface { + String() string +} + +// TokenExtraTypeLookup provides a handy check for whether a given token type +// should contain extra data. +var TokenExtraTypeLookup = map[TokenType]interface{}{ + TokenError: &TokenExtraError{}, + TokenBadEscape: &TokenExtraError{}, + TokenBadString: &TokenExtraError{}, + TokenBadURI: &TokenExtraError{}, + TokenHash: &TokenExtraHash{}, + TokenNumber: &TokenExtraNumeric{}, + TokenPercentage: &TokenExtraNumeric{}, + TokenDimension: &TokenExtraNumeric{}, + TokenUnicodeRange: &TokenExtraUnicodeRange{}, +} + +// TokenExtraHash is attached to TokenHash. 
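// Sketch of how the Extra field is meant to be consumed (not from the patch):
// the concrete type behind the TokenExtra interface is fixed per token type,
// as recorded in TokenExtraTypeLookup, so a type assertion recovers the
// structured data. The helper name is hypothetical.
func dimensionUnit(tok Token) (string, bool) {
	if tok.Type != TokenDimension {
		return "", false
	}
	num, ok := tok.Extra.(*TokenExtraNumeric)
	if !ok || num == nil {
		return "", false
	}
	return num.Dimension, true
}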
+type TokenExtraHash struct { + IsIdentifier bool +} + +func (e *TokenExtraHash) String() string { + if e == nil || !e.IsIdentifier { + return "unrestricted" + } else { + return "id" + } +} + +// TokenExtraNumeric is attached to TokenNumber, TokenPercentage, and +// TokenDimension. +type TokenExtraNumeric struct { + NonInteger bool + Dimension string +} + +func (e *TokenExtraNumeric) String() string { + if e == nil { + return "" + } + if e.Dimension != "" { + return e.Dimension + } + return "" +} + +// TokenExtraUnicodeRange is attached to a TokenUnicodeRange. +type TokenExtraUnicodeRange struct { + Start rune + End rune +} + +func (e *TokenExtraUnicodeRange) String() string { + if e == nil { + panic("TokenExtraUnicodeRange: unexpected nil pointer value") + } + + if e.Start == e.End { + return fmt.Sprintf("%0X", e.Start) + } else { + return fmt.Sprintf("%0X-%0X", e.Start, e.End) + } +} + +// TokenExtraError is attached to a TokenError and contains the same value as +// Tokenizer.Err(). See also the ParseError type and ParseError.Recoverable(). +type TokenExtraError struct { + Err error +} + +// String returns the error text. +func (e *TokenExtraError) String() string { + return e.Err.Error() +} + +// Error implements error. +func (e *TokenExtraError) Error() string { + return e.Err.Error() +} + +// Cause implements errors.Causer. +func (e *TokenExtraError) Cause() error { + return e.Err +} + +// Returns the ParseError object, if present. +func (e *TokenExtraError) ParseError() *ParseError { + pe, ok := e.Err.(*ParseError) + if !ok { + return nil + } + return pe +} diff --git a/scanner/tokenizer.go b/scanner/tokenizer.go new file mode 100644 index 0000000..0f023fc --- /dev/null +++ b/scanner/tokenizer.go @@ -0,0 +1,335 @@ +package scanner + +import ( + "bufio" + stdErrors "errors" + "golang.org/x/text/transform" +) + +var ( + ErrBadEscape = &ParseError{Type: TokenBadEscape, Message: "bad escape (backslash-newline) in input"} +) + +// Tokenizer scans an input and emits tokens following the CSS Syntax Level 3 +// specification. +type Tokenizer struct { + r *bufio.Reader + err error + peek [3]byte + + tok Token +} + +// Construct a Tokenizer from the given input. Input need not be normalized. +func NewTokenizer(r io.Reader) *Tokenizer { + return &Tokenizer{ + r: bufio.NewReader(transform.NewReader(r, new(normalize))), + } +} + +// Scan for the next token. If the tokenizer is in an error state, no input will be consumed. See .AcknowledgeError(). +func (z *Tokenizer) Scan() { + defer func() { + rec := recover() + if rErr, ok := rec.(error); ok { + z.err = rErr + } else if rec != nil { + panic(rec) + } + }() + + if z.err != nil { + z.tok = z.next() + } +} + +// Return the current token. +func (z *Tokenizer) Token() Token { + return t.tok +} + +func (z *Tokenizer) Err() error { + return t.err +} + +// Acknowledge a returned error token. This can only be called to clear TokenBadString, TokenBadURI, and TokenEscape. +func (z *Tokenizer) AcknowledgeError() { + parseErr, ok := t.err.(*ParseError) + if !ok { + panic("cssparse: AcknowledgeError() called for a foreign error") + } +} + +// repeek reads the next 3 bytes into the tokenizer. 
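// Consumer-loop sketch (not part of the patch; assumes the Scan/Token/
// AcknowledgeError API above, with the error-clearing behaviour completed
// later in the series): collect tokens until EOF or a hard error, clearing
// recoverable syntax errors so tokenization can continue.
func allTokens(z *Tokenizer) []Token {
	var out []Token
	for {
		z.Scan()
		tok := z.Token()
		out = append(out, tok)
		switch tok.Type {
		case TokenEOF, TokenError:
			return out
		case TokenBadString, TokenBadURI, TokenBadEscape:
			// recoverable per the spec; acknowledge and keep scanning
			z.AcknowledgeError()
		}
	}
}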
+func (z *Tokenizer) repeek() { + by, err := z.r.Peek(3) + if err != nil { + panic(err) + } + copy(z.peek, by) + + // zero fill on EOF + i := len(by) + for i < 3 { + z.peek[i] = 0 + i++ + } +} + +// up to 2 bytes +func isValidEscape(p []byte) bool { + if len(p) < 2 { + return false + } + if p[0] != '\\' { + return false + } + if p[1] == '\n' { + return false + } + return true +} + +func isNameStart(p byte) bool { + if p > 0x7F { + return true // any high code points + } + if p == '_' { + return true + } + if p >= 'A' && p <= 'Z' { + return true + } + if p >= 'a' && p <= 'z' { + return true + } + return false +} + +func isNameCode(p byte) { + if p > 0x7F { + return true // any high code points + } + if p == '_' || p == '-' { + return true + } + if p >= 'A' && p <= 'Z' { + return true + } + if p >= 'a' && p <= 'z' { + return true + } + if p >= '0' && p <= '9' { + return true + } + return false +} + +// up to 3 bytes +func isStartIdentifier(p []byte) bool { + if p[0] == '-' { + p = p[1:] + } + if isNameStart(p[0]) { + return true + } else if isValidEscape(p) { + return true + } + return false +} + +// up to 3 bytes +func isStartNumber(p []byte) bool { + if p[0] == '+' || p[0] == '-' { + p = p[1:] + } + if p[0] == '.' { + p = p[1:] + } + if p[0] >= '0' && p[0] <= '9' { + return true + } + return false +} + +// repeek must be called before the following: + +func (z *Tokenizer) nextIsEscape() bool { + return isValidEscape(z.peek[:2]) +} + +func (z *Tokenizer) nextStartsIdentifier() bool { + return isStartIdentifier(z.peek[:3]) +} + +func (z *Tokenizer) nextIsNumber() bool { + return isStartNumber(z.peek[:3]) +} + +func (z *Tokenizer) nextCompare(vs string) bool { + return string(z.peek[:len(vs)]) == vs +} + +var premadeTokens = map[byte]Token{ + '$': Token{ + Type: TokenSuffixMatch, + String: "$=", + }, + '*': Token{ + Type: TokenSubstringMatch, + String: "*=", + }, + '^': Token{ + Type: TokenPrefixMatch, + String: "^=", + }, + '~': Token{ + Type: TokenIncludeMatch, + String: "~=", + }, + '(': Token{Type: TokenOpenParen, String: "("}, + ')': Token{Type: TokenCloseParen, String: ")"}, + '[': Token{Type: TokenOpenBracket, String: "["}, + ']': Token{Type: TokenCloseBracket, String: "]"}, + '{': Token{Type: TokenOpenBrace, String: "{"}, + '}': Token{Type: TokenCloseBrace, String: "}"}, + ':': Token{Type: TokenColon, String: ":"}, + ';': Token{Type: TokenSemicolon, String: ";"}, + ',': Token{Type: TokenComma, String: ","}, + + '\\': Token{Type: TokenBadEscape, String: "\\"}, + + 'A': Token{Type: TokenDashMatch, String: "|="}, + 'B': Token{Type: TokenColumn, String: "||"}, + 'C': Token{Type: TokenCDC, String: "-->"}, + 'O': Token{Type: TokenCDO, String: "", TokenCDC, "-->") checkMatch(" \n \t \n", TokenS, " \n \t \n") checkMatch("/* foo */", TokenComment, "/* foo */") - checkMatch("bar(", TokenFunction, "bar(") + checkMatch("bar(", TokenFunction, "bar") checkMatch("~=", TokenIncludes, "~=") checkMatch("|=", TokenDashMatch, "|=") checkMatch("^=", TokenPrefixMatch, "^=") checkMatch("$=", TokenSuffixMatch, "$=") checkMatch("*=", TokenSubstringMatch, "*=") checkMatch("{", TokenChar, "{") - checkMatch("\uFEFF", TokenBOM, "\uFEFF") + // checkMatch("\uFEFF", TokenBOM, "\uFEFF") checkMatch(`╯︵┻━┻"stuff"`, TokenIdent, "╯︵┻━┻", TokenString, `"stuff"`) } diff --git a/scanner/token.go b/scanner/token.go index 11270ad..ab5b377 100644 --- a/scanner/token.go +++ b/scanner/token.go @@ -1,9 +1,12 @@ // Copyright 2018 Kane York. +// Copyright 2012 The Gorilla Authors. All rights reserved. 
// Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package scanner +import "fmt" + // TokenType identifies the type of lexical tokens. type TokenType int @@ -31,10 +34,21 @@ func (t TokenType) SimpleToken() bool { return true } +// ParseError represents a CSS syntax error. +type ParseError struct { + Type TokenType + Message string + Loc int +} + +func (e *ParseError) Error() string { + return e.Message +} + // Token represents a token in the CSS syntax. type Token struct { - Type TokenType - String string + Type TokenType + Value string // Extra data for the token beyond a simple string. // Will always be a pointer to a "Token*Extra" type in this package. Extra TokenExtra @@ -43,7 +57,7 @@ type Token struct { // The complete list of tokens in CSS Syntax Level 3. const ( // Scanner flags. - TokenError tokenType = iota + TokenError TokenType = iota TokenEOF // From now on, only tokens from the CSS specification. TokenIdent @@ -113,7 +127,6 @@ var tokenNames = map[TokenType]string{ TokenSuffixMatch: "SUFFIXMATCH", TokenSubstringMatch: "SUBSTRINGMATCH", TokenDelim: "DELIM", - TokenBOM: "BOM", TokenBadString: "BAD-STRING", TokenBadURI: "BAD-URI", TokenBadEscape: "BAD-ESCAPE", @@ -192,9 +205,9 @@ func (e *TokenExtraUnicodeRange) String() string { } if e.Start == e.End { - return fmt.Sprintf("%0X", e.Start) + return fmt.Sprintf("U+%04X", e.Start) } else { - return fmt.Sprintf("%0X-%0X", e.Start, e.End) + return fmt.Sprintf("U+%04X-%04X", e.Start, e.End) } } diff --git a/scanner/tokenizer.go b/scanner/tokenizer.go index 716a96f..24e8720 100644 --- a/scanner/tokenizer.go +++ b/scanner/tokenizer.go @@ -1,13 +1,21 @@ +// Copyright (c) 2018 Kane York. Licensed under 2-Clause BSD. + package scanner import ( "bufio" - stdErrors "errors" + "bytes" + "fmt" + "io" + "strconv" + "strings" + "unicode/utf8" + "golang.org/x/text/transform" ) var ( - ErrBadEscape = &ParseError{Type: TokenBadEscape, Message: "bad escape (backslash-newline) in input"} + errBadEscape = &ParseError{Type: TokenBadEscape, Message: "bad escape (backslash-newline) in input"} ) // Tokenizer scans an input and emits tokens following the CSS Syntax Level 3 @@ -27,46 +35,73 @@ func NewTokenizer(r io.Reader) *Tokenizer { } } -// Scan for the next token. If the tokenizer is in an error state, no input will be consumed. See .AcknowledgeError(). +// Scan for the next token. If the tokenizer is in an error state, no input +// will be consumed, and .AcknowledgeError() should be called instead. func (z *Tokenizer) Scan() { defer func() { rec := recover() if rErr, ok := rec.(error); ok { + // we only ever panic(err) z.err = rErr + z.tok = Token{ + Type: TokenError, + Extra: &TokenExtraError{Err: z.err}, + } } else if rec != nil { panic(rec) } }() - if z.err != nil { - z.tok = z.next() + if z.err == nil { + z.tok = z.consume() + } else if z.err == io.EOF { + z.tok = Token{ + Type: TokenEOF, + } + } else { + z.tok = Token{ + Type: TokenError, + Value: z.err.Error(), + Extra: &TokenExtraError{Err: z.err}, + } } } // Return the current token. func (z *Tokenizer) Token() Token { - return t.tok + return z.tok } +// Combines the calls to Scan() and Token(). +func (z *Tokenizer) Next() Token { + z.Scan() + return z.tok +} + +// Err returns the last error to be encountered and not cleared. func (z *Tokenizer) Err() error { - return t.err + return z.err } -// Acknowledge a returned error token. This can only be called to clear TokenBadString, TokenBadURI, and TokenEscape. 
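// Sketch (not from the patch): recovering the structured ParseError carried by
// an error token's Extra field, via the TokenExtraError.ParseError() helper
// defined earlier. The function name is hypothetical.
func syntaxErrorMessage(tok Token) (string, bool) {
	ee, ok := tok.Extra.(*TokenExtraError)
	if !ok {
		return "", false
	}
	if pe := ee.ParseError(); pe != nil {
		return pe.Message, true
	}
	return ee.Error(), true
}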
+// Acknowledge a returned error token. This can only be called to clear +// TokenBadString, TokenBadURI, and TokenBadEscape. Using it for non-parsing +// errors will panic. func (z *Tokenizer) AcknowledgeError() { - parseErr, ok := t.err.(*ParseError) + _, ok := z.err.(*ParseError) if !ok { - panic("cssparse: AcknowledgeError() called for a foreign error") + panic("cssparse: AcknowledgeError() called for a foreign (non-syntax) error") } + z.err = nil } -// repeek reads the next 3 bytes into the tokenizer. +// repeek reads the next 3 bytes into the tokenizer. on EOF, the bytes are +// filled with zeroes. (Null bytes in the input are preprocessed into U+FFFD.) func (z *Tokenizer) repeek() { by, err := z.r.Peek(3) - if err != nil { + if err != nil && err != io.EOF { panic(err) } - copy(z.peek, by) + copy(z.peek[:], by) // zero fill on EOF i := len(by) @@ -93,7 +128,7 @@ func isValidEscape(p []byte) bool { // §4.3.9 func isNameStart(p byte) bool { - if p > 0x7F { + if p >= utf8.RuneSelf { return true // any high code points } if p == '_' { @@ -108,8 +143,8 @@ func isNameStart(p byte) bool { return false } -func isNameCode(p byte) { - if p > 0x7F { +func isNameCode(p byte) bool { + if p >= utf8.RuneSelf { return true // any high code points } if p == '_' || p == '-' { @@ -127,6 +162,19 @@ func isNameCode(p byte) { return false } +func isHexDigit(p byte) bool { + if p >= 'A' && p <= 'F' { + return true + } + if p >= 'a' && p <= 'f' { + return true + } + if p >= '0' && p <= '9' { + return true + } + return false +} + // up to 3 bytes func isStartIdentifier(p []byte) bool { if p[0] == '-' { @@ -175,53 +223,48 @@ func (z *Tokenizer) nextCompare(vs string) bool { var premadeTokens = map[byte]Token{ '$': Token{ - Type: TokenSuffixMatch, - String: "$=", + Type: TokenSuffixMatch, + Value: "$=", }, '*': Token{ - Type: TokenSubstringMatch, - String: "*=", + Type: TokenSubstringMatch, + Value: "*=", }, '^': Token{ - Type: TokenPrefixMatch, - String: "^=", + Type: TokenPrefixMatch, + Value: "^=", }, '~': Token{ - Type: TokenIncludeMatch, - String: "~=", + Type: TokenIncludes, + Value: "~=", }, - '(': Token{Type: TokenOpenParen, String: "("}, - ')': Token{Type: TokenCloseParen, String: ")"}, - '[': Token{Type: TokenOpenBracket, String: "["}, - ']': Token{Type: TokenCloseBracket, String: "]"}, - '{': Token{Type: TokenOpenBrace, String: "{"}, - '}': Token{Type: TokenCloseBrace, String: "}"}, - ':': Token{Type: TokenColon, String: ":"}, - ';': Token{Type: TokenSemicolon, String: ";"}, - ',': Token{Type: TokenComma, String: ","}, - - '\\': Token{Type: TokenBadEscape, String: "\\"}, - - 'A': Token{Type: TokenDashMatch, String: "|="}, - 'B': Token{Type: TokenColumn, String: "||"}, - 'C': Token{Type: TokenCDC, String: "-->"}, - 'O': Token{Type: TokenCDO, String: ""}, + 'O': Token{Type: TokenCDO, Value: "") { z.r.Discard(2) return premadeTokens['C'] } + z.readByte() // re-read, fall down to TokenDelim case '.': + z.unreadByte() z.repeek() if z.nextIsNumber() { - z.r.UnreadByte() return z.consumeNumeric() } + z.readByte() // re-read, fall down to TokenDelim case '/': z.repeek() if z.peek[0] == '*' { + z.r.Discard(1) return z.consumeComment() } case '<': @@ -299,25 +344,23 @@ func (z *Tokenizer) consume() Token { if z.nextStartsIdentifier() { s := z.consumeName() return Token{ - Type: TokenAtKeyword, - String: s, + Type: TokenAtKeyword, + Value: s, } } case '\\': + z.unreadByte() z.repeek() - if z.peek[0] != '\n' { + if z.nextIsEscape() { // input stream has a valid escape - z.r.UnreadByte() return 
z.consumeIdentish() } - z.err = ErrBadEscape + z.readByte() + z.err = errBadEscape return premadeTokens['\\'] case 'U', 'u': z.repeek() - if z.peek[0] == '+' && ((z.peek[1] >= '0' && z.peek[1] <= '9') || - (z.peek[1] >= 'A' && z.peek[1] <= 'F') || - (z.peek[1] >= 'a' && z.peek[1] <= 'f') || - (z.peek[1] == '?')) { + if z.peek[0] == '+' && (isHexDigit(z.peek[1]) || (z.peek[1] == '?')) { z.r.Discard(1) // (!) only discard the plus sign return z.consumeUnicodeRange() } @@ -325,23 +368,27 @@ func (z *Tokenizer) consume() Token { } if '0' <= ch && ch <= '9' { - z.r.UnreadByte() + z.unreadByte() return z.consumeNumeric() } if isNameStart(ch) { - z.r.UnreadByte() + z.unreadByte() return z.consumeIdentish() } return Token{ - Type: TokenDelim, - String: string(rune(ch)), + Type: TokenDelim, + Value: string(rune(ch)), } } // return the next byte, with 0 on EOF and panicing on other errors func (z *Tokenizer) nextByte() byte { + if z.err == io.EOF { + return 0 + } by, err := z.r.ReadByte() if err == io.EOF { + z.err = io.EOF return 0 } else if err != nil { panic(err) @@ -349,6 +396,69 @@ func (z *Tokenizer) nextByte() byte { return by } +func (z *Tokenizer) unreadByte() { + if z.err == io.EOF { + // don't unread after EOF + return + } + z.r.UnreadByte() +} + +func isWhitespace(r rune) bool { + return r == ' ' || r == '\t' || r == '\n' +} + +func isNotWhitespace(r rune) bool { + return !isWhitespace(r) +} + +func (z *Tokenizer) consumeWhitespace(ch byte) Token { + const wsBufSize = 32 + + sawNewline := false + if ch == '\n' { + sawNewline = true + } + + for { + // Consume whitespace in chunks of up to wsBufSize + buf, err := z.r.Peek(wsBufSize) + if err != nil && err != io.EOF { + panic(err) + } + if len(buf) == 0 { + break // Reached EOF + } + // find first non-whitespace char, discard up to there + idx := bytes.IndexFunc(buf, isNotWhitespace) + if idx == 0 { + break // Nothing to trim + } + if idx == -1 { + idx = len(buf) // Entire buffer is spaces + } + if /* const */ ch != 0 { + // only check for newlines when we're actually outputting a token + nlIdx := bytes.IndexByte(buf[:idx], '\n') + if nlIdx != -1 { + sawNewline = true + } + } + z.r.Discard(idx) + } + + if sawNewline { + return Token{ + Type: TokenS, + Value: "\n", + } + } + return Token{ + Type: TokenS, + Value: " ", + } +} + // 4.3.2 func (z *Tokenizer) consumeNumeric() Token { repr, notInteger := z.consumeNumericInner() @@ -356,9 +466,9 @@ func (z *Tokenizer) consumeNumeric() Token { NonInteger: notInteger, } t := Token{ - Type: TokenNumeric, - String: string(repr), - Extra: e, + Type: TokenNumber, + Value: string(repr), + Extra: e, } z.repeek() if z.nextStartsIdentifier() { @@ -373,26 +483,342 @@ func (z *Tokenizer) consumeNumeric() Token { // §4.3.3 func (z *Tokenizer) consumeIdentish() Token { + s := z.consumeName() + z.repeek() + if z.peek[0] == '(' { + z.r.Discard(1) + if strings.EqualFold(s, "url") { + return z.consumeURL() + } + return Token{ + Type: TokenFunction, + Value: s, + } + } + return Token{ + Type: TokenIdent, + Value: s, + } } // §4.3.4 func (z *Tokenizer) consumeString(delim byte) Token { + var frag []byte + var by byte + for { + by = z.nextByte() + if by == delim || by == 0 { + // end of string, EOF + return Token{ + Type: TokenString, + Value: string(frag), + } + } else if by == '\n' { + z.unreadByte() + z.err = &ParseError{ + Type: TokenBadString, + Message: "unterminated string", + } + return Token{ + Type: TokenBadString, + Value: string(frag), + Extra: &TokenExtraError{Err: z.err}, + } + } else if by == '\\' { + 
z.repeek() + if z.peek[0] == 0 { + // escape @ EOF, ignore. + } else if z.peek[0] == '\n' { + // valid escaped newline, ignore. + z.r.Discard(1) + } else if true { + // stream will always contain a valid escape here + cp := z.consumeEscapedCP() + var tmp [utf8.UTFMax]byte + n := utf8.EncodeRune(tmp[:], cp) + frag = append(frag, tmp[:n]...) + } + } else { + frag = append(frag, by) + } + } } // §4.3.5 +// reader must be in the "url(" state func (z *Tokenizer) consumeURL() Token { + z.consumeWhitespace(0) + z.repeek() + if z.peek[0] == 0 { + return Token{ + Type: TokenURI, + Value: "", + } + } else if z.peek[0] == '\'' || z.peek[0] == '"' { + t := z.consumeString(z.peek[0]) + if t.Type == TokenBadString { + t.Type = TokenBadURI + t.Value += z.consumeBadURL() + z.err = &ParseError{ + Type: TokenBadURI, + Message: "unterminated string in url()", + } + t.Extra = &TokenExtraError{ + Err: z.err, + } + return t + } + t.Type = TokenURI + z.consumeWhitespace(0) + z.repeek() + if z.peek[0] == ')' || z.peek[0] == 0 { + z.nextByte() + return t + } + t.Type = TokenBadURI + t.Value += z.consumeBadURL() + z.err = &ParseError{ + Type: TokenBadURI, + Message: "url() with string missing close parenthesis", + } + t.Extra = &TokenExtraError{ + Err: z.err, + } + return t + } + var frag []byte + var by byte + for { + by = z.nextByte() + if by == ')' || by == 0 { + return Token{Type: TokenURI, Value: string(frag)} + } else if isWhitespace(rune(by)) { + z.consumeWhitespace(0) + z.repeek() + if z.peek[0] == ')' || z.peek[0] == 0 { + z.nextByte() // ')' + return Token{Type: TokenURI, Value: string(frag)} + } + z.err = &ParseError{ + Type: TokenBadURI, + Message: "bare url() with internal whitespace", + } + return Token{ + Type: TokenBadURI, + Value: string(frag) + z.consumeBadURL(), + Extra: &TokenExtraError{Err: z.err}, + } + } else if by == '\'' || by == '"' || by == '(' { + z.err = &ParseError{ + Type: TokenBadURI, + Message: fmt.Sprintf("bare url() with illegal character '%c'", by), + } + return Token{ + Type: TokenBadURI, + Value: string(frag) + z.consumeBadURL(), + Extra: &TokenExtraError{Err: z.err}, + } + } else if (0 <= by && by <= 0x08) || (0x0B == by) || (0x0E <= by && by <= 0x1F) || (0x7F == by) { + z.err = &ParseError{ + Type: TokenBadURI, + Message: fmt.Sprintf("bare url() with unprintable character '%d'", by), + } + return Token{ + Type: TokenBadURI, + Value: string(frag) + z.consumeBadURL(), + Extra: &TokenExtraError{Err: z.err}, + } + } else if by == '\\' { + z.repeek() + if z.peek[0] != '\n' && z.peek[0] != 0 { + cp := z.consumeEscapedCP() + var tmp [utf8.UTFMax]byte + n := utf8.EncodeRune(tmp[:], cp) + frag = append(frag, tmp[:n]...) + } else { + z.err = &ParseError{ + Type: TokenBadURI, + Message: fmt.Sprintf("bare url() with invalid escape"), + } + return Token{ + Type: TokenBadURI, + Value: string(frag) + z.consumeBadURL(), + Extra: &TokenExtraError{Err: z.err}, + } + } + } else { + frag = append(frag, by) + } + } } // §4.3.6 func (z *Tokenizer) consumeUnicodeRange() Token { + var sdigits [6]byte + var by byte + haveQuestionMarks := false + i := 0 + for { + by = z.nextByte() + if i >= 6 { + break // weird condition so that unreadByte() works + } + if by == '?' 
{ + sdigits[i] = by + haveQuestionMarks = true + i++ + } else if !haveQuestionMarks && isHexDigit(by) { + sdigits[i] = by + i++ + } else { + break + } + } + z.unreadByte() + slen := i + var edigits [6]byte + var elen int + z.repeek() + + if haveQuestionMarks { + copy(edigits[:slen], sdigits[:slen]) + elen = slen + for idx := range sdigits { + if sdigits[idx] == '?' { + sdigits[idx] = '0' + edigits[idx] = 'F' + } + } + } else if z.peek[0] == '-' && isHexDigit(z.peek[1]) { + z.nextByte() // '-' + i = 0 + for { + by = z.nextByte() + if i < 6 && isHexDigit(by) { + edigits[i] = by + i++ + } else { + break + } + } + z.unreadByte() + elen = i + } else { + copy(edigits[:], sdigits[:]) + elen = slen + } + + startCP, err := strconv.ParseInt(string(sdigits[:slen]), 16, 22) + if err != nil { + panic(fmt.Sprintf("ParseInt failure: %s", err)) + } + endCP, err := strconv.ParseInt(string(edigits[:elen]), 16, 22) + if err != nil { + panic(fmt.Sprintf("ParseInt failure: %s", err)) + } + e := &TokenExtraUnicodeRange{ + Start: rune(startCP), + End: rune(endCP), + } + return Token{ + Type: TokenUnicodeRange, + Value: e.String(), + Extra: e, + } +} + +func (z *Tokenizer) consumeComment() Token { + var frag []byte + var by byte + for { + by = z.nextByte() + if by == '*' { + z.repeek() + if z.peek[0] == '/' { + z.nextByte() // '/' + return Token{ + Type: TokenComment, + Value: "/*" + string(frag) + "*/", + } + } + } else if by == 0 { + return Token{ + Type: TokenComment, + Value: "/*" + string(frag) + "*/", + } + } + frag = append(frag, by) + } } // §4.3.7 +// after the "\" func (z *Tokenizer) consumeEscapedCP() rune { + by := z.nextByte() + if by == 0 { + return utf8.RuneError + } else if isHexDigit(by) { + var digits = make([]byte, 6) + digits[0] = by + i := 1 + // (!) weird looping condition so that we UnreadByte() at the end + for { + by = z.nextByte() + if i < 6 && isHexDigit(by) { + digits[i] = by + i++ + } else { + break + } + } + + if isNotWhitespace(rune(by)) && by != 0 { + z.unreadByte() + } + digits = digits[:i] + // 16 = hex, 22 = bit width of unicode + cpi, err := strconv.ParseInt(string(digits), 16, 22) + if err != nil || cpi == 0 || cpi > utf8.MaxRune { + return utf8.RuneError + } + return rune(cpi) + } else { + z.unreadByte() + ru, _, err := z.r.ReadRune() + if err == io.EOF { + z.err = io.EOF + return utf8.RuneError + } else if err != nil { + z.err = err + panic(err) + } else { + return ru + } + } } // §4.3.11 func (z *Tokenizer) consumeName() string { + var frag []byte + var by byte + for { + by = z.nextByte() + if by == '\\' { + z.repeek() + if z.peek[0] != '\n' && z.peek[0] != 0 { + cp := z.consumeEscapedCP() + var tmp [utf8.UTFMax]byte + n := utf8.EncodeRune(tmp[:], cp) + frag = append(frag, tmp[:n]...) 
+ continue + } + } else if isNameCode(by) { + frag = append(frag, by) + continue + } + + return string(frag) + } } // §4.3.12 @@ -409,7 +835,7 @@ func (z *Tokenizer) consumeNumericInner() (repr []byte, notInteger bool) { } if by != 0 { // don't attempt to unread EOF - z.r.UnreadByte() + z.unreadByte() } } @@ -425,9 +851,9 @@ func (z *Tokenizer) consumeNumericInner() (repr []byte, notInteger bool) { z.repeek() } // [eE][+-]?[0-9] - if z.peek[0] == 'e' || z.peek[0] == 'E' { + if (z.peek[0] == 'e') || (z.peek[0] == 'E') { var n int - if z.peek[1] == '+' && z.peek[1] == '-' && ('0' <= z.peek[2] && z.peek[2] <= '9') { + if (z.peek[1] == '+' || z.peek[1] == '-') && ('0' <= z.peek[2] && z.peek[2] <= '9') { n = 3 } else if '0' <= z.peek[1] && z.peek[1] <= '9' { n = 2 @@ -446,4 +872,23 @@ func (z *Tokenizer) consumeNumericInner() (repr []byte, notInteger bool) { // §4.3.14 func (z *Tokenizer) consumeBadURL() string { + var frag []byte + var by byte + for { + by = z.nextByte() + if by == ')' || by == 0 { + return string(frag) + } else if by == '\\' { + z.repeek() + if z.peek[0] != '\n' { + // Allow for escaped right paren "\)" + cp := z.consumeEscapedCP() + var tmp [utf8.UTFMax]byte + n := utf8.EncodeRune(tmp[:], cp) + frag = append(frag, tmp[:n]...) + continue + } + } + frag = append(frag, by) + } } From 7690df407b02a60495b9aa1397d402a3c90fe108 Mon Sep 17 00:00:00 2001 From: Kane York Date: Wed, 14 Mar 2018 14:17:46 -0700 Subject: [PATCH 07/33] fix all usages of "starts with a valid escape": cannot unread after peeking --- scanner/tokenizer.go | 47 +++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/scanner/tokenizer.go b/scanner/tokenizer.go index 24e8720..e30de15 100644 --- a/scanner/tokenizer.go +++ b/scanner/tokenizer.go @@ -305,7 +305,7 @@ func (z *Tokenizer) consume() Token { if z.nextIsNumber() { return z.consumeNumeric() } - z.readByte() // re-read, fall down to TokenDelim + z.nextByte() // re-read, fall down to TokenDelim case '-': z.unreadByte() z.repeek() @@ -319,14 +319,14 @@ func (z *Tokenizer) consume() Token { z.r.Discard(2) return premadeTokens['C'] } - z.readByte() // re-read, fall down to TokenDelim + z.nextByte() // re-read, fall down to TokenDelim case '.': z.unreadByte() z.repeek() if z.nextIsNumber() { return z.consumeNumeric() } - z.readByte() // re-read, fall down to TokenDelim + z.nextByte() // re-read, fall down to TokenDelim case '/': z.repeek() if z.peek[0] == '*' { @@ -355,7 +355,7 @@ func (z *Tokenizer) consume() Token { // input stream has a valid escape return z.consumeIdentish() } - z.readByte() + z.nextByte() z.err = errBadEscape return premadeTokens['\\'] case 'U', 'u': @@ -494,10 +494,11 @@ func (z *Tokenizer) consumeIdentish() Token { Type: TokenFunction, Value: s, } - } - return Token{ - Type: TokenIdent, - Value: s, + } else { + return Token{ + Type: TokenIdent, + Value: s, + } } } @@ -525,14 +526,18 @@ func (z *Tokenizer) consumeString(delim byte) Token { Extra: &TokenExtraError{Err: z.err}, } } else if by == '\\' { + z.unreadByte() z.repeek() - if z.peek[0] == 0 { + if z.peek[1] == 0 { // escape @ EOF, ignore. - } else if z.peek[0] == '\n' { + z.nextByte() // '\' + } else if z.peek[1] == '\n' { // valid escaped newline, ignore. 
- z.r.Discard(1) + z.nextByte() // '\' + z.nextByte() // newline } else if true { // stream will always contain a valid escape here + z.nextByte() // '\' cp := z.consumeEscapedCP() var tmp [utf8.UTFMax]byte n := utf8.EncodeRune(tmp[:], cp) @@ -629,8 +634,10 @@ func (z *Tokenizer) consumeURL() Token { Extra: &TokenExtraError{Err: z.err}, } } else if by == '\\' { + z.unreadByte() z.repeek() - if z.peek[0] != '\n' && z.peek[0] != 0 { + if z.nextIsEscape() { + z.nextByte() // '\' cp := z.consumeEscapedCP() var tmp [utf8.UTFMax]byte n := utf8.EncodeRune(tmp[:], cp) @@ -804,20 +811,25 @@ func (z *Tokenizer) consumeName() string { for { by = z.nextByte() if by == '\\' { + z.unreadByte() z.repeek() - if z.peek[0] != '\n' && z.peek[0] != 0 { + if z.nextIsEscape() { + z.nextByte() cp := z.consumeEscapedCP() var tmp [utf8.UTFMax]byte n := utf8.EncodeRune(tmp[:], cp) frag = append(frag, tmp[:n]...) continue + } else { + return string(frag) } } else if isNameCode(by) { frag = append(frag, by) continue + } else { + z.unreadByte() + return string(frag) } - - return string(frag) } } @@ -879,8 +891,10 @@ func (z *Tokenizer) consumeBadURL() string { if by == ')' || by == 0 { return string(frag) } else if by == '\\' { + z.unreadByte() z.repeek() - if z.peek[0] != '\n' { + if z.nextIsEscape() { + z.nextByte() // '\' // Allow for escaped right paren "\)" cp := z.consumeEscapedCP() var tmp [utf8.UTFMax]byte @@ -888,6 +902,7 @@ func (z *Tokenizer) consumeBadURL() string { frag = append(frag, tmp[:n]...) continue } + z.nextByte() // '\' } frag = append(frag, by) } From 1080914cd3fe669d75ac9209c633080413e23f5c Mon Sep 17 00:00:00 2001 From: Kane York Date: Wed, 14 Mar 2018 14:30:00 -0700 Subject: [PATCH 08/33] Change test data, make more fixes --- scanner/scanner_test.go | 52 +++++++++++++++++++++++++++++------------ scanner/tokenizer.go | 10 ++++---- 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/scanner/scanner_test.go b/scanner/scanner_test.go index a349e16..46d9fe3 100644 --- a/scanner/scanner_test.go +++ b/scanner/scanner_test.go @@ -5,10 +5,23 @@ package scanner import ( + "bytes" "strings" "testing" ) +func Fuzz(b []byte) int { + tz := NewTokenizer(bytes.NewReader(b)) + for { + tt := tz.Next() + fmt.Printf("%v\n", tt) + if tt.Type.StopToken() { + break + } + } + return 1 +} + func TestMatchers(t *testing.T) { // Just basic checks, not exhaustive at all. 
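	// An extended check sketched here as a comment (not part of the patch):
	// with the new API a dimension's unit lives in Extra rather than in Value,
	// e.g. for the "42px" case exercised below:
	//
	//	tz := NewTokenizer(strings.NewReader("42px"))
	//	tok := tz.Next()
	//	// tok.Type == TokenDimension, tok.Value == "42"
	//	// tok.Extra.(*TokenExtraNumeric).Dimension == "px"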
checkMatch := func(s string, ttList ...interface{}) { @@ -54,31 +67,40 @@ func TestMatchers(t *testing.T) { checkMatch("42%", TokenPercentage, "42") checkMatch("4.2%", TokenPercentage, "4.2") checkMatch(".42%", TokenPercentage, ".42") - checkMatch("42px", TokenDimension, "42px") - checkMatch("url(http://domain.com)", TokenURI, "url(http://domain.com)") - checkMatch("url( http://domain.com/uri/between/space )", TokenURI, "url( http://domain.com/uri/between/space )") - checkMatch("url('http://domain.com/uri/between/single/quote')", TokenURI, "url('http://domain.com/uri/between/single/quote')") - checkMatch(`url("http://domain.com/uri/between/double/quote")`, TokenURI, `url("http://domain.com/uri/between/double/quote")`) - checkMatch("url(http://domain.com/?parentheses=%28)", TokenURI, "url(http://domain.com/?parentheses=%28)") - checkMatch("url( http://domain.com/?parentheses=%28&between=space )", TokenURI, "url( http://domain.com/?parentheses=%28&between=space )") - checkMatch("url('http://domain.com/uri/(parentheses)/between/single/quote')", TokenURI, "url('http://domain.com/uri/(parentheses)/between/single/quote')") - checkMatch(`url("http://domain.com/uri/(parentheses)/between/double/quote")`, TokenURI, `url("http://domain.com/uri/(parentheses)/between/double/quote")`) + checkMatch("42px", TokenDimension, "42") // TODO check the dimension stored in .Extra + checkMatch("url(http://domain.com)", TokenURI, "http://domain.com") + checkMatch("url( http://domain.com/uri/between/space )", TokenURI, "http://domain.com/uri/between/space") + checkMatch("url('http://domain.com/uri/between/single/quote')", TokenURI, "http://domain.com/uri/between/single/quote") + checkMatch(`url("http://domain.com/uri/between/double/quote")`, TokenURI, `http://domain.com/uri/between/double/quote`) + checkMatch("url(http://domain.com/?parentheses=%28)", TokenURI, "http://domain.com/?parentheses=%28") + checkMatch("url( http://domain.com/?parentheses=%28&between=space )", TokenURI, "http://domain.com/?parentheses=%28&between=space") + checkMatch("url('http://domain.com/uri/(parentheses)/between/single/quote')", TokenURI, "http://domain.com/uri/(parentheses)/between/single/quote") + checkMatch(`url("http://domain.com/uri/(parentheses)/between/double/quote")`, TokenURI, `http://domain.com/uri/(parentheses)/between/double/quote`) + checkMatch(`url(http://domain.com/uri/\(bare%20escaped\)/parentheses)`, TokenURI, `http://domain.com/uri/(bare%20escaped)/parentheses`) checkMatch("url(http://domain.com/uri/1)url(http://domain.com/uri/2)", - TokenURI, "url(http://domain.com/uri/1)", - TokenURI, "url(http://domain.com/uri/2)", + TokenURI, "http://domain.com/uri/1", + TokenURI, "http://domain.com/uri/2", + ) + checkMatch("url(http://domain.com/uri/1) url(http://domain.com/uri/2)", + TokenURI, "http://domain.com/uri/1", + TokenS, " ", + TokenURI, "http://domain.com/uri/2", ) checkMatch("U+0042", TokenUnicodeRange, "U+0042") checkMatch("", TokenCDC, "-->") - checkMatch(" \n \t \n", TokenS, " \n \t \n") - checkMatch("/* foo */", TokenComment, "/* foo */") + checkMatch(" \n \t \n", TokenS, "\n") // TODO - whitespace preservation + checkMatch("/**/", TokenComment, "") + checkMatch("/*foo*/", TokenComment, "foo") + checkMatch("/* foo */", TokenComment, " foo ") checkMatch("bar(", TokenFunction, "bar") checkMatch("~=", TokenIncludes, "~=") checkMatch("|=", TokenDashMatch, "|=") + checkMatch("||", TokenColumn, "||") checkMatch("^=", TokenPrefixMatch, "^=") checkMatch("$=", TokenSuffixMatch, "$=") checkMatch("*=", TokenSubstringMatch, 
"*=") - checkMatch("{", TokenChar, "{") + checkMatch("{", TokenOpenBrace, "{") // checkMatch("\uFEFF", TokenBOM, "\uFEFF") - checkMatch(`╯︵┻━┻"stuff"`, TokenIdent, "╯︵┻━┻", TokenString, `"stuff"`) + checkMatch(`╯︵┻━┻"stuff"`, TokenIdent, "╯︵┻━┻", TokenString, "stuff") } diff --git a/scanner/tokenizer.go b/scanner/tokenizer.go index e30de15..04dd726 100644 --- a/scanner/tokenizer.go +++ b/scanner/tokenizer.go @@ -316,7 +316,7 @@ func (z *Tokenizer) consume() Token { return z.consumeIdentish() } if z.nextCompare("-->") { - z.r.Discard(2) + z.r.Discard(3) return premadeTokens['C'] } z.nextByte() // re-read, fall down to TokenDelim @@ -560,7 +560,9 @@ func (z *Tokenizer) consumeURL() Token { Value: "", } } else if z.peek[0] == '\'' || z.peek[0] == '"' { - t := z.consumeString(z.peek[0]) + delim := z.peek[0] + z.nextByte() + t := z.consumeString(delim) if t.Type == TokenBadString { t.Type = TokenBadURI t.Value += z.consumeBadURL() @@ -745,13 +747,13 @@ func (z *Tokenizer) consumeComment() Token { z.nextByte() // '/' return Token{ Type: TokenComment, - Value: "/*" + string(frag) + "*/", + Value: string(frag), } } } else if by == 0 { return Token{ Type: TokenComment, - Value: "/*" + string(frag) + "*/", + Value: string(frag), } } frag = append(frag, by) From b99c1dd6dbf19025c0c0edbeb7c1e9f075f49140 Mon Sep 17 00:00:00 2001 From: Kane York Date: Wed, 14 Mar 2018 14:30:15 -0700 Subject: [PATCH 09/33] add fuzzing corpus from existing testdata --- scanner/testdata/fuzz/corpus/test-1 | 1 + scanner/testdata/fuzz/corpus/test-10 | 1 + scanner/testdata/fuzz/corpus/test-11 | 1 + scanner/testdata/fuzz/corpus/test-12 | 1 + scanner/testdata/fuzz/corpus/test-13 | 1 + scanner/testdata/fuzz/corpus/test-14 | 1 + scanner/testdata/fuzz/corpus/test-15 | 1 + scanner/testdata/fuzz/corpus/test-16 | 1 + scanner/testdata/fuzz/corpus/test-17 | 1 + scanner/testdata/fuzz/corpus/test-18 | 1 + scanner/testdata/fuzz/corpus/test-19 | 1 + scanner/testdata/fuzz/corpus/test-2 | 1 + scanner/testdata/fuzz/corpus/test-20 | 1 + scanner/testdata/fuzz/corpus/test-21 | 1 + scanner/testdata/fuzz/corpus/test-22 | 1 + scanner/testdata/fuzz/corpus/test-23 | 1 + scanner/testdata/fuzz/corpus/test-24 | 1 + scanner/testdata/fuzz/corpus/test-25 | 1 + scanner/testdata/fuzz/corpus/test-26 | 1 + scanner/testdata/fuzz/corpus/test-27 | 1 + scanner/testdata/fuzz/corpus/test-28 | 1 + scanner/testdata/fuzz/corpus/test-29 | 1 + scanner/testdata/fuzz/corpus/test-3 | 1 + scanner/testdata/fuzz/corpus/test-30 | 1 + scanner/testdata/fuzz/corpus/test-31 | 1 + scanner/testdata/fuzz/corpus/test-32 | 1 + scanner/testdata/fuzz/corpus/test-33 | 1 + scanner/testdata/fuzz/corpus/test-34 | 1 + scanner/testdata/fuzz/corpus/test-35 | 1 + scanner/testdata/fuzz/corpus/test-36 | 1 + scanner/testdata/fuzz/corpus/test-37 | 2 ++ scanner/testdata/fuzz/corpus/test-38 | 1 + scanner/testdata/fuzz/corpus/test-39 | 1 + scanner/testdata/fuzz/corpus/test-4 | 1 + scanner/testdata/fuzz/corpus/test-40 | 1 + scanner/testdata/fuzz/corpus/test-41 | 1 + scanner/testdata/fuzz/corpus/test-42 | 1 + scanner/testdata/fuzz/corpus/test-43 | 1 + scanner/testdata/fuzz/corpus/test-44 | 1 + scanner/testdata/fuzz/corpus/test-45 | 1 + scanner/testdata/fuzz/corpus/test-46 | 1 + scanner/testdata/fuzz/corpus/test-47 | 1 + scanner/testdata/fuzz/corpus/test-48 | 1 + scanner/testdata/fuzz/corpus/test-49 | 1 + scanner/testdata/fuzz/corpus/test-5 | 1 + scanner/testdata/fuzz/corpus/test-6 | 1 + scanner/testdata/fuzz/corpus/test-7 | 1 + scanner/testdata/fuzz/corpus/test-8 | 1 + scanner/testdata/fuzz/corpus/test-9 
| 1 + 49 files changed, 50 insertions(+) create mode 100644 scanner/testdata/fuzz/corpus/test-1 create mode 100644 scanner/testdata/fuzz/corpus/test-10 create mode 100644 scanner/testdata/fuzz/corpus/test-11 create mode 100644 scanner/testdata/fuzz/corpus/test-12 create mode 100644 scanner/testdata/fuzz/corpus/test-13 create mode 100644 scanner/testdata/fuzz/corpus/test-14 create mode 100644 scanner/testdata/fuzz/corpus/test-15 create mode 100644 scanner/testdata/fuzz/corpus/test-16 create mode 100644 scanner/testdata/fuzz/corpus/test-17 create mode 100644 scanner/testdata/fuzz/corpus/test-18 create mode 100644 scanner/testdata/fuzz/corpus/test-19 create mode 100644 scanner/testdata/fuzz/corpus/test-2 create mode 100644 scanner/testdata/fuzz/corpus/test-20 create mode 100644 scanner/testdata/fuzz/corpus/test-21 create mode 100644 scanner/testdata/fuzz/corpus/test-22 create mode 100644 scanner/testdata/fuzz/corpus/test-23 create mode 100644 scanner/testdata/fuzz/corpus/test-24 create mode 100644 scanner/testdata/fuzz/corpus/test-25 create mode 100644 scanner/testdata/fuzz/corpus/test-26 create mode 100644 scanner/testdata/fuzz/corpus/test-27 create mode 100644 scanner/testdata/fuzz/corpus/test-28 create mode 100644 scanner/testdata/fuzz/corpus/test-29 create mode 100644 scanner/testdata/fuzz/corpus/test-3 create mode 100644 scanner/testdata/fuzz/corpus/test-30 create mode 100644 scanner/testdata/fuzz/corpus/test-31 create mode 100644 scanner/testdata/fuzz/corpus/test-32 create mode 100644 scanner/testdata/fuzz/corpus/test-33 create mode 100644 scanner/testdata/fuzz/corpus/test-34 create mode 100644 scanner/testdata/fuzz/corpus/test-35 create mode 100644 scanner/testdata/fuzz/corpus/test-36 create mode 100644 scanner/testdata/fuzz/corpus/test-37 create mode 100644 scanner/testdata/fuzz/corpus/test-38 create mode 100644 scanner/testdata/fuzz/corpus/test-39 create mode 100644 scanner/testdata/fuzz/corpus/test-4 create mode 100644 scanner/testdata/fuzz/corpus/test-40 create mode 100644 scanner/testdata/fuzz/corpus/test-41 create mode 100644 scanner/testdata/fuzz/corpus/test-42 create mode 100644 scanner/testdata/fuzz/corpus/test-43 create mode 100644 scanner/testdata/fuzz/corpus/test-44 create mode 100644 scanner/testdata/fuzz/corpus/test-45 create mode 100644 scanner/testdata/fuzz/corpus/test-46 create mode 100644 scanner/testdata/fuzz/corpus/test-47 create mode 100644 scanner/testdata/fuzz/corpus/test-48 create mode 100644 scanner/testdata/fuzz/corpus/test-49 create mode 100644 scanner/testdata/fuzz/corpus/test-5 create mode 100644 scanner/testdata/fuzz/corpus/test-6 create mode 100644 scanner/testdata/fuzz/corpus/test-7 create mode 100644 scanner/testdata/fuzz/corpus/test-8 create mode 100644 scanner/testdata/fuzz/corpus/test-9 diff --git a/scanner/testdata/fuzz/corpus/test-1 b/scanner/testdata/fuzz/corpus/test-1 new file mode 100644 index 0000000..85df507 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-1 @@ -0,0 +1 @@ +abcd \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-10 b/scanner/testdata/fuzz/corpus/test-10 new file mode 100644 index 0000000..1340eb3 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-10 @@ -0,0 +1 @@ +#name \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-11 b/scanner/testdata/fuzz/corpus/test-11 new file mode 100644 index 0000000..5dbaeac --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-11 @@ -0,0 +1 @@ +##name \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-12 
b/scanner/testdata/fuzz/corpus/test-12 new file mode 100644 index 0000000..88cde2c --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-12 @@ -0,0 +1 @@ +42'' \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-13 b/scanner/testdata/fuzz/corpus/test-13 new file mode 100644 index 0000000..947355b --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-13 @@ -0,0 +1 @@ ++42 \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-14 b/scanner/testdata/fuzz/corpus/test-14 new file mode 100644 index 0000000..67f7ad0 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-14 @@ -0,0 +1 @@ +-42 \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-15 b/scanner/testdata/fuzz/corpus/test-15 new file mode 100644 index 0000000..8012ebb --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-15 @@ -0,0 +1 @@ +4.2 \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-16 b/scanner/testdata/fuzz/corpus/test-16 new file mode 100644 index 0000000..0401f1e --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-16 @@ -0,0 +1 @@ +.42 \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-17 b/scanner/testdata/fuzz/corpus/test-17 new file mode 100644 index 0000000..f8c987b --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-17 @@ -0,0 +1 @@ ++.42 \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-18 b/scanner/testdata/fuzz/corpus/test-18 new file mode 100644 index 0000000..3273e87 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-18 @@ -0,0 +1 @@ +-.42 \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-19 b/scanner/testdata/fuzz/corpus/test-19 new file mode 100644 index 0000000..67a9ae6 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-19 @@ -0,0 +1 @@ +42% \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-2 b/scanner/testdata/fuzz/corpus/test-2 new file mode 100644 index 0000000..af3501d --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-2 @@ -0,0 +1 @@ +"abcd" \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-20 b/scanner/testdata/fuzz/corpus/test-20 new file mode 100644 index 0000000..d44e379 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-20 @@ -0,0 +1 @@ +4.2% \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-21 b/scanner/testdata/fuzz/corpus/test-21 new file mode 100644 index 0000000..61542cd --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-21 @@ -0,0 +1 @@ +.42% \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-22 b/scanner/testdata/fuzz/corpus/test-22 new file mode 100644 index 0000000..9996f64 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-22 @@ -0,0 +1 @@ +42px \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-23 b/scanner/testdata/fuzz/corpus/test-23 new file mode 100644 index 0000000..6b16595 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-23 @@ -0,0 +1 @@ +url(http://domain.com) \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-24 b/scanner/testdata/fuzz/corpus/test-24 new file mode 100644 index 0000000..849a2f0 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-24 @@ -0,0 +1 @@ +url( http://domain.com/uri/between/space ) \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-25 b/scanner/testdata/fuzz/corpus/test-25 new file mode 100644 index 0000000..9efe089 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-25 @@ -0,0 +1 @@ 
+url('http://domain.com/uri/between/single/quote') \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-26 b/scanner/testdata/fuzz/corpus/test-26 new file mode 100644 index 0000000..5d37d41 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-26 @@ -0,0 +1 @@ +url("http://domain.com/uri/between/double/quote") \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-27 b/scanner/testdata/fuzz/corpus/test-27 new file mode 100644 index 0000000..4b67378 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-27 @@ -0,0 +1 @@ +url(http://domain.com/?parentheses=%28) \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-28 b/scanner/testdata/fuzz/corpus/test-28 new file mode 100644 index 0000000..7b0f6cb --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-28 @@ -0,0 +1 @@ +url( http://domain.com/?parentheses=%28&between=space ) \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-29 b/scanner/testdata/fuzz/corpus/test-29 new file mode 100644 index 0000000..e548025 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-29 @@ -0,0 +1 @@ +url('http://domain.com/uri/(parentheses)/between/single/quote') \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-3 b/scanner/testdata/fuzz/corpus/test-3 new file mode 100644 index 0000000..7d12177 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-3 @@ -0,0 +1 @@ +"ab'cd" \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-30 b/scanner/testdata/fuzz/corpus/test-30 new file mode 100644 index 0000000..4280336 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-30 @@ -0,0 +1 @@ +url("http://domain.com/uri/(parentheses)/between/double/quote") \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-31 b/scanner/testdata/fuzz/corpus/test-31 new file mode 100644 index 0000000..5416922 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-31 @@ -0,0 +1 @@ +url(http://domain.com/uri/\(bare%20escaped\)/parentheses) \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-32 b/scanner/testdata/fuzz/corpus/test-32 new file mode 100644 index 0000000..43d5b7d --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-32 @@ -0,0 +1 @@ +url(http://domain.com/uri/1)url(http://domain.com/uri/2) \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-33 b/scanner/testdata/fuzz/corpus/test-33 new file mode 100644 index 0000000..7871a01 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-33 @@ -0,0 +1 @@ +url(http://domain.com/uri/1) url(http://domain.com/uri/2) \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-34 b/scanner/testdata/fuzz/corpus/test-34 new file mode 100644 index 0000000..335d730 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-34 @@ -0,0 +1 @@ +U+0042 \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-35 b/scanner/testdata/fuzz/corpus/test-35 new file mode 100644 index 0000000..3e4e3d6 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-35 @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-37 b/scanner/testdata/fuzz/corpus/test-37 new file mode 100644 index 0000000..65ec83f --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-37 @@ -0,0 +1,2 @@ + + diff --git a/scanner/testdata/fuzz/corpus/test-38 b/scanner/testdata/fuzz/corpus/test-38 new file mode 100644 index 0000000..7068cde --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-38 @@ -0,0 +1 @@ +/**/ \ No newline at end of file diff 
--git a/scanner/testdata/fuzz/corpus/test-39 b/scanner/testdata/fuzz/corpus/test-39 new file mode 100644 index 0000000..112b37c --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-39 @@ -0,0 +1 @@ +/*foo*/ \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-4 b/scanner/testdata/fuzz/corpus/test-4 new file mode 100644 index 0000000..cf25faa --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-4 @@ -0,0 +1 @@ +"ab\"cd" \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-40 b/scanner/testdata/fuzz/corpus/test-40 new file mode 100644 index 0000000..785ae54 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-40 @@ -0,0 +1 @@ +/* foo */ \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-41 b/scanner/testdata/fuzz/corpus/test-41 new file mode 100644 index 0000000..adaa030 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-41 @@ -0,0 +1 @@ +bar( \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-42 b/scanner/testdata/fuzz/corpus/test-42 new file mode 100644 index 0000000..c926849 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-42 @@ -0,0 +1 @@ +~= \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-43 b/scanner/testdata/fuzz/corpus/test-43 new file mode 100644 index 0000000..279d9d3 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-43 @@ -0,0 +1 @@ +|= \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-44 b/scanner/testdata/fuzz/corpus/test-44 new file mode 100644 index 0000000..27cc728 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-44 @@ -0,0 +1 @@ +|| \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-45 b/scanner/testdata/fuzz/corpus/test-45 new file mode 100644 index 0000000..49bae17 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-45 @@ -0,0 +1 @@ +^= \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-46 b/scanner/testdata/fuzz/corpus/test-46 new file mode 100644 index 0000000..3b0d355 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-46 @@ -0,0 +1 @@ +$= \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-47 b/scanner/testdata/fuzz/corpus/test-47 new file mode 100644 index 0000000..d2215e3 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-47 @@ -0,0 +1 @@ +*= \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-48 b/scanner/testdata/fuzz/corpus/test-48 new file mode 100644 index 0000000..81750b9 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-48 @@ -0,0 +1 @@ +{ \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-49 b/scanner/testdata/fuzz/corpus/test-49 new file mode 100644 index 0000000..e9a4a3c --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-49 @@ -0,0 +1 @@ +╯︵┻━┻"stuff" \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-5 b/scanner/testdata/fuzz/corpus/test-5 new file mode 100644 index 0000000..bf1efad --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-5 @@ -0,0 +1 @@ +"ab\\cd" \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-6 b/scanner/testdata/fuzz/corpus/test-6 new file mode 100644 index 0000000..62fe39b --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-6 @@ -0,0 +1 @@ +'abcd' \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-7 b/scanner/testdata/fuzz/corpus/test-7 new file mode 100644 index 0000000..099e0f4 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-7 @@ -0,0 +1 @@ +'ab"cd' \ No newline at 
end of file diff --git a/scanner/testdata/fuzz/corpus/test-8 b/scanner/testdata/fuzz/corpus/test-8 new file mode 100644 index 0000000..db5af35 --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-8 @@ -0,0 +1 @@ +'ab\'cd' \ No newline at end of file diff --git a/scanner/testdata/fuzz/corpus/test-9 b/scanner/testdata/fuzz/corpus/test-9 new file mode 100644 index 0000000..17559ae --- /dev/null +++ b/scanner/testdata/fuzz/corpus/test-9 @@ -0,0 +1 @@ +'ab\\cd' \ No newline at end of file From aa841cee2102e74e4855521dfbf8fa9fbf6f65de Mon Sep 17 00:00:00 2001 From: Kane York Date: Wed, 14 Mar 2018 14:36:57 -0700 Subject: [PATCH 10/33] widen ParseInt calls to accept too-big codepoints --- scanner/fuzz.go | 16 ++++++++++++++++ scanner/scanner_test.go | 15 ++------------- scanner/tokenizer.go | 7 ++++--- 3 files changed, 22 insertions(+), 16 deletions(-) create mode 100644 scanner/fuzz.go diff --git a/scanner/fuzz.go b/scanner/fuzz.go new file mode 100644 index 0000000..5b770be --- /dev/null +++ b/scanner/fuzz.go @@ -0,0 +1,16 @@ +package scanner + +import "bytes" +import "fmt" + +func Fuzz(b []byte) int { + tz := NewTokenizer(bytes.NewReader(b)) + for { + tt := tz.Next() + fmt.Printf("%v\n", tt) + if tt.Type.StopToken() { + break + } + } + return 1 +} diff --git a/scanner/scanner_test.go b/scanner/scanner_test.go index 46d9fe3..f2d011f 100644 --- a/scanner/scanner_test.go +++ b/scanner/scanner_test.go @@ -5,23 +5,10 @@ package scanner import ( - "bytes" "strings" "testing" ) -func Fuzz(b []byte) int { - tz := NewTokenizer(bytes.NewReader(b)) - for { - tt := tz.Next() - fmt.Printf("%v\n", tt) - if tt.Type.StopToken() { - break - } - } - return 1 -} - func TestMatchers(t *testing.T) { // Just basic checks, not exhaustive at all. checkMatch := func(s string, ttList ...interface{}) { @@ -87,6 +74,8 @@ func TestMatchers(t *testing.T) { TokenURI, "http://domain.com/uri/2", ) checkMatch("U+0042", TokenUnicodeRange, "U+0042") + checkMatch("U+FFFFFF", TokenUnicodeRange, "U+FFFFFF") + checkMatch("U+??????", TokenUnicodeRange, "U+0000-FFFFFF") checkMatch("", TokenCDC, "-->") checkMatch(" \n \t \n", TokenS, "\n") // TODO - whitespace preservation diff --git a/scanner/tokenizer.go b/scanner/tokenizer.go index 04dd726..5cdc61e 100644 --- a/scanner/tokenizer.go +++ b/scanner/tokenizer.go @@ -717,11 +717,12 @@ func (z *Tokenizer) consumeUnicodeRange() Token { elen = slen } - startCP, err := strconv.ParseInt(string(sdigits[:slen]), 16, 22) + // 16 = hex, 32 = int32 + startCP, err := strconv.ParseInt(string(sdigits[:slen]), 16, 32) if err != nil { panic(fmt.Sprintf("ParseInt failure: %s", err)) } - endCP, err := strconv.ParseInt(string(edigits[:elen]), 16, 22) + endCP, err := strconv.ParseInt(string(edigits[:elen]), 16, 32) if err != nil { panic(fmt.Sprintf("ParseInt failure: %s", err)) } @@ -786,7 +787,7 @@ func (z *Tokenizer) consumeEscapedCP() rune { } digits = digits[:i] // 16 = hex, 22 = bit width of unicode - cpi, err := strconv.ParseInt(string(digits), 16, 22) + cpi, err := strconv.ParseInt(string(digits), 16, 32) if err != nil || cpi == 0 || cpi > utf8.MaxRune { return utf8.RuneError } From b5986f0bb76b18615e247c8d33d12c3a7b5b2ed6 Mon Sep 17 00:00:00 2001 From: Kane York Date: Wed, 14 Mar 2018 15:54:50 -0700 Subject: [PATCH 11/33] Add round-tripping test --- scanner/fuzz.go | 83 +++++++++++++- scanner/scanner_test.go | 14 +++ scanner/token.go | 243 ++++++++++++++++++++++++++++++++++++++-- 3 files changed, 328 insertions(+), 12 deletions(-) diff --git a/scanner/fuzz.go b/scanner/fuzz.go index 
5b770be..036bb97 100644 --- a/scanner/fuzz.go +++ b/scanner/fuzz.go @@ -1,16 +1,91 @@ package scanner -import "bytes" -import "fmt" +import ( + "bytes" + "fmt" + "io" + "reflect" +) func Fuzz(b []byte) int { + var tokens []Token + tz := NewTokenizer(bytes.NewReader(b)) for { tt := tz.Next() - fmt.Printf("%v\n", tt) - if tt.Type.StopToken() { + fmt.Printf("[OT] %v\n", tt) + if tt.Type == TokenError { + // We should not have reading errors + panic(tt) + } else if tt.Type == TokenEOF { break + } else { + tokens = append(tokens, tt) + } + } + + // Render and retokenize + + var wr TokenRenderer + var rerenderBuf bytes.Buffer + success := false + defer func() { + if !success { + fmt.Println("RERENDER BUFFER:", rerenderBuf.String()) + } + }() + pr, pw := io.Pipe() + defer pr.Close() + + go func() { + writeTarget := io.MultiWriter(pw, &rerenderBuf) + for _, v := range tokens { + wr.WriteTokenTo(writeTarget, v) + } + pw.Close() + }() + + tz = NewTokenizer(pr) + i := 0 + for { + for i < len(tokens) && tokens[i].Type == TokenComment { + i++ + } + tt := tz.Next() + fmt.Printf("[RT] %v\n", tt) + if tt.Type == TokenComment { + // Ignore comments while comparing + continue + } + if tt.Type == TokenError { + panic(tt) + } + if tt.Type == TokenEOF { + if i != len(tokens) { + panic(fmt.Sprintf("unexpected EOF: got EOF from retokenizer, but original token stream is at %d/%d\n%v", i, len(tokens), tokens)) + } else { + break + } + } + if i == len(tokens) { + panic(fmt.Sprintf("expected EOF: reached end of original token stream but got %v from retokenizer\n%v", tt, tokens)) + } + + ot := tokens[i] + if tt.Type != ot.Type { + panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Type not equal)\n%v", tt, ot, tokens)) + } + if tt.Value != ot.Value { + panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Value not equal)\n%v", tt, ot, tokens)) + } + if TokenExtraTypeLookup[tt.Type] != nil { + if !reflect.DeepEqual(tt, ot) { + panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Extra not equal)\n%v", tt, ot, tokens)) + } } + i++ + continue } + success = true return 1 } diff --git a/scanner/scanner_test.go b/scanner/scanner_test.go index f2d011f..f92be94 100644 --- a/scanner/scanner_test.go +++ b/scanner/scanner_test.go @@ -31,6 +31,8 @@ func TestMatchers(t *testing.T) { t.Errorf("double missing EOF after token %s, got %+v", s, tok) } } + + Fuzz([]byte(s)) } checkMatch("abcd", TokenIdent, "abcd") @@ -92,4 +94,16 @@ func TestMatchers(t *testing.T) { checkMatch("{", TokenOpenBrace, "{") // checkMatch("\uFEFF", TokenBOM, "\uFEFF") checkMatch(`╯︵┻━┻"stuff"`, TokenIdent, "╯︵┻━┻", TokenString, "stuff") + + checkMatch("foo { bar: rgb(255, 0, 127); }", + TokenIdent, "foo", TokenS, " ", + TokenOpenBrace, "{", TokenS, " ", + TokenIdent, "bar", TokenColon, ":", TokenS, " ", + TokenFunction, "rgb", + TokenNumber, "255", TokenComma, ",", TokenS, " ", + TokenNumber, "0", TokenComma, ",", TokenS, " ", + TokenNumber, "127", TokenCloseParen, ")", + TokenSemicolon, ";", TokenS, " ", + TokenCloseBrace, "}", + ) } diff --git a/scanner/token.go b/scanner/token.go index ab5b377..ce5233d 100644 --- a/scanner/token.go +++ b/scanner/token.go @@ -5,7 +5,12 @@ package scanner -import "fmt" +import ( + "bytes" + "fmt" + "io" + "strings" +) // TokenType identifies the type of lexical tokens. type TokenType int @@ -61,6 +66,8 @@ const ( TokenEOF // From now on, only tokens from the CSS specification. 
TokenIdent + TokenFunction + TokenDelim // Single character TokenAtKeyword TokenString TokenHash @@ -76,19 +83,19 @@ const ( // CSS Syntax Level 3 removes comments from the token stream, but they are // preserved here. TokenComment - TokenFunction + + // Error tokens + TokenBadString + TokenBadURI + TokenBadEscape // a '\' right before a newline + + // Fixed-string tokens TokenIncludes TokenDashMatch TokenPrefixMatch TokenSuffixMatch TokenSubstringMatch TokenColumn - TokenDelim - // Error tokens - TokenBadString - TokenBadURI - TokenBadEscape // a '\' right before a newline - // Single-character tokens TokenColon TokenSemicolon TokenComma @@ -240,3 +247,223 @@ func (e *TokenExtraError) ParseError() *ParseError { } return pe } + +func escapeIdentifier(s string) string { + // TODO + return s +} + +func escapeDimension(s string) string { + if strings.HasPrefix(s, "e") || strings.HasPrefix(s, "E") { + return "\\" + escapeIdentifier(s) + } + return escapeIdentifier(s) +} + +var escapeStringReplacer = strings.NewReplacer("\"", "\\\"", "\n", "\\0A ", "\\", "\\\\") + +func (t *Token) Render() string { + var buf bytes.Buffer + t.WriteTo(&buf) + return buf.String() +} + +func (t *Token) WriteTo(w io.Writer) { + switch t.Type { + case TokenError: + return + case TokenEOF: + return + case TokenIdent: + fmt.Fprint(w, escapeIdentifier(t.Value)) + case TokenAtKeyword: + fmt.Fprint(w, "@", escapeIdentifier(t.Value)) + case TokenDelim: + if t.Value == "\\" { + fmt.Fprint(w, "\\\n") + } else { + fmt.Fprint(w, t.Value) + } + case TokenHash: + io.WriteString(w, "#") + fmt.Fprint(w, escapeIdentifier(t.Value)) + case TokenPercentage: + fmt.Fprint(w, t.Value, "%") + case TokenDimension: + e := t.Extra.(*TokenExtraNumeric) + fmt.Fprint(w, t.Value, e.Dimension) + case TokenString: + io.WriteString(w, "\"") + escapeStringReplacer.WriteString(w, t.Value) + io.WriteString(w, "\"") + case TokenURI: + io.WriteString(w, "url(\"") + escapeStringReplacer.WriteString(w, t.Value) + io.WriteString(w, "\")") + case TokenUnicodeRange: + fmt.Fprint(w, t.Extra.String()) + case TokenComment: + io.WriteString(w, "/*") + io.WriteString(w, t.Value) + io.WriteString(w, "/*") + case TokenFunction: + fmt.Fprint(w, t.Value, "(") + + case TokenBadEscape, TokenBadString, TokenBadURI: + fmt.Fprint(w, t.Value) + default: + fmt.Fprint(w, t.Value) + } +} + +// TokenRenderer takes care of the comment insertion rules for serialization. 
+type TokenRenderer struct { + lastToken Token +} + +func (r *TokenRenderer) WriteTokenTo(w io.Writer, t Token) { + var prevKey, curKey interface{} + if r.lastToken.Type == TokenDelim { + prevKey = r.lastToken.Value[0] + } else { + prevKey = r.lastToken.Type + } + if t.Type == TokenDelim { + curKey = t.Value[0] + } else { + curKey = t.Type + } + + m1, ok := commentInsertionRules[prevKey] + if ok { + if m1[curKey] { + io.WriteString(w, "/**/") + } + } + + t.WriteTo(w) + r.lastToken = t +} + +var commentInsertionThruCDC = map[interface{}]bool{ + TokenIdent: true, + TokenFunction: true, + TokenURI: true, + TokenBadURI: true, + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + TokenUnicodeRange: true, + TokenCDC: true, + '-': true, + '(': false, +} + +var commentInsertionRules = map[interface{}]map[interface{}]bool{ + TokenIdent: map[interface{}]bool{ + TokenIdent: true, + TokenFunction: true, + TokenURI: true, + TokenBadURI: true, + '-': true, + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + TokenUnicodeRange: true, + TokenCDC: true, + '(': true, + }, + TokenAtKeyword: commentInsertionThruCDC, + TokenHash: commentInsertionThruCDC, + TokenDimension: commentInsertionThruCDC, + '#': map[interface{}]bool{ + TokenIdent: true, + TokenFunction: true, + TokenURI: true, + TokenBadURI: true, + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + TokenUnicodeRange: true, + TokenCDC: false, + '-': true, + '(': false, + }, + '-': map[interface{}]bool{ + TokenIdent: true, + TokenFunction: true, + TokenURI: true, + TokenBadURI: true, + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + TokenUnicodeRange: true, + TokenCDC: false, + '-': false, + '(': false, + }, + TokenNumber: map[interface{}]bool{ + TokenIdent: true, + TokenFunction: true, + TokenURI: true, + TokenBadURI: true, + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + TokenUnicodeRange: true, + TokenCDC: false, + '-': false, + '(': false, + }, + '@': map[interface{}]bool{ + TokenIdent: true, + TokenFunction: true, + TokenURI: true, + TokenBadURI: true, + TokenNumber: false, + TokenPercentage: false, + TokenDimension: false, + TokenUnicodeRange: true, + TokenCDC: false, + '-': true, + '(': false, + }, + TokenUnicodeRange: map[interface{}]bool{ + TokenIdent: true, + TokenFunction: true, + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + TokenUnicodeRange: false, + '?': true, + }, + '.': map[interface{}]bool{ + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + }, + '+': map[interface{}]bool{ + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + }, + '$': map[interface{}]bool{ + '=': true, + }, + '*': map[interface{}]bool{ + '=': true, + }, + '^': map[interface{}]bool{ + '=': true, + }, + '~': map[interface{}]bool{ + '=': true, + }, + '|': map[interface{}]bool{ + '=': true, + '|': true, + }, + '/': map[interface{}]bool{ + '*': true, + }, +} From 6e71edb8690e6322b52b34d0125b93743a9cec10 Mon Sep 17 00:00:00 2001 From: Kane York Date: Wed, 14 Mar 2018 16:04:38 -0700 Subject: [PATCH 12/33] Fix: was discarding the leading 'u' --- scanner/scanner_test.go | 2 ++ scanner/tokenizer.go | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/scanner/scanner_test.go b/scanner/scanner_test.go index f92be94..e4f9222 100644 --- a/scanner/scanner_test.go +++ b/scanner/scanner_test.go @@ -106,4 +106,6 @@ func TestMatchers(t *testing.T) { TokenSemicolon, ";", TokenS, " ", TokenCloseBrace, "}", ) + 
// Fuzzing results + checkMatch("ur(0", TokenFunction, "ur", TokenNumber, "0") } diff --git a/scanner/tokenizer.go b/scanner/tokenizer.go index 5cdc61e..927d59d 100644 --- a/scanner/tokenizer.go +++ b/scanner/tokenizer.go @@ -359,9 +359,10 @@ func (z *Tokenizer) consume() Token { z.err = errBadEscape return premadeTokens['\\'] case 'U', 'u': + z.unreadByte() z.repeek() - if z.peek[0] == '+' && (isHexDigit(z.peek[1]) || (z.peek[1] == '?')) { - z.r.Discard(1) // (!) only discard the plus sign + if z.peek[1] == '+' && (isHexDigit(z.peek[2]) || (z.peek[2] == '?')) { + z.r.Discard(2) // (!) only discard the U+ return z.consumeUnicodeRange() } break From c5a4afb97fa722b95c380e9fb2f4f8ca653ba25c Mon Sep 17 00:00:00 2001 From: Kane York Date: Wed, 14 Mar 2018 17:03:31 -0700 Subject: [PATCH 13/33] Fix more fuzzer findings --- scanner/fuzz.go | 4 +- scanner/scanner_test.go | 59 +++++++++++++------- scanner/token.go | 117 +++++++++++++++++++++++++++++++--------- scanner/tokenizer.go | 47 ++++++++++------ 4 files changed, 164 insertions(+), 63 deletions(-) diff --git a/scanner/fuzz.go b/scanner/fuzz.go index 036bb97..9eb7146 100644 --- a/scanner/fuzz.go +++ b/scanner/fuzz.go @@ -75,11 +75,11 @@ func Fuzz(b []byte) int { if tt.Type != ot.Type { panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Type not equal)\n%v", tt, ot, tokens)) } - if tt.Value != ot.Value { + if tt.Value != ot.Value && !tt.Type.StopToken() { panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Value not equal)\n%v", tt, ot, tokens)) } if TokenExtraTypeLookup[tt.Type] != nil { - if !reflect.DeepEqual(tt, ot) { + if !reflect.DeepEqual(tt, ot) && !tt.Type.StopToken() { panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Extra not equal)\n%v", tt, ot, tokens)) } } diff --git a/scanner/scanner_test.go b/scanner/scanner_test.go index e4f9222..adee06a 100644 --- a/scanner/scanner_test.go +++ b/scanner/scanner_test.go @@ -5,6 +5,7 @@ package scanner import ( + "reflect" "strings" "testing" ) @@ -18,11 +19,24 @@ func TestMatchers(t *testing.T) { for i < len(ttList) { tt := ttList[i].(TokenType) tVal := ttList[i+1].(string) + var tExtra TokenExtra + if TokenExtraTypeLookup[tt] != nil { + tExtra = ttList[i+2].(TokenExtra) + } if tok := tz.Next(); tok.Type != tt || tok.Value != tVal { t.Errorf("did not match: %s (got %s, wanted %s): %v", s, tok.Value, tVal, tok) + } else if tExtra != nil && !reflect.DeepEqual(tok.Extra, tExtra) { + if tt.StopToken() && tt != TokenError && tt != TokenEOF { + // mismatch ok + } else { + t.Errorf("did not match .Extra: %s (got %#v, wanted %#v): %v", s, tok.Extra, tExtra, tok) + } } i += 2 + if TokenExtraTypeLookup[tt] != nil { + i++ + } } if tok := tz.Next(); tok.Type != TokenEOF { @@ -44,19 +58,21 @@ func TestMatchers(t *testing.T) { checkMatch(`'ab"cd'`, TokenString, `ab"cd`) checkMatch(`'ab\'cd'`, TokenString, `ab'cd`) checkMatch(`'ab\\cd'`, TokenString, `ab\cd`) - checkMatch("#name", TokenHash, "name") - checkMatch("##name", TokenDelim, "#", TokenHash, "name") - checkMatch("42''", TokenNumber, "42", TokenString, "") - checkMatch("+42", TokenNumber, "+42") - checkMatch("-42", TokenNumber, "-42") - checkMatch("4.2", TokenNumber, "4.2") - checkMatch(".42", TokenNumber, ".42") - checkMatch("+.42", TokenNumber, "+.42") - checkMatch("-.42", TokenNumber, "-.42") - checkMatch("42%", TokenPercentage, "42") - checkMatch("4.2%", TokenPercentage, "4.2") - checkMatch(".42%", TokenPercentage, ".42") - checkMatch("42px", TokenDimension, "42") // TODO check the dimension stored in .Extra + checkMatch("#name", 
TokenHash, "name", &TokenExtraHash{IsIdentifier: true}) + checkMatch("##name", TokenDelim, "#", TokenHash, "name", &TokenExtraHash{IsIdentifier: true}) + checkMatch("42''", TokenNumber, "42", &TokenExtraNumeric{}, TokenString, "") + checkMatch("+42", TokenNumber, "+42", &TokenExtraNumeric{}) + checkMatch("-42", TokenNumber, "-42", &TokenExtraNumeric{}) + checkMatch("42.", TokenNumber, "42", &TokenExtraNumeric{}, TokenDelim, ".") + checkMatch("42.0", TokenNumber, "42.0", &TokenExtraNumeric{NonInteger: true}) + checkMatch("4.2", TokenNumber, "4.2", &TokenExtraNumeric{NonInteger: true}) + checkMatch(".42", TokenNumber, ".42", &TokenExtraNumeric{NonInteger: true}) + checkMatch("+.42", TokenNumber, "+.42", &TokenExtraNumeric{NonInteger: true}) + checkMatch("-.42", TokenNumber, "-.42", &TokenExtraNumeric{NonInteger: true}) + checkMatch("42%", TokenPercentage, "42", &TokenExtraNumeric{}) + checkMatch("4.2%", TokenPercentage, "4.2", &TokenExtraNumeric{NonInteger: true}) + checkMatch(".42%", TokenPercentage, ".42", &TokenExtraNumeric{NonInteger: true}) + checkMatch("42px", TokenDimension, "42", &TokenExtraNumeric{Dimension: "px"}) // TODO check the dimension stored in .Extra checkMatch("url(http://domain.com)", TokenURI, "http://domain.com") checkMatch("url( http://domain.com/uri/between/space )", TokenURI, "http://domain.com/uri/between/space") checkMatch("url('http://domain.com/uri/between/single/quote')", TokenURI, "http://domain.com/uri/between/single/quote") @@ -75,9 +91,9 @@ func TestMatchers(t *testing.T) { TokenS, " ", TokenURI, "http://domain.com/uri/2", ) - checkMatch("U+0042", TokenUnicodeRange, "U+0042") - checkMatch("U+FFFFFF", TokenUnicodeRange, "U+FFFFFF") - checkMatch("U+??????", TokenUnicodeRange, "U+0000-FFFFFF") + checkMatch("U+0042", TokenUnicodeRange, "U+0042", &TokenExtraUnicodeRange{Start: 0x42, End: 0x42}) + checkMatch("U+FFFFFF", TokenUnicodeRange, "U+FFFFFF", &TokenExtraUnicodeRange{Start: 0xFFFFFF, End: 0xFFFFFF}) + checkMatch("U+??????", TokenUnicodeRange, "U+0000-FFFFFF", &TokenExtraUnicodeRange{Start: 0, End: 0xFFFFFF}) checkMatch("", TokenCDC, "-->") checkMatch(" \n \t \n", TokenS, "\n") // TODO - whitespace preservation @@ -100,12 +116,15 @@ func TestMatchers(t *testing.T) { TokenOpenBrace, "{", TokenS, " ", TokenIdent, "bar", TokenColon, ":", TokenS, " ", TokenFunction, "rgb", - TokenNumber, "255", TokenComma, ",", TokenS, " ", - TokenNumber, "0", TokenComma, ",", TokenS, " ", - TokenNumber, "127", TokenCloseParen, ")", + TokenNumber, "255", &TokenExtraNumeric{}, TokenComma, ",", TokenS, " ", + TokenNumber, "0", &TokenExtraNumeric{}, TokenComma, ",", TokenS, " ", + TokenNumber, "127", &TokenExtraNumeric{}, TokenCloseParen, ")", TokenSemicolon, ";", TokenS, " ", TokenCloseBrace, "}", ) // Fuzzing results - checkMatch("ur(0", TokenFunction, "ur", TokenNumber, "0") + checkMatch("ur(0", TokenFunction, "ur", TokenNumber, "0", &TokenExtraNumeric{}) + checkMatch("1\\15", TokenDimension, "1", &TokenExtraNumeric{Dimension: "\x15"}) + checkMatch("url(0t')", TokenBadURI, "0t", &TokenExtraError{}) + checkMatch("uri/", TokenIdent, "uri", TokenDelim, "/") } diff --git a/scanner/token.go b/scanner/token.go index ce5233d..49cab7c 100644 --- a/scanner/token.go +++ b/scanner/token.go @@ -10,6 +10,7 @@ import ( "fmt" "io" "strings" + "unicode/utf8" ) // TokenType identifies the type of lexical tokens. 
@@ -194,10 +195,7 @@ func (e *TokenExtraNumeric) String() string { if e == nil { return "" } - if e.Dimension != "" { - return e.Dimension - } - return "" + return e.Dimension } // TokenExtraUnicodeRange is attached to a TokenUnicodeRange. @@ -248,19 +246,81 @@ func (e *TokenExtraError) ParseError() *ParseError { return pe } -func escapeIdentifier(s string) string { - // TODO - return s -} +func escapeIdentifier(s string) string { return escapeIdent(s, 0) } +func escapeDimension(s string) string { return escapeIdent(s, 2) } + +func escapeIdent(s string, mode int) string { + if s == "" { + return "" + } + var buf bytes.Buffer + buf.Grow(len(s)) + anyChanges := false + + // Handle first character + // dashes allowed at start only for TokenIdent-ish + // eE not allowed at start for Dimension + if !isNameStart(s[0]) && s[0] != '-' && s[0] != 'e' && s[0] != 'E' { + buf.WriteByte('\\') + buf.WriteByte(s[0]) + anyChanges = true + } else if s[0] == 'e' || s[0] == 'E' { + if mode == 2 { + buf.WriteByte('\\') + anyChanges = true + } + buf.WriteByte(s[0]) + } else if s[0] == '-' { + if len(s) == 1 { + return "\\-" + } else if isNameStart(s[1]) { + buf.WriteByte('-') + } else { + buf.WriteString("\\-") + } + } else { + buf.WriteByte(s[0]) + } + // Write the rest of the name + for i := 1; i < len(s); i++ { + if !isNameCode(s[i]) { + fmt.Fprintf(&buf, "\\%X", s[i]) + anyChanges = true + } else { + buf.WriteByte(s[i]) + } + } -func escapeDimension(s string) string { - if strings.HasPrefix(s, "e") || strings.HasPrefix(s, "E") { - return "\\" + escapeIdentifier(s) + if !anyChanges { + return s } - return escapeIdentifier(s) + return buf.String() } -var escapeStringReplacer = strings.NewReplacer("\"", "\\\"", "\n", "\\0A ", "\\", "\\\\") +func escapeString(s string) string { + var buf bytes.Buffer + buf.WriteByte('"') + for i := 0; i < len(s); i++ { + switch s[i] { + case '"': + buf.WriteString("\\\"") + continue + case '\n': + buf.WriteString("\\0A ") + continue + case '\\': + buf.WriteString("\\\\") + continue + } + if s[i] < utf8.RuneSelf && isNonPrintable(s[i]) { + fmt.Fprintf(&buf, "\\%X", s[i]) + continue + } + buf.WriteByte(s[i]) + } + buf.WriteByte('"') + return buf.String() +} func (t *Token) Render() string { var buf bytes.Buffer @@ -291,26 +351,35 @@ func (t *Token) WriteTo(w io.Writer) { fmt.Fprint(w, t.Value, "%") case TokenDimension: e := t.Extra.(*TokenExtraNumeric) - fmt.Fprint(w, t.Value, e.Dimension) + fmt.Fprint(w, t.Value, escapeDimension(e.Dimension)) case TokenString: - io.WriteString(w, "\"") - escapeStringReplacer.WriteString(w, t.Value) - io.WriteString(w, "\"") + io.WriteString(w, escapeString(t.Value)) case TokenURI: - io.WriteString(w, "url(\"") - escapeStringReplacer.WriteString(w, t.Value) - io.WriteString(w, "\")") + io.WriteString(w, "url(") + io.WriteString(w, escapeString(t.Value)) + io.WriteString(w, ")") case TokenUnicodeRange: - fmt.Fprint(w, t.Extra.String()) + io.WriteString(w, t.Extra.String()) case TokenComment: io.WriteString(w, "/*") io.WriteString(w, t.Value) io.WriteString(w, "/*") case TokenFunction: - fmt.Fprint(w, t.Value, "(") + io.WriteString(w, t.Value) + io.WriteString(w, "(") - case TokenBadEscape, TokenBadString, TokenBadURI: - fmt.Fprint(w, t.Value) + case TokenBadEscape: + io.WriteString(w, "\\\n") + case TokenBadString: + io.WriteString(w, "\"") + io.WriteString(w, t.Value) + io.WriteString(w, "\n") + case TokenBadURI: + io.WriteString(w, "url(") + str := escapeString(t.Value) + str = strings.TrimSuffix(str, "\"") + io.WriteString(w, str) + 
io.WriteString(w, "\n)") default: fmt.Fprint(w, t.Value) } diff --git a/scanner/tokenizer.go b/scanner/tokenizer.go index 927d59d..6c4babd 100644 --- a/scanner/tokenizer.go +++ b/scanner/tokenizer.go @@ -25,9 +25,17 @@ type Tokenizer struct { err error peek [3]byte + ErrorMode int + tok Token } +const ( + // Default error mode - tokenization errors are represented as special tokens in the stream, and I/O errors are TokenError. + ErrorModeTokens = iota + ErrorModeFatal +) + // Construct a Tokenizer from the given input. Input need not be normalized. func NewTokenizer(r io.Reader) *Tokenizer { return &Tokenizer{ @@ -89,7 +97,8 @@ func (z *Tokenizer) Err() error { func (z *Tokenizer) AcknowledgeError() { _, ok := z.err.(*ParseError) if !ok { - panic("cssparse: AcknowledgeError() called for a foreign (non-syntax) error") + // TODO ErrorMode + return } z.err = nil } @@ -203,6 +212,10 @@ func isStartNumber(p []byte) bool { return false } +func isNonPrintable(by byte) bool { + return (0 <= by && by <= 0x08) || (0x0B == by) || (0x0E <= by && by <= 0x1F) || (0x7F == by) +} + // repeek must be called before the following: func (z *Tokenizer) nextIsEscape() bool { @@ -356,7 +369,7 @@ func (z *Tokenizer) consume() Token { return z.consumeIdentish() } z.nextByte() - z.err = errBadEscape + // z.err = errBadEscape return premadeTokens['\\'] case 'U', 'u': z.unreadByte() @@ -517,14 +530,14 @@ func (z *Tokenizer) consumeString(delim byte) Token { } } else if by == '\n' { z.unreadByte() - z.err = &ParseError{ + /* z.err = */ er := &ParseError{ Type: TokenBadString, Message: "unterminated string", } return Token{ Type: TokenBadString, Value: string(frag), - Extra: &TokenExtraError{Err: z.err}, + Extra: &TokenExtraError{Err: er}, } } else if by == '\\' { z.unreadByte() @@ -567,12 +580,12 @@ func (z *Tokenizer) consumeURL() Token { if t.Type == TokenBadString { t.Type = TokenBadURI t.Value += z.consumeBadURL() - z.err = &ParseError{ + /* z.err = */ pe := &ParseError{ Type: TokenBadURI, Message: "unterminated string in url()", } t.Extra = &TokenExtraError{ - Err: z.err, + Err: pe, } return t } @@ -585,12 +598,12 @@ func (z *Tokenizer) consumeURL() Token { } t.Type = TokenBadURI t.Value += z.consumeBadURL() - z.err = &ParseError{ + /* z.err = */ pe := &ParseError{ Type: TokenBadURI, Message: "url() with string missing close parenthesis", } t.Extra = &TokenExtraError{ - Err: z.err, + Err: pe, } return t } @@ -607,34 +620,34 @@ func (z *Tokenizer) consumeURL() Token { z.nextByte() // ')' return Token{Type: TokenURI, Value: string(frag)} } - z.err = &ParseError{ + /* z.err = */ pe := &ParseError{ Type: TokenBadURI, Message: "bare url() with internal whitespace", } return Token{ Type: TokenBadURI, Value: string(frag) + z.consumeBadURL(), - Extra: &TokenExtraError{Err: z.err}, + Extra: &TokenExtraError{Err: pe}, } } else if by == '\'' || by == '"' || by == '(' { - z.err = &ParseError{ + /* z.err = */ pe := &ParseError{ Type: TokenBadURI, Message: fmt.Sprintf("bare url() with illegal character '%c'", by), } return Token{ Type: TokenBadURI, Value: string(frag) + z.consumeBadURL(), - Extra: &TokenExtraError{Err: z.err}, + Extra: &TokenExtraError{Err: pe}, } - } else if (0 <= by && by <= 0x08) || (0x0B == by) || (0x0E <= by && by <= 0x1F) || (0x7F == by) { - z.err = &ParseError{ + } else if isNonPrintable(by) { + /* z.err = */ pe := &ParseError{ Type: TokenBadURI, Message: fmt.Sprintf("bare url() with unprintable character '%d'", by), } return Token{ Type: TokenBadURI, Value: string(frag) + z.consumeBadURL(), - Extra: 
&TokenExtraError{Err: z.err}, + Extra: &TokenExtraError{Err: pe}, } } else if by == '\\' { z.unreadByte() @@ -646,14 +659,14 @@ func (z *Tokenizer) consumeURL() Token { n := utf8.EncodeRune(tmp[:], cp) frag = append(frag, tmp[:n]...) } else { - z.err = &ParseError{ + /* z.err = */ pe := &ParseError{ Type: TokenBadURI, Message: fmt.Sprintf("bare url() with invalid escape"), } return Token{ Type: TokenBadURI, Value: string(frag) + z.consumeBadURL(), - Extra: &TokenExtraError{Err: z.err}, + Extra: &TokenExtraError{Err: pe}, } } } else { From 4c0a5effa1e3a1cb0d73c155e28fae2526b904ec Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 20 Mar 2018 14:32:44 -0700 Subject: [PATCH 14/33] More fixes from fuzzing --- scanner/scanner_test.go | 14 +++++++++++++- scanner/token.go | 13 ++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/scanner/scanner_test.go b/scanner/scanner_test.go index adee06a..1a3b9c4 100644 --- a/scanner/scanner_test.go +++ b/scanner/scanner_test.go @@ -23,7 +23,9 @@ func TestMatchers(t *testing.T) { if TokenExtraTypeLookup[tt] != nil { tExtra = ttList[i+2].(TokenExtra) } - if tok := tz.Next(); tok.Type != tt || tok.Value != tVal { + if tok := tz.Next(); tok.Type != tt { + t.Errorf("did not match: %s (got %v, wanted %v)", s, tok, tt) + } else if tok.Value != tVal { t.Errorf("did not match: %s (got %s, wanted %s): %v", s, tok.Value, tVal, tok) } else if tExtra != nil && !reflect.DeepEqual(tok.Extra, tExtra) { if tt.StopToken() && tt != TokenError && tt != TokenEOF { @@ -127,4 +129,14 @@ func TestMatchers(t *testing.T) { checkMatch("1\\15", TokenDimension, "1", &TokenExtraNumeric{Dimension: "\x15"}) checkMatch("url(0t')", TokenBadURI, "0t", &TokenExtraError{}) checkMatch("uri/", TokenIdent, "uri", TokenDelim, "/") + checkMatch("\x00", TokenIdent, "\uFFFD") + checkMatch("a\\0", TokenIdent, "a\uFFFD") + checkMatch("b\\\\0", TokenIdent, "b\\0") + checkMatch("00\\d", TokenDimension, "00", &TokenExtraNumeric{Dimension: "\r"}) + // note: \f is form feed, which is 0x0C + checkMatch("\\0\\0\\C\\\f\\\\0", + TokenIdent, "\uFFFD\uFFFD\x0C\x0C\\0") + // String running to EOF is success, not badstring + checkMatch("\"a0\\d", TokenString, "a0\x0D") + checkMatch("\"a0\r", TokenBadString, "a0", &TokenExtraError{}, TokenS, "\n") } diff --git a/scanner/token.go b/scanner/token.go index 49cab7c..ed8c303 100644 --- a/scanner/token.go +++ b/scanner/token.go @@ -261,8 +261,12 @@ func escapeIdent(s string, mode int) string { // dashes allowed at start only for TokenIdent-ish // eE not allowed at start for Dimension if !isNameStart(s[0]) && s[0] != '-' && s[0] != 'e' && s[0] != 'E' { - buf.WriteByte('\\') - buf.WriteByte(s[0]) + if isNonPrintable(s[0]) || s[0] == '\r' || s[0] == '\n' { + fmt.Fprintf(&buf, "\\%X ", s[0]) + } else { + buf.WriteByte('\\') + buf.WriteByte(s[0]) + } anyChanges = true } else if s[0] == 'e' || s[0] == 'E' { if mode == 2 { @@ -284,7 +288,7 @@ func escapeIdent(s string, mode int) string { // Write the rest of the name for i := 1; i < len(s); i++ { if !isNameCode(s[i]) { - fmt.Fprintf(&buf, "\\%X", s[i]) + fmt.Fprintf(&buf, "\\%X ", s[i]) anyChanges = true } else { buf.WriteByte(s[i]) @@ -308,6 +312,9 @@ func escapeString(s string) string { case '\n': buf.WriteString("\\0A ") continue + case '\r': + buf.WriteString("\\0D ") + continue case '\\': buf.WriteString("\\\\") continue From 4c09d638c884748f2c0873b5bf4e07b64e9aed0e Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 20 Mar 2018 14:48:01 -0700 Subject: [PATCH 15/33] Fix '5e', '#123', and comments --- 
scanner/scanner_test.go | 2 + scanner/token.go | 83 +++++++++++++++++++++++++++++------------ 2 files changed, 62 insertions(+), 23 deletions(-) diff --git a/scanner/scanner_test.go b/scanner/scanner_test.go index 1a3b9c4..89b1cca 100644 --- a/scanner/scanner_test.go +++ b/scanner/scanner_test.go @@ -62,6 +62,7 @@ func TestMatchers(t *testing.T) { checkMatch(`'ab\\cd'`, TokenString, `ab\cd`) checkMatch("#name", TokenHash, "name", &TokenExtraHash{IsIdentifier: true}) checkMatch("##name", TokenDelim, "#", TokenHash, "name", &TokenExtraHash{IsIdentifier: true}) + checkMatch("#123", TokenHash, "123", &TokenExtraHash{IsIdentifier: false}) checkMatch("42''", TokenNumber, "42", &TokenExtraNumeric{}, TokenString, "") checkMatch("+42", TokenNumber, "+42", &TokenExtraNumeric{}) checkMatch("-42", TokenNumber, "-42", &TokenExtraNumeric{}) @@ -139,4 +140,5 @@ func TestMatchers(t *testing.T) { // String running to EOF is success, not badstring checkMatch("\"a0\\d", TokenString, "a0\x0D") checkMatch("\"a0\r", TokenBadString, "a0", &TokenExtraError{}, TokenS, "\n") + checkMatch("5e", TokenDimension, "5", &TokenExtraNumeric{Dimension: "e"}) } diff --git a/scanner/token.go b/scanner/token.go index ed8c303..e4929e7 100644 --- a/scanner/token.go +++ b/scanner/token.go @@ -247,8 +247,30 @@ func (e *TokenExtraError) ParseError() *ParseError { } func escapeIdentifier(s string) string { return escapeIdent(s, 0) } +func escapeHashName(s string) string { return escapeIdent(s, 1) } func escapeDimension(s string) string { return escapeIdent(s, 2) } +func needsHexEscaping(c byte, mode int) bool { + if c < 0x20 { + return true + } + if c >= utf8.RuneSelf { + return false + } + if mode == 2 { + if c == 'e' || c == 'E' { + return true + } + } + if c == '\\' { + return true + } + if isNameCode(c) { + return false + } + return true +} + func escapeIdent(s string, mode int) string { if s == "" { return "" @@ -257,36 +279,46 @@ func escapeIdent(s string, mode int) string { buf.Grow(len(s)) anyChanges := false + var i int + // Handle first character // dashes allowed at start only for TokenIdent-ish // eE not allowed at start for Dimension - if !isNameStart(s[0]) && s[0] != '-' && s[0] != 'e' && s[0] != 'E' { - if isNonPrintable(s[0]) || s[0] == '\r' || s[0] == '\n' { - fmt.Fprintf(&buf, "\\%X ", s[0]) + if mode != 1 { + if !isNameStart(s[0]) && s[0] != '-' && s[0] != 'e' && s[0] != 'E' { + if needsHexEscaping(s[0], mode) { + fmt.Fprintf(&buf, "\\%X ", s[0]) + anyChanges = true + } else { + buf.WriteByte('\\') + buf.WriteByte(s[0]) + anyChanges = true + } + } else if s[0] == 'e' || s[0] == 'E' { + if mode == 2 { + fmt.Fprintf(&buf, "\\%X ", s[0]) + anyChanges = true + } else { + buf.WriteByte(s[0]) + } + } else if s[0] == '-' { + if len(s) == 1 { + return "\\-" + } else if isNameStart(s[1]) { + buf.WriteByte('-') + } else { + buf.WriteString("\\-") + anyChanges = true + } } else { - buf.WriteByte('\\') buf.WriteByte(s[0]) } - anyChanges = true - } else if s[0] == 'e' || s[0] == 'E' { - if mode == 2 { - buf.WriteByte('\\') - anyChanges = true - } - buf.WriteByte(s[0]) - } else if s[0] == '-' { - if len(s) == 1 { - return "\\-" - } else if isNameStart(s[1]) { - buf.WriteByte('-') - } else { - buf.WriteString("\\-") - } + i = 1 } else { - buf.WriteByte(s[0]) + i = 0 } // Write the rest of the name - for i := 1; i < len(s); i++ { + for ; i < len(s); i++ { if !isNameCode(s[i]) { fmt.Fprintf(&buf, "\\%X ", s[i]) anyChanges = true @@ -352,8 +384,13 @@ func (t *Token) WriteTo(w io.Writer) { fmt.Fprint(w, t.Value) } case TokenHash: + 
e := t.Extra.(*TokenExtraHash) io.WriteString(w, "#") - fmt.Fprint(w, escapeIdentifier(t.Value)) + if e.IsIdentifier { + fmt.Fprint(w, escapeIdentifier(t.Value)) + } else { + fmt.Fprint(w, escapeHashName(t.Value)) + } case TokenPercentage: fmt.Fprint(w, t.Value, "%") case TokenDimension: @@ -370,7 +407,7 @@ func (t *Token) WriteTo(w io.Writer) { case TokenComment: io.WriteString(w, "/*") io.WriteString(w, t.Value) - io.WriteString(w, "/*") + io.WriteString(w, "*/") case TokenFunction: io.WriteString(w, t.Value) io.WriteString(w, "(") From 3c8aa10b959a2ae758a60d6b5a163f8003438a2e Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 20 Mar 2018 14:48:58 -0700 Subject: [PATCH 16/33] fixup: add more comment tests --- scanner/fuzz.go | 1 + scanner/scanner_test.go | 2 ++ 2 files changed, 3 insertions(+) diff --git a/scanner/fuzz.go b/scanner/fuzz.go index 9eb7146..d29b52f 100644 --- a/scanner/fuzz.go +++ b/scanner/fuzz.go @@ -8,6 +8,7 @@ import ( ) func Fuzz(b []byte) int { + fmt.Printf("=== Start fuzz test ===\n%s\n", b) var tokens []Token tz := NewTokenizer(bytes.NewReader(b)) diff --git a/scanner/scanner_test.go b/scanner/scanner_test.go index 89b1cca..c09315e 100644 --- a/scanner/scanner_test.go +++ b/scanner/scanner_test.go @@ -101,6 +101,8 @@ func TestMatchers(t *testing.T) { checkMatch("-->", TokenCDC, "-->") checkMatch(" \n \t \n", TokenS, "\n") // TODO - whitespace preservation checkMatch("/**/", TokenComment, "") + checkMatch("/***/", TokenComment, "*") + checkMatch("/**", TokenComment, "*") checkMatch("/*foo*/", TokenComment, "foo") checkMatch("/* foo */", TokenComment, " foo ") checkMatch("bar(", TokenFunction, "bar") From d163d68db23f24d3713352ee89ff2d328015b6f7 Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 20 Mar 2018 14:50:55 -0700 Subject: [PATCH 17/33] add tests for '5e', '5e-', '5e-3' --- scanner/scanner_test.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scanner/scanner_test.go b/scanner/scanner_test.go index c09315e..389a302 100644 --- a/scanner/scanner_test.go +++ b/scanner/scanner_test.go @@ -76,6 +76,11 @@ func TestMatchers(t *testing.T) { checkMatch("4.2%", TokenPercentage, "4.2", &TokenExtraNumeric{NonInteger: true}) checkMatch(".42%", TokenPercentage, ".42", &TokenExtraNumeric{NonInteger: true}) checkMatch("42px", TokenDimension, "42", &TokenExtraNumeric{Dimension: "px"}) // TODO check the dimension stored in .Extra + + checkMatch("5e", TokenDimension, "5", &TokenExtraNumeric{Dimension: "e"}) + checkMatch("5e-", TokenDimension, "5", &TokenExtraNumeric{Dimension: "e-"}) + checkMatch("5e-3", TokenNumber, "5e-3", &TokenExtraNumeric{NonInteger: true}) + checkMatch("url(http://domain.com)", TokenURI, "http://domain.com") checkMatch("url( http://domain.com/uri/between/space )", TokenURI, "http://domain.com/uri/between/space") checkMatch("url('http://domain.com/uri/between/single/quote')", TokenURI, "http://domain.com/uri/between/single/quote") @@ -142,5 +147,4 @@ func TestMatchers(t *testing.T) { // String running to EOF is success, not badstring checkMatch("\"a0\\d", TokenString, "a0\x0D") checkMatch("\"a0\r", TokenBadString, "a0", &TokenExtraError{}, TokenS, "\n") - checkMatch("5e", TokenDimension, "5", &TokenExtraNumeric{Dimension: "e"}) } From f065792a9707741a7896f8af92a4c913e5232d15 Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 20 Mar 2018 14:58:03 -0700 Subject: [PATCH 18/33] fix missing space after hex escape --- scanner/scanner_test.go | 1 + scanner/token.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git 
a/scanner/scanner_test.go b/scanner/scanner_test.go index 389a302..97c1155 100644 --- a/scanner/scanner_test.go +++ b/scanner/scanner_test.go @@ -80,6 +80,7 @@ func TestMatchers(t *testing.T) { checkMatch("5e", TokenDimension, "5", &TokenExtraNumeric{Dimension: "e"}) checkMatch("5e-", TokenDimension, "5", &TokenExtraNumeric{Dimension: "e-"}) checkMatch("5e-3", TokenNumber, "5e-3", &TokenExtraNumeric{NonInteger: true}) + checkMatch("5e-\xf1", TokenDimension, "5", &TokenExtraNumeric{Dimension: "e-\xf1"}) checkMatch("url(http://domain.com)", TokenURI, "http://domain.com") checkMatch("url( http://domain.com/uri/between/space )", TokenURI, "http://domain.com/uri/between/space") diff --git a/scanner/token.go b/scanner/token.go index e4929e7..a10817b 100644 --- a/scanner/token.go +++ b/scanner/token.go @@ -352,7 +352,7 @@ func escapeString(s string) string { continue } if s[i] < utf8.RuneSelf && isNonPrintable(s[i]) { - fmt.Fprintf(&buf, "\\%X", s[i]) + fmt.Fprintf(&buf, "\\%X ", s[i]) continue } buf.WriteByte(s[i]) From 0386e01b94b3b2aa0c2ed95581c0606c1e212cf4 Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 20 Mar 2018 15:02:40 -0700 Subject: [PATCH 19/33] call escapeIdentifer() for TokenFunction --- scanner/scanner_test.go | 1 + scanner/token.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/scanner/scanner_test.go b/scanner/scanner_test.go index 97c1155..4b6d190 100644 --- a/scanner/scanner_test.go +++ b/scanner/scanner_test.go @@ -148,4 +148,5 @@ func TestMatchers(t *testing.T) { // String running to EOF is success, not badstring checkMatch("\"a0\\d", TokenString, "a0\x0D") checkMatch("\"a0\r", TokenBadString, "a0", &TokenExtraError{}, TokenS, "\n") + checkMatch("\\fun(", TokenFunction, "\x0fun") } diff --git a/scanner/token.go b/scanner/token.go index a10817b..7d9e068 100644 --- a/scanner/token.go +++ b/scanner/token.go @@ -409,7 +409,7 @@ func (t *Token) WriteTo(w io.Writer) { io.WriteString(w, t.Value) io.WriteString(w, "*/") case TokenFunction: - io.WriteString(w, t.Value) + io.WriteString(w, escapeIdentifier(t.Value)) io.WriteString(w, "(") case TokenBadEscape: From b5c30c672a9800a104c29ac8780067448177530f Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 20 Mar 2018 15:11:11 -0700 Subject: [PATCH 20/33] Fuzz fixes for bad-string --- scanner/scanner_test.go | 1 + scanner/token.go | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/scanner/scanner_test.go b/scanner/scanner_test.go index 4b6d190..f583b36 100644 --- a/scanner/scanner_test.go +++ b/scanner/scanner_test.go @@ -149,4 +149,5 @@ func TestMatchers(t *testing.T) { checkMatch("\"a0\\d", TokenString, "a0\x0D") checkMatch("\"a0\r", TokenBadString, "a0", &TokenExtraError{}, TokenS, "\n") checkMatch("\\fun(", TokenFunction, "\x0fun") + checkMatch("\"abc\\\"def\nghi", TokenBadString, "abc\"def", &TokenExtraError{}, TokenS, "\n", TokenIdent, "ghi") } diff --git a/scanner/token.go b/scanner/token.go index 7d9e068..bcec079 100644 --- a/scanner/token.go +++ b/scanner/token.go @@ -333,14 +333,20 @@ func escapeIdent(s string, mode int) string { return buf.String() } -func escapeString(s string) string { +func escapeString(s string, delim byte) string { var buf bytes.Buffer - buf.WriteByte('"') + if delim != 0 { + buf.WriteByte(delim) + } for i := 0; i < len(s); i++ { switch s[i] { case '"': buf.WriteString("\\\"") continue + case delim: + buf.WriteByte('\\') + buf.WriteByte(delim) + continue case '\n': buf.WriteString("\\0A ") continue @@ -357,7 +363,9 @@ func escapeString(s 
string) string { } buf.WriteByte(s[i]) } - buf.WriteByte('"') + if delim != 0 { + buf.WriteByte(delim) + } return buf.String() } @@ -397,10 +405,10 @@ func (t *Token) WriteTo(w io.Writer) { e := t.Extra.(*TokenExtraNumeric) fmt.Fprint(w, t.Value, escapeDimension(e.Dimension)) case TokenString: - io.WriteString(w, escapeString(t.Value)) + io.WriteString(w, escapeString(t.Value, '"')) case TokenURI: io.WriteString(w, "url(") - io.WriteString(w, escapeString(t.Value)) + io.WriteString(w, escapeString(t.Value, '"')) io.WriteString(w, ")") case TokenUnicodeRange: io.WriteString(w, t.Extra.String()) @@ -416,11 +424,11 @@ func (t *Token) WriteTo(w io.Writer) { io.WriteString(w, "\\\n") case TokenBadString: io.WriteString(w, "\"") - io.WriteString(w, t.Value) + io.WriteString(w, escapeString(t.Value, 0)) io.WriteString(w, "\n") case TokenBadURI: - io.WriteString(w, "url(") - str := escapeString(t.Value) + io.WriteString(w, "url(\"") + str := escapeString(t.Value, 0) str = strings.TrimSuffix(str, "\"") io.WriteString(w, str) io.WriteString(w, "\n)") From 87fb86e0b4bcb334a1aed976a8e16a40fbf27fbe Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 20 Mar 2018 15:37:29 -0700 Subject: [PATCH 21/33] Rename package, update documentation --- scanner/doc.go | 33 -- scanner/scanner.go | 287 ------------------ {scanner => tokenizer}/crlf.go | 2 +- tokenizer/doc.go | 52 ++++ {scanner => tokenizer}/fuzz.go | 4 +- {scanner => tokenizer}/scanner_test.go | 4 +- .../testdata/fuzz/corpus/test-1 | 0 .../testdata/fuzz/corpus/test-10 | 0 .../testdata/fuzz/corpus/test-11 | 0 .../testdata/fuzz/corpus/test-12 | 0 .../testdata/fuzz/corpus/test-13 | 0 .../testdata/fuzz/corpus/test-14 | 0 .../testdata/fuzz/corpus/test-15 | 0 .../testdata/fuzz/corpus/test-16 | 0 .../testdata/fuzz/corpus/test-17 | 0 .../testdata/fuzz/corpus/test-18 | 0 .../testdata/fuzz/corpus/test-19 | 0 .../testdata/fuzz/corpus/test-2 | 0 .../testdata/fuzz/corpus/test-20 | 0 .../testdata/fuzz/corpus/test-21 | 0 .../testdata/fuzz/corpus/test-22 | 0 .../testdata/fuzz/corpus/test-23 | 0 .../testdata/fuzz/corpus/test-24 | 0 .../testdata/fuzz/corpus/test-25 | 0 .../testdata/fuzz/corpus/test-26 | 0 .../testdata/fuzz/corpus/test-27 | 0 .../testdata/fuzz/corpus/test-28 | 0 .../testdata/fuzz/corpus/test-29 | 0 .../testdata/fuzz/corpus/test-3 | 0 .../testdata/fuzz/corpus/test-30 | 0 .../testdata/fuzz/corpus/test-31 | 0 .../testdata/fuzz/corpus/test-32 | 0 .../testdata/fuzz/corpus/test-33 | 0 .../testdata/fuzz/corpus/test-34 | 0 .../testdata/fuzz/corpus/test-35 | 0 .../testdata/fuzz/corpus/test-36 | 0 .../testdata/fuzz/corpus/test-37 | 0 .../testdata/fuzz/corpus/test-38 | 0 .../testdata/fuzz/corpus/test-39 | 0 .../testdata/fuzz/corpus/test-4 | 0 .../testdata/fuzz/corpus/test-40 | 0 .../testdata/fuzz/corpus/test-41 | 0 .../testdata/fuzz/corpus/test-42 | 0 .../testdata/fuzz/corpus/test-43 | 0 .../testdata/fuzz/corpus/test-44 | 0 .../testdata/fuzz/corpus/test-45 | 0 .../testdata/fuzz/corpus/test-46 | 0 .../testdata/fuzz/corpus/test-47 | 0 .../testdata/fuzz/corpus/test-48 | 0 .../testdata/fuzz/corpus/test-49 | 0 .../testdata/fuzz/corpus/test-5 | 0 .../testdata/fuzz/corpus/test-6 | 0 .../testdata/fuzz/corpus/test-7 | 0 .../testdata/fuzz/corpus/test-8 | 0 .../testdata/fuzz/corpus/test-9 | 0 {scanner => tokenizer}/token.go | 2 +- {scanner => tokenizer}/tokenizer.go | 2 +- 57 files changed, 61 insertions(+), 325 deletions(-) delete mode 100644 scanner/doc.go delete mode 100644 scanner/scanner.go rename {scanner => tokenizer}/crlf.go (98%) create mode 100644 tokenizer/doc.go 
rename {scanner => tokenizer}/fuzz.go (97%) rename {scanner => tokenizer}/scanner_test.go (98%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-1 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-10 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-11 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-12 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-13 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-14 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-15 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-16 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-17 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-18 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-19 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-2 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-20 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-21 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-22 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-23 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-24 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-25 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-26 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-27 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-28 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-29 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-3 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-30 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-31 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-32 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-33 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-34 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-35 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-36 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-37 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-38 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-39 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-4 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-40 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-41 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-42 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-43 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-44 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-45 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-46 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-47 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-48 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-49 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-5 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-6 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-7 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-8 (100%) rename {scanner => tokenizer}/testdata/fuzz/corpus/test-9 (100%) rename {scanner => tokenizer}/token.go (99%) rename {scanner => tokenizer}/tokenizer.go (99%) diff --git a/scanner/doc.go b/scanner/doc.go deleted file mode 100644 index f19850e..0000000 --- a/scanner/doc.go +++ /dev/null @@ -1,33 
+0,0 @@ -// Copyright 2012 The Gorilla Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -/* -Package gorilla/css/scanner generates tokens for a CSS3 input. - -It follows the CSS3 specification located at: - - http://www.w3.org/TR/css3-syntax/ - -To use it, create a new scanner for a given CSS string and call Next() until -the token returned has type TokenEOF or TokenError: - - s := scanner.New(myCSS) - for { - token := s.Next() - if token.Type == scanner.TokenEOF || token.Type == scanner.TokenError { - break - } - // Do something with the token... - } - -Following the CSS3 specification, an error can only occur when the scanner -finds an unclosed quote or unclosed comment. In these cases the text becomes -"untokenizable". Everything else is tokenizable and it is up to a parser -to make sense of the token stream (or ignore nonsensical token sequences). - -Note: the scanner doesn't perform lexical analysis or, in other words, it -doesn't care about the token context. It is intended to be used by a -lexer or parser. -*/ -package scanner diff --git a/scanner/scanner.go b/scanner/scanner.go deleted file mode 100644 index 7f034e2..0000000 --- a/scanner/scanner.go +++ /dev/null @@ -1,287 +0,0 @@ -// Copyright (c) 2018 Kane York. Licensed under 2-Clause BSD. - -//+build ignore - -// Copyright 2012 The Gorilla Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package scanner - -import ( - "bufio" - "fmt" - "regexp" - "strings" - "unicode" - "unicode/utf8" -) - -// String returns a string representation of the token. -func (t *Token) String() string { - if len(t.Value) > 10 { - return fmt.Sprintf("%s (line: %d, column: %d): %.10q...", - t.Type, t.Line, t.Column, t.Value) - } - return fmt.Sprintf("%s (line: %d, column: %d): %q", - t.Type, t.Line, t.Column, t.Value) -} - -// All tokens ----------------------------------------------------------------- - -// Macros and productions ----------------------------------------------------- -// http://www.w3.org/TR/css3-syntax/#tokenization - -var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`) - -// macros maps macro names to patterns to be expanded. 
-var macros = map[string]string{ - // must be escaped: `\.+*?()|[]{}^$` - "ident": `-?{nmstart}{nmchar}*`, - "name": `{nmchar}+`, - "nmstart": `[a-zA-Z_]|{nonascii}|{escape}`, - "nonascii": "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]", - "unicode": `\\[0-9a-fA-F]{1,6}{wc}?`, - "escape": "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]", - "nmchar": `[a-zA-Z0-9_-]|{nonascii}|{escape}`, - "num": `[0-9]*\.[0-9]+|[0-9]+`, - "string": `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`, - "stringchar": `{urlchar}|[ ]|\\{nl}`, - "nl": `[\n\r\f]|\r\n`, - "w": `{wc}*`, - "wc": `[\t\n\f\r ]`, - - // urlchar should accept [(ascii characters minus those that need escaping)|{nonascii}|{escape}] - // ASCII characters range = `[\u0020-\u007e]` - // Skip space \u0020 = `[\u0021-\u007e]` - // Skip quotation mark \0022 = `[\u0021\u0023-\u007e]` - // Skip apostrophe \u0027 = `[\u0021\u0023-\u0026\u0028-\u007e]` - // Skip reverse solidus \u005c = `[\u0021\u0023-\u0026\u0028-\u005b\u005d\u007e]` - // Finally, the left square bracket (\u005b) and right (\u005d) needs escaping themselves - "urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}", -} - -// productions maps the list of tokens to patterns to be expanded. -var productions = map[TokenType]string{ - // Unused regexps (matched using other methods) are commented out. - TokenIdent: `{ident}`, - TokenAtKeyword: `@{ident}`, - TokenString: `{string}`, - TokenHash: `#{name}`, - TokenNumber: `{num}`, - TokenPercentage: `{num}%`, - TokenDimension: `{num}{ident}`, - TokenURI: `url\({w}(?:{string}|{urlchar}*?){w}\)`, - TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`, - //TokenCDO: ``, - TokenS: `{wc}+`, - TokenComment: `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`, - TokenFunction: `{ident}\(`, - //TokenIncludes: `~=`, - //TokenDashMatch: `\|=`, - //TokenPrefixMatch: `\^=`, - //TokenSuffixMatch: `\$=`, - //TokenSubstringMatch: `\*=`, - //TokenChar: `[^"']`, - //TokenBOM: "\uFEFF", -} - -// matchers maps the list of tokens to compiled regular expressions. -// -// The map is filled on init() using the macros and productions defined in -// the CSS specification. -var matchers = map[TokenType]*regexp.Regexp{} - -// matchOrder is the order to test regexps when first-char shortcuts -// can't be used. -var matchOrder = []TokenType{ - TokenURI, - TokenFunction, - TokenUnicodeRange, - TokenIdent, - TokenDimension, - TokenPercentage, - TokenNumber, - TokenCDC, -} - -func init() { - // replace macros and compile regexps for productions. - replaceMacro := func(s string) string { - return "(?:" + macros[s[1:len(s)-1]] + ")" - } - for t, s := range productions { - for macroRegexp.MatchString(s) { - s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro) - } - matchers[t] = regexp.MustCompile("^(?:" + s + ")") - } -} - -// Scanner -------------------------------------------------------------------- - -type Scanner struct { - input string - row int - col int - pos int -} - -// New returns a new CSS scanner for the given input. -func New(r *bufio.Reader) *Scanner { - // Normalize newlines. - input = strings.Replace(input, "\r\n", "\n", -1) - return &Scanner{ - input: input, - row: 1, - col: 1, - } -} - -// Next returns the next token from the input. -// -// At the end of the input the token type is TokenEOF. -// -// If the input can't be tokenized the token type is TokenError. This occurs -// in case of unclosed quotation marks or comments. 
-func (s *Scanner) Next() *Token { - if s.err != nil { - return s.err - } - if s.pos >= len(s.input) { - s.err = &Token{TokenEOF, "", s.row, s.col} - return s.err - } - if s.pos == 0 { - // Test BOM only once, at the beginning of the file. - if strings.HasPrefix(s.input, "\uFEFF") { - return s.emitSimple(TokenBOM, "\uFEFF") - } - } - // There's a lot we can guess based on the first byte so we'll take a - // shortcut before testing multiple regexps. - input := s.input[s.pos:] - switch input[0] { - case '\t', '\n', '\f', '\r', ' ': - // Whitespace. - return s.emitToken(TokenS, matchers[TokenS].FindString(input)) - case '.': - // Dot is too common to not have a quick check. - // We'll test if this is a Char; if it is followed by a number it is a - // dimension/percentage/number, and this will be matched later. - if len(input) > 1 && !unicode.IsDigit(rune(input[1])) { - return s.emitSimple(TokenChar, ".") - } - case '#': - // Another common one: Hash or Char. - if match := matchers[TokenHash].FindString(input); match != "" { - return s.emitToken(TokenHash, match) - } - return s.emitSimple(TokenChar, "#") - case '@': - // Another common one: AtKeyword or Char. - if match := matchers[TokenAtKeyword].FindString(input); match != "" { - return s.emitSimple(TokenAtKeyword, match) - } - return s.emitSimple(TokenChar, "@") - case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}': - // More common chars. - return s.emitSimple(TokenChar, string(input[0])) - case '"', '\'': - // String or error. - match := matchers[TokenString].FindString(input) - if match != "" { - return s.emitToken(TokenString, match) - } - - s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col} - return s.err - case '/': - // Comment, error or Char. - if len(input) > 1 && input[1] == '*' { - match := matchers[TokenComment].FindString(input) - if match != "" { - return s.emitToken(TokenComment, match) - } else { - s.err = &Token{TokenError, "unclosed comment", s.row, s.col} - return s.err - } - } - return s.emitSimple(TokenChar, "/") - case '~': - // Includes or Char. - return s.emitPrefixOrChar(TokenIncludes, "~=") - case '|': - // DashMatch or Char. - return s.emitPrefixOrChar(TokenDashMatch, "|=") - case '^': - // PrefixMatch or Char. - return s.emitPrefixOrChar(TokenPrefixMatch, "^=") - case '$': - // SuffixMatch or Char. - return s.emitPrefixOrChar(TokenSuffixMatch, "$=") - case '*': - // SubstringMatch or Char. - return s.emitPrefixOrChar(TokenSubstringMatch, "*=") - case '<': - // CDO or Char. - return s.emitPrefixOrChar(TokenCDO, "`, + TokenS: `{wc}+`, + TokenComment: `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`, + TokenFunction: `{ident}\(`, + //TokenIncludes: `~=`, + //TokenDashMatch: `\|=`, + //TokenPrefixMatch: `\^=`, + //TokenSuffixMatch: `\$=`, + //TokenSubstringMatch: `\*=`, + //TokenChar: `[^"']`, + //TokenBOM: "\uFEFF", +} + +// matchers maps the list of tokens to compiled regular expressions. +// +// The map is filled on init() using the macros and productions defined in +// the CSS specification. +var matchers = map[tokenType]*regexp.Regexp{} + +// matchOrder is the order to test regexps when first-char shortcuts +// can't be used. +var matchOrder = []tokenType{ + TokenURI, + TokenFunction, + TokenUnicodeRange, + TokenIdent, + TokenDimension, + TokenPercentage, + TokenNumber, + TokenCDC, +} + +func init() { + // replace macros and compile regexps for productions. 
+ replaceMacro := func(s string) string { + return "(?:" + macros[s[1:len(s)-1]] + ")" + } + for t, s := range productions { + for macroRegexp.MatchString(s) { + s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro) + } + matchers[t] = regexp.MustCompile("^(?:" + s + ")") + } +} + +// Scanner -------------------------------------------------------------------- + +// New returns a new CSS scanner for the given input. +func New(input string) *Scanner { + // Normalize newlines. + input = strings.Replace(input, "\r\n", "\n", -1) + return &Scanner{ + input: input, + row: 1, + col: 1, + } +} + +// Scanner scans an input and emits tokens following the CSS3 specification. +type Scanner struct { + input string + pos int + row int + col int + err *Token +} + +// Next returns the next token from the input. +// +// At the end of the input the token type is TokenEOF. +// +// If the input can't be tokenized the token type is TokenError. This occurs +// in case of unclosed quotation marks or comments. +func (s *Scanner) Next() *Token { + if s.err != nil { + return s.err + } + if s.pos >= len(s.input) { + s.err = &Token{TokenEOF, "", s.row, s.col} + return s.err + } + if s.pos == 0 { + // Test BOM only once, at the beginning of the file. + if strings.HasPrefix(s.input, "\uFEFF") { + return s.emitSimple(TokenBOM, "\uFEFF") + } + } + // There's a lot we can guess based on the first byte so we'll take a + // shortcut before testing multiple regexps. + input := s.input[s.pos:] + switch input[0] { + case '\t', '\n', '\f', '\r', ' ': + // Whitespace. + return s.emitToken(TokenS, matchers[TokenS].FindString(input)) + case '.': + // Dot is too common to not have a quick check. + // We'll test if this is a Char; if it is followed by a number it is a + // dimension/percentage/number, and this will be matched later. + if len(input) > 1 && !unicode.IsDigit(rune(input[1])) { + return s.emitSimple(TokenChar, ".") + } + case '#': + // Another common one: Hash or Char. + if match := matchers[TokenHash].FindString(input); match != "" { + return s.emitToken(TokenHash, match) + } + return s.emitSimple(TokenChar, "#") + case '@': + // Another common one: AtKeyword or Char. + if match := matchers[TokenAtKeyword].FindString(input); match != "" { + return s.emitSimple(TokenAtKeyword, match) + } + return s.emitSimple(TokenChar, "@") + case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}': + // More common chars. + return s.emitSimple(TokenChar, string(input[0])) + case '"', '\'': + // String or error. + match := matchers[TokenString].FindString(input) + if match != "" { + return s.emitToken(TokenString, match) + } + + s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col} + return s.err + case '/': + // Comment, error or Char. + if len(input) > 1 && input[1] == '*' { + match := matchers[TokenComment].FindString(input) + if match != "" { + return s.emitToken(TokenComment, match) + } else { + s.err = &Token{TokenError, "unclosed comment", s.row, s.col} + return s.err + } + } + return s.emitSimple(TokenChar, "/") + case '~': + // Includes or Char. + return s.emitPrefixOrChar(TokenIncludes, "~=") + case '|': + // DashMatch or Char. + return s.emitPrefixOrChar(TokenDashMatch, "|=") + case '^': + // PrefixMatch or Char. + return s.emitPrefixOrChar(TokenPrefixMatch, "^=") + case '$': + // SuffixMatch or Char. + return s.emitPrefixOrChar(TokenSuffixMatch, "$=") + case '*': + // SubstringMatch or Char. + return s.emitPrefixOrChar(TokenSubstringMatch, "*=") + case '<': + // CDO or Char. 
+ return s.emitPrefixOrChar(TokenCDO, "", TokenCDC, "-->") + checkMatch(" \n \t \n", TokenS, " \n \t \n") + checkMatch("/* foo */", TokenComment, "/* foo */") + checkMatch("bar(", TokenFunction, "bar(") + checkMatch("~=", TokenIncludes, "~=") + checkMatch("|=", TokenDashMatch, "|=") + checkMatch("^=", TokenPrefixMatch, "^=") + checkMatch("$=", TokenSuffixMatch, "$=") + checkMatch("*=", TokenSubstringMatch, "*=") + checkMatch("{", TokenChar, "{") + checkMatch("\uFEFF", TokenBOM, "\uFEFF") + checkMatch(`╯︵┻━┻"stuff"`, TokenIdent, "╯︵┻━┻", TokenString, `"stuff"`) +} From 08b0d9cf2c02d3f34245e635da4264c8919afa38 Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 20 Mar 2018 15:40:27 -0700 Subject: [PATCH 23/33] Ignore fuzz results --- tokenizer/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 tokenizer/.gitignore diff --git a/tokenizer/.gitignore b/tokenizer/.gitignore new file mode 100644 index 0000000..0bc5f58 --- /dev/null +++ b/tokenizer/.gitignore @@ -0,0 +1 @@ +testdata/fuzz From ff8d7b8edd3e5f148db6eed491e65b36dcc0aa48 Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 20 Mar 2018 15:46:27 -0700 Subject: [PATCH 24/33] Remove failing "--\--" test, add test for #2 --- tokenizer/doc.go | 2 +- tokenizer/scanner_test.go | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tokenizer/doc.go b/tokenizer/doc.go index d8c693d..2cd6d19 100644 --- a/tokenizer/doc.go +++ b/tokenizer/doc.go @@ -49,4 +49,4 @@ Note: the scanner doesn't perform lexical analysis or, in other words, it doesn't care about the token context. It is intended to be used by a lexer or parser. */ -package tokenzier +package tokenizer diff --git a/tokenizer/scanner_test.go b/tokenizer/scanner_test.go index f57ef2d..6d49f3f 100644 --- a/tokenizer/scanner_test.go +++ b/tokenizer/scanner_test.go @@ -151,5 +151,8 @@ func TestMatchers(t *testing.T) { checkMatch("\"a0\r", TokenBadString, "a0", &TokenExtraError{}, TokenS, "\n") checkMatch("\\fun(", TokenFunction, "\x0fun") checkMatch("\"abc\\\"def\nghi", TokenBadString, "abc\"def", &TokenExtraError{}, TokenS, "\n", TokenIdent, "ghi") - checkMatch("---\\\x18-00", TokenDelim, "-", TokenDelim, "-", TokenIdent, "-\x18-00") + // checkMatch("---\\\x18-00", TokenDelim, "-", TokenDelim, "-", TokenIdent, "-\x18-00") + Fuzz([]byte( + `#sw_tfbb,#id_d{display:none}.sw_pref{border-style:solid;border-width:7px 0 7px 10px;vertical-align:bottom}#b_tween{margin-top:-28px}#b_tween>span{line-height:30px}#b_tween .ftrH{line-height:30px;height:30px}input{font:inherit;font-size:100%}.b_searchboxForm{font:18px/normal 'Segoe UI',Arial,Helvetica,Sans-Serif}.b_beta{font:11px/normal Arial,Helvetica,Sans-Serif}.b_scopebar,.id_button{line-height:30px}.sa_ec{font:13px Arial,Helvetica,Sans-Serif}#sa_ul .sa_hd{font-size:11px;line-height:16px}#sw_as strong{font-family:'Segoe UI Semibold',Arial,Helvetica,Sans-Serif}#id_h{background-color:transparent!important;position:relativ e!important;float:right;height:35px!important;width:280px!important}.sw_pref{margin:0 15px 3px 0}#id_d{left:auto;right:26px;top:35px!important}.id_avatar{vertical-align:middle;margin:10px 0 10px 10px}`), + ) } From df4d3f6b5fbeb7de7d30dae58c3b62ee09e24341 Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 20 Mar 2018 16:04:31 -0700 Subject: [PATCH 25/33] Improve documentation, delete unused methods --- tokenizer/fuzz.go | 1 + tokenizer/token.go | 49 ++++++++++++++++++++++-------------------- tokenizer/tokenizer.go | 29 ++++++++++--------------- 3 files changed, 38 insertions(+), 41 deletions(-) diff --git 
a/tokenizer/fuzz.go b/tokenizer/fuzz.go index d8e512b..e074e2e 100644 --- a/tokenizer/fuzz.go +++ b/tokenizer/fuzz.go @@ -9,6 +9,7 @@ import ( "reflect" ) +// Entry point for fuzz testing. func Fuzz(b []byte) int { fmt.Printf("=== Start fuzz test ===\n%s\n", b) var tokens []Token diff --git a/tokenizer/token.go b/tokenizer/token.go index 5e6ec52..302f114 100644 --- a/tokenizer/token.go +++ b/tokenizer/token.go @@ -29,17 +29,6 @@ func (t TokenType) StopToken() bool { TokenBadString || t == TokenBadURI } -// Simple tokens TODO figure out a useful definition for this. -func (t TokenType) SimpleToken() bool { - if t.StopToken() { - return false - } - if t == TokenHash || t == TokenNumber || t == TokenPercentage || t == TokenDimension || t == TokenUnicodeRange { - return false - } - return true -} - // ParseError represents a CSS syntax error. type ParseError struct { Type TokenType @@ -47,16 +36,20 @@ type ParseError struct { Loc int } +// implements error func (e *ParseError) Error() string { return e.Message } // Token represents a token in the CSS syntax. type Token struct { - Type TokenType + Type TokenType + // A string representation of the token value that depends on the type. + // For example, for a TokenURI, the Value is the URI itself. For a + // TokenPercentage, the Value is the number without the percent sign. Value string - // Extra data for the token beyond a simple string. - // Will always be a pointer to a "Token*Extra" type in this package. + // Extra data for the token beyond a simple string. Will always be a + // pointer to a "TokenExtra*" type in this package. Extra TokenExtra } @@ -65,25 +58,27 @@ const ( // Scanner flags. TokenError TokenType = iota TokenEOF - // From now on, only tokens from the CSS specification. + + // Tokens TokenIdent TokenFunction + TokenURI TokenDelim // Single character TokenAtKeyword TokenString + TokenS // Whitespace + // CSS Syntax Level 3 removes comments from the token stream, but they are + // preserved here. + TokenComment + + // Extra data: TokenExtraHash TokenHash + // Extra data: TokenExtraNumeric TokenNumber TokenPercentage TokenDimension - TokenURI + // Extra data: TokenExtraUnicodeRange TokenUnicodeRange - TokenCDO - TokenCDC - // Whitespace - TokenS - // CSS Syntax Level 3 removes comments from the token stream, but they are - // preserved here. - TokenComment // Error tokens TokenBadString @@ -106,6 +101,8 @@ const ( TokenCloseParen TokenOpenBrace TokenCloseBrace + TokenCDO + TokenCDC ) // backwards compatibility @@ -369,12 +366,14 @@ func escapeString(s string, delim byte) string { return buf.String() } +// Attempt to turn the token back into a CSS string. (Wrapper around WriteTo.) func (t *Token) Render() string { var buf bytes.Buffer t.WriteTo(&buf) return buf.String() } +// Attempt to turn the token back into a CSS string. func (t *Token) WriteTo(w io.Writer) { switch t.Type { case TokenError: @@ -438,10 +437,14 @@ func (t *Token) WriteTo(w io.Writer) { } // TokenRenderer takes care of the comment insertion rules for serialization. +// This type is mostly intended for the fuzz test and not for general +// consumption, but it can be used for that. type TokenRenderer struct { lastToken Token } +// Write a token to the given io.Writer, potentially inserting an empty comment +// in front based on what the previous token was. 
func (r *TokenRenderer) WriteTokenTo(w io.Writer, t Token) { var prevKey, curKey interface{} if r.lastToken.Type == TokenDelim { diff --git a/tokenizer/tokenizer.go b/tokenizer/tokenizer.go index 0ecf20e..fc67a13 100644 --- a/tokenizer/tokenizer.go +++ b/tokenizer/tokenizer.go @@ -25,18 +25,22 @@ type Tokenizer struct { err error peek [3]byte - ErrorMode int + // ErrorMode int tok Token } +/* const ( // Default error mode - tokenization errors are represented as special tokens in the stream, and I/O errors are TokenError. ErrorModeTokens = iota ErrorModeFatal ) +*/ -// Construct a Tokenizer from the given input. Input need not be normalized. +// Construct a Tokenizer from the given input. Input need not be 'normalized' +// according to the spec (newlines changed to \n, zero bytes changed to +// U+FFFD). func NewTokenizer(r io.Reader) *Tokenizer { return &Tokenizer{ r: bufio.NewReader(transform.NewReader(r, new(normalize))), @@ -44,7 +48,7 @@ func NewTokenizer(r io.Reader) *Tokenizer { } // Scan for the next token. If the tokenizer is in an error state, no input -// will be consumed, and .AcknowledgeError() should be called instead. +// will be consumed. func (z *Tokenizer) Scan() { defer func() { rec := recover() @@ -75,34 +79,23 @@ func (z *Tokenizer) Scan() { } } -// Return the current token. +// Get the most recently scanned token. func (z *Tokenizer) Token() Token { return z.tok } -// Combines the calls to Scan() and Token(). +// Scan for the next token and return it. func (z *Tokenizer) Next() Token { z.Scan() return z.tok } -// Err returns the last error to be encountered and not cleared. +// Err returns the last input reading error to be encountered. It is filled +// when TokenError is returned. func (z *Tokenizer) Err() error { return z.err } -// Acknowledge a returned error token. This can only be called to clear -// TokenBadString, TokenBadURI, and TokenBadEscape. Using it for non-parsing -// errors will panic. -func (z *Tokenizer) AcknowledgeError() { - _, ok := z.err.(*ParseError) - if !ok { - // TODO ErrorMode - return - } - z.err = nil -} - // repeek reads the next 3 bytes into the tokenizer. on EOF, the bytes are // filled with zeroes. (Null bytes in the input are preprocessed into U+FFFD.) func (z *Tokenizer) repeek() { From 2689bbfa6e605ff7190fbfbebabb52aabe0e29e7 Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 20 Mar 2018 16:05:59 -0700 Subject: [PATCH 26/33] tighten signature of TokenExtraTypeLookup --- tokenizer/token.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenizer/token.go b/tokenizer/token.go index 302f114..0471985 100644 --- a/tokenizer/token.go +++ b/tokenizer/token.go @@ -155,7 +155,7 @@ type TokenExtra interface { // TokenExtraTypeLookup provides a handy check for whether a given token type // should contain extra data. 
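For orientation, a minimal consumer of the Tokenizer API documented above (NewTokenizer, Next, Err, and the StopToken helper all appear in these diffs; the CSS literal and the printing are illustrative only):

	package main

	import (
		"fmt"
		"strings"

		"github.com/gorilla/css/tokenizer"
	)

	func main() {
		tz := tokenizer.NewTokenizer(strings.NewReader("a { color: red }"))
		for {
			tok := tz.Next()
			if tok.Type.StopToken() {
				// Stop tokens cover EOF, I/O errors, and the bad-* tokens.
				if tok.Type == tokenizer.TokenError {
					fmt.Println("read error:", tz.Err())
				}
				break
			}
			fmt.Println(tok.Type, tok.Value)
		}
	}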
-var TokenExtraTypeLookup = map[TokenType]interface{}{ +var TokenExtraTypeLookup = map[TokenType]TokenExtra{ TokenError: &TokenExtraError{}, TokenBadEscape: &TokenExtraError{}, TokenBadString: &TokenExtraError{}, From 5f3baa3f2cb22dbf7703eaf23a32fc10b5a65410 Mon Sep 17 00:00:00 2001 From: Kane York Date: Tue, 20 Mar 2018 16:08:26 -0700 Subject: [PATCH 27/33] improve documentation of TokenExtra.String() --- tokenizer/token.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tokenizer/token.go b/tokenizer/token.go index 0471985..8d860e8 100644 --- a/tokenizer/token.go +++ b/tokenizer/token.go @@ -172,6 +172,7 @@ type TokenExtraHash struct { IsIdentifier bool } +// Returns a descriptive string, either "unrestricted" or "id". func (e *TokenExtraHash) String() string { if e == nil || !e.IsIdentifier { return "unrestricted" @@ -188,6 +189,7 @@ type TokenExtraNumeric struct { Dimension string } +// Returns the Dimension field. func (e *TokenExtraNumeric) String() string { if e == nil { return "" @@ -201,6 +203,7 @@ type TokenExtraUnicodeRange struct { End rune } +// Returns a valid CSS representation of the token. func (e *TokenExtraUnicodeRange) String() string { if e == nil { panic("TokenExtraUnicodeRange: unexpected nil pointer value") @@ -219,7 +222,7 @@ type TokenExtraError struct { Err error } -// String returns the error text. +// Returns Err.Error(). func (e *TokenExtraError) String() string { return e.Err.Error() } From ad83c8e4820580972e7d149365cbfb7cc11100a2 Mon Sep 17 00:00:00 2001 From: riking Date: Sat, 24 Mar 2018 19:15:43 -0700 Subject: [PATCH 28/33] Change Token.WriteTo to standard signature --- tokenizer/token.go | 111 +++++++++++++++++++++++++++++++-------------- 1 file changed, 76 insertions(+), 35 deletions(-) diff --git a/tokenizer/token.go b/tokenizer/token.go index 8d860e8..98f983c 100644 --- a/tokenizer/token.go +++ b/tokenizer/token.go @@ -369,86 +369,116 @@ func escapeString(s string, delim byte) string { return buf.String() } -// Attempt to turn the token back into a CSS string. (Wrapper around WriteTo.) +// Return the CSS source representation of the token. (Wrapper around +// WriteTo.) func (t *Token) Render() string { var buf bytes.Buffer - t.WriteTo(&buf) + _, _ = t.WriteTo(&buf) return buf.String() } -// Attempt to turn the token back into a CSS string. -func (t *Token) WriteTo(w io.Writer) { +func stickyWriteString(n *int, err *error, w io.Writer, s string) { + n2, err2 := io.WriteString(w, s) + *n += n2 + if err2 != nil { + if *err != nil { + *err = err2 + } + } +} + +// Write the CSS source representation of the token to the provided writer. If +// you are attempting to render a series of tokens, see the TokenRenderer type +// to handle comment insertion rules. +// +// Tokens with type TokenError do not write anything. 
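As a concrete (though invented) data point for Render and the WriteTo switch that follows, a hash token built by hand should round-trip back to CSS source; the fmt and tokenizer imports are assumed:

	tok := tokenizer.Token{
		Type:  tokenizer.TokenHash,
		Value: "nav",
		Extra: &tokenizer.TokenExtraHash{IsIdentifier: true},
	}
	fmt.Println(tok.Render()) // expected to print "#nav"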
+func (t *Token) WriteTo(w io.Writer) (n int, err error) { switch t.Type { case TokenError: return case TokenEOF: return case TokenIdent: - fmt.Fprint(w, escapeIdentifier(t.Value)) + return io.WriteString(w, escapeIdentifier(t.Value)) case TokenAtKeyword: - fmt.Fprint(w, "@", escapeIdentifier(t.Value)) + stickyWriteString(&n, &err, w, "@") + stickyWriteString(&n, &err, w, escapeIdentifier(t.Value)) + return case TokenDelim: if t.Value == "\\" { - fmt.Fprint(w, "\\\n") + // nb: should not happen, this is actually TokenBadEscape + return io.WriteString(w, "\\\n") } else { - fmt.Fprint(w, t.Value) + return io.WriteString(w, t.Value) } case TokenHash: e := t.Extra.(*TokenExtraHash) io.WriteString(w, "#") if e.IsIdentifier { - fmt.Fprint(w, escapeIdentifier(t.Value)) + return io.WriteString(w, escapeIdentifier(t.Value)) } else { - fmt.Fprint(w, escapeHashName(t.Value)) + return io.WriteString(w, escapeHashName(t.Value)) } case TokenPercentage: - fmt.Fprint(w, t.Value, "%") + stickyWriteString(&n, &err, w, t.Value) + stickyWriteString(&n, &err, w, "%") + return case TokenDimension: e := t.Extra.(*TokenExtraNumeric) - fmt.Fprint(w, t.Value, escapeDimension(e.Dimension)) + stickyWriteString(&n, &err, w, t.Value) + stickyWriteString(&n, &err, w, escapeDimension(e.Dimension)) + return case TokenString: - io.WriteString(w, escapeString(t.Value, '"')) + return io.WriteString(w, escapeString(t.Value, '"')) case TokenURI: - io.WriteString(w, "url(") - io.WriteString(w, escapeString(t.Value, '"')) - io.WriteString(w, ")") + stickyWriteString(&n, &err, w, "url(") + stickyWriteString(&n, &err, w, escapeString(t.Value, '"')) + stickyWriteString(&n, &err, w, ")") + return case TokenUnicodeRange: - io.WriteString(w, t.Extra.String()) + return io.WriteString(w, t.Extra.String()) case TokenComment: - io.WriteString(w, "/*") - io.WriteString(w, t.Value) - io.WriteString(w, "*/") + stickyWriteString(&n, &err, w, "/*") + stickyWriteString(&n, &err, w, t.Value) + stickyWriteString(&n, &err, w, "*/") + return case TokenFunction: - io.WriteString(w, escapeIdentifier(t.Value)) - io.WriteString(w, "(") - + stickyWriteString(&n, &err, w, escapeIdentifier(t.Value)) + stickyWriteString(&n, &err, w, "(") + return case TokenBadEscape: - io.WriteString(w, "\\\n") + return io.WriteString(w, "\\\n") case TokenBadString: - io.WriteString(w, "\"") - io.WriteString(w, escapeString(t.Value, 0)) - io.WriteString(w, "\n") + stickyWriteString(&n, &err, w, "\"") + stickyWriteString(&n, &err, w, escapeString(t.Value, 0)) + stickyWriteString(&n, &err, w, "\n") + return case TokenBadURI: - io.WriteString(w, "url(\"") + stickyWriteString(&n, &err, w, "url(\"") str := escapeString(t.Value, 0) str = strings.TrimSuffix(str, "\"") - io.WriteString(w, str) - io.WriteString(w, "\n)") + stickyWriteString(&n, &err, w, str) + stickyWriteString(&n, &err, w, "\n)") + return default: - fmt.Fprint(w, t.Value) + return io.WriteString(w, t.Value) } } // TokenRenderer takes care of the comment insertion rules for serialization. // This type is mostly intended for the fuzz test and not for general -// consumption, but it can be used for that. +// consumption, but it can be used by consumers that want to re-render a parse +// stream. type TokenRenderer struct { lastToken Token } // Write a token to the given io.Writer, potentially inserting an empty comment // in front based on what the previous token was. 
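A short sketch of how a consumer might drive TokenRenderer over a previously collected token slice, mirroring the way fuzz.go uses it (the renderAll helper name is invented; imports of io and the tokenizer package are assumed):

	func renderAll(w io.Writer, tokens []tokenizer.Token) error {
		var r tokenizer.TokenRenderer // zero value is ready to use
		for _, tok := range tokens {
			if _, err := r.WriteTokenTo(w, tok); err != nil {
				return err
			}
		}
		return nil
	}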
-func (r *TokenRenderer) WriteTokenTo(w io.Writer, t Token) { +// +// In the event of a writing error, the TokenRenderer is left in an +// indeterminate state. (TODO: maybe fix that?) +func (r *TokenRenderer) WriteTokenTo(w io.Writer, t Token) (n int, err error) { var prevKey, curKey interface{} if r.lastToken.Type == TokenDelim { prevKey = r.lastToken.Value[0] @@ -464,14 +494,25 @@ func (r *TokenRenderer) WriteTokenTo(w io.Writer, t Token) { m1, ok := commentInsertionRules[prevKey] if ok { if m1[curKey] { - io.WriteString(w, "/**/") + n2, err2 := io.WriteString(w, "/**/") + if err2 != nil { + return n2, err2 + } else if n2 != 4 { + return n2, io.ErrShortWrite + } else { + n += n2 + } } } - t.WriteTo(w) + n2, err2 := t.WriteTo(w) r.lastToken = t + n += n2 + return n, err2 } +// CSS Syntax Level 3 - Section 9 + var commentInsertionThruCDC = map[interface{}]bool{ TokenIdent: true, TokenFunction: true, From f4312d7e5e4c10630716a9f44c3bd12f69dbe81a Mon Sep 17 00:00:00 2001 From: riking Date: Sat, 24 Mar 2018 19:17:49 -0700 Subject: [PATCH 29/33] Update README, update tokenizer docs --- README.md | 6 +++++- tokenizer/doc.go | 26 +++++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index c8eee22..345d1b9 100644 --- a/README.md +++ b/README.md @@ -2,4 +2,8 @@ css === [![GoDoc](https://godoc.org/github.com/gorilla/css?status.svg)](https://godoc.org/github.com/gorilla/css) [![Build Status](https://travis-ci.org/gorilla/css.png?branch=master)](https://travis-ci.org/gorilla/css) -A CSS3 tokenizer based on https://www.w3.org/TR/css-syntax-3/#tokenizer-algorithms +A CSS3 tokenizer. + +This repository contains two packages. The 'scanner' package is based on an older version of the CSS specification, and is kept around for compatibility with existing code. + +The 'tokenizer' package is based on the CSS Syntax Level 3 specification at . diff --git a/tokenizer/doc.go b/tokenizer/doc.go index 2cd6d19..8da676c 100644 --- a/tokenizer/doc.go +++ b/tokenizer/doc.go @@ -8,9 +8,9 @@ Package gorilla/css/tokenizer generates tokens for a CSS3 input. It follows the CSS3 specification located at: - http://www.w3.org/TR/css3-syntax/ + http://www.w3.org/TR/css-syntax-3/#tokenizer-algorithms -To use it, create a new scanner for a given CSS input and call Next() until +To use it, create a new tokenizer for a given CSS input and call Next() until the token returned is a "stop token": s := tokenizer.New(strings.NewReader(myCSS)) @@ -22,8 +22,8 @@ the token returned is a "stop token": // Do something with the token... } -If the consumer wants to accept malformed input, change the check to the -following instead: +If the consumer wants to accept malformed input, use the following check +instead: token := s.Next() if token.Type == tokenizer.TokenEOF || token.Type == tokenizer.TokenError { @@ -32,21 +32,21 @@ following instead: The three potential tokenization errors are a "bad-escape" (backslash-newline outside a "string" or url() in the input), a "bad-string" (unescaped newline -inside a "string"), and a "bad-url" (a few different cases). Parsers can choose -to abort when seeing one of these errors, or ignore the declaration and attempt -to recover. +inside a "string"), and a "bad-url" (a few different cases). Parsers can +choose to abort when seeing one of these errors, or ignore the declaration and +attempt to recover. -Returned tokens that carry extra information have a non-nil .Extra value. 
For +Returned tokens that carry extra information have a non-nil .Extra value. For TokenError, TokenBadEscape, TokenBadString, and TokenBadURI, the TokenExtraError type carries an `error` with informative text about the nature -of the error. For TokenNumber, TokenPercentage, and TokenDimension, the +of the error. For TokenNumber, TokenPercentage, and TokenDimension, the TokenExtraNumeric specifies whether the number is integral, and for -TokenDimension, contains the unit string (e.g. "px"). For TokenUnicodeRange, +TokenDimension, contains the unit string (e.g. "px"). For TokenUnicodeRange, the TokenExtraUnicodeRange type contains the actual start and end values of the range. -Note: the scanner doesn't perform lexical analysis or, in other words, it -doesn't care about the token context. It is intended to be used by a -lexer or parser. +Note: the tokenizer doesn't perform lexical analysis, it only implements +Section 4 of the CSS Syntax Level 3 specification. See Section 5 for the +parsing rules. */ package tokenizer From 551cdbaf8e5b12cd88988b88afb8a1a659d87fee Mon Sep 17 00:00:00 2001 From: riking Date: Sat, 24 Mar 2018 19:27:34 -0700 Subject: [PATCH 30/33] Suppress output from Fuzz during tests --- tokenizer/fuzz.go | 27 ++++++++++++++++++++------- tokenizer/scanner_test.go | 3 +++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/tokenizer/fuzz.go b/tokenizer/fuzz.go index e074e2e..c04dfcb 100644 --- a/tokenizer/fuzz.go +++ b/tokenizer/fuzz.go @@ -9,15 +9,29 @@ import ( "reflect" ) +// Tests should set this to true to suppress fuzzer output except on failure. +var fuzzNoPrint = false + // Entry point for fuzz testing. func Fuzz(b []byte) int { - fmt.Printf("=== Start fuzz test ===\n%s\n", b) - var tokens []Token + success := false + var testLogBuf bytes.Buffer + fuzzPrintf := func(f string, v ...interface{}) { + fmt.Fprintf(&testLogBuf, f, v...) + } + defer func() { + if !success { + fmt.Print(testLogBuf.String()) + } + }() + fuzzPrintf("=== Start fuzz test ===\n%s\n", b) + + var tokens []Token tz := NewTokenizer(bytes.NewReader(b)) for { tt := tz.Next() - fmt.Printf("[OT] %v\n", tt) + fuzzPrintf("[OT] %v\n", tt) if tt.Type == TokenError { // We should not have reading errors panic(tt) @@ -32,17 +46,16 @@ func Fuzz(b []byte) int { var wr TokenRenderer var rerenderBuf bytes.Buffer - success := false defer func() { if !success { - fmt.Println("RERENDER BUFFER:", rerenderBuf.String()) + fuzzPrintf("RE-RENDER BUFFER:\n%s\n", rerenderBuf.String()) } }() pr, pw := io.Pipe() defer pr.Close() go func() { - writeTarget := io.MultiWriter(pw, &rerenderBuf) + writeTarget := io.MultiWriter(&rerenderBuf, pw) for _, v := range tokens { wr.WriteTokenTo(writeTarget, v) } @@ -56,7 +69,7 @@ func Fuzz(b []byte) int { i++ } tt := tz.Next() - fmt.Printf("[RT] %v\n", tt) + fuzzPrintf("[RT] %v\n", tt) if tt.Type == TokenComment { // Ignore comments while comparing continue diff --git a/tokenizer/scanner_test.go b/tokenizer/scanner_test.go index 6d49f3f..b89bcea 100644 --- a/tokenizer/scanner_test.go +++ b/tokenizer/scanner_test.go @@ -12,6 +12,9 @@ import ( ) func TestMatchers(t *testing.T) { + // Fuzzer should not print during routine testing + fuzzNoPrint = true + // Just basic checks, not exhaustive at all. 
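The .Extra contract described in doc.go above is naturally handled with a type switch; a hedged sketch using the extra types and fields from the token.go diffs earlier in this series (the describeExtra helper is invented):

	func describeExtra(tok tokenizer.Token) string {
		switch e := tok.Extra.(type) {
		case *tokenizer.TokenExtraNumeric:
			// TokenNumber, TokenPercentage, TokenDimension; Dimension holds the unit, e.g. "px".
			return "numeric, unit=" + e.Dimension
		case *tokenizer.TokenExtraUnicodeRange:
			// TokenUnicodeRange; String() renders the U+XXXX-YYYY form.
			return "range " + e.String()
		case *tokenizer.TokenExtraHash:
			// TokenHash; String() reports "id" or "unrestricted".
			return "hash, " + e.String()
		case *tokenizer.TokenExtraError:
			// TokenError and the bad-* tokens carry the underlying error.
			return "error: " + e.Err.Error()
		default:
			return "no extra data"
		}
	}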
checkMatch := func(s string, ttList ...interface{}) { tz := NewTokenizer(strings.NewReader(s)) From 05a2682d8a5310c5d0a68787fe8d8079beb6b490 Mon Sep 17 00:00:00 2001 From: riking Date: Sat, 24 Mar 2018 19:33:42 -0700 Subject: [PATCH 31/33] Oops, WriteTo returns int64 not int --- tokenizer/token.go | 53 +++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/tokenizer/token.go b/tokenizer/token.go index 98f983c..04fcee8 100644 --- a/tokenizer/token.go +++ b/tokenizer/token.go @@ -377,9 +377,9 @@ func (t *Token) Render() string { return buf.String() } -func stickyWriteString(n *int, err *error, w io.Writer, s string) { +func stickyWriteString(n *int64, err *error, w io.Writer, s string) { n2, err2 := io.WriteString(w, s) - *n += n2 + *n += int64(n2) if err2 != nil { if *err != nil { *err = err2 @@ -392,14 +392,15 @@ func stickyWriteString(n *int, err *error, w io.Writer, s string) { // to handle comment insertion rules. // // Tokens with type TokenError do not write anything. -func (t *Token) WriteTo(w io.Writer) (n int, err error) { +func (t *Token) WriteTo(w io.Writer) (n int64, err error) { switch t.Type { case TokenError: return case TokenEOF: return case TokenIdent: - return io.WriteString(w, escapeIdentifier(t.Value)) + stickyWriteString(&n, &err, w, escapeIdentifier(t.Value)) + return case TokenAtKeyword: stickyWriteString(&n, &err, w, "@") stickyWriteString(&n, &err, w, escapeIdentifier(t.Value)) @@ -407,18 +408,20 @@ func (t *Token) WriteTo(w io.Writer) (n int, err error) { case TokenDelim: if t.Value == "\\" { // nb: should not happen, this is actually TokenBadEscape - return io.WriteString(w, "\\\n") + stickyWriteString(&n, &err, w, "\\\n") } else { - return io.WriteString(w, t.Value) + stickyWriteString(&n, &err, w, t.Value) } + return case TokenHash: e := t.Extra.(*TokenExtraHash) - io.WriteString(w, "#") + stickyWriteString(&n, &err, w, "#") if e.IsIdentifier { - return io.WriteString(w, escapeIdentifier(t.Value)) + stickyWriteString(&n, &err, w, escapeIdentifier(t.Value)) } else { - return io.WriteString(w, escapeHashName(t.Value)) + stickyWriteString(&n, &err, w, escapeHashName(t.Value)) } + return case TokenPercentage: stickyWriteString(&n, &err, w, t.Value) stickyWriteString(&n, &err, w, "%") @@ -429,14 +432,16 @@ func (t *Token) WriteTo(w io.Writer) (n int, err error) { stickyWriteString(&n, &err, w, escapeDimension(e.Dimension)) return case TokenString: - return io.WriteString(w, escapeString(t.Value, '"')) + stickyWriteString(&n, &err, w, escapeString(t.Value, '"')) + return case TokenURI: stickyWriteString(&n, &err, w, "url(") stickyWriteString(&n, &err, w, escapeString(t.Value, '"')) stickyWriteString(&n, &err, w, ")") return case TokenUnicodeRange: - return io.WriteString(w, t.Extra.String()) + stickyWriteString(&n, &err, w, t.Extra.String()) + return case TokenComment: stickyWriteString(&n, &err, w, "/*") stickyWriteString(&n, &err, w, t.Value) @@ -447,7 +452,8 @@ func (t *Token) WriteTo(w io.Writer) (n int, err error) { stickyWriteString(&n, &err, w, "(") return case TokenBadEscape: - return io.WriteString(w, "\\\n") + stickyWriteString(&n, &err, w, "\\\n") + return case TokenBadString: stickyWriteString(&n, &err, w, "\"") stickyWriteString(&n, &err, w, escapeString(t.Value, 0)) @@ -461,7 +467,8 @@ func (t *Token) WriteTo(w io.Writer) (n int, err error) { stickyWriteString(&n, &err, w, "\n)") return default: - return io.WriteString(w, t.Value) + stickyWriteString(&n, &err, w, t.Value) + return } } @@ -475,10 
+482,7 @@ type TokenRenderer struct { // Write a token to the given io.Writer, potentially inserting an empty comment // in front based on what the previous token was. -// -// In the event of a writing error, the TokenRenderer is left in an -// indeterminate state. (TODO: maybe fix that?) -func (r *TokenRenderer) WriteTokenTo(w io.Writer, t Token) (n int, err error) { +func (r *TokenRenderer) WriteTokenTo(w io.Writer, t Token) (n int64, err error) { var prevKey, curKey interface{} if r.lastToken.Type == TokenDelim { prevKey = r.lastToken.Value[0] @@ -494,21 +498,18 @@ func (r *TokenRenderer) WriteTokenTo(w io.Writer, t Token) (n int, err error) { m1, ok := commentInsertionRules[prevKey] if ok { if m1[curKey] { - n2, err2 := io.WriteString(w, "/**/") - if err2 != nil { - return n2, err2 - } else if n2 != 4 { - return n2, io.ErrShortWrite - } else { - n += n2 - } + stickyWriteString(&n, &err, w, "/**/") } } n2, err2 := t.WriteTo(w) r.lastToken = t + n += n2 - return n, err2 + if err2 != nil && err == nil { + err = err2 + } + return n, err } // CSS Syntax Level 3 - Section 9 From 35e0c2bce7c8a8c388be18ab91a33dc225cbb372 Mon Sep 17 00:00:00 2001 From: riking Date: Sat, 24 Mar 2018 19:53:01 -0700 Subject: [PATCH 32/33] travis.yml: skip tokenizer package in old versions --- .travis.yml | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index fe78007..217dd4d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,12 +3,16 @@ sudo: false matrix: include: - - go: 1.3 - - go: 1.4 - - go: 1.5 - - go: 1.6 - - go: 1.7 - - go: 1.8 + - go: "1.3" + env: SKIP_TOKENIZER=true + - go: "1.4" + env: SKIP_TOKENIZER=true + - go: "1.5" + - go: "1.6" + - go: "1.7" + - go: "1.8" + - go: "1.9" + - go: "1.10" - go: tip allow_failures: - go: tip @@ -17,4 +21,9 @@ script: - go get -t -v ./... - diff -u <(echo -n) <(gofmt -d .) - go vet $(go list ./... | grep -v /vendor/) - - go test -v -race ./... + - > + if [ "$SKIP_TOKENIZER" = "true" ]; then + go test -v -race ./scanner + else + go test -v -race ./... + fi From c37ded0aac8956eb6d21b43690ef40e3e5f09346 Mon Sep 17 00:00:00 2001 From: riking Date: Sat, 24 Mar 2018 19:55:01 -0700 Subject: [PATCH 33/33] travis.yml: Drop go 1.3 and 1.4 support (bufio.Reader.Discard) --- .travis.yml | 11 +---------- README.md | 4 ++-- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/.travis.yml b/.travis.yml index 217dd4d..c73651e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,10 +3,6 @@ sudo: false matrix: include: - - go: "1.3" - env: SKIP_TOKENIZER=true - - go: "1.4" - env: SKIP_TOKENIZER=true - go: "1.5" - go: "1.6" - go: "1.7" @@ -21,9 +17,4 @@ script: - go get -t -v ./... - diff -u <(echo -n) <(gofmt -d .) - go vet $(go list ./... | grep -v /vendor/) - - > - if [ "$SKIP_TOKENIZER" = "true" ]; then - go test -v -race ./scanner - else - go test -v -race ./... - fi + - go test -v -race ./... diff --git a/README.md b/README.md index 345d1b9..90e5235 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,6 @@ css A CSS3 tokenizer. -This repository contains two packages. The 'scanner' package is based on an older version of the CSS specification, and is kept around for compatibility with existing code. +This repository contains two packages. The 'scanner' package is based on an older version of the CSS specification, and is kept around for compatibility with existing code. Minimum Go version is 1.3. -The 'tokenizer' package is based on the CSS Syntax Level 3 specification at . 
+The 'tokenizer' package is based on the CSS Syntax Level 3 specification at <https://www.w3.org/TR/css-syntax-3/>. Minimum Go version is 1.5.