diff --git a/.travis.yml b/.travis.yml
index fe78007..c73651e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,12 +3,12 @@ sudo: false
 
 matrix:
   include:
-    - go: 1.3
-    - go: 1.4
-    - go: 1.5
-    - go: 1.6
-    - go: 1.7
-    - go: 1.8
+    - go: "1.5"
+    - go: "1.6"
+    - go: "1.7"
+    - go: "1.8"
+    - go: "1.9"
+    - go: "1.10"
     - go: tip
   allow_failures:
     - go: tip
diff --git a/README.md b/README.md
index e266555..90e5235 100644
--- a/README.md
+++ b/README.md
@@ -3,3 +3,7 @@ css
 [![GoDoc](https://godoc.org/github.com/gorilla/css?status.svg)](https://godoc.org/github.com/gorilla/css) [![Build Status](https://travis-ci.org/gorilla/css.png?branch=master)](https://travis-ci.org/gorilla/css)
 
 A CSS3 tokenizer.
+
+This repository contains two packages. The 'scanner' package is based on an older version of the CSS specification, and is kept around for compatibility with existing code. Minimum Go version is 1.3.
+
+The 'tokenizer' package is based on the CSS Syntax Level 3 specification at http://www.w3.org/TR/css-syntax-3/. Minimum Go version is 1.5.
diff --git a/tokenizer/.gitignore b/tokenizer/.gitignore
new file mode 100644
index 0000000..0bc5f58
--- /dev/null
+++ b/tokenizer/.gitignore
@@ -0,0 +1 @@
+testdata/fuzz
diff --git a/tokenizer/crlf.go b/tokenizer/crlf.go
new file mode 100644
index 0000000..1c0a084
--- /dev/null
+++ b/tokenizer/crlf.go
@@ -0,0 +1,60 @@
+// Copyright (c) 2018 Kane York. Licensed under 2-Clause BSD.
+
+package tokenizer
+
+// The crlf package helps in dealing with files that have DOS-style CR/LF line
+// endings.
+//
+// Copyright (c) 2015 Andy Balholm. Licensed under 2-Clause BSD.
+//
+// package crlf
+
+import "golang.org/x/text/transform"
+
+// Normalize takes CRLF, CR, or LF line endings in src, and converts them
+// to LF in dst.
+//
+// cssparse: Also replace null bytes with U+FFFD REPLACEMENT CHARACTER.
+type normalize struct {
+	prev byte
+}
+
+const replacementCharacter = "\uFFFD"
+
+func (n *normalize) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
+	for nDst < len(dst) && nSrc < len(src) {
+		c := src[nSrc]
+		switch c {
+		case '\r':
+			dst[nDst] = '\n'
+		case '\n':
+			if n.prev == '\r' {
+				nSrc++
+				n.prev = c
+				continue
+			}
+			dst[nDst] = '\n'
+		case 0:
+			// nb: len(replacementCharacter) == 3
+			if nDst+3 >= len(dst) {
+				err = transform.ErrShortDst
+				return
+			}
+			copy(dst[nDst:], replacementCharacter[:])
+			nDst += 2
+		default:
+			dst[nDst] = c
+		}
+		n.prev = c
+		nDst++
+		nSrc++
+	}
+	if nSrc < len(src) {
+		err = transform.ErrShortDst
+	}
+	return
+}
+
+func (n *normalize) Reset() {
+	n.prev = 0
+}
diff --git a/tokenizer/doc.go b/tokenizer/doc.go
new file mode 100644
index 0000000..8da676c
--- /dev/null
+++ b/tokenizer/doc.go
@@ -0,0 +1,52 @@
+// Copyright 2018 Kane York.
+// Copyright 2012 The Gorilla Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+Package gorilla/css/tokenizer generates tokens for a CSS3 input.
+
+It follows the CSS3 specification located at:
+
+	http://www.w3.org/TR/css-syntax-3/#tokenizer-algorithms
+
+To use it, create a new tokenizer for a given CSS input and call Next() until
+the token returned is a "stop token":
+
+	s := tokenizer.NewTokenizer(strings.NewReader(myCSS))
+	for {
+		token := s.Next()
+		if token.Type.StopToken() {
+			break
+		}
+		// Do something with the token...
+	}
+
+If the consumer wants to accept malformed input, use the following check
+instead:
+
+	token := s.Next()
+	if token.Type == tokenizer.TokenEOF || token.Type == tokenizer.TokenError {
+		break
+	}
+
+The three potential tokenization errors are a "bad-escape" (backslash-newline
+outside a "string" or url() in the input), a "bad-string" (unescaped newline
+inside a "string"), and a "bad-url" (a few different cases). Parsers can
+choose to abort when seeing one of these errors, or ignore the declaration and
+attempt to recover.
+
+Returned tokens that carry extra information have a non-nil .Extra value. For
+TokenError, TokenBadEscape, TokenBadString, and TokenBadURI, the
+TokenExtraError type carries an `error` with informative text about the nature
+of the error. For TokenNumber, TokenPercentage, and TokenDimension, the
+TokenExtraNumeric specifies whether the number is integral, and for
+TokenDimension, contains the unit string (e.g. "px"). For TokenUnicodeRange,
+the TokenExtraUnicodeRange type contains the actual start and end values of the
+range.
+
+Note: the tokenizer doesn't perform parsing; it only implements Section 4
+(tokenization) of the CSS Syntax Level 3 specification. See Section 5 for the
+parsing rules.
+*/
+package tokenizer
diff --git a/tokenizer/fuzz.go b/tokenizer/fuzz.go
new file mode 100644
index 0000000..c04dfcb
--- /dev/null
+++ b/tokenizer/fuzz.go
@@ -0,0 +1,108 @@
+// Copyright 2018 Kane York.
+
+package tokenizer
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"reflect"
+)
+
+// Tests should set this to true to suppress fuzzer output except on failure.
+var fuzzNoPrint = false
+
+// Entry point for fuzz testing.
+func Fuzz(b []byte) int {
+	success := false
+
+	var testLogBuf bytes.Buffer
+	fuzzPrintf := func(f string, v ...interface{}) {
+		fmt.Fprintf(&testLogBuf, f, v...)
+ } + defer func() { + if !success { + fmt.Print(testLogBuf.String()) + } + }() + fuzzPrintf("=== Start fuzz test ===\n%s\n", b) + + var tokens []Token + tz := NewTokenizer(bytes.NewReader(b)) + for { + tt := tz.Next() + fuzzPrintf("[OT] %v\n", tt) + if tt.Type == TokenError { + // We should not have reading errors + panic(tt) + } else if tt.Type == TokenEOF { + break + } else { + tokens = append(tokens, tt) + } + } + + // Render and retokenize + + var wr TokenRenderer + var rerenderBuf bytes.Buffer + defer func() { + if !success { + fuzzPrintf("RE-RENDER BUFFER:\n%s\n", rerenderBuf.String()) + } + }() + pr, pw := io.Pipe() + defer pr.Close() + + go func() { + writeTarget := io.MultiWriter(&rerenderBuf, pw) + for _, v := range tokens { + wr.WriteTokenTo(writeTarget, v) + } + pw.Close() + }() + + tz = NewTokenizer(pr) + i := 0 + for { + for i < len(tokens) && tokens[i].Type == TokenComment { + i++ + } + tt := tz.Next() + fuzzPrintf("[RT] %v\n", tt) + if tt.Type == TokenComment { + // Ignore comments while comparing + continue + } + if tt.Type == TokenError { + panic(tt) + } + if tt.Type == TokenEOF { + if i != len(tokens) { + panic(fmt.Sprintf("unexpected EOF: got EOF from retokenizer, but original token stream is at %d/%d\n%v", i, len(tokens), tokens)) + } else { + break + } + } + if i == len(tokens) { + panic(fmt.Sprintf("expected EOF: reached end of original token stream but got %v from retokenizer\n%v", tt, tokens)) + } + + ot := tokens[i] + if tt.Type != ot.Type { + panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Type not equal)\n%v", tt, ot, tokens)) + } + if tt.Value != ot.Value && !tt.Type.StopToken() { + panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Value not equal)\n%v", tt, ot, tokens)) + } + if TokenExtraTypeLookup[tt.Type] != nil { + if !reflect.DeepEqual(tt, ot) && !tt.Type.StopToken() { + panic(fmt.Sprintf("retokenizer gave %v, expected %v (.Extra not equal)\n%v", tt, ot, tokens)) + } + } + i++ + continue + } + success = true + return 1 +} diff --git a/tokenizer/scanner_test.go b/tokenizer/scanner_test.go new file mode 100644 index 0000000..b89bcea --- /dev/null +++ b/tokenizer/scanner_test.go @@ -0,0 +1,161 @@ +// Copyright 2018 Kane York. +// Copyright 2012 The Gorilla Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package tokenizer + +import ( + "reflect" + "strings" + "testing" +) + +func TestMatchers(t *testing.T) { + // Fuzzer should not print during routine testing + fuzzNoPrint = true + + // Just basic checks, not exhaustive at all. 
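+	// Each checkMatch call passes an input string followed by the expected
+	// token stream as a flat list: a TokenType, then its string Value, then
+	// (only for types present in TokenExtraTypeLookup) the expected TokenExtra.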
+ checkMatch := func(s string, ttList ...interface{}) { + tz := NewTokenizer(strings.NewReader(s)) + + i := 0 + for i < len(ttList) { + tt := ttList[i].(TokenType) + tVal := ttList[i+1].(string) + var tExtra TokenExtra + if TokenExtraTypeLookup[tt] != nil { + tExtra = ttList[i+2].(TokenExtra) + } + if tok := tz.Next(); tok.Type != tt { + t.Errorf("did not match: %s (got %v, wanted %v)", s, tok, tt) + } else if tok.Value != tVal { + t.Errorf("did not match: %s (got %s, wanted %s): %v", s, tok.Value, tVal, tok) + } else if tExtra != nil && !reflect.DeepEqual(tok.Extra, tExtra) { + if tt.StopToken() && tt != TokenError && tt != TokenEOF { + // mismatch ok + } else { + t.Errorf("did not match .Extra: %s (got %#v, wanted %#v): %v", s, tok.Extra, tExtra, tok) + } + } + + i += 2 + if TokenExtraTypeLookup[tt] != nil { + i++ + } + } + + if tok := tz.Next(); tok.Type != TokenEOF { + t.Errorf("missing EOF after token %s, got %+v", s, tok) + if tok := tz.Next(); tok.Type != TokenEOF { + t.Errorf("double missing EOF after token %s, got %+v", s, tok) + } + } + + Fuzz([]byte(s)) + } + + checkMatch("abcd", TokenIdent, "abcd") + checkMatch(`"abcd"`, TokenString, `abcd`) + checkMatch(`"ab'cd"`, TokenString, `ab'cd`) + checkMatch(`"ab\"cd"`, TokenString, `ab"cd`) + checkMatch(`"ab\\cd"`, TokenString, `ab\cd`) + checkMatch("'abcd'", TokenString, "abcd") + checkMatch(`'ab"cd'`, TokenString, `ab"cd`) + checkMatch(`'ab\'cd'`, TokenString, `ab'cd`) + checkMatch(`'ab\\cd'`, TokenString, `ab\cd`) + checkMatch("#name", TokenHash, "name", &TokenExtraHash{IsIdentifier: true}) + checkMatch("##name", TokenDelim, "#", TokenHash, "name", &TokenExtraHash{IsIdentifier: true}) + checkMatch("#123", TokenHash, "123", &TokenExtraHash{IsIdentifier: false}) + checkMatch("42''", TokenNumber, "42", &TokenExtraNumeric{}, TokenString, "") + checkMatch("+42", TokenNumber, "+42", &TokenExtraNumeric{}) + checkMatch("-42", TokenNumber, "-42", &TokenExtraNumeric{}) + checkMatch("42.", TokenNumber, "42", &TokenExtraNumeric{}, TokenDelim, ".") + checkMatch("42.0", TokenNumber, "42.0", &TokenExtraNumeric{NonInteger: true}) + checkMatch("4.2", TokenNumber, "4.2", &TokenExtraNumeric{NonInteger: true}) + checkMatch(".42", TokenNumber, ".42", &TokenExtraNumeric{NonInteger: true}) + checkMatch("+.42", TokenNumber, "+.42", &TokenExtraNumeric{NonInteger: true}) + checkMatch("-.42", TokenNumber, "-.42", &TokenExtraNumeric{NonInteger: true}) + checkMatch("42%", TokenPercentage, "42", &TokenExtraNumeric{}) + checkMatch("4.2%", TokenPercentage, "4.2", &TokenExtraNumeric{NonInteger: true}) + checkMatch(".42%", TokenPercentage, ".42", &TokenExtraNumeric{NonInteger: true}) + checkMatch("42px", TokenDimension, "42", &TokenExtraNumeric{Dimension: "px"}) // TODO check the dimension stored in .Extra + + checkMatch("5e", TokenDimension, "5", &TokenExtraNumeric{Dimension: "e"}) + checkMatch("5e-", TokenDimension, "5", &TokenExtraNumeric{Dimension: "e-"}) + checkMatch("5e-3", TokenNumber, "5e-3", &TokenExtraNumeric{NonInteger: true}) + checkMatch("5e-\xf1", TokenDimension, "5", &TokenExtraNumeric{Dimension: "e-\xf1"}) + + checkMatch("url(http://domain.com)", TokenURI, "http://domain.com") + checkMatch("url( http://domain.com/uri/between/space )", TokenURI, "http://domain.com/uri/between/space") + checkMatch("url('http://domain.com/uri/between/single/quote')", TokenURI, "http://domain.com/uri/between/single/quote") + checkMatch(`url("http://domain.com/uri/between/double/quote")`, TokenURI, `http://domain.com/uri/between/double/quote`) + 
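+	// url() values containing parentheses: percent-encoded or backslash-escaped
+	// when bare, or written literally inside a quoted url() (cases below).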
+	checkMatch("url(http://domain.com/?parentheses=%28)", TokenURI, "http://domain.com/?parentheses=%28")
+	checkMatch("url( http://domain.com/?parentheses=%28&between=space )", TokenURI, "http://domain.com/?parentheses=%28&between=space")
+	checkMatch("url('http://domain.com/uri/(parentheses)/between/single/quote')", TokenURI, "http://domain.com/uri/(parentheses)/between/single/quote")
+	checkMatch(`url("http://domain.com/uri/(parentheses)/between/double/quote")`, TokenURI, `http://domain.com/uri/(parentheses)/between/double/quote`)
+	checkMatch(`url(http://domain.com/uri/\(bare%20escaped\)/parentheses)`, TokenURI, `http://domain.com/uri/(bare%20escaped)/parentheses`)
+	checkMatch("url(http://domain.com/uri/1)url(http://domain.com/uri/2)",
+		TokenURI, "http://domain.com/uri/1",
+		TokenURI, "http://domain.com/uri/2",
+	)
+	checkMatch("url(http://domain.com/uri/1) url(http://domain.com/uri/2)",
+		TokenURI, "http://domain.com/uri/1",
+		TokenS, " ",
+		TokenURI, "http://domain.com/uri/2",
+	)
+	checkMatch("U+0042", TokenUnicodeRange, "U+0042", &TokenExtraUnicodeRange{Start: 0x42, End: 0x42})
+	checkMatch("U+FFFFFF", TokenUnicodeRange, "U+FFFFFF", &TokenExtraUnicodeRange{Start: 0xFFFFFF, End: 0xFFFFFF})
+	checkMatch("U+??????", TokenUnicodeRange, "U+0000-FFFFFF", &TokenExtraUnicodeRange{Start: 0, End: 0xFFFFFF})
+	checkMatch("<!--", TokenCDO, "<!--")
+	checkMatch("-->", TokenCDC, "-->")
+	checkMatch(" \n \t \n", TokenS, "\n") // TODO - whitespace preservation
+	checkMatch("/**/", TokenComment, "")
+	checkMatch("/***/", TokenComment, "*")
+	checkMatch("/**", TokenComment, "*")
+	checkMatch("/*foo*/", TokenComment, "foo")
+	checkMatch("/* foo */", TokenComment, " foo ")
+	checkMatch("bar(", TokenFunction, "bar")
+	checkMatch("~=", TokenIncludes, "~=")
+	checkMatch("|=", TokenDashMatch, "|=")
+	checkMatch("||", TokenColumn, "||")
+	checkMatch("^=", TokenPrefixMatch, "^=")
+	checkMatch("$=", TokenSuffixMatch, "$=")
+	checkMatch("*=", TokenSubstringMatch, "*=")
+	checkMatch("{", TokenOpenBrace, "{")
+	// checkMatch("\uFEFF", TokenBOM, "\uFEFF")
+	checkMatch(`╯︵┻━┻"stuff"`, TokenIdent, "╯︵┻━┻", TokenString, "stuff")
+
+	checkMatch("foo { bar: rgb(255, 0, 127); }",
+		TokenIdent, "foo", TokenS, " ",
+		TokenOpenBrace, "{", TokenS, " ",
+		TokenIdent, "bar", TokenColon, ":", TokenS, " ",
+		TokenFunction, "rgb",
+		TokenNumber, "255", &TokenExtraNumeric{}, TokenComma, ",", TokenS, " ",
+		TokenNumber, "0", &TokenExtraNumeric{}, TokenComma, ",", TokenS, " ",
+		TokenNumber, "127", &TokenExtraNumeric{}, TokenCloseParen, ")",
+		TokenSemicolon, ";", TokenS, " ",
+		TokenCloseBrace, "}",
+	)
+	// Fuzzing results
+	checkMatch("ur(0", TokenFunction, "ur", TokenNumber, "0", &TokenExtraNumeric{})
+	checkMatch("1\\15", TokenDimension, "1", &TokenExtraNumeric{Dimension: "\x15"})
+	checkMatch("url(0t')", TokenBadURI, "0t", &TokenExtraError{})
+	checkMatch("uri/", TokenIdent, "uri", TokenDelim, "/")
+	checkMatch("\x00", TokenIdent, "\uFFFD")
+	checkMatch("a\\0", TokenIdent, "a\uFFFD")
+	checkMatch("b\\\\0", TokenIdent, "b\\0")
+	checkMatch("00\\d", TokenDimension, "00", &TokenExtraNumeric{Dimension: "\r"})
+	// note: \f is form feed, which is 0x0C
+	checkMatch("\\0\\0\\C\\\f\\\\0",
+		TokenIdent, "\uFFFD\uFFFD\x0C\x0C\\0")
+	// String running to EOF is success, not badstring
+	checkMatch("\"a0\\d", TokenString, "a0\x0D")
+	checkMatch("\"a0\r", TokenBadString, "a0", &TokenExtraError{}, TokenS, "\n")
+	checkMatch("\\fun(", TokenFunction, "\x0fun")
+	checkMatch("\"abc\\\"def\nghi", TokenBadString, "abc\"def", &TokenExtraError{}, TokenS, "\n", TokenIdent, "ghi")
+	//
checkMatch("---\\\x18-00", TokenDelim, "-", TokenDelim, "-", TokenIdent, "-\x18-00") + Fuzz([]byte( + `#sw_tfbb,#id_d{display:none}.sw_pref{border-style:solid;border-width:7px 0 7px 10px;vertical-align:bottom}#b_tween{margin-top:-28px}#b_tween>span{line-height:30px}#b_tween .ftrH{line-height:30px;height:30px}input{font:inherit;font-size:100%}.b_searchboxForm{font:18px/normal 'Segoe UI',Arial,Helvetica,Sans-Serif}.b_beta{font:11px/normal Arial,Helvetica,Sans-Serif}.b_scopebar,.id_button{line-height:30px}.sa_ec{font:13px Arial,Helvetica,Sans-Serif}#sa_ul .sa_hd{font-size:11px;line-height:16px}#sw_as strong{font-family:'Segoe UI Semibold',Arial,Helvetica,Sans-Serif}#id_h{background-color:transparent!important;position:relativ e!important;float:right;height:35px!important;width:280px!important}.sw_pref{margin:0 15px 3px 0}#id_d{left:auto;right:26px;top:35px!important}.id_avatar{vertical-align:middle;margin:10px 0 10px 10px}`), + ) +} diff --git a/tokenizer/testdata/fuzz/corpus/test-1 b/tokenizer/testdata/fuzz/corpus/test-1 new file mode 100644 index 0000000..85df507 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-1 @@ -0,0 +1 @@ +abcd \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-10 b/tokenizer/testdata/fuzz/corpus/test-10 new file mode 100644 index 0000000..1340eb3 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-10 @@ -0,0 +1 @@ +#name \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-11 b/tokenizer/testdata/fuzz/corpus/test-11 new file mode 100644 index 0000000..5dbaeac --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-11 @@ -0,0 +1 @@ +##name \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-12 b/tokenizer/testdata/fuzz/corpus/test-12 new file mode 100644 index 0000000..88cde2c --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-12 @@ -0,0 +1 @@ +42'' \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-13 b/tokenizer/testdata/fuzz/corpus/test-13 new file mode 100644 index 0000000..947355b --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-13 @@ -0,0 +1 @@ ++42 \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-14 b/tokenizer/testdata/fuzz/corpus/test-14 new file mode 100644 index 0000000..67f7ad0 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-14 @@ -0,0 +1 @@ +-42 \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-15 b/tokenizer/testdata/fuzz/corpus/test-15 new file mode 100644 index 0000000..8012ebb --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-15 @@ -0,0 +1 @@ +4.2 \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-16 b/tokenizer/testdata/fuzz/corpus/test-16 new file mode 100644 index 0000000..0401f1e --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-16 @@ -0,0 +1 @@ +.42 \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-17 b/tokenizer/testdata/fuzz/corpus/test-17 new file mode 100644 index 0000000..f8c987b --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-17 @@ -0,0 +1 @@ ++.42 \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-18 b/tokenizer/testdata/fuzz/corpus/test-18 new file mode 100644 index 0000000..3273e87 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-18 @@ -0,0 +1 @@ +-.42 \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-19 b/tokenizer/testdata/fuzz/corpus/test-19 new file mode 100644 index 0000000..67a9ae6 --- /dev/null +++ 
b/tokenizer/testdata/fuzz/corpus/test-19 @@ -0,0 +1 @@ +42% \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-2 b/tokenizer/testdata/fuzz/corpus/test-2 new file mode 100644 index 0000000..af3501d --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-2 @@ -0,0 +1 @@ +"abcd" \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-20 b/tokenizer/testdata/fuzz/corpus/test-20 new file mode 100644 index 0000000..d44e379 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-20 @@ -0,0 +1 @@ +4.2% \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-21 b/tokenizer/testdata/fuzz/corpus/test-21 new file mode 100644 index 0000000..61542cd --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-21 @@ -0,0 +1 @@ +.42% \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-22 b/tokenizer/testdata/fuzz/corpus/test-22 new file mode 100644 index 0000000..9996f64 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-22 @@ -0,0 +1 @@ +42px \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-23 b/tokenizer/testdata/fuzz/corpus/test-23 new file mode 100644 index 0000000..6b16595 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-23 @@ -0,0 +1 @@ +url(http://domain.com) \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-24 b/tokenizer/testdata/fuzz/corpus/test-24 new file mode 100644 index 0000000..849a2f0 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-24 @@ -0,0 +1 @@ +url( http://domain.com/uri/between/space ) \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-25 b/tokenizer/testdata/fuzz/corpus/test-25 new file mode 100644 index 0000000..9efe089 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-25 @@ -0,0 +1 @@ +url('http://domain.com/uri/between/single/quote') \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-26 b/tokenizer/testdata/fuzz/corpus/test-26 new file mode 100644 index 0000000..5d37d41 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-26 @@ -0,0 +1 @@ +url("http://domain.com/uri/between/double/quote") \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-27 b/tokenizer/testdata/fuzz/corpus/test-27 new file mode 100644 index 0000000..4b67378 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-27 @@ -0,0 +1 @@ +url(http://domain.com/?parentheses=%28) \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-28 b/tokenizer/testdata/fuzz/corpus/test-28 new file mode 100644 index 0000000..7b0f6cb --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-28 @@ -0,0 +1 @@ +url( http://domain.com/?parentheses=%28&between=space ) \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-29 b/tokenizer/testdata/fuzz/corpus/test-29 new file mode 100644 index 0000000..e548025 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-29 @@ -0,0 +1 @@ +url('http://domain.com/uri/(parentheses)/between/single/quote') \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-3 b/tokenizer/testdata/fuzz/corpus/test-3 new file mode 100644 index 0000000..7d12177 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-3 @@ -0,0 +1 @@ +"ab'cd" \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-30 b/tokenizer/testdata/fuzz/corpus/test-30 new file mode 100644 index 0000000..4280336 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-30 @@ -0,0 +1 @@ 
+url("http://domain.com/uri/(parentheses)/between/double/quote") \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-31 b/tokenizer/testdata/fuzz/corpus/test-31 new file mode 100644 index 0000000..5416922 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-31 @@ -0,0 +1 @@ +url(http://domain.com/uri/\(bare%20escaped\)/parentheses) \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-32 b/tokenizer/testdata/fuzz/corpus/test-32 new file mode 100644 index 0000000..43d5b7d --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-32 @@ -0,0 +1 @@ +url(http://domain.com/uri/1)url(http://domain.com/uri/2) \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-33 b/tokenizer/testdata/fuzz/corpus/test-33 new file mode 100644 index 0000000..7871a01 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-33 @@ -0,0 +1 @@ +url(http://domain.com/uri/1) url(http://domain.com/uri/2) \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-34 b/tokenizer/testdata/fuzz/corpus/test-34 new file mode 100644 index 0000000..335d730 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-34 @@ -0,0 +1 @@ +U+0042 \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-35 b/tokenizer/testdata/fuzz/corpus/test-35 new file mode 100644 index 0000000..3e4e3d6 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-35 @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-37 b/tokenizer/testdata/fuzz/corpus/test-37 new file mode 100644 index 0000000..65ec83f --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-37 @@ -0,0 +1,2 @@ + + diff --git a/tokenizer/testdata/fuzz/corpus/test-38 b/tokenizer/testdata/fuzz/corpus/test-38 new file mode 100644 index 0000000..7068cde --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-38 @@ -0,0 +1 @@ +/**/ \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-39 b/tokenizer/testdata/fuzz/corpus/test-39 new file mode 100644 index 0000000..112b37c --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-39 @@ -0,0 +1 @@ +/*foo*/ \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-4 b/tokenizer/testdata/fuzz/corpus/test-4 new file mode 100644 index 0000000..cf25faa --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-4 @@ -0,0 +1 @@ +"ab\"cd" \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-40 b/tokenizer/testdata/fuzz/corpus/test-40 new file mode 100644 index 0000000..785ae54 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-40 @@ -0,0 +1 @@ +/* foo */ \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-41 b/tokenizer/testdata/fuzz/corpus/test-41 new file mode 100644 index 0000000..adaa030 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-41 @@ -0,0 +1 @@ +bar( \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-42 b/tokenizer/testdata/fuzz/corpus/test-42 new file mode 100644 index 0000000..c926849 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-42 @@ -0,0 +1 @@ +~= \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-43 b/tokenizer/testdata/fuzz/corpus/test-43 new file mode 100644 index 0000000..279d9d3 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-43 @@ -0,0 +1 @@ +|= \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-44 b/tokenizer/testdata/fuzz/corpus/test-44 new file mode 100644 index 0000000..27cc728 --- /dev/null +++ 
b/tokenizer/testdata/fuzz/corpus/test-44 @@ -0,0 +1 @@ +|| \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-45 b/tokenizer/testdata/fuzz/corpus/test-45 new file mode 100644 index 0000000..49bae17 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-45 @@ -0,0 +1 @@ +^= \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-46 b/tokenizer/testdata/fuzz/corpus/test-46 new file mode 100644 index 0000000..3b0d355 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-46 @@ -0,0 +1 @@ +$= \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-47 b/tokenizer/testdata/fuzz/corpus/test-47 new file mode 100644 index 0000000..d2215e3 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-47 @@ -0,0 +1 @@ +*= \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-48 b/tokenizer/testdata/fuzz/corpus/test-48 new file mode 100644 index 0000000..81750b9 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-48 @@ -0,0 +1 @@ +{ \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-49 b/tokenizer/testdata/fuzz/corpus/test-49 new file mode 100644 index 0000000..e9a4a3c --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-49 @@ -0,0 +1 @@ +╯︵┻━┻"stuff" \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-5 b/tokenizer/testdata/fuzz/corpus/test-5 new file mode 100644 index 0000000..bf1efad --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-5 @@ -0,0 +1 @@ +"ab\\cd" \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-6 b/tokenizer/testdata/fuzz/corpus/test-6 new file mode 100644 index 0000000..62fe39b --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-6 @@ -0,0 +1 @@ +'abcd' \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-7 b/tokenizer/testdata/fuzz/corpus/test-7 new file mode 100644 index 0000000..099e0f4 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-7 @@ -0,0 +1 @@ +'ab"cd' \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-8 b/tokenizer/testdata/fuzz/corpus/test-8 new file mode 100644 index 0000000..db5af35 --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-8 @@ -0,0 +1 @@ +'ab\'cd' \ No newline at end of file diff --git a/tokenizer/testdata/fuzz/corpus/test-9 b/tokenizer/testdata/fuzz/corpus/test-9 new file mode 100644 index 0000000..17559ae --- /dev/null +++ b/tokenizer/testdata/fuzz/corpus/test-9 @@ -0,0 +1 @@ +'ab\\cd' \ No newline at end of file diff --git a/tokenizer/token.go b/tokenizer/token.go new file mode 100644 index 0000000..04fcee8 --- /dev/null +++ b/tokenizer/token.go @@ -0,0 +1,638 @@ +// Copyright 2018 Kane York. +// Copyright 2012 The Gorilla Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package tokenizer + +import ( + "bytes" + "fmt" + "io" + "strings" + "unicode/utf8" +) + +// TokenType identifies the type of lexical tokens. +type TokenType int + +// String returns a string representation of the token type. +func (t TokenType) String() string { + return tokenNames[t] +} + +// Stop tokens are TokenError, TokenEOF, TokenBadEscape, +// TokenBadString, TokenBadURI. A consumer that does not want to tolerate +// parsing errors should stop parsing when this returns true. 
+func (t TokenType) StopToken() bool { + return t == TokenError || t == TokenEOF || t == TokenBadEscape || t == + TokenBadString || t == TokenBadURI +} + +// ParseError represents a CSS syntax error. +type ParseError struct { + Type TokenType + Message string + Loc int +} + +// implements error +func (e *ParseError) Error() string { + return e.Message +} + +// Token represents a token in the CSS syntax. +type Token struct { + Type TokenType + // A string representation of the token value that depends on the type. + // For example, for a TokenURI, the Value is the URI itself. For a + // TokenPercentage, the Value is the number without the percent sign. + Value string + // Extra data for the token beyond a simple string. Will always be a + // pointer to a "TokenExtra*" type in this package. + Extra TokenExtra +} + +// The complete list of tokens in CSS Syntax Level 3. +const ( + // Scanner flags. + TokenError TokenType = iota + TokenEOF + + // Tokens + TokenIdent + TokenFunction + TokenURI + TokenDelim // Single character + TokenAtKeyword + TokenString + TokenS // Whitespace + // CSS Syntax Level 3 removes comments from the token stream, but they are + // preserved here. + TokenComment + + // Extra data: TokenExtraHash + TokenHash + // Extra data: TokenExtraNumeric + TokenNumber + TokenPercentage + TokenDimension + // Extra data: TokenExtraUnicodeRange + TokenUnicodeRange + + // Error tokens + TokenBadString + TokenBadURI + TokenBadEscape // a '\' right before a newline + + // Fixed-string tokens + TokenIncludes + TokenDashMatch + TokenPrefixMatch + TokenSuffixMatch + TokenSubstringMatch + TokenColumn + TokenColon + TokenSemicolon + TokenComma + TokenOpenBracket + TokenCloseBracket + TokenOpenParen + TokenCloseParen + TokenOpenBrace + TokenCloseBrace + TokenCDO + TokenCDC +) + +// backwards compatibility +const TokenChar = TokenDelim + +// tokenNames maps tokenType's to their names. Used for conversion to string. +var tokenNames = map[TokenType]string{ + TokenError: "error", + TokenEOF: "EOF", + TokenIdent: "IDENT", + TokenAtKeyword: "ATKEYWORD", + TokenString: "STRING", + TokenHash: "HASH", + TokenNumber: "NUMBER", + TokenPercentage: "PERCENTAGE", + TokenDimension: "DIMENSION", + TokenURI: "URI", + TokenUnicodeRange: "UNICODE-RANGE", + TokenCDO: "CDO", + TokenCDC: "CDC", + TokenS: "S", + TokenComment: "COMMENT", + TokenFunction: "FUNCTION", + TokenIncludes: "INCLUDES", + TokenDashMatch: "DASHMATCH", + TokenPrefixMatch: "PREFIXMATCH", + TokenSuffixMatch: "SUFFIXMATCH", + TokenSubstringMatch: "SUBSTRINGMATCH", + TokenDelim: "DELIM", + TokenBadString: "BAD-STRING", + TokenBadURI: "BAD-URI", + TokenBadEscape: "BAD-ESCAPE", + TokenColumn: "COLUMN", + TokenColon: "COLON", + TokenSemicolon: "SEMICOLON", + TokenComma: "COMMA", + TokenOpenBracket: "LEFT-BRACKET", // [] + TokenCloseBracket: "RIGHT-BRACKET", + TokenOpenParen: "LEFT-PAREN", // () + TokenCloseParen: "RIGHT-PAREN", + TokenOpenBrace: "LEFT-BRACE", // {} + TokenCloseBrace: "RIGHT-BRACE", +} + +// TokenExtra fills the .Extra field of a token. Consumers should perform a +// type cast to the proper type to inspect its data. +type TokenExtra interface { + String() string +} + +// TokenExtraTypeLookup provides a handy check for whether a given token type +// should contain extra data. 
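+//
+// For example, a consumer can use it to decide whether to inspect .Extra:
+//
+//	if TokenExtraTypeLookup[tok.Type] != nil {
+//		// tok.Extra is non-nil, e.g. *TokenExtraNumeric for TokenDimension
+//	}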
+var TokenExtraTypeLookup = map[TokenType]TokenExtra{ + TokenError: &TokenExtraError{}, + TokenBadEscape: &TokenExtraError{}, + TokenBadString: &TokenExtraError{}, + TokenBadURI: &TokenExtraError{}, + TokenHash: &TokenExtraHash{}, + TokenNumber: &TokenExtraNumeric{}, + TokenPercentage: &TokenExtraNumeric{}, + TokenDimension: &TokenExtraNumeric{}, + TokenUnicodeRange: &TokenExtraUnicodeRange{}, +} + +// TokenExtraHash is attached to TokenHash. +type TokenExtraHash struct { + IsIdentifier bool +} + +// Returns a descriptive string, either "unrestricted" or "id". +func (e *TokenExtraHash) String() string { + if e == nil || !e.IsIdentifier { + return "unrestricted" + } else { + return "id" + } +} + +// TokenExtraNumeric is attached to TokenNumber, TokenPercentage, and +// TokenDimension. +type TokenExtraNumeric struct { + // Value float64 // omitted from this implementation + NonInteger bool + Dimension string +} + +// Returns the Dimension field. +func (e *TokenExtraNumeric) String() string { + if e == nil { + return "" + } + return e.Dimension +} + +// TokenExtraUnicodeRange is attached to a TokenUnicodeRange. +type TokenExtraUnicodeRange struct { + Start rune + End rune +} + +// Returns a valid CSS representation of the token. +func (e *TokenExtraUnicodeRange) String() string { + if e == nil { + panic("TokenExtraUnicodeRange: unexpected nil pointer value") + } + + if e.Start == e.End { + return fmt.Sprintf("U+%04X", e.Start) + } else { + return fmt.Sprintf("U+%04X-%04X", e.Start, e.End) + } +} + +// TokenExtraError is attached to a TokenError and contains the same value as +// Tokenizer.Err(). See also the ParseError type and ParseError.Recoverable(). +type TokenExtraError struct { + Err error +} + +// Returns Err.Error(). +func (e *TokenExtraError) String() string { + return e.Err.Error() +} + +// Error implements error. +func (e *TokenExtraError) Error() string { + return e.Err.Error() +} + +// Cause implements errors.Causer. +func (e *TokenExtraError) Cause() error { + return e.Err +} + +// Returns the ParseError object, if present. 
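+// Returns nil when the underlying error is not a *ParseError (e.g. an I/O error).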
+func (e *TokenExtraError) ParseError() *ParseError { + pe, ok := e.Err.(*ParseError) + if !ok { + return nil + } + return pe +} + +func escapeIdentifier(s string) string { return escapeIdent(s, 0) } +func escapeHashName(s string) string { return escapeIdent(s, 1) } +func escapeDimension(s string) string { return escapeIdent(s, 2) } + +func needsHexEscaping(c byte, mode int) bool { + if c < 0x20 { + return true + } + if c >= utf8.RuneSelf { + return false + } + if mode == 2 { + if c == 'e' || c == 'E' { + return true + } + } + if c == '\\' { + return true + } + if isNameCode(c) { + return false + } + return true +} + +func escapeIdent(s string, mode int) string { + if s == "" { + return "" + } + var buf bytes.Buffer + buf.Grow(len(s)) + anyChanges := false + + var i int + + // Handle first character + // dashes allowed at start only for TokenIdent-ish + // eE not allowed at start for Dimension + if mode != 1 { + if !isNameStart(s[0]) && s[0] != '-' && s[0] != 'e' && s[0] != 'E' { + if needsHexEscaping(s[0], mode) { + fmt.Fprintf(&buf, "\\%X ", s[0]) + anyChanges = true + } else { + buf.WriteByte('\\') + buf.WriteByte(s[0]) + anyChanges = true + } + } else if s[0] == 'e' || s[0] == 'E' { + if mode == 2 { + fmt.Fprintf(&buf, "\\%X ", s[0]) + anyChanges = true + } else { + buf.WriteByte(s[0]) + } + } else if s[0] == '-' { + if len(s) == 1 { + return "\\-" + } else if isNameStart(s[1]) { + buf.WriteByte('-') + } else { + buf.WriteString("\\-") + anyChanges = true + } + } else { + buf.WriteByte(s[0]) + } + i = 1 + } else { + i = 0 + } + // Write the rest of the name + for ; i < len(s); i++ { + if !isNameCode(s[i]) { + fmt.Fprintf(&buf, "\\%X ", s[i]) + anyChanges = true + } else { + buf.WriteByte(s[i]) + } + } + + if !anyChanges { + return s + } + return buf.String() +} + +func escapeString(s string, delim byte) string { + var buf bytes.Buffer + if delim != 0 { + buf.WriteByte(delim) + } + for i := 0; i < len(s); i++ { + switch s[i] { + case '"': + buf.WriteString("\\\"") + continue + case delim: + buf.WriteByte('\\') + buf.WriteByte(delim) + continue + case '\n': + buf.WriteString("\\0A ") + continue + case '\r': + buf.WriteString("\\0D ") + continue + case '\\': + buf.WriteString("\\\\") + continue + } + if s[i] < utf8.RuneSelf && isNonPrintable(s[i]) { + fmt.Fprintf(&buf, "\\%X ", s[i]) + continue + } + buf.WriteByte(s[i]) + } + if delim != 0 { + buf.WriteByte(delim) + } + return buf.String() +} + +// Return the CSS source representation of the token. (Wrapper around +// WriteTo.) +func (t *Token) Render() string { + var buf bytes.Buffer + _, _ = t.WriteTo(&buf) + return buf.String() +} + +func stickyWriteString(n *int64, err *error, w io.Writer, s string) { + n2, err2 := io.WriteString(w, s) + *n += int64(n2) + if err2 != nil { + if *err != nil { + *err = err2 + } + } +} + +// Write the CSS source representation of the token to the provided writer. If +// you are attempting to render a series of tokens, see the TokenRenderer type +// to handle comment insertion rules. +// +// Tokens with type TokenError do not write anything. 
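+//
+// For example, a TokenURI with Value "http://example.com/" renders as
+//
+//	url("http://example.com/")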
+func (t *Token) WriteTo(w io.Writer) (n int64, err error) { + switch t.Type { + case TokenError: + return + case TokenEOF: + return + case TokenIdent: + stickyWriteString(&n, &err, w, escapeIdentifier(t.Value)) + return + case TokenAtKeyword: + stickyWriteString(&n, &err, w, "@") + stickyWriteString(&n, &err, w, escapeIdentifier(t.Value)) + return + case TokenDelim: + if t.Value == "\\" { + // nb: should not happen, this is actually TokenBadEscape + stickyWriteString(&n, &err, w, "\\\n") + } else { + stickyWriteString(&n, &err, w, t.Value) + } + return + case TokenHash: + e := t.Extra.(*TokenExtraHash) + stickyWriteString(&n, &err, w, "#") + if e.IsIdentifier { + stickyWriteString(&n, &err, w, escapeIdentifier(t.Value)) + } else { + stickyWriteString(&n, &err, w, escapeHashName(t.Value)) + } + return + case TokenPercentage: + stickyWriteString(&n, &err, w, t.Value) + stickyWriteString(&n, &err, w, "%") + return + case TokenDimension: + e := t.Extra.(*TokenExtraNumeric) + stickyWriteString(&n, &err, w, t.Value) + stickyWriteString(&n, &err, w, escapeDimension(e.Dimension)) + return + case TokenString: + stickyWriteString(&n, &err, w, escapeString(t.Value, '"')) + return + case TokenURI: + stickyWriteString(&n, &err, w, "url(") + stickyWriteString(&n, &err, w, escapeString(t.Value, '"')) + stickyWriteString(&n, &err, w, ")") + return + case TokenUnicodeRange: + stickyWriteString(&n, &err, w, t.Extra.String()) + return + case TokenComment: + stickyWriteString(&n, &err, w, "/*") + stickyWriteString(&n, &err, w, t.Value) + stickyWriteString(&n, &err, w, "*/") + return + case TokenFunction: + stickyWriteString(&n, &err, w, escapeIdentifier(t.Value)) + stickyWriteString(&n, &err, w, "(") + return + case TokenBadEscape: + stickyWriteString(&n, &err, w, "\\\n") + return + case TokenBadString: + stickyWriteString(&n, &err, w, "\"") + stickyWriteString(&n, &err, w, escapeString(t.Value, 0)) + stickyWriteString(&n, &err, w, "\n") + return + case TokenBadURI: + stickyWriteString(&n, &err, w, "url(\"") + str := escapeString(t.Value, 0) + str = strings.TrimSuffix(str, "\"") + stickyWriteString(&n, &err, w, str) + stickyWriteString(&n, &err, w, "\n)") + return + default: + stickyWriteString(&n, &err, w, t.Value) + return + } +} + +// TokenRenderer takes care of the comment insertion rules for serialization. +// This type is mostly intended for the fuzz test and not for general +// consumption, but it can be used by consumers that want to re-render a parse +// stream. +type TokenRenderer struct { + lastToken Token +} + +// Write a token to the given io.Writer, potentially inserting an empty comment +// in front based on what the previous token was. 
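+//
+// A minimal re-rendering loop (as used by the fuzz test) looks like:
+//
+//	var r TokenRenderer
+//	for _, tok := range tokens {
+//		r.WriteTokenTo(w, tok)
+//	}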
+func (r *TokenRenderer) WriteTokenTo(w io.Writer, t Token) (n int64, err error) { + var prevKey, curKey interface{} + if r.lastToken.Type == TokenDelim { + prevKey = r.lastToken.Value[0] + } else { + prevKey = r.lastToken.Type + } + if t.Type == TokenDelim { + curKey = t.Value[0] + } else { + curKey = t.Type + } + + m1, ok := commentInsertionRules[prevKey] + if ok { + if m1[curKey] { + stickyWriteString(&n, &err, w, "/**/") + } + } + + n2, err2 := t.WriteTo(w) + r.lastToken = t + + n += n2 + if err2 != nil && err == nil { + err = err2 + } + return n, err +} + +// CSS Syntax Level 3 - Section 9 + +var commentInsertionThruCDC = map[interface{}]bool{ + TokenIdent: true, + TokenFunction: true, + TokenURI: true, + TokenBadURI: true, + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + TokenUnicodeRange: true, + TokenCDC: true, + '-': true, + '(': false, +} + +var commentInsertionRules = map[interface{}]map[interface{}]bool{ + TokenIdent: map[interface{}]bool{ + TokenIdent: true, + TokenFunction: true, + TokenURI: true, + TokenBadURI: true, + '-': true, + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + TokenUnicodeRange: true, + TokenCDC: true, + '(': true, + }, + TokenAtKeyword: commentInsertionThruCDC, + TokenHash: commentInsertionThruCDC, + TokenDimension: commentInsertionThruCDC, + '#': map[interface{}]bool{ + TokenIdent: true, + TokenFunction: true, + TokenURI: true, + TokenBadURI: true, + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + TokenUnicodeRange: true, + TokenCDC: false, + '-': true, + '(': false, + }, + '-': map[interface{}]bool{ + TokenIdent: true, + TokenFunction: true, + TokenURI: true, + TokenBadURI: true, + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + TokenUnicodeRange: true, + TokenCDC: false, + '-': false, + '(': false, + }, + TokenNumber: map[interface{}]bool{ + TokenIdent: true, + TokenFunction: true, + TokenURI: true, + TokenBadURI: true, + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + TokenUnicodeRange: true, + TokenCDC: false, + '-': false, + '(': false, + }, + '@': map[interface{}]bool{ + TokenIdent: true, + TokenFunction: true, + TokenURI: true, + TokenBadURI: true, + TokenNumber: false, + TokenPercentage: false, + TokenDimension: false, + TokenUnicodeRange: true, + TokenCDC: false, + '-': true, + '(': false, + }, + TokenUnicodeRange: map[interface{}]bool{ + TokenIdent: true, + TokenFunction: true, + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + TokenUnicodeRange: false, + '?': true, + }, + '.': map[interface{}]bool{ + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + }, + '+': map[interface{}]bool{ + TokenNumber: true, + TokenPercentage: true, + TokenDimension: true, + }, + '$': map[interface{}]bool{ + '=': true, + }, + '*': map[interface{}]bool{ + '=': true, + }, + '^': map[interface{}]bool{ + '=': true, + }, + '~': map[interface{}]bool{ + '=': true, + }, + '|': map[interface{}]bool{ + '=': true, + '|': true, + }, + '/': map[interface{}]bool{ + '*': true, + }, +} diff --git a/tokenizer/tokenizer.go b/tokenizer/tokenizer.go new file mode 100644 index 0000000..fc67a13 --- /dev/null +++ b/tokenizer/tokenizer.go @@ -0,0 +1,919 @@ +// Copyright (c) 2018 Kane York. Licensed under 2-Clause BSD. 
+ +package tokenizer + +import ( + "bufio" + "bytes" + "fmt" + "io" + "strconv" + "strings" + "unicode/utf8" + + "golang.org/x/text/transform" +) + +var ( + errBadEscape = &ParseError{Type: TokenBadEscape, Message: "bad escape (backslash-newline) in input"} +) + +// Tokenizer scans an input and emits tokens following the CSS Syntax Level 3 +// specification. +type Tokenizer struct { + r *bufio.Reader + err error + peek [3]byte + + // ErrorMode int + + tok Token +} + +/* +const ( + // Default error mode - tokenization errors are represented as special tokens in the stream, and I/O errors are TokenError. + ErrorModeTokens = iota + ErrorModeFatal +) +*/ + +// Construct a Tokenizer from the given input. Input need not be 'normalized' +// according to the spec (newlines changed to \n, zero bytes changed to +// U+FFFD). +func NewTokenizer(r io.Reader) *Tokenizer { + return &Tokenizer{ + r: bufio.NewReader(transform.NewReader(r, new(normalize))), + } +} + +// Scan for the next token. If the tokenizer is in an error state, no input +// will be consumed. +func (z *Tokenizer) Scan() { + defer func() { + rec := recover() + if rErr, ok := rec.(error); ok { + // we only ever panic(err) + z.err = rErr + z.tok = Token{ + Type: TokenError, + Extra: &TokenExtraError{Err: z.err}, + } + } else if rec != nil { + panic(rec) + } + }() + + if z.err == nil { + z.tok = z.consume() + } else if z.err == io.EOF { + z.tok = Token{ + Type: TokenEOF, + } + } else { + z.tok = Token{ + Type: TokenError, + Value: z.err.Error(), + Extra: &TokenExtraError{Err: z.err}, + } + } +} + +// Get the most recently scanned token. +func (z *Tokenizer) Token() Token { + return z.tok +} + +// Scan for the next token and return it. +func (z *Tokenizer) Next() Token { + z.Scan() + return z.tok +} + +// Err returns the last input reading error to be encountered. It is filled +// when TokenError is returned. +func (z *Tokenizer) Err() error { + return z.err +} + +// repeek reads the next 3 bytes into the tokenizer. on EOF, the bytes are +// filled with zeroes. (Null bytes in the input are preprocessed into U+FFFD.) 
+func (z *Tokenizer) repeek() { + by, err := z.r.Peek(3) + if err != nil && err != io.EOF { + panic(err) + } + copy(z.peek[:], by) + + // zero fill on EOF + i := len(by) + for i < 3 { + z.peek[i] = 0 + i++ + } +} + +// §4.3.8 +// up to 2 bytes +func isValidEscape(p []byte) bool { + if len(p) < 2 { + return false + } + if p[0] != '\\' { + return false + } + if p[1] == '\n' { + return false + } + return true +} + +// §4.3.9 +func isNameStart(p byte) bool { + if p >= utf8.RuneSelf { + return true // any high code points + } + if p == '_' { + return true + } + if p >= 'A' && p <= 'Z' { + return true + } + if p >= 'a' && p <= 'z' { + return true + } + return false +} + +func isNameCode(p byte) bool { + if p >= utf8.RuneSelf { + return true // any high code points + } + if p == '_' || p == '-' { + return true + } + if p >= 'A' && p <= 'Z' { + return true + } + if p >= 'a' && p <= 'z' { + return true + } + if p >= '0' && p <= '9' { + return true + } + return false +} + +func isHexDigit(p byte) bool { + if p >= 'A' && p <= 'F' { + return true + } + if p >= 'a' && p <= 'f' { + return true + } + if p >= '0' && p <= '9' { + return true + } + return false +} + +// up to 3 bytes +func isStartIdentifier(p []byte) bool { + if p[0] == '-' { + p = p[1:] + } + if isNameStart(p[0]) { + return true + } else if isValidEscape(p) { + return true + } + return false +} + +// §4.3.10 +// up to 3 bytes +func isStartNumber(p []byte) bool { + if p[0] == '+' || p[0] == '-' { + p = p[1:] + } + if p[0] == '.' { + p = p[1:] + } + if p[0] >= '0' && p[0] <= '9' { + return true + } + return false +} + +func isNonPrintable(by byte) bool { + return (0 <= by && by <= 0x08) || (0x0B == by) || (0x0E <= by && by <= 0x1F) || (0x7F == by) +} + +// repeek must be called before the following: + +func (z *Tokenizer) nextIsEscape() bool { + return isValidEscape(z.peek[:2]) +} + +func (z *Tokenizer) nextStartsIdentifier() bool { + return isStartIdentifier(z.peek[:3]) +} + +func (z *Tokenizer) nextIsNumber() bool { + return isStartNumber(z.peek[:3]) +} + +func (z *Tokenizer) nextCompare(vs string) bool { + return string(z.peek[:len(vs)]) == vs +} + +var premadeTokens = map[byte]Token{ + '$': Token{ + Type: TokenSuffixMatch, + Value: "$=", + }, + '*': Token{ + Type: TokenSubstringMatch, + Value: "*=", + }, + '^': Token{ + Type: TokenPrefixMatch, + Value: "^=", + }, + '~': Token{ + Type: TokenIncludes, + Value: "~=", + }, + '(': Token{Type: TokenOpenParen, Value: "("}, + ')': Token{Type: TokenCloseParen, Value: ")"}, + '[': Token{Type: TokenOpenBracket, Value: "["}, + ']': Token{Type: TokenCloseBracket, Value: "]"}, + '{': Token{Type: TokenOpenBrace, Value: "{"}, + '}': Token{Type: TokenCloseBrace, Value: "}"}, + ':': Token{Type: TokenColon, Value: ":"}, + ';': Token{Type: TokenSemicolon, Value: ";"}, + ',': Token{Type: TokenComma, Value: ","}, + + '\\': Token{Type: TokenBadEscape, Value: "\\"}, + + 'A': Token{Type: TokenDashMatch, Value: "|="}, + 'B': Token{Type: TokenColumn, Value: "||"}, + 'C': Token{Type: TokenCDC, Value: "-->"}, + 'O': Token{Type: TokenCDO, Value: "") { + z.r.Discard(3) + return premadeTokens['C'] + } + z.nextByte() // re-read, fall down to TokenDelim + case '.': + z.unreadByte() + z.repeek() + if z.nextIsNumber() { + return z.consumeNumeric() + } + z.nextByte() // re-read, fall down to TokenDelim + case '/': + z.repeek() + if z.peek[0] == '*' { + z.r.Discard(1) + return z.consumeComment() + } + case '<': + z.repeek() + if z.nextCompare("!--") { + z.r.Discard(3) + return premadeTokens['O'] + } + case '@': + z.repeek() 
+ if z.nextStartsIdentifier() { + s := z.consumeName() + return Token{ + Type: TokenAtKeyword, + Value: s, + } + } + case '\\': + z.unreadByte() + z.repeek() + if z.nextIsEscape() { + // input stream has a valid escape + return z.consumeIdentish() + } + z.nextByte() + // z.err = errBadEscape + return premadeTokens['\\'] + case 'U', 'u': + z.unreadByte() + z.repeek() + if z.peek[1] == '+' && (isHexDigit(z.peek[2]) || (z.peek[2] == '?')) { + z.r.Discard(2) // (!) only discard the U+ + return z.consumeUnicodeRange() + } + break + } + + if '0' <= ch && ch <= '9' { + z.unreadByte() + return z.consumeNumeric() + } + if isNameStart(ch) { + z.unreadByte() + return z.consumeIdentish() + } + return Token{ + Type: TokenDelim, + Value: string(rune(ch)), + } +} + +// return the next byte, with 0 on EOF and panicing on other errors +func (z *Tokenizer) nextByte() byte { + if z.err == io.EOF { + return 0 + } + by, err := z.r.ReadByte() + if err == io.EOF { + z.err = io.EOF + return 0 + } else if err != nil { + panic(err) + } + return by +} + +func (z *Tokenizer) unreadByte() { + if z.err == io.EOF { + // don't unread after EOF + return + } + z.r.UnreadByte() +} + +func isWhitespace(r rune) bool { + return r == ' ' || r == '\t' || r == '\n' +} + +func isNotWhitespace(r rune) bool { + return !isWhitespace(r) +} + +func (z *Tokenizer) consumeWhitespace(ch byte) Token { + const wsBufSize = 32 + + sawNewline := false + if ch == '\n' { + sawNewline = true + } + + for { + // Consume whitespace in chunks of up to wsBufSize + buf, err := z.r.Peek(wsBufSize) + if err != nil && err != io.EOF { + panic(err) + } + if len(buf) == 0 { + break // Reached EOF + } + // find first non-whitespace char, discard up to there + idx := bytes.IndexFunc(buf, isNotWhitespace) + if idx == 0 { + break // Nothing to trim + } + if idx == -1 { + idx = len(buf) // Entire buffer is spaces + } + if /* const */ ch != 0 { + // only check for newlines when we're actually outputting a token + nlIdx := bytes.IndexByte(buf[:idx], '\n') + if nlIdx != -1 { + sawNewline = true + } + } + z.r.Discard(idx) + } + + if sawNewline { + return Token{ + Type: TokenS, + Value: "\n", + } + } + return Token{ + Type: TokenS, + Value: " ", + } +} + +// 4.3.2 +func (z *Tokenizer) consumeNumeric() Token { + repr, notInteger := z.consumeNumericInner() + e := &TokenExtraNumeric{ + NonInteger: notInteger, + } + t := Token{ + Type: TokenNumber, + Value: string(repr), + Extra: e, + } + z.repeek() + if z.nextStartsIdentifier() { + t.Type = TokenDimension + e.Dimension = z.consumeName() + } else if z.peek[0] == '%' { + z.r.Discard(1) + t.Type = TokenPercentage + } + return t +} + +// §4.3.3 +func (z *Tokenizer) consumeIdentish() Token { + s := z.consumeName() + z.repeek() + if z.peek[0] == '(' { + z.r.Discard(1) + if strings.EqualFold(s, "url") { + return z.consumeURL() + } + return Token{ + Type: TokenFunction, + Value: s, + } + } else { + return Token{ + Type: TokenIdent, + Value: s, + } + } +} + +// §4.3.4 +func (z *Tokenizer) consumeString(delim byte) Token { + var frag []byte + var by byte + for { + by = z.nextByte() + if by == delim || by == 0 { + // end of string, EOF + return Token{ + Type: TokenString, + Value: string(frag), + } + } else if by == '\n' { + z.unreadByte() + /* z.err = */ er := &ParseError{ + Type: TokenBadString, + Message: "unterminated string", + } + return Token{ + Type: TokenBadString, + Value: string(frag), + Extra: &TokenExtraError{Err: er}, + } + } else if by == '\\' { + z.unreadByte() + z.repeek() + if z.peek[1] == 0 { + // escape @ EOF, 
ignore. + z.nextByte() // '\' + } else if z.peek[1] == '\n' { + // valid escaped newline, ignore. + z.nextByte() // '\' + z.nextByte() // newline + } else if true { + // stream will always contain a valid escape here + z.nextByte() // '\' + cp := z.consumeEscapedCP() + var tmp [utf8.UTFMax]byte + n := utf8.EncodeRune(tmp[:], cp) + frag = append(frag, tmp[:n]...) + } + } else { + frag = append(frag, by) + } + } +} + +// §4.3.5 +// reader must be in the "url(" state +func (z *Tokenizer) consumeURL() Token { + z.consumeWhitespace(0) + z.repeek() + if z.peek[0] == 0 { + return Token{ + Type: TokenURI, + Value: "", + } + } else if z.peek[0] == '\'' || z.peek[0] == '"' { + delim := z.peek[0] + z.nextByte() + t := z.consumeString(delim) + if t.Type == TokenBadString { + t.Type = TokenBadURI + t.Value += z.consumeBadURL() + /* z.err = */ pe := &ParseError{ + Type: TokenBadURI, + Message: "unterminated string in url()", + } + t.Extra = &TokenExtraError{ + Err: pe, + } + return t + } + t.Type = TokenURI + z.consumeWhitespace(0) + z.repeek() + if z.peek[0] == ')' || z.peek[0] == 0 { + z.nextByte() + return t + } + t.Type = TokenBadURI + t.Value += z.consumeBadURL() + /* z.err = */ pe := &ParseError{ + Type: TokenBadURI, + Message: "url() with string missing close parenthesis", + } + t.Extra = &TokenExtraError{ + Err: pe, + } + return t + } + var frag []byte + var by byte + for { + by = z.nextByte() + if by == ')' || by == 0 { + return Token{Type: TokenURI, Value: string(frag)} + } else if isWhitespace(rune(by)) { + z.consumeWhitespace(0) + z.repeek() + if z.peek[0] == ')' || z.peek[0] == 0 { + z.nextByte() // ')' + return Token{Type: TokenURI, Value: string(frag)} + } + /* z.err = */ pe := &ParseError{ + Type: TokenBadURI, + Message: "bare url() with internal whitespace", + } + return Token{ + Type: TokenBadURI, + Value: string(frag) + z.consumeBadURL(), + Extra: &TokenExtraError{Err: pe}, + } + } else if by == '\'' || by == '"' || by == '(' { + /* z.err = */ pe := &ParseError{ + Type: TokenBadURI, + Message: fmt.Sprintf("bare url() with illegal character '%c'", by), + } + return Token{ + Type: TokenBadURI, + Value: string(frag) + z.consumeBadURL(), + Extra: &TokenExtraError{Err: pe}, + } + } else if isNonPrintable(by) { + /* z.err = */ pe := &ParseError{ + Type: TokenBadURI, + Message: fmt.Sprintf("bare url() with unprintable character '%d'", by), + } + return Token{ + Type: TokenBadURI, + Value: string(frag) + z.consumeBadURL(), + Extra: &TokenExtraError{Err: pe}, + } + } else if by == '\\' { + z.unreadByte() + z.repeek() + if z.nextIsEscape() { + z.nextByte() // '\' + cp := z.consumeEscapedCP() + var tmp [utf8.UTFMax]byte + n := utf8.EncodeRune(tmp[:], cp) + frag = append(frag, tmp[:n]...) + } else { + /* z.err = */ pe := &ParseError{ + Type: TokenBadURI, + Message: fmt.Sprintf("bare url() with invalid escape"), + } + return Token{ + Type: TokenBadURI, + Value: string(frag) + z.consumeBadURL(), + Extra: &TokenExtraError{Err: pe}, + } + } + } else { + frag = append(frag, by) + } + } +} + +// §4.3.6 +func (z *Tokenizer) consumeUnicodeRange() Token { + var sdigits [6]byte + var by byte + haveQuestionMarks := false + i := 0 + for { + by = z.nextByte() + if i >= 6 { + break // weird condition so that unreadByte() works + } + if by == '?' 
{ + sdigits[i] = by + haveQuestionMarks = true + i++ + } else if !haveQuestionMarks && isHexDigit(by) { + sdigits[i] = by + i++ + } else { + break + } + } + z.unreadByte() + slen := i + var edigits [6]byte + var elen int + z.repeek() + + if haveQuestionMarks { + copy(edigits[:slen], sdigits[:slen]) + elen = slen + for idx := range sdigits { + if sdigits[idx] == '?' { + sdigits[idx] = '0' + edigits[idx] = 'F' + } + } + } else if z.peek[0] == '-' && isHexDigit(z.peek[1]) { + z.nextByte() // '-' + i = 0 + for { + by = z.nextByte() + if i < 6 && isHexDigit(by) { + edigits[i] = by + i++ + } else { + break + } + } + z.unreadByte() + elen = i + } else { + copy(edigits[:], sdigits[:]) + elen = slen + } + + // 16 = hex, 32 = int32 + startCP, err := strconv.ParseInt(string(sdigits[:slen]), 16, 32) + if err != nil { + panic(fmt.Sprintf("ParseInt failure: %s", err)) + } + endCP, err := strconv.ParseInt(string(edigits[:elen]), 16, 32) + if err != nil { + panic(fmt.Sprintf("ParseInt failure: %s", err)) + } + e := &TokenExtraUnicodeRange{ + Start: rune(startCP), + End: rune(endCP), + } + return Token{ + Type: TokenUnicodeRange, + Value: e.String(), + Extra: e, + } +} + +func (z *Tokenizer) consumeComment() Token { + var frag []byte + var by byte + for { + by = z.nextByte() + if by == '*' { + z.repeek() + if z.peek[0] == '/' { + z.nextByte() // '/' + return Token{ + Type: TokenComment, + Value: string(frag), + } + } + } else if by == 0 { + return Token{ + Type: TokenComment, + Value: string(frag), + } + } + frag = append(frag, by) + } +} + +// §4.3.7 +// after the "\" +func (z *Tokenizer) consumeEscapedCP() rune { + by := z.nextByte() + if by == 0 { + return utf8.RuneError + } else if isHexDigit(by) { + var digits = make([]byte, 6) + digits[0] = by + i := 1 + // (!) weird looping condition so that we UnreadByte() at the end + for { + by = z.nextByte() + if i < 6 && isHexDigit(by) { + digits[i] = by + i++ + } else { + break + } + } + + if isNotWhitespace(rune(by)) && by != 0 { + z.unreadByte() + } + digits = digits[:i] + // 16 = hex, 22 = bit width of unicode + cpi, err := strconv.ParseInt(string(digits), 16, 32) + if err != nil || cpi == 0 || cpi > utf8.MaxRune { + return utf8.RuneError + } + return rune(cpi) + } else { + z.unreadByte() + ru, _, err := z.r.ReadRune() + if err == io.EOF { + z.err = io.EOF + return utf8.RuneError + } else if err != nil { + z.err = err + panic(err) + } else { + return ru + } + } +} + +// §4.3.11 +func (z *Tokenizer) consumeName() string { + var frag []byte + var by byte + for { + by = z.nextByte() + if by == '\\' { + z.unreadByte() + z.repeek() + if z.nextIsEscape() { + z.nextByte() + cp := z.consumeEscapedCP() + var tmp [utf8.UTFMax]byte + n := utf8.EncodeRune(tmp[:], cp) + frag = append(frag, tmp[:n]...) + continue + } else { + return string(frag) + } + } else if isNameCode(by) { + frag = append(frag, by) + continue + } else { + z.unreadByte() + return string(frag) + } + } +} + +// §4.3.12 +func (z *Tokenizer) consumeNumericInner() (repr []byte, notInteger bool) { + by := z.nextByte() + if by == '+' || by == '-' { + repr = append(repr, by) + by = z.nextByte() + } + consumeDigits := func() { + for '0' <= by && by <= '9' { + repr = append(repr, by) + by = z.nextByte() + } + if by != 0 { + // don't attempt to unread EOF + z.unreadByte() + } + } + + consumeDigits() + z.repeek() + if z.peek[0] == '.' && '0' <= z.peek[1] && z.peek[1] <= '9' { + notInteger = true + + by = z.nextByte() // '.' 
+ repr = append(repr, by) + by = z.nextByte() + consumeDigits() + z.repeek() + } + // [eE][+-]?[0-9] + if (z.peek[0] == 'e') || (z.peek[0] == 'E') { + var n int + if (z.peek[1] == '+' || z.peek[1] == '-') && ('0' <= z.peek[2] && z.peek[2] <= '9') { + n = 3 + } else if '0' <= z.peek[1] && z.peek[1] <= '9' { + n = 2 + } + if n != 0 { + notInteger = true + repr = append(repr, z.peek[:n]...) + z.r.Discard(n) + by = z.nextByte() + consumeDigits() + } + } + + return repr, notInteger +} + +// §4.3.14 +func (z *Tokenizer) consumeBadURL() string { + var frag []byte + var by byte + for { + by = z.nextByte() + if by == ')' || by == 0 { + return string(frag) + } else if by == '\\' { + z.unreadByte() + z.repeek() + if z.nextIsEscape() { + z.nextByte() // '\' + // Allow for escaped right paren "\)" + cp := z.consumeEscapedCP() + var tmp [utf8.UTFMax]byte + n := utf8.EncodeRune(tmp[:], cp) + frag = append(frag, tmp[:n]...) + continue + } + z.nextByte() // '\' + } + frag = append(frag, by) + } +}
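
For reference, a minimal sketch of how a consumer might drive the new package once this lands, assuming the import path github.com/gorilla/css/tokenizer (the path is not stated in the diff); it only uses NewTokenizer, Next, Token.Type/Value, and TokenType.StopToken as introduced above:

	package main

	import (
		"fmt"
		"strings"

		"github.com/gorilla/css/tokenizer" // assumed import path
	)

	func main() {
		tz := tokenizer.NewTokenizer(strings.NewReader(`a { color: #fff; width: 50% }`))
		for {
			tok := tz.Next()
			if tok.Type.StopToken() {
				// TokenEOF on well-formed input; an error token otherwise.
				break
			}
			fmt.Printf("%s %q\n", tok.Type, tok.Value)
		}
	}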