diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json new file mode 100644 index 0000000..962d98b --- /dev/null +++ b/Godeps/Godeps.json @@ -0,0 +1,25 @@ +{ + "ImportPath": "github.com/jimeh/kotaku-uk-rss", + "GoVersion": "go1.3", + "Deps": [ + { + "ImportPath": "code.google.com/p/cascadia", + "Comment": "null-30", + "Rev": "4f03c71bc42ba0015a68bea86422f0ecbb71bf70" + }, + { + "ImportPath": "code.google.com/p/go.net/html", + "Comment": "null-144", + "Rev": "ad01a6fcc8a19d3a4478c836895ffe883bd2ceab" + }, + { + "ImportPath": "github.com/PuerkitoBio/goquery", + "Comment": "v0.3.2-27-g1e5417b", + "Rev": "1e5417b3dbc2ca68de909fb56d9095daa680a166" + }, + { + "ImportPath": "github.com/gorilla/feeds", + "Rev": "2e133eb352fab1ff3569ee169e9a3a94f69c9081" + } + ] +} diff --git a/Godeps/Readme b/Godeps/Readme new file mode 100644 index 0000000..4cdaa53 --- /dev/null +++ b/Godeps/Readme @@ -0,0 +1,5 @@ +This directory tree is generated automatically by godep. + +Please do not edit. + +See https://github.com/tools/godep for more information. diff --git a/Godeps/_workspace/.gitignore b/Godeps/_workspace/.gitignore new file mode 100644 index 0000000..f037d68 --- /dev/null +++ b/Godeps/_workspace/.gitignore @@ -0,0 +1,2 @@ +/pkg +/bin diff --git a/Godeps/_workspace/src/code.google.com/p/cascadia/.hgignore b/Godeps/_workspace/src/code.google.com/p/cascadia/.hgignore new file mode 100644 index 0000000..f97c559 --- /dev/null +++ b/Godeps/_workspace/src/code.google.com/p/cascadia/.hgignore @@ -0,0 +1,6 @@ +^_ +^\. +\.out$ +\.6$ +~$ +\.orig$ diff --git a/Godeps/_workspace/src/code.google.com/p/cascadia/LICENSE b/Godeps/_workspace/src/code.google.com/p/cascadia/LICENSE new file mode 100644 index 0000000..ee5ad35 --- /dev/null +++ b/Godeps/_workspace/src/code.google.com/p/cascadia/LICENSE @@ -0,0 +1,24 @@ +Copyright (c) 2011 Andy Balholm. All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/Godeps/_workspace/src/code.google.com/p/cascadia/Makefile b/Godeps/_workspace/src/code.google.com/p/cascadia/Makefile new file mode 100644 index 0000000..c7d7a2b --- /dev/null +++ b/Godeps/_workspace/src/code.google.com/p/cascadia/Makefile @@ -0,0 +1,12 @@ +include $(GOROOT)/src/Make.inc + +TARG=cascadia + +GOFILES= \ + parser.go \ + selector.go \ + +include $(GOROOT)/src/Make.pkg + +format: + gofmt -w ${GOFILES} *_test.go diff --git a/Godeps/_workspace/src/code.google.com/p/cascadia/benchmark_test.go b/Godeps/_workspace/src/code.google.com/p/cascadia/benchmark_test.go new file mode 100644 index 0000000..cdd6eb2 --- /dev/null +++ b/Godeps/_workspace/src/code.google.com/p/cascadia/benchmark_test.go @@ -0,0 +1,52 @@ +package cascadia + +import ( + "code.google.com/p/go.net/html" + "strings" + "testing" +) + +func MustParseHTML(doc string) *html.Node { + dom, err := html.Parse(strings.NewReader(doc)) + if err != nil { + panic(err) + } + return dom +} + +var selector = MustCompile(`div.matched`) +var doc = ` + +
+`, + "#foo", + []string{ + `
`, + }, + }, + { + `
`, + "li#t1", + []string{ + `
`, + "p.t1", + []string{ + `
`, + }, + }, + { + `
`, + ".t1.fail", + []string{}, + }, + { + `
`, + "p.t1.t2", + []string{ + `
`, + }, + }, + { + `
`, + "p[title]", + []string{ + `
`, + }, + }, + { + `
`, + `address[title="foo"]`, + []string{ + ``, + }, + }, + { + ``, + `[ title ~= foo ]`, + []string{ + `
`, + }, + }, + { + `
`, + `[title~="hello world"]`, + []string{}, + }, + { + `
`, + `[lang|="en"]`, + []string{ + `
`, + `
`, + }, + }, + { + `
`, + `[title^="foo"]`, + []string{ + `
`, + }, + }, + { + `
`, + `[title$="bar"]`, + []string{ + `
`, + }, + }, + { + `
`, + `[title*="bar"]`, + []string{ + `
`, + }, + }, + { + `
`, + ".t1:not(.t2)", + []string{}, + }, + { + `
some text and a span and another
`, + `span:first-child`, + []string{ + ``, + }, + }, + { + `a span and some text`, + `span:last-child`, + []string{ + ``, + }, + }, + { + ``, + `p:nth-of-type(2)`, + []string{ + `
`, + }, + }, + { + `
`, + `p:nth-last-of-type(2)`, + []string{ + ``, + }, + }, + { + `
`, + `p:last-of-type`, + []string{ + ``, + }, + }, + { + `
`, + `p:first-of-type`, + []string{ + ``, + }, + }, + { + `
`, + }, + }, + { + `
`, + }, + }, + { + `
Hello
`,
+ `:empty`,
+ []string{
+ ``,
+ ` `,
+ ``,
+ },
+ },
+ {
+ ` `,
+ `div p`,
+ []string{
+ ` `,
+ ` `,
+ },
+ },
+ {
+ ` `,
+ `div table p`,
+ []string{
+ ` `,
+ },
+ },
+ {
+ ` `,
+ ` `,
+ },
+ },
+ {
+ ` `,
+ `p ~ p`,
+ []string{
+ ` `,
+ ` `,
+ },
+ },
+ {
+ ` `,
+ `p + p`,
+ []string{
+ ` `,
+ },
+ },
+ {
+ ` `,
+ `li, p`,
+ []string{
+ " ",
+ },
+ },
+ {
+ ` `,
+ `p +/*This is a comment*/ p`,
+ []string{
+ ` `,
+ },
+ },
+ {
+ ` Text block that wraps inner text and continues `,
+ },
+ },
+ {
+ ` Text block that wraps inner text and continues Text block that wraps inner text and continues Text block that wraps inner text and continues `,
+ },
+ },
+ {
+ ` text content contents 1 contents 2 contents 1 contents 2 `,
+ },
+ },
+ {
+ ` contents 1 contents 2 `,
+ },
+ },
+ {
+ ` 0123456789 abcdef 0123ABCD `,
+ ` `,
+ },
+ },
+ {
+ ` 0123456789 abcdef 0123ABCD `,
+ },
+ },
+ {
+ ` 0123456789 abcdef 0123ABCD `,
+ ` `,
+ },
+ },
+ {
+ ` 0123456789 abcdef 0123ABCD `,
+ ` `,
+ },
+ },
+ {
+ ` 0123456789 abcdef 0123ABCD `,
+ ` `,
+ ` `,
+ },
+ },
+ {
+ ` 0123456789 abcdef 0123ABCD `,
+ },
+ },
+ {
+ ` 0123456789 abcdef 0123ABCD `,
+ ` `,
+ },
+ },
+ {
+ ` 0123456789 `,
+ ``,
+ },
+ },
+ {
+ ``,
+ `[href#=(fina)]:not([href#=(\/\/[^\/]+untrusted)])`,
+ []string{
+ ``,
+ ``,
+ },
+ },
+ {
+ ``,
+ `[href#=(^https:\/\/[^\/]*\/?news)]`,
+ []string{
+ ``,
+ },
+ },
+}
+
+// TestSelectors compiles every selector in selectorTests, parses the
+// accompanying HTML, and checks that MatchAll and MatchFirst return the
+// expected nodes (compared in their nodeString form).
+func TestSelectors(t *testing.T) {
+	for _, test := range selectorTests {
+		s, err := Compile(test.selector)
+		if err != nil {
+			t.Errorf("error compiling %q: %s", test.selector, err)
+			continue
+		}
+
+		doc, err := html.Parse(strings.NewReader(test.HTML))
+		if err != nil {
+			t.Errorf("error parsing %q: %s", test.HTML, err)
+			continue
+		}
+
+		matches := s.MatchAll(doc)
+		if len(matches) != len(test.results) {
+			// Name the selector so the failing table entry can be found.
+			t.Errorf("selector %q: wanted %d elements, got %d instead", test.selector, len(test.results), len(matches))
+			continue
+		}
+
+		for i, m := range matches {
+			got := nodeString(m)
+			if got != test.results[i] {
+				t.Errorf("wanted %s, got %s instead", test.results[i], got)
+			}
+		}
+
+		firstMatch := s.MatchFirst(doc)
+		switch {
+		case len(test.results) == 0:
+			if firstMatch != nil {
+				t.Errorf("MatchFirst: want nil, got %s", nodeString(firstMatch))
+			}
+		case firstMatch == nil:
+			// Report the miss directly instead of handing a nil node to
+			// nodeString, whose nil handling is not visible here.
+			t.Errorf("MatchFirst: want %s, got nil", test.results[0])
+		default:
+			if got := nodeString(firstMatch); got != test.results[0] {
+				t.Errorf("MatchFirst: want %s, got %s", test.results[0], got)
+			}
+		}
+	}
+}
diff --git a/Godeps/_workspace/src/code.google.com/p/go.net/html/atom/atom.go b/Godeps/_workspace/src/code.google.com/p/go.net/html/atom/atom.go
new file mode 100644
index 0000000..227404b
--- /dev/null
+++ b/Godeps/_workspace/src/code.google.com/p/go.net/html/atom/atom.go
@@ -0,0 +1,78 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package atom provides integer codes (also known as atoms) for a fixed set of
+// frequently occurring HTML strings: tag names and attribute keys such as "p"
+// and "id".
+//
+// Sharing an atom's name between all elements with the same tag can result in
+// fewer string allocations when tokenizing and parsing HTML. Integer
+// comparisons are also generally faster than string comparisons.
+//
+// The value of an atom's particular code is not guaranteed to stay the same
+// between versions of this package. Neither is any ordering guaranteed:
+// whether atom.H1 < atom.H2 may also change. The codes are not guaranteed to
+// be dense. The only guarantees are that e.g. looking up "div" will yield
+// atom.Div, calling atom.Div.String will return "div", and atom.Div != 0.
+package atom
+
+// Atom is an integer code for a string. The zero value maps to "".
+//
+// As decoded by the String method, the low 8 bits hold the length of the
+// atom's name and the remaining high bits hold its starting offset within
+// atomText.
+type Atom uint32
+
+// String returns the name a stands for. When the offset and length encoded in
+// a do not fit inside atomText, it returns "" instead.
+func (a Atom) String() string {
+	offset, length := uint32(a>>8), uint32(a&0xff)
+	end := offset + length
+	if end <= uint32(len(atomText)) {
+		return atomText[offset:end]
+	}
+	return ""
+}
+
+// string slices the atom's name straight out of atomText. Unlike String it
+// performs no range check of its own before slicing.
+func (a Atom) string() string {
+	begin := a >> 8
+	size := a & 0xff
+	return atomText[begin : begin+size]
+}
+
+// fnv computes the FNV hash of s, starting from the caller-supplied value h:
+// each byte is XORed into the running hash, which is then multiplied by
+// 16777619.
+func fnv(h uint32, s []byte) uint32 {
+	const prime = 16777619
+	for _, b := range s {
+		h = (h ^ uint32(b)) * prime
+	}
+	return h
+}
+
+// match reports whether the first len(t) bytes of s are equal to t.
+//
+// It returns false when s is shorter than t; without that guard the byte
+// comparison below would index past the end of s and panic.
+func match(s string, t []byte) bool {
+	if len(s) < len(t) {
+		return false
+	}
+	for i, c := range t {
+		if s[i] != c {
+			return false
+		}
+	}
+	return true
+}
+
+// Lookup returns the atom whose name is s, or zero when s is not a known
+// atom. Names are compared byte for byte, so the lookup is case sensitive.
+func Lookup(s []byte) Atom {
+	if n := len(s); n == 0 || n > maxAtomLen {
+		return 0
+	}
+	h := fnv(hash0, s)
+	mask := uint32(len(table) - 1)
+	// An atom can sit in one of two slots: one indexed by the low bits of the
+	// hash, the other by the bits from position 16 upward.
+	for _, slot := range [2]uint32{h & mask, (h >> 16) & mask} {
+		if a := table[slot]; int(a&0xff) == len(s) && match(a.string(), s) {
+			return a
+		}
+	}
+	return 0
+}
+
+// String returns a string with the same contents as s. When s names a known
+// atom the atom's own name is returned instead of converting s, so it is
+// equivalent to string(s) but may be more efficient.
+func String(s []byte) string {
+	a := Lookup(s)
+	if a == 0 {
+		return string(s)
+	}
+	return a.String()
+}
diff --git a/Godeps/_workspace/src/code.google.com/p/go.net/html/atom/atom_test.go b/Godeps/_workspace/src/code.google.com/p/go.net/html/atom/atom_test.go
new file mode 100644
index 0000000..6e33704
--- /dev/null
+++ b/Godeps/_workspace/src/code.google.com/p/go.net/html/atom/atom_test.go
@@ -0,0 +1,109 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atom
+
+import (
+ "sort"
+ "testing"
+)
+
+// TestKnown checks that every name in testAtomList survives a round trip
+// through Lookup followed by String.
+func TestKnown(t *testing.T) {
+	for _, name := range testAtomList {
+		got := Lookup([]byte(name))
+		if got.String() == name {
+			continue
+		}
+		t.Errorf("Lookup(%q) = %#x (%q)", name, uint32(got), got.String())
+	}
+}
+
+// TestHits checks that looking up the name of each non-zero table entry
+// yields that same entry.
+func TestHits(t *testing.T) {
+	for _, want := range table {
+		if want == 0 {
+			continue
+		}
+		name := want.String()
+		if got := Lookup([]byte(name)); got != want {
+			t.Errorf("Lookup(%q) = %#x, want %#x", name, uint32(got), uint32(want))
+		}
+	}
+}
+
+// TestMisses checks that Lookup returns zero for a list of strings that are
+// expected not to be atoms.
+func TestMisses(t *testing.T) {
+	misses := []string{
+		"",
+		"\x00",
+		"\xff",
+		"A",
+		"DIV",
+		"Div",
+		"dIV",
+		"aa",
+		"a\x00",
+		"ab",
+		"abb",
+		"abbr0",
+		"abbr ",
+		" abbr",
+		" a",
+		"acceptcharset",
+		"acceptCharset",
+		"accept_charset",
+		"h0",
+		"h1h2",
+		"h7",
+		"onClick",
+		"λ",
+		// The following string has the same hash (0xa1d7fab7) as "onmouseover".
+		"\x00\x00\x00\x00\x00\x50\x18\xae\x38\xd0\xb7",
+	}
+	for _, in := range misses {
+		if got := Lookup([]byte(in)); got != 0 {
+			t.Errorf("Lookup(%q): got %d, want 0", in, got)
+		}
+	}
+}
+
+// TestForeignObject checks that "foreignobject" and "foreignObject" map to
+// the distinct atoms Foreignobject and ForeignObject, in both directions.
+func TestForeignObject(t *testing.T) {
+	cases := []struct {
+		atom Atom
+		name string
+	}{
+		{Foreignobject, "foreignobject"},
+		{ForeignObject, "foreignObject"},
+	}
+	// Name -> atom for both spellings first, then atom -> name.
+	for _, c := range cases {
+		if got := Lookup([]byte(c.name)); got != c.atom {
+			t.Errorf("Lookup(%q): got %#v, want %#v", c.name, got, c.atom)
+		}
+	}
+	for _, c := range cases {
+		if got := c.atom.String(); got != c.name {
+			t.Errorf("Atom(%#v).String(): got %q, want %q", c.atom, got, c.name)
+		}
+	}
+}
+
+// BenchmarkLookup times Lookup over 1000 inputs drawn, in sorted order and
+// wrapping around, from the names of the non-zero table entries.
+func BenchmarkLookup(b *testing.B) {
+	names := make([]string, 0, len(table))
+	for _, a := range table {
+		if a == 0 {
+			continue
+		}
+		names = append(names, a.String())
+	}
+	sort.Strings(names)
+
+	inputs := make([][]byte, 1000)
+	for i := range inputs {
+		inputs[i] = []byte(names[i%len(names)])
+	}
+
+	// Exclude the setup above from the measurement.
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for _, in := range inputs {
+			Lookup(in)
+		}
+	}
+}
diff --git a/Godeps/_workspace/src/code.google.com/p/go.net/html/atom/gen.go b/Godeps/_workspace/src/code.google.com/p/go.net/html/atom/gen.go
new file mode 100644
index 0000000..9958a71
--- /dev/null
+++ b/Godeps/_workspace/src/code.google.com/p/go.net/html/atom/gen.go
@@ -0,0 +1,636 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+// This program generates table.go and table_test.go.
+// Invoke as
+//
+// go run gen.go |gofmt >table.go
+// go run gen.go -test |gofmt >table_test.go
+
+import (
+ "flag"
+ "fmt"
+ "math/rand"
+ "os"
+ "sort"
+ "strings"
+)
+
+// identifier converts s to a Go exported identifier: hyphens are dropped and
+// a lower-case ASCII letter at the start or right after a hyphen is
+// upper-cased. For example "div" becomes "Div" and "accept-charset" becomes
+// "AcceptCharset".
+func identifier(s string) string {
+	out := make([]byte, 0, len(s))
+	upperNext := true
+	for _, r := range s {
+		switch {
+		case r == '-':
+			upperNext = true
+			continue
+		case upperNext && 'a' <= r && r <= 'z':
+			r -= 'a' - 'A'
+		}
+		upperNext = false
+		out = append(out, byte(r))
+	}
+	return string(out)
+}
+
+// test is the -test command-line flag. When set, main prints the
+// testAtomList variable (the contents of table_test.go) and returns instead
+// of generating the lookup table.
+var test = flag.Bool("test", false, "generate table_test.go")
+
+// main writes generated Go source to standard output. By default that is the
+// atom constants, hash0, maxAtomLen, the lookup table and atomText; with
+// -test it is the testAtomList variable only. A size summary is written to
+// standard error, and the program exits with status 1 if no lookup table
+// could be constructed.
+func main() {
+ flag.Parse()
+
+ // Gather every name from the four source lists and sort them.
+ var all []string
+ all = append(all, elements...)
+ all = append(all, attributes...)
+ all = append(all, eventHandlers...)
+ all = append(all, extra...)
+ sort.Strings(all)
+
+ // In -test mode the sorted list is printed as is (before the duplicate
+ // removal below) and nothing else is generated.
+ if *test {
+ fmt.Printf("// generated by go run gen.go -test; DO NOT EDIT\n\n")
+ fmt.Printf("package atom\n\n")
+ fmt.Printf("var testAtomList = []string{\n")
+ for _, s := range all {
+ fmt.Printf("\t%q,\n", s)
+ }
+ fmt.Printf("}\n")
+ return
+ }
+
+ // uniq - lists have dups
+ // compute max len too
+ maxLen := 0
+ w := 0
+ for _, s := range all {
+ if w == 0 || all[w-1] != s {
+ if maxLen < len(s) {
+ maxLen = len(s)
+ }
+ all[w] = s
+ w++
+ }
+ }
+ all = all[:w]
+
+ // Find hash that minimizes table size.
+ var best *table
+ for i := 0; i < 1000000; i++ {
+ // Stop once a table of half the best size (1<<(k-1) slots) would have
+ // fewer slots than there are strings.
+ if best != nil && 1<<(best.k-1) < len(all) {
+ break
+ }
+ h := rand.Uint32()
+ for k := uint(0); k <= 16; k++ {
+ // Only sizes smaller than the best found so far are of interest.
+ if best != nil && k >= best.k {
+ break
+ }
+ var t table
+ if t.init(h, k, all) {
+ best = &t
+ break
+ }
+ }
+ }
+ if best == nil {
+ fmt.Fprintf(os.Stderr, "failed to construct string table\n")
+ os.Exit(1)
+ }
+
+ // Lay out strings, using overlaps when possible.
+ layout := append([]string{}, all...)
+
+ // Remove strings that are substrings of other strings
+ // (removed entries are blanked to "" rather than deleted, and the pass
+ // repeats until nothing changes).
+ for changed := true; changed; {
+ changed = false
+ for i, s := range layout {
+ if s == "" {
+ continue
+ }
+ for j, t := range layout {
+ if i != j && t != "" && strings.Contains(s, t) {
+ changed = true
+ layout[j] = ""
+ }
+ }
+ }
+ }
+
+ // Join strings where one suffix matches another prefix.
+ for {
+ // Find best i, j, k such that layout[i][len-k:] == layout[j][:k],
+ // maximizing overlap length k.
+ besti := -1
+ bestj := -1
+ bestk := 0
+ for i, s := range layout {
+ if s == "" {
+ continue
+ }
+ for j, t := range layout {
+ if i == j {
+ continue
+ }
+ // Starting at bestk+1 means only strictly longer overlaps replace
+ // the current best.
+ for k := bestk + 1; k <= len(s) && k <= len(t); k++ {
+ if s[len(s)-k:] == t[:k] {
+ besti = i
+ bestj = j
+ bestk = k
+ }
+ }
+ }
+ }
+ if bestk > 0 {
+ // Merge j into i, sharing the overlapping bytes, and blank out j.
+ layout[besti] += layout[bestj][bestk:]
+ layout[bestj] = ""
+ continue
+ }
+ break
+ }
+
+ text := strings.Join(layout, "")
+
+ // Encode each atom as its offset in text shifted left by 8 bits, ORed with
+ // its length.
+ atom := map[string]uint32{}
+ for _, s := range all {
+ off := strings.Index(text, s)
+ if off < 0 {
+ panic("lost string " + s)
+ }
+ atom[s] = uint32(off<<8 | len(s))
+ }
+
+ // Generate the Go code.
+ fmt.Printf("// generated by go run gen.go; DO NOT EDIT\n\n")
+ fmt.Printf("package atom\n\nconst (\n")
+ for _, s := range all {
+ fmt.Printf("\t%s Atom = %#x\n", identifier(s), atom[s])
+ }
+ fmt.Printf(")\n\n")
+
+ fmt.Printf("const hash0 = %#x\n\n", best.h0)
+ fmt.Printf("const maxAtomLen = %d\n\n", maxLen)
+
+ // Only occupied slots are printed, as index: value pairs.
+ fmt.Printf("var table = [1<<%d]Atom{\n", best.k)
+ for i, s := range best.tab {
+ if s == "" {
+ continue
+ }
+ fmt.Printf("\t%#x: %#x, // %s\n", i, atom[s], s)
+ }
+ fmt.Printf("}\n")
+ // The table is counted at 4 bytes per slot.
+ datasize := (1 << best.k) * 4
+
+ // Print atomText as a concatenation of string literals of at most 60 bytes.
+ fmt.Printf("const atomText =\n")
+ textsize := len(text)
+ for len(text) > 60 {
+ fmt.Printf("\t%q +\n", text[:60])
+ text = text[60:]
+ }
+ fmt.Printf("\t%q\n\n", text)
+
+ fmt.Fprintf(os.Stderr, "%d atoms; %d string bytes + %d tables = %d total data\n", len(all), textsize, datasize, textsize+datasize)
+}
+
+// byLen is a string slice whose Len, Swap and Less methods order it from the
+// longest string to the shortest.
+type byLen []string
+
+// Len returns the number of strings in x.
+func (x byLen) Len() int {
+	return len(x)
+}
+
+// Swap exchanges the strings at positions i and j.
+func (x byLen) Swap(i, j int) {
+	x[j], x[i] = x[i], x[j]
+}
+
+// Less reports whether the string at i is longer than the string at j.
+func (x byLen) Less(i, j int) bool {
+	return len(x[j]) < len(x[i])
+}
+
+// fnv computes the FNV hash of s, starting from the caller-supplied value h:
+// each byte is XORed into the running hash, which is then multiplied by
+// 16777619.
+func fnv(h uint32, s string) uint32 {
+	const prime = 16777619
+	for _, b := range []byte(s) {
+		h = (h ^ uint32(b)) * prime
+	}
+	return h
+}
+
+// A table represents an attempt at constructing the lookup table.
+// The lookup table uses cuckoo hashing, meaning that each string
+// can be found in one of two positions.
+type table struct {
+ // h0 is the starting value passed to fnv when hashing a string.
+ h0 uint32
+ // k is the number of hash bits in use; main emits the table as [1<<k]Atom.
+ k uint
+ // mask is ANDed with the hash (and with the hash shifted right by 16) to
+ // produce the two slot indexes.
+ mask uint32
+ // tab holds the string stored in each slot; main treats "" as an empty slot.
+ tab []string
+}
+
+// hash returns the two slot indexes for s: the low bits of its FNV hash
+// (seeded with t.h0) and the bits from position 16 upward, each reduced with
+// t.mask.
+func (t *table) hash(s string) (h1, h2 uint32) {
+	sum := fnv(t.h0, s)
+	return sum & t.mask, (sum >> 16) & t.mask
+}
+
+// init initializes the table with the given parameters.
+// h0 is the initial hash value,
+// k is the number of bits of hash value to use, and
+// x is the list of strings to store in the table.
+// init returns false if the table cannot be constructed.
+func (t *table) init(h0 uint32, k uint, x []string) bool {
+ t.h0 = h0
+ t.k = k
+ t.tab = make([]string, 1< HTTP charset The character encoding of a page can be set using the HTTP header charset declaration. The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector The only character encoding declaration for this HTML file is in the HTTP header, which sets the encoding to ISO 8859-15. the-input-byte-stream-001 HTTP vs UTF-8 BOM A character encoding set in the HTTP header has lower precedence than the UTF-8 signature. The HTTP header attempts to set the character encoding to ISO 8859-15. The page starts with a UTF-8 signature. The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector If the test is unsuccessful, the characters  should appear at the top of the page. These represent the bytes that make up the UTF-8 signature when encountered in the ISO 8859-15 encoding. the-input-byte-stream-034 HTTP vs meta charset The HTTP header has a higher precedence than an encoding declaration in a meta charset attribute. The HTTP header attempts to set the character encoding to ISO 8859-15. The page contains an encoding declaration in a meta charset attribute that attempts to set the character encoding to ISO 8859-1. The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector the-input-byte-stream-018 HTTP vs meta content The HTTP header has a higher precedence than an encoding declaration in a meta content attribute. 
The HTTP header attempts to set the character encoding to ISO 8859-15. The page contains an encoding declaration in a meta content attribute that attempts to set the character encoding to ISO 8859-1. The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector the-input-byte-stream-016 No encoding declaration A page with no encoding information in HTTP, BOM, XML declaration or meta element will be treated as UTF-8. The test on this page contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector the-input-byte-stream-015 UTF-8 BOM vs meta charset A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta charset attribute declares a different encoding. The page contains an encoding declaration in a meta charset attribute that attempts to set the character encoding to ISO 8859-15, but the file starts with a UTF-8 signature. The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector the-input-byte-stream-038 UTF-8 BOM vs meta content A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta content attribute declares a different encoding. The page contains an encoding declaration in a meta content attribute that attempts to set the character encoding to ISO 8859-15, but the file starts with a UTF-8 signature. The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. 
These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector the-input-byte-stream-037 meta charset attribute The character encoding of the page can be set by a meta element with charset attribute. The only character encoding declaration for this HTML file is in the charset attribute of the meta element, which declares the encoding to be ISO 8859-15. The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector the-input-byte-stream-009 meta content attribute The character encoding of the page can be set by a meta element with http-equiv and content attributes. The only character encoding declaration for this HTML file is in the content attribute of the meta element, which declares the encoding to be ISO 8859-15. The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector the-input-byte-stream-007 Links:.test div.ÜÀÚ. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.
Result summary & related tests
Detailed results for this test
Link to spec.test div.ýäè. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.
Result summary & related tests
Detailed results for this test
Link to spec.test div.ÜÀÚ. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.
Result summary & related tests
Detailed results for this test
Link to spec.test div.ÜÀÚ. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.
Result summary & related tests
Detailed results for this test
Link to spec.test div.ýäè. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.
Result summary & related tests
Detailed results for this test
Link to spec.test div.ýäè. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.
Result summary & related tests
Detailed results for this test
Link to spec.test div.ýäè. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.
Result summary & related tests
Detailed results for this test
Link to spec.test div.ÜÀÚ. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.
Result summary & related tests
Detailed results for this test
Link to spec.test div.ÜÀÚ. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.
Result summary & related tests
Detailed results for this test
Link to spec
are treated as start tags, except that
+ // hasSelfClosingToken is set while they are being processed.
+ hasSelfClosingToken bool
+ // doc is the document root element.
+ doc *Node
+ // The stack of open elements (section 12.2.3.2) and active formatting
+ // elements (section 12.2.3.3).
+ oe, afe nodeStack
+ // Element pointers (section 12.2.3.4).
+ head, form *Node
+ // Other parsing state flags (section 12.2.3.5).
+ scripting, framesetOK bool
+ // im is the current insertion mode.
+ im insertionMode
+ // originalIM is the insertion mode to go back to after completing a text
+ // or inTableText insertion mode.
+ originalIM insertionMode
+ // fosterParenting is whether new elements should be inserted according to
+ // the foster parenting rules (section 12.2.5.3).
+ fosterParenting bool
+ // quirks is whether the parser is operating in "quirks mode."
+ quirks bool
+ // fragment is whether the parser is parsing an HTML fragment.
+ fragment bool
+ // context is the context element when parsing an HTML fragment
+ // (section 12.4).
+ context *Node
+}
+
+// top returns the node reported by p.oe.top(), falling back to the document
+// root p.doc when that is nil.
+func (p *parser) top() *Node {
+	n := p.oe.top()
+	if n == nil {
+		return p.doc
+	}
+	return n
+}
+
+// Stop tags for use in popUntil. These come from section 12.2.3.2.
+//
+// The map is keyed by element namespace: indexOfElementInScope looks up the
+// list for the Namespace of the element it is examining and ends its search
+// when that element's tag is in the list.
+var (
+ defaultScopeStopTags = map[string][]a.Atom{
+ "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object},
+ "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext},
+ "svg": {a.Desc, a.ForeignObject, a.Title},
+ }
+)
+
+// scope selects which set of stop tags bounds a search or a pop of the stack
+// of open elements.
+type scope int
+
+// indexOfElementInScope handles defaultScope, listItemScope, buttonScope,
+// tableScope and selectScope; clearStackToContext handles tableScope,
+// tableRowScope and tableBodyScope. Each panics with "unreachable" when it
+// examines an element under a scope it does not handle.
+const (
+ defaultScope scope = iota
+ listItemScope
+ buttonScope
+ tableScope
+ tableRowScope
+ tableBodyScope
+ selectScope
+)
+
+// popUntil looks for the highest open element whose tag is in matchTags and
+// that is in scope s, i.e. with no element from the scope's stop tags
+// (section 12.2.3.2) above it. When one is found, that element and everything
+// above it are popped and popUntil returns true; otherwise the stack is left
+// untouched and it returns false.
+//
+// For example, the stop tags for table scope are "html" and "table". With the
+// stack
+//	["html", "body", "font", "table", "b", "i", "u"]
+// popUntil(tableScope, "font") returns false, while popUntil(tableScope, "i")
+// returns true and leaves
+//	["html", "body", "font", "table", "b"]
+//
+// A tag that is both a match tag and a stop tag counts as a match:
+// popUntil(tableScope, "table") returns true and leaves
+//	["html", "body", "font"]
+func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool {
+	i := p.indexOfElementInScope(s, matchTags...)
+	if i == -1 {
+		return false
+	}
+	p.oe = p.oe[:i]
+	return true
+}
+
+// indexOfElementInScope returns the index in p.oe of the highest element whose
+// tag is in matchTags that is in scope. If no matching element is in scope, it
+// returns -1.
+//
+// The stack is walked from its last entry down to its first. It panics with
+// "unreachable" if an element with an empty Namespace is examined under a
+// scope other than defaultScope, listItemScope, buttonScope, tableScope or
+// selectScope.
+func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int {
+ for i := len(p.oe) - 1; i >= 0; i-- {
+ tagAtom := p.oe[i].DataAtom
+ // Match tags and the scope-specific stop tags below are only compared
+ // against elements whose Namespace is empty.
+ if p.oe[i].Namespace == "" {
+ // A match is checked before any stop tag, so a tag that is both wins
+ // as a match.
+ for _, t := range matchTags {
+ if t == tagAtom {
+ return i
+ }
+ }
+ switch s {
+ case defaultScope:
+ // No-op.
+ case listItemScope:
+ if tagAtom == a.Ol || tagAtom == a.Ul {
+ return -1
+ }
+ case buttonScope:
+ if tagAtom == a.Button {
+ return -1
+ }
+ case tableScope:
+ if tagAtom == a.Html || tagAtom == a.Table {
+ return -1
+ }
+ case selectScope:
+ // Inverted test: every tag except optgroup and option stops the
+ // search.
+ if tagAtom != a.Optgroup && tagAtom != a.Option {
+ return -1
+ }
+ default:
+ panic("unreachable")
+ }
+ }
+ // The default, list-item and button scopes also stop at the tags listed
+ // in defaultScopeStopTags for this element's namespace.
+ switch s {
+ case defaultScope, listItemScope, buttonScope:
+ for _, t := range defaultScopeStopTags[p.oe[i].Namespace] {
+ if t == tagAtom {
+ return -1
+ }
+ }
+ }
+ }
+ return -1
+}
+
+// elementInScope reports whether popUntil would find an element to pop for
+// the same arguments, without modifying the stack of open elements.
+func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool {
+	i := p.indexOfElementInScope(s, matchTags...)
+	return i != -1
+}
+
+// clearStackToContext pops elements off the stack of open elements until the
+// top element is one that the given scope stops at: html or table for
+// tableScope, html or tr for tableRowScope, and html, tbody, tfoot or thead
+// for tableBodyScope. If no such element is on the stack, the stack is left
+// unchanged. Any other scope panics as soon as an element is examined.
+func (p *parser) clearStackToContext(s scope) {
+	for i := len(p.oe) - 1; i >= 0; i-- {
+		var stop bool
+		switch tag := p.oe[i].DataAtom; s {
+		case tableScope:
+			stop = tag == a.Html || tag == a.Table
+		case tableRowScope:
+			stop = tag == a.Html || tag == a.Tr
+		case tableBodyScope:
+			stop = tag == a.Html || tag == a.Tbody || tag == a.Tfoot || tag == a.Thead
+		default:
+			panic("unreachable")
+		}
+		if stop {
+			// Keep the stopping element itself; drop everything above it.
+			p.oe = p.oe[:i+1]
+			return
+		}
+	}
+}
+
+// generateImpliedEndTags pops nodes off the stack of open elements for as
+// long as the top node is an element named dd, dt, li, option, optgroup, p,
+// rp, or rt. A node whose name is listed in exceptions is not popped and ends
+// the popping.
+func (p *parser) generateImpliedEndTags(exceptions ...string) {
+	isException := func(name string) bool {
+		for _, e := range exceptions {
+			if e == name {
+				return true
+			}
+		}
+		return false
+	}
+
+	// end is the length the stack will be cut to.
+	end := len(p.oe)
+	for ; end > 0; end-- {
+		n := p.oe[end-1]
+		if n.Type != ElementNode {
+			break
+		}
+		implied := false
+		switch n.DataAtom {
+		case a.Dd, a.Dt, a.Li, a.Option, a.Optgroup, a.P, a.Rp, a.Rt:
+			implied = true
+		}
+		if !implied || isException(n.Data) {
+			break
+		}
+	}
+	p.oe = p.oe[:end]
+}
+
+// addChild attaches n to the tree, either through fosterParent when
+// shouldFosterParent says so, or as the last child of the top element. If n
+// is an element node it is also pushed onto the stack of open elements.
+func (p *parser) addChild(n *Node) {
+	switch {
+	case p.shouldFosterParent():
+		p.fosterParent(n)
+	default:
+		p.top().AppendChild(n)
+	}
+
+	if n.Type != ElementNode {
+		return
+	}
+	p.oe = append(p.oe, n)
+}
+
+// shouldFosterParent reports whether the next node to be added should be
+// foster parented: p.fosterParenting is set and the top element is a table,
+// tbody, tfoot, thead or tr.
+func (p *parser) shouldFosterParent() bool {
+	if !p.fosterParenting {
+		return false
+	}
+	switch p.top().DataAtom {
+	case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
+		return true
+	}
+	return false
+}
+
+// fosterParent adds a child node according to the foster parenting rules.
+// Section 12.2.5.3, "foster parenting".
+//
+// The node goes in front of the highest table on the stack of open elements,
+// or under the first element of the stack when there is no table. A text node
+// is merged into an immediately preceding text node instead of being inserted.
+func (p *parser) fosterParent(n *Node) {
+ var table, parent, prev *Node
+ var i int
+ // Search from the top of the stack for the nearest table element. If none
+ // is found the loop leaves i at -1 and table nil.
+ for i = len(p.oe) - 1; i >= 0; i-- {
+ if p.oe[i].DataAtom == a.Table {
+ table = p.oe[i]
+ break
+ }
+ }
+
+ if table == nil {
+ // The foster parent is the html element.
+ parent = p.oe[0]
+ } else {
+ parent = table.Parent
+ }
+ // A nil parent falls back to the element just below index i on the stack.
+ // NOTE(review): this indexes p.oe[-1] and panics if i is 0 (or -1) here —
+ // confirm callers cannot reach that state.
+ if parent == nil {
+ parent = p.oe[i-1]
+ }
+
+ // prev is the node that would end up immediately before n.
+ if table != nil {
+ prev = table.PrevSibling
+ } else {
+ prev = parent.LastChild
+ }
+ // Adjacent text is coalesced rather than inserted as a second text node.
+ if prev != nil && prev.Type == TextNode && n.Type == TextNode {
+ prev.Data += n.Data
+ return
+ }
+
+ // table may be nil here; presumably InsertBefore then appends n as the
+ // last child — confirm against Node.InsertBefore.
+ parent.InsertBefore(n, table)
+}
+
+// addText adds text to the tree. Empty text is ignored. When foster parenting
+// applies, a new text node is handed to fosterParent. Otherwise the text is
+// appended to the top element's last child if that is a text node, or else
+// added as a new text node through addChild.
+func (p *parser) addText(text string) {
+	if len(text) == 0 {
+		return
+	}
+
+	if p.shouldFosterParent() {
+		p.fosterParent(&Node{Type: TextNode, Data: text})
+		return
+	}
+
+	if last := p.top().LastChild; last != nil && last.Type == TextNode {
+		last.Data += text
+		return
+	}
+	p.addChild(&Node{Type: TextNode, Data: text})
+}
+
+// addElement builds an element node from the current token's tag atom, tag
+// name and attributes, and inserts it with addChild.
+func (p *parser) addElement() {
+	elem := &Node{
+		Type:     ElementNode,
+		DataAtom: p.tok.DataAtom,
+		Data:     p.tok.Data,
+		Attr:     p.tok.Attr,
+	}
+	p.addChild(elem)
+}
+
+// addFormattingElement adds an element for the current token and appends it
+// to the list of active formatting elements, first pruning older identical
+// entries from that list.
+// Section 12.2.3.3.
+func (p *parser) addFormattingElement() {
+	newAtom, newAttr := p.tok.DataAtom, p.tok.Attr
+	p.addElement()
+
+	// Implement the Noah's Ark clause, but with three per family instead of
+	// two: scanning back to the last scope marker, count the entries that are
+	// identical to the new element and remove the third and any later match.
+	matches := 0
+	for i := len(p.afe) - 1; i >= 0; i-- {
+		entry := p.afe[i]
+		if entry.Type == scopeMarkerNode {
+			break
+		}
+		if entry.Type != ElementNode || entry.Namespace != "" ||
+			entry.DataAtom != newAtom || len(entry.Attr) != len(newAttr) {
+			continue
+		}
+
+		// The entry is identical only if every one of its attributes has an
+		// exact counterpart (key, namespace and value) on the new element.
+		identical := true
+		for _, have := range entry.Attr {
+			found := false
+			for _, want := range newAttr {
+				if have.Key == want.Key && have.Namespace == want.Namespace && have.Val == want.Val {
+					found = true
+					break
+				}
+			}
+			if !found {
+				identical = false
+				break
+			}
+		}
+		if !identical {
+			continue
+		}
+
+		matches++
+		if matches >= 3 {
+			p.afe.remove(entry)
+		}
+	}
+
+	p.afe = append(p.afe, p.top())
+}
+
+// clearActiveFormattingElements pops entries off the list of active
+// formatting elements up to and including the last scope marker, or until the
+// list is empty if it holds no marker.
+//
+// The length is checked before every pop, so calling this on an already empty
+// list is a no-op rather than a pop from an empty stack.
+// Section 12.2.3.3.
+func (p *parser) clearActiveFormattingElements() {
+	for len(p.afe) > 0 {
+		if n := p.afe.pop(); n.Type == scopeMarkerNode {
+			return
+		}
+	}
+}
+
+// reconstructActiveFormattingElements re-creates the trailing entries of the
+// list of active formatting elements that are neither scope markers nor on
+// the stack of open elements. Each such entry is cloned, the clone is added
+// with addChild, and the clone replaces the entry in the list.
+// Section 12.2.3.3.
+func (p *parser) reconstructActiveFormattingElements() {
+	last := p.afe.top()
+	if last == nil || last.Type == scopeMarkerNode || p.oe.index(last) != -1 {
+		// There is no last entry, or it is a marker or still open: nothing
+		// needs to be reconstructed.
+		return
+	}
+
+	// Rewind: first becomes the index of the earliest entry to clone, i.e. the
+	// one right after the nearest marker or still-open element (or 0 when the
+	// scan reaches the start of the list).
+	first := len(p.afe) - 1
+	for first > 0 {
+		prev := p.afe[first-1]
+		if prev.Type == scopeMarkerNode || p.oe.index(prev) != -1 {
+			break
+		}
+		first--
+	}
+
+	// Advance: clone every entry from first through the end of the list.
+	for i := first; i < len(p.afe); i++ {
+		clone := p.afe[i].clone()
+		p.addChild(clone)
+		p.afe[i] = clone
+	}
+}
+
+// acknowledgeSelfClosingTag clears the parser's hasSelfClosingToken flag.
+// The insertion modes call it after handling a start tag for an element that
+// is popped straight off the stack again (for example base, link, meta, br,
+// hr).
+// Section 12.2.4.
+func (p *parser) acknowledgeSelfClosingTag() {
+ p.hasSelfClosingToken = false
+}
+
+// An insertion mode (section 12.2.3.1) is the state transition function from
+// a particular state in the HTML5 parser's state machine. It updates the
+// parser's fields depending on parser.tok (where ErrorToken means EOF).
+// It returns whether the token was consumed. Modes that return false have
+// usually changed parser.im or rewritten parser.tok first; presumably the
+// caller then dispatches the same token again (the dispatch loop is not part
+// of this section of the file).
+type insertionMode func(*parser) bool
+
+// setOriginalIM remembers the current insertion mode as the one to return to
+// after completing a text or inTableText insertion mode. It panics if a mode
+// has already been remembered.
+// Section 12.2.3.1, "using the rules for".
+func (p *parser) setOriginalIM() {
+	if p.originalIM == nil {
+		p.originalIM = p.im
+		return
+	}
+	panic("html: bad parser state: originalIM was set twice")
+}
+
+// Section 12.2.3.1, "reset the insertion mode".
+func (p *parser) resetInsertionMode() {
+ for i := len(p.oe) - 1; i >= 0; i-- {
+ n := p.oe[i]
+ if i == 0 && p.context != nil {
+ n = p.context
+ }
+
+ switch n.DataAtom {
+ case a.Select:
+ p.im = inSelectIM
+ case a.Td, a.Th:
+ p.im = inCellIM
+ case a.Tr:
+ p.im = inRowIM
+ case a.Tbody, a.Thead, a.Tfoot:
+ p.im = inTableBodyIM
+ case a.Caption:
+ p.im = inCaptionIM
+ case a.Colgroup:
+ p.im = inColumnGroupIM
+ case a.Table:
+ p.im = inTableIM
+ case a.Head:
+ p.im = inBodyIM
+ case a.Body:
+ p.im = inBodyIM
+ case a.Frameset:
+ p.im = inFramesetIM
+ case a.Html:
+ p.im = beforeHeadIM
+ default:
+ continue
+ }
+ return
+ }
+ p.im = inBodyIM
+}
+
+// whitespace is the cutset (space, tab, carriage return, line feed, form
+// feed) that the insertion modes pass to strings.TrimLeft to strip leading
+// whitespace from text tokens.
+const whitespace = " \t\r\n\f"
+
+// initialIM handles the current token in the initial insertion mode.
+// Comments are appended to the document. A doctype is parsed, appended, and
+// decides quirks mode. Whitespace-only text is ignored. Any other token
+// (including text with its leading whitespace removed) turns quirks mode on
+// and is left unconsumed for beforeHTMLIM.
+// Section 12.2.5.4.1.
+func initialIM(p *parser) bool {
+	if p.tok.Type == CommentToken {
+		p.doc.AppendChild(&Node{Type: CommentNode, Data: p.tok.Data})
+		return true
+	}
+	if p.tok.Type == DoctypeToken {
+		doctype, quirks := parseDoctype(p.tok.Data)
+		p.doc.AppendChild(doctype)
+		p.quirks = quirks
+		p.im = beforeHTMLIM
+		return true
+	}
+	if p.tok.Type == TextToken {
+		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
+		if p.tok.Data == "" {
+			// It was all whitespace, so ignore it.
+			return true
+		}
+	}
+
+	p.quirks = true
+	p.im = beforeHTMLIM
+	return false
+}
+
+// beforeHTMLIM handles the current token in the "before html" insertion mode.
+// An html start tag creates the root element and switches to beforeHeadIM.
+// Doctypes, whitespace-only text and end tags other than head, body, html and
+// br are ignored, and comments are appended to the document. Everything else
+// triggers an implied html start tag and is left unconsumed.
+// Section 12.2.5.4.2.
+func beforeHTMLIM(p *parser) bool {
+	switch p.tok.Type {
+	case DoctypeToken:
+		// Ignore the token.
+		return true
+	case CommentToken:
+		p.doc.AppendChild(&Node{Type: CommentNode, Data: p.tok.Data})
+		return true
+	case TextToken:
+		rest := strings.TrimLeft(p.tok.Data, whitespace)
+		p.tok.Data = rest
+		if rest == "" {
+			// It was all whitespace, so ignore it.
+			return true
+		}
+	case StartTagToken:
+		if p.tok.DataAtom == a.Html {
+			p.addElement()
+			p.im = beforeHeadIM
+			return true
+		}
+	case EndTagToken:
+		switch p.tok.DataAtom {
+		case a.Head, a.Body, a.Html, a.Br:
+			// Handled by the implied html start tag below.
+		default:
+			// Ignore the token.
+			return true
+		}
+	}
+
+	p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
+	return false
+}
+
+// beforeHeadIM handles the current token in the "before head" insertion mode.
+// A head start tag creates the head element, records it in p.head and
+// switches to inHeadIM. An html start tag is processed by inBodyIM. Doctypes,
+// whitespace-only text and end tags other than head, body, html and br are
+// ignored, and comments are added to the tree. Everything else triggers an
+// implied head start tag and is left unconsumed.
+// Section 12.2.5.4.3.
+func beforeHeadIM(p *parser) bool {
+	switch p.tok.Type {
+	case DoctypeToken:
+		// Ignore the token.
+		return true
+	case CommentToken:
+		p.addChild(&Node{Type: CommentNode, Data: p.tok.Data})
+		return true
+	case TextToken:
+		rest := strings.TrimLeft(p.tok.Data, whitespace)
+		p.tok.Data = rest
+		if rest == "" {
+			// It was all whitespace, so ignore it.
+			return true
+		}
+	case StartTagToken:
+		if p.tok.DataAtom == a.Html {
+			return inBodyIM(p)
+		}
+		if p.tok.DataAtom == a.Head {
+			p.addElement()
+			p.head = p.top()
+			p.im = inHeadIM
+			return true
+		}
+	case EndTagToken:
+		switch p.tok.DataAtom {
+		case a.Head, a.Body, a.Html, a.Br:
+			// Handled by the implied head start tag below.
+		default:
+			// Ignore the token.
+			return true
+		}
+	}
+
+	p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
+	return false
+}
+
+// inHeadIM handles the current token in the "in head" insertion mode.
+// Leading whitespace of a text token is added to the current node. Void head
+// elements are added and popped immediately. script, title, noscript,
+// noframes and style switch to textIM after remembering the current mode.
+// A head end tag pops the head element and switches to afterHeadIM. Tokens
+// not handled here trigger an implied head end tag and are left unconsumed.
+// Section 12.2.5.4.4.
+func inHeadIM(p *parser) bool {
+	switch p.tok.Type {
+	case DoctypeToken:
+		// Ignore the token.
+		return true
+	case CommentToken:
+		p.addChild(&Node{Type: CommentNode, Data: p.tok.Data})
+		return true
+	case TextToken:
+		rest := strings.TrimLeft(p.tok.Data, whitespace)
+		if lead := len(p.tok.Data) - len(rest); lead > 0 {
+			// Add the initial whitespace to the current node.
+			p.addText(p.tok.Data[:lead])
+			if rest == "" {
+				return true
+			}
+			p.tok.Data = rest
+		}
+	case StartTagToken:
+		switch p.tok.DataAtom {
+		case a.Head:
+			// Ignore the token.
+			return true
+		case a.Html:
+			return inBodyIM(p)
+		case a.Script, a.Title, a.Noscript, a.Noframes, a.Style:
+			p.addElement()
+			p.setOriginalIM()
+			p.im = textIM
+			return true
+		case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta:
+			p.addElement()
+			p.oe.pop()
+			p.acknowledgeSelfClosingTag()
+			return true
+		}
+	case EndTagToken:
+		switch p.tok.DataAtom {
+		case a.Head:
+			if popped := p.oe.pop(); popped.DataAtom != a.Head {
+				panic("html: bad parser state: element not found, in the in-head insertion mode")
+			}
+			p.im = afterHeadIM
+			return true
+		case a.Body, a.Html, a.Br:
+			// Handled by the implied head end tag below.
+		default:
+			// Ignore the token.
+			return true
+		}
+	}
+
+	p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
+	return false
+}
+
+// afterHeadIM handles the current token in the "after head" insertion mode.
+// body and frameset start tags create their element and switch mode. Head
+// content that shows up late is processed by inHeadIM with the head element
+// temporarily pushed back onto the stack of open elements. Tokens not handled
+// here trigger an implied body start tag, set framesetOK to true and are left
+// unconsumed.
+// Section 12.2.5.4.6.
+func afterHeadIM(p *parser) bool {
+	switch p.tok.Type {
+	case DoctypeToken:
+		// Ignore the token.
+		return true
+	case CommentToken:
+		p.addChild(&Node{Type: CommentNode, Data: p.tok.Data})
+		return true
+	case TextToken:
+		rest := strings.TrimLeft(p.tok.Data, whitespace)
+		if lead := len(p.tok.Data) - len(rest); lead > 0 {
+			// Add the initial whitespace to the current node.
+			p.addText(p.tok.Data[:lead])
+			if rest == "" {
+				return true
+			}
+			p.tok.Data = rest
+		}
+	case EndTagToken:
+		switch p.tok.DataAtom {
+		case a.Body, a.Html, a.Br:
+			// Drop down to creating an implied tag.
+		default:
+			// Ignore the token.
+			return true
+		}
+	case StartTagToken:
+		switch p.tok.DataAtom {
+		case a.Head:
+			// Ignore the token.
+			return true
+		case a.Html:
+			return inBodyIM(p)
+		case a.Frameset:
+			p.addElement()
+			p.im = inFramesetIM
+			return true
+		case a.Body:
+			p.addElement()
+			p.framesetOK = false
+			p.im = inBodyIM
+			return true
+		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title:
+			// Re-open the head for the duration of the inHeadIM call.
+			p.oe = append(p.oe, p.head)
+			defer p.oe.remove(p.head)
+			return inHeadIM(p)
+		}
+	}
+
+	p.parseImpliedToken(StartTagToken, a.Body, a.Body.String())
+	p.framesetOK = true
+	return false
+}
+
+// copyAttributes appends to dst every attribute of src whose key dst does not
+// already carry. Attributes are matched by key only, and a key repeated within
+// src is copied once.
+func copyAttributes(dst *Node, src Token) {
+	if len(src.Attr) == 0 {
+		return
+	}
+
+	// present is the set of attribute keys dst has so far.
+	present := make(map[string]struct{}, len(dst.Attr)+len(src.Attr))
+	for _, have := range dst.Attr {
+		present[have.Key] = struct{}{}
+	}
+	for _, extra := range src.Attr {
+		if _, dup := present[extra.Key]; dup {
+			continue
+		}
+		dst.Attr = append(dst.Attr, extra)
+		present[extra.Key] = struct{}{}
+	}
+}
+
+// inBodyIM handles the current token in the "in body" insertion mode.
+// Text, start-tag, end-tag and comment tokens are dispatched below; any other
+// token type falls through the outer switch and is reported as consumed.
+// Apart from the head tags delegated to inHeadIM, it returns false only where
+// a branch rewrites the token or synthesizes an implied one (an image start
+// tag, an html end tag with a body in scope, a br end tag).
+// Section 12.2.5.4.7.
+func inBodyIM(p *parser) bool {
+ switch p.tok.Type {
+ case TextToken:
+ d := p.tok.Data
+ switch n := p.oe.top(); n.DataAtom {
+ case a.Pre, a.Listing:
+ if n.FirstChild == nil {
+ // Ignore a newline at the start of a block.
+ if d != "" && d[0] == '\r' {
+ d = d[1:]
+ }
+ if d != "" && d[0] == '\n' {
+ d = d[1:]
+ }
+ }
+ }
+ // Strip NUL bytes; if nothing is left there is no text to add.
+ d = strings.Replace(d, "\x00", "", -1)
+ if d == "" {
+ return true
+ }
+ p.reconstructActiveFormattingElements()
+ p.addText(d)
+ if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
+ // There were non-whitespace characters inserted.
+ p.framesetOK = false
+ }
+ case StartTagToken:
+ switch p.tok.DataAtom {
+ case a.Html:
+ // Merge any attributes not yet present into p.oe[0].
+ copyAttributes(p.oe[0], p.tok)
+ case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title:
+ return inHeadIM(p)
+ case a.Body:
+ // No element is created: attributes are merged into p.oe[1], and only
+ // when that entry is a body element.
+ if len(p.oe) >= 2 {
+ body := p.oe[1]
+ if body.Type == ElementNode && body.DataAtom == a.Body {
+ p.framesetOK = false
+ copyAttributes(body, p.tok)
+ }
+ }
+ case a.Frameset:
+ if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
+ // Ignore the token.
+ return true
+ }
+ // Detach the body from the tree, pop everything above p.oe[0], then
+ // insert the frameset in its place.
+ body := p.oe[1]
+ if body.Parent != nil {
+ body.Parent.RemoveChild(body)
+ }
+ p.oe = p.oe[:1]
+ p.addElement()
+ p.im = inFramesetIM
+ return true
+ case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul:
+ p.popUntil(buttonScope, a.P)
+ p.addElement()
+ case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
+ p.popUntil(buttonScope, a.P)
+ // A heading directly inside another heading closes the outer one.
+ switch n := p.top(); n.DataAtom {
+ case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
+ p.oe.pop()
+ }
+ p.addElement()
+ case a.Pre, a.Listing:
+ p.popUntil(buttonScope, a.P)
+ p.addElement()
+ // The newline, if any, will be dealt with by the TextToken case.
+ p.framesetOK = false
+ case a.Form:
+ // A form start tag is ignored while p.form is already set.
+ if p.form == nil {
+ p.popUntil(buttonScope, a.P)
+ p.addElement()
+ p.form = p.top()
+ }
+ case a.Li:
+ p.framesetOK = false
+ // Walk down the stack of open elements: an li is popped together with
+ // everything above it; address, div, p and non-special elements are
+ // skipped; any other special element ends the walk.
+ for i := len(p.oe) - 1; i >= 0; i-- {
+ node := p.oe[i]
+ switch node.DataAtom {
+ case a.Li:
+ p.oe = p.oe[:i]
+ case a.Address, a.Div, a.P:
+ continue
+ default:
+ if !isSpecialElement(node) {
+ continue
+ }
+ }
+ break
+ }
+ p.popUntil(buttonScope, a.P)
+ p.addElement()
+ case a.Dd, a.Dt:
+ p.framesetOK = false
+ // Same walk as for li, but the element being closed is a dd or dt.
+ for i := len(p.oe) - 1; i >= 0; i-- {
+ node := p.oe[i]
+ switch node.DataAtom {
+ case a.Dd, a.Dt:
+ p.oe = p.oe[:i]
+ case a.Address, a.Div, a.P:
+ continue
+ default:
+ if !isSpecialElement(node) {
+ continue
+ }
+ }
+ break
+ }
+ p.popUntil(buttonScope, a.P)
+ p.addElement()
+ case a.Plaintext:
+ p.popUntil(buttonScope, a.P)
+ p.addElement()
+ case a.Button:
+ p.popUntil(defaultScope, a.Button)
+ p.reconstructActiveFormattingElements()
+ p.addElement()
+ p.framesetOK = false
+ case a.A:
+ // If an a element is already active after the last scope marker, close
+ // it first and drop it from both the stack and the active list.
+ for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
+ if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
+ p.inBodyEndTagFormatting(a.A)
+ p.oe.remove(n)
+ p.afe.remove(n)
+ break
+ }
+ }
+ p.reconstructActiveFormattingElements()
+ p.addFormattingElement()
+ case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
+ p.reconstructActiveFormattingElements()
+ p.addFormattingElement()
+ case a.Nobr:
+ p.reconstructActiveFormattingElements()
+ if p.elementInScope(defaultScope, a.Nobr) {
+ p.inBodyEndTagFormatting(a.Nobr)
+ p.reconstructActiveFormattingElements()
+ }
+ p.addFormattingElement()
+ case a.Applet, a.Marquee, a.Object:
+ p.reconstructActiveFormattingElements()
+ p.addElement()
+ // Push a scope marker onto the list of active formatting elements.
+ p.afe = append(p.afe, &scopeMarker)
+ p.framesetOK = false
+ case a.Table:
+ // In quirks mode an open p is not closed before the table.
+ if !p.quirks {
+ p.popUntil(buttonScope, a.P)
+ }
+ p.addElement()
+ p.framesetOK = false
+ p.im = inTableIM
+ return true
+ case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
+ p.reconstructActiveFormattingElements()
+ p.addElement()
+ p.oe.pop()
+ p.acknowledgeSelfClosingTag()
+ if p.tok.DataAtom == a.Input {
+ for _, t := range p.tok.Attr {
+ if t.Key == "type" {
+ if strings.ToLower(t.Val) == "hidden" {
+ // Skip setting framesetOK = false
+ return true
+ }
+ }
+ }
+ }
+ p.framesetOK = false
+ case a.Param, a.Source, a.Track:
+ p.addElement()
+ p.oe.pop()
+ p.acknowledgeSelfClosingTag()
+ case a.Hr:
+ p.popUntil(buttonScope, a.P)
+ p.addElement()
+ p.oe.pop()
+ p.acknowledgeSelfClosingTag()
+ p.framesetOK = false
+ case a.Image:
+ // Rewrite the token as an img start tag and report it as not consumed.
+ p.tok.DataAtom = a.Img
+ p.tok.Data = a.Img.String()
+ return false
+ case a.Isindex:
+ if p.form != nil {
+ // Ignore the token.
+ return true
+ }
+ // Expand isindex into an implied form holding an hr, a label (the
+ // prompt text followed by an input named "isindex") and another hr.
+ // The action and prompt attributes are consumed here, name is dropped,
+ // and all remaining attributes are passed on to the input.
+ action := ""
+ prompt := "This is a searchable index. Enter search keywords: "
+ attr := []Attribute{{Key: "name", Val: "isindex"}}
+ for _, t := range p.tok.Attr {
+ switch t.Key {
+ case "action":
+ action = t.Val
+ case "name":
+ // Ignore the attribute.
+ case "prompt":
+ prompt = t.Val
+ default:
+ attr = append(attr, t)
+ }
+ }
+ p.acknowledgeSelfClosingTag()
+ p.popUntil(buttonScope, a.P)
+ p.parseImpliedToken(StartTagToken, a.Form, a.Form.String())
+ if action != "" {
+ p.form.Attr = []Attribute{{Key: "action", Val: action}}
+ }
+ p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
+ p.parseImpliedToken(StartTagToken, a.Label, a.Label.String())
+ p.addText(prompt)
+ p.addChild(&Node{
+ Type: ElementNode,
+ DataAtom: a.Input,
+ Data: a.Input.String(),
+ Attr: attr,
+ })
+ p.oe.pop()
+ p.parseImpliedToken(EndTagToken, a.Label, a.Label.String())
+ p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
+ p.parseImpliedToken(EndTagToken, a.Form, a.Form.String())
+ case a.Textarea:
+ p.addElement()
+ p.setOriginalIM()
+ p.framesetOK = false
+ p.im = textIM
+ case a.Xmp:
+ p.popUntil(buttonScope, a.P)
+ p.reconstructActiveFormattingElements()
+ p.framesetOK = false
+ p.addElement()
+ p.setOriginalIM()
+ p.im = textIM
+ case a.Iframe:
+ p.framesetOK = false
+ p.addElement()
+ p.setOriginalIM()
+ p.im = textIM
+ case a.Noembed, a.Noscript:
+ p.addElement()
+ p.setOriginalIM()
+ p.im = textIM
+ case a.Select:
+ p.reconstructActiveFormattingElements()
+ p.addElement()
+ p.framesetOK = false
+ p.im = inSelectIM
+ return true
+ case a.Optgroup, a.Option:
+ // An option that is still open is closed by the next option/optgroup.
+ if p.top().DataAtom == a.Option {
+ p.oe.pop()
+ }
+ p.reconstructActiveFormattingElements()
+ p.addElement()
+ case a.Rp, a.Rt:
+ if p.elementInScope(defaultScope, a.Ruby) {
+ p.generateImpliedEndTags()
+ }
+ p.addElement()
+ case a.Math, a.Svg:
+ p.reconstructActiveFormattingElements()
+ if p.tok.DataAtom == a.Math {
+ adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
+ } else {
+ adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
+ }
+ adjustForeignAttributes(p.tok.Attr)
+ p.addElement()
+ // The tag name ("math" or "svg") doubles as the new node's namespace.
+ p.top().Namespace = p.tok.Data
+ if p.hasSelfClosingToken {
+ p.oe.pop()
+ p.acknowledgeSelfClosingTag()
+ }
+ return true
+ case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
+ // Ignore the token.
+ default:
+ p.reconstructActiveFormattingElements()
+ p.addElement()
+ }
+ case EndTagToken:
+ switch p.tok.DataAtom {
+ case a.Body:
+ if p.elementInScope(defaultScope, a.Body) {
+ p.im = afterBodyIM
+ }
+ case a.Html:
+ // With a body in scope, imply its end tag and leave </html> unconsumed;
+ // otherwise the token is ignored.
+ if p.elementInScope(defaultScope, a.Body) {
+ p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
+ return false
+ }
+ return true
+ case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul:
+ p.popUntil(defaultScope, p.tok.DataAtom)
+ case a.Form:
+ // p.form is cleared even when the end tag ends up being ignored.
+ node := p.form
+ p.form = nil
+ i := p.indexOfElementInScope(defaultScope, a.Form)
+ if node == nil || i == -1 || p.oe[i] != node {
+ // Ignore the token.
+ return true
+ }
+ p.generateImpliedEndTags()
+ p.oe.remove(node)
+ case a.P:
+ // With no p in button scope, imply a p start tag first so that there is
+ // an element to close.
+ if !p.elementInScope(buttonScope, a.P) {
+ p.parseImpliedToken(StartTagToken, a.P, a.P.String())
+ }
+ p.popUntil(buttonScope, a.P)
+ case a.Li:
+ p.popUntil(listItemScope, a.Li)
+ case a.Dd, a.Dt:
+ p.popUntil(defaultScope, p.tok.DataAtom)
+ case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
+ // Any heading end tag closes the nearest open heading of any level.
+ p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
+ case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
+ p.inBodyEndTagFormatting(p.tok.DataAtom)
+ case a.Applet, a.Marquee, a.Object:
+ if p.popUntil(defaultScope, p.tok.DataAtom) {
+ p.clearActiveFormattingElements()
+ }
+ case a.Br:
+ // Reinterpret </br> as a br start tag and report it as not consumed.
+ p.tok.Type = StartTagToken
+ return false
+ default:
+ p.inBodyEndTagOther(p.tok.DataAtom)
+ }
+ case CommentToken:
+ p.addChild(&Node{
+ Type: CommentNode,
+ Data: p.tok.Data,
+ })
+ }
+
+ return true
+}
+
+// inBodyEndTagFormatting handles an end tag for the formatting element named
+// by tagAtom while in the in-body insertion mode. It looks up the most recent
+// entry for tagAtom in the list of active formatting elements (after the last
+// scope marker). If there is none it falls back to inBodyEndTagOther. If no
+// special element sits at or above that entry on the stack of open elements,
+// the stack is simply popped through it. Otherwise the nodes between the
+// formatting element and that special element (the "furthest block") are
+// cloned and re-parented as described step by step below.
+func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
+ // This is the "adoption agency" algorithm, described at
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency
+
+ // TODO: this is a fairly literal line-by-line translation of that algorithm.
+ // Once the code successfully parses the comprehensive test suite, we should
+ // refactor this code to be more idiomatic.
+
+ // Steps 1-3. The outer loop.
+ // It runs at most 8 times; every early exit below returns from the function.
+ for i := 0; i < 8; i++ {
+ // Step 4. Find the formatting element.
+ var formattingElement *Node
+ for j := len(p.afe) - 1; j >= 0; j-- {
+ if p.afe[j].Type == scopeMarkerNode {
+ break
+ }
+ if p.afe[j].DataAtom == tagAtom {
+ formattingElement = p.afe[j]
+ break
+ }
+ }
+ if formattingElement == nil {
+ // No matching active formatting element: treat it like any other end tag.
+ p.inBodyEndTagOther(tagAtom)
+ return
+ }
+ feIndex := p.oe.index(formattingElement)
+ if feIndex == -1 {
+ // The element is no longer on the stack of open elements: just drop it
+ // from the list of active formatting elements.
+ p.afe.remove(formattingElement)
+ return
+ }
+ if !p.elementInScope(defaultScope, tagAtom) {
+ // Ignore the tag.
+ return
+ }
+
+ // Steps 5-6. Find the furthest block.
+ // This is the first special element at or above the formatting element
+ // on the stack of open elements.
+ var furthestBlock *Node
+ for _, e := range p.oe[feIndex:] {
+ if isSpecialElement(e) {
+ furthestBlock = e
+ break
+ }
+ }
+ if furthestBlock == nil {
+ // No special element in the way: pop the stack through the formatting
+ // element and remove it from the list of active formatting elements.
+ e := p.oe.pop()
+ for e != formattingElement {
+ e = p.oe.pop()
+ }
+ p.afe.remove(e)
+ return
+ }
+
+ // Steps 7-8. Find the common ancestor and bookmark node.
+ // The common ancestor is the stack entry just below the formatting
+ // element; the bookmark is a position in the list of active formatting
+ // elements.
+ commonAncestor := p.oe[feIndex-1]
+ bookmark := p.afe.index(formattingElement)
+
+ // Step 9. The inner loop. Find the lastNode to reparent.
+ lastNode := furthestBlock
+ node := furthestBlock
+ x := p.oe.index(node)
+ // Steps 9.1-9.3.
+ // The inner loop runs at most 3 times.
+ for j := 0; j < 3; j++ {
+ // Step 9.4.
+ x--
+ node = p.oe[x]
+ // Step 9.5.
+ // A node that is not an active formatting element is dropped from the
+ // stack of open elements.
+ if p.afe.index(node) == -1 {
+ p.oe.remove(node)
+ continue
+ }
+ // Step 9.6.
+ if node == formattingElement {
+ break
+ }
+ // Step 9.7.
+ // Replace node by a clone in both the active list and the stack.
+ clone := node.clone()
+ p.afe[p.afe.index(node)] = clone
+ p.oe[p.oe.index(node)] = clone
+ node = clone
+ // Step 9.8.
+ if lastNode == furthestBlock {
+ bookmark = p.afe.index(node) + 1
+ }
+ // Step 9.9.
+ if lastNode.Parent != nil {
+ lastNode.Parent.RemoveChild(lastNode)
+ }
+ node.AppendChild(lastNode)
+ // Step 9.10.
+ lastNode = node
+ }
+
+ // Step 10. Reparent lastNode to the common ancestor,
+ // or for misnested table nodes, to the foster parent.
+ if lastNode.Parent != nil {
+ lastNode.Parent.RemoveChild(lastNode)
+ }
+ switch commonAncestor.DataAtom {
+ case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
+ p.fosterParent(lastNode)
+ default:
+ commonAncestor.AppendChild(lastNode)
+ }
+
+ // Steps 11-13. Reparent nodes from the furthest block's children
+ // to a clone of the formatting element.
+ clone := formattingElement.clone()
+ reparentChildren(clone, furthestBlock)
+ furthestBlock.AppendChild(clone)
+
+ // Step 14. Fix up the list of active formatting elements.
+ if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
+ // Move the bookmark with the rest of the list.
+ bookmark--
+ }
+ p.afe.remove(formattingElement)
+ p.afe.insert(bookmark, clone)
+
+ // Step 15. Fix up the stack of open elements.
+ // The clone goes directly above the furthest block.
+ p.oe.remove(formattingElement)
+ p.oe.insert(p.oe.index(furthestBlock)+1, clone)
+ }
+}
+
+// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
+// Scanning the stack of open elements from the top, it pops the first element
+// named tagAtom together with everything above it. The scan gives up, leaving
+// the stack untouched, as soon as it meets a special element that does not
+// match.
+func (p *parser) inBodyEndTagOther(tagAtom a.Atom) {
+	for i := len(p.oe) - 1; i >= 0; i-- {
+		switch n := p.oe[i]; {
+		case n.DataAtom == tagAtom:
+			p.oe = p.oe[:i]
+			return
+		case isSpecialElement(n):
+			return
+		}
+	}
+}
+
+// Section 12.2.5.4.8.
+func textIM(p *parser) bool {
+ switch p.tok.Type {
+ case ErrorToken:
+ p.oe.pop()
+ case TextToken:
+ d := p.tok.Data
+ if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
+ // Ignore a newline at the start of a