//
// HtmlTreeBuilderState.swift
// SwiftSoup
//
// Created by Nabil Chatbi on 24/10/16.
// Copyright © 2016 Nabil Chatbi.. All rights reserved.
//
import Foundation
/// Contract for a tree-builder state that can consume one token at a time.
protocol HtmlTreeBuilderStateProtocol {
    /// Handles a single token on behalf of the given tree builder.
    ///
    /// - Parameters:
    ///   - t: The token to handle.
    ///   - tb: The tree builder the token is handled against.
    /// - Returns: A `Bool` outcome decided by the conforming state.
    /// - Throws: Any error raised while handling the token.
    func process(_ t: Token, _ tb: HtmlTreeBuilder) throws -> Bool
}
enum HtmlTreeBuilderState: String, HtmlTreeBuilderStateProtocol {
// Tree-builder states. The enum is `String`-backed and no explicit raw values are given,
// so each case's `rawValue` is its own name; renaming a case also changes its `rawValue`.
case Initial
case BeforeHtml
case BeforeHead
case InHead
case InHeadNoscript
case AfterHead
case InBody
case Text
case InTable
case InTableText
case InCaption
case InColumnGroup
case InTableBody
case InRow
case InCell
case InSelect
case InSelectInTable
case AfterBody
case InFrameset
case AfterFrameset
case AfterAfterBody
case AfterAfterFrameset
case ForeignContent
/// Caseless namespace holding fixed lists of lowercase tag names.
/// The state handlers test membership in these lists with `contains`.
private enum TagSets {
    // Document-structure tags.
    static let outer: [String] = ["head", "body", "html", "br"]
    static let outer2: [String] = ["body", "html", "br"]
    static let outer3: [String] = ["body", "html"]

    // Head-level tags.
    static let baseEtc: [String] = ["base", "basefont", "bgsound", "command", "link"]
    static let baseEtc2: [String] = ["basefont", "bgsound", "link", "meta", "noframes", "style"]
    static let baseEtc3: [String] = [
        "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title"
    ]
    static let headNoscript: [String] = ["head", "noscript"]

    // Table-related tags.
    static let table: [String] = ["table", "tbody", "tfoot", "thead", "tr"]
    static let tableSections: [String] = ["tbody", "tfoot", "thead"]
    static let tableMix: [String] = [
        "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"
    ]
    static let tableMix2: [String] = [
        "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"
    ]
    static let tableMix3: [String] = ["caption", "col", "colgroup", "tbody", "tfoot", "thead"]
    static let tableMix4: [String] = ["body", "caption", "col", "colgroup", "html", "td", "th", "tr"]
    static let tableMix5: [String] = ["caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr"]
    static let tableMix6: [String] = ["body", "caption", "col", "colgroup", "html", "td", "th"]
    static let tableMix7: [String] = ["body", "caption", "col", "colgroup", "html"]
    static let tableMix8: [String] = ["caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"]
    static let tableRowsAndCols: [String] = [
        "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"
    ]
    static let thTd: [String] = ["th", "td"]

    // Form-control tags.
    static let inputKeygenTextarea: [String] = ["input", "keygen", "textarea"]
}
/// A string containing only the NUL character (U+0000).
private static let nullString: String = "\u{0000}"
/// Reports whether this state and `s` are the same case.
///
/// Compares with the enum's own `==` rather than comparing `hashValue`s:
/// equal hash values do not guarantee equal values, so a hash comparison
/// is not a sound equality test (and it hashes both operands needlessly).
///
/// - Parameter s: The state to compare against.
/// - Returns: `true` when `self` and `s` are the same case, otherwise `false`.
public func equals(_ s: HtmlTreeBuilderState) -> Bool {
    return self == s
}
func process(_ t: Token, _ tb: HtmlTreeBuilder)throws->Bool {
switch self {
case .Initial:
if (HtmlTreeBuilderState.isWhitespace(t)) {
return true // ignore whitespace
} else if (t.isComment()) {
try tb.insert(t.asComment())
} else if (t.isDoctype()) {
// todo: parse error check on expected doctypes
// todo: quirk state check on doctype ids
let d: Token.Doctype = t.asDoctype()
let doctype: DocumentType = DocumentType(
tb.settings.normalizeTag(d.getName()), d.getPubSysKey(), d.getPublicIdentifier(), d.getSystemIdentifier(), tb.getBaseUri())
//tb.settings.normalizeTag(d.getName()), d.getPublicIdentifier(), d.getSystemIdentifier(), tb.getBaseUri())
try tb.getDocument().appendChild(doctype)
if (d.isForceQuirks()) {
tb.getDocument().quirksMode(Document.QuirksMode.quirks)
}
tb.transition(.BeforeHtml)
} else {
// todo: check not iframe srcdoc
tb.transition(.BeforeHtml)
return try tb.process(t) // re-process token
}
return true
case .BeforeHtml:
/// Fallback for the `.BeforeHtml` case: inserts an `html` start tag, moves the
/// builder to `.BeforeHead`, and hands the same token back to the builder.
/// - Returns: The result of `tb.process(t)`.
func anythingElse(_ t: Token, _ tb: HtmlTreeBuilder) throws -> Bool {
    try tb.insertStartTag("html")
    tb.transition(.BeforeHead)
    let handled: Bool = try tb.process(t)
    return handled
}
if (t.isDoctype()) {
tb.error(self)
return false
} else if (t.isComment()) {
try tb.insert(t.asComment())
} else if (HtmlTreeBuilderState.isWhitespace(t)) {
return true // ignore whitespace
} else if t.startTagNormalName() == "html" {
try tb.insert(t.asStartTag())
tb.transition(.BeforeHead)
} else if let nName = t.endTagNormalName(), TagSets.outer.contains(nName) {
return try anythingElse(t, tb)
} else if (t.isEndTag()) {
tb.error(self)
return false
} else {
return try anythingElse(t, tb)
}
return true
case .BeforeHead:
if (HtmlTreeBuilderState.isWhitespace(t)) {
return true
} else if (t.isComment()) {
try tb.insert(t.asComment())
} else if (t.isDoctype()) {
tb.error(self)
return false
} else if t.startTagNormalName() == "html" {
return try HtmlTreeBuilderState.InBody.process(t, tb) // does not transition
} else if t.startTagNormalName() == "head" {
let head: Element = try tb.insert(t.asStartTag())
tb.setHeadElement(head)
tb.transition(.InHead)
} else if let nName = t.endTagNormalName(), TagSets.outer.contains(nName) {
try tb.processStartTag("head")
return try tb.process(t)
} else if (t.isEndTag()) {
tb.error(self)
return false
} else {
try tb.processStartTag("head")
return try tb.process(t)
}
return true
case .InHead:
/// Fallback for the `.InHead` case: processes a `head` end tag on the builder,
/// then hands the same token back to the builder.
/// - Returns: The result of `tb.process(t)`.
func anythingElse(_ t: Token, _ tb: TreeBuilder) throws -> Bool {
    try tb.processEndTag("head")
    let handled: Bool = try tb.process(t)
    return handled
}
if (HtmlTreeBuilderState.isWhitespace(t)) {
try tb.insert(t.asCharacter())
return true
}
switch (t.type) {
case .Comment:
try tb.insert(t.asComment())
break
case .Doctype:
tb.error(self)
return false
case .StartTag:
let start: Token.StartTag = t.asStartTag()
let name: String = start.normalName()!
if (name.equals("html")) {
return try HtmlTreeBuilderState.InBody.process(t, tb)
} else if TagSets.baseEtc.contains(name) {
let el: Element = try tb.insertEmpty(start)
// SwiftSoup special: update base the first time it is seen
if (name.equals("base") && el.hasAttr("href")) {
try tb.maybeSetBaseUri(el)
}
} else if (name.equals("meta")) {
let _: Element = try tb.insertEmpty(start)
// todo: charset switches
} else if (name.equals("title")) {
try HtmlTreeBuilderState.handleRcData(start, tb)
} else if name == "noframes" || name == "style" {
try HtmlTreeBuilderState.handleRawtext(start, tb)
} else if (name.equals("noscript")) {
// else if noscript && scripting flag = true: rawtext (SwiftSoup doesn't run script, to handle as noscript)
try tb.insert(start)
tb.transition(.InHeadNoscript)
} else if (name.equals("script")) {
// skips some script rules as won't execute them
tb.tokeniser.transition(TokeniserState.ScriptData)
tb.markInsertionMode()
tb.transition(.Text)
try tb.insert(start)
} else if (name.equals("head")) {
tb.error(self)
return false
} else {
return try anythingElse(t, tb)
}
break
case .EndTag:
let end: Token.EndTag = t.asEndTag()
let name = end.normalName()
if (name?.equals("head"))! {
tb.pop()
tb.transition(.AfterHead)
} else if let name = name, TagSets.outer2.contains(name) {
return try anythingElse(t, tb)
} else {
tb.error(self)
return false
}
break
default:
return try anythingElse(t, tb)
}
return true
case .InHeadNoscript:
/// Fallback for the `.InHeadNoscript` case: reports an error for this state, then
/// inserts the token's string form as a character token.
/// - Returns: Always `true`.
func anythingElse(_ t: Token, _ tb: HtmlTreeBuilder) throws -> Bool {
    tb.error(self)
    // Wrap the token's textual form in a character token before inserting it.
    let characters = Token.Char().data(t.toString())
    try tb.insert(characters)
    return true
}
if (t.isDoctype()) {
tb.error(self)
} else if t.startTagNormalName() == "html" {
return try tb.process(t, .InBody)
} else if t.endTagNormalName() == "noscript" {
tb.pop()
tb.transition(.InHead)
} else if HtmlTreeBuilderState.isWhitespace(t) || t.isComment() || (t.isStartTag() && TagSets.baseEtc2.contains(t.asStartTag().normalName()!)) {
return try tb.process(t, .InHead)
} else if t.endTagNormalName() == "br" {
return try anythingElse(t, tb)
} else if (t.isStartTag() && TagSets.headNoscript.contains(t.asStartTag().normalName()!)) || t.isEndTag() {
tb.error(self)
return false
} else {
return try anythingElse(t, tb)
}
return true
case .AfterHead:
/// Fallback for the `.AfterHead` case: processes a `body` start tag on the builder,
/// sets the builder's frameset-ok flag to `true`, and hands the same token back to it.
/// - Returns: The result of `tb.process(t)`; callers may ignore it.
@discardableResult
func anythingElse(_ t: Token, _ tb: HtmlTreeBuilder) throws -> Bool {
    try tb.processStartTag("body")
    tb.framesetOk(true)
    let handled: Bool = try tb.process(t)
    return handled
}
if (HtmlTreeBuilderState.isWhitespace(t)) {
try tb.insert(t.asCharacter())
} else if (t.isComment()) {
try tb.insert(t.asComment())
} else if (t.isDoctype()) {
tb.error(self)
} else if (t.isStartTag()) {
let startTag: Token.StartTag = t.asStartTag()
let name: String = startTag.normalName()!
if (name.equals("html")) {
return try tb.process(t, .InBody)
} else if (name.equals("body")) {
try tb.insert(startTag)
tb.framesetOk(false)
tb.transition(.InBody)
} else if (name.equals("frameset")) {
try tb.insert(startTag)
tb.transition(.InFrameset)
} else if TagSets.baseEtc3.contains(name) {
tb.error(self)
let head: Element = tb.getHeadElement()!
tb.push(head)
try tb.process(t, .InHead)
tb.removeFromStack(head)
} else if (name.equals("head")) {
tb.error(self)
return false
} else {
try anythingElse(t, tb)
}
} else if (t.isEndTag()) {
if TagSets.outer3.contains(t.asEndTag().normalName()!) {
try anythingElse(t, tb)
} else {
tb.error(self)
return false
}
} else {
try anythingElse(t, tb)
}
return true
case .InBody:
func anyOtherEndTag(_ t: Token, _ tb: HtmlTreeBuilder) -> Bool {
let name: String? = t.asEndTag().normalName()
let stack: Array, unless in svg
} else {
try tb.insert(startTag)
}
} else if (name.equals("isindex")) {
// how much do we care about the early 90s?
tb.error(self)
if (tb.getFormElement() != nil) {
return false
}
tb.tokeniser.acknowledgeSelfClosingFlag()
try tb.processStartTag("form")
if (startTag._attributes.hasKey(key: "action")) {
if let form: Element = tb.getFormElement() {
try form.attr("action", startTag._attributes.get(key: "action"))
}
}
try tb.processStartTag("hr")
try tb.processStartTag("label")
// hope you like english.
let prompt: String = startTag._attributes.hasKey(key: "prompt") ?
startTag._attributes.get(key: "prompt") :
"self is a searchable index. Enter search keywords: "
try tb.process(Token.Char().data(prompt))
// input
let inputAttribs: Attributes = Attributes()
for attr: Attribute in startTag._attributes {
if (!Constants.InBodyStartInputAttribs.contains(attr.getKey())) {
inputAttribs.put(attribute: attr)
}
}
try inputAttribs.put("name", "isindex")
try tb.processStartTag("input", inputAttribs)
try tb.processEndTag("label")
try tb.processStartTag("hr")
try tb.processEndTag("form")
} else if (name.equals("textarea")) {
try tb.insert(startTag)
// todo: If the next token is a U+000A LINE FEED (LF) character token, then ignore that token and move on to the next one. (Newlines at the start of textarea elements are ignored as an authoring convenience.)
tb.tokeniser.transition(TokeniserState.Rcdata)
tb.markInsertionMode()
tb.framesetOk(false)
tb.transition(.Text)
} else if (name.equals("xmp")) {
if (try tb.inButtonScope("p")) {
try tb.processEndTag("p")
}
try tb.reconstructFormattingElements()
tb.framesetOk(false)
try HtmlTreeBuilderState.handleRawtext(startTag, tb)
} else if (name.equals("iframe")) {
tb.framesetOk(false)
try HtmlTreeBuilderState.handleRawtext(startTag, tb)
} else if (name.equals("noembed")) {
// also handle noscript if script enabled
try HtmlTreeBuilderState.handleRawtext(startTag, tb)
} else if (name.equals("select")) {
try tb.reconstructFormattingElements()
try tb.insert(startTag)
tb.framesetOk(false)
let state: HtmlTreeBuilderState = tb.state()
if (state.equals(.InTable) || state.equals(.InCaption) || state.equals(.InTableBody) || state.equals(.InRow) || state.equals(.InCell)) {
tb.transition(.InSelectInTable)
} else {
tb.transition(.InSelect)
}
} else if Constants.InBodyStartOptions.contains(name) {
if (tb.currentElement() != nil && tb.currentElement()!.nodeName().equals("option")) {
try tb.processEndTag("option")
}
try tb.reconstructFormattingElements()
try tb.insert(startTag)
} else if Constants.InBodyStartRuby.contains(name) {
if (try tb.inScope("ruby")) {
tb.generateImpliedEndTags()
if (tb.currentElement() != nil && !tb.currentElement()!.nodeName().equals("ruby")) {
tb.error(self)
tb.popStackToBefore("ruby") // i.e. close up to but not include name
}
try tb.insert(startTag)
}
} else if (name.equals("math")) {
try tb.reconstructFormattingElements()
// todo: handle A start tag whose tag name is "math" (i.e. foreign, mathml)
try tb.insert(startTag)
tb.tokeniser.acknowledgeSelfClosingFlag()
} else if (name.equals("svg")) {
try tb.reconstructFormattingElements()
// todo: handle A start tag whose tag name is "svg" (xlink, svg)
try tb.insert(startTag)
tb.tokeniser.acknowledgeSelfClosingFlag()
} else if Constants.InBodyStartDrop.contains(name) {
tb.error(self)
return false
} else {
try tb.reconstructFormattingElements()
try tb.insert(startTag)
}
} else {
try tb.reconstructFormattingElements()
try tb.insert(startTag)
}
break
case .EndTag:
let endTag: Token.EndTag = t.asEndTag()
if let name = endTag.normalName() {
if Constants.InBodyEndAdoptionFormatters.contains(name) {
// Adoption Agency Algorithm.
for _ in 0..<8 {
let formatEl: Element? = tb.getActiveFormattingElement(name)
if (formatEl == nil) {
return anyOtherEndTag(t, tb)
} else if (!tb.onStack(formatEl!)) {
tb.error(self)
tb.removeFromActiveFormattingElements(formatEl!)
return true
} else if (try !tb.inScope(formatEl!.nodeName())) {
tb.error(self)
return false
} else if (tb.currentElement() != formatEl!) {
tb.error(self)
}
var furthestBlock: Element? = nil
var commonAncestor: Element? = nil
var seenFormattingElement: Bool = false
let stack: Array