//
//  CharacterReader.swift
//  SwiftSoup
//
//  Created by Nabil Chatbi on 10/10/16.
//  Copyright © 2016 Nabil Chatbi.. All rights reserved.
//

import Foundation

/**
 CharacterReader consumes tokens off a string. To replace the old TokenQueue.

 The reader walks the Unicode scalar view of the input and keeps two indices into it:
 the current read position and a single "mark" that the position can be rewound to.
 */
public final class CharacterReader {
    private static let empty = ""
    public static let EOF: UnicodeScalar = "\u{FFFF}"//65535

    // Single locale shared by the lower- and upper-casing done in containsIgnoreCase,
    // so both conversions follow the same casing rules.
    private static let caseScanLocale = Locale(identifier: "en")

    private let input: String.UnicodeScalarView
    private var pos: String.UnicodeScalarView.Index
    private var mark: String.UnicodeScalarView.Index
    //private let stringCache: Array // holds reused strings in this doc, to lessen garbage

    /**
     * Create a reader positioned at the start of the given string.
     *
     * - Parameter input: the text to read; it is accessed through its Unicode scalar view.
     */
    public init(_ input: String) {
        let scalars = input.unicodeScalars
        self.input = scalars
        self.pos = scalars.startIndex
        self.mark = scalars.startIndex
    }

    /**
     * - Returns: the number of Unicode scalars between the start of the input and the current position.
     */
    public func getPos() -> Int {
        return input.distance(from: input.startIndex, to: pos)
    }

    /**
     * - Returns: true if the current position is at (or past) the end of the input.
     */
    public func isEmpty() -> Bool {
        return pos >= input.endIndex
    }

    /**
     * - Returns: the scalar at the current position without consuming it, or `EOF` at end of input.
     */
    public func current() -> UnicodeScalar {
        return (pos >= input.endIndex) ? CharacterReader.EOF : input[pos]
    }

    /**
     * Read the scalar at the current position and move past it.
     *
     * - Returns: the consumed scalar, or `EOF` (position unchanged) at end of input.
     */
    @discardableResult
    public func consume() -> UnicodeScalar {
        guard pos < input.endIndex else {
            return CharacterReader.EOF
        }
        let val = input[pos]
        pos = input.index(after: pos)
        return val
    }

    /**
     * Step the position back by one scalar. Does nothing at the start of the input.
     */
    public func unconsume() {
        guard pos > input.startIndex else { return }
        pos = input.index(before: pos)
    }

    /**
     * Step the position forward by one scalar. Does nothing at the end of the input.
     */
    public func advance() {
        guard pos < input.endIndex else { return }
        pos = input.index(after: pos)
    }

    /**
     * Remember the current position so that `rewindToMark()` can return to it.
     */
    public func markPos() {
        mark = pos
    }

    /**
     * Move the position back to the last marked position (the start of the input if none was marked).
     */
    public func rewindToMark() {
        pos = mark
    }

    /**
     * Consume one scalar and return it as a string.
     *
     * - Returns: a one-scalar string, or the empty string (position unchanged) at end of input.
     */
    public func consumeAsString() -> String {
        guard pos < input.endIndex else { return CharacterReader.empty }
        let str = String(input[pos])
        pos = input.index(after: pos)
        return str
    }

    /**
     * Locate the next occurrence of a Unicode scalar
     *
     * - Parameter c: scan target
     * - Returns: index of the next instance of target at or after the current position. nil if not found.
     */
    public func nextIndexOf(_ c: UnicodeScalar) -> String.UnicodeScalarView.Index? {
        // doesn't handle scanning for surrogates
        return input[pos...].firstIndex(of: c)
    }

    /**
     * Locate the next occurrence of a target string
     *
     * - Parameter seq: scan target
     * - Returns: index of next instance of target. nil if not found.
     *            An empty target matches at the current position.
     */
    public func nextIndexOf(_ seq: String) -> String.UnicodeScalarView.Index? {
        // doesn't handle scanning for surrogates
        var start = pos
        let targetScalars = seq.unicodeScalars
        guard let firstChar = targetScalars.first else { return pos } // search for "" -> current place

        MATCH: while true {
            // Match on first scalar
            guard let firstCharIx = input[start...].firstIndex(of: firstChar) else { return nil }

            var current = firstCharIx

            // Then manually match subsequent scalars
            for scalar in targetScalars.dropFirst() {
                current = input.index(after: current)
                // Ran off the end mid-match: no later candidate can fit either.
                guard current < input.endIndex else { return nil }

                if input[current] != scalar {
                    // Mismatch: resume the search just after this candidate's first scalar.
                    start = input.index(after: firstCharIx)
                    continue MATCH
                }
            }

            // full match; current is at position of last matching character
            return firstCharIx
        }
    }

    /**
     * Consume up to, but not including, the next occurrence of a scalar.
     *
     * - Parameter c: scalar to stop at
     * - Returns: the consumed text; the whole remainder of the input if the scalar is not found.
     */
    public func consumeTo(_ c: UnicodeScalar) -> String {
        guard let targetIx = nextIndexOf(c) else {
            return consumeToEnd()
        }
        let consumed = cacheString(pos, targetIx)
        pos = targetIx
        return consumed
    }

    /**
     * Consume up to, but not including, the next occurrence of a string.
     *
     * - Parameter seq: string to stop at
     * - Returns: the consumed text; the whole remainder of the input if the string is not found.
     */
    public func consumeTo(_ seq: String) -> String {
        guard let targetIx = nextIndexOf(seq) else {
            return consumeToEnd()
        }
        let consumed = cacheString(pos, targetIx)
        pos = targetIx
        return consumed
    }

    /**
     * Variadic convenience for `consumeToAny(_:)` taking an array.
     */
    public func consumeToAny(_ chars: UnicodeScalar...) -> String {
        return consumeToAny(chars)
    }

    /**
     * Consume until the current scalar is one of the given scalars, or the input ends.
     *
     * - Parameter chars: scalars to stop at (the stopping scalar is not consumed)
     * - Returns: the consumed text, possibly empty.
     */
    public func consumeToAny(_ chars: [UnicodeScalar]) -> String {
        let start = pos
        skip(while: { !chars.contains($0) })
        return cacheString(start, pos)
    }

    /**
     * Same as `consumeToAny`; the argument order is not relied upon.
     */
    public func consumeToAnySorted(_ chars: UnicodeScalar...) -> String {
        return consumeToAny(chars)
    }

    /**
     * Same as `consumeToAny`; the argument order is not relied upon.
     */
    public func consumeToAnySorted(_ chars: [UnicodeScalar]) -> String {
        return consumeToAny(chars)
    }

    static let dataTerminators: [UnicodeScalar] = [.Ampersand, .LessThan, TokeniserStateVars.nullScalr]
    // read to &, <, or null
    public func consumeData() -> String {
        return consumeToAny(CharacterReader.dataTerminators)
    }

    static let tagNameTerminators: [UnicodeScalar] = [.BackslashT, .BackslashN, .BackslashR, .BackslashF, .Space, .Slash, .GreaterThan, TokeniserStateVars.nullScalr]
    // read to '\t', '\n', '\r', '\f', ' ', '/', '>', or nullChar
    public func consumeTagName() -> String {
        return consumeToAny(CharacterReader.tagNameTerminators)
    }

    /**
     * Consume everything from the current position to the end of the input.
     *
     * - Returns: the remaining text, possibly empty.
     */
    public func consumeToEnd() -> String {
        let consumed = cacheString(pos, input.endIndex)
        pos = input.endIndex
        return consumed
    }

    /**
     * Consume a run of letters (ASCII A-Z / a-z, or any member of `CharacterSet.letters`).
     *
     * - Returns: the consumed letters, possibly empty.
     */
    public func consumeLetterSequence() -> String {
        let start = pos
        skip(while: CharacterReader.isLetter)
        return cacheString(start, pos)
    }

    /**
     * Consume a run of letters immediately followed by a run of ASCII digits.
     * Either run may be empty.
     *
     * - Returns: the consumed letters and digits as one string.
     */
    public func consumeLetterThenDigitSequence() -> String {
        let start = pos
        skip(while: CharacterReader.isLetter)
        skip(while: CharacterReader.isDigit)
        return cacheString(start, pos)
    }

    /**
     * Consume a run of hexadecimal digits (0-9, A-F, a-f).
     *
     * - Returns: the consumed digits, possibly empty.
     */
    public func consumeHexSequence() -> String {
        let start = pos
        skip(while: CharacterReader.isHexDigit)
        return cacheString(start, pos)
    }

    /**
     * Consume a run of ASCII digits (0-9).
     *
     * - Returns: the consumed digits, possibly empty.
     */
    public func consumeDigitSequence() -> String {
        let start = pos
        skip(while: CharacterReader.isDigit)
        return cacheString(start, pos)
    }

    /**
     * - Returns: true if there is input left and the current scalar equals `c`. Nothing is consumed.
     */
    public func matches(_ c: UnicodeScalar) -> Bool {
        return !isEmpty() && input[pos] == c

    }

    /**
     * Test whether the input at the current position starts with the given string.
     *
     * - Parameter seq: string to compare against, scalar by scalar
     * - Parameter ignoreCase: when true, scalars are compared through their `uppercase` form
     * - Parameter consume: when true and the match succeeds, the position moves past the match
     * - Returns: true on a full match. On failure the position is left unchanged.
     */
    public func matches(_ seq: String, ignoreCase: Bool = false, consume: Bool = false) -> Bool {
        var current = pos
        let scalars = seq.unicodeScalars
        for scalar in scalars {
            guard current < input.endIndex else { return false }
            if ignoreCase {
                guard input[current].uppercase == scalar.uppercase else { return false }
            } else {
                guard input[current] == scalar else { return false }
            }
            current = input.index(after: current)
        }
        if consume {
            pos = current
        }
        return true
    }

    /**
     * Case-insensitive `matches(_:)`; nothing is consumed.
     */
    public func matchesIgnoreCase(_ seq: String ) -> Bool {
        return matches(seq, ignoreCase: true)
    }

    /**
     * Variadic convenience for `matchesAny(_:)` taking an array.
     */
    public func matchesAny(_ seq: UnicodeScalar...) -> Bool {
        return matchesAny(seq)
    }

    /**
     * - Returns: true if there is input left and the current scalar is one of `seq`. Nothing is consumed.
     */
    public func matchesAny(_ seq: [UnicodeScalar]) -> Bool {
        guard pos < input.endIndex else { return false }
        return seq.contains(input[pos])
    }

    /**
     * Same as `matchesAny`; the argument order is not relied upon.
     */
    public func matchesAnySorted(_ seq: [UnicodeScalar]) -> Bool {
        return matchesAny(seq)
    }

    /**
     * - Returns: true if there is input left and the current scalar is a letter
     *            (ASCII A-Z / a-z, or any member of `CharacterSet.letters`).
     */
    public func matchesLetter() -> Bool {
        guard pos < input.endIndex else { return false }
        return CharacterReader.isLetter(input[pos])
    }

    /**
     * - Returns: true if there is input left and the current scalar is an ASCII digit (0-9).
     */
    public func matchesDigit() -> Bool {
        guard pos < input.endIndex else { return false }
        return CharacterReader.isDigit(input[pos])
    }

    /**
     * If the input at the current position starts with `seq`, consume it.
     *
     * - Returns: true if the string matched and was consumed.
     */
    @discardableResult
    public func matchConsume(_ seq: String) -> Bool {
        return matches(seq, consume: true)
    }

    /**
     * Case-insensitive `matchConsume(_:)`.
     *
     * - Returns: true if the string matched and was consumed.
     */
    @discardableResult
    public func matchConsumeIgnoreCase(_ seq: String) -> Bool {
        return matches(seq, ignoreCase: true, consume: true)
    }

    /**
     * Check whether the remaining input contains `seq` in all-lowercase or all-uppercase form.
     * Only finds consistent case: a mixed-case occurrence is not detected.
     *
     * - Parameter seq: string to look for
     * - Returns: true if either the lowercased or the uppercased form occurs at or after the current position.
     */
    public func containsIgnoreCase(_ seq: String ) -> Bool {
        let loScan = seq.lowercased(with: CharacterReader.caseScanLocale)
        let hiScan = seq.uppercased(with: CharacterReader.caseScanLocale)
        return nextIndexOf(loScan) != nil || nextIndexOf(hiScan) != nil
    }

    /**
     * - Returns: the unconsumed remainder of the input. The position is not changed.
     */
    public func toString() -> String {
        return String(input[pos...])
    }

    /**
     * Letter test shared by the letter-consuming and letter-matching methods.
     */
    private static func isLetter(_ c: UnicodeScalar) -> Bool {
        return (c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)
    }

    /**
     * ASCII digit test (0-9).
     */
    private static func isDigit(_ c: UnicodeScalar) -> Bool {
        return c >= "0" && c <= "9"
    }

    /**
     * Hexadecimal digit test (0-9, A-F, a-f).
     */
    private static func isHexDigit(_ c: UnicodeScalar) -> Bool {
        return (c >= "0" && c <= "9") || (c >= "A" && c <= "F") || (c >= "a" && c <= "f")
    }

    /**
     * Advance the position while there is input left and the current scalar satisfies `predicate`.
     */
    private func skip(while predicate: (UnicodeScalar) -> Bool) {
        while pos < input.endIndex, predicate(input[pos]) {
            pos = input.index(after: pos)
        }
    }

    /**
     * Originally intended as a caching mechanism for strings, but caching doesn't
     * seem to improve performance. Now just a stub.
     */
    private func cacheString(_ start: String.UnicodeScalarView.Index, _ end: String.UnicodeScalarView.Index) -> String {
        return String(input[start..<end])
    }
}