aboutsummaryrefslogtreecommitdiffstats
path: root/lib/sax.js
diff options
context:
space:
mode:
Diffstat (limited to 'lib/sax.js')
-rw-r--r--lib/sax.js992
1 files changed, 992 insertions, 0 deletions
diff --git a/lib/sax.js b/lib/sax.js
new file mode 100644
index 0000000..15cd7a2
--- /dev/null
+++ b/lib/sax.js
@@ -0,0 +1,992 @@
+// wrapper for non-node envs
+;(function (sax) {
+
+sax.parser = function (strict, opt) { return new SAXParser(strict, opt) }
+sax.SAXParser = SAXParser
+sax.SAXStream = SAXStream
+sax.createStream = createStream
+
+// When we pass the MAX_BUFFER_LENGTH position, start checking for buffer overruns.
+// When we check, schedule the next check for MAX_BUFFER_LENGTH - (max(buffer lengths)),
+// since that's the earliest that a buffer overrun could occur. This way, checks are
+// as rare as required, but as often as necessary to ensure never crossing this bound.
+// Furthermore, buffers are only tested at most once per write(), so passing a very
+// large string into write() might have undesirable effects, but this is manageable by
+// the caller, so it is assumed to be safe. Thus, a call to write() may, in the extreme
+// edge case, result in creating at most one complete copy of the string passed in.
+// Set to Infinity to have unlimited buffers.
+sax.MAX_BUFFER_LENGTH = 64 * 1024
+
+var buffers = [
+ "comment", "sgmlDecl", "textNode", "tagName", "doctype",
+ "procInstName", "procInstBody", "entity", "attribName",
+ "attribValue", "cdata", "script"
+]
+
+sax.EVENTS = // for discoverability.
+ [ "text"
+ , "processinginstruction"
+ , "sgmldeclaration"
+ , "doctype"
+ , "comment"
+ , "attribute"
+ , "opentag"
+ , "closetag"
+ , "opencdata"
+ , "cdata"
+ , "closecdata"
+ , "error"
+ , "end"
+ , "ready"
+ , "script"
+ , "opennamespace"
+ , "closenamespace"
+ ]
+
+function SAXParser (strict, opt) {
+ if (!(this instanceof SAXParser)) return new SAXParser(strict, opt)
+
+ var parser = this
+ clearBuffers(parser)
+ parser.q = parser.c = ""
+ parser.bufferCheckPosition = sax.MAX_BUFFER_LENGTH
+ parser.opt = opt || {}
+ parser.opt.lowercase = parser.opt.lowercase || parser.opt.lowercasetags;
+ parser.looseCase = parser.opt.lowercase ? "toLowerCase" : "toUpperCase"
+ parser.tags = []
+ parser.closed = parser.closedRoot = parser.sawRoot = false
+ parser.tag = parser.error = null
+ parser.strict = !!strict
+ parser.noscript = !!(strict || parser.opt.noscript)
+ parser.state = S.BEGIN
+ parser.ENTITIES = Object.create(sax.ENTITIES)
+ parser.attribList = []
+
+ // namespaces form a prototype chain.
+ // it always points at the current tag,
+ // which protos to its parent tag.
+ if (parser.opt.xmlns) parser.ns = Object.create(rootNS)
+
+ // mostly just for error reporting
+ parser.position = parser.line = parser.column = 0
+ emit(parser, "onready")
+}
+
+function checkBufferLength (parser) {
+ var maxAllowed = Math.max(sax.MAX_BUFFER_LENGTH, 10)
+ , maxActual = 0
+ for (var i = 0, l = buffers.length; i < l; i ++) {
+ var len = parser[buffers[i]].length
+ if (len > maxAllowed) {
+ // Text/cdata nodes can get big, and since they're buffered,
+ // we can get here under normal conditions.
+ // Avoid issues by emitting the text node now,
+ // so at least it won't get any bigger.
+ switch (buffers[i]) {
+ case "textNode":
+ closeText(parser)
+ break
+
+ case "cdata":
+ emitNode(parser, "oncdata", parser.cdata)
+ parser.cdata = ""
+ break
+
+ case "script":
+ emitNode(parser, "onscript", parser.script)
+ parser.script = ""
+ break
+
+ default:
+ error(parser, "Max buffer length exceeded: "+buffers[i])
+ }
+ }
+ maxActual = Math.max(maxActual, len)
+ }
+ // schedule the next check for the earliest possible buffer overrun.
+ parser.bufferCheckPosition = (sax.MAX_BUFFER_LENGTH - maxActual)
+ + parser.position
+}
+
+function clearBuffers (parser) {
+ for (var i = 0, l = buffers.length; i < l; i ++) {
+ parser[buffers[i]] = ""
+ }
+}
+
+SAXParser.prototype =
+ { end: function () { end(this) }
+ , write: write
+ , resume: function () { this.error = null; return this }
+ , close: function () { return this.write(null) }
+ }
+
+try {
+ var Stream = require("stream").Stream
+} catch (ex) {
+ var Stream = function () {}
+}
+
+
+var streamWraps = sax.EVENTS.filter(function (ev) {
+ return ev !== "error" && ev !== "end"
+})
+
+function createStream (strict, opt) {
+ return new SAXStream(strict, opt)
+}
+
+function SAXStream (strict, opt) {
+ if (!(this instanceof SAXStream)) return new SAXStream(strict, opt)
+
+ Stream.apply(me)
+
+ this._parser = new SAXParser(strict, opt)
+ this.writable = true
+ this.readable = true
+
+
+ var me = this
+
+ this._parser.onend = function () {
+ me.emit("end")
+ }
+
+ this._parser.onerror = function (er) {
+ me.emit("error", er)
+
+ // if didn't throw, then means error was handled.
+ // go ahead and clear error, so we can write again.
+ me._parser.error = null
+ }
+
+ streamWraps.forEach(function (ev) {
+ Object.defineProperty(me, "on" + ev, {
+ get: function () { return me._parser["on" + ev] },
+ set: function (h) {
+ if (!h) {
+ me.removeAllListeners(ev)
+ return me._parser["on"+ev] = h
+ }
+ me.on(ev, h)
+ },
+ enumerable: true,
+ configurable: false
+ })
+ })
+}
+
+SAXStream.prototype = Object.create(Stream.prototype,
+ { constructor: { value: SAXStream } })
+
+SAXStream.prototype.write = function (data) {
+ this._parser.write(data.toString())
+ this.emit("data", data)
+ return true
+}
+
+SAXStream.prototype.end = function (chunk) {
+ if (chunk && chunk.length) this._parser.write(chunk.toString())
+ this._parser.end()
+ return true
+}
+
+SAXStream.prototype.on = function (ev, handler) {
+ var me = this
+ if (!me._parser["on"+ev] && streamWraps.indexOf(ev) !== -1) {
+ me._parser["on"+ev] = function () {
+ var args = arguments.length === 1 ? [arguments[0]]
+ : Array.apply(null, arguments)
+ args.splice(0, 0, ev)
+ me.emit.apply(me, args)
+ }
+ }
+
+ return Stream.prototype.on.call(me, ev, handler)
+}
+
+
+
+// character classes and tokens
+var whitespace = "\r\n\t "
+ // this really needs to be replaced with character classes.
+ // XML allows all manner of ridiculous numbers and digits.
+ , number = "0124356789"
+ , letter = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ // (Letter | "_" | ":")
+ , nameStart = letter+"_:"
+ , nameBody = nameStart+number+"-."
+ , quote = "'\""
+ , entity = number+letter+"#"
+ , attribEnd = whitespace + ">"
+ , CDATA = "[CDATA["
+ , DOCTYPE = "DOCTYPE"
+ , XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
+ , XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/"
+ , rootNS = { xml: XML_NAMESPACE, xmlns: XMLNS_NAMESPACE }
+
+// turn all the string character sets into character class objects.
+whitespace = charClass(whitespace)
+number = charClass(number)
+letter = charClass(letter)
+nameStart = charClass(nameStart)
+nameBody = charClass(nameBody)
+quote = charClass(quote)
+entity = charClass(entity)
+attribEnd = charClass(attribEnd)
+
+function charClass (str) {
+ return str.split("").reduce(function (s, c) {
+ s[c] = true
+ return s
+ }, {})
+}
+
+function is (charclass, c) {
+ return charclass[c]
+}
+
+function not (charclass, c) {
+ return !charclass[c]
+}
+
+var S = 0
+sax.STATE =
+{ BEGIN : S++
+, TEXT : S++ // general stuff
+, TEXT_ENTITY : S++ // &amp and such.
+, OPEN_WAKA : S++ // <
+, SGML_DECL : S++ // <!BLARG
+, SGML_DECL_QUOTED : S++ // <!BLARG foo "bar
+, DOCTYPE : S++ // <!DOCTYPE
+, DOCTYPE_QUOTED : S++ // <!DOCTYPE "//blah
+, DOCTYPE_DTD : S++ // <!DOCTYPE "//blah" [ ...
+, DOCTYPE_DTD_QUOTED : S++ // <!DOCTYPE "//blah" [ "foo
+, COMMENT_STARTING : S++ // <!-
+, COMMENT : S++ // <!--
+, COMMENT_ENDING : S++ // <!-- blah -
+, COMMENT_ENDED : S++ // <!-- blah --
+, CDATA : S++ // <![CDATA[ something
+, CDATA_ENDING : S++ // ]
+, CDATA_ENDING_2 : S++ // ]]
+, PROC_INST : S++ // <?hi
+, PROC_INST_BODY : S++ // <?hi there
+, PROC_INST_QUOTED : S++ // <?hi "there
+, PROC_INST_ENDING : S++ // <?hi "there" ?
+, OPEN_TAG : S++ // <strong
+, OPEN_TAG_SLASH : S++ // <strong /
+, ATTRIB : S++ // <a
+, ATTRIB_NAME : S++ // <a foo
+, ATTRIB_NAME_SAW_WHITE : S++ // <a foo _
+, ATTRIB_VALUE : S++ // <a foo=
+, ATTRIB_VALUE_QUOTED : S++ // <a foo="bar
+, ATTRIB_VALUE_UNQUOTED : S++ // <a foo=bar
+, ATTRIB_VALUE_ENTITY_Q : S++ // <foo bar="&quot;"
+, ATTRIB_VALUE_ENTITY_U : S++ // <foo bar=&quot;
+, CLOSE_TAG : S++ // </a
+, CLOSE_TAG_SAW_WHITE : S++ // </a >
+, SCRIPT : S++ // <script> ...
+, SCRIPT_ENDING : S++ // <script> ... <
+}
+
+sax.ENTITIES =
+{ "apos" : "'"
+, "quot" : "\""
+, "amp" : "&"
+, "gt" : ">"
+, "lt" : "<"
+}
+
+for (var S in sax.STATE) sax.STATE[sax.STATE[S]] = S
+
+// shorthand
+S = sax.STATE
+
+function emit (parser, event, data) {
+ parser[event] && parser[event](data)
+}
+
+function emitNode (parser, nodeType, data) {
+ if (parser.textNode) closeText(parser)
+ emit(parser, nodeType, data)
+}
+
+function closeText (parser) {
+ parser.textNode = textopts(parser.opt, parser.textNode)
+ if (parser.textNode) emit(parser, "ontext", parser.textNode)
+ parser.textNode = ""
+}
+
+function textopts (opt, text) {
+ if (opt.trim) text = text.trim()
+ if (opt.normalize) text = text.replace(/\s+/g, " ")
+ return text
+}
+
+function error (parser, er) {
+ closeText(parser)
+ er += "\nLine: "+parser.line+
+ "\nColumn: "+parser.column+
+ "\nChar: "+parser.c
+ er = new Error(er)
+ parser.error = er
+ emit(parser, "onerror", er)
+ return parser
+}
+
+function end (parser) {
+ if (parser.state !== S.TEXT) error(parser, "Unexpected end")
+ closeText(parser)
+ parser.c = ""
+ parser.closed = true
+ emit(parser, "onend")
+ SAXParser.call(parser, parser.strict, parser.opt)
+ return parser
+}
+
+function strictFail (parser, message) {
+ if (parser.strict) error(parser, message)
+}
+
+function newTag (parser) {
+ if (!parser.strict) parser.tagName = parser.tagName[parser.looseCase]()
+ var parent = parser.tags[parser.tags.length - 1] || parser
+ , tag = parser.tag = { name : parser.tagName, attributes : {} }
+
+ // will be overridden if tag contails an xmlns="foo" or xmlns:foo="bar"
+ if (parser.opt.xmlns) tag.ns = parent.ns
+ parser.attribList.length = 0
+}
+
+function qname (name) {
+ var i = name.indexOf(":")
+ , qualName = i < 0 ? [ "", name ] : name.split(":")
+ , prefix = qualName[0]
+ , local = qualName[1]
+
+ // <x "xmlns"="http://foo">
+ if (name === "xmlns") {
+ prefix = "xmlns"
+ local = ""
+ }
+
+ return { prefix: prefix, local: local }
+}
+
+function attrib (parser) {
+ if (!parser.strict) parser.attribName = parser.attribName[parser.looseCase]()
+ if (parser.opt.xmlns) {
+ var qn = qname(parser.attribName)
+ , prefix = qn.prefix
+ , local = qn.local
+
+ if (prefix === "xmlns") {
+ // namespace binding attribute; push the binding into scope
+ if (local === "xml" && parser.attribValue !== XML_NAMESPACE) {
+ strictFail( parser
+ , "xml: prefix must be bound to " + XML_NAMESPACE + "\n"
+ + "Actual: " + parser.attribValue )
+ } else if (local === "xmlns" && parser.attribValue !== XMLNS_NAMESPACE) {
+ strictFail( parser
+ , "xmlns: prefix must be bound to " + XMLNS_NAMESPACE + "\n"
+ + "Actual: " + parser.attribValue )
+ } else {
+ var tag = parser.tag
+ , parent = parser.tags[parser.tags.length - 1] || parser
+ if (tag.ns === parent.ns) {
+ tag.ns = Object.create(parent.ns)
+ }
+ tag.ns[local] = parser.attribValue
+ }
+ }
+
+ // defer onattribute events until all attributes have been seen
+ // so any new bindings can take effect; preserve attribute order
+ // so deferred events can be emitted in document order
+ parser.attribList.push([parser.attribName, parser.attribValue])
+ } else {
+ // in non-xmlns mode, we can emit the event right away
+ parser.tag.attributes[parser.attribName] = parser.attribValue
+ emitNode( parser
+ , "onattribute"
+ , { name: parser.attribName
+ , value: parser.attribValue } )
+ }
+
+ parser.attribName = parser.attribValue = ""
+}
+
+function openTag (parser, selfClosing) {
+ if (parser.opt.xmlns) {
+ // emit namespace binding events
+ var tag = parser.tag
+
+ // add namespace info to tag
+ var qn = qname(parser.tagName)
+ tag.prefix = qn.prefix
+ tag.local = qn.local
+ tag.uri = tag.ns[qn.prefix] || qn.prefix
+
+ if (tag.prefix && !tag.uri) {
+ strictFail(parser, "Unbound namespace prefix: "
+ + JSON.stringify(parser.tagName))
+ }
+
+ var parent = parser.tags[parser.tags.length - 1] || parser
+ if (tag.ns && parent.ns !== tag.ns) {
+ Object.keys(tag.ns).forEach(function (p) {
+ emitNode( parser
+ , "onopennamespace"
+ , { prefix: p , uri: tag.ns[p] } )
+ })
+ }
+
+ // handle deferred onattribute events
+ for (var i = 0, l = parser.attribList.length; i < l; i ++) {
+ var nv = parser.attribList[i]
+ var name = nv[0]
+ , value = nv[1]
+ , qualName = qname(name)
+ , prefix = qualName.prefix
+ , local = qualName.local
+ , uri = tag.ns[prefix] || ""
+ , a = { name: name
+ , value: value
+ , prefix: prefix
+ , local: local
+ , uri: uri
+ }
+
+ // if there's any attributes with an undefined namespace,
+ // then fail on them now.
+ if (prefix && prefix != "xmlns" && !uri) {
+ strictFail(parser, "Unbound namespace prefix: "
+ + JSON.stringify(prefix))
+ a.uri = prefix
+ }
+ parser.tag.attributes[name] = a
+ emitNode(parser, "onattribute", a)
+ }
+ parser.attribList.length = 0
+ }
+
+ // process the tag
+ parser.sawRoot = true
+ parser.tags.push(parser.tag)
+ emitNode(parser, "onopentag", parser.tag)
+ if (!selfClosing) {
+ // special case for <script> in non-strict mode.
+ if (!parser.noscript && parser.tagName.toLowerCase() === "script") {
+ parser.state = S.SCRIPT
+ } else {
+ parser.state = S.TEXT
+ }
+ parser.tag = null
+ parser.tagName = ""
+ }
+ parser.attribName = parser.attribValue = ""
+ parser.attribList.length = 0
+}
+
+function closeTag (parser) {
+ if (!parser.tagName) {
+ strictFail(parser, "Weird empty close tag.")
+ parser.textNode += "</>"
+ parser.state = S.TEXT
+ return
+ }
+ // first make sure that the closing tag actually exists.
+ // <a><b></c></b></a> will close everything, otherwise.
+ var t = parser.tags.length
+ var tagName = parser.tagName
+ if (!parser.strict) tagName = tagName[parser.looseCase]()
+ var closeTo = tagName
+ while (t --) {
+ var close = parser.tags[t]
+ if (close.name !== closeTo) {
+ // fail the first time in strict mode
+ strictFail(parser, "Unexpected close tag")
+ } else break
+ }
+
+ // didn't find it. we already failed for strict, so just abort.
+ if (t < 0) {
+ strictFail(parser, "Unmatched closing tag: "+parser.tagName)
+ parser.textNode += "</" + parser.tagName + ">"
+ parser.state = S.TEXT
+ return
+ }
+ parser.tagName = tagName
+ var s = parser.tags.length
+ while (s --> t) {
+ var tag = parser.tag = parser.tags.pop()
+ parser.tagName = parser.tag.name
+ emitNode(parser, "onclosetag", parser.tagName)
+
+ var x = {}
+ for (var i in tag.ns) x[i] = tag.ns[i]
+
+ var parent = parser.tags[parser.tags.length - 1] || parser
+ if (parser.opt.xmlns && tag.ns !== parent.ns) {
+ // remove namespace bindings introduced by tag
+ Object.keys(tag.ns).forEach(function (p) {
+ var n = tag.ns[p]
+ emitNode(parser, "onclosenamespace", { prefix: p, uri: n })
+ })
+ }
+ }
+ if (t === 0) parser.closedRoot = true
+ parser.tagName = parser.attribValue = parser.attribName = ""
+ parser.attribList.length = 0
+ parser.state = S.TEXT
+}
+
+function parseEntity (parser) {
+ var entity = parser.entity.toLowerCase()
+ , num
+ , numStr = ""
+ if (parser.ENTITIES[entity]) return parser.ENTITIES[entity]
+ if (entity.charAt(0) === "#") {
+ if (entity.charAt(1) === "x") {
+ entity = entity.slice(2)
+ num = parseInt(entity, 16)
+ numStr = num.toString(16)
+ } else {
+ entity = entity.slice(1)
+ num = parseInt(entity, 10)
+ numStr = num.toString(10)
+ }
+ }
+ entity = entity.replace(/^0+/, "")
+ if (numStr.toLowerCase() !== entity) {
+ strictFail(parser, "Invalid character entity")
+ return "&"+parser.entity + ";"
+ }
+ return String.fromCharCode(num)
+}
+
+function write (chunk) {
+ var parser = this
+ if (this.error) throw this.error
+ if (parser.closed) return error(parser,
+ "Cannot write after close. Assign an onready handler.")
+ if (chunk === null) return end(parser)
+ var i = 0, c = ""
+ while (parser.c = c = chunk.charAt(i++)) {
+ parser.position ++
+ if (c === "\n") {
+ parser.line ++
+ parser.column = 0
+ } else parser.column ++
+ switch (parser.state) {
+
+ case S.BEGIN:
+ if (c === "<") parser.state = S.OPEN_WAKA
+ else if (not(whitespace,c)) {
+ // have to process this as a text node.
+ // weird, but happens.
+ strictFail(parser, "Non-whitespace before first tag.")
+ parser.textNode = c
+ parser.state = S.TEXT
+ }
+ continue
+
+ case S.TEXT:
+ if (parser.sawRoot && !parser.closedRoot) {
+ var starti = i-1
+ while (c && c!=="<" && c!=="&") {
+ c = chunk.charAt(i++)
+ if (c) {
+ parser.position ++
+ if (c === "\n") {
+ parser.line ++
+ parser.column = 0
+ } else parser.column ++
+ }
+ }
+ parser.textNode += chunk.substring(starti, i-1)
+ }
+ if (c === "<") parser.state = S.OPEN_WAKA
+ else {
+ if (not(whitespace, c) && (!parser.sawRoot || parser.closedRoot))
+ strictFail("Text data outside of root node.")
+ if (c === "&") parser.state = S.TEXT_ENTITY
+ else parser.textNode += c
+ }
+ continue
+
+ case S.SCRIPT:
+ // only non-strict
+ if (c === "<") {
+ parser.state = S.SCRIPT_ENDING
+ } else parser.script += c
+ continue
+
+ case S.SCRIPT_ENDING:
+ if (c === "/") {
+ emitNode(parser, "onscript", parser.script)
+ parser.state = S.CLOSE_TAG
+ parser.script = ""
+ parser.tagName = ""
+ } else {
+ parser.script += "<" + c
+ parser.state = S.SCRIPT
+ }
+ continue
+
+ case S.OPEN_WAKA:
+ // either a /, ?, !, or text is coming next.
+ if (c === "!") {
+ parser.state = S.SGML_DECL
+ parser.sgmlDecl = ""
+ } else if (is(whitespace, c)) {
+ // wait for it...
+ } else if (is(nameStart,c)) {
+ parser.startTagPosition = parser.position - 1
+ parser.state = S.OPEN_TAG
+ parser.tagName = c
+ } else if (c === "/") {
+ parser.startTagPosition = parser.position - 1
+ parser.state = S.CLOSE_TAG
+ parser.tagName = ""
+ } else if (c === "?") {
+ parser.state = S.PROC_INST
+ parser.procInstName = parser.procInstBody = ""
+ } else {
+ strictFail(parser, "Unencoded <")
+ parser.textNode += "<" + c
+ parser.state = S.TEXT
+ }
+ continue
+
+ case S.SGML_DECL:
+ if ((parser.sgmlDecl+c).toUpperCase() === CDATA) {
+ emitNode(parser, "onopencdata")
+ parser.state = S.CDATA
+ parser.sgmlDecl = ""
+ parser.cdata = ""
+ } else if (parser.sgmlDecl+c === "--") {
+ parser.state = S.COMMENT
+ parser.comment = ""
+ parser.sgmlDecl = ""
+ } else if ((parser.sgmlDecl+c).toUpperCase() === DOCTYPE) {
+ parser.state = S.DOCTYPE
+ if (parser.doctype || parser.sawRoot) strictFail(parser,
+ "Inappropriately located doctype declaration")
+ parser.doctype = ""
+ parser.sgmlDecl = ""
+ } else if (c === ">") {
+ emitNode(parser, "onsgmldeclaration", parser.sgmlDecl)
+ parser.sgmlDecl = ""
+ parser.state = S.TEXT
+ } else if (is(quote, c)) {
+ parser.state = S.SGML_DECL_QUOTED
+ parser.sgmlDecl += c
+ } else parser.sgmlDecl += c
+ continue
+
+ case S.SGML_DECL_QUOTED:
+ if (c === parser.q) {
+ parser.state = S.SGML_DECL
+ parser.q = ""
+ }
+ parser.sgmlDecl += c
+ continue
+
+ case S.DOCTYPE:
+ if (c === ">") {
+ parser.state = S.TEXT
+ emitNode(parser, "ondoctype", parser.doctype)
+ parser.doctype = true // just remember that we saw it.
+ } else {
+ parser.doctype += c
+ if (c === "[") parser.state = S.DOCTYPE_DTD
+ else if (is(quote, c)) {
+ parser.state = S.DOCTYPE_QUOTED
+ parser.q = c
+ }
+ }
+ continue
+
+ case S.DOCTYPE_QUOTED:
+ parser.doctype += c
+ if (c === parser.q) {
+ parser.q = ""
+ parser.state = S.DOCTYPE
+ }
+ continue
+
+ case S.DOCTYPE_DTD:
+ parser.doctype += c
+ if (c === "]") parser.state = S.DOCTYPE
+ else if (is(quote,c)) {
+ parser.state = S.DOCTYPE_DTD_QUOTED
+ parser.q = c
+ }
+ continue
+
+ case S.DOCTYPE_DTD_QUOTED:
+ parser.doctype += c
+ if (c === parser.q) {
+ parser.state = S.DOCTYPE_DTD
+ parser.q = ""
+ }
+ continue
+
+ case S.COMMENT:
+ if (c === "-") parser.state = S.COMMENT_ENDING
+ else parser.comment += c
+ continue
+
+ case S.COMMENT_ENDING:
+ if (c === "-") {
+ parser.state = S.COMMENT_ENDED
+ parser.comment = textopts(parser.opt, parser.comment)
+ if (parser.comment) emitNode(parser, "oncomment", parser.comment)
+ parser.comment = ""
+ } else {
+ parser.comment += "-" + c
+ parser.state = S.COMMENT
+ }
+ continue
+
+ case S.COMMENT_ENDED:
+ if (c !== ">") {
+ strictFail(parser, "Malformed comment")
+ // allow <!-- blah -- bloo --> in non-strict mode,
+ // which is a comment of " blah -- bloo "
+ parser.comment += "--" + c
+ parser.state = S.COMMENT
+ } else parser.state = S.TEXT
+ continue
+
+ case S.CDATA:
+ if (c === "]") parser.state = S.CDATA_ENDING
+ else parser.cdata += c
+ continue
+
+ case S.CDATA_ENDING:
+ if (c === "]") parser.state = S.CDATA_ENDING_2
+ else {
+ parser.cdata += "]" + c
+ parser.state = S.CDATA
+ }
+ continue
+
+ case S.CDATA_ENDING_2:
+ if (c === ">") {
+ if (parser.cdata) emitNode(parser, "oncdata", parser.cdata)
+ emitNode(parser, "onclosecdata")
+ parser.cdata = ""
+ parser.state = S.TEXT
+ } else if (c === "]") {
+ parser.cdata += "]"
+ } else {
+ parser.cdata += "]]" + c
+ parser.state = S.CDATA
+ }
+ continue
+
+ case S.PROC_INST:
+ if (c === "?") parser.state = S.PROC_INST_ENDING
+ else if (is(whitespace, c)) parser.state = S.PROC_INST_BODY
+ else parser.procInstName += c
+ continue
+
+ case S.PROC_INST_BODY:
+ if (!parser.procInstBody && is(whitespace, c)) continue
+ else if (c === "?") parser.state = S.PROC_INST_ENDING
+ else if (is(quote, c)) {
+ parser.state = S.PROC_INST_QUOTED
+ parser.q = c
+ parser.procInstBody += c
+ } else parser.procInstBody += c
+ continue
+
+ case S.PROC_INST_ENDING:
+ if (c === ">") {
+ emitNode(parser, "onprocessinginstruction", {
+ name : parser.procInstName,
+ body : parser.procInstBody
+ })
+ parser.procInstName = parser.procInstBody = ""
+ parser.state = S.TEXT
+ } else {
+ parser.procInstBody += "?" + c
+ parser.state = S.PROC_INST_BODY
+ }
+ continue
+
+ case S.PROC_INST_QUOTED:
+ parser.procInstBody += c
+ if (c === parser.q) {
+ parser.state = S.PROC_INST_BODY
+ parser.q = ""
+ }
+ continue
+
+ case S.OPEN_TAG:
+ if (is(nameBody, c)) parser.tagName += c
+ else {
+ newTag(parser)
+ if (c === ">") openTag(parser)
+ else if (c === "/") parser.state = S.OPEN_TAG_SLASH
+ else {
+ if (not(whitespace, c)) strictFail(
+ parser, "Invalid character in tag name")
+ parser.state = S.ATTRIB
+ }
+ }
+ continue
+
+ case S.OPEN_TAG_SLASH:
+ if (c === ">") {
+ openTag(parser, true)
+ closeTag(parser)
+ } else {
+ strictFail(parser, "Forward-slash in opening tag not followed by >")
+ parser.state = S.ATTRIB
+ }
+ continue
+
+ case S.ATTRIB:
+ // haven't read the attribute name yet.
+ if (is(whitespace, c)) continue
+ else if (c === ">") openTag(parser)
+ else if (c === "/") parser.state = S.OPEN_TAG_SLASH
+ else if (is(nameStart, c)) {
+ parser.attribName = c
+ parser.attribValue = ""
+ parser.state = S.ATTRIB_NAME
+ } else strictFail(parser, "Invalid attribute name")
+ continue
+
+ case S.ATTRIB_NAME:
+ if (c === "=") parser.state = S.ATTRIB_VALUE
+ else if (is(whitespace, c)) parser.state = S.ATTRIB_NAME_SAW_WHITE
+ else if (is(nameBody, c)) parser.attribName += c
+ else strictFail(parser, "Invalid attribute name")
+ continue
+
+ case S.ATTRIB_NAME_SAW_WHITE:
+ if (c === "=") parser.state = S.ATTRIB_VALUE
+ else if (is(whitespace, c)) continue
+ else {
+ strictFail(parser, "Attribute without value")
+ parser.tag.attributes[parser.attribName] = ""
+ parser.attribValue = ""
+ emitNode(parser, "onattribute",
+ { name : parser.attribName, value : "" })
+ parser.attribName = ""
+ if (c === ">") openTag(parser)
+ else if (is(nameStart, c)) {
+ parser.attribName = c
+ parser.state = S.ATTRIB_NAME
+ } else {
+ strictFail(parser, "Invalid attribute name")
+ parser.state = S.ATTRIB
+ }
+ }
+ continue
+
+ case S.ATTRIB_VALUE:
+ if (is(whitespace, c)) continue
+ else if (is(quote, c)) {
+ parser.q = c
+ parser.state = S.ATTRIB_VALUE_QUOTED
+ } else {
+ strictFail(parser, "Unquoted attribute value")
+ parser.state = S.ATTRIB_VALUE_UNQUOTED
+ parser.attribValue = c
+ }
+ continue
+
+ case S.ATTRIB_VALUE_QUOTED:
+ if (c !== parser.q) {
+ if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_Q
+ else parser.attribValue += c
+ continue
+ }
+ attrib(parser)
+ parser.q = ""
+ parser.state = S.ATTRIB
+ continue
+
+ case S.ATTRIB_VALUE_UNQUOTED:
+ if (not(attribEnd,c)) {
+ if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_U
+ else parser.attribValue += c
+ continue
+ }
+ attrib(parser)
+ if (c === ">") openTag(parser)
+ else parser.state = S.ATTRIB
+ continue
+
+ case S.CLOSE_TAG:
+ if (!parser.tagName) {
+ if (is(whitespace, c)) continue
+ else if (not(nameStart, c)) strictFail(parser,
+ "Invalid tagname in closing tag.")
+ else parser.tagName = c
+ }
+ else if (c === ">") closeTag(parser)
+ else if (is(nameBody, c)) parser.tagName += c
+ else {
+ if (not(whitespace, c)) strictFail(parser,
+ "Invalid tagname in closing tag")
+ parser.state = S.CLOSE_TAG_SAW_WHITE
+ }
+ continue
+
+ case S.CLOSE_TAG_SAW_WHITE:
+ if (is(whitespace, c)) continue
+ if (c === ">") closeTag(parser)
+ else strictFail("Invalid characters in closing tag")
+ continue
+
+ case S.TEXT_ENTITY:
+ case S.ATTRIB_VALUE_ENTITY_Q:
+ case S.ATTRIB_VALUE_ENTITY_U:
+ switch(parser.state) {
+ case S.TEXT_ENTITY:
+ var returnState = S.TEXT, buffer = "textNode"
+ break
+
+ case S.ATTRIB_VALUE_ENTITY_Q:
+ var returnState = S.ATTRIB_VALUE_QUOTED, buffer = "attribValue"
+ break
+
+ case S.ATTRIB_VALUE_ENTITY_U:
+ var returnState = S.ATTRIB_VALUE_UNQUOTED, buffer = "attribValue"
+ break
+ }
+ if (c === ";") {
+ parser[buffer] += parseEntity(parser)
+ parser.entity = ""
+ parser.state = returnState
+ }
+ else if (is(entity, c)) parser.entity += c
+ else {
+ strictFail("Invalid character entity")
+ parser[buffer] += "&" + parser.entity + c
+ parser.entity = ""
+ parser.state = returnState
+ }
+ continue
+
+ default:
+ throw new Error(parser, "Unknown state: " + parser.state)
+ }
+ } // while
+ // cdata blocks can get very big under normal conditions. emit and move on.
+ // if (parser.state === S.CDATA && parser.cdata) {
+ // emitNode(parser, "oncdata", parser.cdata)
+ // parser.cdata = ""
+ // }
+ if (parser.position >= parser.bufferCheckPosition) checkBufferLength(parser)
+ return parser
+}
+
+})(typeof exports === "undefined" ? sax = {} : exports)
+