diff options
Diffstat (limited to 'lib/sax.js')
-rw-r--r-- | lib/sax.js | 992 |
1 files changed, 992 insertions, 0 deletions
diff --git a/lib/sax.js b/lib/sax.js new file mode 100644 index 0000000..15cd7a2 --- /dev/null +++ b/lib/sax.js @@ -0,0 +1,992 @@ +// wrapper for non-node envs +;(function (sax) { + +sax.parser = function (strict, opt) { return new SAXParser(strict, opt) } +sax.SAXParser = SAXParser +sax.SAXStream = SAXStream +sax.createStream = createStream + +// When we pass the MAX_BUFFER_LENGTH position, start checking for buffer overruns. +// When we check, schedule the next check for MAX_BUFFER_LENGTH - (max(buffer lengths)), +// since that's the earliest that a buffer overrun could occur. This way, checks are +// as rare as required, but as often as necessary to ensure never crossing this bound. +// Furthermore, buffers are only tested at most once per write(), so passing a very +// large string into write() might have undesirable effects, but this is manageable by +// the caller, so it is assumed to be safe. Thus, a call to write() may, in the extreme +// edge case, result in creating at most one complete copy of the string passed in. +// Set to Infinity to have unlimited buffers. +sax.MAX_BUFFER_LENGTH = 64 * 1024 + +var buffers = [ + "comment", "sgmlDecl", "textNode", "tagName", "doctype", + "procInstName", "procInstBody", "entity", "attribName", + "attribValue", "cdata", "script" +] + +sax.EVENTS = // for discoverability. + [ "text" + , "processinginstruction" + , "sgmldeclaration" + , "doctype" + , "comment" + , "attribute" + , "opentag" + , "closetag" + , "opencdata" + , "cdata" + , "closecdata" + , "error" + , "end" + , "ready" + , "script" + , "opennamespace" + , "closenamespace" + ] + +function SAXParser (strict, opt) { + if (!(this instanceof SAXParser)) return new SAXParser(strict, opt) + + var parser = this + clearBuffers(parser) + parser.q = parser.c = "" + parser.bufferCheckPosition = sax.MAX_BUFFER_LENGTH + parser.opt = opt || {} + parser.opt.lowercase = parser.opt.lowercase || parser.opt.lowercasetags; + parser.looseCase = parser.opt.lowercase ? "toLowerCase" : "toUpperCase" + parser.tags = [] + parser.closed = parser.closedRoot = parser.sawRoot = false + parser.tag = parser.error = null + parser.strict = !!strict + parser.noscript = !!(strict || parser.opt.noscript) + parser.state = S.BEGIN + parser.ENTITIES = Object.create(sax.ENTITIES) + parser.attribList = [] + + // namespaces form a prototype chain. + // it always points at the current tag, + // which protos to its parent tag. + if (parser.opt.xmlns) parser.ns = Object.create(rootNS) + + // mostly just for error reporting + parser.position = parser.line = parser.column = 0 + emit(parser, "onready") +} + +function checkBufferLength (parser) { + var maxAllowed = Math.max(sax.MAX_BUFFER_LENGTH, 10) + , maxActual = 0 + for (var i = 0, l = buffers.length; i < l; i ++) { + var len = parser[buffers[i]].length + if (len > maxAllowed) { + // Text/cdata nodes can get big, and since they're buffered, + // we can get here under normal conditions. + // Avoid issues by emitting the text node now, + // so at least it won't get any bigger. + switch (buffers[i]) { + case "textNode": + closeText(parser) + break + + case "cdata": + emitNode(parser, "oncdata", parser.cdata) + parser.cdata = "" + break + + case "script": + emitNode(parser, "onscript", parser.script) + parser.script = "" + break + + default: + error(parser, "Max buffer length exceeded: "+buffers[i]) + } + } + maxActual = Math.max(maxActual, len) + } + // schedule the next check for the earliest possible buffer overrun. + parser.bufferCheckPosition = (sax.MAX_BUFFER_LENGTH - maxActual) + + parser.position +} + +function clearBuffers (parser) { + for (var i = 0, l = buffers.length; i < l; i ++) { + parser[buffers[i]] = "" + } +} + +SAXParser.prototype = + { end: function () { end(this) } + , write: write + , resume: function () { this.error = null; return this } + , close: function () { return this.write(null) } + } + +try { + var Stream = require("stream").Stream +} catch (ex) { + var Stream = function () {} +} + + +var streamWraps = sax.EVENTS.filter(function (ev) { + return ev !== "error" && ev !== "end" +}) + +function createStream (strict, opt) { + return new SAXStream(strict, opt) +} + +function SAXStream (strict, opt) { + if (!(this instanceof SAXStream)) return new SAXStream(strict, opt) + + Stream.apply(me) + + this._parser = new SAXParser(strict, opt) + this.writable = true + this.readable = true + + + var me = this + + this._parser.onend = function () { + me.emit("end") + } + + this._parser.onerror = function (er) { + me.emit("error", er) + + // if didn't throw, then means error was handled. + // go ahead and clear error, so we can write again. + me._parser.error = null + } + + streamWraps.forEach(function (ev) { + Object.defineProperty(me, "on" + ev, { + get: function () { return me._parser["on" + ev] }, + set: function (h) { + if (!h) { + me.removeAllListeners(ev) + return me._parser["on"+ev] = h + } + me.on(ev, h) + }, + enumerable: true, + configurable: false + }) + }) +} + +SAXStream.prototype = Object.create(Stream.prototype, + { constructor: { value: SAXStream } }) + +SAXStream.prototype.write = function (data) { + this._parser.write(data.toString()) + this.emit("data", data) + return true +} + +SAXStream.prototype.end = function (chunk) { + if (chunk && chunk.length) this._parser.write(chunk.toString()) + this._parser.end() + return true +} + +SAXStream.prototype.on = function (ev, handler) { + var me = this + if (!me._parser["on"+ev] && streamWraps.indexOf(ev) !== -1) { + me._parser["on"+ev] = function () { + var args = arguments.length === 1 ? [arguments[0]] + : Array.apply(null, arguments) + args.splice(0, 0, ev) + me.emit.apply(me, args) + } + } + + return Stream.prototype.on.call(me, ev, handler) +} + + + +// character classes and tokens +var whitespace = "\r\n\t " + // this really needs to be replaced with character classes. + // XML allows all manner of ridiculous numbers and digits. + , number = "0124356789" + , letter = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + // (Letter | "_" | ":") + , nameStart = letter+"_:" + , nameBody = nameStart+number+"-." + , quote = "'\"" + , entity = number+letter+"#" + , attribEnd = whitespace + ">" + , CDATA = "[CDATA[" + , DOCTYPE = "DOCTYPE" + , XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace" + , XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/" + , rootNS = { xml: XML_NAMESPACE, xmlns: XMLNS_NAMESPACE } + +// turn all the string character sets into character class objects. +whitespace = charClass(whitespace) +number = charClass(number) +letter = charClass(letter) +nameStart = charClass(nameStart) +nameBody = charClass(nameBody) +quote = charClass(quote) +entity = charClass(entity) +attribEnd = charClass(attribEnd) + +function charClass (str) { + return str.split("").reduce(function (s, c) { + s[c] = true + return s + }, {}) +} + +function is (charclass, c) { + return charclass[c] +} + +function not (charclass, c) { + return !charclass[c] +} + +var S = 0 +sax.STATE = +{ BEGIN : S++ +, TEXT : S++ // general stuff +, TEXT_ENTITY : S++ // & and such. +, OPEN_WAKA : S++ // < +, SGML_DECL : S++ // <!BLARG +, SGML_DECL_QUOTED : S++ // <!BLARG foo "bar +, DOCTYPE : S++ // <!DOCTYPE +, DOCTYPE_QUOTED : S++ // <!DOCTYPE "//blah +, DOCTYPE_DTD : S++ // <!DOCTYPE "//blah" [ ... +, DOCTYPE_DTD_QUOTED : S++ // <!DOCTYPE "//blah" [ "foo +, COMMENT_STARTING : S++ // <!- +, COMMENT : S++ // <!-- +, COMMENT_ENDING : S++ // <!-- blah - +, COMMENT_ENDED : S++ // <!-- blah -- +, CDATA : S++ // <![CDATA[ something +, CDATA_ENDING : S++ // ] +, CDATA_ENDING_2 : S++ // ]] +, PROC_INST : S++ // <?hi +, PROC_INST_BODY : S++ // <?hi there +, PROC_INST_QUOTED : S++ // <?hi "there +, PROC_INST_ENDING : S++ // <?hi "there" ? +, OPEN_TAG : S++ // <strong +, OPEN_TAG_SLASH : S++ // <strong / +, ATTRIB : S++ // <a +, ATTRIB_NAME : S++ // <a foo +, ATTRIB_NAME_SAW_WHITE : S++ // <a foo _ +, ATTRIB_VALUE : S++ // <a foo= +, ATTRIB_VALUE_QUOTED : S++ // <a foo="bar +, ATTRIB_VALUE_UNQUOTED : S++ // <a foo=bar +, ATTRIB_VALUE_ENTITY_Q : S++ // <foo bar=""" +, ATTRIB_VALUE_ENTITY_U : S++ // <foo bar=" +, CLOSE_TAG : S++ // </a +, CLOSE_TAG_SAW_WHITE : S++ // </a > +, SCRIPT : S++ // <script> ... +, SCRIPT_ENDING : S++ // <script> ... < +} + +sax.ENTITIES = +{ "apos" : "'" +, "quot" : "\"" +, "amp" : "&" +, "gt" : ">" +, "lt" : "<" +} + +for (var S in sax.STATE) sax.STATE[sax.STATE[S]] = S + +// shorthand +S = sax.STATE + +function emit (parser, event, data) { + parser[event] && parser[event](data) +} + +function emitNode (parser, nodeType, data) { + if (parser.textNode) closeText(parser) + emit(parser, nodeType, data) +} + +function closeText (parser) { + parser.textNode = textopts(parser.opt, parser.textNode) + if (parser.textNode) emit(parser, "ontext", parser.textNode) + parser.textNode = "" +} + +function textopts (opt, text) { + if (opt.trim) text = text.trim() + if (opt.normalize) text = text.replace(/\s+/g, " ") + return text +} + +function error (parser, er) { + closeText(parser) + er += "\nLine: "+parser.line+ + "\nColumn: "+parser.column+ + "\nChar: "+parser.c + er = new Error(er) + parser.error = er + emit(parser, "onerror", er) + return parser +} + +function end (parser) { + if (parser.state !== S.TEXT) error(parser, "Unexpected end") + closeText(parser) + parser.c = "" + parser.closed = true + emit(parser, "onend") + SAXParser.call(parser, parser.strict, parser.opt) + return parser +} + +function strictFail (parser, message) { + if (parser.strict) error(parser, message) +} + +function newTag (parser) { + if (!parser.strict) parser.tagName = parser.tagName[parser.looseCase]() + var parent = parser.tags[parser.tags.length - 1] || parser + , tag = parser.tag = { name : parser.tagName, attributes : {} } + + // will be overridden if tag contails an xmlns="foo" or xmlns:foo="bar" + if (parser.opt.xmlns) tag.ns = parent.ns + parser.attribList.length = 0 +} + +function qname (name) { + var i = name.indexOf(":") + , qualName = i < 0 ? [ "", name ] : name.split(":") + , prefix = qualName[0] + , local = qualName[1] + + // <x "xmlns"="http://foo"> + if (name === "xmlns") { + prefix = "xmlns" + local = "" + } + + return { prefix: prefix, local: local } +} + +function attrib (parser) { + if (!parser.strict) parser.attribName = parser.attribName[parser.looseCase]() + if (parser.opt.xmlns) { + var qn = qname(parser.attribName) + , prefix = qn.prefix + , local = qn.local + + if (prefix === "xmlns") { + // namespace binding attribute; push the binding into scope + if (local === "xml" && parser.attribValue !== XML_NAMESPACE) { + strictFail( parser + , "xml: prefix must be bound to " + XML_NAMESPACE + "\n" + + "Actual: " + parser.attribValue ) + } else if (local === "xmlns" && parser.attribValue !== XMLNS_NAMESPACE) { + strictFail( parser + , "xmlns: prefix must be bound to " + XMLNS_NAMESPACE + "\n" + + "Actual: " + parser.attribValue ) + } else { + var tag = parser.tag + , parent = parser.tags[parser.tags.length - 1] || parser + if (tag.ns === parent.ns) { + tag.ns = Object.create(parent.ns) + } + tag.ns[local] = parser.attribValue + } + } + + // defer onattribute events until all attributes have been seen + // so any new bindings can take effect; preserve attribute order + // so deferred events can be emitted in document order + parser.attribList.push([parser.attribName, parser.attribValue]) + } else { + // in non-xmlns mode, we can emit the event right away + parser.tag.attributes[parser.attribName] = parser.attribValue + emitNode( parser + , "onattribute" + , { name: parser.attribName + , value: parser.attribValue } ) + } + + parser.attribName = parser.attribValue = "" +} + +function openTag (parser, selfClosing) { + if (parser.opt.xmlns) { + // emit namespace binding events + var tag = parser.tag + + // add namespace info to tag + var qn = qname(parser.tagName) + tag.prefix = qn.prefix + tag.local = qn.local + tag.uri = tag.ns[qn.prefix] || qn.prefix + + if (tag.prefix && !tag.uri) { + strictFail(parser, "Unbound namespace prefix: " + + JSON.stringify(parser.tagName)) + } + + var parent = parser.tags[parser.tags.length - 1] || parser + if (tag.ns && parent.ns !== tag.ns) { + Object.keys(tag.ns).forEach(function (p) { + emitNode( parser + , "onopennamespace" + , { prefix: p , uri: tag.ns[p] } ) + }) + } + + // handle deferred onattribute events + for (var i = 0, l = parser.attribList.length; i < l; i ++) { + var nv = parser.attribList[i] + var name = nv[0] + , value = nv[1] + , qualName = qname(name) + , prefix = qualName.prefix + , local = qualName.local + , uri = tag.ns[prefix] || "" + , a = { name: name + , value: value + , prefix: prefix + , local: local + , uri: uri + } + + // if there's any attributes with an undefined namespace, + // then fail on them now. + if (prefix && prefix != "xmlns" && !uri) { + strictFail(parser, "Unbound namespace prefix: " + + JSON.stringify(prefix)) + a.uri = prefix + } + parser.tag.attributes[name] = a + emitNode(parser, "onattribute", a) + } + parser.attribList.length = 0 + } + + // process the tag + parser.sawRoot = true + parser.tags.push(parser.tag) + emitNode(parser, "onopentag", parser.tag) + if (!selfClosing) { + // special case for <script> in non-strict mode. + if (!parser.noscript && parser.tagName.toLowerCase() === "script") { + parser.state = S.SCRIPT + } else { + parser.state = S.TEXT + } + parser.tag = null + parser.tagName = "" + } + parser.attribName = parser.attribValue = "" + parser.attribList.length = 0 +} + +function closeTag (parser) { + if (!parser.tagName) { + strictFail(parser, "Weird empty close tag.") + parser.textNode += "</>" + parser.state = S.TEXT + return + } + // first make sure that the closing tag actually exists. + // <a><b></c></b></a> will close everything, otherwise. + var t = parser.tags.length + var tagName = parser.tagName + if (!parser.strict) tagName = tagName[parser.looseCase]() + var closeTo = tagName + while (t --) { + var close = parser.tags[t] + if (close.name !== closeTo) { + // fail the first time in strict mode + strictFail(parser, "Unexpected close tag") + } else break + } + + // didn't find it. we already failed for strict, so just abort. + if (t < 0) { + strictFail(parser, "Unmatched closing tag: "+parser.tagName) + parser.textNode += "</" + parser.tagName + ">" + parser.state = S.TEXT + return + } + parser.tagName = tagName + var s = parser.tags.length + while (s --> t) { + var tag = parser.tag = parser.tags.pop() + parser.tagName = parser.tag.name + emitNode(parser, "onclosetag", parser.tagName) + + var x = {} + for (var i in tag.ns) x[i] = tag.ns[i] + + var parent = parser.tags[parser.tags.length - 1] || parser + if (parser.opt.xmlns && tag.ns !== parent.ns) { + // remove namespace bindings introduced by tag + Object.keys(tag.ns).forEach(function (p) { + var n = tag.ns[p] + emitNode(parser, "onclosenamespace", { prefix: p, uri: n }) + }) + } + } + if (t === 0) parser.closedRoot = true + parser.tagName = parser.attribValue = parser.attribName = "" + parser.attribList.length = 0 + parser.state = S.TEXT +} + +function parseEntity (parser) { + var entity = parser.entity.toLowerCase() + , num + , numStr = "" + if (parser.ENTITIES[entity]) return parser.ENTITIES[entity] + if (entity.charAt(0) === "#") { + if (entity.charAt(1) === "x") { + entity = entity.slice(2) + num = parseInt(entity, 16) + numStr = num.toString(16) + } else { + entity = entity.slice(1) + num = parseInt(entity, 10) + numStr = num.toString(10) + } + } + entity = entity.replace(/^0+/, "") + if (numStr.toLowerCase() !== entity) { + strictFail(parser, "Invalid character entity") + return "&"+parser.entity + ";" + } + return String.fromCharCode(num) +} + +function write (chunk) { + var parser = this + if (this.error) throw this.error + if (parser.closed) return error(parser, + "Cannot write after close. Assign an onready handler.") + if (chunk === null) return end(parser) + var i = 0, c = "" + while (parser.c = c = chunk.charAt(i++)) { + parser.position ++ + if (c === "\n") { + parser.line ++ + parser.column = 0 + } else parser.column ++ + switch (parser.state) { + + case S.BEGIN: + if (c === "<") parser.state = S.OPEN_WAKA + else if (not(whitespace,c)) { + // have to process this as a text node. + // weird, but happens. + strictFail(parser, "Non-whitespace before first tag.") + parser.textNode = c + parser.state = S.TEXT + } + continue + + case S.TEXT: + if (parser.sawRoot && !parser.closedRoot) { + var starti = i-1 + while (c && c!=="<" && c!=="&") { + c = chunk.charAt(i++) + if (c) { + parser.position ++ + if (c === "\n") { + parser.line ++ + parser.column = 0 + } else parser.column ++ + } + } + parser.textNode += chunk.substring(starti, i-1) + } + if (c === "<") parser.state = S.OPEN_WAKA + else { + if (not(whitespace, c) && (!parser.sawRoot || parser.closedRoot)) + strictFail("Text data outside of root node.") + if (c === "&") parser.state = S.TEXT_ENTITY + else parser.textNode += c + } + continue + + case S.SCRIPT: + // only non-strict + if (c === "<") { + parser.state = S.SCRIPT_ENDING + } else parser.script += c + continue + + case S.SCRIPT_ENDING: + if (c === "/") { + emitNode(parser, "onscript", parser.script) + parser.state = S.CLOSE_TAG + parser.script = "" + parser.tagName = "" + } else { + parser.script += "<" + c + parser.state = S.SCRIPT + } + continue + + case S.OPEN_WAKA: + // either a /, ?, !, or text is coming next. + if (c === "!") { + parser.state = S.SGML_DECL + parser.sgmlDecl = "" + } else if (is(whitespace, c)) { + // wait for it... + } else if (is(nameStart,c)) { + parser.startTagPosition = parser.position - 1 + parser.state = S.OPEN_TAG + parser.tagName = c + } else if (c === "/") { + parser.startTagPosition = parser.position - 1 + parser.state = S.CLOSE_TAG + parser.tagName = "" + } else if (c === "?") { + parser.state = S.PROC_INST + parser.procInstName = parser.procInstBody = "" + } else { + strictFail(parser, "Unencoded <") + parser.textNode += "<" + c + parser.state = S.TEXT + } + continue + + case S.SGML_DECL: + if ((parser.sgmlDecl+c).toUpperCase() === CDATA) { + emitNode(parser, "onopencdata") + parser.state = S.CDATA + parser.sgmlDecl = "" + parser.cdata = "" + } else if (parser.sgmlDecl+c === "--") { + parser.state = S.COMMENT + parser.comment = "" + parser.sgmlDecl = "" + } else if ((parser.sgmlDecl+c).toUpperCase() === DOCTYPE) { + parser.state = S.DOCTYPE + if (parser.doctype || parser.sawRoot) strictFail(parser, + "Inappropriately located doctype declaration") + parser.doctype = "" + parser.sgmlDecl = "" + } else if (c === ">") { + emitNode(parser, "onsgmldeclaration", parser.sgmlDecl) + parser.sgmlDecl = "" + parser.state = S.TEXT + } else if (is(quote, c)) { + parser.state = S.SGML_DECL_QUOTED + parser.sgmlDecl += c + } else parser.sgmlDecl += c + continue + + case S.SGML_DECL_QUOTED: + if (c === parser.q) { + parser.state = S.SGML_DECL + parser.q = "" + } + parser.sgmlDecl += c + continue + + case S.DOCTYPE: + if (c === ">") { + parser.state = S.TEXT + emitNode(parser, "ondoctype", parser.doctype) + parser.doctype = true // just remember that we saw it. + } else { + parser.doctype += c + if (c === "[") parser.state = S.DOCTYPE_DTD + else if (is(quote, c)) { + parser.state = S.DOCTYPE_QUOTED + parser.q = c + } + } + continue + + case S.DOCTYPE_QUOTED: + parser.doctype += c + if (c === parser.q) { + parser.q = "" + parser.state = S.DOCTYPE + } + continue + + case S.DOCTYPE_DTD: + parser.doctype += c + if (c === "]") parser.state = S.DOCTYPE + else if (is(quote,c)) { + parser.state = S.DOCTYPE_DTD_QUOTED + parser.q = c + } + continue + + case S.DOCTYPE_DTD_QUOTED: + parser.doctype += c + if (c === parser.q) { + parser.state = S.DOCTYPE_DTD + parser.q = "" + } + continue + + case S.COMMENT: + if (c === "-") parser.state = S.COMMENT_ENDING + else parser.comment += c + continue + + case S.COMMENT_ENDING: + if (c === "-") { + parser.state = S.COMMENT_ENDED + parser.comment = textopts(parser.opt, parser.comment) + if (parser.comment) emitNode(parser, "oncomment", parser.comment) + parser.comment = "" + } else { + parser.comment += "-" + c + parser.state = S.COMMENT + } + continue + + case S.COMMENT_ENDED: + if (c !== ">") { + strictFail(parser, "Malformed comment") + // allow <!-- blah -- bloo --> in non-strict mode, + // which is a comment of " blah -- bloo " + parser.comment += "--" + c + parser.state = S.COMMENT + } else parser.state = S.TEXT + continue + + case S.CDATA: + if (c === "]") parser.state = S.CDATA_ENDING + else parser.cdata += c + continue + + case S.CDATA_ENDING: + if (c === "]") parser.state = S.CDATA_ENDING_2 + else { + parser.cdata += "]" + c + parser.state = S.CDATA + } + continue + + case S.CDATA_ENDING_2: + if (c === ">") { + if (parser.cdata) emitNode(parser, "oncdata", parser.cdata) + emitNode(parser, "onclosecdata") + parser.cdata = "" + parser.state = S.TEXT + } else if (c === "]") { + parser.cdata += "]" + } else { + parser.cdata += "]]" + c + parser.state = S.CDATA + } + continue + + case S.PROC_INST: + if (c === "?") parser.state = S.PROC_INST_ENDING + else if (is(whitespace, c)) parser.state = S.PROC_INST_BODY + else parser.procInstName += c + continue + + case S.PROC_INST_BODY: + if (!parser.procInstBody && is(whitespace, c)) continue + else if (c === "?") parser.state = S.PROC_INST_ENDING + else if (is(quote, c)) { + parser.state = S.PROC_INST_QUOTED + parser.q = c + parser.procInstBody += c + } else parser.procInstBody += c + continue + + case S.PROC_INST_ENDING: + if (c === ">") { + emitNode(parser, "onprocessinginstruction", { + name : parser.procInstName, + body : parser.procInstBody + }) + parser.procInstName = parser.procInstBody = "" + parser.state = S.TEXT + } else { + parser.procInstBody += "?" + c + parser.state = S.PROC_INST_BODY + } + continue + + case S.PROC_INST_QUOTED: + parser.procInstBody += c + if (c === parser.q) { + parser.state = S.PROC_INST_BODY + parser.q = "" + } + continue + + case S.OPEN_TAG: + if (is(nameBody, c)) parser.tagName += c + else { + newTag(parser) + if (c === ">") openTag(parser) + else if (c === "/") parser.state = S.OPEN_TAG_SLASH + else { + if (not(whitespace, c)) strictFail( + parser, "Invalid character in tag name") + parser.state = S.ATTRIB + } + } + continue + + case S.OPEN_TAG_SLASH: + if (c === ">") { + openTag(parser, true) + closeTag(parser) + } else { + strictFail(parser, "Forward-slash in opening tag not followed by >") + parser.state = S.ATTRIB + } + continue + + case S.ATTRIB: + // haven't read the attribute name yet. + if (is(whitespace, c)) continue + else if (c === ">") openTag(parser) + else if (c === "/") parser.state = S.OPEN_TAG_SLASH + else if (is(nameStart, c)) { + parser.attribName = c + parser.attribValue = "" + parser.state = S.ATTRIB_NAME + } else strictFail(parser, "Invalid attribute name") + continue + + case S.ATTRIB_NAME: + if (c === "=") parser.state = S.ATTRIB_VALUE + else if (is(whitespace, c)) parser.state = S.ATTRIB_NAME_SAW_WHITE + else if (is(nameBody, c)) parser.attribName += c + else strictFail(parser, "Invalid attribute name") + continue + + case S.ATTRIB_NAME_SAW_WHITE: + if (c === "=") parser.state = S.ATTRIB_VALUE + else if (is(whitespace, c)) continue + else { + strictFail(parser, "Attribute without value") + parser.tag.attributes[parser.attribName] = "" + parser.attribValue = "" + emitNode(parser, "onattribute", + { name : parser.attribName, value : "" }) + parser.attribName = "" + if (c === ">") openTag(parser) + else if (is(nameStart, c)) { + parser.attribName = c + parser.state = S.ATTRIB_NAME + } else { + strictFail(parser, "Invalid attribute name") + parser.state = S.ATTRIB + } + } + continue + + case S.ATTRIB_VALUE: + if (is(whitespace, c)) continue + else if (is(quote, c)) { + parser.q = c + parser.state = S.ATTRIB_VALUE_QUOTED + } else { + strictFail(parser, "Unquoted attribute value") + parser.state = S.ATTRIB_VALUE_UNQUOTED + parser.attribValue = c + } + continue + + case S.ATTRIB_VALUE_QUOTED: + if (c !== parser.q) { + if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_Q + else parser.attribValue += c + continue + } + attrib(parser) + parser.q = "" + parser.state = S.ATTRIB + continue + + case S.ATTRIB_VALUE_UNQUOTED: + if (not(attribEnd,c)) { + if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_U + else parser.attribValue += c + continue + } + attrib(parser) + if (c === ">") openTag(parser) + else parser.state = S.ATTRIB + continue + + case S.CLOSE_TAG: + if (!parser.tagName) { + if (is(whitespace, c)) continue + else if (not(nameStart, c)) strictFail(parser, + "Invalid tagname in closing tag.") + else parser.tagName = c + } + else if (c === ">") closeTag(parser) + else if (is(nameBody, c)) parser.tagName += c + else { + if (not(whitespace, c)) strictFail(parser, + "Invalid tagname in closing tag") + parser.state = S.CLOSE_TAG_SAW_WHITE + } + continue + + case S.CLOSE_TAG_SAW_WHITE: + if (is(whitespace, c)) continue + if (c === ">") closeTag(parser) + else strictFail("Invalid characters in closing tag") + continue + + case S.TEXT_ENTITY: + case S.ATTRIB_VALUE_ENTITY_Q: + case S.ATTRIB_VALUE_ENTITY_U: + switch(parser.state) { + case S.TEXT_ENTITY: + var returnState = S.TEXT, buffer = "textNode" + break + + case S.ATTRIB_VALUE_ENTITY_Q: + var returnState = S.ATTRIB_VALUE_QUOTED, buffer = "attribValue" + break + + case S.ATTRIB_VALUE_ENTITY_U: + var returnState = S.ATTRIB_VALUE_UNQUOTED, buffer = "attribValue" + break + } + if (c === ";") { + parser[buffer] += parseEntity(parser) + parser.entity = "" + parser.state = returnState + } + else if (is(entity, c)) parser.entity += c + else { + strictFail("Invalid character entity") + parser[buffer] += "&" + parser.entity + c + parser.entity = "" + parser.state = returnState + } + continue + + default: + throw new Error(parser, "Unknown state: " + parser.state) + } + } // while + // cdata blocks can get very big under normal conditions. emit and move on. + // if (parser.state === S.CDATA && parser.cdata) { + // emitNode(parser, "oncdata", parser.cdata) + // parser.cdata = "" + // } + if (parser.position >= parser.bufferCheckPosition) checkBufferLength(parser) + return parser +} + +})(typeof exports === "undefined" ? sax = {} : exports) + |