-- Module: XmlParser

-- https://github.com/manoelcampos/xml2lua/blob/master/xml2lua.lua

--- @module Class providing the actual XML parser.
--  Available options are:
--      * stripWS
--        Strip non-significant whitespace (leading/trailing)
--        and do not generate events for empty text elements
--
--      * expandEntities
--        Expand entities (standard entities + single char
--        numeric entities only currently - could be extended
--        at runtime if a suitable DTD parser added elements
--        to the table (see obj._ENTITIES)). It may also be possible
--        to expand multibyte entities for UTF-8 only
--
--      * errorHandler
--        Custom error handler function
--
--  NOTE: Boolean options must be set to 'nil' not '0'

--- Translates a decimal character code into a character.
-- Codes from 0 to 255 become the single byte with that value; every other
-- code is handed back untouched as the numeric reference "&#code;".
-- @param code the decimal value to convert (digits as a string, or a number)
-- @return a one-byte string, or the text "&#code;" when the code is out of range
local function decimalToHtmlChar(code)
    local value = tonumber(code)
    local fits_in_byte = value >= 0 and value < 256

    if not fits_in_byte then
        return "&#" .. code .. ";"
    end

    return string.char(value)
end

--- Translates a hexadecimal character code into a character.
-- Codes from 0x00 to 0xFF become the single byte with that value; every other
-- code is handed back untouched as the numeric reference "&#xCode;".
-- @param code the hexadecimal value to convert, as a string of hex digits
-- @return a one-byte string, or the text "&#xCode;" when the code is out of range
local function hexadecimalToHtmlChar(code)
    local value = tonumber(code, 16)
    local fits_in_byte = value >= 0 and value < 256

    if not fits_in_byte then
        return "&#x" .. code .. ";"
    end

    return string.char(value)
end

--- Prototype table for parser instances.
-- Holds the Lua patterns used to recognise XML constructs, the error
-- messages reported through the error handler, and the entity table used
-- when the expandEntities option is enabled.
local XmlParser = {
    -- Private attributes/functions

    -- Leading text followed by one tag. Captures: text before the tag,
    -- "/" for an end tag, the tag body, and "/" for a self-closing tag.
    _XML       = '^([^<]*)<(%/?)([^>]-)(%/?)>',
    -- Attribute with a double-quoted value. Captures: name, value.
    _ATTR1     = '([%w-:_]+)%s*=%s*"(.-)"',
    -- Attribute with a single-quoted value. Captures: name, value.
    _ATTR2     = '([%w-:_]+)%s*=%s*\'(.-)\'',
    -- CDATA section. Captures the section content.
    _CDATA     = '<%!%[CDATA%[(.-)%]%]>',
    -- Processing instruction (also used for the XML declaration).
    -- Captures the text between "<?" and "?>".
    _PI        = '<%?(.-)%?>',
    -- Comment. Captures the comment text.
    _COMMENT   = '<!%-%-(.-)%-%->',
    -- Tag name: everything before the first whitespace character.
    _TAG       = '^(.-)%s.*',
    -- Leading whitespace.
    _LEADINGWS = '^%s+',
    -- Trailing whitespace.
    _TRAILINGWS = '%s+$',
    -- A string made only of whitespace (or empty).
    _WS        = '^%s*$',
    -- DOCTYPE with SYSTEM identifier and internal subset.
    _DTD1      = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*(%b[])%s*>',
    -- DOCTYPE with PUBLIC identifier, URI and internal subset.
    _DTD2      = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*(%b[])%s*>',
    -- DOCTYPE with internal subset only.
    _DTD3      = '<!DOCTYPE%s+(.-)%s*(%b[])%s*>',
    -- DOCTYPE with SYSTEM identifier only.
    _DTD4      = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*>',
    -- DOCTYPE with PUBLIC identifier and URI only.
    _DTD5      = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*>',

    -- Intended to match an attribute whose double quotes are never closed.
    -- NOTE(review): Lua patterns have no lazy "+?" quantifier; the "?" after
    -- "=+" appears to be matched as a literal question mark, so this pattern
    -- likely only matches text such as  =?"...  -- confirm the intent.
    _ATTRERR1  = '=+?%s*"[^"]*$',
    -- Intended to match an attribute whose single quotes are never closed.
    -- NOTE(review): same "+?" caveat as _ATTRERR1.
    _ATTRERR2  = '=+?%s*\'[^\']*$',
    -- Matches the end of a tag: ">" optionally preceded by "/".
    -- Captures the "/" (or an empty string).
    _TAGEXT     = '(%/?)>',

    -- Messages passed to the error handler, keyed by error kind.
    _errstr = {
        xmlErr = "Error Parsing XML",
        declErr = "Error Parsing XMLDecl",
        declStartErr = "XMLDecl not at start of document",
        declAttrErr = "Invalid XMLDecl attributes",
        piErr = "Error Parsing Processing Instruction",
        commentErr = "Error Parsing Comment",
        cdataErr = "Error Parsing CDATA",
        dtdErr = "Error Parsing DTD",
        endTagErr = "End Tag Attributes Invalid",
        unmatchedTagErr = "Unbalanced Tag",
        incompleteXmlErr = "Incomplete XML Document",
    },

    -- Entity expansion table: each key is a Lua pattern and each value is
    -- the replacement handed to string.gsub (a string or a function that
    -- receives the pattern's capture).
    _ENTITIES = {
        ["&lt;"] = "<",
        ["&gt;"] = ">",
        ["&amp;"] = "&",
        ["&quot;"] = '"',
        ["&apos;"] = "'",
        ["&#(%d+);"] = decimalToHtmlChar,
        ["&#x(%x+);"] = hexadecimalToHtmlChar,
    },
}

--- Instantiates a XmlParser object.
--@param _handler Handler module to be used to convert the XML string
--              to another format. See the available handlers at the handler directory.
--              Usually you get an instance of a handler module using, for instance:
--              local handler = require("xmlhandler/tree").
--@param _options Options for this XmlParser instance (stripWS,
--              expandEntities, errorHandler). May be omitted, in which
--              case no option is enabled.
--@return the new parser instance
--@see XmlParser.options
function XmlParser.new(_handler, _options)
    local obj = {
        handler = _handler,
        -- The parsing helpers read self.options.<name> unconditionally, so a
        -- missing options argument must become an empty table rather than nil.
        options = _options or {},
        _stack = {},
    }

    setmetatable(obj, XmlParser)
    -- Kept for backward compatibility with code that may read obj.__index.
    obj.__index = XmlParser
    return obj
end

--- Tells whether a function/field can be found in a table or, failing that,
-- anywhere along its chain of metatables.
--@param table the table to check if it has a given function
--@param elementName the name of the function/field to look for
--@return true if the function/field exists, false otherwise
local function fexists(table, elementName)
    -- Walk from the value itself up through successive metatables.
    local current = table
    while current ~= nil do
        if current[elementName] ~= nil then
            return true
        end
        current = getmetatable(current)
    end
    return false
end

--- Reports a parsing error through the user-supplied error handler.
-- Nothing happens when the errorHandler option is not set.
--@param self the parser instance (its options table is consulted)
--@param message the error message
--@param pos the position in the XML string the error refers to
local function err(self, message, pos)
    local handler = self.options.errorHandler
    if not handler then
        return
    end
    handler(message, pos)
end

--- Trims leading and trailing whitespace when the stripWS option is enabled.
--@param self the parser instance (its options table is consulted)
--@param s the string to trim
--@return the trimmed string, or s unchanged when stripWS is not enabled
local function stripWS(self, s)
    if not self.options.stripWS then
        return s
    end

    -- Assign to a local first so that only the string (and not gsub's
    -- replacement count) is returned.
    local trimmed = string.gsub(s, '^%s+', '')
    trimmed = string.gsub(trimmed, '%s+$', '')
    return trimmed
end

--- Expands entity references in a string when the expandEntities option is set.
-- Each "&...;" reference in the input is looked up exactly once against the
-- patterns in self._ENTITIES, and the replacement text is never re-scanned.
-- This keeps the result independent of the (unspecified) iteration order of
-- the entity table, so "&amp;lt;" always yields "&lt;" and never "<".
-- References that match no entry are left untouched.
--@param self the parser instance (options and _ENTITIES are consulted)
--@param s the string to process
--@return the string with known entity references expanded, or s unchanged
--        when expandEntities is not enabled
local function parseEntities(self, s)
    if not self.options.expandEntities then
        return s
    end

    local entities = self._ENTITIES

    -- Resolve one complete reference; returning nil tells gsub to keep it.
    local function expand(reference)
        for pattern, replacement in pairs(entities) do
            -- Anchor the entry so it must account for the whole reference.
            local expanded, hits = string.gsub(reference, "^" .. pattern .. "$", replacement)
            if hits > 0 then
                return expanded
            end
        end
        return nil
    end

    -- Assign to a local so only the string (not the match count) is returned.
    local result = string.gsub(s, "&[^&;]*;", expand)
    return result
end

--- Parses a string representing a tag.
--@param self the parser instance (supplies the _TAG, _ATTR1 and _ATTR2 patterns)
--@param s String containing tag text
--@return a {name, attrs} table
-- where name is the name of the tag and attrs
-- is a table containing the attributes of the tag,
-- or nil when the tag has no attributes
local function parseTag(self, s)
    -- gsub also returns a match count; keep only the resulting string.
    local name = string.gsub(s, self._TAG, '%1')
    local tag = {
        name = name,
        attrs = {},
    }

    -- Track "at least one attribute seen" in a local flag rather than in a
    -- sentinel key of tag.attrs, so an attribute literally named "_" is kept.
    local hasAttributes = false
    local function collect(k, v)
        tag.attrs[k] = parseEntities(self, v)
        hasAttributes = true
    end

    -- Double-quoted values first, then single-quoted ones.
    string.gsub(s, self._ATTR1, collect)
    string.gsub(s, self._ATTR2, collect)

    if not hasAttributes then
        tag.attrs = nil
    end

    return tag
end

--- Parses the XML declaration (<?xml ... ?>) found at or after f.pos.
-- On success f.match, f.endMatch and f.text are updated with the match and
-- the handler's decl callback, if any, is invoked.
-- When the declaration cannot be matched, the error is reported and f is
-- left untouched so the caller can continue after the current tag.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f the table holding the parsing state
--@return the parsed {name, attrs} tag, or nil when the declaration is malformed
local function parseXmlDeclaration(self, xml, f)
    local first, last, text = string.find(xml, self._PI, f.pos)
    if not first then
        err(self, self._errstr.declErr, f.pos)
        return nil
    end
    f.match, f.endMatch, f.text = first, last, text

    -- Must be at start of doc if present
    if f.match ~= 1 then
        err(self, self._errstr.declStartErr, f.pos)
    end

    local tag = parseTag(self, f.text)
    -- TODO: Check if attributes are valid
    -- Check for version (mandatory): a declaration without any attribute
    -- lacks it just as much as one with other attributes only.
    if not tag.attrs or tag.attrs.version == nil then
        err(self, self._errstr.declAttrErr, f.pos)
    end

    if fexists(self.handler, 'decl') then
        self.handler:decl(tag, f.match, f.endMatch)
    end

    return tag
end

--- Parses an XML processing instruction (<?target ... ?>) at or after f.pos.
-- On success f.match, f.endMatch and f.text are updated; when the handler
-- defines a pi callback, the instruction is parsed into a tag whose
-- attrs._text holds everything following the target name, and the callback
-- is invoked. When the instruction cannot be matched, the error is reported
-- and f is left untouched so the caller can continue after the current tag.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f the table holding the parsing state
--@return the parsed tag, or an empty table when nothing was parsed
local function parseXmlProcessingInstruction(self, xml, f)
    local tag = {}

    -- XML Processing Instruction (PI)
    local first, last, text = string.find(xml, self._PI, f.pos)
    if not first then
        err(self, self._errstr.piErr, f.pos)
        return tag
    end
    f.match, f.endMatch, f.text = first, last, text

    if fexists(self.handler, 'pi') then
        -- Parse PI attributes & text
        tag = parseTag(self, f.text)
        -- Everything after the target name, leading whitespace included.
        local pi = string.sub(f.text, string.len(tag.name) + 1)
        if pi ~= "" then
            if tag.attrs then
                tag.attrs._text = pi
            else
                tag.attrs = { _text = pi }
            end
        end
        self.handler:pi(tag, f.match, f.endMatch)
    end

    return tag
end

--- Parses an XML comment (<!-- ... -->) at or after f.pos.
-- On success f.match, f.endMatch and f.text are updated and, when the
-- handler defines a comment callback, it receives the (optionally trimmed
-- and entity-expanded) comment text. When the comment cannot be matched,
-- the error is reported and f is left untouched so the caller can continue
-- after the current tag.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f the table holding the parsing state
local function parseComment(self, xml, f)
    local first, last, text = string.find(xml, self._COMMENT, f.pos)
    if not first then
        err(self, self._errstr.commentErr, f.pos)
        return
    end
    f.match, f.endMatch, f.text = first, last, text

    if fexists(self.handler, 'comment') then
        f.text = parseEntities(self, stripWS(self, f.text))
        -- The second argument is nil, as for the text and cdata callbacks
        -- (the global function `next` used to be passed here by accident).
        self.handler:comment(f.text, nil, f.match, f.endMatch)
    end
end

--- Tries each supported DOCTYPE form in turn at or after pos.
-- The forms with an internal subset (_DTD1.._DTD3) are tried before the
-- ones without (_DTD4, _DTD5).
--@param self the parser instance (supplies the _DTD1.._DTD5 patterns)
--@param xml the XML string being parsed
--@param pos the position at which to start searching
--@return match start, match end and a table describing the DOCTYPE
--        (_root plus, depending on the form, _type, _name, _uri and
--        _internal); nil when no form matches
local function _parseDtd(self, xml, pos)
    -- match,endMatch,root,type,name,uri,internal
    local m, e, r, t, n, u, i

    m, e, r, t, u, i = string.find(xml, self._DTD1, pos)
    if m then
        return m, e, { _root = r, _type = t, _uri = u, _internal = i }
    end

    m, e, r, t, n, u, i = string.find(xml, self._DTD2, pos)
    if m then
        return m, e, { _root = r, _type = t, _name = n, _uri = u, _internal = i }
    end

    m, e, r, i = string.find(xml, self._DTD3, pos)
    if m then
        return m, e, { _root = r, _internal = i }
    end

    -- These two searches used to read an undefined variable instead of xml.
    m, e, r, t, u = string.find(xml, self._DTD4, pos)
    if m then
        return m, e, { _root = r, _type = t, _uri = u }
    end

    m, e, r, t, n, u = string.find(xml, self._DTD5, pos)
    if m then
        return m, e, { _root = r, _type = t, _name = n, _uri = u }
    end

    return nil
end

--- Parses a DOCTYPE declaration at or after f.pos.
-- On success f.match and f.endMatch are updated and, when the handler
-- defines a dtd callback, it receives the root element name and the table
-- describing the DOCTYPE. When no DOCTYPE form matches, the error is
-- reported and f is left untouched so the caller can continue after the
-- current tag.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f the table holding the parsing state
local function parseDtd(self, xml, f)
    -- _parseDtd is a file-local function, not a method of the parser, so it
    -- is called directly; attrs stays local instead of leaking as a global.
    local first, last, attrs = _parseDtd(self, xml, f.pos)
    if not first then
        err(self, self._errstr.dtdErr, f.pos)
        return
    end
    f.match, f.endMatch = first, last

    if fexists(self.handler, 'dtd') then
        self.handler:dtd(attrs._root, attrs, f.match, f.endMatch)
    end
end

--- Parses a CDATA section (<![CDATA[ ... ]]>) at or after f.pos.
-- On success f.match, f.endMatch and f.text are updated and, when the
-- handler defines a cdata callback, it receives the raw section content.
-- When the section cannot be matched, the error is reported and f is left
-- untouched so the caller can continue after the current tag.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f the table holding the parsing state
local function parseCdata(self, xml, f)
    local first, last, text = string.find(xml, self._CDATA, f.pos)
    if not first then
        err(self, self._errstr.cdataErr, f.pos)
        return
    end
    f.match, f.endMatch, f.text = first, last, text

    if fexists(self.handler, 'cdata') then
        self.handler:cdata(f.text, nil, f.match, f.endMatch)
    end
end

--- Parse a Normal tag
-- Need check for embedded '>' in attribute value and extend
-- match recursively if necessary.
-- Start tags are pushed on self._stack and end tags pop it, whether or not
-- the handler defines the corresponding callbacks, so the balance check at
-- the end of the document does not depend on the handler in use.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f the table holding the parsing state (tagstr, endt1, endt2,
--         match and endMatch are read; tagstr/endMatch/endt2 may be extended)
--@return the parsed {name, attrs} tag
local function parseNormalTag(self, xml, f)
    -- Check for errors
    while true do
        -- If there isn't an attribute without closing quotes (single or double quotes)
        -- then break to follow the normal processing of the tag.
        -- Otherwise, try to find where the quotes close.
        f.errStart, f.errEnd = string.find(f.tagstr, self._ATTRERR1)

        if f.errEnd == nil then
            f.errStart, f.errEnd = string.find(f.tagstr, self._ATTRERR2)
            if f.errEnd == nil then
                break
            end
        end

        local extStart, extEnd, endt2 = string.find(xml, self._TAGEXT, f.endMatch + 1)
        if not extEnd then
            -- The input ends before another '>' is found: report the error
            -- and work with what has been matched so far.
            err(self, self._errstr.xmlErr, f.pos)
            break
        end

        f.extStart, f.extEnd, f.endt2 = extStart, extEnd, endt2
        f.tagstr = f.tagstr .. string.sub(xml, f.endMatch, f.extEnd - 1)
        f.endMatch = f.extEnd
    end

    -- Extract tag name and attrs
    local tag = parseTag(self, f.tagstr)

    if f.endt1 == "/" then
        if tag.attrs then
            -- Shouldn't have any attributes in endtag
            err(self, string.format("%s (/%s)", self._errstr.endTagErr, tag.name), f.pos)
        end
        if table.remove(self._stack) ~= tag.name then
            err(self, string.format("%s (/%s)", self._errstr.unmatchedTagErr, tag.name), f.pos)
        end
        if fexists(self.handler, 'endtag') then
            self.handler:endtag(tag, f.match, f.endMatch)
        end
    else
        table.insert(self._stack, tag.name)
        if fexists(self.handler, 'starttag') then
            self.handler:starttag(tag, f.match, f.endMatch)
        end
        -- TODO: Self-closing tags are being returned as a table, which
        -- makes them harder for the NCLua app to handle. An empty string
        -- field should be returned instead.

        -- Self-Closing Tag
        if f.endt2 == "/" then
            table.remove(self._stack)
            if fexists(self.handler, 'endtag') then
                self.handler:endtag(tag, f.match, f.endMatch)
            end
        end
    end

    return tag
end

--- Dispatches the tag held in f.tagstr to the parser for its kind:
-- XML declaration, processing instruction, comment, DOCTYPE, CDATA section
-- or, when nothing else applies, a normal tag.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f the table holding the parsing state
local function parseTagType(self, xml, f)
    local tagstr = f.tagstr

    -- The declaration test must come before the generic "?" test.
    if string.find(string.sub(tagstr, 1, 5), "?xml%s") then
        parseXmlDeclaration(self, xml, f)
        return
    end

    if string.sub(tagstr, 1, 1) == "?" then
        parseXmlProcessingInstruction(self, xml, f)
        return
    end

    if string.sub(tagstr, 1, 3) == "!--" then
        parseComment(self, xml, f)
        return
    end

    local prefix = string.sub(tagstr, 1, 8)
    if prefix == "!DOCTYPE" then
        parseDtd(self, xml, f)
        return
    end

    if prefix == "![CDATA[" then
        parseCdata(self, xml, f)
        return
    end

    parseNormalTag(self, xml, f)
end

--- Get next tag (first pass - exceptions are fixed up by the tag parsers).
-- Searches xml from f.pos for optional leading text followed by a tag and
-- stores the match bounds and captures in f (match, endMatch, text, endt1,
-- tagstr, endt2). When nothing matches, an error is reported unless only
-- whitespace remains and no tag is still open.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f the table holding the parsing state
--@return true if the next tag could be got, false otherwise
local function getNextTag(self, xml, f)
    f.match, f.endMatch, f.text, f.endt1, f.tagstr, f.endt2 =
        string.find(xml, self._XML, f.pos)

    if f.match == nil then
        local onlyWhitespaceLeft = string.find(xml, self._WS, f.pos)

        -- No more text and every tag closed: the document is complete.
        if onlyWhitespaceLeft and #self._stack == 0 then
            return false
        end

        if onlyWhitespaceLeft then
            -- No more text, but some tag is still open
            err(self, self._errstr.incompleteXmlErr, f.pos)
        else
            -- Unparsable text
            err(self, self._errstr.xmlErr, f.pos)
        end
    end

    -- Normalise the fields the caller reads even after a failed match.
    f.match = f.match or 0
    f.tagstr = f.tagstr or ''
    f.text = f.text or ''

    return f.endMatch ~= nil
end

--- Main function which starts the XML parsing process.
-- Walks the document tag by tag, reporting leading text and each tag to the
-- handler callbacks that exist.
--@param xml the XML string to parse
--@param parseAttributes indicates if tag attributes should be parsed or not.
--      If omitted, the default value is true.
function XmlParser:parse(xml, parseAttributes)
    if type(self) ~= "table" or getmetatable(self) ~= XmlParser then
        error("You must call xmlparser:parse(parameters) instead of xmlparser.parse(parameters)")
    end

    -- Only nil means "omitted": an explicit false must be preserved.
    if parseAttributes == nil then
        parseAttributes = true
    end

    self.handler.parseAttributes = parseAttributes

    -- Stores string.find results and parameters
    -- and other auxiliary variables
    local f = {
        -- string.find return
        match = 0,
        endMatch = 0,
        -- text, end1, tagstr, end2,

        -- string.find parameters and auxiliary variables
        pos = 1,
        -- startText, endText,
        -- errStart, errEnd, extStart, extEnd,
    }

    while f.match do
        if not getNextTag(self, xml, f) then
            break
        end

        -- Handle leading text
        f.startText = f.match
        f.endText = f.match + string.len(f.text) - 1
        -- From here on f.match points at the tag itself.
        f.match = f.match + string.len(f.text)
        f.text = parseEntities(self, stripWS(self, f.text))
        if f.text ~= "" and fexists(self.handler, 'text') then
            -- Report the span of the text, not the start of the following tag.
            self.handler:text(f.text, nil, f.startText, f.endText)
        end

        parseTagType(self, xml, f)
        f.pos = f.endMatch + 1
    end
end

-- Instances get XmlParser as their metatable (see XmlParser.new); pointing
-- __index back at the table lets them resolve methods such as parse.
XmlParser.__index = XmlParser

-- Module result.
return XmlParser