четверг, 21 ноября 2013 г.

HTML parser на JavaScript

Напишем HTML парсер на JavaScript.

Наша парсер будет проверять:

- отсутствие закрывающих тегов:
<p><b>Hello → <p><b>Hello</b></p>

- отсутствие закрывающего слэша в «самозакрывающихся» элементах:
<img src=test.jpg> → <img src="test.jpg"/>

- незакрытый строчный элемент перед блочным:
<b>Hello <p>John → <b>Hello </b><p>John</p>

- отсутствие закрывающих тегов у элементов, для которых это допустимо в HTML4:
<p>Hello<p>World → <p>Hello</p><p>World</p>

- атрибуты без значений (флаги):
<input disabled> → <input disabled="disabled"/>

- ошибочный порядок закрывающих тегов вложенных элементов:
<b><i>example</b></i> → <b><i>example</i></b>

HTMLпарсер буде работать следующим образом:

SAX-style API

Управление тэгами, текстом и комментариями с помощью функций обратного вызова:

var results = "";

HTMLParser("<p id=test>hello <i>world", {
  start: function( tag, attrs, unary ) {
    results += "<" + tag;

    for ( var i = 0; i < attrs.length; i++ )
      results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';

    results += (unary ? "/" : "") + ">";
  },
  end: function( tag ) {
    results += "</" + tag + ">";
  },
  chars: function( text ) {
    results += text;
  },
  comment: function( text ) {
    results += "<!--" + text + "-->";
  }
});

results == '<p id="test">hello <i>world</i></p>"

XML Serializer

var results = HTMLtoXML("<p>Data: <input disabled>")
results == '<p>Data: <input disabled="disabled"/></p>'

DOM Builder

// The following is appended into the document body
HTMLtoDOM("<p>Hello <b>World", document)

// The follow is appended into the specified element
HTMLtoDOM("<p>Hello <b>World", document.getElementById("test"))

DOM Document Creator

var dom = HTMLtoDOM("<p>Data: <input disabled>");
dom.getElementsByTagName("body").length == 1
dom.getElementsByTagName("p").length == 1
While this library doesn’t cover the full gamut of possible weirdness that HTML provides, it does handle a lot of the most obvious stuff. All of the following are accounted for:

Код HTML парсера на JavaScript:

/*
 * HTML Parser By John Resig (ejohn.org)
 * Original code by Erik Arvidsson, Mozilla Public License
 * http: // erik.eae.net/simplehtmlparser/simplehtmlparser.js
 *
 * // Use like so:
 * HTMLParser(htmlString, {
 *     start: function(tag, attrs, unary) {},
 *     end: function(tag) {},
 *     chars: function(text) {},
 *     comment: function(text) {}
 * });
 *
 * // or to get an XML string:
 * HTMLtoXML(htmlString);
 *
 * // or to get an XML DOM Document
 * HTMLtoDOM(htmlString);
 *
 * // or to inject into an existing document/DOM node
 * HTMLtoDOM(htmlString, document);
 * HTMLtoDOM(htmlString, document.body);
 *
 */

(function(){

// Regular Expressions for parsing tags and attributes
var startTag = /^<([-A-Za-z0-9_]+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,
endTag = /^<\/([-A-Za-z0-9_]+)[^>]*>/,
attr = /([-A-Za-z0-9_]+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;

// Empty Elements - HTML 4.01
var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");

// Block Elements - HTML 4.01
var block = makeMap("address,applet,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");

// Inline Elements - HTML 4.01
var inline = makeMap("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");

// Elements that you can, intentionally, leave open
// (and which close themselves)
var closeSelf = makeMap("colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr");

// Attributes that have their values filled in disabled="disabled"
var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");

// Special Elements (can contain anything)
var special = makeMap("script,style");

var HTMLParser = this.HTMLParser = function( html, handler ) {
var index, chars, match, stack = [], last = html;
stack.last = function(){
return this[ this.length - 1 ];
};

while ( html ) {
chars = true;

// Make sure we're not in a script or style element
if ( !stack.last() || !special[ stack.last() ] ) {

// Comment
if ( html.indexOf("<!--") == 0 ) {
index = html.indexOf("-->");

if ( index >= 0 ) {
if ( handler.comment )
handler.comment( html.substring( 4, index ) );
html = html.substring( index + 3 );
chars = false;
}

// end tag
} else if ( html.indexOf("</") == 0 ) {
match = html.match( endTag );

if ( match ) {
html = html.substring( match[0].length );
match[0].replace( endTag, parseEndTag );
chars = false;
}

// start tag
} else if ( html.indexOf("<") == 0 ) {
match = html.match( startTag );

if ( match ) {
html = html.substring( match[0].length );
match[0].replace( startTag, parseStartTag );
chars = false;
}
}

if ( chars ) {
index = html.indexOf("<");

var text = index < 0 ? html : html.substring( 0, index );
html = index < 0 ? "" : html.substring( index );

if ( handler.chars )
handler.chars( text );
}

} else {
html = html.replace(new RegExp("(.*)<\/" + stack.last() + "[^>]*>"), function(all, text){
text = text.replace(/<!--(.*?)-->/g, "$1")
.replace(/<!\[CDATA\[(.*?)]]>/g, "$1");

if ( handler.chars )
handler.chars( text );

return "";
});

parseEndTag( "", stack.last() );
}

if ( html == last )
throw "Parse Error: " + html;
last = html;
}

// Clean up any remaining tags
parseEndTag();

function parseStartTag( tag, tagName, rest, unary ) {
tagName = tagName.toLowerCase();

if ( block[ tagName ] ) {
while ( stack.last() && inline[ stack.last() ] ) {
parseEndTag( "", stack.last() );
}
}

if ( closeSelf[ tagName ] && stack.last() == tagName ) {
parseEndTag( "", tagName );
}

unary = empty[ tagName ] || !!unary;

if ( !unary )
stack.push( tagName );

if ( handler.start ) {
var attrs = [];

rest.replace(attr, function(match, name) {
var value = arguments[2] ? arguments[2] :
arguments[3] ? arguments[3] :
arguments[4] ? arguments[4] :
fillAttrs[name] ? name : "";

attrs.push({
name: name,
value: value,
escaped: value.replace(/(^|[^\\])"/g, '$1\\\"') //"
});
});

if ( handler.start )
handler.start( tagName, attrs, unary );
}
}

function parseEndTag( tag, tagName ) {
// If no tag name is provided, clean shop
if ( !tagName )
var pos = 0;

// Find the closest opened tag of the same type
else
for ( var pos = stack.length - 1; pos >= 0; pos-- )
if ( stack[ pos ] == tagName )
break;

if ( pos >= 0 ) {
// Close all the open elements, up the stack
for ( var i = stack.length - 1; i >= pos; i-- )
if ( handler.end )
handler.end( stack[ i ] );

// Remove the open elements from the stack
stack.length = pos;
}
}
};

this.HTMLtoXML = function( html ) {
var results = "";

HTMLParser(html, {
start: function( tag, attrs, unary ) {
results += "<" + tag;

for ( var i = 0; i < attrs.length; i++ )
results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';

results += (unary ? "/" : "") + ">";
},
end: function( tag ) {
results += "</" + tag + ">";
},
chars: function( text ) {
results += text;
},
comment: function( text ) {
results += "<!--" + text + "-->";
}
});

return results;
};

this.HTMLtoDOM = function( html, doc ) {
// There can be only one of these elements
var one = makeMap("html,head,body,title");

// Enforce a structure for the document
var structure = {
link: "head",
base: "head"
};

if ( !doc ) {
if ( typeof DOMDocument != "undefined" )
doc = new DOMDocument();
else if ( typeof document != "undefined" && document.implementation && document.implementation.createDocument )
doc = document.implementation.createDocument("", "", null);
else if ( typeof ActiveX != "undefined" )
doc = new ActiveXObject("Msxml.DOMDocument");

} else
doc = doc.ownerDocument ||
doc.getOwnerDocument && doc.getOwnerDocument() ||
doc;

var elems = [],
documentElement = doc.documentElement ||
doc.getDocumentElement && doc.getDocumentElement();

// If we're dealing with an empty document then we
// need to pre-populate it with the HTML document structure
if ( !documentElement && doc.createElement ) (function(){
var html = doc.createElement("html");
var head = doc.createElement("head");
head.appendChild( doc.createElement("title") );
html.appendChild( head );
html.appendChild( doc.createElement("body") );
doc.appendChild( html );
})();

// Find all the unique elements
if ( doc.getElementsByTagName )
for ( var i in one )
one[ i ] = doc.getElementsByTagName( i )[0];

// If we're working with a document, inject contents into
// the body element
var curParentNode = one.body;

HTMLParser( html, {
start: function( tagName, attrs, unary ) {
// If it's a pre-built element, then we can ignore
// its construction
if ( one[ tagName ] ) {
curParentNode = one[ tagName ];
if ( !unary ) {
elems.push( curParentNode );
}
return;
}

var elem = doc.createElement( tagName );

for ( var attr in attrs )
elem.setAttribute( attrs[ attr ].name, attrs[ attr ].value );

if ( structure[ tagName ] && typeof one[ structure[ tagName ] ] != "boolean" )
one[ structure[ tagName ] ].appendChild( elem );

else if ( curParentNode && curParentNode.appendChild )
curParentNode.appendChild( elem );

if ( !unary ) {
elems.push( elem );
curParentNode = elem;
}
},
end: function( tag ) {
elems.length -= 1;

// Init the new parentNode
curParentNode = elems[ elems.length - 1 ];
},
chars: function( text ) {
curParentNode.appendChild( doc.createTextNode( text ) );
},
comment: function( text ) {
// create comment node
}
});

return doc;
};

function makeMap(str){
var obj = {}, items = str.split(",");
for ( var i = 0; i < items.length; i++ )
obj[ items[i] ] = true;
return obj;
}

})();

Комментариев нет:

Отправить комментарий