blob: c6704faae969a43d6d85576ff45fbee449495c34 [file] [log] [blame]
import os
import sys
import traceback
import StringIO
import warnings
import re
warnings.simplefilter("error")
from support import html5lib_test_files as data_files
from support import TestData, convert, convertExpected
import html5lib
from html5lib import html5parser, treebuilders, constants
treeTypes = {"simpletree":treebuilders.getTreeBuilder("simpletree"),
"DOM":treebuilders.getTreeBuilder("dom")}
#Try whatever etree implementations are avaliable from a list that are
#"supposed" to work
try:
import xml.etree.ElementTree as ElementTree
treeTypes['ElementTree'] = treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True)
except ImportError:
try:
import elementtree.ElementTree as ElementTree
treeTypes['ElementTree'] = treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True)
except ImportError:
pass
try:
import xml.etree.cElementTree as cElementTree
treeTypes['cElementTree'] = treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True)
except ImportError:
try:
import cElementTree
treeTypes['cElementTree'] = treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True)
except ImportError:
pass
try:
try:
import lxml.html as lxml
except ImportError:
import lxml.etree as lxml
treeTypes['lxml'] = treebuilders.getTreeBuilder("lxml", lxml, fullTree=True)
except ImportError:
pass
try:
import BeautifulSoup
treeTypes["beautifulsoup"] = treebuilders.getTreeBuilder("beautifulsoup", fullTree=True)
except ImportError:
pass
#Try whatever dom implementations are avaliable from a list that are
#"supposed" to work
try:
import pxdom
treeTypes["pxdom"] = treebuilders.getTreeBuilder("dom", pxdom)
except ImportError:
pass
#Run the parse error checks
checkParseErrors = False
#XXX - There should just be one function here but for some reason the testcase
#format differs from the treedump format by a single space character
def convertTreeDump(data):
return "\n".join(convert(3)(data).split("\n")[1:])
namespaceExpected = re.compile(r"^(\s*)<(\S+)>", re.M).sub
def runParserTest(innerHTML, input, expected, errors, treeClass,
namespaceHTMLElements):
#XXX - move this out into the setup function
#concatenate all consecutive character tokens into a single token
try:
p = html5parser.HTMLParser(tree = treeClass,
namespaceHTMLElements=namespaceHTMLElements)
except constants.DataLossWarning:
return
try:
if innerHTML:
document = p.parseFragment(input, innerHTML)
else:
try:
document = p.parse(input)
except constants.DataLossWarning:
return
except:
errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
u"\nTraceback:", traceback.format_exc()])
assert False, errorMsg.encode("utf8")
output = convertTreeDump(p.tree.testSerializer(document))
expected = convertExpected(expected)
if namespaceHTMLElements:
expected = namespaceExpected(r"\1<html \2>", expected)
errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
u"\nReceived:", output])
assert expected == output, errorMsg.encode("utf8")
errStr = [u"Line: %i Col: %i %s"%(line, col,
constants.E[errorcode] % datavars if isinstance(datavars, dict) else (datavars,)) for
((line,col), errorcode, datavars) in p.errors]
errorMsg2 = u"\n".join([u"\n\nInput:", input,
u"\nExpected errors (" + str(len(errors)) + u"):\n" + u"\n".join(errors),
u"\nActual errors (" + str(len(p.errors)) + u"):\n" + u"\n".join(errStr)])
if checkParseErrors:
assert len(p.errors) == len(errors), errorMsg2.encode("utf-8")
def test_parser():
sys.stderr.write('Testing tree builders '+ " ".join(treeTypes.keys()) + "\n")
files = data_files('tree-construction')
for filename in files:
testName = os.path.basename(filename).replace(".dat","")
tests = TestData(filename, "data")
for index, test in enumerate(tests):
input, errors, innerHTML, expected = [test[key] for key in
'data', 'errors',
'document-fragment',
'document']
if errors:
errors = errors.split("\n")
for treeName, treeCls in treeTypes.iteritems():
for namespaceHTMLElements in (True, False):
print input
yield (runParserTest, innerHTML, input, expected, errors, treeCls,
namespaceHTMLElements)
break