# blob: f2c9610c1188af972a9910977f3ffbdc3d02cbaa [file] [log] [blame]
# Copyright 2015 the Melange authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests HTML sanitizer."""
import unittest
from html5lib import constants
from html5lib import html5parser
from html5lib import sanitizer
from melange.utils import htmlsanitizer
# Templates for single-element fragments. The 'element' key is replaced
# with the tag name under test.
VOID_ELEMENT_FRAGMENT = '<%(element)s/>'
CLOSED_ELEMENT_FRAGMENT = (
    '<%(element)s>'
    'Some inner HTML'
    '</%(element)s>')
# Same markup as CLOSED_ELEMENT_FRAGMENT, but with the angle brackets of both
# tags written as HTML entities.
ESCAPED_ELEMENT_FRAGMENT = (
    '&lt;%(element)s&gt;'
    'Some inner HTML'
    '&lt;/%(element)s&gt;')

# Paragraph carrying one attribute whose name is substituted for %s, and the
# same paragraph with no attributes at all.
ATTRIBUTE_FRAGMENT = '<p %s="foo">' 'Some inner HTML</p>'
REMOVED_ATTRIBUTE_FRAGMENT = '<p>' 'Some inner HTML</p>'

# Paragraph whose style attribute value is substituted for %s, and the same
# paragraph with an empty style attribute.
STYLE_FRAGMENT = '<p style="%s">' 'Some inner HTML</p>'
REMOVED_STYLE_FRAGMENT = '<p style="">' 'Some inner HTML</p>'
# Acceptable elements that are only meaningful when nested inside other
# elements. The generic single-element fragments cannot be used for them,
# because the HTML parser recognizes that they make no sense on their own.
TABLE_ELEMENTS = [
    'caption',
    'col',
    'colgroup',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
]

# Complete table markup used to test table related elements in context.
TABLE_ELEMENT_FRAGMENT = """
<table>
<thead>
<tr>
<th>Header 1</th>
<th>Header 2</th>
</tr>
</thead>
<tbody>
<tr>
<td>Value 1</td>
<td>Value 2</td>
</tr>
<tr>
<td>Value 3</td>
<td>Value 4</td>
</tr>
</tbody>
</table>
"""
# List of selected elements which are forbidden.
SELECTED_FORBIDDEN_ELEMENTS = ['form']

# List of global event attributes. They should never go through, as they
# accept JavaScript code for their value. Every name is listed exactly once
# so that each attribute is checked a single time.
GLOBAL_EVENT_ATTRIBUTES = [
    'onafterprint', 'onbeforeprint', 'onbeforeunload', 'onerror',
    'onhashchange', 'onload', 'onmessage', 'onoffline', 'ononline',
    'onpagehide', 'onpageshow', 'onpopstate', 'onresize', 'onstorage',
    'onunload', 'onblur', 'onchange', 'oncontextmenu', 'onfocus', 'oninput',
    'oninvalid', 'onreset', 'onsearch', 'onselect', 'onsubmit', 'onkeydown',
    'onkeypress', 'onkeyup', 'onclick', 'ondblclick', 'ondrag', 'ondragend',
    'ondragenter', 'ondragleave', 'ondragover', 'ondragstart', 'ondrop',
    'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover', 'onmouseup',
    'onmousewheel', 'onscroll', 'onwheel', 'oncopy', 'oncut', 'onpaste',
    'onshow', 'ontoggle']

# List of selected attributes which are forbidden. This is the same list
# object as GLOBAL_EVENT_ATTRIBUTES, not a copy.
SELECTED_FORBIDDEN_ATTRIBUTES = GLOBAL_EVENT_ATTRIBUTES
# Sample style attribute values that are expected to be left untouched.
SELECTED_ALLOWED_CSS = [
    'background-image: none;',
    'color: red;',
]

# Sample style attribute values that are expected to be removed; both contain
# a url(...) reference.
SELECTED_FORBIDDEN_CSS = [
    'background-image: url(www.example.com);',
    'background: #ffffff url("img_tree.png") no-repeat right top;',
]
def sanitize_html(fragment):
    """Parses an HTML fragment with the Melange sanitizer and serializes it.

    The fragment is parsed by an html5lib parser that uses
    htmlsanitizer.HTMLSanitizer as its tokenizer. Every top-level node of
    the parsed fragment is then converted back to markup with toxml() and
    the pieces are concatenated in document order.

    Args:
        fragment: A string containing HTML fragment.

    Returns:
        A string containing sanitized HTML fragment.
    """
    parser = html5parser.HTMLParser(tokenizer=htmlsanitizer.HTMLSanitizer)
    parsed_fragment = parser.parseFragment(fragment)
    serialized_nodes = [node.toxml() for node in parsed_fragment.childNodes]
    return ''.join(serialized_nodes)
class HTMLSanitizerTest(unittest.TestCase):
    """Checks how HTMLSanitizer treats elements, attributes and CSS."""

    def setUp(self):
        """See unittest.TestCase.setUp for specification."""

    def testAcceptableElements(self):
        """Tests that acceptable elements pass through unchanged."""
        for element in htmlsanitizer.HTMLSanitizer.acceptable_elements:
            # Table related elements are simply removed by the parser when
            # they appear without context, so they are not tested standalone.
            if element in TABLE_ELEMENTS:
                continue

            # Pick the markup that suits the element; the sanitizer is
            # expected to return it exactly as given.
            if element == 'table':
                fragment = TABLE_ELEMENT_FRAGMENT
            elif element in constants.voidElements:
                fragment = VOID_ELEMENT_FRAGMENT % {'element': element}
            else:
                fragment = CLOSED_ELEMENT_FRAGMENT % {'element': element}

            self.assertEqual(fragment, sanitize_html(fragment))

    def testForbiddenElements(self):
        """Tests that the tags of forbidden elements come back escaped."""
        for element in SELECTED_FORBIDDEN_ELEMENTS:
            substitution = {'element': element}
            expected = ESCAPED_ELEMENT_FRAGMENT % substitution
            actual = sanitize_html(CLOSED_ELEMENT_FRAGMENT % substitution)
            self.assertEqual(expected, actual)

    def testAllowedAttributes(self):
        """Tests that allowed attributes pass through unchanged."""
        for attribute in htmlsanitizer.HTMLSanitizer.acceptable_attributes:
            # CSS is covered by a different test case, hence 'style' is
            # left out here.
            if attribute != 'style':
                fragment = ATTRIBUTE_FRAGMENT % attribute
                self.assertEqual(fragment, sanitize_html(fragment))

    def testForbiddenAttributes(self):
        """Tests that forbidden attributes are stripped from the markup."""
        # Attributes accepted by the base html5lib sanitizer but not by the
        # Melange one.
        base_allowed = set(sanitizer.HTMLSanitizer.acceptable_attributes)
        melange_allowed = set(
            htmlsanitizer.HTMLSanitizer.acceptable_attributes)
        base_only_attributes = base_allowed - melange_allowed

        # First the computed difference, then the explicitly listed
        # attributes; both groups must be removed in the same way.
        attribute_groups = (base_only_attributes, SELECTED_FORBIDDEN_ATTRIBUTES)
        for attribute_group in attribute_groups:
            for attribute in attribute_group:
                sanitized = sanitize_html(ATTRIBUTE_FRAGMENT % attribute)
                self.assertEqual(REMOVED_ATTRIBUTE_FRAGMENT, sanitized)

    def testAllowedCss(self):
        """Tests that allowed CSS fragments pass through unchanged."""
        for css in SELECTED_ALLOWED_CSS:
            fragment = STYLE_FRAGMENT % css
            self.assertEqual(fragment, sanitize_html(fragment))

    def testForbiddenCss(self):
        """Tests that forbidden CSS leaves an empty style attribute."""
        for css in SELECTED_FORBIDDEN_CSS:
            sanitized = sanitize_html(STYLE_FRAGMENT % css)
            self.assertEqual(REMOVED_STYLE_FRAGMENT, sanitized)