# blob: 8b8e397e7272b0ab58f0ad5634f96bc9a3514d65 [file] [log] [blame]
# Copyright 2015 the Melange authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of HTML sanitizer to be used in Melange."""
from html5lib import sanitizer
from html5lib import tokenizer
class MelangeHTMLSanitizerMixin(sanitizer.HTMLSanitizerMixin):
    """Sanitizer mixin carrying Melange's own whitelists.

    All methods are inherited unchanged from html5lib's
    HTMLSanitizerMixin; only the lists of accepted elements, attributes,
    SVG properties and URL protocols are overridden, restricting them to
    what is considered safe yet sufficient for Melange.
    """

    # Accepted HTML element names (one line per leading letter, original
    # ordering preserved).
    acceptable_elements = [
        'a', 'abbr', 'acronym', 'address', 'article', 'aside',
        'b', 'big', 'blockquote', 'br',
        'caption', 'center', 'cite', 'code', 'col', 'colgroup',
        'dd', 'del', 'details', 'dfn', 'div', 'dl', 'dt',
        'em',
        'figcaption', 'figure', 'footer', 'font',
        'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
        'i', 'img', 'ins',
        'kbd',
        'label', 'legend', 'li',
        'nav',
        'ol',
        'p', 'pre',
        'q',
        's', 'samp', 'section', 'small', 'span', 'strike', 'strong',
        'sub', 'sup',
        'table', 'tbody', 'td', 'time', 'tfoot', 'th', 'thead', 'tr', 'tt',
        'u', 'ul',
        'var',
    ]

    # Accepted attribute names.
    acceptable_attributes = [
        'abbr', 'align', 'alt', 'axis',
        'bgcolor', 'border', 'bordercolor', 'bordercolordark',
        'bordercolorlight',
        'cellpadding', 'cellspacing', 'ch', 'char', 'charoff', 'choff',
        'cite', 'class', 'clear', 'color', 'colspan',
        'data', 'datetime', 'dir',
        'for',
        'headers', 'height', 'href',
        'id',
        'lang',
        'name', 'noshade', 'nowrap',
        'open',
        'rel', 'replace', 'rowspan', 'rules',
        'scope', 'span', 'src', 'start', 'style', 'summary',
        'tabindex', 'target', 'title',
        'valign', 'value', 'variable',
        'width', 'wrap',
    ]

    # No SVG properties are accepted.
    acceptable_svg_properties = []

    # URL schemes that are accepted.
    acceptable_protocols = ['http', 'https', 'mailto']

    # The allowed_* names are bound to the very same list objects as the
    # acceptable_* names above (no copies are made).
    # NOTE(review): presumably allowed_* are the names the inherited
    # sanitizing methods actually consult -- confirm against html5lib.
    allowed_elements = acceptable_elements
    allowed_attributes = acceptable_attributes
    allowed_svg_properties = acceptable_svg_properties
    allowed_protocols = acceptable_protocols
class HTMLSanitizer(tokenizer.HTMLTokenizer, MelangeHTMLSanitizerMixin):
    """Tokenizer whose tokens are sanitized by MelangeHTMLSanitizerMixin.

    The implementation is copied from, and identical to,
    html5lib.sanitizer.HTMLSanitizer, except that it is combined with
    Melange's own mixin, which defines all acceptable elements.
    """

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=False, lowercaseAttrName=False,
                 parser=None):
        """See tokenizer.HTMLTokenizer.__init__ for specification."""
        # The case-matching defaults (lowercaseElementName and
        # lowercaseAttrName) are changed here because only lowercase HTML
        # is output anyway. This is admittedly not an ideal solution.
        tokenizer.HTMLTokenizer.__init__(
            self,
            stream,
            encoding,
            parseMeta,
            useChardet,
            lowercaseElementName,
            lowercaseAttrName,
            parser=parser)

    def __iter__(self):
        """See tokenizer.HTMLTokenizer.__iter__ for specification."""
        raw_tokens = tokenizer.HTMLTokenizer.__iter__(self)
        for raw_token in raw_tokens:
            cleaned = self.sanitize_token(raw_token)
            if not cleaned:
                # A falsy result from sanitize_token is skipped, so it
                # never reaches the consumer.
                continue
            yield cleaned