blob: 5c8f2e4c9d17ab6d5bfd8352b245b1d1bbbbf3b0 [file] [log] [blame]
"""Copyright (c) 2002-2003, Benjamin Saller <bcsaller@ideasuite.com>, and
the respective authors.
Elements Copyright (c) 2001 Zope Foundation and Contributors.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Archetypes nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."""
from sgmllib import SGMLParser, SGMLParseError
import re
from cgi import escape
# tag mapping: tag -> short or long tag
# A true value (1) makes the parser write "<tag ...>" and pass the matching
# end tag through; a false value (0) makes it write a self-closed
# "<tag ... />" and drop any end tag.
VALID_TAGS = {
    'code': 1, 'meter': 1, 'tbody': 1, 'img': 0, 'title': 1, 'tt': 1,
    'tr': 1, 'li': 1, 'source': 1, 'tfoot': 1, 'th': 1, 'td': 1,
    'dl': 1, 'blockquote': 1, 'big': 1, 'dd': 1, 'kbd': 1, 'dt': 1,
    'p': 1, 'small': 1, 'output': 1, 'div': 1, 'em': 1, 'datalist': 1,
    'hgroup': 1, 'meta': 0, 'video': 1, 'rt': 1, 'canvas': 1, 'rp': 1,
    'sub': 1, 'section': 1, 'sup': 1, 'progress': 1, 'acronym': 1, 'base': 0,
    'br': 0, 'address': 1, 'article': 1, 'strong': 1, 'ol': 1, 'caption': 1,
    'dialog': 1, 'col': 1, 'h2': 1, 'h3': 1, 'h1': 1, 'h6': 1,
    'h4': 1, 'h5': 1, 'header': 1, 'table': 1, 'span': 1, 'area': 0,
    'mark': 1, 'dfn': 1, 'var': 1, 'cite': 1, 'thead': 1, 'head': 1,
    'hr': 0, 'ruby': 1, 'b': 1, 'colgroup': 1, 'keygen': 1, 'ul': 1,
    'del': 1, 'pre': 1, 'figure': 1, 'ins': 1, 'aside': 1, 'nav': 1,
    'details': 1, 'u': 1, 'samp': 1, 'map': 1, 'a': 1, 'footer': 1,
    'i': 1, 'q': 1, 'command': 1, 'time': 1, 'audio': 1, 'bdo': 1,
    'abbr': 1,
}

# Tags that switch the parser into "suppress" mode (or trigger IllegalHTML
# when errors are requested) unless the same tag is also listed as valid.
NASTY_TAGS = {
    'style': 1,
    'script': 1,
    'object': 1,
    'applet': 1,
    'meta': 1,
    'embed': 1,
    'head': 1,
}
# Template for a "system message" block.  The first %s is the message title,
# the second %s is the message body placed just before the closing </div>.
msg_pat = """
<div class="system-message">
<p class="system-message-title">System message: %s</p>
%s</div>
"""
def hasScript(s):
    """Tell whether an HTML attribute value seems to carry script code.

    Numeric character references are decoded first, then every whitespace
    character is removed and the text is lower-cased, so that simple
    obfuscation does not hide the markers searched for.

    >>> hasScript('script:evil(1);')
    True
    >>> hasScript('expression:evil(1);')
    True
    >>> hasScript('http://foo.com/ExpressionOfInterest.doc')
    False
    """
    collapsed = ''.join(decode_htmlentities(s).split()).lower()
    markers = ('script:', 'expression:', 'expression(')
    return any(marker in collapsed for marker in markers)
# "&#" optionally followed by a run of word characters (with an optional
# leading "x") and an optional ";".  Compiled once at import time; written as
# a raw string so that "\w" reaches the regex engine untouched.
_ENTITY_PATTERN = re.compile(r"&#(?P<htmlentity>x?\w+)?;?")


def decode_htmlentities(s):
    """Replace numeric character references in ``s`` with their characters.

    XSS code can be hidden with htmlentities, so attribute values are run
    through this before they are searched for script markers.  Each match
    is handed to ``decode_htmlentity`` and replaced by what it returns.
    """
    return _ENTITY_PATTERN.sub(decode_htmlentity, s)
def decode_htmlentity(m):
    """Return the replacement text for one numeric character reference.

    ``m`` is a regex match object exposing a named group ``htmlentity``
    holding the text between "&#" and the optional ";".

    * A value starting with "x"/"X" is read as hexadecimal, anything else
      as decimal, and the matching character is returned.
    * If the value is not a valid number, or the number cannot be turned
      into a character, the raw value is returned (without "&#" and ";").
    * If the group did not take part in the match (a bare "&#" or "&#;"),
      the matched text is returned unchanged.
    """
    entity_value = m.groupdict()['htmlentity']
    if entity_value is None:
        # The group is optional in the pattern; nothing to decode here.
        return m.group(0)
    if entity_value.lower().startswith('x'):
        # Prefix with "0" so that int() sees a regular "0x.." literal.
        digits, base = '0' + entity_value, 16
    else:
        digits, base = entity_value, 10
    try:
        return chr(int(digits, base))
    except (ValueError, OverflowError):
        # ValueError: not a number or outside the range chr() accepts.
        # OverflowError: number too large to be converted by chr() at all.
        return entity_value
class IllegalHTML(Exception):
    """Signals markup that the stripping parser refuses to pass through."""
class StrippingParser(SGMLParser):
    """Pass only allowed tags; raise exception for known-bad.

    Copied from Products.CMFDefault.utils
    Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.

    Sanitized output fragments are collected in ``self.result``; call
    ``getResult()`` after ``feed()`` / ``close()`` to obtain the final text.
    While ``self.suppress`` is true (set by a "nasty" start tag, cleared by
    the matching end tag) all content is dropped.
    """

    from htmlentitydefs import entitydefs  # replace entitydefs from sgmllib

    def __init__(self, valid, nasty, remove_javascript, raise_error):
        """Set up the parser.

        valid -- mapping of allowed tag name -> true for tags written as
                 "<tag>" with an end tag, false for self-closed "<tag />".
        nasty -- mapping whose keys are tags that start suppression.
        remove_javascript -- when true, "on*" attributes and values that
                 ``hasScript`` flags are removed (or rejected).
        raise_error -- when true, raise ``IllegalHTML`` instead of silently
                 removing offending markup.
        """
        SGMLParser.__init__(self)
        self.result = []
        self.valid = valid
        self.nasty = nasty
        self.remove_javascript = remove_javascript
        self.raise_error = raise_error
        self.suppress = False

    def handle_data(self, data):
        """Append HTML-escaped character data unless suppressed."""
        if self.suppress:
            return
        if data:
            self.result.append(escape(data))

    def handle_charref(self, name):
        """Re-emit a numeric character reference unless suppressed."""
        if self.suppress:
            return
        self.result.append('&#%s;' % name)

    def handle_comment(self, comment):
        """Drop comments."""
        pass

    def handle_decl(self, data):
        """Drop declarations."""
        pass

    def handle_entityref(self, name):
        """Re-emit a named entity reference unless suppressed.

        The trailing ";" is only written for names found in ``entitydefs``.
        """
        if self.suppress:
            return
        if name in self.entitydefs:
            x = ';'
        else:
            # this breaks non-standard entities that end with ';'
            x = ''
        self.result.append('&%s%s' % (name, x))

    def _quote_attr(self, value):
        """Return ``value`` made safe for use inside a double-quoted attribute.

        The value is written between double quotes, so a literal '"' would
        end the attribute early and let the remainder be read as further
        attributes.  "<" and ">" are escaped as well.  "&" is deliberately
        left alone so that references still present in the value are not
        escaped a second time.
        """
        return (value.replace('<', '&lt;')
                     .replace('>', '&gt;')
                     .replace('"', '&quot;'))

    def unknown_starttag(self, tag, attrs):
        """Delete all tags except for legal ones.

        Valid tags are written out with their attributes; script-carrying
        attributes are skipped, or cause ``IllegalHTML`` when
        ``raise_error`` is set.  Nasty tags switch on suppression, or raise
        ``IllegalHTML`` when ``raise_error`` is set.  Any other tag is
        omitted while its content is still processed.
        """
        if self.suppress:
            return

        if tag in self.valid:
            self.result.append('<' + tag)
            remove_script = getattr(self, 'remove_javascript', True)

            for k, v in attrs:
                if remove_script and k.strip().lower().startswith('on'):
                    if self.raise_error:
                        raise IllegalHTML(
                            'Script event "%s" not allowed.' % k)
                    continue
                if remove_script and hasScript(v):
                    if self.raise_error:
                        raise IllegalHTML(
                            'Script URI "%s" not allowed.' % v)
                    continue
                self.result.append(' %s="%s"' % (k, self._quote_attr(v)))

            if self.valid.get(tag):
                self.result.append('>')
            else:
                self.result.append(' />')

        elif tag in self.nasty:
            self.suppress = True
            if self.raise_error:
                raise IllegalHTML('Dynamic tag "%s" not allowed.' % tag)
        # any other tag: omit it

    def unknown_endtag(self, tag):
        """Write the end tag of a valid long tag; end suppression if needed."""
        if tag in self.nasty and tag not in self.valid:
            self.suppress = False
        if self.suppress:
            return
        if self.valid.get(tag):
            self.result.append('</%s>' % tag)

    def parse_declaration(self, i):
        """Fix handling of CDATA sections. Code borrowed from BeautifulSoup.

        ``i`` is the offset in ``self.rawdata`` of the "<!" being parsed; the
        return value is the offset at which parsing continues.

        The content of a CDATA section is literal text, so it is written as
        escaped character data.  When the base parser rejects a declaration
        the remaining input is consumed and written as escaped text as well.
        Copying either of them to the output verbatim would let arbitrary
        markup bypass the tag and attribute filtering of this class.
        """
        rawdata = self.rawdata
        if rawdata[i:i + 9] == '<![CDATA[':
            end = rawdata.find(']]>', i)
            if end == -1:
                # Unterminated section: take everything up to the end.
                end = len(rawdata)
            if not self.suppress:
                self.result.append(escape(rawdata[i + 9:end]))
            return end + 3

        try:
            return SGMLParser.parse_declaration(self, i)
        except SGMLParseError:
            remainder = rawdata[i:]
            if not self.suppress:
                self.result.append(escape(remainder))
            return i + len(remainder)

    def getResult(self):
        """Return everything emitted so far as one string."""
        return ''.join(self.result)
def scrubHTML(html, valid=VALID_TAGS, nasty=NASTY_TAGS,
              remove_javascript=True, raise_error=True):
    """Strip illegal HTML tags from string text.

    ``html`` is fed through a ``StrippingParser`` configured with the given
    ``valid`` and ``nasty`` tag mappings and the ``remove_javascript`` /
    ``raise_error`` switches; the text the parser collected is returned.
    """
    stripper = StrippingParser(valid, nasty, remove_javascript, raise_error)
    stripper.feed(html)
    stripper.close()
    return stripper.getResult()