00001 from htmllib import HTMLParser
00002 from cgi import escape
00003 from urlparse import urlparse
00004 from formatter import AbstractFormatter
00005 from htmlentitydefs import entitydefs
00006 from xml.sax.saxutils import quoteattr
00007
def xssescape(text):
    """Escape text so it can be embedded in HTML without being interpreted.

    ``<``, ``>``, ``&`` and ``"`` are converted to their HTML entities by
    ``escape(..., quote=True)``.  For good measure, every colon is then
    replaced by its numeric character reference so that leftover text can
    never be read as a URL scheme (e.g. ``javascript:``).

    The colon replacement runs *after* ``escape`` so the ampersand of the
    inserted ``&#58;`` is not itself escaped.
    """
    return escape(text, quote=True).replace(':', '&#58;')
00011
class XssCleaner(HTMLParser):
    """Whitelist-based HTML sanitizer.

    Markup is fed through the parser; tags listed in ``permitted_tags`` are
    re-emitted (with only the attributes listed in ``allowed_attributes``),
    and everything else -- text, unknown tags, comments, unknown entities --
    is appended to ``result`` in escaped form via ``xssescape``.

    Typical use::

        cleaned = XssCleaner().strip(untrusted_html)
    """

    def __init__(self, fmt=AbstractFormatter):
        """Set up the parser and the default whitelist configuration.

        fmt -- formatter object handed straight to ``HTMLParser.__init__``.
        """
        HTMLParser.__init__(self, fmt)
        # Sanitized output accumulated by the handler methods.
        self.result = ""
        # Permitted tags that were emitted and not yet closed, newest first.
        self.open_tags = []

        # Tags that are passed through instead of being escaped.
        self.permitted_tags = ['a', 'b', 'blockquote', 'br', 'i',
                               'li', 'ol', 'ul', 'p', 'cite']

        # Tags emitted in self-closing form; strip() never auto-closes them.
        # NOTE: 'img' appears here and in allowed_attributes but is not in
        # permitted_tags, so images are escaped unless a caller adds 'img'.
        self.requires_no_close = ['img', 'br']

        # Per-tag whitelist of attributes; any other attribute is dropped.
        self.allowed_attributes = {
            'a': ['href', 'title'],
            'img': ['src', 'alt'],
            'blockquote': ['type'],
        }

        # URL schemes accepted for href/src/background attribute values.
        self.allowed_schemes = ['http', 'https', 'ftp']

    def handle_data(self, data):
        """Append plain text to the result, escaped."""
        if data:
            self.result += xssescape(data)

    def handle_charref(self, ref):
        """Pass short, purely numeric character references through.

        Anything else is emitted as escaped literal text.
        """
        if len(ref) < 7 and ref.isdigit():
            self.result += '&#%s;' % ref
        else:
            self.result += xssescape('&#%s' % ref)

    def handle_entityref(self, ref):
        """Pass known named entities through; escape unknown ones as text."""
        if ref in entitydefs:
            self.result += '&%s;' % ref
        else:
            self.result += xssescape('&%s' % ref)

    def handle_comment(self, comment):
        """Emit a non-empty comment as escaped (visible, inert) text."""
        if comment:
            self.result += xssescape("<!--%s-->" % comment)

    def handle_starttag(self, tag, method, attrs):
        """Emit a start tag: whitelisted tags are rebuilt, others escaped.

        tag    -- tag name as reported by the parser.
        method -- handler the parser resolved for the tag (unused here;
                  ``None`` when called from ``unknown_starttag``).
        attrs  -- sequence of ``(name, value)`` attribute pairs.
        """
        if tag not in self.permitted_tags:
            self.result += xssescape("<%s>" % tag)
            return

        bt = "<" + tag
        if tag in self.allowed_attributes:
            attrs = dict(attrs)
            # Kept as an instance attribute for backward compatibility with
            # any code that inspects it after parsing.
            self.allowed_attributes_here = [
                x for x in self.allowed_attributes[tag]
                if x in attrs and len(attrs[x]) > 0]
            for attribute in self.allowed_attributes_here:
                value = attrs[attribute]
                if attribute in ['href', 'src', 'background']:
                    # URL attributes are silently dropped unless the URL
                    # passes the scheme/host check.
                    if self.url_is_acceptable(value):
                        # quoteattr() escapes and quotes the value so that
                        # an embedded '"' cannot terminate the attribute and
                        # smuggle in extra attributes such as event handlers.
                        bt += ' %s=%s' % (attribute, quoteattr(value))
                else:
                    bt += ' %s=%s' % (xssescape(attribute), quoteattr(value))

        # An <a> or <img> that ended up with no attributes at all is useless;
        # drop it (its end tag is dropped too, since it is never recorded
        # in open_tags).
        if bt == "<a" or bt == "<img":
            return
        if tag in self.requires_no_close:
            bt += "/"
        bt += ">"
        self.result += bt
        self.open_tags.insert(0, tag)

    def handle_endtag(self, tag, attrs):
        """Emit an end tag for a permitted tag that is currently open.

        tag   -- tag name as reported by the parser.
        attrs -- second positional argument supplied by the parser (unused;
                 ``None`` when called from ``unknown_endtag``).

        End tags of non-permitted tags are escaped; end tags of permitted
        tags that were never opened are dropped.
        """
        bracketed = "</%s>" % tag
        if tag not in self.permitted_tags:
            self.result += xssescape(bracketed)
        elif tag in self.open_tags:
            self.result += bracketed
            self.open_tags.remove(tag)

    def unknown_starttag(self, tag, attributes):
        """Route tags the base parser does not know to handle_starttag."""
        self.handle_starttag(tag, None, attributes)

    def unknown_endtag(self, tag):
        """Route end tags the base parser does not know to handle_endtag."""
        self.handle_endtag(tag, None)

    def url_is_acceptable(self, url):
        """Return True if *url* has an allowed scheme and a dotted host part."""
        parsed = urlparse(url)
        return parsed[0] in self.allowed_schemes and '.' in parsed[1]

    def strip(self, rawstring):
        """Returns the argument stripped of potentially harmful HTML or Javascript code

        The cleaner may be reused: parser state, ``result`` and ``open_tags``
        are reset on every call so output from a previous call cannot leak
        into this one.
        """
        # Discard any input still buffered from an earlier call.
        self.reset()
        self.result = ""
        self.open_tags = []
        self.feed(rawstring)
        # Force the parser to process whatever it is still buffering (e.g. a
        # trailing, unfinished tag) so that it is emitted as escaped text
        # instead of being silently lost.
        self.close()
        # Close every permitted tag the input left open.
        for endtag in self.open_tags:
            if endtag not in self.requires_no_close:
                self.result += "</%s>" % endtag
        self.open_tags = []
        return self.result

    def xtags(self):
        """Returns a printable string informing the user which tags are allowed"""
        tg = ""
        # sorted() rather than list.sort(): describing the configuration
        # should not reorder it as a side effect.
        for x in sorted(self.permitted_tags):
            tg += "<" + x
            if x in self.allowed_attributes:
                for y in self.allowed_attributes[x]:
                    tg += ' %s=""' % y
            tg += "> "
        return xssescape(tg.strip())