xss.py
Go to the documentation of this file.
00001 from htmllib import HTMLParser
00002 from cgi import escape
00003 from urlparse import urlparse
00004 from formatter import AbstractFormatter
00005 from htmlentitydefs import entitydefs
00006 from xml.sax.saxutils import quoteattr
00007 
def xssescape(text):
    """Return *text* with HTML-significant characters replaced by entities.

    Escapes ``&``, ``<``, ``>`` and ``"`` and, for good measure, ``:``
    (as ``&#58;``) so that escaped text can never be read as a URL scheme
    such as ``javascript:``.

    The replacements are done inline rather than through ``cgi.escape``
    (deprecated since Python 3.2 and removed in 3.8); the output is the same
    as ``cgi.escape(text, quote=True).replace(':', '&#58;')``.
    """
    # '&' must be handled first, otherwise the ampersands of the entities
    # inserted by the later replacements would themselves be escaped.
    replacements = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        (':', '&#58;'),
    )
    for char, entity in replacements:
        text = text.replace(char, entity)
    return text
00011 
class XssCleaner(HTMLParser):
    """Whitelist-based HTML sanitizer.

    Markup is fed through the parser; permitted tags (with permitted
    attributes) are re-emitted into ``self.result``, everything else is
    HTML-escaped with ``xssescape`` so it shows up as inert text.

    Typical use::

        cleaned = XssCleaner().strip(untrusted_html)
    """

    def __init__(self, fmt=AbstractFormatter):
        """Set up the parser and the whitelists.

        ``fmt`` is handed straight to ``HTMLParser.__init__``.  The handlers
        defined on this class build ``self.result`` themselves and never
        call into the formatter.
        """
        HTMLParser.__init__(self, fmt)
        # Sanitized output accumulated by the handle_* callbacks.
        self.result = ""
        # Permitted tags currently open, most recently opened first.
        self.open_tags = []

        # A list of the only tags allowed.  Be careful adding to this.  Adding
        # "script," for example, would not be smart.  'img' is out by default
        # because of the danger of IMG embedded commands, and/or web bugs.
        self.permitted_tags = ['a', 'b', 'blockquote', 'br', 'i',
                               'li', 'ol', 'ul', 'p', 'cite']

        # A list of tags that require no closing tag.
        self.requires_no_close = ['img', 'br']

        # A dictionary showing the only attributes allowed for particular tags.
        # If a tag is not listed here, it is allowed no attributes.  Adding
        # "on" tags, like "onhover," would not be smart.  Also be very careful
        # of "background" and "style."
        self.allowed_attributes = {
            'a': ['href', 'title'],
            'img': ['src', 'alt'],
            'blockquote': ['type'],
        }

        # The only schemes allowed in URLs (for href and src attributes).
        # Adding "javascript" or "vbscript" to this list would not be smart.
        self.allowed_schemes = ['http', 'https', 'ftp']

    def handle_data(self, data):
        """Append character data to the result, fully escaped."""
        if data:
            self.result += xssescape(data)

    def handle_charref(self, ref):
        """Pass short decimal character references through; escape the rest."""
        if len(ref) < 7 and ref.isdigit():
            self.result += '&#%s;' % ref
        else:
            self.result += xssescape('&#%s' % ref)

    def handle_entityref(self, ref):
        """Pass known named entities through; escape unknown ones."""
        if ref in entitydefs:
            self.result += '&%s;' % ref
        else:
            self.result += xssescape('&%s' % ref)

    def handle_comment(self, comment):
        """Emit a non-empty comment as escaped, visible text."""
        if comment:
            self.result += xssescape("<!--%s-->" % comment)

    def handle_starttag(self, tag, method, attrs):
        """Emit a start tag if it is permitted, otherwise escape it.

        ``method`` is ignored (``unknown_starttag`` passes ``None``).
        ``attrs`` is a sequence of ``(name, value)`` pairs; only names listed
        in ``self.allowed_attributes[tag]`` with a non-empty value are kept,
        and URL-valued attributes must also pass ``url_is_acceptable``.
        """
        if tag not in self.permitted_tags:
            # Disallowed tags are shown as text, without their attributes.
            self.result += xssescape("<%s>" % tag)
            return

        rendered = []
        if tag in self.allowed_attributes:
            supplied = dict(attrs)
            for name in self.allowed_attributes[tag]:
                value = supplied.get(name)
                if not value:
                    continue
                if (name in ('href', 'src', 'background')
                        and not self.url_is_acceptable(value)):
                    continue
                # Every value, URLs included, goes through quoteattr() so a
                # quote character inside the value cannot terminate the
                # attribute and smuggle in extra attributes (e.g. onclick).
                rendered.append(
                    ' %s=%s' % (xssescape(name), quoteattr(value)))

        # An <a> or <img> left with no attributes at all is dropped.
        if not rendered and tag in ('a', 'img'):
            return

        if tag in self.requires_no_close:
            # Void element: self-close it and do not track it as open, so a
            # stray end tag for it is not echoed later.
            self.result += "<%s%s/>" % (tag, ''.join(rendered))
        else:
            self.result += "<%s%s>" % (tag, ''.join(rendered))
            self.open_tags.insert(0, tag)

    def handle_endtag(self, tag, attrs):
        """Emit an end tag only if a matching permitted tag is open.

        The second argument is ignored (``unknown_endtag`` passes ``None``).
        End tags of non-permitted tags are escaped; end tags of permitted
        tags that are not currently open are silently dropped.
        """
        bracketed = "</%s>" % tag
        if tag not in self.permitted_tags:
            self.result += xssescape(bracketed)
        elif tag in self.open_tags:
            self.result += bracketed
            self.open_tags.remove(tag)

    def unknown_starttag(self, tag, attributes):
        """Route start tags without a dedicated handler through the filter."""
        self.handle_starttag(tag, None, attributes)

    def unknown_endtag(self, tag):
        """Route end tags without a dedicated handler through the filter."""
        self.handle_endtag(tag, None)

    def url_is_acceptable(self, url):
        """Return True if *url* is absolute with a whitelisted scheme.

        The scheme must be in ``self.allowed_schemes`` and the network
        location must contain a dot.
        """
        ### Requires all URLs to be "absolute."
        parsed = urlparse(url)
        return parsed[0] in self.allowed_schemes and '.' in parsed[1]

    def strip(self, rawstring):
        """Returns the argument stripped of potentially harmful HTML or Javascript code"""
        # Start from a clean slate so the instance can be reused: discard
        # parser state and bookkeeping left over from any previous call.
        self.reset()
        self.result = ""
        self.open_tags = []

        self.feed(rawstring)
        # Flush whatever the parser is still buffering (for instance an
        # unterminated tag at the very end of the input).
        self.close()

        # Close any permitted tags the input left open.
        for endtag in self.open_tags:
            if endtag not in self.requires_no_close:
                self.result += "</%s>" % endtag
        self.open_tags = []
        return self.result

    def xtags(self):
        """Returns a printable string informing the user which tags are allowed"""
        pieces = []
        # sorted() rather than list.sort(): do not reorder the instance's
        # whitelist as a side effect of producing a help string.
        for tag in sorted(self.permitted_tags):
            attr_hint = ''.join(
                ' %s=""' % name
                for name in self.allowed_attributes.get(tag, ()))
            pieces.append("<%s%s>" % (tag, attr_hint))
        return xssescape(" ".join(pieces))


webui
Author(s): Scott Hassan/hassan@willowgarage.com
autogenerated on Wed Apr 23 2014 10:36:00