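# NOTE(review): stray "$search" token - looks like a leftover template placeholder from the page this listing was extracted from, not code; confirm and remove.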
"""Whitelist-based HTML sanitiser.

``XssCleaner.strip`` re-emits only the tags, attributes and URL schemes that
are explicitly permitted; everything else is HTML-escaped or dropped.
"""

try:
    # Python 2: the SGML-based parser stack this class was written against.
    from htmllib import HTMLParser
    from cgi import escape
    from urlparse import urlparse
    from formatter import AbstractFormatter
    from htmlentitydefs import entitydefs
    _SGML_PARSER = True
except ImportError:
    # Python 3: htmllib/formatter are gone and the helpers have moved.
    from html import escape
    from html.entities import entitydefs
    from html.parser import HTMLParser
    from urllib.parse import urlparse
    AbstractFormatter = None  # kept so the name stays importable
    _SGML_PARSER = False
from xml.sax.saxutils import quoteattr


def xssescape(text):
    """Gets rid of < and > and & and, for good measure, :"""
    # The colon is replaced by its numeric character reference so that
    # escaped text can never be read back as a URL scheme separator.
    return escape(text, quote=True).replace(':', '&#58;')


class XssCleaner(HTMLParser):
    """HTML parser that rebuilds its input keeping only whitelisted markup.

    The sanitised text accumulates in ``self.result``.  The whitelists
    (``permitted_tags``, ``allowed_attributes``, ``allowed_schemes``) are
    per-instance attributes and may be adjusted after construction.
    """

    def __init__(self, fmt=AbstractFormatter):
        """Set up the parser and the default whitelists.

        ``fmt`` is handed to the htmllib parser when that parser is in use;
        it is ignored when running on top of ``html.parser``.
        """
        if _SGML_PARSER:
            HTMLParser.__init__(self, fmt)
        else:
            # Keep references flowing through handle_charref /
            # handle_entityref instead of being decoded into handle_data.
            HTMLParser.__init__(self, convert_charrefs=False)
        self.result = ""
        self.open_tags = []
        # A list of the only tags allowed.  Be careful adding to this.  Adding
        # "script," for example, would not be smart.  'img' is out by default
        # because of the danger of IMG embedded commands, and/or web bugs.
        self.permitted_tags = ['a', 'b', 'blockquote', 'br', 'i',
                               'li', 'ol', 'ul', 'p', 'cite']

        # A list of tags that require no closing tag.
        self.requires_no_close = ['img', 'br']

        # A dictionary showing the only attributes allowed for particular
        # tags.  If a tag is not listed here, it is allowed no attributes.
        # Adding "on" tags, like "onhover," would not be smart.  Also be very
        # careful of "background" and "style."
        self.allowed_attributes = {
            'a': ['href', 'title'],
            'img': ['src', 'alt'],
            'blockquote': ['type'],
        }

        # The only schemes allowed in URLs (for href and src attributes).
        # Adding "javascript" or "vbscript" to this list would not be smart.
        self.allowed_schemes = ['http', 'https', 'ftp']

    def handle_data(self, data):
        """Append plain text to the result, HTML-escaped."""
        if data:
            self.result += xssescape(data)

    def handle_charref(self, ref):
        """Pass through short decimal character references; escape the rest."""
        if len(ref) < 7 and ref.isdigit():
            self.result += '&#%s;' % ref
        else:
            self.result += xssescape('&#%s' % ref)

    def handle_entityref(self, ref):
        """Pass through entity names found in ``entitydefs``; escape others."""
        if ref in entitydefs:
            self.result += '&%s;' % ref
        else:
            self.result += xssescape('&%s' % ref)

    def handle_comment(self, comment):
        """Emit comments as escaped, visible text rather than as markup."""
        if comment:
            self.result += xssescape("<!--%s-->" % comment)

    def handle_starttag(self, tag, method, attrs=None):
        """Emit a start tag if it is whitelisted, otherwise escape it.

        Accepts both calling conventions: ``(tag, method, attrs)`` as used by
        the htmllib parser and by ``unknown_starttag`` below, and
        ``(tag, attrs)`` as used by ``html.parser``.
        """
        if attrs is None:
            # Two-argument form: the second positional is the attribute list.
            attrs = method
        if tag not in self.permitted_tags:
            self.result += xssescape("<%s>" % tag)
            return

        bt = "<" + tag
        if tag in self.allowed_attributes:
            given = dict(attrs or ())
            for attribute in self.allowed_attributes[tag]:
                value = given.get(attribute)
                if not value:
                    # Absent, empty, or value-less attribute.
                    continue
                if attribute in ('href', 'src', 'background'):
                    if not self.url_is_acceptable(value):
                        continue
                # quoteattr picks a safe quote character and escapes the
                # value, so it cannot terminate the attribute early and
                # smuggle in further attributes.
                bt += ' %s=%s' % (xssescape(attribute), quoteattr(value))
        if bt == "<a" or bt == "<img":
            # These tags are pointless without an accepted attribute.
            return
        if tag in self.requires_no_close:
            # Void tags are self-closed and never tracked as open, so a
            # stray end tag for them is not echoed later.
            self.result += bt + "/>"
            return
        self.result += bt + ">"
        self.open_tags.insert(0, tag)

    def handle_endtag(self, tag, attrs=None):
        """Emit an end tag only when it closes a tag emitted earlier.

        The second argument is unused; it exists because the htmllib parser
        passes one, while ``html.parser`` does not.
        """
        bracketed = "</%s>" % tag
        if tag not in self.permitted_tags:
            self.result += xssescape(bracketed)
        elif tag in self.open_tags:
            self.result += bracketed
            self.open_tags.remove(tag)

    def unknown_starttag(self, tag, attributes):
        """Route tags the base parser has no handler for through the filter."""
        self.handle_starttag(tag, None, attributes)

    def unknown_endtag(self, tag):
        """Route end tags the base parser has no handler for through the filter."""
        self.handle_endtag(tag, None)

    def url_is_acceptable(self, url):
        """Return True for absolute URLs with an allowed scheme.

        Requires all URLs to be "absolute": the scheme must be in
        ``allowed_schemes`` and the network location must contain a dot.
        """
        parsed = urlparse(url)
        return parsed[0] in self.allowed_schemes and '.' in parsed[1]

    def strip(self, rawstring):
        """Returns the argument stripped of potentially harmful HTML or Javascript code"""
        # Start from a clean slate so an instance can be reused: drop any
        # buffered partial markup and any tags left open by a previous call.
        self.reset()
        self.result = ""
        self.open_tags = []
        self.feed(rawstring)
        # Flush whatever the parser is still buffering (it is delivered
        # through the handlers above, i.e. escaped).
        self.close()
        # open_tags is most-recent-first, which is the right closing order.
        for endtag in self.open_tags:
            if endtag not in self.requires_no_close:
                self.result += "</%s>" % endtag
        self.open_tags = []
        return self.result

    def xtags(self):
        """Returns a printable string informing the user which tags are allowed"""
        # sorted() rather than list.sort(): describing the whitelist must not
        # reorder the caller's configuration.
        described = []
        for tag in sorted(self.permitted_tags):
            attributes = "".join(
                ' %s=""' % name
                for name in self.allowed_attributes.get(tag, ()))
            described.append("<%s%s>" % (tag, attributes))
        return xssescape(" ".join(described))