$search
# Aseba - an event-based framework for distributed robot control
# Copyright (C) 2007--2011:
#         Stephane Magnenat <stephane at magnenat dot net>
#         (http://stephane.magnenat.net)
#         and other contributors, see authors.txt for details
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3

# String types whose values can be emitted without going through str().
try:
    _STRING_TYPES = (basestring,)  # Python 2: covers both str and unicode
except NameError:
    _STRING_TYPES = (str,)


class MyParser(HTMLParser):
    """Custom HTML parser, derived from the HTMLParser library.

    The handler methods are overridden to re-emit the HTML document as
    it is parsed, so that subclasses only need to override the handlers
    for the elements they want to transform.

    The accumulated document is retrieved with the get_doc() function.
    """

    def __init__(self):
        """Initialize the parser."""
        # The base initializer calls self.reset(), which also creates
        # self.out_doc, so no explicit reset() is needed here.
        HTMLParser.__init__(self)
        # Python 3 decodes character/entity references into handle_data()
        # by default; disable that so handle_charref()/handle_entityref()
        # are called and references are written back verbatim.
        # (Plain, unused attribute on Python 2.)
        self.convert_charrefs = False

    # Public interface
    def reset(self):
        """Reset the parser's state and discard the accumulated document."""
        self.out_doc = ""
        HTMLParser.reset(self)

    def get_doc(self):
        """Return the document accumulated so far, as a string."""
        return self.out_doc

    # Private functions
    def format_start_tag(self, tag, attrs):
        """Private - Format a <tag attributes>.

        tag   -- name of the tag
        attrs -- sequence of (name, value) pairs; a value of None denotes
                 an attribute without a value (e.g. <input disabled>)

        Returns the start tag as a string, without any trailing space
        before the closing '>'.
        """
        parts = [tag]
        for name, value in attrs:
            if value is None:
                # Valueless attribute: emit the bare name
                parts.append(name)
                continue
            if not isinstance(value, _STRING_TYPES):
                value = str(value)
            # The parser hands over attribute values with entities already
            # decoded; re-escape what would otherwise break the quoting.
            value = value.replace("&", "&amp;").replace("\"", "&quot;")
            parts.append(name + "=\"" + value + "\"")
        return "<" + " ".join(parts) + ">"

    def format_end_tag(self, tag):
        """Private - Format an </tag>."""
        return "</{}>".format(tag)

    # Inherited functions
    def handle_starttag(self, tag, attrs):
        """Overridden - Called when a start tag is parsed."""
        self.out_doc += self.format_start_tag(tag, attrs)

    def handle_endtag(self, tag):
        """Overridden - Called when an end tag is parsed."""
        self.out_doc += self.format_end_tag(tag)

    def handle_data(self, data):
        """Overridden - Called when some data is encountered."""
        self.out_doc += data

    def handle_charref(self, name):
        """Overridden - Called when a charref (&#xyz) is parsed."""
        self.out_doc += ("&#" + name + ";")

    def handle_entityref(self, name):
        """Overridden - Called when an entityref (&xyz) is parsed."""
        self.out_doc += ("&" + name + ";")

    def handle_decl(self, decl):
        """Overridden - Called when a SGML declaration (<!) is parsed."""
        self.out_doc += ("<!" + decl + ">")