myparser.py
Go to the documentation of this file.
00001 #   Aseba - an event-based framework for distributed robot control
00002 #   Copyright (C) 2007--2011:
00003 #           Stephane Magnenat <stephane at magnenat dot net>
00004 #           (http://stephane.magnenat.net)
00005 #           and other contributors, see authors.txt for details
00006 #
00007 #   This program is free software: you can redistribute it and/or modify
00008 #   it under the terms of the GNU Lesser General Public License as published
00009 #   by the Free Software Foundation, version 3 of the License.
00010 #
00011 #   This program is distributed in the hope that it will be useful,
00012 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 #   GNU Lesser General Public License for more details.
00015 #
00016 #   You should have received a copy of the GNU Lesser General Public License
00017 #   along with this program. If not, see <http://www.gnu.org/licenses/>.
00018 
00019 from HTMLParser import HTMLParser
00020 
class MyParser(HTMLParser):
    """Custom HTML parser, derived from the HTMLParser library class.

    The handler methods are overridden so that every parsed construct is
    written back, as HTML text, into ``out_doc``. The accumulated document
    is retrieved with get_doc().

    Handlers are provided for start tags, end tags, character data,
    character/entity references and SGML declarations. Any other construct
    (e.g. comments, processing instructions) falls back to the base-class
    handler.
    """

    # String types that can be concatenated into the output as they are.
    # Python 2 has both ``str`` and ``unicode`` (common base: basestring);
    # Python 3 only has ``str``.
    try:
        _TEXT_TYPES = (basestring,)
    except NameError:
        _TEXT_TYPES = (str,)

    def __init__(self):
        """Initialize the parser with an empty output document."""
        HTMLParser.__init__(self)
        self.reset()

    # Public interface
    def reset(self):
        """Reset the parser's state and discard the accumulated document."""
        # out_doc is set first so that it exists by the time the base-class
        # reset runs.
        self.out_doc = ""
        HTMLParser.reset(self)

    def get_doc(self):
        """Return the document accumulated so far, as a string."""
        return self.out_doc

    # Private functions
    def format_start_tag(self, tag, attrs):
        """Private - Format a <tag attributes>.

        tag   -- name of the tag
        attrs -- sequence of (name, value) pairs; a value of None denotes
                 an attribute written without a value (e.g. ``disabled``)

        Returns the start tag as a string, without stray whitespace.
        """
        pieces = [tag]
        for attr in attrs:
            name, value = attr[0], attr[1]
            if value is None:
                # Valueless attribute: emit the bare name rather than the
                # text 'None'.
                pieces.append(name)
                continue
            if not isinstance(value, self._TEXT_TYPES):
                value = str(value)
            # The parser hands attribute values over with entities already
            # decoded, so '&' and '"' must be re-encoded to keep the
            # double-quoted attribute well formed.
            value = value.replace("&", "&amp;").replace("\"", "&quot;")
            pieces.append(name + "=\"" + value + "\"")
        return "<" + " ".join(pieces) + ">"

    def format_end_tag(self, tag):
        """Private - Format an </tag>."""
        return "</{}>".format(tag)

    # Inherited functions
    def handle_starttag(self, tag, attrs):
        """Overridden - Called when a start tag is parsed."""
        self.out_doc += self.format_start_tag(tag, attrs)

    def handle_endtag(self, tag):
        """Overridden - Called when an end tag is parsed."""
        self.out_doc += self.format_end_tag(tag)

    def handle_data(self, data):
        """Overridden - Called when some data is encountered."""
        self.out_doc += data

    def handle_charref(self, name):
        """Overridden - Called when a charref (&#xyz;) is parsed."""
        self.out_doc += ("&#" + name + ";")

    def handle_entityref(self, name):
        """Overridden - Called when an entityref (&xyz;) is parsed."""
        self.out_doc += ("&" + name + ";")

    def handle_decl(self, decl):
        """Overridden - Called when a SGML declaration (<!...>) is parsed."""
        self.out_doc += ("<!" + decl + ">")


aseba
Author(s): Stéphane Magnenat
autogenerated on Sun Oct 5 2014 23:46:38