Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 from HTMLParser import HTMLParser
00020
class MyParser(HTMLParser):
    """HTML parser that re-serializes the document it is fed.

    Each handler appends the markup for the event it receives to the
    ``out_doc`` string, so feeding a document and then calling get_doc()
    gives the document back. Attribute values are always written inside
    double quotes and empty-element tags are written as ``<tag />``.

    Typical use::

        parser = MyParser()
        parser.feed(html_text)
        parser.close()
        html_text_again = parser.get_doc()
    """

    def __init__(self):
        """Initialize the parser with an empty output document."""
        HTMLParser.__init__(self)
        self.reset()
        # Python 3's HTMLParser decodes character references and passes
        # them to handle_data() by default; switch that off so that
        # handle_charref()/handle_entityref() are called and references
        # are written back verbatim. Python 2's parser ignores this
        # attribute.
        self.convert_charrefs = False

    def reset(self):
        """Reset the parser's state and discard the output document."""
        self.out_doc = ""
        HTMLParser.reset(self)

    def get_doc(self):
        """Return the document serialized so far, as a string."""
        return self.out_doc

    def _format_tag_contents(self, tag, attrs):
        """Private - Return the tag name followed by its attributes.

        attrs is a sequence of (name, value) pairs. A value of None
        denotes an attribute written without a value (``<input disabled>``)
        and is emitted as the bare name.
        """
        pieces = [tag]
        for name, value in attrs:
            if value is None:
                pieces.append(name)
                continue
            # type(u"") is `unicode` on Python 2 and `str` on Python 3, so
            # text of either kind is left alone and only genuine
            # non-strings are converted.
            if not isinstance(value, (str, type(u""))):
                value = str(value)
            # The value is placed inside double quotes, so '&' and '"'
            # must be written as references ('&' first, so the '&' of
            # '&quot;' is not escaped a second time).
            value = value.replace("&", "&amp;").replace('"', "&quot;")
            pieces.append(name + '="' + value + '"')
        return " ".join(pieces)

    def format_start_tag(self, tag, attrs):
        """Private - Format a <tag attributes>

        tag is the element name and attrs a sequence of (name, value)
        pairs. Returns the start tag as a string, with no padding when
        attrs is empty.
        """
        return "<" + self._format_tag_contents(tag, attrs) + ">"

    def format_end_tag(self, tag):
        """Private - Format an </tag>"""
        return "</{}>".format(tag)

    def handle_starttag(self, tag, attrs):
        """Overridden - Called when a start tag is parsed"""
        self.out_doc += self.format_start_tag(tag, attrs)

    def handle_startendtag(self, tag, attrs):
        """Overridden - Called when an empty-element tag (<tag/>) is parsed

        Without this override the tag would be reported as a start tag
        followed by an end tag and come out as <tag></tag>.
        """
        self.out_doc += "<" + self._format_tag_contents(tag, attrs) + " />"

    def handle_endtag(self, tag):
        """Overridden - Called when an end tag is parsed"""
        self.out_doc += self.format_end_tag(tag)

    def handle_data(self, data):
        """Overridden - Called when some data is encountered"""
        self.out_doc += data

    def handle_charref(self, name):
        """Overridden - Called when a charref (&#xyz) is parsed"""
        self.out_doc += ("&#" + name + ";")

    def handle_entityref(self, name):
        """Overridden - Called when an entityref (&xyz) is parsed"""
        self.out_doc += ("&" + name + ";")

    def handle_comment(self, data):
        """Overridden - Called when a comment (<!--xyz-->) is parsed"""
        self.out_doc += ("<!--" + data + "-->")

    def handle_decl(self, decl):
        """Overridden - Called when a SGML declaration (<!) is parsed"""
        self.out_doc += ("<!" + decl + ">")

    def handle_pi(self, data):
        """Overridden - Called when a processing instruction (<?) is parsed"""
        self.out_doc += ("<?" + data + ">")
00087