BeautifulSoup.py
00001 """Beautiful Soup
00002 Elixir and Tonic
00003 "The Screen-Scraper's Friend"
00004 http://www.crummy.com/software/BeautifulSoup/
00005 
00006 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
00007 tree representation. It provides methods and Pythonic idioms that make
00008 it easy to navigate, search, and modify the tree.
00009 
00010 A well-formed XML/HTML document yields a well-formed data
00011 structure. An ill-formed XML/HTML document yields a correspondingly
00012 ill-formed data structure. If your document is only locally
00013 well-formed, you can use this library to find and process the
00014 well-formed part of it.
00015 
00016 Beautiful Soup works with Python 2.2 and up. It has no external
00017 dependencies, but you'll have more success at converting data to UTF-8
00018 if you also install these three packages:
00019 
00020 * chardet, for auto-detecting character encodings
00021   http://chardet.feedparser.org/
00022 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
00023   by stock Python.
00024   http://cjkpython.i18n.org/
00025 
00026 Beautiful Soup defines classes for two main parsing strategies:
00027     
00028  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
00029    language that kind of looks like XML.
00030 
00031  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
00032    or invalid. This class has web browser-like heuristics for
00033    obtaining a sensible parse tree in the face of common HTML errors.
00034 
00035 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
00036 the encoding of an HTML or XML document, and converting it to
00037 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
00038 
00039 For more than you ever wanted to know about Beautiful Soup, see the
00040 documentation:
00041 http://www.crummy.com/software/BeautifulSoup/documentation.html
00042 
00043 """
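# Illustrative usage sketch, assuming this file is importable as the
# 'BeautifulSoup' module:
#
#     from BeautifulSoup import BeautifulSoup
#     soup = BeautifulSoup('<html><body><a href="http://example.com/">Hi</a>'
#                          '</body></html>')
#     link = soup.find('a')            # first <a> Tag in the tree
#     print link['href']               # -> http://example.com/
#     print link.string                # -> Hi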
00044 from __future__ import generators
00045 
00046 __author__ = "Leonard Richardson (leonardr@segfault.org)"
00047 __version__ = "3.0.4"
00048 __copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
00049 __license__ = "PSF"
00050 
00051 from sgmllib import SGMLParser, SGMLParseError
00052 import codecs
00053 import types
00054 import re
00055 import sgmllib
00056 try:
00057   from htmlentitydefs import name2codepoint
00058 except ImportError:
00059   name2codepoint = {}
00060 
00061 #This hack makes Beautiful Soup able to parse XML with namespaces
00062 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
00063 
00064 DEFAULT_OUTPUT_ENCODING = "utf-8"
00065 
00066 # First, the classes that represent markup elements.
00067 
00068 class PageElement:
00069     """Contains the navigational information for some part of the page
00070     (either a tag or a piece of text)"""
00071 
00072     def setup(self, parent=None, previous=None):
00073         """Sets up the initial relations between this element and
00074         other elements."""        
00075         self.parent = parent
00076         self.previous = previous
00077         self.next = None
00078         self.previousSibling = None
00079         self.nextSibling = None
00080         if self.parent and self.parent.contents:
00081             self.previousSibling = self.parent.contents[-1]
00082             self.previousSibling.nextSibling = self
00083 
00084     def replaceWith(self, replaceWith):        
00085         oldParent = self.parent
00086         myIndex = self.parent.contents.index(self)
00087         if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
00088             # We're replacing this element with one of its siblings.
00089             index = self.parent.contents.index(replaceWith)
00090             if index and index < myIndex:
00091                 # Furthermore, it comes before this element. That
00092                 # means that when we extract it, the index of this
00093                 # element will change.
00094                 myIndex = myIndex - 1
00095         self.extract()        
00096         oldParent.insert(myIndex, replaceWith)
00097         
00098     def extract(self):
00099         """Destructively rips this element out of the tree."""        
00100         if self.parent:
00101             try:
00102                 self.parent.contents.remove(self)
00103             except ValueError:
00104                 pass
00105 
00106         #Find the two elements that would be next to each other if
00107         #this element (and any children) hadn't been parsed. Connect
00108         #the two.        
00109         lastChild = self._lastRecursiveChild()
00110         nextElement = lastChild.next
00111 
00112         if self.previous:
00113             self.previous.next = nextElement
00114         if nextElement:
00115             nextElement.previous = self.previous
00116         self.previous = None
00117         lastChild.next = None
00118 
00119         self.parent = None        
00120         if self.previousSibling:
00121             self.previousSibling.nextSibling = self.nextSibling
00122         if self.nextSibling:
00123             self.nextSibling.previousSibling = self.previousSibling
00124         self.previousSibling = self.nextSibling = None       
00125 
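    # Illustrative sketch of the destructive methods above (replaceWith calls
    # extract() internally); plain strings are wrapped in NavigableString:
    #
    #     from BeautifulSoup import BeautifulSoup
    #     soup = BeautifulSoup('<p>one <b>two</b> three</p>')
    #     b = soup.find('b')
    #     b.replaceWith('two!')
    #     print soup                # -> <p>one two! three</p>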
00126     def _lastRecursiveChild(self):
00127         "Finds the last element beneath this object to be parsed."
00128         lastChild = self
00129         while hasattr(lastChild, 'contents') and lastChild.contents:
00130             lastChild = lastChild.contents[-1]
00131         return lastChild
00132 
00133     def insert(self, position, newChild):
00134         if (isinstance(newChild, basestring)
00135             or isinstance(newChild, unicode)) \
00136             and not isinstance(newChild, NavigableString):
00137             newChild = NavigableString(newChild)        
00138 
00139         position =  min(position, len(self.contents))
00140         if hasattr(newChild, 'parent') and newChild.parent != None:
00141             # We're 'inserting' an element that's already one
00142             # of this object's children. 
00143             if newChild.parent == self:
00144                 index = self.contents.index(newChild)
00145                 if index and index < position:
00146                     # Furthermore we're moving it further down the
00147                     # list of this object's children. That means that
00148                     # when we extract this element, our target index
00149                     # will jump down one.
00150                     position = position - 1
00151             newChild.extract()
00152             
00153         newChild.parent = self
00154         previousChild = None
00155         if position == 0:
00156             newChild.previousSibling = None
00157             newChild.previous = self
00158         else:
00159             previousChild = self.contents[position-1]
00160             newChild.previousSibling = previousChild
00161             newChild.previousSibling.nextSibling = newChild
00162             newChild.previous = previousChild._lastRecursiveChild()
00163         if newChild.previous:
00164             newChild.previous.next = newChild        
00165 
00166         newChildsLastElement = newChild._lastRecursiveChild()
00167 
00168         if position >= len(self.contents):
00169             newChild.nextSibling = None
00170             
00171             parent = self
00172             parentsNextSibling = None
00173             while not parentsNextSibling:
00174                 parentsNextSibling = parent.nextSibling
00175                 parent = parent.parent
00176                 if not parent: # This is the last element in the document.
00177                     break
00178             if parentsNextSibling:
00179                 newChildsLastElement.next = parentsNextSibling
00180             else:
00181                 newChildsLastElement.next = None
00182         else:
00183             nextChild = self.contents[position]            
00184             newChild.nextSibling = nextChild            
00185             if newChild.nextSibling:
00186                 newChild.nextSibling.previousSibling = newChild
00187             newChildsLastElement.next = nextChild
00188 
00189         if newChildsLastElement.next:
00190             newChildsLastElement.next.previous = newChildsLastElement
00191         self.contents.insert(position, newChild)
00192 
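    # Illustrative sketch of insert(), which splices a new child in at a
    # given position and rewires the next/previous pointers set up above:
    #
    #     from BeautifulSoup import BeautifulSoup, Tag
    #     soup = BeautifulSoup('<ul><li>a</li><li>c</li></ul>')
    #     li = Tag(soup, 'li')
    #     li.insert(0, 'b')
    #     soup.ul.insert(1, li)
    #     print soup.ul    # -> <ul><li>a</li><li>b</li><li>c</li></ul>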
00193     def findNext(self, name=None, attrs={}, text=None, **kwargs):
00194         """Returns the first item that matches the given criteria and
00195         appears after this Tag in the document."""
00196         return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
00197 
00198     def findAllNext(self, name=None, attrs={}, text=None, limit=None,
00199                     **kwargs):
00200         """Returns all items that match the given criteria and appear
00201         after this Tag in the document."""
00202         return self._findAll(name, attrs, text, limit, self.nextGenerator, **kwargs)
00203 
00204     def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
00205         """Returns the closest sibling to this Tag that matches the
00206         given criteria and appears after this Tag in the document."""
00207         return self._findOne(self.findNextSiblings, name, attrs, text,
00208                              **kwargs)
00209 
00210     def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
00211                          **kwargs):
00212         """Returns the siblings of this Tag that match the given
00213         criteria and appear after this Tag in the document."""
00214         return self._findAll(name, attrs, text, limit,
00215                              self.nextSiblingGenerator, **kwargs)
00216     fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
00217 
00218     def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
00219         """Returns the first item that matches the given criteria and
00220         appears before this Tag in the document."""
00221         return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
00222 
00223     def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
00224                         **kwargs):
00225         """Returns all items that match the given criteria and appear
00226         before this Tag in the document."""
00227         return self._findAll(name, attrs, text, limit, self.previousGenerator,
00228                            **kwargs)
00229     fetchPrevious = findAllPrevious # Compatibility with pre-3.x
00230 
00231     def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
00232         """Returns the closest sibling to this Tag that matches the
00233         given criteria and appears before this Tag in the document."""
00234         return self._findOne(self.findPreviousSiblings, name, attrs, text,
00235                              **kwargs)
00236 
00237     def findPreviousSiblings(self, name=None, attrs={}, text=None,
00238                              limit=None, **kwargs):
00239         """Returns the siblings of this Tag that match the given
00240         criteria and appear before this Tag in the document."""
00241         return self._findAll(name, attrs, text, limit,
00242                              self.previousSiblingGenerator, **kwargs)
00243     fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
00244 
00245     def findParent(self, name=None, attrs={}, **kwargs):
00246         """Returns the closest parent of this Tag that matches the given
00247         criteria."""
00248         # NOTE: We can't use _findOne because findParents takes a different
00249         # set of arguments.
00250         r = None
00251         l = self.findParents(name, attrs, 1)
00252         if l:
00253             r = l[0]
00254         return r
00255 
00256     def findParents(self, name=None, attrs={}, limit=None, **kwargs):
00257         """Returns the parents of this Tag that match the given
00258         criteria."""
00259 
00260         return self._findAll(name, attrs, None, limit, self.parentGenerator,
00261                              **kwargs)
00262     fetchParents = findParents # Compatibility with pre-3.x
00263 
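    # Illustrative sketch of the directional find* methods above:
    #
    #     from BeautifulSoup import BeautifulSoup
    #     soup = BeautifulSoup('<table><tr><td>1</td><td>2</td></tr></table>')
    #     cell = soup.find('td')
    #     print cell.findNextSibling('td')      # -> <td>2</td>
    #     print cell.findParent('table').name   # -> table
    #     print cell.findPrevious('tr').name    # -> tr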
00264     #These methods do the real heavy lifting.
00265 
00266     def _findOne(self, method, name, attrs, text, **kwargs):
00267         r = None
00268         l = method(name, attrs, text, 1, **kwargs)
00269         if l:
00270             r = l[0]
00271         return r
00272     
00273     def _findAll(self, name, attrs, text, limit, generator, **kwargs):
00274         "Iterates over a generator looking for things that match."
00275 
00276         if isinstance(name, SoupStrainer):
00277             strainer = name
00278         else:
00279             # Build a SoupStrainer
00280             strainer = SoupStrainer(name, attrs, text, **kwargs)
00281         results = ResultSet(strainer)
00282         g = generator()
00283         while True:
00284             try:
00285                 i = g.next()
00286             except StopIteration:
00287                 break
00288             if i:
00289                 found = strainer.search(i)
00290                 if found:
00291                     results.append(found)
00292                     if limit and len(results) >= limit:
00293                         break
00294         return results
00295 
00296     #These Generators can be used to navigate starting from both
00297     #NavigableStrings and Tags.                
00298     def nextGenerator(self):
00299         i = self
00300         while i:
00301             i = i.next
00302             yield i
00303 
00304     def nextSiblingGenerator(self):
00305         i = self
00306         while i:
00307             i = i.nextSibling
00308             yield i
00309 
00310     def previousGenerator(self):
00311         i = self
00312         while i:
00313             i = i.previous
00314             yield i
00315 
00316     def previousSiblingGenerator(self):
00317         i = self
00318         while i:
00319             i = i.previousSibling
00320             yield i
00321 
00322     def parentGenerator(self):
00323         i = self
00324         while i:
00325             i = i.parent
00326             yield i
00327 
00328     # Utility methods
00329     def substituteEncoding(self, str, encoding=None):
00330         encoding = encoding or "utf-8"
00331         return str.replace("%SOUP-ENCODING%", encoding)    
00332 
00333     def toEncoding(self, s, encoding=None):
00334         """Encodes an object to a string in some encoding, or to
00335         Unicode."""
00336         if isinstance(s, unicode):
00337             if encoding:
00338                 s = s.encode(encoding)
00339         elif isinstance(s, str):
00340             if encoding:
00341                 s = s.encode(encoding)
00342             else:
00343                 s = unicode(s)
00344         else:
00345             if encoding:
00346                 s  = self.toEncoding(str(s), encoding)
00347             else:
00348                 s = unicode(s)
00349         return s
00350 
00351 class NavigableString(unicode, PageElement):
00352 
00353     def __getattr__(self, attr):
00354         """text.string gives you text. This is for backwards
00355         compatibility for Navigable*String, but for CData* it lets you
00356         get the string without the CData wrapper."""
00357         if attr == 'string':
00358             return self
00359         else:
00360             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
00361 
00362     def __unicode__(self):
00363         return self.__str__(None)
00364 
00365     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
00366         if encoding:
00367             return self.encode(encoding)
00368         else:
00369             return self
00370         
00371 class CData(NavigableString):
00372 
00373     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
00374         return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
00375 
00376 class ProcessingInstruction(NavigableString):
00377     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
00378         output = self
00379         if "%SOUP-ENCODING%" in output:
00380             output = self.substituteEncoding(output, encoding)
00381         return "<?%s?>" % self.toEncoding(output, encoding)
00382 
00383 class Comment(NavigableString):
00384     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
00385         return "<!--%s-->" % NavigableString.__str__(self, encoding)    
00386 
00387 class Declaration(NavigableString):
00388     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
00389         return "<!%s>" % NavigableString.__str__(self, encoding)        
00390 
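# Illustrative sketch: each NavigableString subclass above simply wraps its
# text in the appropriate markup when rendered.
#
#     from BeautifulSoup import Comment, CData, Declaration
#     print Comment(u'cached')            # -> <!--cached-->
#     print CData(u'x < y')               # -> <![CDATA[x < y]]>
#     print Declaration(u'DOCTYPE html')  # -> <!DOCTYPE html>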
00391 class Tag(PageElement):
00392 
00393     """Represents a found HTML tag with its attributes and contents."""
00394 
00395     XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "apos",
00396                                       '"' : "quot",
00397                                       "&" : "amp",
00398                                       "<" : "lt",
00399                                       ">" : "gt" }
00400 
00401     def __init__(self, parser, name, attrs=None, parent=None,
00402                  previous=None):
00403         "Basic constructor."
00404 
00405         # We don't actually store the parser object: that lets extracted
00406         # chunks be garbage-collected
00407         self.parserClass = parser.__class__
00408         self.isSelfClosing = parser.isSelfClosingTag(name)
00409         self.name = name
00410         if attrs == None:
00411             attrs = []
00412         self.attrs = attrs
00413         self.contents = []
00414         self.setup(parent, previous)
00415         self.hidden = False
00416         self.containsSubstitutions = False
00417 
00418     def get(self, key, default=None):
00419         """Returns the value of the 'key' attribute for the tag, or
00420         the value given for 'default' if it doesn't have that
00421         attribute."""
00422         return self._getAttrMap().get(key, default)    
00423 
00424     def has_key(self, key):
00425         return self._getAttrMap().has_key(key)
00426 
00427     def __getitem__(self, key):
00428         """tag[key] returns the value of the 'key' attribute for the tag,
00429         and throws an exception if it's not there."""
00430         return self._getAttrMap()[key]
00431 
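    # Illustrative sketch of the dictionary-style attribute access above:
    #
    #     from BeautifulSoup import BeautifulSoup
    #     a = BeautifulSoup('<a href="/x" class="nav">x</a>').a
    #     print a['href']             # -> /x
    #     print a.get('title')        # -> None (no KeyError)
    #     print a.has_key('class')    # -> True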
00432     def __iter__(self):
00433         "Iterating over a tag iterates over its contents."
00434         return iter(self.contents)
00435 
00436     def __len__(self):
00437         "The length of a tag is the length of its list of contents."
00438         return len(self.contents)
00439 
00440     def __contains__(self, x):
00441         return x in self.contents
00442 
00443     def __nonzero__(self):
00444         "A tag is non-None even if it has no contents."
00445         return True
00446 
00447     def __setitem__(self, key, value):        
00448         """Setting tag[key] sets the value of the 'key' attribute for the
00449         tag."""
00450         self._getAttrMap()
00451         self.attrMap[key] = value
00452         found = False
00453         for i in range(0, len(self.attrs)):
00454             if self.attrs[i][0] == key:
00455                 self.attrs[i] = (key, value)
00456                 found = True
00457         if not found:
00458             self.attrs.append((key, value))
00459         self._getAttrMap()[key] = value
00460 
00461     def __delitem__(self, key):
00462         "Deleting tag[key] deletes all 'key' attributes for the tag."
00463         #Bad HTML can define the same attribute multiple times, so we
00464         #filter out every matching pair instead of removing entries
00465         #while iterating over the list (which would skip elements).
00466         self.attrs = [item for item in self.attrs
00467                       if item[0] != key]
00468         self._getAttrMap()
00469         if self.attrMap.has_key(key):
00470             del self.attrMap[key]
00471 
00472     def __call__(self, *args, **kwargs):
00473         """Calling a tag like a function is the same as calling its
00474         findAll() method. Eg. tag('a') returns a list of all the A tags
00475         found within this tag."""
00476         return apply(self.findAll, args, kwargs)
00477 
00478     def __getattr__(self, tag):
00479         #print "Getattr %s.%s" % (self.__class__, tag)
00480         if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
00481             return self.find(tag[:-3])
00482         elif tag.find('__') != 0:
00483             return self.find(tag)
00484 
00485     def __eq__(self, other):
00486         """Returns true iff this tag has the same name, the same attributes,
00487         and the same contents (recursively) as the given tag.
00488 
00489         NOTE: right now this will return false if two tags have the
00490         same attributes in a different order. Should this be fixed?"""
00491         if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
00492             return False
00493         for i in range(0, len(self.contents)):
00494             if self.contents[i] != other.contents[i]:
00495                 return False
00496         return True
00497 
00498     def __ne__(self, other):
00499         """Returns true iff this tag is not identical to the other tag,
00500         as defined in __eq__."""
00501         return not self == other
00502 
00503     def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
00504         """Renders this tag as a string."""
00505         return self.__str__(encoding)
00506 
00507     def __unicode__(self):
00508         return self.__str__(None)
00509 
00510     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
00511                 prettyPrint=False, indentLevel=0):
00512         """Returns a string or Unicode representation of this tag and
00513         its contents. To get Unicode, pass None for encoding.
00514 
00515         NOTE: since Python's HTML parser consumes whitespace, this
00516         method is not certain to reproduce the whitespace present in
00517         the original string."""
00518 
00519         encodedName = self.toEncoding(self.name, encoding)
00520 
00521         attrs = []
00522         if self.attrs:
00523             for key, val in self.attrs:
00524                 fmt = '%s="%s"'
00525                 if isString(val):                    
00526                     if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
00527                         val = self.substituteEncoding(val, encoding)
00528 
00529                     # The attribute value either:
00530                     #
00531                     # * Contains no embedded double quotes or single quotes.
00532                     #   No problem: we enclose it in double quotes.
00533                     # * Contains embedded single quotes. No problem:
00534                     #   double quotes work here too.
00535                     # * Contains embedded double quotes. No problem:
00536                     #   we enclose it in single quotes.
00537                     # * Embeds both single _and_ double quotes. This
00538                     #   can't happen naturally, but it can happen if
00539                     #   you modify an attribute value after parsing
00540                     #   the document. Now we have a bit of a
00541                     #   problem. We solve it by enclosing the
00542                     #   attribute in single quotes, and escaping any
00543                     #   embedded single quotes to XML entities.
00544                     if '"' in val:
00545                         fmt = "%s='%s'"
00546                         # This can't happen naturally, but it can happen
00547                         # if you modify an attribute value after parsing.
00548                         if "'" in val:
00549                             val = val.replace("'", "&apos;")
00550 
00551                     # Now we're okay w/r/t quotes. But the attribute
00552                     # value might also contain angle brackets, or
00553                     # ampersands that aren't part of entities. We need
00554                     # to escape those to XML entities too.
00555                     val = re.sub("([<>]|&(?![^\s]+;))",
00556                                  lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",
00557                                  val)
00558                                       
00559                 attrs.append(fmt % (self.toEncoding(key, encoding),
00560                                     self.toEncoding(val, encoding)))
00561         close = ''
00562         closeTag = ''
00563         if self.isSelfClosing:
00564             close = ' /'
00565         else:
00566             closeTag = '</%s>' % encodedName
00567 
00568         indentTag, indentContents = 0, 0
00569         if prettyPrint:
00570             indentTag = indentLevel
00571             space = (' ' * (indentTag-1))
00572             indentContents = indentTag + 1
00573         contents = self.renderContents(encoding, prettyPrint, indentContents)
00574         if self.hidden:
00575             s = contents
00576         else:
00577             s = []
00578             attributeString = ''
00579             if attrs:
00580                 attributeString = ' ' + ' '.join(attrs)            
00581             if prettyPrint:
00582                 s.append(space)
00583             s.append('<%s%s%s>' % (encodedName, attributeString, close))
00584             if prettyPrint:
00585                 s.append("\n")
00586             s.append(contents)
00587             if prettyPrint and contents and contents[-1] != "\n":
00588                 s.append("\n")
00589             if prettyPrint and closeTag:
00590                 s.append(space)
00591             s.append(closeTag)
00592             if prettyPrint and closeTag and self.nextSibling:
00593                 s.append("\n")
00594             s = ''.join(s)
00595         return s
00596 
00597     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
00598         return self.__str__(encoding, True)
00599 
00600     def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
00601                        prettyPrint=False, indentLevel=0):
00602         """Renders the contents of this tag as a string in the given
00603         encoding. If encoding is None, returns a Unicode string."""
00604         s=[]
00605         for c in self:
00606             text = None
00607             if isinstance(c, NavigableString):
00608                 text = c.__str__(encoding)
00609             elif isinstance(c, Tag):
00610                 s.append(c.__str__(encoding, prettyPrint, indentLevel))
00611             if text and prettyPrint:
00612                 text = text.strip()              
00613             if text:
00614                 if prettyPrint:
00615                     s.append(" " * (indentLevel-1))
00616                 s.append(text)
00617                 if prettyPrint:
00618                     s.append("\n")
00619         return ''.join(s)    
00620 
00621     #Soup methods
00622 
00623     def find(self, name=None, attrs={}, recursive=True, text=None,
00624              **kwargs):
00625         """Return only the first child of this Tag matching the given
00626         criteria."""
00627         r = None
00628         l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
00629         if l:
00630             r = l[0]
00631         return r
00632     findChild = find
00633 
00634     def findAll(self, name=None, attrs={}, recursive=True, text=None,
00635                 limit=None, **kwargs):
00636         """Extracts a list of Tag objects that match the given
00637         criteria.  You can specify the name of the Tag and any
00638         attributes you want the Tag to have.
00639 
00640         The value of a key-value pair in the 'attrs' map can be a
00641         string, a list of strings, a regular expression object, or a
00642         callable that takes a string and returns whether or not the
00643         string matches for some custom definition of 'matches'. The
00644         same is true of the tag name."""
00645         generator = self.recursiveChildGenerator
00646         if not recursive:
00647             generator = self.childGenerator
00648         return self._findAll(name, attrs, text, limit, generator, **kwargs)
00649     findChildren = findAll
00650 
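    # Illustrative sketch: 'name' and 'attrs' accept strings, lists, compiled
    # regular expressions, or callables, and calling a tag is shorthand for
    # findAll (see __call__ above).
    #
    #     import re
    #     from BeautifulSoup import BeautifulSoup
    #     soup = BeautifulSoup('<h1>A</h1><h2>B</h2><p class="x">C</p>')
    #     print soup.findAll(['h1', 'h2'])         # -> [<h1>A</h1>, <h2>B</h2>]
    #     print soup.findAll(re.compile('^h'))     # same two tags, by regex
    #     print soup.findAll('p', {'class': 'x'})  # -> [<p class="x">C</p>]
    #     print soup('p', limit=1)                 # tag(...) == tag.findAll(...)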
00651     # Pre-3.x compatibility methods
00652     first = find
00653     fetch = findAll
00654     
00655     def fetchText(self, text=None, recursive=True, limit=None):
00656         return self.findAll(text=text, recursive=recursive, limit=limit)
00657 
00658     def firstText(self, text=None, recursive=True):
00659         return self.find(text=text, recursive=recursive)
00660     
00661     #Utility methods
00662 
00663     def append(self, tag):
00664         """Appends the given tag to the contents of this tag."""
00665         self.contents.append(tag)
00666 
00667     #Private methods
00668 
00669     def _getAttrMap(self):
00670         """Initializes a map representation of this tag's attributes,
00671         if not already initialized."""
00672         if not getattr(self, 'attrMap'):
00673             self.attrMap = {}
00674             for (key, value) in self.attrs:
00675                 self.attrMap[key] = value 
00676         return self.attrMap
00677 
00678     #Generator methods
00679     def childGenerator(self):
00680         for i in range(0, len(self.contents)):
00681             yield self.contents[i]
00682         raise StopIteration
00683     
00684     def recursiveChildGenerator(self):
00685         stack = [(self, 0)]
00686         while stack:
00687             tag, start = stack.pop()
00688             if isinstance(tag, Tag):            
00689                 for i in range(start, len(tag.contents)):
00690                     a = tag.contents[i]
00691                     yield a
00692                     if isinstance(a, Tag) and tag.contents:
00693                         if i < len(tag.contents) - 1:
00694                             stack.append((tag, i+1))
00695                         stack.append((a, 0))
00696                         break
00697         raise StopIteration
00698 
00699 # Next, a couple classes to represent queries and their results.
00700 class SoupStrainer:
00701     """Encapsulates a number of ways of matching a markup element (tag or
00702     text)."""
00703 
00704     def __init__(self, name=None, attrs={}, text=None, **kwargs):
00705         self.name = name
00706         if isString(attrs):
00707             kwargs['class'] = attrs
00708             attrs = None
00709         if kwargs:
00710             if attrs:
00711                 attrs = attrs.copy()
00712                 attrs.update(kwargs)
00713             else:
00714                 attrs = kwargs
00715         self.attrs = attrs
00716         self.text = text
00717 
00718     def __str__(self):
00719         if self.text:
00720             return self.text
00721         else:
00722             return "%s|%s" % (self.name, self.attrs)
00723     
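    # Illustrative sketch: a SoupStrainer can be passed to the soup
    # constructor as parseOnlyThese so that only matching elements are
    # turned into Tags.
    #
    #     from BeautifulSoup import BeautifulSoup, SoupStrainer
    #     links = SoupStrainer('a', href=True)
    #     soup = BeautifulSoup('<p>x <a href="/y">y</a></p>',
    #                          parseOnlyThese=links)
    #     print soup     # -> <a href="/y">y</a>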
00724     def searchTag(self, markupName=None, markupAttrs={}):
00725         found = None
00726         markup = None
00727         if isinstance(markupName, Tag):
00728             markup = markupName
00729             markupAttrs = markup
00730         callFunctionWithTagData = callable(self.name) \
00731                                 and not isinstance(markupName, Tag)
00732 
00733         if (not self.name) \
00734                or callFunctionWithTagData \
00735                or (markup and self._matches(markup, self.name)) \
00736                or (not markup and self._matches(markupName, self.name)):
00737             if callFunctionWithTagData:
00738                 match = self.name(markupName, markupAttrs)
00739             else:
00740                 match = True            
00741                 markupAttrMap = None
00742                 for attr, matchAgainst in self.attrs.items():
00743                     if not markupAttrMap:
00744                         if hasattr(markupAttrs, 'get'):
00745                             markupAttrMap = markupAttrs
00746                         else:
00747                             markupAttrMap = {}
00748                             for k,v in markupAttrs:
00749                                 markupAttrMap[k] = v
00750                     attrValue = markupAttrMap.get(attr)
00751                     if not self._matches(attrValue, matchAgainst):
00752                         match = False
00753                         break
00754             if match:
00755                 if markup:
00756                     found = markup
00757                 else:
00758                     found = markupName
00759         return found
00760 
00761     def search(self, markup):
00762         #print 'looking for %s in %s' % (self, markup)
00763         found = None
00764         # If given a list of items, scan it for a text element that
00765         # matches.        
00766         if isList(markup) and not isinstance(markup, Tag):
00767             for element in markup:
00768                 if isinstance(element, NavigableString) \
00769                        and self.search(element):
00770                     found = element
00771                     break
00772         # If it's a Tag, make sure its name or attributes match.
00773         # Don't bother with Tags if we're searching for text.
00774         elif isinstance(markup, Tag):
00775             if not self.text:
00776                 found = self.searchTag(markup)
00777         # If it's text, make sure the text matches.
00778         elif isinstance(markup, NavigableString) or \
00779                  isString(markup):
00780             if self._matches(markup, self.text):
00781                 found = markup
00782         else:
00783             raise Exception, "I don't know how to match against a %s" \
00784                   % markup.__class__
00785         return found
00786         
00787     def _matches(self, markup, matchAgainst):    
00788         #print "Matching %s against %s" % (markup, matchAgainst)
00789         result = False
00790         if matchAgainst == True and type(matchAgainst) == types.BooleanType:
00791             result = markup != None
00792         elif callable(matchAgainst):
00793             result = matchAgainst(markup)
00794         else:
00795             #Custom match methods take the tag as an argument, but all
00796             #other ways of matching match the tag name as a string.
00797             if isinstance(markup, Tag):
00798                 markup = markup.name
00799             if markup and not isString(markup):
00800                 markup = unicode(markup)
00801             #Now we know that chunk is either a string, or None.
00802             if hasattr(matchAgainst, 'match'):
00803                 # It's a regexp object.
00804                 result = markup and matchAgainst.search(markup)
00805             elif isList(matchAgainst):
00806                 result = markup in matchAgainst
00807             elif hasattr(matchAgainst, 'items'):
00808                 result = matchAgainst.has_key(markup)
00809             elif matchAgainst and isString(markup):
00810                 if isinstance(markup, unicode):
00811                     matchAgainst = unicode(matchAgainst)
00812                 else:
00813                     matchAgainst = str(matchAgainst)
00814 
00815             if not result:
00816                 result = matchAgainst == markup
00817         return result
00818 
00819 class ResultSet(list):
00820     """A ResultSet is just a list that keeps track of the SoupStrainer
00821     that created it."""
00822     def __init__(self, source):
00823         list.__init__(self)
00824         self.source = source
00825 
00826 # Now, some helper functions.
00827 
00828 def isList(l):
00829     """Convenience method that works with all 2.x versions of Python
00830     to determine whether or not something is listlike."""
00831     return hasattr(l, '__iter__') \
00832            or (type(l) in (types.ListType, types.TupleType))
00833 
00834 def isString(s):
00835     """Convenience method that works with all 2.x versions of Python
00836     to determine whether or not something is stringlike."""
00837     try:
00838         return isinstance(s, unicode) or isinstance(s, basestring)
00839     except NameError:
00840         return isinstance(s, str)
00841 
00842 def buildTagMap(default, *args):
00843     """Turns a list of maps, lists, or scalars into a single map.
00844     Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
00845     NESTING_RESET_TAGS maps out of lists and partial maps."""
00846     built = {}
00847     for portion in args:
00848         if hasattr(portion, 'items'):
00849             #It's a map. Merge it.
00850             for k,v in portion.items():
00851                 built[k] = v
00852         elif isList(portion):
00853             #It's a list. Map each item to the default.
00854             for k in portion:
00855                 built[k] = default
00856         else:
00857             #It's a scalar. Map it to the default.
00858             built[portion] = default
00859     return built
00860 
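# Illustrative sketch of buildTagMap flattening its arguments into one map:
#
#     buildTagMap(None, ['br', 'hr'], {'li': ['ul', 'ol']}, 'p')
#     # -> {'br': None, 'hr': None, 'li': ['ul', 'ol'], 'p': None}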
00861 # Now, the parser classes.
00862 
00863 class BeautifulStoneSoup(Tag, SGMLParser):
00864 
00865     """This class contains the basic parser and search code. It defines
00866     a parser that knows nothing about tag behavior except for the
00867     following:
00868    
00869       You can't close a tag without closing all the tags it encloses.
00870       That is, "<foo><bar></foo>" actually means
00871       "<foo><bar></bar></foo>".
00872 
00873     [Another possible explanation is "<foo><bar /></foo>", but since
00874     this class defines no SELF_CLOSING_TAGS, it will never use that
00875     explanation.]
00876 
00877     This class is useful for parsing XML or made-up markup languages,
00878     or when BeautifulSoup makes an assumption counter to what you were
00879     expecting."""
00880 
00881     XML_ENTITY_LIST = {}
00882     for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values():
00883         XML_ENTITY_LIST[i] = True 
00884 
00885     SELF_CLOSING_TAGS = {}
00886     NESTABLE_TAGS = {}
00887     RESET_NESTING_TAGS = {}
00888     QUOTE_TAGS = {}
00889 
00890     MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
00891                        lambda x: x.group(1) + ' />'),
00892                       (re.compile('<!\s+([^<>]*)>'),
00893                        lambda x: '<!' + x.group(1) + '>')
00894                       ]
00895 
00896     ROOT_TAG_NAME = u'[document]'
00897 
00898     HTML_ENTITIES = "html"
00899     XML_ENTITIES = "xml"
00900 
00901     def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
00902                  markupMassage=True, smartQuotesTo=XML_ENTITIES,
00903                  convertEntities=None, selfClosingTags=None):
00904         """The Soup object is initialized as the 'root tag', and the
00905         provided markup (which can be a string or a file-like object)
00906         is fed into the underlying parser. 
00907 
00908         sgmllib will process most bad HTML, and the BeautifulSoup
00909         class has some tricks for dealing with some HTML that kills
00910         sgmllib, but Beautiful Soup can nonetheless choke or lose data
00911         if your data uses self-closing tags or declarations
00912         incorrectly.
00913 
00914         By default, Beautiful Soup uses regexes to sanitize input,
00915         avoiding the vast majority of these problems. If the problems
00916         don't apply to you, pass in False for markupMassage, and
00917         you'll get better performance.
00918 
00919         The default parser massage techniques fix the two most common
00920         instances of invalid HTML that choke sgmllib:
00921 
00922          <br/> (no space between the tag name and the closing '/>')
00923          <! --Comment--> (Extraneous whitespace in declaration)
00924 
00925         You can pass in a custom list of (RE object, replace method)
00926         tuples to get Beautiful Soup to scrub your input the way you
00927         want."""
00928 
00929         self.parseOnlyThese = parseOnlyThese
00930         self.fromEncoding = fromEncoding
00931         self.smartQuotesTo = smartQuotesTo
00932         self.convertEntities = convertEntities
00933         if self.convertEntities:
00934             # It doesn't make sense to convert encoded characters to
00935             # entities even while you're converting entities to Unicode.
00936             # Just convert it all to Unicode.
00937             self.smartQuotesTo = None
00938         self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
00939         SGMLParser.__init__(self)
00940             
00941         if hasattr(markup, 'read'):        # It's a file-type object.
00942             markup = markup.read()
00943         self.markup = markup
00944         self.markupMassage = markupMassage
00945         try:
00946             self._feed()
00947         except StopParsing:
00948             pass
00949         self.markup = None                 # The markup can now be GCed
00950         
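    # Illustrative sketch of the constructor options described in the
    # docstring above:
    #
    #     from BeautifulSoup import BeautifulStoneSoup
    #     xml = '<doc><rec id="1"/>&lt;raw&gt;</doc>'
    #     soup = BeautifulStoneSoup(xml, selfClosingTags=['rec'],
    #                               convertEntities=BeautifulStoneSoup.XML_ENTITIES)
    #     print soup.rec['id']     # -> 1
    #     # pass markupMassage=False to skip the regex cleanup pass entirely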
00951     def _feed(self, inDocumentEncoding=None):
00952         # Convert the document to Unicode.
00953         markup = self.markup
00954         if isinstance(markup, unicode):
00955             if not hasattr(self, 'originalEncoding'):
00956                 self.originalEncoding = None
00957         else:
00958             dammit = UnicodeDammit\
00959                      (markup, [self.fromEncoding, inDocumentEncoding],
00960                       smartQuotesTo=self.smartQuotesTo)
00961             markup = dammit.unicode
00962             self.originalEncoding = dammit.originalEncoding
00963         if markup:
00964             if self.markupMassage:
00965                 if not isList(self.markupMassage):
00966                     self.markupMassage = self.MARKUP_MASSAGE            
00967                 for fix, m in self.markupMassage:
00968                     markup = fix.sub(m, markup)
00969         self.reset()
00970 
00971         SGMLParser.feed(self, markup)
00972         # Close out any unfinished strings and close all the open tags.
00973         self.endData()
00974         while self.currentTag.name != self.ROOT_TAG_NAME:
00975             self.popTag()
00976 
00977     def __getattr__(self, methodName):
00978         """This method routes method call requests to either the SGMLParser
00979         superclass or the Tag superclass, depending on the method name."""
00980         #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
00981 
00982         if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
00983                or methodName.find('do_') == 0:
00984             return SGMLParser.__getattr__(self, methodName)
00985         elif methodName.find('__') != 0:
00986             return Tag.__getattr__(self, methodName)
00987         else:
00988             raise AttributeError
00989 
00990     def isSelfClosingTag(self, name):
00991         """Returns true iff the given string is the name of a
00992         self-closing tag according to this parser."""
00993         return self.SELF_CLOSING_TAGS.has_key(name) \
00994                or self.instanceSelfClosingTags.has_key(name)
00995             
00996     def reset(self):
00997         Tag.__init__(self, self, self.ROOT_TAG_NAME)
00998         self.hidden = 1
00999         SGMLParser.reset(self)
01000         self.currentData = []
01001         self.currentTag = None
01002         self.tagStack = []
01003         self.quoteStack = []
01004         self.pushTag(self)
01005     
01006     def popTag(self):
01007         tag = self.tagStack.pop()
01008         # Tags with just one string-owning child get the child as a
01009         # 'string' property, so that soup.tag.string is shorthand for
01010         # soup.tag.contents[0]
01011         if len(self.currentTag.contents) == 1 and \
01012            isinstance(self.currentTag.contents[0], NavigableString):
01013             self.currentTag.string = self.currentTag.contents[0]
01014 
01015         #print "Pop", tag.name
01016         if self.tagStack:
01017             self.currentTag = self.tagStack[-1]
01018         return self.currentTag
01019 
01020     def pushTag(self, tag):
01021         #print "Push", tag.name
01022         if self.currentTag:
01023             self.currentTag.append(tag)
01024         self.tagStack.append(tag)
01025         self.currentTag = self.tagStack[-1]
01026 
01027     def endData(self, containerClass=NavigableString):
01028         if self.currentData:
01029             currentData = ''.join(self.currentData)
01030             if not currentData.strip():
01031                 if '\n' in currentData:
01032                     currentData = '\n'
01033                 else:
01034                     currentData = ' '
01035             self.currentData = []
01036             if self.parseOnlyThese and len(self.tagStack) <= 1 and \
01037                    (not self.parseOnlyThese.text or \
01038                     not self.parseOnlyThese.search(currentData)):
01039                 return
01040             o = containerClass(currentData)
01041             o.setup(self.currentTag, self.previous)
01042             if self.previous:
01043                 self.previous.next = o
01044             self.previous = o
01045             self.currentTag.contents.append(o)
01046 
01047 
01048     def _popToTag(self, name, inclusivePop=True):
01049         """Pops the tag stack up to and including the most recent
01050         instance of the given tag. If inclusivePop is false, pops the tag
01051         stack up to but *not* including the most recent instance of
01052         the given tag."""
01053         #print "Popping to %s" % name
01054         if name == self.ROOT_TAG_NAME:
01055             return            
01056 
01057         numPops = 0
01058         mostRecentTag = None
01059         for i in range(len(self.tagStack)-1, 0, -1):
01060             if name == self.tagStack[i].name:
01061                 numPops = len(self.tagStack)-i
01062                 break
01063         if not inclusivePop:
01064             numPops = numPops - 1
01065 
01066         for i in range(0, numPops):
01067             mostRecentTag = self.popTag()
01068         return mostRecentTag    
01069 
01070     def _smartPop(self, name):
01071 
01072         """We need to pop up to the previous tag of this type, unless
01073         one of this tag's nesting reset triggers comes between this
01074         tag and the previous tag of this type, OR unless this tag is a
01075         generic nesting trigger and another generic nesting trigger
01076         comes between this tag and the previous tag of this type.
01077 
01078         Examples:
01079          <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
01080          <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
01081          <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
01082          <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
01083 
01084          <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
01085          <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
01086          <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
01087         """
01088 
01089         nestingResetTriggers = self.NESTABLE_TAGS.get(name)
01090         isNestable = nestingResetTriggers != None
01091         isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
01092         popTo = None
01093         inclusive = True
01094         for i in range(len(self.tagStack)-1, 0, -1):
01095             p = self.tagStack[i]
01096             if (not p or p.name == name) and not isNestable:
01097                 #Non-nestable tags get popped to the top or to their
01098                 #last occurrence.
01099                 popTo = name
01100                 break
01101             if (nestingResetTriggers != None
01102                 and p.name in nestingResetTriggers) \
01103                 or (nestingResetTriggers == None and isResetNesting
01104                     and self.RESET_NESTING_TAGS.has_key(p.name)):
01105                 
01106                 #If we encounter one of the nesting reset triggers
01107                 #peculiar to this tag, or we encounter another tag
01108                 #that causes nesting to reset, pop up to but not
01109                 #including that tag.
01110                 popTo = p.name
01111                 inclusive = False
01112                 break
01113             p = p.parent
01114         if popTo:
01115             self._popToTag(popTo, inclusive)
01116 
01117     def unknown_starttag(self, name, attrs, selfClosing=0):
01118         #print "Start tag %s: %s" % (name, attrs)
01119         if self.quoteStack:
01120             #This is not a real tag.
01121             #print "<%s> is not real!" % name
01122             attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
01123             self.handle_data('<%s%s>' % (name, attrs))
01124             return        
01125         self.endData()
01126 
01127         if not self.isSelfClosingTag(name) and not selfClosing:
01128             self._smartPop(name)
01129 
01130         if self.parseOnlyThese and len(self.tagStack) <= 1 \
01131                and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
01132             return
01133 
01134         tag = Tag(self, name, attrs, self.currentTag, self.previous)
01135         if self.previous:
01136             self.previous.next = tag
01137         self.previous = tag
01138         self.pushTag(tag)
01139         if selfClosing or self.isSelfClosingTag(name):
01140             self.popTag()                
01141         if name in self.QUOTE_TAGS:
01142             #print "Beginning quote (%s)" % name
01143             self.quoteStack.append(name)
01144             self.literal = 1
01145         return tag
01146 
01147     def unknown_endtag(self, name):
01148         #print "End tag %s" % name
01149         if self.quoteStack and self.quoteStack[-1] != name:
01150             #This is not a real end tag.
01151             #print "</%s> is not real!" % name
01152             self.handle_data('</%s>' % name)
01153             return
01154         self.endData()
01155         self._popToTag(name)
01156         if self.quoteStack and self.quoteStack[-1] == name:
01157             self.quoteStack.pop()
01158             self.literal = (len(self.quoteStack) > 0)
01159 
01160     def handle_data(self, data):
01161         self.currentData.append(data)
01162 
01163     def _toStringSubclass(self, text, subclass):
01164         """Adds a certain piece of text to the tree as a NavigableString
01165         subclass."""
01166         self.endData()
01167         self.handle_data(text)
01168         self.endData(subclass)
01169 
01170     def handle_pi(self, text):
01171         """Handle a processing instruction as a ProcessingInstruction
01172         object, possibly one with a %SOUP-ENCODING% slot into which an
01173         encoding will be plugged later."""
01174         if text[:3] == "xml":
01175             text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
01176         self._toStringSubclass(text, ProcessingInstruction)
01177 
01178     def handle_comment(self, text):
01179         "Handle comments as Comment objects."
01180         self._toStringSubclass(text, Comment)
01181 
01182     def handle_charref(self, ref):
01183         "Handle character references as data."
01184         if self.convertEntities in [self.HTML_ENTITIES,
01185                                     self.XML_ENTITIES]:
01186             data = unichr(int(ref))
01187         else:
01188             data = '&#%s;' % ref
01189         self.handle_data(data)
01190 
01191     def handle_entityref(self, ref):
01192         """Handle entity references as data, possibly converting known
01193         HTML entity references to the corresponding Unicode
01194         characters."""
01195         data = None
01196         if self.convertEntities == self.HTML_ENTITIES or \
01197                (self.convertEntities == self.XML_ENTITIES and \
01198                 self.XML_ENTITY_LIST.get(ref)):
01199             try:
01200                 data = unichr(name2codepoint[ref])
01201             except KeyError:
01202                 pass
01203         if not data:
01204             data = '&%s;' % ref
01205         self.handle_data(data)
01206         
01207     def handle_decl(self, data):
01208         "Handle DOCTYPEs and the like as Declaration objects."
01209         self._toStringSubclass(data, Declaration)
01210 
01211     def parse_declaration(self, i):
01212         """Treat a bogus SGML declaration as raw data. Treat a CDATA
01213         declaration as a CData object."""
01214         j = None
01215         if self.rawdata[i:i+9] == '<![CDATA[':
01216             k = self.rawdata.find(']]>', i)
01217             if k == -1:
01218                 k = len(self.rawdata)
01219             data = self.rawdata[i+9:k]
01220             j = k+3
01221             self._toStringSubclass(data, CData)
01222         else:
01223             try:
01224                 j = SGMLParser.parse_declaration(self, i)
01225             except SGMLParseError:
01226                 toHandle = self.rawdata[i:]
01227                 self.handle_data(toHandle)
01228                 j = i + len(toHandle)
01229         return j
01230 
01231 class BeautifulSoup(BeautifulStoneSoup):
01232 
01233     """This parser knows the following facts about HTML:
01234 
01235     * Some tags have no closing tag and should be interpreted as being
01236       closed as soon as they are encountered.
01237 
01238     * The text inside some tags (ie. 'script') may contain tags which
01239       are not really part of the document and which should be parsed
01240       as text, not tags. If you want to parse the text as tags, you can
01241       always fetch it and parse it explicitly.
01242 
01243     * Tag nesting rules:
01244 
01245       Most tags can't be nested at all. For instance, the occurrence of
01246       a <p> tag should implicitly close the previous <p> tag.
01247 
01248        <p>Para1<p>Para2
01249         should be transformed into:
01250        <p>Para1</p><p>Para2
01251 
01252       Some tags can be nested arbitrarily. For instance, the occurrence
01253       of a <blockquote> tag should _not_ implicitly close the previous
01254       <blockquote> tag.
01255 
01256        Alice said: <blockquote>Bob said: <blockquote>Blah
01257         should NOT be transformed into:
01258        Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
01259 
01260       Some tags can be nested, but the nesting is reset by the
01261       interposition of other tags. For instance, a <tr> tag should
01262       implicitly close the previous <tr> tag within the same <table>,
01263       but not close a <tr> tag in another table.
01264 
01265        <table><tr>Blah<tr>Blah
01266         should be transformed into:
01267        <table><tr>Blah</tr><tr>Blah
01268         but,
01269        <tr>Blah<table><tr>Blah
01270         should NOT be transformed into
01271        <tr>Blah<table></tr><tr>Blah
01272 
01273     Differing assumptions about tag nesting rules are a major source
01274     of problems with the BeautifulSoup class. If BeautifulSoup is not
01275     treating as nestable a tag your page author treats as nestable,
01276     try ICantBelieveItsBeautifulSoup, MinimalSoup, or
01277     BeautifulStoneSoup before writing your own subclass."""
01278 
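    # Illustrative sketch of the nesting heuristics described above:
    #
    #     from BeautifulSoup import BeautifulSoup
    #     print BeautifulSoup('<p>Para1<p>Para2')
    #     # -> <p>Para1</p><p>Para2</p>
    #     print BeautifulSoup('<blockquote>A<blockquote>B')
    #     # -> <blockquote>A<blockquote>B</blockquote></blockquote>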
01279     def __init__(self, *args, **kwargs):
01280         if not kwargs.has_key('smartQuotesTo'):
01281             kwargs['smartQuotesTo'] = self.HTML_ENTITIES
01282         BeautifulStoneSoup.__init__(self, *args, **kwargs)
01283 
01284     SELF_CLOSING_TAGS = buildTagMap(None,
01285                                     ['br' , 'hr', 'input', 'img', 'meta',
01286                                     'spacer', 'link', 'frame', 'base'])
01287 
01288     QUOTE_TAGS = {'script': None}
01289     
01290     #According to the HTML standard, each of these inline tags can
01291     #contain another tag of the same type. Furthermore, it's common
01292     #to actually use these tags this way.
01293     NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
01294                             'center']
01295 
01296     #According to the HTML standard, these block tags can contain
01297     #another tag of the same type. Furthermore, it's common
01298     #to actually use these tags this way.
01299     NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
01300 
01301     #Lists can contain other lists, but there are restrictions.    
01302     NESTABLE_LIST_TAGS = { 'ol' : [],
01303                            'ul' : [],
01304                            'li' : ['ul', 'ol'],
01305                            'dl' : [],
01306                            'dd' : ['dl'],
01307                            'dt' : ['dl'] }
01308 
01309     #Tables can contain other tables, but there are restrictions.    
01310     NESTABLE_TABLE_TAGS = {'table' : [], 
01311                            'tr' : ['table', 'tbody', 'tfoot', 'thead'],
01312                            'td' : ['tr'],
01313                            'th' : ['tr'],
01314                            'thead' : ['table'],
01315                            'tbody' : ['table'],
01316                            'tfoot' : ['table'],
01317                            }
01318 
01319     NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
01320 
01321     #If one of these tags is encountered, all tags up to the next tag of
01322     #this type are popped.
01323     RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
01324                                      NON_NESTABLE_BLOCK_TAGS,
01325                                      NESTABLE_LIST_TAGS,
01326                                      NESTABLE_TABLE_TAGS)
01327 
01328     NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
01329                                 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
01330 
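    # Example (sketch of how the list rules above play out, assuming this
    # module is importable as ``BeautifulSoup``): a second <li> closes the
    # previous one, unless an intervening <ul> or <ol> has reset the nesting.
    #
    #   >>> from BeautifulSoup import BeautifulSoup
    #   >>> str(BeautifulSoup('<ul><li>A<li>B'))
    #   '<ul><li>A</li><li>B</li></ul>'
    #   >>> str(BeautifulSoup('<ul><li>A<ul><li>A1'))
    #   '<ul><li>A<ul><li>A1</li></ul></li></ul>'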
01331     # Used to detect the charset in a META tag; see start_meta
01332     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
01333 
01334     def start_meta(self, attrs):
01335         """Beautiful Soup can detect a charset included in a META tag,
01336         try to convert the document to that charset, and re-parse the
01337         document from the beginning."""
01338         httpEquiv = None
01339         contentType = None
01340         contentTypeIndex = None
01341         tagNeedsEncodingSubstitution = False
01342 
01343         for i in range(0, len(attrs)):
01344             key, value = attrs[i]
01345             key = key.lower()
01346             if key == 'http-equiv':
01347                 httpEquiv = value
01348             elif key == 'content':
01349                 contentType = value
01350                 contentTypeIndex = i
01351 
01352         if httpEquiv and contentType: # It's an interesting meta tag.
01353             match = self.CHARSET_RE.search(contentType)
01354             if match:
01355                 if getattr(self, 'declaredHTMLEncoding', None) or \
01356                        (self.originalEncoding == self.fromEncoding):
01357                     # This is our second pass through the document, or
01358                     # else an encoding was specified explicitly and it
01359                     # worked. Rewrite the meta tag.
01360                     newAttr = self.CHARSET_RE.sub\
01361                               (lambda(match):match.group(1) +
01362                                "%SOUP-ENCODING%", value)
01363                     attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
01364                                                newAttr)
01365                     tagNeedsEncodingSubstitution = True
01366                 else:
01367                     # This is our first pass through the document.
01368                     # Go through it again with the new information.
01369                     newCharset = match.group(3)
01370                     if newCharset and newCharset != self.originalEncoding:
01371                         self.declaredHTMLEncoding = newCharset
01372                         self._feed(self.declaredHTMLEncoding)
01373                         raise StopParsing
01374         tag = self.unknown_starttag("meta", attrs)
01375         if tag and tagNeedsEncodingSubstitution:
01376             tag.containsSubstitutions = True
01377 
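# Example (illustrative sketch, assuming this module is importable as
# ``BeautifulSoup``): when a META tag declares a charset, the document is
# re-parsed in that encoding and the tag is rewritten so that prettify()
# can advertise whatever encoding you output to.
#
#   >>> from BeautifulSoup import BeautifulSoup
#   >>> doc = ('<html><head><meta http-equiv="Content-Type" '
#   ...        'content="text/html; charset=iso-8859-1"></head>'
#   ...        '<body>Sacr\xe9 bleu!</body></html>')
#   >>> soup = BeautifulSoup(doc)
#   >>> soup.originalEncoding
#   'iso-8859-1'
#   >>> 'charset=utf-8' in soup.prettify('utf-8')
#   True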
01378 class StopParsing(Exception):
01379     pass
01380    
01381 class ICantBelieveItsBeautifulSoup(BeautifulSoup):
01382 
01383     """The BeautifulSoup class is oriented towards skipping over
01384     common HTML errors like unclosed tags. However, sometimes it makes
01385     errors of its own. For instance, consider this fragment:
01386 
01387      <b>Foo<b>Bar</b></b>
01388 
01389     This is perfectly valid (if bizarre) HTML. However, the
01390     BeautifulSoup class will implicitly close the first b tag when it
01391     encounters the second 'b'. It will think the author wrote
01392     "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
01393     there's no real-world reason to bold something that's already
01394     bold. When it encounters '</b></b>' it will close two more 'b'
01395     tags, for a grand total of three tags closed instead of two. This
01396     can throw off the rest of your document structure. The same is
01397     true of a number of other tags, listed below.
01398 
01399     It's much more common for someone to forget to close a 'b' tag
01400     than to actually use nested 'b' tags, and the BeautifulSoup class
01401     handles the common case. This class handles the not-so-common
01402     case: where you can't believe someone wrote what they did, but
01403     it's valid HTML and BeautifulSoup screwed up by assuming it
01404     wouldn't be."""
01405 
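    # Example (illustrative sketch, assuming this module is importable as
    # ``BeautifulSoup``): the same fragment parsed by both classes.
    #
    #   >>> from BeautifulSoup import BeautifulSoup, ICantBelieveItsBeautifulSoup
    #   >>> str(BeautifulSoup('<b>Foo<b>Bar</b></b>'))
    #   '<b>Foo</b><b>Bar</b>'
    #   >>> str(ICantBelieveItsBeautifulSoup('<b>Foo<b>Bar</b></b>'))
    #   '<b>Foo<b>Bar</b></b>'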
01406     I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
01407      ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
01408       'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
01409       'big']
01410 
01411     I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
01412 
01413     NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
01414                                 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
01415                                 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
01416 
01417 class MinimalSoup(BeautifulSoup):
01418     """The MinimalSoup class is for parsing HTML that contains
01419     pathologically bad markup. It makes no assumptions about tag
01420     nesting, but it does know which tags are self-closing, that
01421     <script> tags contain Javascript and should not be parsed, that
01422     META tags may contain encoding information, and so on.
01423 
01424     This also makes it better for subclassing than BeautifulStoneSoup
01425     or BeautifulSoup."""
01426     
01427     RESET_NESTING_TAGS = buildTagMap('noscript')
01428     NESTABLE_TAGS = {}
01429 
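# Example (illustrative sketch, assuming this module is importable as
# ``BeautifulSoup``): MinimalSoup applies no nesting rules, but it still
# honours self-closing tags and keeps the contents of <script> as text.
#
#   >>> from BeautifulSoup import MinimalSoup
#   >>> soup = MinimalSoup('<br><script>if (a < b) go();</script>')
#   >>> soup.script.string
#   u'if (a < b) go();'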
01430 class BeautifulSOAP(BeautifulStoneSoup):
01431     """This class will push a tag with only a single string child into
01432     the tag's parent as an attribute. The attribute's name is the tag
01433     name, and the value is the string child. An example should give
01434     the flavor of the change:
01435 
01436     <foo><bar>baz</bar></foo>
01437      =>
01438     <foo bar="baz"><bar>baz</bar></foo>
01439 
01440     You can then access fooTag['bar'] instead of fooTag.barTag.string.
01441 
01442     This is, of course, useful for scraping structures that tend to
01443     use subelements instead of attributes, such as SOAP messages. Note
01444     that it modifies its input, so don't print the modified version
01445     out.
01446 
01447     I'm not sure how many people really want to use this class; let me
01448     know if you do. Mainly I like the name."""
01449 
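    # Example (illustrative sketch, assuming this module is importable as
    # ``BeautifulSoup``):
    #
    #   >>> from BeautifulSoup import BeautifulSOAP
    #   >>> soup = BeautifulSOAP('<foo><bar>baz</bar></foo>')
    #   >>> soup.foo['bar']
    #   u'baz'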
01450     def popTag(self):
01451         if len(self.tagStack) > 1:
01452             tag = self.tagStack[-1]
01453             parent = self.tagStack[-2]
01454             parent._getAttrMap()
01455             if (isinstance(tag, Tag) and len(tag.contents) == 1 and
01456                 isinstance(tag.contents[0], NavigableString) and 
01457                 not parent.attrMap.has_key(tag.name)):
01458                 parent[tag.name] = tag.contents[0]
01459         BeautifulStoneSoup.popTag(self)
01460 
01461 #Enterprise class names! It has come to our attention that some people
01462 #think the names of the Beautiful Soup parser classes are too silly
01463 #and "unprofessional" for use in enterprise screen-scraping. We feel
01464 #your pain! For such-minded folk, the Beautiful Soup Consortium And
01465 #All-Night Kosher Bakery recommends renaming this file to
01466 #"RobustParser.py" (or, in cases of extreme enterprisness,
01467 #"RobustParserBeanInterface.class") and using the following
01468 #enterprise-friendly class aliases:
01469 class RobustXMLParser(BeautifulStoneSoup):
01470     pass
01471 class RobustHTMLParser(BeautifulSoup):
01472     pass
01473 class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
01474     pass
01475 class RobustInsanelyWackAssHTMLParser(MinimalSoup):
01476     pass
01477 class SimplifyingSOAPParser(BeautifulSOAP):
01478     pass
01479 
01480 ######################################################
01481 #
01482 # Bonus library: Unicode, Dammit
01483 #
01484 # This class forces XML data into a standard format (usually to UTF-8
01485 # or Unicode).  It is heavily based on code from Mark Pilgrim's
01486 # Universal Feed Parser. It does not rewrite the XML or HTML to
01487 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
01488 # (XML) and BeautifulSoup.start_meta (HTML).
01489 
01490 # Autodetects character encodings.
01491 # Download from http://chardet.feedparser.org/
01492 try:
01493     import chardet
01494 #    import chardet.constants
01495 #    chardet.constants._debug = 1
01496 except:
01497     chardet = None
01499 
01500 # cjkcodecs and iconv_codec make Python know about more character encodings.
01501 # Both are available from http://cjkpython.i18n.org/
01502 # They're built in if you use Python 2.4.
01503 try:
01504     import cjkcodecs.aliases
01505 except:
01506     pass
01507 try:
01508     import iconv_codec
01509 except:
01510     pass
01511 
01512 class UnicodeDammit:
01513     """A class for detecting the encoding of a *ML document and
01514     converting it to a Unicode string. If the source encoding is
01515     windows-1252, can replace MS smart quotes with their HTML or XML
01516     equivalents."""
01517 
01518     # This dictionary maps commonly seen values for "charset" in HTML
01519     # meta tags to the corresponding Python codec names. It only covers
01520     # values that aren't in Python's aliases and can't be determined
01521     # by the heuristics in find_codec.
01522     CHARSET_ALIASES = { "macintosh" : "mac-roman",
01523                         "x-sjis" : "shift-jis" }
01524 
01525     def __init__(self, markup, overrideEncodings=[],
01526                  smartQuotesTo='xml'):
01527         self.markup, documentEncoding, sniffedEncoding = \
01528                      self._detectEncoding(markup)
01529         self.smartQuotesTo = smartQuotesTo
01530         self.triedEncodings = []
01531         if markup == '' or isinstance(markup, unicode):
01532             self.originalEncoding = None
01533             self.unicode = unicode(markup)            
01534             return
01535         
01536         u = None
01537         for proposedEncoding in overrideEncodings:
01538             u = self._convertFrom(proposedEncoding)
01539             if u: break
01540         if not u:
01541             for proposedEncoding in (documentEncoding, sniffedEncoding):
01542                 u = self._convertFrom(proposedEncoding)
01543                 if u: break
01544                 
01545         # If no luck and we have an auto-detection library (chardet), try that:
01546         if not u and chardet and not isinstance(self.markup, unicode):
01547             u = self._convertFrom(chardet.detect(self.markup)['encoding'])
01548 
01549         # As a last resort, try utf-8 and windows-1252:
01550         if not u:
01551             for proposed_encoding in ("utf-8", "windows-1252"):
01552                 u = self._convertFrom(proposed_encoding)
01553                 if u: break
01554         self.unicode = u
01555         if not u: self.originalEncoding = None
01556 
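    # Example (illustrative sketch, assuming this module is importable as
    # ``BeautifulSoup``): encodings passed in overrideEncodings are tried
    # before anything sniffed from the document itself.
    #
    #   >>> from BeautifulSoup import UnicodeDammit
    #   >>> dammit = UnicodeDammit('Sacr\xe9 bleu!', ['iso-8859-1'])
    #   >>> dammit.unicode
    #   u'Sacr\xe9 bleu!'
    #   >>> dammit.originalEncoding
    #   'iso-8859-1'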
01557     def _subMSChar(self, orig):
01558         """Changes a MS smart quote character to an XML or HTML
01559         entity."""
01560         sub = self.MS_CHARS.get(orig)
01561         if type(sub) == types.TupleType:
01562             if self.smartQuotesTo == 'xml':
01563                 sub = '&#x%s;' % sub[1]
01564             else:
01565                 sub = '&%s;' % sub[0]
01566         return sub            
01567 
01568     def _convertFrom(self, proposed):        
01569         proposed = self.find_codec(proposed)
01570         if not proposed or proposed in self.triedEncodings:
01571             return None
01572         self.triedEncodings.append(proposed)
01573         markup = self.markup
01574 
01575         # Convert smart quotes to HTML if coming from an encoding
01576         # that might have them.
01577         if self.smartQuotesTo and proposed.lower() in("windows-1252",
01578                                                       "iso-8859-1",
01579                                                       "iso-8859-2"):
01580             markup = re.compile("([\x80-\x9f])").sub \
01581                      (lambda(x): self._subMSChar(x.group(1)),
01582                       markup)
01583 
01584         try:
01585             # print "Trying to convert document to %s" % proposed
01586             u = self._toUnicode(markup, proposed)
01587             self.markup = u       
01588             self.originalEncoding = proposed
01589         except Exception, e:
01590             # print "That didn't work!"
01591             # print e
01592             return None        
01593         #print "Correct encoding: %s" % proposed
01594         return self.markup
01595 
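    # Example (illustrative sketch, assuming this module is importable as
    # ``BeautifulSoup``): bytes in the \x80-\x9f range coming from a
    # windows-1252-like encoding become entities rather than raw characters.
    #
    #   >>> from BeautifulSoup import UnicodeDammit
    #   >>> UnicodeDammit('\x93Hello\x94', ['windows-1252']).unicode
    #   u'&#x201C;Hello&#x201D;'
    #   >>> UnicodeDammit('\x93Hello\x94', ['windows-1252'],
    #   ...               smartQuotesTo='html').unicode
    #   u'&ldquo;Hello&rdquo;'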
01596     def _toUnicode(self, data, encoding):
01597         '''Given a string and its encoding, decodes the string into Unicode.
01598         %encoding is a string recognized by encodings.aliases'''
01599 
01600         # strip Byte Order Mark (if present)
01601         if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
01602                and (data[2:4] != '\x00\x00'):
01603             encoding = 'utf-16be'
01604             data = data[2:]
01605         elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
01606                  and (data[2:4] != '\x00\x00'):
01607             encoding = 'utf-16le'
01608             data = data[2:]
01609         elif data[:3] == '\xef\xbb\xbf':
01610             encoding = 'utf-8'
01611             data = data[3:]
01612         elif data[:4] == '\x00\x00\xfe\xff':
01613             encoding = 'utf-32be'
01614             data = data[4:]
01615         elif data[:4] == '\xff\xfe\x00\x00':
01616             encoding = 'utf-32le'
01617             data = data[4:]
01618         newdata = unicode(data, encoding)
01619         return newdata
01620     
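    # Example (illustrative sketch, assuming this module is importable as
    # ``BeautifulSoup``): a UTF-8 byte order mark is stripped and settles the
    # encoding question on its own.
    #
    #   >>> from BeautifulSoup import UnicodeDammit
    #   >>> d = UnicodeDammit('\xef\xbb\xbfhello')
    #   >>> d.unicode, d.originalEncoding
    #   (u'hello', 'utf-8')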
01621     def _detectEncoding(self, xml_data):
01622         """Given a document, tries to detect its XML encoding."""
01623         xml_encoding = sniffed_xml_encoding = None
01624         try:
01625             if xml_data[:4] == '\x4c\x6f\xa7\x94':
01626                 # EBCDIC
01627                 xml_data = self._ebcdic_to_ascii(xml_data)
01628             elif xml_data[:4] == '\x00\x3c\x00\x3f':
01629                 # UTF-16BE
01630                 sniffed_xml_encoding = 'utf-16be'
01631                 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
01632             elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
01633                      and (xml_data[2:4] != '\x00\x00'):
01634                 # UTF-16BE with BOM
01635                 sniffed_xml_encoding = 'utf-16be'
01636                 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
01637             elif xml_data[:4] == '\x3c\x00\x3f\x00':
01638                 # UTF-16LE
01639                 sniffed_xml_encoding = 'utf-16le'
01640                 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
01641             elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
01642                      (xml_data[2:4] != '\x00\x00'):
01643                 # UTF-16LE with BOM
01644                 sniffed_xml_encoding = 'utf-16le'
01645                 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
01646             elif xml_data[:4] == '\x00\x00\x00\x3c':
01647                 # UTF-32BE
01648                 sniffed_xml_encoding = 'utf-32be'
01649                 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
01650             elif xml_data[:4] == '\x3c\x00\x00\x00':
01651                 # UTF-32LE
01652                 sniffed_xml_encoding = 'utf-32le'
01653                 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
01654             elif xml_data[:4] == '\x00\x00\xfe\xff':
01655                 # UTF-32BE with BOM
01656                 sniffed_xml_encoding = 'utf-32be'
01657                 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
01658             elif xml_data[:4] == '\xff\xfe\x00\x00':
01659                 # UTF-32LE with BOM
01660                 sniffed_xml_encoding = 'utf-32le'
01661                 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
01662             elif xml_data[:3] == '\xef\xbb\xbf':
01663                 # UTF-8 with BOM
01664                 sniffed_xml_encoding = 'utf-8'
01665                 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
01666             else:
01667                 sniffed_xml_encoding = 'ascii'
01668                 pass
01669             xml_encoding_match = re.compile \
01670                                  ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
01671                                  .match(xml_data)
01672         except:
01673             xml_encoding_match = None
01674         if xml_encoding_match:
01675             xml_encoding = xml_encoding_match.groups()[0].lower()
01676             if sniffed_xml_encoding and \
01677                (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
01678                                  'iso-10646-ucs-4', 'ucs-4', 'csucs4',
01679                                  'utf-16', 'utf-32', 'utf_16', 'utf_32',
01680                                  'utf16', 'u16')):
01681                 xml_encoding = sniffed_xml_encoding
01682         return xml_data, xml_encoding, sniffed_xml_encoding
01683 
01684 
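    # Example (illustrative sketch, assuming this module is importable as
    # ``BeautifulSoup``): the declared XML encoding and the sniffed one are
    # returned separately.
    #
    #   >>> from BeautifulSoup import UnicodeDammit
    #   >>> data = '<?xml version="1.0" encoding="ISO-8859-1"?><a/>'
    #   >>> UnicodeDammit('')._detectEncoding(data)[1:]
    #   ('iso-8859-1', 'ascii')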
01685     def find_codec(self, charset):
01686         return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
01687                or (charset and self._codec(charset.replace("-", ""))) \
01688                or (charset and self._codec(charset.replace("-", "_"))) \
01689                or charset
01690 
01691     def _codec(self, charset):
01692         if not charset: return charset 
01693         codec = None
01694         try:
01695             codecs.lookup(charset)
01696             codec = charset
01697         except LookupError:
01698             pass
01699         return codec
01700 
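    # Example (illustrative sketch, assuming this module is importable as
    # ``BeautifulSoup``): find_codec() maps a charset label to something
    # codecs.lookup() accepts, via CHARSET_ALIASES or simple rewrites.
    #
    #   >>> from BeautifulSoup import UnicodeDammit
    #   >>> d = UnicodeDammit('')
    #   >>> d.find_codec('x-sjis')
    #   'shift-jis'
    #   >>> d.find_codec('UTF8')
    #   'UTF8'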
01701     EBCDIC_TO_ASCII_MAP = None
01702     def _ebcdic_to_ascii(self, s):
01703         c = self.__class__
01704         if not c.EBCDIC_TO_ASCII_MAP:
01705             emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
01706                     16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
01707                     128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
01708                     144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
01709                     32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
01710                     38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
01711                     45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
01712                     186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
01713                     195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
01714                     201,202,106,107,108,109,110,111,112,113,114,203,204,205,
01715                     206,207,208,209,126,115,116,117,118,119,120,121,122,210,
01716                     211,212,213,214,215,216,217,218,219,220,221,222,223,224,
01717                     225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
01718                     73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
01719                     82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
01720                     90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
01721                     250,251,252,253,254,255)
01722             import string
01723             c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
01724             ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
01725         return s.translate(c.EBCDIC_TO_ASCII_MAP)
01726 
01727     MS_CHARS = { '\x80' : ('euro', '20AC'),
01728                  '\x81' : ' ',
01729                  '\x82' : ('sbquo', '201A'),
01730                  '\x83' : ('fnof', '192'),
01731                  '\x84' : ('bdquo', '201E'),
01732                  '\x85' : ('hellip', '2026'),
01733                  '\x86' : ('dagger', '2020'),
01734                  '\x87' : ('Dagger', '2021'),
01735                  '\x88' : ('circ', '2C6'),
01736                  '\x89' : ('permil', '2030'),
01737                  '\x8A' : ('Scaron', '160'),
01738                  '\x8B' : ('lsaquo', '2039'),
01739                  '\x8C' : ('OElig', '152'),
01740                  '\x8D' : '?',
01741                  '\x8E' : ('#x17D', '17D'),
01742                  '\x8F' : '?',
01743                  '\x90' : '?',
01744                  '\x91' : ('lsquo', '2018'),
01745                  '\x92' : ('rsquo', '2019'),
01746                  '\x93' : ('ldquo', '201C'),
01747                  '\x94' : ('rdquo', '201D'),
01748                  '\x95' : ('bull', '2022'),
01749                  '\x96' : ('ndash', '2013'),
01750                  '\x97' : ('mdash', '2014'),
01751                  '\x98' : ('tilde', '2DC'),
01752                  '\x99' : ('trade', '2122'),
01753                  '\x9a' : ('scaron', '161'),
01754                  '\x9b' : ('rsaquo', '203A'),
01755                  '\x9c' : ('oelig', '153'),
01756                  '\x9d' : '?',
01757                  '\x9e' : ('#x17E', '17E'),
01758                  '\x9f' : ('Yuml', '178'),}
01759 
01760 #######################################################################
01761 
01762 
01763 #By default, act as an HTML pretty-printer.
01764 if __name__ == '__main__':
01765     import sys
01766     soup = BeautifulSoup(sys.stdin.read())
01767     print soup.prettify()
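# Example invocation from a shell (assuming this file is saved as
# BeautifulSoup.py):
#
#   python BeautifulSoup.py < messy.html > pretty.html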

