00001 """Beautiful Soup
00002 Elixir and Tonic
00003 "The Screen-Scraper's Friend"
00004 http://www.crummy.com/software/BeautifulSoup/
00005
00006 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
00007 tree representation. It provides methods and Pythonic idioms that make
00008 it easy to navigate, search, and modify the tree.
00009
00010 A well-formed XML/HTML document yields a well-formed data
00011 structure. An ill-formed XML/HTML document yields a correspondingly
00012 ill-formed data structure. If your document is only locally
00013 well-formed, you can use this library to find and process the
well-formed part of it. The BeautifulSoup class has web browser-like
heuristics for obtaining a sensible parse tree in the face of common
HTML errors.
00015
00016 Beautiful Soup works with Python 2.2 and up. It has no external
00017 dependencies, but you'll have more success at converting data to UTF-8
00018 if you also install these three packages:
00019
00020 * chardet, for auto-detecting character encodings
00021 http://chardet.feedparser.org/
00022 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
00023 by stock Python.
00024 http://cjkpython.i18n.org/
00025
00026 Beautiful Soup defines classes for two main parsing strategies:
00027
00028 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
00029 language that kind of looks like XML.
00030
00031 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
00032 or invalid. This class has web browser-like heuristics for
00033 obtaining a sensible parse tree in the face of common HTML errors.
00034
00035 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
00036 the encoding of an HTML or XML document, and converting it to
00037 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
00038
00039 For more than you ever wanted to know about Beautiful Soup, see the
00040 documentation:
00041 http://www.crummy.com/software/BeautifulSoup/documentation.html
00042
00043 """
00044 from __future__ import generators
00045
00046 __author__ = "Leonard Richardson (leonardr@segfault.org)"
00047 __version__ = "3.0.4"
00048 __copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
00049 __license__ = "PSF"
00050
00051 from sgmllib import SGMLParser, SGMLParseError
00052 import codecs
00053 import types
00054 import re
00055 import sgmllib
00056 try:
00057 from htmlentitydefs import name2codepoint
00058 except ImportError:
00059 name2codepoint = {}
00060
00061
# Replace the tag-name pattern used by sgmllib with one that also accepts
# ':' inside a name (presumably so that namespaced names such as "ns:tag"
# are read as a single tag name -- confirm against sgmllib's default).
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')

# Encoding used as the default 'encoding' argument of the rendering
# methods in this module.
DEFAULT_OUTPUT_ENCODING = "utf-8"
00065
00066
00067
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Every element keeps five links: 'parent', 'previousSibling' and
    'nextSibling' describe the tree, while 'previous' and 'next' chain
    all elements together in the order they were parsed."""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        'parent' is the element that will contain this one and
        'previous' is the element parsed just before it. If the parent
        already has children, this element becomes the next sibling of
        the last one. Note that this method does not add the element
        to parent.contents; the caller is expected to do that."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent is not None and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def _indexByIdentity(self, sequence, element):
        """Returns the position of the exact object 'element' in
        'sequence', or -1 if it is not there.

        list.index() and list.remove() compare with ==, so they can
        pick a different child that merely compares equal (two text
        nodes with the same text, or two tags with the same markup).
        Tree surgery must always operate on the object itself."""
        for i in range(len(sequence)):
            if sequence[i] is element:
                return i
        return -1

    def replaceWith(self, replaceWith):
        """Removes this element from the tree and puts 'replaceWith'
        in the position it occupied.

        'replaceWith' may be a plain string, a new element, or an
        element that already lives somewhere in a tree (in which case
        it is moved). Raises ValueError if this element is not among
        its parent's contents."""
        oldParent = self.parent
        myIndex = self._indexByIdentity(oldParent.contents, self)
        if myIndex < 0:
            raise ValueError("element is not in its parent's contents")
        if hasattr(replaceWith, 'parent') and replaceWith.parent is oldParent:
            # We're replacing this element with one of its siblings.
            index = self._indexByIdentity(oldParent.contents, replaceWith)
            if 0 <= index < myIndex:
                # The sibling comes before this element, so once it is
                # extracted by insert() every later index (including
                # ours) moves down by one. Index 0 counts too.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree.

        The element keeps its own descendants, but is unlinked from
        its parent, its siblings and the parse-order chain."""
        if self.parent is not None:
            siblings = self.parent.contents
            index = self._indexByIdentity(siblings, self)
            if index >= 0:
                del siblings[index]

        # Find the two elements that would be next to each other if
        # this element (and its children) hadn't been parsed. Connect
        # the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous is not None:
            self.previous.next = nextElement
        if nextElement is not None:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling is not None:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling is not None:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts 'newChild' into this element's contents at
        'position' and rewires all sibling and parse-order links.

        A plain string is wrapped in a NavigableString first. If
        'newChild' is already part of a tree it is extracted from its
        old location before being inserted, and 'position' is then
        interpreted against the contents as they are after that
        extraction. Positions past the end mean "append"."""
        if isinstance(newChild, basestring) \
           and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that already has a home
            # (possibly in this very element). Take it out first.
            newChild.extract()
        # Clamp only after the extraction: removing one of our own
        # children shortens the list we are about to index into.
        position = min(position, len(self.contents))

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous is not None:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The element parsed after the new child is the next
            # sibling of the closest ancestor that has one.
            parent = self
            parentsNextSibling = None
            while parentsNextSibling is None:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if parent is None:
                    # This is the last element in the document.
                    break
            newChildsLastElement.next = parentsNextSibling
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next is not None:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria, or None if there is no such parent."""
        # findParents() takes no 'text' argument, so _findOne() can't
        # be used here.
        r = None
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""
        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls the given find-all style method with a limit of one
        and returns the single result, or None if nothing matched."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        'name' may be a ready-made SoupStrainer, in which case the
        other criteria are ignored. Returns a ResultSet holding at
        most 'limit' matches (all of them if 'limit' is false)."""
        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The generators finish by yielding None; skip it.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags. Each one yields the elements after
    #the starting point and then a final None.

    def nextGenerator(self):
        "Walks forward through the document in parse order."
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        "Walks over the siblings that follow this element."
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        "Walks backward through the document in parse order."
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        "Walks over the siblings that precede this element."
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        "Walks up through the ancestors of this element."
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods

    def substituteEncoding(self, str, encoding=None):
        """Replaces the %SOUP-ENCODING% placeholder in the given string
        with the name of the encoding ("utf-8" if none is given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to
        Unicode if no encoding is given."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
00350
class NavigableString(unicode, PageElement):
    """A piece of document text: a Unicode string that also carries
    the navigation links of a PageElement."""

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
        return self

    def __unicode__(self):
        "Returns the text unencoded."
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text encoded in the given encoding, or the
        string itself if the encoding is None (or otherwise false)."""
        if not encoding:
            return self
        return self.encode(encoding)
00370
class CData(NavigableString):
    "A string that is rendered inside a <![CDATA[ ... ]]> section."

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Returns the text wrapped in CDATA markers."
        text = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % text
00375
class ProcessingInstruction(NavigableString):
    "A string that is rendered as a <? ... ?> processing instruction."

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the instruction wrapped in <? ?>, with any
        %SOUP-ENCODING% placeholder replaced by the encoding name."""
        body = self
        if "%SOUP-ENCODING%" in body:
            body = self.substituteEncoding(body, encoding)
        return "<?%s?>" % self.toEncoding(body, encoding)
00382
class Comment(NavigableString):
    "A string that is rendered as a <!-- ... --> comment."

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Returns the text wrapped in comment markers."
        text = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % text
00386
class Declaration(NavigableString):
    "A string that is rendered as a <! ... > declaration."

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Returns the text wrapped in declaration markers."
        text = NavigableString.__str__(self, encoding)
        return "<!%s>" % text
00390
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    # The five characters with predefined entities in XML, mapped to
    # the names of those entities.
    XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "apos",
                                      '"' : "quot",
                                      "&" : "amp",
                                      "<" : "lt",
                                      ">" : "gt" }

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        """Basic constructor.

        'parser' is asked whether 'name' is a self-closing tag; only
        its class is remembered, not the parser object itself. 'attrs'
        is a list of (key, value) pairs. 'parent' and 'previous' are
        handed to setup()."""
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        "Returns true iff the tag has a 'key' attribute."
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "'x in tag' is true iff x is one of the tag's direct children."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        attrMap = self._getAttrMap()
        attrMap[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Rebuild the list in place instead of calling remove() while
        # iterating, which skips the entry after each removed one.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """Called for attributes that aren't otherwise defined.

        tag.fooTag and tag.foo both return the first 'foo' tag beneath
        this one (or None). Names that start with a double underscore
        are never treated as tag names; they raise AttributeError so
        that Python's own protocol lookups behave normally."""
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') \
           or not hasattr(other, 'contents') \
           or self.name != other.name or self.attrs != other.attrs \
           or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        "Renders this tag as a Unicode string."
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The value is normally wrapped in double quotes.
                    # If it contains a double quote, wrap it in single
                    # quotes instead.
                    if '"' in val:
                        fmt = "%s='%s'"
                        # If it contains both kinds of quote, the
                        # single quotes have to be escaped. A numeric
                        # reference is used because it is understood
                        # by HTML and XML consumers alike.
                        if "'" in val:
                            val = val.replace("'", "&#39;")

                    # Escape angle brackets, and any ampersand that
                    # doesn't already look like the start of an
                    # entity reference.
                    val = re.sub(r"([<>]|&(?![^\s]+;))",
                                 lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",
                                 val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        space = ''
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            # A hidden tag renders only its contents.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Renders this tag with line breaks and indentation."
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        "Pre-3.x name for findAll(text=...)."
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        "Pre-3.x name for find(text=...)."
        return self.find(text=text, recursive=recursive)

    #Utility methods

    def append(self, tag):
        """Appends the given tag to the contents of this tag.

        Only the contents list is touched; no navigation links are
        set up here."""
        self.contents.append(tag)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # Look in the instance dictionary directly: a plain attribute
        # lookup for a missing 'attrMap' would fall through to
        # __getattr__ and be treated as a search for a child tag.
        if not self.__dict__.get('attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods

    def childGenerator(self):
        "Yields the direct children of this tag, in order."
        for i in range(0, len(self.contents)):
            yield self.contents[i]

    def recursiveChildGenerator(self):
        """Yields every element beneath this tag, depth first, using
        an explicit stack of (tag, next child index) pairs."""
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        # Descend into the child now; remember where
                        # to resume in the current tag afterwards.
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
00698
00699
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Stores the matching criteria.

        'name' is matched against tag names, 'attrs' is a map of
        attribute criteria and 'text' is matched against strings. A
        string given for 'attrs' is shorthand for a criterion on the
        'class' attribute. Extra keyword arguments are treated as
        further attribute criteria."""
        self.name = name
        if isString(attrs):
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            if attrs:
                # Copy before merging so the caller's map is untouched.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        if attrs is None:
            # searchTag() iterates over self.attrs, so it must always
            # be a map, even when the caller passed attrs=None.
            attrs = {}
        self.attrs = attrs
        self.text = text

    def __str__(self):
        "Describes the criteria: the text if there is one, else name|attrs."
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Checks a tag against the name and attribute criteria.

        The tag is given either as a Tag object in 'markupName', or as
        a tag name plus its attributes (a map, or a list of pairs).
        Returns whatever was passed as 'markupName' if it matches, and
        None otherwise."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if markupAttrMap is None:
                        # Build the lookup map the first time an
                        # attribute criterion needs it.
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Checks a Tag, a string, or a list of elements against the
        criteria. Returns the matching object, or None.

        Raises Exception for any other kind of object."""
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception("I don't know how to match against a %s" \
                            % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        """Tests one value (a tag, tag name, attribute value or text)
        against one criterion.

        The criterion may be True (matches anything that is not None),
        a callable, an object with a 'match' attribute (treated as a
        compiled regular expression), a list, or a plain value that
        is compared for equality."""
        result = False
        if matchAgainst is True:
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): only reached for map-like objects that
                # isList() does not already claim; the intent of this
                # branch is unclear, so it is left as it was.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isString(markup):
                # Compare like with like: same string type on both sides.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
00818
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        """Creates an empty result list; 'source' is stored as-is in
        the 'source' attribute."""
        # Initialize this list itself, not a throwaway one.
        list.__init__(self)
        self.source = source
00825
00826
00827
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike.

    Anything with an __iter__ attribute counts, as does any object
    whose exact type is list or tuple."""
    if hasattr(l, '__iter__'):
        return True
    return type(l) in (types.ListType, types.TupleType)
00833
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        return isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        # Only reached on interpreters that lack the 'unicode' or
        # 'basestring' built-ins.
        return isinstance(s, str)
00841
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps.

    Later arguments override earlier ones for the same key."""
    tagMap = {}
    for part in args:
        if hasattr(part, 'items'):
            # It's a map. Merge it.
            for key, value in part.items():
                tagMap[key] = value
            continue
        if isList(part):
            # It's a list. Map each item to the default.
            for key in part:
                tagMap[key] = default
            continue
        # It's a scalar. Map it to the default.
        tagMap[part] = default
    return tagMap
00860
00861
00862
00863 class BeautifulStoneSoup(Tag, SGMLParser):
00864
00865 """This class contains the basic parser and search code. It defines
00866 a parser that knows nothing about tag behavior except for the
00867 following:
00868
00869 You can't close a tag without closing all the tags it encloses.
00870 That is, "<foo><bar></foo>" actually means
00871 "<foo><bar></bar></foo>".
00872
00873 [Another possible explanation is "<foo><bar /></foo>", but since
00874 this class defines no SELF_CLOSING_TAGS, it will never use that
00875 explanation.]
00876
00877 This class is useful for parsing XML or made-up markup languages,
00878 or when BeautifulSoup makes an assumption counter to what you were
00879 expecting."""
00880
# Set-like map whose keys are the entity names listed in
# Tag.XML_SPECIAL_CHARS_TO_ENTITIES.
XML_ENTITY_LIST = {}
for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values():
    XML_ENTITY_LIST[i] = True

# Tag-behavior maps. They are all empty here: this class knows nothing
# about any particular tag.
SELF_CLOSING_TAGS = {}
NESTABLE_TAGS = {}
RESET_NESTING_TAGS = {}
QUOTE_TAGS = {}

# Default (regular expression, replacement function) pairs applied to
# the markup before it is parsed:
#  * put a space before the "/>" that ends a tag, so "<br/>" -> "<br />"
#  * drop whitespace right after "<!", so "<! --x-->" -> "<!--x-->"
MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                   lambda x: x.group(1) + ' />'),
                  (re.compile('<!\s+([^<>]*)>'),
                   lambda x: '<!' + x.group(1) + '>')
                  ]

# Name given to the root tag, i.e. to the soup object itself.
ROOT_TAG_NAME = u'[document]'

# Symbolic values for the smartQuotesTo and convertEntities arguments.
HTML_ENTITIES = "html"
XML_ENTITIES = "xml"
00900
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
             markupMassage=True, smartQuotesTo=XML_ENTITIES,
             convertEntities=None, selfClosingTags=None):
    """The Soup object is initialized as the 'root tag', and the
    provided markup (which can be a string or a file-like object)
    is fed into the underlying parser.

    sgmllib will process most bad HTML, and the BeautifulSoup
    class has some tricks for dealing with some HTML that kills
    sgmllib, but Beautiful Soup can nonetheless choke or lose data
    if your data uses self-closing tags or declarations
    incorrectly.

    By default, Beautiful Soup uses regexes to sanitize input,
    avoiding the vast majority of these problems. If the problems
    don't apply to you, pass in False for markupMassage, and
    you'll get better performance.

    The default parser massage techniques fix the two most common
    instances of invalid HTML that choke sgmllib:

     <br/> (No space between name of closing tag and tag close)
     <! --Comment--> (Extraneous whitespace in declaration)

    You can pass in a custom list of (RE object, replace method)
    tuples to get Beautiful Soup to scrub your input the way you
    want.

    Other arguments:

     parseOnlyThese -- stored on the object; endData() consults its
       'text' attribute and search() method for top-level text.
     fromEncoding -- offered to UnicodeDammit as a candidate
       encoding when the markup is not already Unicode.
     smartQuotesTo -- passed on to UnicodeDammit; forced to None
       whenever convertEntities is set.
     convertEntities -- stored on the object for use while parsing.
     selfClosingTags -- extra tag names that this particular
       instance treats as self-closing."""

    self.parseOnlyThese = parseOnlyThese
    self.fromEncoding = fromEncoding
    self.smartQuotesTo = smartQuotesTo
    self.convertEntities = convertEntities
    if self.convertEntities:
        # Entity conversion was requested, so smart-quote conversion
        # is switched off.
        self.smartQuotesTo = None
    self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
    SGMLParser.__init__(self)

    # Anything with a read() method is treated as a file-like object.
    if hasattr(markup, 'read'):
        markup = markup.read()
    self.markup = markup
    self.markupMassage = markupMassage
    try:
        self._feed()
    except StopParsing:
        # Raised to end parsing early; whatever was parsed is kept.
        pass
    self.markup = None                 # The markup can now be GCed
00950
def _feed(self, inDocumentEncoding=None):
    """Converts self.markup to Unicode, massages it, and runs it
    through the parser.

    'inDocumentEncoding' is offered to UnicodeDammit as a second
    candidate encoding, after self.fromEncoding. Sets
    self.originalEncoding as a side effect."""
    markup = self.markup
    if isinstance(markup, unicode):
        # Check the instance dictionary directly. hasattr() would be
        # routed through __getattr__, which treats unknown names as
        # a search for a child tag.
        if 'originalEncoding' not in self.__dict__:
            self.originalEncoding = None
    else:
        dammit = UnicodeDammit\
                 (markup, [self.fromEncoding, inDocumentEncoding],
                  smartQuotesTo=self.smartQuotesTo)
        markup = dammit.unicode
        self.originalEncoding = dammit.originalEncoding
    if markup:
        if self.markupMassage:
            # Any true value that isn't a list selects the defaults.
            if not isList(self.markupMassage):
                self.markupMassage = self.MARKUP_MASSAGE
            for fix, m in self.markupMassage:
                markup = fix.sub(m, markup)
    self.reset()

    SGMLParser.feed(self, markup)
    # Flush any pending text, then close every tag that is still open.
    self.endData()
    while self.currentTag.name != self.ROOT_TAG_NAME:
        self.popTag()
00976
def __getattr__(self, methodName):
    """This method routes method call requests to either the SGMLParser
    superclass or the Tag superclass, depending on the method name.

    Names that start with a double underscore are handled by
    neither, and raise AttributeError."""
    if methodName.startswith('start_') or methodName.startswith('end_') \
           or methodName.startswith('do_'):
        return SGMLParser.__getattr__(self, methodName)
    elif not methodName.startswith('__'):
        return Tag.__getattr__(self, methodName)
    else:
        raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, methodName))
00989
def isSelfClosingTag(self, name):
    """Returns true iff the given string is the name of a
    self-closing tag according to this parser.

    Both the class-wide SELF_CLOSING_TAGS map and the map built
    from this instance's selfClosingTags argument are consulted."""
    if name in self.SELF_CLOSING_TAGS:
        return True
    return name in self.instanceSelfClosingTags
00995
def reset(self):
    """Returns the object to its pristine state: an empty, hidden
    root tag with a freshly reset parser and empty parsing stacks."""
    # Re-initialize this object as the root tag; it acts as its own
    # parser argument.
    Tag.__init__(self, self, self.ROOT_TAG_NAME)
    # Hidden tags render only their contents (see Tag.__str__).
    self.hidden = 1
    SGMLParser.reset(self)
    self.currentData = []
    self.currentTag = None
    self.tagStack = []
    self.quoteStack = []
    # The root tag is always at the bottom of the tag stack.
    self.pushTag(self)
01005
def popTag(self):
    """Closes the current tag: removes it from the top of the tag
    stack and makes the tag beneath it current.

    Returns the new current tag."""
    self.tagStack.pop()

    # Tags with just one string-owning child get the child as a
    # 'string' property, so that soup.tag.string is shorthand for
    # soup.tag.contents[0]
    closing = self.currentTag
    if len(closing.contents) == 1 and \
       isinstance(closing.contents[0], NavigableString):
        closing.string = closing.contents[0]

    if self.tagStack:
        self.currentTag = self.tagStack[-1]
    return self.currentTag
01019
def pushTag(self, tag):
    """Open `tag`: attach it to the current tag and make it current.

    When there is a (truthy) current tag, `tag` is appended to it. The
    tag is then pushed onto the tag stack and becomes currentTag.
    """
    parent = self.currentTag
    if parent:
        parent.append(tag)
    stack = self.tagStack
    stack.append(tag)
    self.currentTag = stack[-1]
01026
def endData(self, containerClass=NavigableString):
    """Flush the buffered character data into the tree as one node.

    Does nothing when no data is pending. Otherwise the buffered
    pieces are joined; text that is entirely whitespace is collapsed to
    a single newline (if it contained one) or else a single space. The
    buffer is emptied. If a parseOnlyThese filter is active and the tag
    stack holds at most one entry, the text is dropped unless the
    filter has a `text` criterion and its search() accepts the text.
    Surviving text is wrapped in `containerClass`, linked in after
    self.previous, and appended to the current tag's contents.
    """
    pieces = self.currentData
    if not pieces:
        return

    text = ''.join(pieces)
    if not text.strip():
        # Whitespace-only run: keep just one representative character.
        if '\n' in text:
            text = '\n'
        else:
            text = ' '
    self.currentData = []

    strainer = self.parseOnlyThese
    if strainer and len(self.tagStack) <= 1:
        if not strainer.text or not strainer.search(text):
            return

    node = containerClass(text)
    node.setup(self.currentTag, self.previous)
    if self.previous:
        self.previous.next = node
    self.previous = node
    self.currentTag.contents.append(node)
01046
01047
def _popToTag(self, name, inclusivePop=True):
    """Pop the tag stack back to the most recent tag called `name`.

    With inclusivePop true the matching tag itself is popped as well;
    with it false, popping stops just short of that tag. The bottom
    entry of the stack is never considered, and a request for
    ROOT_TAG_NAME is ignored. Returns whatever the last popTag() call
    returned, or None when nothing was popped.
    """
    if name == self.ROOT_TAG_NAME:
        return

    stack = self.tagStack
    depth = len(stack)
    pops = 0
    index = depth - 1
    while index > 0:
        if name == stack[index].name:
            pops = depth - index
            break
        index -= 1
    if not inclusivePop:
        pops -= 1

    lastResult = None
    while pops > 0:
        lastResult = self.popTag()
        pops -= 1
    return lastResult
01069
def _smartPop(self, name):
    """Implicitly close open tags before a new `name` tag is opened.

    The open tags are scanned from the innermost outwards (the bottom
    entry of the stack is never examined):

    * If `name` has no entry in NESTABLE_TAGS, the scan stops at the
      first open tag with the same name and the stack is popped up to
      and including it.
    * If `name` has an entry in NESTABLE_TAGS, the scan stops at the
      first open tag listed among its nesting-reset triggers and the
      stack is popped up to, but not including, that tag.
    * If `name` has no NESTABLE_TAGS entry but is in
      RESET_NESTING_TAGS, the scan also stops (non-inclusively) at the
      first open tag that is itself in RESET_NESTING_TAGS.

    Examples, where the last tag shown is the one being opened:
     <p>Foo<b>Bar<p> pops to 'p', not 'b'.
     <p>Foo<table>Bar<p> pops to 'table', not 'p'.
     <p>Foo<table><tr>Bar<p> pops to 'tr', not 'p'.

     <li><ul><li><li> pops to 'ul', not the first 'li'.
     <tr><table><tr><tr> pops to 'table', not the first 'tr'.
     <td><tr><td><td> pops to 'tr', not the first 'td'.
    """
    triggers = self.NESTABLE_TAGS.get(name)
    nestable = triggers is not None
    resetsNesting = name in self.RESET_NESTING_TAGS

    target = None
    inclusive = True
    stack = self.tagStack
    for depth in range(len(stack) - 1, 0, -1):
        candidate = stack[depth]
        if not nestable and (not candidate or candidate.name == name):
            # A non-nestable tag closes its previous occurrence.
            target = name
            break
        if nestable:
            blocked = candidate.name in triggers
        else:
            blocked = (resetsNesting
                       and candidate.name in self.RESET_NESTING_TAGS)
        if blocked:
            # This open tag shields everything outside it: stop here
            # and leave it open.
            target = candidate.name
            inclusive = False
            break

    if target:
        self._popToTag(target, inclusive)
01116
def unknown_starttag(self, name, attrs, selfClosing=0):
    """Handle an opening tag and return the Tag created for it.

    While the quote stack is non-empty the tag is not treated as
    markup: it is re-serialised and buffered as character data, and
    None is returned. Otherwise pending data is flushed, implicit
    closes are applied via _smartPop (unless the tag is self-closing),
    and, unless a parseOnlyThese filter rejects it at the top level
    (in which case None is returned), a new Tag is created, linked
    after self.previous and pushed. Self-closing tags are popped again
    at once; a tag named in QUOTE_TAGS is pushed on the quote stack and
    switches the parser to literal mode.
    """
    if self.quoteStack:
        # Not a real tag: put its source text back into the data stream.
        rendered = ''.join([' %s="%s"' % (key, value)
                            for key, value in attrs])
        self.handle_data('<%s%s>' % (name, rendered))
        return
    self.endData()

    if not (self.isSelfClosingTag(name) or selfClosing):
        self._smartPop(name)

    strainer = self.parseOnlyThese
    if strainer and len(self.tagStack) <= 1:
        if strainer.text or not strainer.searchTag(name, attrs):
            return

    newTag = Tag(self, name, attrs, self.currentTag, self.previous)
    if self.previous:
        self.previous.next = newTag
    self.previous = newTag
    self.pushTag(newTag)
    if selfClosing or self.isSelfClosingTag(name):
        self.popTag()
    if name in self.QUOTE_TAGS:
        self.quoteStack.append(name)
        self.literal = 1
    return newTag
01146
def unknown_endtag(self, name):
    """Handle a closing tag.

    While inside a quoted section (non-empty quote stack) a closing
    tag other than the one that opened the section is buffered as
    character data. Otherwise pending data is flushed and the tag
    stack is popped through the matching open tag. If the tag closes
    the innermost quoted section, that section is removed and literal
    mode is recomputed from what remains on the quote stack.
    """
    quotes = self.quoteStack
    if quotes and quotes[-1] != name:
        # Not a real end tag: keep its source text as data.
        self.handle_data('</%s>' % name)
        return

    self.endData()
    self._popToTag(name)

    quotes = self.quoteStack
    if quotes and quotes[-1] == name:
        quotes.pop()
        self.literal = (len(self.quoteStack) > 0)
01159
def handle_data(self, data):
    """Buffer a chunk of character data until the next endData() call."""
    pending = self.currentData
    pending.append(data)
01162
def _toStringSubclass(self, text, subclass):
    """Add `text` to the tree as its own node of type `subclass`.

    Pending data is flushed first so the new text is not merged with
    it; the text is then buffered and flushed immediately, with
    `subclass` as the container class.
    """
    self.endData()
    self.handle_data(text)
    self.endData(subclass)
01169
def handle_pi(self, text):
    """Add a processing instruction to the tree.

    The text is stored as a ProcessingInstruction node. An instruction
    whose text starts with "xml" is replaced by a canonical XML
    declaration containing a %SOUP-ENCODING% slot, into which an
    encoding will be plugged later.
    """
    if text.startswith("xml"):
        text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
    self._toStringSubclass(text, ProcessingInstruction)
01177
def handle_comment(self, text):
    """Add the text of a comment to the tree as a Comment node."""
    nodeClass = Comment
    self._toStringSubclass(text, nodeClass)
01181
def handle_charref(self, ref):
    """Handle a numeric character reference as data.

    `ref` is the text between '&#' and ';'. When entity conversion is
    enabled (convertEntities is HTML_ENTITIES or XML_ENTITIES) the
    reference is replaced by the character it names; both decimal
    ("65") and hexadecimal ("x41") forms are understood. A reference
    that is malformed or names a code point unichr() cannot produce is
    kept as its literal '&#...;' text instead of raising and aborting
    the parse. With conversion disabled the literal text is always
    kept.
    """
    literal = '&#%s;' % ref
    if self.convertEntities in [self.HTML_ENTITIES, self.XML_ENTITIES]:
        try:
            if ref[:1] in ('x', 'X'):
                codepoint = int(ref[1:], 16)
            else:
                codepoint = int(ref)
            data = unichr(codepoint)
        except (ValueError, OverflowError):
            # Bad digits, or a code point outside the supported range.
            data = literal
    else:
        data = literal
    self.handle_data(data)
01190
def handle_entityref(self, ref):
    """Handle a named entity reference as data.

    The reference is converted to the corresponding Unicode character
    when convertEntities is HTML_ENTITIES, or when it is XML_ENTITIES
    and XML_ENTITY_LIST has a true value for `ref`. A name missing
    from name2codepoint, or any reference that is not converted, is
    passed through as its literal '&name;' text.
    """
    mode = self.convertEntities
    if mode == self.HTML_ENTITIES:
        convert = True
    else:
        convert = (mode == self.XML_ENTITIES
                   and self.XML_ENTITY_LIST.get(ref))

    replacement = None
    if convert:
        try:
            replacement = unichr(name2codepoint[ref])
        except KeyError:
            replacement = None

    if not replacement:
        replacement = '&%s;' % ref
    self.handle_data(replacement)
01206
def handle_decl(self, data):
    """Add a declaration (a DOCTYPE or the like) to the tree as a
    Declaration node."""
    nodeClass = Declaration
    self._toStringSubclass(data, nodeClass)
01210
def parse_declaration(self, i):
    """Parse the declaration starting at offset `i` of self.rawdata.

    A '<![CDATA[' section becomes a CData node holding everything up
    to the closing ']]>' (or up to the end of the data when there is
    no terminator). Anything else is delegated to
    SGMLParser.parse_declaration; if that raises SGMLParseError, the
    rest of the raw data is treated as plain character data. Returns
    the offset at which parsing should resume.
    """
    raw = self.rawdata
    if raw[i:i+9] != '<![CDATA[':
        try:
            return SGMLParser.parse_declaration(self, i)
        except SGMLParseError:
            # Bogus declaration: swallow the remainder as raw text.
            remainder = self.rawdata[i:]
            self.handle_data(remainder)
            return i + len(remainder)

    end = raw.find(']]>', i)
    if end == -1:
        end = len(raw)
    self._toStringSubclass(raw[i+9:end], CData)
    # Skip past the three characters of the ']]>' terminator.
    return end + 3
01230
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Build the soup exactly as BeautifulStoneSoup does, except that
        'smartQuotesTo' defaults to HTML_ENTITIES when the caller does
        not supply it."""
        if 'smartQuotesTo' not in kwargs:
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    # Tags that never take a closing tag.
    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                     'spacer', 'link', 'frame', 'base'])

    # Tags whose content is kept as text rather than parsed as markup.
    QUOTE_TAGS = {'script': None}

    # Inline tags that may be nested inside themselves.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    # Block tags that may be nested inside themselves.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    # List tags, each mapped to the tags that reset its nesting.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    # Table tags, each mapped to the tags that reset its nesting.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    # Consulted by _smartPop: when a tag from this map is opened, the
    # search for a tag to close stops at the nearest open tag that is
    # also in this map.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Finds the charset parameter inside a META tag's content value.
    # Group 1 is everything up to and including 'charset=', group 3 is
    # the declared encoding.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)")

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning.

        `attrs` is the list of (name, value) pairs of the META tag; the
        'content' entry may be rewritten in place. If the tag has both
        an http-equiv and a content attribute and the content declares
        a charset, one of two things happens:

        * When an encoding declared by an earlier pass is already
          recorded, or the encoding actually used equals the one the
          caller asked for, the declared charset is replaced by a
          %SOUP-ENCODING% placeholder and the resulting tag is flagged
          as containing substitutions.
        * Otherwise, if the declared charset differs from the encoding
          in use, it is recorded, the document is fed again with it,
          and StopParsing is raised to abandon the current pass.
        """
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType:
            match = self.CHARSET_RE.search(contentType)
            if match:
                # A default is supplied so that a lookup which ends in
                # AttributeError counts as "nothing declared yet".
                if getattr(self, 'declaredHTMLEncoding', None) or \
                       (self.originalEncoding == self.fromEncoding):
                    def rewrite(found):
                        return found.group(1) + "%SOUP-ENCODING%"
                    # Substitute within the content attribute itself.
                    # The loop variable `value` must not be used here:
                    # after the loop it holds the value of whichever
                    # attribute came last, which need not be 'content'.
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
01377
class StopParsing(Exception):
    """Raised to abandon the parse that is currently in progress.

    BeautifulSoup.start_meta raises it after feeding the document again
    with a newly declared encoding.
    """
01380
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags this class additionally allows to nest.
    # NOTE(review): 'strong' and 'big' are each listed twice; presumably
    # harmless because buildTagMap builds a map -- confirm.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = [
        'em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
        'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
        'big',
    ]

    # Block tags this class additionally allows to nest.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # Everything BeautifulSoup treats as nestable, plus the tags above.
    NESTABLE_TAGS = buildTagMap(
        [],
        BeautifulSoup.NESTABLE_TAGS,
        I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
        I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
01416
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): every other buildTagMap call in this file passes the
    # default value first and the tags after it, so 'noscript' is
    # presumably being taken as the default here rather than as a tag.
    # Confirm against buildTagMap whether
    # buildTagMap(None, 'noscript') was intended.
    RESET_NESTING_TAGS = buildTagMap('noscript')

    # No tag is treated as nestable.
    NESTABLE_TAGS = {}
01429
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Close the current tag, first mirroring it onto its parent.

        If the tag on top of the stack is a Tag whose only child is a
        NavigableString, and the tag below it has no attribute of that
        name yet, the string is stored on the parent as an attribute
        named after the tag. The actual pop is done by
        BeautifulStoneSoup.popTag, whose result is returned so that
        callers relying on popTag()'s return value see the same thing
        as with the base class.
        """
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            # Make sure parent.attrMap exists before it is consulted.
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                tag.name not in parent.attrMap):
                parent[tag.name] = tag.contents[0]
        return BeautifulStoneSoup.popTag(self)
01460
01461
01462
01463
01464
01465
01466
01467
01468
class RobustXMLParser(BeautifulStoneSoup):
    """Alternative name for BeautifulStoneSoup; adds nothing to it."""
class RobustHTMLParser(BeautifulSoup):
    """Alternative name for BeautifulSoup; adds nothing to it."""
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Alternative name for ICantBelieveItsBeautifulSoup; adds nothing
    to it."""
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Alternative name for MinimalSoup; adds nothing to it."""
class SimplifyingSOAPParser(BeautifulSOAP):
    """Alternative name for BeautifulSOAP; adds nothing to it."""
01479
01480
01481
01482
01483
01484
01485
01486
01487
01488
01489
01490
01491
# Optional encoding auto-detection. `chardet` is bound to the module
# when it can be imported and to None otherwise; UnicodeDammit only
# consults it when it is not None. (It must not be reset to None after
# this block, or an installed chardet would never be used.)
try:
    import chardet
except ImportError:
    chardet = None

# Optional packages that add more encodings to the ones supported by
# stock Python. They are imported only for that side effect, so their
# absence is not an error.
try:
    import cjkcodecs.aliases
except ImportError:
    pass
try:
    import iconv_codec
except ImportError:
    pass
01511
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents.

    After construction:
      unicode          -- the converted document, or None if every
                          attempted encoding failed
      originalEncoding -- the encoding that succeeded, or None
      triedEncodings   -- the codec names that were attempted, in order
    """

    # Charset names that are looked up under a different codec name.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    # Matches one character in the \x80-\x9f range, the keys of MS_CHARS.
    _SMART_QUOTE_RE = re.compile("([\x80-\x9f])")

    # Matches an XML declaration at the very start of a document and
    # captures the value of its encoding attribute.
    _XML_DECLARATION_RE = re.compile(
        r'''^<\?.*encoding=['"](.*?)['"].*\?>''')

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml'):
        """Convert `markup` to Unicode, trying encodings in this order:
        each of `overrideEncodings`, the encoding declared in the
        document, the encoding sniffed from its first bytes, chardet's
        guess (when chardet is available), then utf-8 and windows-1252.

        `overrideEncodings` is only iterated, never modified.
        `smartQuotesTo` selects how windows-1252 smart quotes are
        replaced: 'xml' gives numeric references, any other true value
        gives named entities, and a false value leaves them alone.
        """
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            # Nothing to decode.
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u:
                break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u:
                    break

        # Still nothing: ask the auto-detection library, if present.
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # Last resort.
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u:
                    break
        self.unicode = u
        if not u:
            self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity.

        `orig` is a single character; the MS_CHARS table supplies either
        a plain replacement string, returned as is, or an
        (entity name, hex code) pair, rendered as '&#xHEX;' when
        smartQuotesTo is 'xml' and as '&name;' otherwise."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        """Try to decode self.markup using the encoding `proposed`.

        On success the decoded text replaces self.markup,
        originalEncoding records the codec name, and the text is
        returned. Returns None when `proposed` is empty, has already
        been tried, or fails to decode the markup."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Replace smart-quote characters before decoding, for the
        # encodings in which they are expected to show up.
        if self.smartQuotesTo and proposed.lower() in ("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            markup = self._SMART_QUOTE_RE.sub(
                lambda found: self._subMSChar(found.group(1)), markup)

        try:
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception:
            # Best effort: any failure just means "not this encoding".
            return None
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases

        A byte order mark at the start of `data` takes precedence over
        `encoding` and is stripped before decoding.'''
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data):
        """Given a document, tries to detect its XML encoding.

        Returns a 3-tuple (data, declared encoding, sniffed encoding).
        The sniffed encoding is inferred from the first bytes of the
        document; when one of the multi-byte patterns is recognised the
        returned data has been re-encoded as utf-8 (or translated from
        EBCDIC). The declared encoding is the lower-cased value from the
        XML declaration, if there is one; a generic multi-byte name
        such as 'utf-16' is replaced by the sniffed encoding. Any error
        while sniffing simply leaves the declared encoding as None."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE, no BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE, no BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE, no BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE, no BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
            xml_encoding_match = self._XML_DECLARATION_RE.match(xml_data)
        except Exception:
            # Best effort: undecodable or non-string input just means no
            # declared encoding could be read.
            xml_encoding_match = None
        if xml_encoding_match:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding

    def find_codec(self, charset):
        """Return a codec name Python recognises for `charset`.

        Tries the CHARSET_ALIASES translation (or the name itself), then
        the name with hyphens removed, then with hyphens turned into
        underscores. If none is a known codec, `charset` is returned
        unchanged (so a false value comes back as a false value)."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return `charset` if codecs.lookup() accepts it, None if it
        does not, and `charset` itself when it is false."""
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            # Unknown or unusable codec name.
            pass
        return codec

    # Translation table for _ebcdic_to_ascii, built on first use and
    # cached on the class.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate the EBCDIC byte string `s` through the emap table
        below and return the result."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans(
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Replacements for the characters \x80-\x9f. A tuple is
    # (HTML entity name, hex code point); a plain string is used as is.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 # Y with diaeresis is U+0178; an empty code here would
                 # produce the invalid reference '&#x;' in XML mode.
                 '\x9f' : ('Yuml', '178'),}
01759
01760
01761
01762
01763
if __name__ == '__main__':
    # Run as a script: read a document from standard input, parse it
    # with BeautifulSoup, and write the prettified result to standard
    # output.
    import sys
    soup = BeautifulSoup(sys.stdin.read())
    print(soup.prettify())