1 """Beautiful Soup
2 Elixir and Tonic
3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
5
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
9
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
15
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
19
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 by stock Python.
24 http://cjkpython.i18n.org/
25
26 Beautiful Soup defines classes for two main parsing strategies:
27
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
30
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
34
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
39 For more than you ever wanted to know about Beautiful Soup, see the
40 documentation:
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
42
43 Here, have some legalese:
44
45 Copyright (c) 2004-2009, Leonard Richardson
46
47 All rights reserved.
48
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
51 met:
52
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
55
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
60
61 * Neither the name of the the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
65
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77
78 """
79 from __future__ import generators
80
81 __author__ = "Leonard Richardson (leonardr@segfault.org)"
82 __version__ = "3.1.0.1"
83 __copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
84 __license__ = "New-style BSD"
85
86 import codecs
87 import markupbase
88 import types
89 import re
90 from HTMLParser import HTMLParser, HTMLParseError
91 try:
92 from htmlentitydefs import name2codepoint
93 except ImportError:
94 name2codepoint = {}
95 try:
96 set
97 except NameError:
98 from sets import Set as set
99
100
101 markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
102
103 DEFAULT_OUTPUT_ENCODING = "utf-8"
104
105
106
def sob(unicode, encoding):
    """Return `unicode` unchanged when `encoding` is None; otherwise
    return it encoded to that encoding.

    NOTE(review): the parameter deliberately keeps its historical name
    even though it shadows the `unicode` builtin; renaming it would
    change the keyword-call interface.
    """
    if encoding is None:
        return unicode
    return unicode.encode(encoding)
113
115 """Contains the navigational information for some part of the page
116 (either a tag or a piece of text)"""
117
118 - def setup(self, parent=None, previous=None):
119 """Sets up the initial relations between this element and
120 other elements."""
121 self.parent = parent
122 self.previous = previous
123 self.next = None
124 self.previousSibling = None
125 self.nextSibling = None
126 if self.parent and self.parent.contents:
127 self.previousSibling = self.parent.contents[-1]
128 self.previousSibling.nextSibling = self
129
def replaceWith(self, replaceWith):
    """Replace this element with `replaceWith` in the tree: extract
    this element and insert `replaceWith` at the slot it occupied."""
    oldParent = self.parent
    myIndex = self.parent.contents.index(self)
    if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
        # We're replacing this element with one of its own siblings.
        index = self.parent.contents.index(replaceWith)
        if index and index < myIndex:
            # The replacement sits before us, so extracting it will
            # shift our own index down by one.
            # NOTE(review): `if index` skips the index-0 case, so a
            # replacement at position 0 never triggers the adjustment;
            # confirm that is intentional before changing.
            myIndex = myIndex - 1
    self.extract()
    oldParent.insert(myIndex, replaceWith)
143
def extract(self):
    """Destructively rips this element out of the tree.

    (NOTE(review): the `def` line was lost in extraction; the header is
    reconstructed to match how the method is invoked elsewhere in this
    file, e.g. from replaceWith and insert.)
    """
    if self.parent:
        try:
            self.parent.contents.remove(self)
        except ValueError:
            pass

    # Find the two elements that would be next to each other if this
    # element (and everything inside it) were removed, and stitch the
    # previous/next chain back together across the gap.
    lastChild = self._lastRecursiveChild()
    nextElement = lastChild.next

    if self.previous:
        self.previous.next = nextElement
    if nextElement:
        nextElement.previous = self.previous
    self.previous = None
    lastChild.next = None

    self.parent = None
    # Detach from the sibling chain as well.
    if self.previousSibling:
        self.previousSibling.nextSibling = self.nextSibling
    if self.nextSibling:
        self.nextSibling.previousSibling = self.previousSibling
    self.previousSibling = self.nextSibling = None
    return self
172
174 "Finds the last element beneath this object to be parsed."
175 lastChild = self
176 while hasattr(lastChild, 'contents') and lastChild.contents:
177 lastChild = lastChild.contents[-1]
178 return lastChild
179
def insert(self, position, newChild):
    """Insert `newChild` into this element's contents at `position`,
    rewiring parent, sibling, and previous/next links accordingly.
    Plain strings are promoted to NavigableString first."""
    if (isinstance(newChild, basestring)
        or isinstance(newChild, unicode)) \
        and not isinstance(newChild, NavigableString):
        newChild = NavigableString(newChild)

    position = min(position, len(self.contents))
    if hasattr(newChild, 'parent') and newChild.parent != None:
        # We're 'inserting' an element that already lives in a tree;
        # it must be extracted from its old location first.
        if newChild.parent == self:
            index = self.find(newChild)
            # NOTE(review): self.find returns an element, not an
            # integer index, yet it is compared against `position`
            # below -- confirm this is the intended behavior.
            if index and index < position:
                # We're moving it further down the list of this
                # object's children, so extracting it shifts our
                # target index down by one.
                position = position - 1
        newChild.extract()

    newChild.parent = self
    previousChild = None
    if position == 0:
        # Becomes the first child: its 'previous' element is us.
        newChild.previousSibling = None
        newChild.previous = self
    else:
        previousChild = self.contents[position-1]
        newChild.previousSibling = previousChild
        newChild.previousSibling.nextSibling = newChild
        newChild.previous = previousChild._lastRecursiveChild()
    if newChild.previous:
        newChild.previous.next = newChild

    newChildsLastElement = newChild._lastRecursiveChild()

    if position >= len(self.contents):
        # Appended at the end: the next element in parse order is the
        # nearest ancestor's next sibling, if any ancestor has one.
        newChild.nextSibling = None

        parent = self
        parentsNextSibling = None
        while not parentsNextSibling:
            parentsNextSibling = parent.nextSibling
            parent = parent.parent
            if not parent:
                # This is the last element in the document.
                break
        if parentsNextSibling:
            newChildsLastElement.next = parentsNextSibling
        else:
            newChildsLastElement.next = None
    else:
        nextChild = self.contents[position]
        newChild.nextSibling = nextChild
        if newChild.nextSibling:
            newChild.nextSibling.previousSibling = newChild
        newChildsLastElement.next = nextChild

    if newChildsLastElement.next:
        newChildsLastElement.next.previous = newChildsLastElement
    self.contents.insert(position, newChild)
239
def append(self, tag):
    """Add `tag` as the last child of this tag."""
    end = len(self.contents)
    self.insert(end, tag)
243
def findNext(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the criteria that was parsed
    after this element in the document, or None."""
    # The attrs={} default is safe: it is only read, never mutated.
    finder = self.findAllNext
    return self._findOne(finder, name, attrs, text, **kwargs)

def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                **kwargs):
    """Return every item matching the criteria that was parsed after
    this element, up to `limit` results."""
    walk = self.nextGenerator
    return self._findAll(name, attrs, text, limit, walk, **kwargs)

def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the nearest later sibling of this element matching the
    criteria, or None."""
    finder = self.findNextSiblings
    return self._findOne(finder, name, attrs, text, **kwargs)

def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                     **kwargs):
    """Return all later siblings of this element matching the
    criteria, up to `limit` results."""
    walk = self.nextSiblingGenerator
    return self._findAll(name, attrs, text, limit, walk, **kwargs)
# Historical alias kept for backward compatibility.
fetchNextSiblings = findNextSiblings
269
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the criteria that was parsed
    before this element in the document, or None."""
    finder = self.findAllPrevious
    return self._findOne(finder, name, attrs, text, **kwargs)

def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
    """Return every item matching the criteria that was parsed before
    this element, up to `limit` results."""
    walk = self.previousGenerator
    return self._findAll(name, attrs, text, limit, walk, **kwargs)
# Historical alias kept for backward compatibility.
fetchPrevious = findAllPrevious

def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the nearest earlier sibling of this element matching
    the criteria, or None."""
    finder = self.findPreviousSiblings
    return self._findOne(finder, name, attrs, text, **kwargs)

def findPreviousSiblings(self, name=None, attrs={}, text=None,
                         limit=None, **kwargs):
    """Return all earlier siblings of this element matching the
    criteria, up to `limit` results."""
    walk = self.previousSiblingGenerator
    return self._findAll(name, attrs, text, limit, walk, **kwargs)
# Historical alias kept for backward compatibility.
fetchPreviousSiblings = findPreviousSiblings
296
def findParent(self, name=None, attrs={}, **kwargs):
    """Returns the closest parent of this Tag that matches the given
    criteria, or None.

    Fix: keyword filters (e.g. id='x') were accepted and then silently
    dropped; they are now forwarded to findParents like every other
    find* method does.
    """
    r = None
    l = self.findParents(name, attrs, 1, **kwargs)
    if l:
        r = l[0]
    return r
307
def findParents(self, name=None, attrs={}, limit=None, **kwargs):
    """Return the parents of this element matching the criteria, up
    to `limit` results."""
    walk = self.parentGenerator
    return self._findAll(name, attrs, None, limit, walk, **kwargs)
# Historical alias kept for backward compatibility.
fetchParents = findParents
315
316
317
318 - def _findOne(self, method, name, attrs, text, **kwargs):
319 r = None
320 l = method(name, attrs, text, 1, **kwargs)
321 if l:
322 r = l[0]
323 return r
324
def _findAll(self, name, attrs, text, limit, generator, **kwargs):
    "Iterates over a generator looking for things that match."

    if isinstance(name, SoupStrainer):
        # The caller supplied a ready-made matcher; use it directly.
        strainer = name
    else:
        # Build a SoupStrainer out of the simple criteria.
        strainer = SoupStrainer(name, attrs, text, **kwargs)
    results = ResultSet(strainer)
    g = generator()
    while True:
        try:
            i = g.next()  # py2 generator protocol (next(g) on py3)
        except StopIteration:
            break
        if i:
            found = strainer.search(i)
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
    return results
347
348
349
def nextGenerator(self):
    """Yield each element parsed after this one, ending with None."""
    i = self
    while i:
        i = i.next
        yield i

def nextSiblingGenerator(self):
    """Yield each later sibling, ending with None.
    (Header reconstructed; the original `def` line was lost.)"""
    i = self
    while i:
        i = i.nextSibling
        yield i

def previousGenerator(self):
    """Yield each element parsed before this one, ending with None.
    (Header reconstructed; the original `def` line was lost.)"""
    i = self
    while i:
        i = i.previous
        yield i

def previousSiblingGenerator(self):
    """Yield each earlier sibling, ending with None.
    (Header reconstructed; the original `def` line was lost.)"""
    i = self
    while i:
        i = i.previousSibling
        yield i

def parentGenerator(self):
    """Yield each ancestor of this element, ending with None."""
    i = self
    while i:
        i = i.parent
        yield i
379
380
def substituteEncoding(self, str, encoding=None):
    """Replace the %SOUP-ENCODING% placeholder in `str` with
    `encoding` (defaulting to utf-8)."""
    actual = encoding or "utf-8"
    return str.replace("%SOUP-ENCODING%", actual)
384
def toEncoding(self, s, encoding=None):
    """Encodes an object to a string in some encoding, or to Unicode.
    ."""
    if isinstance(s, unicode):
        # Already Unicode: encode only when a target encoding is given.
        if encoding:
            s = s.encode(encoding)
    elif isinstance(s, str):
        # Byte string (py2): re-encode, or promote to Unicode.
        if encoding:
            s = s.encode(encoding)
        else:
            s = unicode(s)
    else:
        # Arbitrary object: stringify, then recurse to apply the rules
        # above to the resulting string.
        if encoding:
            s = self.toEncoding(str(s), encoding)
        else:
            s = unicode(s)
    return s
402
404
406 """Create a new NavigableString.
407
408 When unpickling a NavigableString, this method is called with
409 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
410 passed in to the superclass's __new__ or the superclass won't know
411 how to handle non-ASCII characters.
412 """
413 if isinstance(value, unicode):
414 return unicode.__new__(cls, value)
415 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
416
418 return (unicode(self),)
419
421 """text.string gives you text. This is for backwards
422 compatibility for Navigable*String, but for CData* it lets you
423 get the string without the CData wrapper."""
424 if attr == 'string':
425 return self
426 else:
427 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
428
431
434
class CData(NavigableString):
    """A CDATA section; renders its text wrapped in <![CDATA[ ... ]]>."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        # NOTE(review): the method header was lost in extraction; it is
        # reconstructed to match the sibling NavigableString subclasses,
        # which take an eventualEncoding argument.
        return u'<![CDATA[' + self + u']]>'
439
441
443 output = self
444 if u'%SOUP-ENCODING%' in output:
445 output = self.substituteEncoding(output, eventualEncoding)
446 return u'<?' + output + u'?>'
447
451
454 return u'<!' + self + u'>'
455
456 -class Tag(PageElement):
457
458 """Represents a found HTML tag with its attributes and contents."""
459
461 "Cheap function to invert a hash."
462 i = {}
463 for k,v in h.items():
464 i[v] = k
465 return i
466
467 XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
468 "quot" : '"',
469 "amp" : "&",
470 "lt" : "<",
471 "gt" : ">" }
472
473 XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
474
476 """Used in a call to re.sub to replace HTML, XML, and numeric
477 entities with the appropriate Unicode characters. If HTML
478 entities are being converted, any unrecognized entities are
479 escaped."""
480 x = match.group(1)
481 if self.convertHTMLEntities and x in name2codepoint:
482 return unichr(name2codepoint[x])
483 elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
484 if self.convertXMLEntities:
485 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
486 else:
487 return u'&%s;' % x
488 elif len(x) > 0 and x[0] == '#':
489
490 if len(x) > 1 and x[1] == 'x':
491 return unichr(int(x[2:], 16))
492 else:
493 return unichr(int(x[1:]))
494
495 elif self.escapeUnrecognizedEntities:
496 return u'&%s;' % x
497 else:
498 return u'&%s;' % x
499
def __init__(self, parser, name, attrs=None, parent=None,
             previous=None):
    "Basic constructor."

    # Only the parser's class is stored, not the parser itself, so the
    # Tag does not keep the parser alive.
    self.parserClass = parser.__class__
    self.isSelfClosing = parser.isSelfClosingTag(name)
    self.name = name
    if attrs == None:
        attrs = []
    self.attrs = attrs          # list of (key, value) pairs
    self.contents = []          # child elements, in document order
    self.setup(parent, previous)
    self.hidden = False
    self.containsSubstitutions = False
    # Entity-conversion policy is inherited from the parser.
    self.convertHTMLEntities = parser.convertHTMLEntities
    self.convertXMLEntities = parser.convertXMLEntities
    self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

    def convert(kval):
        "Converts HTML, XML and numeric entities in the attribute value."
        k, val = kval
        if val is None:
            return kval
        return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
                          self._convertEntities, val))
    self.attrs = map(convert, self.attrs)  # py2 map returns a list
528
def get(self, key, default=None):
    """Return the value of attribute `key`, or `default` when the tag
    has no such attribute."""
    attrMap = self._getAttrMap()
    return attrMap.get(key, default)
534
536 return self._getAttrMap().has_key(key)
537
539 """tag[key] returns the value of the 'key' attribute for the tag,
540 and throws an exception if it's not there."""
541 return self._getAttrMap()[key]
542
544 "Iterating over a tag iterates over its contents."
545 return iter(self.contents)
546
548 "The length of a tag is the length of its list of contents."
549 return len(self.contents)
550
552 return x in self.contents
553
555 "A tag is non-None even if it has no contents."
556 return True
557
559 """Setting tag[key] sets the value of the 'key' attribute for the
560 tag."""
561 self._getAttrMap()
562 self.attrMap[key] = value
563 found = False
564 for i in range(0, len(self.attrs)):
565 if self.attrs[i][0] == key:
566 self.attrs[i] = (key, value)
567 found = True
568 if not found:
569 self.attrs.append((key, value))
570 self._getAttrMap()[key] = value
571
573 "Deleting tag[key] deletes all 'key' attributes for the tag."
574 for item in self.attrs:
575 if item[0] == key:
576 self.attrs.remove(item)
577
578
579 self._getAttrMap()
580 if self.attrMap.has_key(key):
581 del self.attrMap[key]
582
584 """Calling a tag like a function is the same as calling its
585 findAll() method. Eg. tag('a') returns a list of all the A tags
586 found within this tag."""
587 return apply(self.findAll, args, kwargs)
588
590
591 if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
592 return self.find(tag[:-3])
593 elif tag.find('__') != 0:
594 return self.find(tag)
595 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
596
598 """Returns true iff this tag has the same name, the same attributes,
599 and the same contents (recursively) as the given tag.
600
601 NOTE: right now this will return false if two tags have the
602 same attributes in a different order. Should this be fixed?"""
603 if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
604 return False
605 for i in range(0, len(self.contents)):
606 if self.contents[i] != other.contents[i]:
607 return False
608 return True
609
611 """Returns true iff this tag is not identical to the other tag,
612 as defined in __eq__."""
613 return not self == other
614
616 """Renders this tag as a string."""
617 return self.decode(eventualEncoding=encoding)
618
# Matches a bare '<' or '>', or an '&' that does not begin a decimal,
# hex, or named entity reference.
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                       + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                       + ")")
622
624 """Used with a regular expression to substitute the
625 appropriate XML entity for an XML special character."""
626 return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
627
630
633
636 return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
637
640 """Returns a string or Unicode representation of this tag and
641 its contents. To get Unicode, pass None for encoding."""
642
643 attrs = []
644 if self.attrs:
645 for key, val in self.attrs:
646 fmt = '%s="%s"'
647 if isString(val):
648 if (self.containsSubstitutions
649 and eventualEncoding is not None
650 and '%SOUP-ENCODING%' in val):
651 val = self.substituteEncoding(val, eventualEncoding)
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668 if '"' in val:
669 fmt = "%s='%s'"
670 if "'" in val:
671
672
673 val = val.replace("'", "&squot;")
674
675
676
677
678
679 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
680 if val is None:
681
682 decoded = key
683 else:
684 decoded = fmt % (key, val)
685 attrs.append(decoded)
686 close = ''
687 closeTag = ''
688 if self.isSelfClosing:
689 close = ' /'
690 else:
691 closeTag = '</%s>' % self.name
692
693 indentTag, indentContents = 0, 0
694 if prettyPrint:
695 indentTag = indentLevel
696 space = (' ' * (indentTag-1))
697 indentContents = indentTag + 1
698 contents = self.decodeContents(prettyPrint, indentContents,
699 eventualEncoding)
700 if self.hidden:
701 s = contents
702 else:
703 s = []
704 attributeString = ''
705 if attrs:
706 attributeString = ' ' + ' '.join(attrs)
707 if prettyPrint:
708 s.append(space)
709 s.append('<%s%s%s>' % (self.name, attributeString, close))
710 if prettyPrint:
711 s.append("\n")
712 s.append(contents)
713 if prettyPrint and contents and contents[-1] != "\n":
714 s.append("\n")
715 if prettyPrint and closeTag:
716 s.append(space)
717 s.append(closeTag)
718 if prettyPrint and closeTag and self.nextSibling:
719 s.append("\n")
720 s = ''.join(s)
721 return s
722
def decompose(self):
    """Recursively destroys the contents of this tree.
    (Header reconstructed; the original `def` line was lost.)"""
    # Copy the contents list first: extract()/decompose() mutate it.
    contents = [i for i in self.contents]
    for i in contents:
        if isinstance(i, Tag):
            i.decompose()
        else:
            i.extract()
    self.extract()
732
734 return self.encode(encoding, True)
735
def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Render this tag's contents as a byte string in `encoding`."""
    decoded = self.decodeContents(prettyPrint, indentLevel)
    return decoded.encode(encoding)
739
def decodeContents(self, prettyPrint=False, indentLevel=0,
                   eventualEncoding=DEFAULT_OUTPUT_ENCODING):
    """Renders the contents of this tag as a string in the given
    encoding. If encoding is None, returns a Unicode string.."""
    s=[]
    for c in self:
        text = None
        if isinstance(c, NavigableString):
            text = c.decodeGivenEventualEncoding(eventualEncoding)
        elif isinstance(c, Tag):
            # Child tags render themselves recursively.
            s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
        if text and prettyPrint:
            # Pretty-printing collapses surrounding whitespace.
            text = text.strip()
        if text:
            if prettyPrint:
                s.append(" " * (indentLevel-1))
            s.append(text)
            if prettyPrint:
                s.append("\n")
    return ''.join(s)
760
761
762
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Return only the first child of this Tag matching the given
    criteria, or None when nothing matches."""
    hits = self.findAll(name, attrs, recursive, text, 1, **kwargs)
    if hits:
        return hits[0]
    return None
# Historical alias kept for backward compatibility.
findChild = find
773
def findAll(self, name=None, attrs={}, recursive=True, text=None,
            limit=None, **kwargs):
    """Extracts a list of Tag objects that match the given
    criteria. You can specify the name of the Tag and any
    attributes you want the Tag to have.

    The value of a key-value pair in the 'attrs' map can be a
    string, a list of strings, a regular expression object, or a
    callable that takes a string and returns whether or not the
    string matches for some custom definition of 'matches'. The
    same is true of the tag name."""
    if recursive:
        walk = self.recursiveChildGenerator
    else:
        walk = self.childGenerator
    return self._findAll(name, attrs, text, limit, walk, **kwargs)
# Historical alias kept for backward compatibility.
findChildren = findAll
790
791
# Pre-3.x method names, kept so very old client code keeps working.
first = find
fetch = findAll
794
def fetchText(self, text=None, recursive=True, limit=None):
    """Deprecated: find all text matching `text`."""
    criteria = {"text": text, "recursive": recursive, "limit": limit}
    return self.findAll(**criteria)

def firstText(self, text=None, recursive=True):
    """Deprecated: find the first text matching `text`."""
    criteria = {"text": text, "recursive": recursive}
    return self.find(**criteria)
800
801
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Render this tag's contents: Unicode when `encoding` is None,
    a byte string in `encoding` otherwise."""
    if encoding is None:
        return self.decodeContents(prettyPrint, indentLevel, encoding)
    return self.encodeContents(encoding, prettyPrint, indentLevel)
808
809
810
811
813 """Initializes a map representation of this tag's attributes,
814 if not already initialized."""
815 if not getattr(self, 'attrMap'):
816 self.attrMap = {}
817 for (key, value) in self.attrs:
818 self.attrMap[key] = value
819 return self.attrMap
820
821
def recursiveChildGenerator(self):
    """Yield every element beneath this tag, in document order.

    (Header reconstructed; the original `def` line was lost. Also,
    plain `return` replaces the bare `raise StopIteration`, which
    PEP 479 turned into a RuntimeError inside generators on
    Python 3.7+; `return` is equivalent on Python 2 as well.)
    """
    if not len(self.contents):
        return
    stopNode = self._lastRecursiveChild().next
    current = self.contents[0]
    while current is not stopNode:
        yield current
        current = current.next

def childGenerator(self):
    """Yield this tag's direct children, in order."""
    if not len(self.contents):
        return
    current = self.contents[0]
    while current:
        yield current
        current = current.nextSibling
839
840
842 """Encapsulates a number of ways of matching a markup element (tag or
843 text)."""
844
def __init__(self, name=None, attrs={}, text=None, **kwargs):
    """Build a matcher. A bare string for `attrs` is shorthand for
    matching on the 'class' attribute; extra keyword arguments are
    folded into the attribute map."""
    self.name = name
    if isString(attrs):
        kwargs['class'] = attrs
        attrs = None
    if kwargs:
        if attrs:
            merged = attrs.copy()
            merged.update(kwargs)
            attrs = merged
        else:
            attrs = kwargs
    self.attrs = attrs
    self.text = text
858
860 if self.text:
861 return self.text
862 else:
863 return "%s|%s" % (self.name, self.attrs)
864
def searchTag(self, markupName=None, markupAttrs={}):
    """Match this strainer against a tag, or against a (name, attrs)
    pair seen during parsing. Returns the matched object or None."""
    found = None
    markup = None
    if isinstance(markupName, Tag):
        # We were handed a built Tag; its mapping interface doubles as
        # the attribute source below.
        markup = markupName
        markupAttrs = markup
    # A callable name criterion is invoked with (name, attrs) only when
    # we were given raw tag data rather than a built Tag.
    callFunctionWithTagData = callable(self.name) \
                              and not isinstance(markupName, Tag)

    if (not self.name) \
           or callFunctionWithTagData \
           or (markup and self._matches(markup, self.name)) \
           or (not markup and self._matches(markupName, self.name)):
        if callFunctionWithTagData:
            match = self.name(markupName, markupAttrs)
        else:
            match = True
            markupAttrMap = None
            for attr, matchAgainst in self.attrs.items():
                if not markupAttrMap:
                    # Lazily build a dict view of the attributes once.
                    if hasattr(markupAttrs, 'get'):
                        markupAttrMap = markupAttrs
                    else:
                        markupAttrMap = {}
                        for k, v in markupAttrs:
                            markupAttrMap[k] = v
                attrValue = markupAttrMap.get(attr)
                if not self._matches(attrValue, matchAgainst):
                    match = False
                    break
        if match:
            if markup:
                found = markup
            else:
                found = markupName
    return found
901
903
904 found = None
905
906
907 if isList(markup) and not isinstance(markup, Tag):
908 for element in markup:
909 if isinstance(element, NavigableString) \
910 and self.search(element):
911 found = element
912 break
913
914
915 elif isinstance(markup, Tag):
916 if not self.text:
917 found = self.searchTag(markup)
918
919 elif isinstance(markup, NavigableString) or \
920 isString(markup):
921 if self._matches(markup, self.text):
922 found = markup
923 else:
924 raise Exception, "I don't know how to match against a %s" \
925 % markup.__class__
926 return found
927
def _matches(self, markup, matchAgainst):
    """Core matching predicate: compare one piece of markup against one
    criterion (True, callable, regex, list, dict, or string)."""
    result = False
    if matchAgainst == True and type(matchAgainst) == types.BooleanType:
        # Criterion True matches anything that exists; the explicit
        # type check keeps the number 1 from being treated as True.
        result = markup != None
    elif callable(matchAgainst):
        result = matchAgainst(markup)
    else:
        # Custom match methods take the tag as an argument, but all
        # other ways of matching match against the tag name as a
        # string.
        if isinstance(markup, Tag):
            markup = markup.name
        if markup is not None and not isString(markup):
            markup = unicode(markup)
        # At this point markup is either a string or None.
        if hasattr(matchAgainst, 'match'):
            # It's a regexp object.
            result = markup and matchAgainst.search(markup)
        elif (isList(matchAgainst)
              and (markup is not None or not isString(matchAgainst))):
            result = markup in matchAgainst
        elif hasattr(matchAgainst, 'items'):
            # NOTE(review): markup.has_key(matchAgainst) asks a string
            # for a dict key -- the arguments look inverted; confirm.
            result = markup.has_key(matchAgainst)
        elif matchAgainst and isString(markup):
            # Normalize the criterion to the same string type before
            # the equality check below.
            if isinstance(markup, unicode):
                matchAgainst = unicode(matchAgainst)
            else:
                matchAgainst = str(matchAgainst)

        if not result:
            result = matchAgainst == markup
    return result
960
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it. (Class and __init__ headers reconstructed; they
    were lost in extraction.)"""
    def __init__(self, source):
        # Fix: list.__init__([]) initialized a throwaway list object;
        # passing self is what was intended (the observable result --
        # an empty list -- is unchanged, but no longer by accident).
        list.__init__(self)
        self.source = source
967
968
969
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike.
    (Header reconstructed; the original `def` line was lost.)"""
    return ((hasattr(l, '__iter__') and not isString(l))
            or (type(l) in (types.ListType, types.TupleType)))
975
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike.
    (Header reconstructed; the original `def` line was lost. On
    Python 3 the NameError raised by `unicode` falls through to the
    plain str check.)"""
    try:
        return isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        return isinstance(s, str)
983
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps.
    (Header reconstructed; the original `def` line was lost.)"""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # It's a map: merge it in.
            for k, v in portion.items():
                built[k] = v
        elif isList(portion) and not isString(portion):
            # It's a list: map each item to the default.
            for k in portion:
                built[k] = default
        else:
            # It's a scalar: map it to the default.
            built[portion] = default
    return built
1002
1003
1004
1006
1008 HTMLParser.__init__(self)
1009 self.soup = soup
1010
1011
1012
1018
1021
1024
1026 """Adds a certain piece of text to the tree as a NavigableString
1027 subclass."""
1028 self.soup.endData()
1029 self.handle_data(text)
1030 self.soup.endData(subclass)
1031
def handle_pi(self, text):
    """Handle a processing instruction as a ProcessingInstruction
    object, possibly one with a %SOUP-ENCODING% slot into which an
    encoding will be plugged later.
    (Header reconstructed; the original `def` line was lost.)"""
    if text[:3] == "xml":
        text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
    self._toStringSubclass(text, ProcessingInstruction)
1039
1043
def handle_charref(self, ref):
    "Handle character references as data."
    # NOTE(review): the original `def` line was lost in extraction and
    # is reconstructed to match HTMLParser's handler signature.
    if self.soup.convertEntities:
        data = unichr(int(ref))
    else:
        data = '&#%s;' % ref
    self.handle_data(data)
1051
def handle_entityref(self, ref):
    """Handle entity references as data, possibly converting known
    HTML and/or XML entity references to the corresponding Unicode
    characters.
    (Header reconstructed; the original `def` line was lost.)"""
    data = None
    if self.soup.convertHTMLEntities:
        try:
            data = unichr(name2codepoint[ref])
        except KeyError:
            pass

    if not data and self.soup.convertXMLEntities:
        data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

    if not data and self.soup.convertHTMLEntities and \
        not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
        # An unrecognized entity while converting HTML entities: pass
        # it through with a bare ampersand and no trailing semicolon,
        # mirroring how browsers treat unknown references.
        data = "&%s" % ref
    if not data:
        # No conversion requested or possible: reproduce the original
        # reference verbatim.
        data = "&%s;" % ref
    self.handle_data(data)
1094
def handle_decl(self, data):
    "Handle DOCTYPEs and the like as Declaration objects."
    # NOTE(review): the original `def` line was lost in extraction and
    # is reconstructed to match HTMLParser's handler signature.
    self._toStringSubclass(data, Declaration)
1098
def parse_declaration(self, i):
    """Treat a bogus SGML declaration as raw data. Treat a CDATA
    declaration as a CData object.
    (Header reconstructed; the original `def` line was lost.)"""
    j = None
    if self.rawdata[i:i+9] == '<![CDATA[':
        # Consume up to the closing ']]>' (or end of input) and store
        # the payload as a CData node.
        k = self.rawdata.find(']]>', i)
        if k == -1:
            k = len(self.rawdata)
        data = self.rawdata[i+9:k]
        j = k + 3
        self._toStringSubclass(data, CData)
    else:
        try:
            j = HTMLParser.parse_declaration(self, i)
        except HTMLParseError:
            # Malformed declaration: emit it verbatim as text and
            # resume after it.
            toHandle = self.rawdata[i:]
            self.handle_data(toHandle)
            j = i + len(toHandle)
    return j
1118
1119
1121
1122 """This class contains the basic parser and search code. It defines
1123 a parser that knows nothing about tag behavior except for the
1124 following:
1125
1126 You can't close a tag without closing all the tags it encloses.
1127 That is, "<foo><bar></foo>" actually means
1128 "<foo><bar></bar></foo>".
1129
1130 [Another possible explanation is "<foo><bar /></foo>", but since
1131 this class defines no SELF_CLOSING_TAGS, it will never use that
1132 explanation.]
1133
1134 This class is useful for parsing XML or made-up markup languages,
1135 or when BeautifulSoup makes an assumption counter to what you were
1136 expecting."""
1137
1138 SELF_CLOSING_TAGS = {}
1139 NESTABLE_TAGS = {}
1140 RESET_NESTING_TAGS = {}
1141 QUOTE_TAGS = {}
1142 PRESERVE_WHITESPACE_TAGS = []
1143
1144 MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
1145 lambda x: x.group(1) + ' />'),
1146 (re.compile('<!\s+([^<>]*)>'),
1147 lambda x: '<!' + x.group(1) + '>')
1148 ]
1149
1150 ROOT_TAG_NAME = u'[document]'
1151
1152 HTML_ENTITIES = "html"
1153 XML_ENTITIES = "xml"
1154 XHTML_ENTITIES = "xhtml"
1155
1156 ALL_ENTITIES = XHTML_ENTITIES
1157
1158
1159
1160
1161
1162 STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1163
1164 - def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1165 markupMassage=True, smartQuotesTo=XML_ENTITIES,
1166 convertEntities=None, selfClosingTags=None, isHTML=False,
1167 builder=HTMLParserBuilder):
1168 """The Soup object is initialized as the 'root tag', and the
1169 provided markup (which can be a string or a file-like object)
1170 is fed into the underlying parser.
1171
1172 HTMLParser will process most bad HTML, and the BeautifulSoup
1173 class has some tricks for dealing with some HTML that kills
1174 HTMLParser, but Beautiful Soup can nonetheless choke or lose data
1175 if your data uses self-closing tags or declarations
1176 incorrectly.
1177
1178 By default, Beautiful Soup uses regexes to sanitize input,
1179 avoiding the vast majority of these problems. If the problems
1180 don't apply to you, pass in False for markupMassage, and
1181 you'll get better performance.
1182
1183 The default parser massage techniques fix the two most common
1184 instances of invalid HTML that choke HTMLParser:
1185
1186 <br/> (No space between name of closing tag and tag close)
1187 <! --Comment--> (Extraneous whitespace in declaration)
1188
1189 You can pass in a custom list of (RE object, replace method)
1190 tuples to get Beautiful Soup to scrub your input the way you
1191 want."""
1192
1193 self.parseOnlyThese = parseOnlyThese
1194 self.fromEncoding = fromEncoding
1195 self.smartQuotesTo = smartQuotesTo
1196 self.convertEntities = convertEntities
1197
1198
1199 if self.convertEntities:
1200
1201
1202
1203 self.smartQuotesTo = None
1204 if convertEntities == self.HTML_ENTITIES:
1205 self.convertXMLEntities = False
1206 self.convertHTMLEntities = True
1207 self.escapeUnrecognizedEntities = True
1208 elif convertEntities == self.XHTML_ENTITIES:
1209 self.convertXMLEntities = True
1210 self.convertHTMLEntities = True
1211 self.escapeUnrecognizedEntities = False
1212 elif convertEntities == self.XML_ENTITIES:
1213 self.convertXMLEntities = True
1214 self.convertHTMLEntities = False
1215 self.escapeUnrecognizedEntities = False
1216 else:
1217 self.convertXMLEntities = False
1218 self.convertHTMLEntities = False
1219 self.escapeUnrecognizedEntities = False
1220
1221 self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1222 self.builder = builder(self)
1223 self.reset()
1224
1225 if hasattr(markup, 'read'):
1226 markup = markup.read()
1227 self.markup = markup
1228 self.markupMassage = markupMassage
1229 try:
1230 self._feed(isHTML=isHTML)
1231 except StopParsing:
1232 pass
1233 self.markup = None
1234 self.builder = None
1235
    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert the stored markup to Unicode, optionally massage it,
        and run it through the underlying builder to construct the tree."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # markupMassage may be True (use the defaults) or a
                # caller-supplied list of (regex, replacement) tuples.
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # The massage list is discarded after use so the soup
                # object can be deepcopied later on (some Python
                # installations can't copy compiled regexes).
                del(self.markupMassage)
        self.builder.reset()

        self.builder.feed(markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()
1268
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        # Checks both the class-level map and the per-instance map
        # built from the selfClosingTags constructor argument.
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)
1274
        # Re-initialize this soup as the hidden root '[document]' tag
        # with all parse state cleared.
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1  # the root tag is never rendered in output
        self.builder.reset()
        self.currentData = []   # pending text, flushed by endData()
        self.currentTag = None
        self.tagStack = []      # stack of currently-open tags
        self.quoteStack = []    # stack of open QUOTE_TAGS (literal text)
        self.pushTag(self)      # the soup itself is the root of the stack
1284
        # Close the innermost open tag and return the new current tag.
        tag = self.tagStack.pop()

        # Tags with just one string child get that child as a 'string'
        # property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0].
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableString):
            self.currentTag.string = self.currentTag.contents[0]

        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag
1298
1300
1301 if self.currentTag:
1302 self.currentTag.contents.append(tag)
1303 self.tagStack.append(tag)
1304 self.currentTag = self.tagStack[-1]
1305
        # Flush the buffered text runs into a single NavigableString
        # (or other containerClass) node attached to the current tag.
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Whitespace-only text collapses to a single '\n' or ' ',
            # unless we're inside a PRESERVE_WHITESPACE_TAGS tag
            # (e.g. <pre>), where it is kept verbatim.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # With a parse-only filter active, top-level text that the
            # filter rejects is dropped entirely.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)
1327
1328
1329 - def _popToTag(self, name, inclusivePop=True):
1330 """Pops the tag stack up to and including the most recent
1331 instance of the given tag. If inclusivePop is false, pops the tag
1332 stack up to but *not* including the most recent instqance of
1333 the given tag."""
1334
1335 if name == self.ROOT_TAG_NAME:
1336 return
1337
1338 numPops = 0
1339 mostRecentTag = None
1340 for i in range(len(self.tagStack)-1, 0, -1):
1341 if name == self.tagStack[i].name:
1342 numPops = len(self.tagStack)-i
1343 break
1344 if not inclusivePop:
1345 numPops = numPops - 1
1346
1347 for i in range(0, numPops):
1348 mostRecentTag = self.popTag()
1349 return mostRecentTag
1350
1352
1353 """We need to pop up to the previous tag of this type, unless
1354 one of this tag's nesting reset triggers comes between this
1355 tag and the previous tag of this type, OR unless this tag is a
1356 generic nesting trigger and another generic nesting trigger
1357 comes between this tag and the previous tag of this type.
1358
1359 Examples:
1360 <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1361 <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1362 <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1363
1364 <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1365 <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1366 <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1367 """
1368
1369 nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1370 isNestable = nestingResetTriggers != None
1371 isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1372 popTo = None
1373 inclusive = True
1374 for i in range(len(self.tagStack)-1, 0, -1):
1375 p = self.tagStack[i]
1376 if (not p or p.name == name) and not isNestable:
1377
1378
1379 popTo = name
1380 break
1381 if (nestingResetTriggers != None
1382 and p.name in nestingResetTriggers) \
1383 or (nestingResetTriggers == None and isResetNesting
1384 and self.RESET_NESTING_TAGS.has_key(p.name)):
1385
1386
1387
1388
1389
1390 popTo = p.name
1391 inclusive = False
1392 break
1393 p = p.parent
1394 if popTo:
1395 self._popToTag(popTo, inclusive)
1396
1398
1399 if self.quoteStack:
1400
1401
1402 attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1403 self.handle_data('<%s%s>' % (name, attrs))
1404 return
1405 self.endData()
1406
1407 if not self.isSelfClosingTag(name) and not selfClosing:
1408 self._smartPop(name)
1409
1410 if self.parseOnlyThese and len(self.tagStack) <= 1 \
1411 and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1412 return
1413
1414 tag = Tag(self, name, attrs, self.currentTag, self.previous)
1415 if self.previous:
1416 self.previous.next = tag
1417 self.previous = tag
1418 self.pushTag(tag)
1419 if selfClosing or self.isSelfClosingTag(name):
1420 self.popTag()
1421 if name in self.QUOTE_TAGS:
1422
1423 self.quoteStack.append(name)
1424 self.literal = 1
1425 return tag
1426
1428
1429 if self.quoteStack and self.quoteStack[-1] != name:
1430
1431
1432 self.handle_data('</%s>' % name)
1433 return
1434 self.endData()
1435 self._popToTag(name)
1436 if self.quoteStack and self.quoteStack[-1] == name:
1437 self.quoteStack.pop()
1438 self.literal = (len(self.quoteStack) > 0)
1439
        self.currentData.append(data)  # buffer text; endData() flushes it
1442
1445
1446
1448
1449 """This parser knows the following facts about HTML:
1450
1451 * Some tags have no closing tag and should be interpreted as being
1452 closed as soon as they are encountered.
1453
1454 * The text inside some tags (ie. 'script') may contain tags which
1455 are not really part of the document and which should be parsed
1456 as text, not tags. If you want to parse the text as tags, you can
1457 always fetch it and parse it explicitly.
1458
1459 * Tag nesting rules:
1460
    Most tags can't be nested at all. For instance, the occurrence of
1462 a <p> tag should implicitly close the previous <p> tag.
1463
1464 <p>Para1<p>Para2
1465 should be transformed into:
1466 <p>Para1</p><p>Para2
1467
    Some tags can be nested arbitrarily. For instance, the occurrence
1469 of a <blockquote> tag should _not_ implicitly close the previous
1470 <blockquote> tag.
1471
1472 Alice said: <blockquote>Bob said: <blockquote>Blah
1473 should NOT be transformed into:
1474 Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1475
1476 Some tags can be nested, but the nesting is reset by the
1477 interposition of other tags. For instance, a <tr> tag should
1478 implicitly close the previous <tr> tag within the same <table>,
1479 but not close a <tr> tag in another table.
1480
1481 <table><tr>Blah<tr>Blah
1482 should be transformed into:
1483 <table><tr>Blah</tr><tr>Blah
1484 but,
1485 <tr>Blah<table><tr>Blah
1486 should NOT be transformed into
1487 <tr>Blah<table></tr><tr>Blah
1488
1489 Differing assumptions about tag nesting rules are a major source
1490 of problems with the BeautifulSoup class. If BeautifulSoup is not
1491 treating as nestable a tag your page author treats as nestable,
1492 try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1493 BeautifulStoneSoup before writing your own subclass."""
1494
1500
    # Tags that never take a closing tag.
    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    # Whitespace inside these tags is kept verbatim (see endData).
    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    # The contents of these tags are treated as literal text, not markup.
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    # Inline tags that may contain another tag of the same type
    # without the inner one implicitly closing the outer one.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    # Block tags that may nest arbitrarily within themselves.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    # List tags: the value is the list of ancestor tags that reset
    # nesting for the key (e.g. a new <li> closes the previous <li>
    # unless a <ul>/<ol> intervenes).
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    # Table tags follow the same reset-trigger scheme as lists.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    # Block tags that never nest within themselves.
    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    # Encountering any of these tags resets the nesting of generic
    # nesting-trigger tags (see _smartPop).
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Matches the charset declaration inside a META tag's
    # http-equiv="Content-Type" content attribute (see start_meta).
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
1552
1583 newAttr = self.CHARSET_RE.sub(rewrite, contentType)
1584 attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1585 newAttr)
1586 tagNeedsEncodingSubstitution = True
1587 else:
1588
1589
1590 newCharset = match.group(3)
1591 if newCharset and newCharset != self.originalEncoding:
1592 self.declaredHTMLEncoding = newCharset
1593 self._feed(self.declaredHTMLEncoding)
1594 raise StopParsing
1595 pass
1596 tag = self.unknown_starttag("meta", attrs)
1597 if tag and tagNeedsEncodingSubstitution:
1598 tag.containsSubstitutions = True
1599
1600
1603
1605
1606 """The BeautifulSoup class is oriented towards skipping over
1607 common HTML errors like unclosed tags. However, sometimes it makes
1608 errors of its own. For instance, consider this fragment:
1609
1610 <b>Foo<b>Bar</b></b>
1611
1612 This is perfectly valid (if bizarre) HTML. However, the
1613 BeautifulSoup class will implicitly close the first b tag when it
1614 encounters the second 'b'. It will think the author wrote
1615 "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
1616 there's no real-world reason to bold something that's already
1617 bold. When it encounters '</b></b>' it will close two more 'b'
1618 tags, for a grand total of three tags closed instead of two. This
1619 can throw off the rest of your document structure. The same is
1620 true of a number of other tags, listed below.
1621
1622 It's much more common for someone to forget to close a 'b' tag
1623 than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
1625 case: where you can't believe someone wrote what they did, but
1626 it's valid HTML and BeautifulSoup screwed up by assuming it
1627 wouldn't be."""
1628
1629 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1630 ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
1631 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
1632 'big']
1633
1634 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
1635
1636 NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
1637 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1638 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1639
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # Only <noscript> resets nesting, and no tag is considered nestable.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
1652
1654 """This class will push a tag with only a single string child into
1655 the tag's parent as an attribute. The attribute's name is the tag
1656 name, and the value is the string child. An example should give
1657 the flavor of the change:
1658
1659 <foo><bar>baz</bar></foo>
1660 =>
1661 <foo bar="baz"><bar>baz</bar></foo>
1662
1663 You can then access fooTag['bar'] instead of fooTag.barTag.string.
1664
1665 This is, of course, useful for scraping structures that tend to
1666 use subelements instead of attributes, such as SOAP messages. Note
1667 that it modifies its input, so don't print the modified version
1668 out.
1669
1670 I'm not sure how many people really want to use this class; let me
1671 know if you do. Mainly I like the name."""
1672
        # When closing a tag whose only child is a string, copy that
        # string onto the parent as an attribute named after the tag
        # (unless the parent already has such an attribute).
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()  # ensure parent.attrMap is populated
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1683
1684
1685
1686
1687
1688
1689
1690
1691
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
# Optional dependency: chardet auto-detects character encodings.
# UnicodeDammit falls back gracefully when it is not installed.
try:
    import chardet
except ImportError:
    chardet = None

# Optional dependencies: cjkcodecs and iconv_codec add more character
# encodings to the ones supported by stock Python.  Importing them is
# enough; failure to import is silently ignored.
try:
    import cjkcodecs.aliases
except ImportError:
    pass
try:
    import iconv_codec
except ImportError:
    pass
1733
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # Maps nonstandard charset names seen in documents to Python
    # codec names; consulted first by find_codec().
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }
1746
1747 - def __init__(self, markup, overrideEncodings=[],
1748 smartQuotesTo='xml', isHTML=False):
1749 self.declaredHTMLEncoding = None
1750 self.markup, documentEncoding, sniffedEncoding = \
1751 self._detectEncoding(markup, isHTML)
1752 self.smartQuotesTo = smartQuotesTo
1753 self.triedEncodings = []
1754 if markup == '' or isinstance(markup, unicode):
1755 self.originalEncoding = None
1756 self.unicode = unicode(markup)
1757 return
1758
1759 u = None
1760 for proposedEncoding in overrideEncodings:
1761 u = self._convertFrom(proposedEncoding)
1762 if u: break
1763 if not u:
1764 for proposedEncoding in (documentEncoding, sniffedEncoding):
1765 u = self._convertFrom(proposedEncoding)
1766 if u: break
1767
1768
1769 if not u and chardet and not isinstance(self.markup, unicode):
1770 u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1771
1772
1773 if not u:
1774 for proposed_encoding in ("utf-8", "windows-1252"):
1775 u = self._convertFrom(proposed_encoding)
1776 if u: break
1777
1778 self.unicode = u
1779 if not u: self.originalEncoding = None
1780
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        orig = match.group(1)
        sub = self.MS_CHARS.get(orig)
        # A tuple entry is (entity name, hex codepoint); pick the form
        # requested by smartQuotesTo.  Non-tuple entries are literal
        # replacement strings.
        if type(sub) == types.TupleType:
            if self.smartQuotesTo == 'xml':
                sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
            else:
                sub = '&'.encode() + sub[0].encode() + ';'.encode()
        else:
            sub = sub.encode()
        return sub
1794
        # Try to decode self.markup using the proposed encoding; return
        # the Unicode result, or None if the attempt fails.  Each codec
        # is tried at most once (see self.triedEncodings).
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Substitute MS smart quotes (0x80-0x9f) first if the source
        # encoding is one that commonly carries them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            smart_quotes_re = "([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._subMSChar, markup)

        try:
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # Decoding failed; the caller will try the next encoding.
            return None

        return self.markup
1822
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # Strip a Byte Order Mark if present, letting it override the
        # proposed encoding.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            # UTF-16BE BOM
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            # UTF-16LE BOM
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            # UTF-8 BOM
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE BOM
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE BOM
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata
1847
        """Given a document, tries to detect its XML encoding."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            # Sniff the encoding from the first few bytes (BOM or the
            # byte pattern of '<?xm' in various encodings), converting
            # the data to UTF-8 along the way.
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
        # NOTE(review): bare except silently discards any sniffing
        # failure and proceeds with the raw data — confirm intended.
        except:
            xml_encoding_match = None
        # Look for an explicit encoding declaration: the XML prolog,
        # or (for HTML) a META charset attribute.
        xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
        xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
        if not xml_encoding_match and isHTML:
            meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
            regexp = re.compile(meta_re, re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].decode(
                'ascii').lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
        # A sniffed multi-byte encoding beats a generic declared one
        # (e.g. a document declaring 'utf-16' but sniffed as 'utf-16le').
        if sniffed_xml_encoding and \
           (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                             'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                             'utf-16', 'utf-32', 'utf_16', 'utf_32',
                             'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding
1916
1917
        # Resolve a known alias first, then try the name with hyphens
        # removed, then with hyphens turned into underscores; fall back
        # to the charset name itself if no codec is found.
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset
1923
        # Return `charset` unchanged if Python has a codec registered
        # for it, otherwise None (falsy input is returned as-is).
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec
1933
1934 EBCDIC_TO_ASCII_MAP = None
1936 c = self.__class__
1937 if not c.EBCDIC_TO_ASCII_MAP:
1938 emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1939 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1940 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1941 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1942 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1943 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1944 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1945 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1946 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1947 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1948 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1949 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1950 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1951 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1952 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1953 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1954 250,251,252,253,254,255)
1955 import string
1956 c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1957 ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1958 return s.translate(c.EBCDIC_TO_ASCII_MAP)
1959
    # Windows-1252 "smart" characters (0x80-0x9f) mapped to an
    # (HTML entity name, hex codepoint) pair, or to a literal
    # replacement string where no entity applies (see _subMSChar).
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}
1992
1993
1994
1995
1996
# When run as a script: parse HTML from stdin and pretty-print the tree.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
2001