1 """Beautiful Soup
2 Elixir and Tonic
3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
5
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
9
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
15
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
19
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 by stock Python.
24 http://cjkpython.i18n.org/
25
26 Beautiful Soup defines classes for two main parsing strategies:
27
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
30
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
34
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
39 For more than you ever wanted to know about Beautiful Soup, see the
40 documentation:
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
42
43 Here, have some legalese:
44
45 Copyright (c) 2004-2009, Leonard Richardson
46
47 All rights reserved.
48
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
51 met:
52
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
55
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
60
61 * Neither the name of the the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
65
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77
78 """
79 from __future__ import generators
80
81 __author__ = "Leonard Richardson (leonardr@segfault.org)"
82 __version__ = "3.1.0.1"
83 __copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
84 __license__ = "New-style BSD"
85
86 import codecs
87 import markupbase
88 import types
89 import re
90 from HTMLParser import HTMLParser, HTMLParseError
91 try:
92 from htmlentitydefs import name2codepoint
93 except ImportError:
94 name2codepoint = {}
95 try:
96 set
97 except NameError:
98 from sets import Set as set
99
100
101 markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
102
103 DEFAULT_OUTPUT_ENCODING = "utf-8"
104
105
106
def sob(unicode, encoding):
    """Return `unicode` unchanged when `encoding` is None; otherwise
    return it encoded to that encoding.

    NOTE(review): the parameter deliberately keeps its historical name
    even though it shadows the `unicode` builtin; renaming it would
    change the keyword-call interface.
    """
    if encoding is None:
        return unicode
    return unicode.encode(encoding)
113
115 """Contains the navigational information for some part of the page
116 (either a tag or a piece of text)"""
117
118 - def setup(self, parent=None, previous=None):
119 """Sets up the initial relations between this element and
120 other elements."""
121 self.parent = parent
122 self.previous = previous
123 self.next = None
124 self.previousSibling = None
125 self.nextSibling = None
126 if self.parent and self.parent.contents:
127 self.previousSibling = self.parent.contents[-1]
128 self.previousSibling.nextSibling = self
129
def replaceWith(self, replaceWith):
    """Replace this element with `replaceWith` in the tree: extract
    this element and insert `replaceWith` at the slot it occupied."""
    oldParent = self.parent
    myIndex = self.parent.contents.index(self)
    if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
        # We're replacing this element with one of its own siblings.
        index = self.parent.contents.index(replaceWith)
        if index and index < myIndex:
            # The replacement sits before us, so extracting it will
            # shift our own index down by one.
            # NOTE(review): `if index` skips the index-0 case, so a
            # replacement at position 0 never triggers the adjustment;
            # confirm that is intentional before changing.
            myIndex = myIndex - 1
    self.extract()
    oldParent.insert(myIndex, replaceWith)
143
def extract(self):
    """Destructively rips this element out of the tree.

    (NOTE(review): the `def` line was lost in extraction; the header is
    reconstructed to match how the method is invoked elsewhere in this
    file, e.g. from replaceWith and insert.)
    """
    if self.parent:
        try:
            self.parent.contents.remove(self)
        except ValueError:
            pass

    # Find the two elements that would be next to each other if this
    # element (and everything inside it) were removed, and stitch the
    # previous/next chain back together across the gap.
    lastChild = self._lastRecursiveChild()
    nextElement = lastChild.next

    if self.previous:
        self.previous.next = nextElement
    if nextElement:
        nextElement.previous = self.previous
    self.previous = None
    lastChild.next = None

    self.parent = None
    # Detach from the sibling chain as well.
    if self.previousSibling:
        self.previousSibling.nextSibling = self.nextSibling
    if self.nextSibling:
        self.nextSibling.previousSibling = self.previousSibling
    self.previousSibling = self.nextSibling = None
    return self
172
174 "Finds the last element beneath this object to be parsed."
175 lastChild = self
176 while hasattr(lastChild, 'contents') and lastChild.contents:
177 lastChild = lastChild.contents[-1]
178 return lastChild
179
def insert(self, position, newChild):
    """Insert `newChild` into this element's contents at `position`,
    rewiring parent, sibling, and previous/next links accordingly.
    Plain strings are promoted to NavigableString first."""
    if (isinstance(newChild, basestring)
        or isinstance(newChild, unicode)) \
        and not isinstance(newChild, NavigableString):
        newChild = NavigableString(newChild)

    position = min(position, len(self.contents))
    if hasattr(newChild, 'parent') and newChild.parent != None:
        # We're 'inserting' an element that already lives in a tree;
        # it must be extracted from its old location first.
        if newChild.parent == self:
            index = self.find(newChild)
            # NOTE(review): self.find returns an element, not an
            # integer index, yet it is compared against `position`
            # below -- confirm this is the intended behavior.
            if index and index < position:
                # We're moving it further down the list of this
                # object's children, so extracting it shifts our
                # target index down by one.
                position = position - 1
        newChild.extract()

    newChild.parent = self
    previousChild = None
    if position == 0:
        # Becomes the first child: its 'previous' element is us.
        newChild.previousSibling = None
        newChild.previous = self
    else:
        previousChild = self.contents[position-1]
        newChild.previousSibling = previousChild
        newChild.previousSibling.nextSibling = newChild
        newChild.previous = previousChild._lastRecursiveChild()
    if newChild.previous:
        newChild.previous.next = newChild

    newChildsLastElement = newChild._lastRecursiveChild()

    if position >= len(self.contents):
        # Appended at the end: the next element in parse order is the
        # nearest ancestor's next sibling, if any ancestor has one.
        newChild.nextSibling = None

        parent = self
        parentsNextSibling = None
        while not parentsNextSibling:
            parentsNextSibling = parent.nextSibling
            parent = parent.parent
            if not parent:
                # This is the last element in the document.
                break
        if parentsNextSibling:
            newChildsLastElement.next = parentsNextSibling
        else:
            newChildsLastElement.next = None
    else:
        nextChild = self.contents[position]
        newChild.nextSibling = nextChild
        if newChild.nextSibling:
            newChild.nextSibling.previousSibling = newChild
        newChildsLastElement.next = nextChild

    if newChildsLastElement.next:
        newChildsLastElement.next.previous = newChildsLastElement
    self.contents.insert(position, newChild)
239
def append(self, tag):
    """Add `tag` as the last child of this tag."""
    end = len(self.contents)
    self.insert(end, tag)
243
def findNext(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the criteria that was parsed
    after this element in the document, or None."""
    # The attrs={} default is safe: it is only read, never mutated.
    finder = self.findAllNext
    return self._findOne(finder, name, attrs, text, **kwargs)

def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                **kwargs):
    """Return every item matching the criteria that was parsed after
    this element, up to `limit` results."""
    walk = self.nextGenerator
    return self._findAll(name, attrs, text, limit, walk, **kwargs)

def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the nearest later sibling of this element matching the
    criteria, or None."""
    finder = self.findNextSiblings
    return self._findOne(finder, name, attrs, text, **kwargs)

def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                     **kwargs):
    """Return all later siblings of this element matching the
    criteria, up to `limit` results."""
    walk = self.nextSiblingGenerator
    return self._findAll(name, attrs, text, limit, walk, **kwargs)
# Historical alias kept for backward compatibility.
fetchNextSiblings = findNextSiblings
269
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the criteria that was parsed
    before this element in the document, or None."""
    finder = self.findAllPrevious
    return self._findOne(finder, name, attrs, text, **kwargs)

def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
    """Return every item matching the criteria that was parsed before
    this element, up to `limit` results."""
    walk = self.previousGenerator
    return self._findAll(name, attrs, text, limit, walk, **kwargs)
# Historical alias kept for backward compatibility.
fetchPrevious = findAllPrevious

def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the nearest earlier sibling of this element matching
    the criteria, or None."""
    finder = self.findPreviousSiblings
    return self._findOne(finder, name, attrs, text, **kwargs)

def findPreviousSiblings(self, name=None, attrs={}, text=None,
                         limit=None, **kwargs):
    """Return all earlier siblings of this element matching the
    criteria, up to `limit` results."""
    walk = self.previousSiblingGenerator
    return self._findAll(name, attrs, text, limit, walk, **kwargs)
# Historical alias kept for backward compatibility.
fetchPreviousSiblings = findPreviousSiblings
296
def findParent(self, name=None, attrs={}, **kwargs):
    """Returns the closest parent of this Tag that matches the given
    criteria, or None.

    Fix: keyword filters (e.g. id='x') were accepted and then silently
    dropped; they are now forwarded to findParents like every other
    find* method does.
    """
    r = None
    l = self.findParents(name, attrs, 1, **kwargs)
    if l:
        r = l[0]
    return r
307
def findParents(self, name=None, attrs={}, limit=None, **kwargs):
    """Return the parents of this element matching the criteria, up
    to `limit` results."""
    walk = self.parentGenerator
    return self._findAll(name, attrs, None, limit, walk, **kwargs)
# Historical alias kept for backward compatibility.
fetchParents = findParents
315
316
317
318 - def _findOne(self, method, name, attrs, text, **kwargs):
319 r = None
320 l = method(name, attrs, text, 1, **kwargs)
321 if l:
322 r = l[0]
323 return r
324
def _findAll(self, name, attrs, text, limit, generator, **kwargs):
    "Iterates over a generator looking for things that match."

    if isinstance(name, SoupStrainer):
        # The caller supplied a ready-made matcher; use it directly.
        strainer = name
    else:
        # Build a SoupStrainer out of the simple criteria.
        strainer = SoupStrainer(name, attrs, text, **kwargs)
    results = ResultSet(strainer)
    g = generator()
    while True:
        try:
            i = g.next()  # py2 generator protocol (next(g) on py3)
        except StopIteration:
            break
        if i:
            found = strainer.search(i)
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
    return results
347
348
349
def nextGenerator(self):
    """Yield each element parsed after this one, ending with None."""
    i = self
    while i:
        i = i.next
        yield i

def nextSiblingGenerator(self):
    """Yield each later sibling, ending with None.
    (Header reconstructed; the original `def` line was lost.)"""
    i = self
    while i:
        i = i.nextSibling
        yield i

def previousGenerator(self):
    """Yield each element parsed before this one, ending with None.
    (Header reconstructed; the original `def` line was lost.)"""
    i = self
    while i:
        i = i.previous
        yield i

def previousSiblingGenerator(self):
    """Yield each earlier sibling, ending with None.
    (Header reconstructed; the original `def` line was lost.)"""
    i = self
    while i:
        i = i.previousSibling
        yield i

def parentGenerator(self):
    """Yield each ancestor of this element, ending with None."""
    i = self
    while i:
        i = i.parent
        yield i
379
380
def substituteEncoding(self, str, encoding=None):
    """Replace the %SOUP-ENCODING% placeholder in `str` with
    `encoding` (defaulting to utf-8)."""
    actual = encoding or "utf-8"
    return str.replace("%SOUP-ENCODING%", actual)
384
def toEncoding(self, s, encoding=None):
    """Encodes an object to a string in some encoding, or to Unicode.
    ."""
    if isinstance(s, unicode):
        # Already Unicode: encode only when a target encoding is given.
        if encoding:
            s = s.encode(encoding)
    elif isinstance(s, str):
        # Byte string (py2): re-encode, or promote to Unicode.
        if encoding:
            s = s.encode(encoding)
        else:
            s = unicode(s)
    else:
        # Arbitrary object: stringify, then recurse to apply the rules
        # above to the resulting string.
        if encoding:
            s = self.toEncoding(str(s), encoding)
        else:
            s = unicode(s)
    return s
402
404
406 """Create a new NavigableString.
407
408 When unpickling a NavigableString, this method is called with
409 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
410 passed in to the superclass's __new__ or the superclass won't know
411 how to handle non-ASCII characters.
412 """
413 if isinstance(value, unicode):
414 return unicode.__new__(cls, value)
415 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
416
418 return (unicode(self),)
419
421 """text.string gives you text. This is for backwards
422 compatibility for Navigable*String, but for CData* it lets you
423 get the string without the CData wrapper."""
424 if attr == 'string':
425 return self
426 else:
427 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
428
431
434
class CData(NavigableString):
    """A CDATA section; renders its text wrapped in <![CDATA[ ... ]]>."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        # NOTE(review): the method header was lost in extraction; it is
        # reconstructed to match the sibling NavigableString subclasses,
        # which take an eventualEncoding argument.
        return u'<![CDATA[' + self + u']]>'
439
441
443 output = self
444 if u'%SOUP-ENCODING%' in output:
445 output = self.substituteEncoding(output, eventualEncoding)
446 return u'<?' + output + u'?>'
447
451
454 return u'<!' + self + u'>'
455
456 -class Tag(PageElement):
457
458 """Represents a found HTML tag with its attributes and contents."""
459
461 "Cheap function to invert a hash."
462 i = {}
463 for k,v in h.items():
464 i[v] = k
465 return i
466
467 XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
468 "quot" : '"',
469 "amp" : "&",
470 "lt" : "<",
471 "gt" : ">" }
472
473 XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
474
476 """Used in a call to re.sub to replace HTML, XML, and numeric
477 entities with the appropriate Unicode characters. If HTML
478 entities are being converted, any unrecognized entities are
479 escaped."""
480 x = match.group(1)
481 if self.convertHTMLEntities and x in name2codepoint:
482 return unichr(name2codepoint[x])
483 elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
484 if self.convertXMLEntities:
485 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
486 else:
487 return u'&%s;' % x
488 elif len(x) > 0 and x[0] == '#':
489
490 if len(x) > 1 and x[1] == 'x':
491 return unichr(int(x[2:], 16))
492 else:
493 return unichr(int(x[1:]))
494
495 elif self.escapeUnrecognizedEntities:
496 return u'&%s;' % x
497 else:
498 return u'&%s;' % x
499
def __init__(self, parser, name, attrs=None, parent=None,
             previous=None):
    "Basic constructor."

    # Only the parser's class is stored, not the parser itself, so the
    # Tag does not keep the parser alive.
    self.parserClass = parser.__class__
    self.isSelfClosing = parser.isSelfClosingTag(name)
    self.name = name
    if attrs == None:
        attrs = []
    self.attrs = attrs          # list of (key, value) pairs
    self.contents = []          # child elements, in document order
    self.setup(parent, previous)
    self.hidden = False
    self.containsSubstitutions = False
    # Entity-conversion policy is inherited from the parser.
    self.convertHTMLEntities = parser.convertHTMLEntities
    self.convertXMLEntities = parser.convertXMLEntities
    self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

    def convert(kval):
        "Converts HTML, XML and numeric entities in the attribute value."
        k, val = kval
        if val is None:
            return kval
        return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
                          self._convertEntities, val))
    self.attrs = map(convert, self.attrs)  # py2 map returns a list
528
def get(self, key, default=None):
    """Return the value of attribute `key`, or `default` when the tag
    has no such attribute."""
    attrMap = self._getAttrMap()
    return attrMap.get(key, default)
534
536 return self._getAttrMap().has_key(key)
537
539 """tag[key] returns the value of the 'key' attribute for the tag,
540 and throws an exception if it's not there."""
541 return self._getAttrMap()[key]
542
544 "Iterating over a tag iterates over its contents."
545 return iter(self.contents)
546
548 "The length of a tag is the length of its list of contents."
549 return len(self.contents)
550
552 return x in self.contents
553
555 "A tag is non-None even if it has no contents."
556 return True
557
559 """Setting tag[key] sets the value of the 'key' attribute for the
560 tag."""
561 self._getAttrMap()
562 self.attrMap[key] = value
563 found = False
564 for i in range(0, len(self.attrs)):
565 if self.attrs[i][0] == key:
566 self.attrs[i] = (key, value)
567 found = True
568 if not found:
569 self.attrs.append((key, value))
570 self._getAttrMap()[key] = value
571
573 "Deleting tag[key] deletes all 'key' attributes for the tag."
574 for item in self.attrs:
575 if item[0] == key:
576 self.attrs.remove(item)
577
578
579 self._getAttrMap()
580 if self.attrMap.has_key(key):
581 del self.attrMap[key]
582
584 """Calling a tag like a function is the same as calling its
585 findAll() method. Eg. tag('a') returns a list of all the A tags
586 found within this tag."""
587 return apply(self.findAll, args, kwargs)
588
590
591 if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
592 return self.find(tag[:-3])
593 elif tag.find('__') != 0:
594 return self.find(tag)
595 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
596
598 """Returns true iff this tag has the same name, the same attributes,
599 and the same contents (recursively) as the given tag.
600
601 NOTE: right now this will return false if two tags have the
602 same attributes in a different order. Should this be fixed?"""
603 if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
604 return False
605 for i in range(0, len(self.contents)):
606 if self.contents[i] != other.contents[i]:
607 return False
608 return True
609
611 """Returns true iff this tag is not identical to the other tag,
612 as defined in __eq__."""
613 return not self == other
614
616 """Renders this tag as a string."""
617 return self.decode(eventualEncoding=encoding)
618
# Matches a bare '<' or '>', or an '&' that does not begin a decimal,
# hex, or named entity reference.
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                       + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                       + ")")
622
624 """Used with a regular expression to substitute the
625 appropriate XML entity for an XML special character."""
626 return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
627
630
633
636 return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
637
640 """Returns a string or Unicode representation of this tag and
641 its contents. To get Unicode, pass None for encoding."""
642
643 attrs = []
644 if self.attrs:
645 for key, val in self.attrs:
646 fmt = '%s="%s"'
647 if isString(val):
648 if (self.containsSubstitutions
649 and eventualEncoding is not None
650 and '%SOUP-ENCODING%' in val):
651 val = self.substituteEncoding(val, eventualEncoding)
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668 if '"' in val:
669 fmt = "%s='%s'"
670 if "'" in val:
671
672
673 val = val.replace("'", "&squot;")
674
675
676
677
678
679 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
680 if val is None:
681
682 decoded = key
683 else:
684 decoded = fmt % (key, val)
685 attrs.append(decoded)
686 close = ''
687 closeTag = ''
688 if self.isSelfClosing:
689 close = ' /'
690 else:
691 closeTag = '</%s>' % self.name
692
693 indentTag, indentContents = 0, 0
694 if prettyPrint:
695 indentTag = indentLevel
696 space = (' ' * (indentTag-1))
697 indentContents = indentTag + 1
698 contents = self.decodeContents(prettyPrint, indentContents,
699 eventualEncoding)
700 if self.hidden:
701 s = contents
702 else:
703 s = []
704 attributeString = ''
705 if attrs:
706 attributeString = ' ' + ' '.join(attrs)
707 if prettyPrint:
708 s.append(space)
709 s.append('<%s%s%s>' % (self.name, attributeString, close))
710 if prettyPrint:
711 s.append("\n")
712 s.append(contents)
713 if prettyPrint and contents and contents[-1] != "\n":
714 s.append("\n")
715 if prettyPrint and closeTag:
716 s.append(space)
717 s.append(closeTag)
718 if prettyPrint and closeTag and self.nextSibling:
719 s.append("\n")
720 s = ''.join(s)
721 return s
722
def decompose(self):
    """Recursively destroys the contents of this tree.
    (Header reconstructed; the original `def` line was lost.)"""
    # Copy the contents list first: extract()/decompose() mutate it.
    contents = [i for i in self.contents]
    for i in contents:
        if isinstance(i, Tag):
            i.decompose()
        else:
            i.extract()
    self.extract()
732
734 return self.encode(encoding, True)
735
def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Render this tag's contents as a byte string in `encoding`."""
    decoded = self.decodeContents(prettyPrint, indentLevel)
    return decoded.encode(encoding)
739
def decodeContents(self, prettyPrint=False, indentLevel=0,
                   eventualEncoding=DEFAULT_OUTPUT_ENCODING):
    """Renders the contents of this tag as a string in the given
    encoding. If encoding is None, returns a Unicode string.."""
    s=[]
    for c in self:
        text = None
        if isinstance(c, NavigableString):
            text = c.decodeGivenEventualEncoding(eventualEncoding)
        elif isinstance(c, Tag):
            # Child tags render themselves recursively.
            s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
        if text and prettyPrint:
            # Pretty-printing collapses surrounding whitespace.
            text = text.strip()
        if text:
            if prettyPrint:
                s.append(" " * (indentLevel-1))
            s.append(text)
            if prettyPrint:
                s.append("\n")
    return ''.join(s)
760
761
762
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Return only the first child of this Tag matching the given
    criteria, or None when nothing matches."""
    hits = self.findAll(name, attrs, recursive, text, 1, **kwargs)
    if hits:
        return hits[0]
    return None
# Historical alias kept for backward compatibility.
findChild = find
773
def findAll(self, name=None, attrs={}, recursive=True, text=None,
            limit=None, **kwargs):
    """Extracts a list of Tag objects that match the given
    criteria. You can specify the name of the Tag and any
    attributes you want the Tag to have.

    The value of a key-value pair in the 'attrs' map can be a
    string, a list of strings, a regular expression object, or a
    callable that takes a string and returns whether or not the
    string matches for some custom definition of 'matches'. The
    same is true of the tag name."""
    if recursive:
        walk = self.recursiveChildGenerator
    else:
        walk = self.childGenerator
    return self._findAll(name, attrs, text, limit, walk, **kwargs)
# Historical alias kept for backward compatibility.
findChildren = findAll
790
791
# Pre-3.x method names, kept so very old client code keeps working.
first = find
fetch = findAll
794
def fetchText(self, text=None, recursive=True, limit=None):
    """Deprecated: find all text matching `text`."""
    criteria = {"text": text, "recursive": recursive, "limit": limit}
    return self.findAll(**criteria)

def firstText(self, text=None, recursive=True):
    """Deprecated: find the first text matching `text`."""
    criteria = {"text": text, "recursive": recursive}
    return self.find(**criteria)
800
801
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Render this tag's contents: Unicode when `encoding` is None,
    a byte string in `encoding` otherwise."""
    if encoding is None:
        return self.decodeContents(prettyPrint, indentLevel, encoding)
    return self.encodeContents(encoding, prettyPrint, indentLevel)
808
809
810
811
813 """Initializes a map representation of this tag's attributes,
814 if not already initialized."""
815 if not getattr(self, 'attrMap'):
816 self.attrMap = {}
817 for (key, value) in self.attrs:
818 self.attrMap[key] = value
819 return self.attrMap
820
821
def recursiveChildGenerator(self):
    """Yield every element beneath this tag, in document order.

    (Header reconstructed; the original `def` line was lost. Also,
    plain `return` replaces the bare `raise StopIteration`, which
    PEP 479 turned into a RuntimeError inside generators on
    Python 3.7+; `return` is equivalent on Python 2 as well.)
    """
    if not len(self.contents):
        return
    stopNode = self._lastRecursiveChild().next
    current = self.contents[0]
    while current is not stopNode:
        yield current
        current = current.next

def childGenerator(self):
    """Yield this tag's direct children, in order."""
    if not len(self.contents):
        return
    current = self.contents[0]
    while current:
        yield current
        current = current.nextSibling
839
840
842 """Encapsulates a number of ways of matching a markup element (tag or
843 text)."""
844
def __init__(self, name=None, attrs={}, text=None, **kwargs):
    """Build a matcher. A bare string for `attrs` is shorthand for
    matching on the 'class' attribute; extra keyword arguments are
    folded into the attribute map."""
    self.name = name
    if isString(attrs):
        kwargs['class'] = attrs
        attrs = None
    if kwargs:
        if attrs:
            merged = attrs.copy()
            merged.update(kwargs)
            attrs = merged
        else:
            attrs = kwargs
    self.attrs = attrs
    self.text = text
858
860 if self.text:
861 return self.text
862 else:
863 return "%s|%s" % (self.name, self.attrs)
864
def searchTag(self, markupName=None, markupAttrs={}):
    """Match this strainer against a tag, or against a (name, attrs)
    pair seen during parsing. Returns the matched object or None."""
    found = None
    markup = None
    if isinstance(markupName, Tag):
        # We were handed a built Tag; its mapping interface doubles as
        # the attribute source below.
        markup = markupName
        markupAttrs = markup
    # A callable name criterion is invoked with (name, attrs) only when
    # we were given raw tag data rather than a built Tag.
    callFunctionWithTagData = callable(self.name) \
                              and not isinstance(markupName, Tag)

    if (not self.name) \
           or callFunctionWithTagData \
           or (markup and self._matches(markup, self.name)) \
           or (not markup and self._matches(markupName, self.name)):
        if callFunctionWithTagData:
            match = self.name(markupName, markupAttrs)
        else:
            match = True
            markupAttrMap = None
            for attr, matchAgainst in self.attrs.items():
                if not markupAttrMap:
                    # Lazily build a dict view of the attributes once.
                    if hasattr(markupAttrs, 'get'):
                        markupAttrMap = markupAttrs
                    else:
                        markupAttrMap = {}
                        for k, v in markupAttrs:
                            markupAttrMap[k] = v
                attrValue = markupAttrMap.get(attr)
                if not self._matches(attrValue, matchAgainst):
                    match = False
                    break
        if match:
            if markup:
                found = markup
            else:
                found = markupName
    return found
901
903
904 found = None
905
906
907 if isList(markup) and not isinstance(markup, Tag):
908 for element in markup:
909 if isinstance(element, NavigableString) \
910 and self.search(element):
911 found = element
912 break
913
914
915 elif isinstance(markup, Tag):
916 if not self.text:
917 found = self.searchTag(markup)
918
919 elif isinstance(markup, NavigableString) or \
920 isString(markup):
921 if self._matches(markup, self.text):
922 found = markup
923 else:
924 raise Exception, "I don't know how to match against a %s" \
925 % markup.__class__
926 return found
927
def _matches(self, markup, matchAgainst):
    """Core matching predicate: compare one piece of markup against one
    criterion (True, callable, regex, list, dict, or string)."""
    result = False
    if matchAgainst == True and type(matchAgainst) == types.BooleanType:
        # Criterion True matches anything that exists; the explicit
        # type check keeps the number 1 from being treated as True.
        result = markup != None
    elif callable(matchAgainst):
        result = matchAgainst(markup)
    else:
        # Custom match methods take the tag as an argument, but all
        # other ways of matching match against the tag name as a
        # string.
        if isinstance(markup, Tag):
            markup = markup.name
        if markup is not None and not isString(markup):
            markup = unicode(markup)
        # At this point markup is either a string or None.
        if hasattr(matchAgainst, 'match'):
            # It's a regexp object.
            result = markup and matchAgainst.search(markup)
        elif (isList(matchAgainst)
              and (markup is not None or not isString(matchAgainst))):
            result = markup in matchAgainst
        elif hasattr(matchAgainst, 'items'):
            # NOTE(review): markup.has_key(matchAgainst) asks a string
            # for a dict key -- the arguments look inverted; confirm.
            result = markup.has_key(matchAgainst)
        elif matchAgainst and isString(markup):
            # Normalize the criterion to the same string type before
            # the equality check below.
            if isinstance(markup, unicode):
                matchAgainst = unicode(matchAgainst)
            else:
                matchAgainst = str(matchAgainst)

        if not result:
            result = matchAgainst == markup
    return result
960
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it. (Class and __init__ headers reconstructed; they
    were lost in extraction.)"""
    def __init__(self, source):
        # Fix: list.__init__([]) initialized a throwaway list object;
        # passing self is what was intended (the observable result --
        # an empty list -- is unchanged, but no longer by accident).
        list.__init__(self)
        self.source = source
967
968
969
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike.
    (Header reconstructed; the original `def` line was lost.)"""
    return ((hasattr(l, '__iter__') and not isString(l))
            or (type(l) in (types.ListType, types.TupleType)))
975
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike.
    (Header reconstructed; the original `def` line was lost. On
    Python 3 the NameError raised by `unicode` falls through to the
    plain str check.)"""
    try:
        return isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        return isinstance(s, str)
983
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps.
    (Header reconstructed; the original `def` line was lost.)"""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # It's a map: merge it in.
            for k, v in portion.items():
                built[k] = v
        elif isList(portion) and not isString(portion):
            # It's a list: map each item to the default.
            for k in portion:
                built[k] = default
        else:
            # It's a scalar: map it to the default.
            built[portion] = default
    return built
1002
1003
1004
1006
1008 HTMLParser.__init__(self)
1009 self.soup = soup
1010
1011
1012
1018
1021
1024
1026 """Adds a certain piece of text to the tree as a NavigableString
1027 subclass."""
1028 self.soup.endData()
1029 self.handle_data(text)
1030 self.soup.endData(subclass)
1031
def handle_pi(self, text):
    """Handle a processing instruction as a ProcessingInstruction
    object, possibly one with a %SOUP-ENCODING% slot into which an
    encoding will be plugged later.
    (Header reconstructed; the original `def` line was lost.)"""
    if text[:3] == "xml":
        text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
    self._toStringSubclass(text, ProcessingInstruction)
1039
1043
def handle_charref(self, ref):
    "Handle character references as data."
    # NOTE(review): the original `def` line was lost in extraction and
    # is reconstructed to match HTMLParser's handler signature.
    if self.soup.convertEntities:
        data = unichr(int(ref))
    else:
        data = '&#%s;' % ref
    self.handle_data(data)
1051
def handle_entityref(self, ref):
    """Handle entity references as data, possibly converting known
    HTML and/or XML entity references to the corresponding Unicode
    characters.
    (Header reconstructed; the original `def` line was lost.)"""
    data = None
    if self.soup.convertHTMLEntities:
        try:
            data = unichr(name2codepoint[ref])
        except KeyError:
            pass

    if not data and self.soup.convertXMLEntities:
        data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

    if not data and self.soup.convertHTMLEntities and \
        not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
        # An unrecognized entity while converting HTML entities: pass
        # it through with a bare ampersand and no trailing semicolon,
        # mirroring how browsers treat unknown references.
        data = "&%s" % ref
    if not data:
        # No conversion requested or possible: reproduce the original
        # reference verbatim.
        data = "&%s;" % ref
    self.handle_data(data)
1094
def handle_decl(self, data):
    "Handle DOCTYPEs and the like as Declaration objects."
    # NOTE(review): the original `def` line was lost in extraction and
    # is reconstructed to match HTMLParser's handler signature.
    self._toStringSubclass(data, Declaration)
1098
def parse_declaration(self, i):
    """Treat a bogus SGML declaration as raw data. Treat a CDATA
    declaration as a CData object.
    (Header reconstructed; the original `def` line was lost.)"""
    j = None
    if self.rawdata[i:i+9] == '<![CDATA[':
        # Consume up to the closing ']]>' (or end of input) and store
        # the payload as a CData node.
        k = self.rawdata.find(']]>', i)
        if k == -1:
            k = len(self.rawdata)
        data = self.rawdata[i+9:k]
        j = k + 3
        self._toStringSubclass(data, CData)
    else:
        try:
            j = HTMLParser.parse_declaration(self, i)
        except HTMLParseError:
            # Malformed declaration: emit it verbatim as text and
            # resume after it.
            toHandle = self.rawdata[i:]
            self.handle_data(toHandle)
            j = i + len(toHandle)
    return j
1118
1119
1121
1122 """This class contains the basic parser and search code. It defines
1123 a parser that knows nothing about tag behavior except for the
1124 following:
1125
1126 You can't close a tag without closing all the tags it encloses.
1127 That is, "<foo><bar></foo>" actually means
1128 "<foo><bar></bar></foo>".
1129
1130 [Another possible explanation is "<foo><bar /></foo>", but since
1131 this class defines no SELF_CLOSING_TAGS, it will never use that
1132 explanation.]
1133
1134 This class is useful for parsing XML or made-up markup languages,
1135 or when BeautifulSoup makes an assumption counter to what you were
1136 expecting."""
1137
1138 SELF_CLOSING_TAGS = {}
1139 NESTABLE_TAGS = {}
1140 RESET_NESTING_TAGS = {}
1141 QUOTE_TAGS = {}
1142 PRESERVE_WHITESPACE_TAGS = []
1143
1144 MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
1145 lambda x: x.group(1) + ' />'),
1146 (re.compile('<!\s+([^<>]*)>'),
1147 lambda x: '<!' + x.group(1) + '>')
1148 ]
1149
1150 ROOT_TAG_NAME = u'[document]'
1151
1152 HTML_ENTITIES = "html"
1153 XML_ENTITIES = "xml"
1154 XHTML_ENTITIES = "xhtml"
1155
1156 ALL_ENTITIES = XHTML_ENTITIES
1157
1158
1159
1160
1161
1162 STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1163
1164 - def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1165 markupMassage=True, smartQuotesTo=XML_ENTITIES,
1166 convertEntities=None, selfClosingTags=None, isHTML=False,
1167 builder=HTMLParserBuilder):
1168 """The Soup object is initialized as the 'root tag', and the
1169 provided markup (which can be a string or a file-like object)
1170 is fed into the underlying parser.
1171
1172 HTMLParser will process most bad HTML, and the BeautifulSoup
1173 class has some tricks for dealing with some HTML that kills
1174 HTMLParser, but Beautiful Soup can nonetheless choke or lose data
1175 if your data uses self-closing tags or declarations
1176 incorrectly.
1177
1178 By default, Beautiful Soup uses regexes to sanitize input,
1179 avoiding the vast majority of these problems. If the problems
1180 don't apply to you, pass in False for markupMassage, and
1181 you'll get better performance.
1182
1183 The default parser massage techniques fix the two most common
1184 instances of invalid HTML that choke HTMLParser:
1185
1186 <br/> (No space between name of closing tag and tag close)
1187 <! --Comment--> (Extraneous whitespace in declaration)
1188
1189 You can pass in a custom list of (RE object, replace method)
1190 tuples to get Beautiful Soup to scrub your input the way you
1191 want."""
1192
1193 self.parseOnlyThese = parseOnlyThese
1194 self.fromEncoding = fromEncoding
1195 self.smartQuotesTo = smartQuotesTo
1196 self.convertEntities = convertEntities
1197
1198
1199 if self.convertEntities:
1200
1201
1202
1203 self.smartQuotesTo = None
1204 if convertEntities == self.HTML_ENTITIES:
1205 self.convertXMLEntities = False
1206 self.convertHTMLEntities = True
1207 self.escapeUnrecognizedEntities = True
1208 elif convertEntities == self.XHTML_ENTITIES:
1209 self.convertXMLEntities = True
1210 self.convertHTMLEntities = True
1211 self.escapeUnrecognizedEntities = False
1212 elif convertEntities == self.XML_ENTITIES:
1213 self.convertXMLEntities = True
1214 self.convertHTMLEntities = False
1215 self.escapeUnrecognizedEntities = False
1216 else:
1217 self.convertXMLEntities = False
1218 self.convertHTMLEntities = False
1219 self.escapeUnrecognizedEntities = False
1220
1221 self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1222 self.builder = builder(self)
1223 self.reset()
1224
1225 if hasattr(markup, 'read'):
1226 markup = markup.read()
1227 self.markup = markup
1228 self.markupMassage = markupMassage
1229 try:
1230 self._feed(isHTML=isHTML)
1231 except StopParsing:
1232 pass
1233 self.markup = None
1234 self.builder = None
1235
    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert the stored markup to Unicode, optionally massage it,
        and run it through the underlying builder to construct the tree."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # markupMassage may be True (use the defaults) or a
                # caller-supplied list of (regex, replacement) tuples.
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # The massage list is discarded after use so the soup
                # object can be deepcopied later on (some Python
                # installations can't copy compiled regexes).
                del(self.markupMassage)
        self.builder.reset()

        self.builder.feed(markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()
1268
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        # Checks both the class-level map and the per-instance map
        # built from the selfClosingTags constructor argument.
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)
1274
        # Re-initialize this soup as the hidden root '[document]' tag
        # with all parse state cleared.
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1  # the root tag is never rendered in output
        self.builder.reset()
        self.currentData = []   # pending text, flushed by endData()
        self.currentTag = None
        self.tagStack = []      # stack of currently-open tags
        self.quoteStack = []    # stack of open QUOTE_TAGS (literal text)
        self.pushTag(self)      # the soup itself is the root of the stack
1284
        # Close the innermost open tag and return the new current tag.
        tag = self.tagStack.pop()

        # Tags with just one string child get that child as a 'string'
        # property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0].
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableString):
            self.currentTag.string = self.currentTag.contents[0]

        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag
1298
1300
1301 if self.currentTag:
1302 self.currentTag.contents.append(tag)
1303 self.tagStack.append(tag)
1304 self.currentTag = self.tagStack[-1]
1305
        # Flush the buffered text runs into a single NavigableString
        # (or other containerClass) node attached to the current tag.
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Whitespace-only text collapses to a single '\n' or ' ',
            # unless we're inside a PRESERVE_WHITESPACE_TAGS tag
            # (e.g. <pre>), where it is kept verbatim.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # With a parse-only filter active, top-level text that the
            # filter rejects is dropped entirely.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)
1327
1328
1329 - def _popToTag(self, name, inclusivePop=True):
1330 """Pops the tag stack up to and including the most recent
1331 instance of the given tag. If inclusivePop is false, pops the tag
1332 stack up to but *not* including the most recent instqance of
1333 the given tag."""
1334
1335 if name == self.ROOT_TAG_NAME:
1336 return
1337
1338 numPops = 0
1339 mostRecentTag = None
1340 for i in range(len(self.tagStack)-1, 0, -1):
1341 if name == self.tagStack[i].name:
1342 numPops = len(self.tagStack)-i
1343 break
1344 if not inclusivePop:
1345 numPops = numPops - 1
1346
1347 for i in range(0, numPops):
1348 mostRecentTag = self.popTag()
1349 return mostRecentTag
1350
1352
1353 """We need to pop up to the previous tag of this type, unless
1354 one of this tag's nesting reset triggers comes between this
1355 tag and the previous tag of this type, OR unless this tag is a
1356 generic nesting trigger and another generic nesting trigger
1357 comes between this tag and the previous tag of this type.
1358
1359 Examples:
1360 <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1361 <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1362 <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1363
1364 <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1365 <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1366 <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1367 """
1368
1369 nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1370 isNestable = nestingResetTriggers != None
1371 isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1372 popTo = None
1373 inclusive = True
1374 for i in range(len(self.tagStack)-1, 0, -1):
1375 p = self.tagStack[i]
1376 if (not p or p.name == name) and not isNestable:
1377
1378
1379 popTo = name
1380 break
1381 if (nestingResetTriggers != None
1382 and p.name in nestingResetTriggers) \
1383 or (nestingResetTriggers == None and isResetNesting
1384 and self.RESET_NESTING_TAGS.has_key(p.name)):
1385
1386
1387
1388
1389
1390 popTo = p.name
1391 inclusive = False
1392 break
1393 p = p.parent
1394 if popTo:
1395 self._popToTag(popTo, inclusive)
1396
1398
1399 if self.quoteStack:
1400
1401
1402 attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1403 self.handle_data('<%s%s>' % (name, attrs))
1404 return
1405 self.endData()
1406
1407 if not self.isSelfClosingTag(name) and not selfClosing:
1408 self._smartPop(name)
1409
1410 if self.parseOnlyThese and len(self.tagStack) <= 1 \
1411 and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1412 return
1413
1414 tag = Tag(self, name, attrs, self.currentTag, self.previous)
1415 if self.previous:
1416 self.previous.next = tag
1417 self.previous = tag
1418 self.pushTag(tag)
1419 if selfClosing or self.isSelfClosingTag(name):
1420 self.popTag()
1421 if name in self.QUOTE_TAGS:
1422
1423 self.quoteStack.append(name)
1424 self.literal = 1
1425 return tag
1426
1428
1429 if self.quoteStack and self.quoteStack[-1] != name:
1430
1431
1432 self.handle_data('</%s>' % name)
1433 return
1434 self.endData()
1435 self._popToTag(name)
1436 if self.quoteStack and self.quoteStack[-1] == name:
1437 self.quoteStack.pop()
1438 self.literal = (len(self.quoteStack) > 0)
1439
        self.currentData.append(data)  # buffer text; endData() flushes it
1442
1445
1446
1448
1449 """This parser knows the following facts about HTML:
1450
1451 * Some tags have no closing tag and should be interpreted as being
1452 closed as soon as they are encountered.
1453
1454 * The text inside some tags (ie. 'script') may contain tags which
1455 are not really part of the document and which should be parsed
1456 as text, not tags. If you want to parse the text as tags, you can
1457 always fetch it and parse it explicitly.
1458
1459 * Tag nesting rules:
1460
    Most tags can't be nested at all. For instance, the occurrence of
1462 a <p> tag should implicitly close the previous <p> tag.
1463
1464 <p>Para1<p>Para2
1465 should be transformed into:
1466 <p>Para1</p><p>Para2
1467
    Some tags can be nested arbitrarily. For instance, the occurrence
1469 of a <blockquote> tag should _not_ implicitly close the previous
1470 <blockquote> tag.
1471
1472 Alice said: <blockquote>Bob said: <blockquote>Blah
1473 should NOT be transformed into:
1474 Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1475
1476 Some tags can be nested, but the nesting is reset by the
1477 interposition of other tags. For instance, a <tr> tag should
1478 implicitly close the previous <tr> tag within the same <table>,
1479 but not close a <tr> tag in another table.
1480
1481 <table><tr>Blah<tr>Blah
1482 should be transformed into:
1483 <table><tr>Blah</tr><tr>Blah
1484 but,
1485 <tr>Blah<table><tr>Blah
1486 should NOT be transformed into
1487 <tr>Blah<table></tr><tr>Blah
1488
1489 Differing assumptions about tag nesting rules are a major source
1490 of problems with the BeautifulSoup class. If BeautifulSoup is not
1491 treating as nestable a tag your page author treats as nestable,
1492 try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1493 BeautifulStoneSoup before writing your own subclass."""
1494
1500
    # Tags that never take a closing tag.
    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    # Whitespace inside these tags is kept verbatim (see endData).
    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    # The contents of these tags are treated as literal text, not markup.
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    # Inline tags that may contain another tag of the same type
    # without the inner one implicitly closing the outer one.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    # Block tags that may nest arbitrarily within themselves.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    # List tags: the value is the list of ancestor tags that reset
    # nesting for the key (e.g. a new <li> closes the previous <li>
    # unless a <ul>/<ol> intervenes).
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    # Table tags follow the same reset-trigger scheme as lists.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    # Block tags that never nest within themselves.
    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    # Encountering any of these tags resets the nesting of generic
    # nesting-trigger tags (see _smartPop).
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Matches the charset declaration inside a META tag's
    # http-equiv="Content-Type" content attribute (see start_meta).
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
1552
1583 newAttr = self.CHARSET_RE.sub(rewrite, contentType)
1584 attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1585 newAttr)
1586 tagNeedsEncodingSubstitution = True
1587 else:
1588
1589
1590 newCharset = match.group(3)
1591 if newCharset and newCharset != self.originalEncoding:
1592 self.declaredHTMLEncoding = newCharset
1593 self._feed(self.declaredHTMLEncoding)
1594 raise StopParsing
1595 pass
1596 tag = self.unknown_starttag("meta", attrs)
1597 if tag and tagNeedsEncodingSubstitution:
1598 tag.containsSubstitutions = True
1599
1600
1603
1605
1606 """The BeautifulSoup class is oriented towards skipping over
1607 common HTML errors like unclosed tags. However, sometimes it makes
1608 errors of its own. For instance, consider this fragment:
1609
1610 <b>Foo<b>Bar</b></b>
1611
1612 This is perfectly valid (if bizarre) HTML. However, the
1613 BeautifulSoup class will implicitly close the first b tag when it
1614 encounters the second 'b'. It will think the author wrote
1615 "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
1616 there's no real-world reason to bold something that's already
1617 bold. When it encounters '</b></b>' it will close two more 'b'
1618 tags, for a grand total of three tags closed instead of two. This
1619 can throw off the rest of your document structure. The same is
1620 true of a number of other tags, listed below.
1621
1622 It's much more common for someone to forget to close a 'b' tag
1623 than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
1625 case: where you can't believe someone wrote what they did, but
1626 it's valid HTML and BeautifulSoup screwed up by assuming it
1627 wouldn't be."""
1628
1629 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1630 ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
1631 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
1632 'big']
1633
1634 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
1635
1636 NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
1637 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1638 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1639
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # Only <noscript> resets nesting, and no tag is considered nestable.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
1652
1654 """This class will push a tag with only a single string child into
1655 the tag's parent as an attribute. The attribute's name is the tag
1656 name, and the value is the string child. An example should give
1657 the flavor of the change:
1658
1659 <foo><bar>baz</bar></foo>
1660 =>
1661 <foo bar="baz"><bar>baz</bar></foo>
1662
1663 You can then access fooTag['bar'] instead of fooTag.barTag.string.
1664
1665 This is, of course, useful for scraping structures that tend to
1666 use subelements instead of attributes, such as SOAP messages. Note
1667 that it modifies its input, so don't print the modified version
1668 out.
1669
1670 I'm not sure how many people really want to use this class; let me
1671 know if you do. Mainly I like the name."""
1672
        # When closing a tag whose only child is a string, copy that
        # string onto the parent as an attribute named after the tag
        # (unless the parent already has such an attribute).
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()  # ensure parent.attrMap is populated
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1683
1684
1685
1686
1687
1688
1689
1690
1691
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
# Optional dependency: chardet auto-detects character encodings.
# UnicodeDammit falls back gracefully when it is not installed.
try:
    import chardet
except ImportError:
    chardet = None

# Optional dependencies: cjkcodecs and iconv_codec add more character
# encodings to the ones supported by stock Python.  Importing them is
# enough; failure to import is silently ignored.
try:
    import cjkcodecs.aliases
except ImportError:
    pass
try:
    import iconv_codec
except ImportError:
    pass
1733
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # Maps nonstandard charset names seen in documents to Python
    # codec names; consulted first by find_codec().
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }
1746
1747 - def __init__(self, markup, overrideEncodings=[],
1748 smartQuotesTo='xml', isHTML=False):
1749 self.declaredHTMLEncoding = None
1750 self.markup, documentEncoding, sniffedEncoding = \
1751 self._detectEncoding(markup, isHTML)
1752 self.smartQuotesTo = smartQuotesTo
1753 self.triedEncodings = []
1754 if markup == '' or isinstance(markup, unicode):
1755 self.originalEncoding = None
1756 self.unicode = unicode(markup)
1757 return
1758
1759 u = None
1760 for proposedEncoding in overrideEncodings:
1761 u = self._convertFrom(proposedEncoding)
1762 if u: break
1763 if not u:
1764 for proposedEncoding in (documentEncoding, sniffedEncoding):
1765 u = self._convertFrom(proposedEncoding)
1766 if u: break
1767
1768
1769 if not u and chardet and not isinstance(self.markup, unicode):
1770 u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1771
1772
1773 if not u:
1774 for proposed_encoding in ("utf-8", "windows-1252"):
1775 u = self._convertFrom(proposed_encoding)
1776 if u: break
1777
1778 self.unicode = u
1779 if not u: self.originalEncoding = None
1780
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        orig = match.group(1)
        sub = self.MS_CHARS.get(orig)
        # A tuple entry is (entity name, hex codepoint); pick the form
        # requested by smartQuotesTo.  Non-tuple entries are literal
        # replacement strings.
        if type(sub) == types.TupleType:
            if self.smartQuotesTo == 'xml':
                sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
            else:
                sub = '&'.encode() + sub[0].encode() + ';'.encode()
        else:
            sub = sub.encode()
        return sub
1794
        # Try to decode self.markup using the proposed encoding; return
        # the Unicode result, or None if the attempt fails.  Each codec
        # is tried at most once (see self.triedEncodings).
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Substitute MS smart quotes (0x80-0x9f) first if the source
        # encoding is one that commonly carries them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            smart_quotes_re = "([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._subMSChar, markup)

        try:
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # Decoding failed; the caller will try the next encoding.
            return None

        return self.markup
1822
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # Strip a Byte Order Mark if present, letting it override the
        # proposed encoding.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            # UTF-16BE BOM
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            # UTF-16LE BOM
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            # UTF-8 BOM
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE BOM
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE BOM
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata
1847
        """Given a document, tries to detect its XML encoding."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            # Sniff the encoding from the first few bytes (BOM or the
            # byte pattern of '<?xm' in various encodings), converting
            # the data to UTF-8 along the way.
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
        # NOTE(review): bare except silently discards any sniffing
        # failure and proceeds with the raw data — confirm intended.
        except:
            xml_encoding_match = None
        # Look for an explicit encoding declaration: the XML prolog,
        # or (for HTML) a META charset attribute.
        xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
        xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
        if not xml_encoding_match and isHTML:
            meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
            regexp = re.compile(meta_re, re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].decode(
                'ascii').lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
        # A sniffed multi-byte encoding beats a generic declared one
        # (e.g. a document declaring 'utf-16' but sniffed as 'utf-16le').
        if sniffed_xml_encoding and \
           (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                             'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                             'utf-16', 'utf-32', 'utf_16', 'utf_32',
                             'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding
1916
1917
        # Resolve a known alias first, then try the name with hyphens
        # removed, then with hyphens turned into underscores; fall back
        # to the charset name itself if no codec is found.
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset
1923
        # Return `charset` unchanged if Python has a codec registered
        # for it, otherwise None (falsy input is returned as-is).
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec
1933
1934 EBCDIC_TO_ASCII_MAP = None
1936 c = self.__class__
1937 if not c.EBCDIC_TO_ASCII_MAP:
1938 emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1939 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1940 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1941 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1942 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1943 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1944 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1945 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1946 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1947 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1948 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1949 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1950 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1951 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1952 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1953 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1954 250,251,252,253,254,255)
1955 import string
1956 c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1957 ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1958 return s.translate(c.EBCDIC_TO_ASCII_MAP)
1959
    # Windows-1252 "smart" characters (0x80-0x9f) mapped to an
    # (HTML entity name, hex codepoint) pair, or to a literal
    # replacement string where no entity applies (see _subMSChar).
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}
1992
1993
1994
1995
1996
# When run as a script: parse HTML from stdin and pretty-print the tree.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
2001