00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 import sys
00021 from myparser import MyParser
00022 from string import Template
00023 
00024 
00025 import wikidot.debug
00026 from wikidot.orderedset import OrderedSet
00027 
# Page prologue, used as a string.Template: ${title} is substituted into both
# the <title> element and the visible <h1>, ${toc} is replaced by the
# (possibly empty) table-of-contents markup.
header = """
<!DOCTYPE html
     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">

<head>
<link rel='stylesheet' type='text/css' href='aseba.css' />
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<title>${title}</title>
</head >

<body>
<h1 class="title">${title}</h1>
${toc}
"""

# Page epilogue: closes the <body> and <html> elements opened by the header.
footer = """
</body>
</html>
"""
00052 
class WikidotParser(MyParser):
    """Clean a page from www.wikidot.com, keeping only the interesting content.

    The parser follows the nesting of <div> elements with a small stack-based
    state machine. Entering a <div> whose attributes match an entry of
    ``div_state_map`` pushes the associated state; leaving that same <div>
    pops it. The state on top of the stack decides what happens to each
    parsed item: it is forwarded to MyParser ("body"), appended to the table
    of contents ("toc"), appended to the page title ("title"), scanned for
    links ("breadcrumbs"), or dropped (any other state).
    """

    def __init__(self):
        """Initialize internal variables."""
        MyParser.__init__(self)
        # Current <div> nesting depth.
        self.div_level = 0
        # Depth at which each stacked state was entered. The -1 entry is a
        # sentinel paired with the base "none" state.
        self.div_bookmark = [-1]
        # Stack of states; the last element is the active one.
        self.state = ["none"]
        self.current_state = "none"

        # (attribute name, substring searched in the attribute value, state).
        # Entries are tested in order and matching is by substring, so
        # 'toc-action-bar' has to come before 'toc'.
        self.div_state_map = [
            ('id', 'page-title', 'title'),
            ('id', 'breadcrumbs', 'breadcrumbs'),
            ('id', 'page-content', 'body'),
            ('id', 'toc-action-bar', 'useless'),
            ('id', 'toc', 'toc'),
            ('style', 'position:absolute', 'useless'),
        ]
        self.page_title = ""
        self.toc = ""
        self.links = OrderedSet()
        self.breadcrumbs = []

    def get_doc(self):
        """Retrieve the parsed and cleaned document.

        Returns the header (with title and table of contents filled in),
        followed by the body collected in ``self.out_doc`` and the footer.
        The parser's own attributes are left untouched, so calling this
        method several times always yields the same document.
        """
        toc = self.toc
        if toc:
            # Wrap the collected TOC markup in a one-cell table.
            toc = ('<table id="toc-table" summary="TOC"><tr><td>'
                   + toc + "</td></tr></table>")

        page_header = Template(header).substitute(title=self.page_title,
                                                  toc=toc)
        return page_header + self.out_doc + footer

    def get_links(self):
        """Retrieve the links embedded in the page (including images)."""
        return self.links

    def get_title(self):
        """Retrieve the page title collected so far."""
        return self.page_title

    def get_breadcrumbs(self):
        """Retrieve the href values found in the breadcrumbs section."""
        return self.breadcrumbs

    def handle_starttag(self, tag, attrs):
        """Overridden - Called when a start tag is parsed.

        The heart of this function is the state machine.
        When a <div> tag is detected, the attributes are compared with
        a map of the form (name, value) -> state. If a match occurs,
        the state is pushed on top of the stack.

        Depending on the current state, the start tag is queued for output,
        or not.
        """
        if wikidot.debug.ENABLE_DEBUG:
            sys.stderr.write("<{}> {}\n".format(tag, attrs))

        state_changed = self.__update_state_machine_start__(tag, attrs)

        if self.current_state == "body":
            if state_changed:
                # This is the <div> that opens the body itself: it is not
                # part of the output (its closing tag is skipped as well).
                return
            # May rewrite attrs in place before the tag is emitted.
            self.__handle_body_tag__(tag, attrs)
            MyParser.handle_starttag(self, tag, attrs)
        elif self.current_state == "toc":
            self.toc += MyParser.format_start_tag(self, tag, attrs)
        elif self.current_state == "breadcrumbs" and tag == 'a':
            # Only the first href of the anchor is recorded.
            for name, value in attrs:
                if name == 'href':
                    if value is not None:
                        self.breadcrumbs.append(value)
                    break

    def handle_endtag(self, tag):
        """Overridden - Called when an end tag is parsed.

        The state machine is updated when a </div> tag is encountered.
        Depending on the current state, the end tag is queued for output,
        or not.
        """
        # Tested before the state machine is updated, so that the </div>
        # closing the TOC is still appended to the TOC markup.
        if self.current_state == "toc":
            self.toc += MyParser.format_end_tag(self, tag)

        if self.__update_state_machine_end__(tag):
            # The </div> that closes a tracked section is never output.
            return

        if self.current_state == "body":
            MyParser.handle_endtag(self, tag)

    def handle_data(self, data):
        """Overridden - Called when some data is parsed.

        Depending on the current state, the data is queued for output,
        or not.
        """
        if self.current_state == "title":
            self.page_title += data.strip()
        elif self.current_state == "body":
            MyParser.handle_data(self, data)
        elif self.current_state == "toc":
            self.toc += data

    def handle_charref(self, name):
        """Overridden - Called when a charref (&#xyz) is parsed.

        Depending on the current state, the charref is queued for output,
        or not.
        """
        if self.current_state == "title":
            self.page_title += "&#" + name + ";"
        elif self.current_state == "body":
            MyParser.handle_charref(self, name)
        elif self.current_state == "toc":
            self.toc += "&#" + name + ";"

    def handle_entityref(self, name):
        """Overridden - Called when an entityref (&xyz) tag is parsed.

        Depending on the current state, the entityref is queued for output,
        or not.
        """
        if self.current_state == "title":
            self.page_title += "&" + name + ";"
        elif self.current_state == "body":
            MyParser.handle_entityref(self, name)
        elif self.current_state == "toc":
            self.toc += "&" + name + ";"

    def handle_decl(self, decl):
        """Overridden - Called when a SGML declaration (<!) is parsed.

        Depending on the current state, the declaration is queued for output,
        or not.
        """
        if self.current_state == "body":
            MyParser.handle_decl(self, decl)

    def _match_div_state(self, attrs):
        """Return the state of the first div_state_map entry matching attrs.

        Returns None when no attribute matches. Attributes without a value
        (reported as None) are skipped.
        """
        for name, value in attrs:
            if value is None:
                continue
            for map_name, fragment, state in self.div_state_map:
                if map_name == name and fragment in value:
                    return state
        return None

    def __update_state_machine_start__(self, tag, attrs):
        """Update the state machine for a start tag.

        Only <div> tags affect the state. Returns True when a new state was
        pushed on the stack, False otherwise.
        """
        state_changed = False

        if tag == 'div':
            if wikidot.debug.ENABLE_DEBUG:
                sys.stderr.write("{} {}\n".format(self.state,
                                                  self.div_bookmark))
            # At most one state is pushed per <div>: the matching </div>
            # pops exactly one, so pushing more would unbalance the stack.
            new_state = self._match_div_state(attrs)
            if new_state is not None:
                self.state.append(new_state)
                self.div_bookmark.append(self.div_level)
                state_changed = True
            self.div_level += 1

        self.current_state = self.__get_current_state__()
        return state_changed

    def __update_state_machine_end__(self, tag):
        """Update the state machine for an end tag.

        Only </div> tags affect the state. Returns True when the state on
        top of the stack was popped, False otherwise.
        """
        state_changed = False

        if tag == 'div':
            if wikidot.debug.ENABLE_DEBUG:
                sys.stderr.write("{} {}\n".format(self.state,
                                                  self.div_bookmark))
            # A </div> without a matching <div> is ignored; otherwise the
            # depth would reach the -1 sentinel and pop the base state.
            if self.div_level > 0:
                self.div_level -= 1
                if self.div_level == self.div_bookmark[-1]:
                    self.state.pop()
                    self.div_bookmark.pop()
                    state_changed = True

        self.current_state = self.__get_current_state__()
        return state_changed

    def __get_current_state__(self):
        """Return the state on top of the stack."""
        return self.state[-1]

    def __handle_body_tag__(self, tag, attrs):
        """Collect links from a body tag and clean up image widths.

        The href of an <a> tag and the src of an <img> tag are added to
        ``self.links``. The width of an <img> is cut at the first 'px', and
        the change is written back into ``attrs`` in place, so the caller
        sees the modified list.
        """
        if tag == 'a':
            # Only the first href of the anchor is considered.
            for name, value in attrs:
                if name == 'href':
                    if value is not None:
                        self.links.add(value)
                    break
        elif tag == 'img':
            for index, (name, value) in enumerate(attrs):
                if value is None:
                    # Attribute without a value: nothing to record or fix.
                    continue
                if name == 'src':
                    self.links.add(value)
                elif name == 'width':
                    # Drop the unit, e.g. "120px" becomes "120".
                    pos = value.find('px')
                    if pos >= 0:
                        attrs[index] = (name, value[:pos])
00275