#   Aseba - an event-based framework for distributed robot control
#   Copyright (C) 2007--2011:
#           Stephane Magnenat <stephane at magnenat dot net>
#           (http://stephane.magnenat.net)
#           and other contributors, see authors.txt for details
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Lesser General Public License as published
#   by the Free Software Foundation, version 3 of the License.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public License
#   along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Clean a page fetched from www.wikidot.com, keeping only its content."""

# Python lib
import sys
from myparser import MyParser
from string import Template

# Local module
import wikidot.debug
from wikidot.orderedset import OrderedSet

# Template prepended to the cleaned page; ${title} and ${toc} are substituted
# by WikidotParser.get_doc().
header = \
"""
<!DOCTYPE html
    PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">

<head>
    <link rel='stylesheet' type='text/css' href='aseba.css' />
    <meta http-equiv="content-type" content="text/html; charset=utf-8" />
    <title>${title}</title>
</head >

<body>
    <h1 class="title">${title}</h1>
    ${toc}
"""

# Text appended after the cleaned page by WikidotParser.get_doc().
footer = \
"""
</body>
</html>
"""


class WikidotParser(MyParser):
    """WikidotParser is used to clean a page from www.wikidot.com,
    keeping only the interesting content.

    The parser is a small state machine driven by <div> tags: entering a
    <div> whose attributes match an entry of ``div_state_map`` pushes the
    associated state, and the matching </div> pops it. The state on top of
    the stack decides what is done with tags and data.
    """

    def __init__(self):
        """Initialize internal variables"""
        MyParser.__init__(self)
        # Number of <div> tags currently open
        self.div_level = 0
        # Stack of div levels at which a state was pushed; -1 is the sentinel
        # paired with the initial "none" state
        self.div_bookmark = [-1]
        # Stack of states; the bottom "none" entry is never popped
        self.state = ["none"]
        # Always mirrors the top of the state stack
        self.current_state = "none"
        # map for div tag attribute -> state
        # (attribute name, substring looked for in the value, state)
        # Order matters: the first matching entry wins, so 'toc-action-bar'
        # must stay before 'toc'.
        self.div_state_map = [
            ('id', 'page-title', 'title'),
            ('id', 'breadcrumbs', 'breadcrumbs'),
            ('id', 'page-content', 'body'),
            ('id', 'toc-action-bar', 'useless'),
            ('id', 'toc', 'toc'),
            ('style', 'position:absolute', 'useless'),
        ]
        self.page_title = ""
        self.toc = ""
        self.links = OrderedSet()
        self.breadcrumbs = []

    # Public interface
    def get_doc(self):
        """Retrieve the parsed and cleaned document

        Returns the header (with title and TOC substituted), followed by the
        accumulated output and the footer. The parser state is left
        untouched, so calling this method several times yields the same
        document.
        """
        # format the TOC
        toc = self.toc
        if toc:
            toc = ("""<table id="toc-table" summary="TOC"><tr><td>"""
                   + toc + "</td></tr></table>")
        # Add header
        page_head = Template(header).substitute(title=self.page_title, toc=toc)
        # NOTE(review): self.out_doc is not set in this class; presumably it
        # is the output buffer maintained by MyParser.
        # Add footer
        return page_head + self.out_doc + footer

    def get_links(self):
        """Retrieve the links embedded in the page (including images)"""
        return self.links

    def get_title(self):
        """Retrieve the title collected from the 'page-title' div"""
        return self.page_title

    def get_breadcrumbs(self):
        """Retrieve the href values collected from the 'breadcrumbs' div"""
        return self.breadcrumbs

    # Inherited functions
    def handle_starttag(self, tag, attrs):
        """Overridden - Called when a start tag is parsed

        The heart of this function is the state machine.
        When a <div> tag is detected, the attributes are compared with
        a map of the form (name,value) -> state. If a match occurs,
        the state is pushed on top of the stack.

        Depending on the current state, the start tag is queued for output,
        or not."""
        # Debug
        if wikidot.debug.ENABLE_DEBUG:
            sys.stderr.write("<{}> {}\n".format(tag, attrs))

        # Update the state machine
        state_changed = self.__update_state_machine_start__(tag, attrs)

        if state_changed and self.current_state == "body":
            # We have just entered the body, don't output this <div> tag
            return
        if self.current_state == "body":
            # Handle special tags
            self.__handle_body_tag__(tag, attrs)
            # Add the tag to output
            MyParser.handle_starttag(self, tag, attrs)
        elif self.current_state == "toc":
            # Handle the content of the TOC
            self.toc += MyParser.format_start_tag(self, tag, attrs)
        elif self.current_state == "breadcrumbs" and tag == 'a':
            # Register the breadcrumbs (first href only)
            for name, value in attrs:
                if name == 'href':
                    # A valueless href is reported as None: nothing to keep
                    if value is not None:
                        self.breadcrumbs.append(value)
                    break

    def handle_endtag(self, tag):
        """Overridden - Called when an end tag is parsed

        The state machine is updated when a </div> tag is encountered.
        Depending on the current state, the end tag is queued for output,
        or not."""
        if self.current_state == "toc":
            # Add the tag to the TOC. This is done before the state update so
            # that the </div> closing the TOC is kept, balancing the opening
            # <div> recorded by handle_starttag.
            self.toc += MyParser.format_end_tag(self, tag)

        # Update the state machine
        state_changed = self.__update_state_machine_end__(tag)
        if state_changed:
            # This </div> closed a tracked section: never output it
            return

        if self.current_state == "body":
            # Add the tag to output
            MyParser.handle_endtag(self, tag)

    def handle_data(self, data):
        """Overridden - Called when some data is parsed

        Depending on the current state, the data is queued for output,
        or not."""
        if self.current_state == "title":
            # Register the title
            self.page_title += data.strip()
        elif self.current_state == "body":
            # Add data to the output
            MyParser.handle_data(self, data)
        elif self.current_state == "toc":
            # Add data to the TOC
            self.toc += data

    def handle_charref(self, name):
        """Overridden - Called when a charref (&#xyz) is parsed

        Depending on the current state, the charref is queued for output,
        or not."""
        if self.current_state == "title":
            # Add charref to the title
            self.page_title += "&#" + name + ";"
        elif self.current_state == "body":
            # Add charref to the output
            MyParser.handle_charref(self, name)
        elif self.current_state == "toc":
            # Add charref to the TOC
            self.toc += "&#" + name + ";"

    def handle_entityref(self, name):
        """Overridden - Called when an entityref (&xyz) tag is parsed

        Depending on the current state, the entityref is queued for output,
        or not."""
        if self.current_state == "title":
            # Add the entityref to the title
            self.page_title += "&" + name + ";"
        elif self.current_state == "body":
            # Add the entityref to the output
            MyParser.handle_entityref(self, name)
        elif self.current_state == "toc":
            # Add the entityref to the TOC
            self.toc += "&" + name + ";"

    def handle_decl(self, decl):
        """Overridden - Called when a SGML declaration (<!) is parsed

        Depending on the current state, the declaration is queued for output,
        or not."""
        if self.current_state == "body":
            # Add the SGML declaration to the output
            MyParser.handle_decl(self, decl)

    # Private functions
    def _find_div_state(self, attrs):
        """Return the state of the first div_state_map entry matched by attrs.

        attrs is a list of (name, value) pairs. An entry matches when the
        names are equal and the entry's text is a substring of the value.
        Returns None when nothing matches.
        """
        for name, value in attrs:
            if value is None:
                # Valueless attribute (e.g. <div hidden>): nothing to compare
                continue
            for map_name, map_text, state in self.div_state_map:
                if map_name == name and map_text in value:
                    return state
        return None

    def __update_state_machine_start__(self, tag, attrs):
        """Update the state machine for a start tag.

        Only <div> tags are considered. At most one state is pushed per
        <div>, so that the single matching </div> pops exactly what was
        pushed. Returns True when a state was pushed.
        """
        state_changed = False

        if tag == 'div':
            if wikidot.debug.ENABLE_DEBUG:
                sys.stderr.write(
                    "{} {}\n".format(self.state, self.div_bookmark))
            # Look for the id = xyz attribute
            new_state = self._find_div_state(attrs)
            if new_state is not None:
                # Match !
                self.state.append(new_state)
                self.div_bookmark.append(self.div_level)
                state_changed = True
            # Increment div level
            self.div_level += 1

        # Update the current state
        self.current_state = self.__get_current_state__()
        return state_changed

    def __update_state_machine_end__(self, tag):
        """Update the state machine for an end tag.

        Only </div> tags are considered. Returns True when the tag closed
        the <div> that pushed the current state, i.e. a state was popped.
        """
        state_changed = False

        if tag == 'div':
            if wikidot.debug.ENABLE_DEBUG:
                sys.stderr.write(
                    "{} {}\n".format(self.state, self.div_bookmark))
            # A stray </div> with no open <div> is ignored; otherwise the
            # level would reach the -1 sentinel and pop the "none" state.
            if self.div_level > 0:
                self.div_level -= 1
                if (len(self.state) > 1
                        and self.div_level == self.div_bookmark[-1]):
                    # Matching closing </div> tag -> pop the state
                    self.state.pop()
                    self.div_bookmark.pop()
                    state_changed = True

        # Update the current state
        self.current_state = self.__get_current_state__()
        return state_changed

    def __get_current_state__(self):
        """Return the state on top of the stack"""
        return self.state[-1]

    def __handle_body_tag__(self, tag, attrs):
        """Register links and fix attributes of tags found in the body.

        The href of <a> tags and the src of <img> tags are added to
        self.links. The width attribute of <img> tags is rewritten in place
        in attrs, so the caller outputs the corrected value.
        """
        # Special case 1: links
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    # Register the link (a valueless href carries no link)
                    if value is not None:
                        self.links.add(value)
                    break
        # Special case 2: images
        elif tag == 'img':
            for index, (name, value) in enumerate(attrs):
                if value is None:
                    # Valueless attribute: nothing to register or fix
                    continue
                if name == 'src':
                    # Register the link
                    self.links.add(value)
                elif name == 'width':
                    # Fix the width=xx attribute
                    # Wikidot gives width="600px", instead of width=600
                    pos = value.find('px')
                    if pos >= 0:
                        attrs[index] = (name, value[:pos])