parser.py
Go to the documentation of this file.
00001 #   Aseba - an event-based framework for distributed robot control
00002 #   Copyright (C) 2007--2011:
00003 #           Stephane Magnenat <stephane at magnenat dot net>
00004 #           (http://stephane.magnenat.net)
00005 #           and other contributors, see authors.txt for details
00006 #
00007 #   This program is free software: you can redistribute it and/or modify
00008 #   it under the terms of the GNU Lesser General Public License as published
00009 #   by the Free Software Foundation, version 3 of the License.
00010 #
00011 #   This program is distributed in the hope that it will be useful,
00012 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 #   GNU Lesser General Public License for more details.
00015 #
00016 #   You should have received a copy of the GNU Lesser General Public License
00017 #   along with this program. If not, see <http://www.gnu.org/licenses/>.
00018 
00019 # Python lib
00020 import sys
00021 from myparser import MyParser
00022 from string import Template
00023 
00024 # Local module
00025 import wikidot.debug
00026 from wikidot.orderedset import OrderedSet
00027 
# Template for the top of every generated page. It is rendered through
# string.Template, so ${title} and ${toc} are placeholders filled in by
# WikidotParser.get_doc(); everything else is emitted verbatim.
header = """
<!DOCTYPE html
     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">

<head>
<link rel='stylesheet' type='text/css' href='aseba.css' />
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<title>${title}</title>
</head >

<body>
<h1 class="title">${title}</h1>
${toc}
"""
00046 
# Closing markup appended after the page body; pairs with the <body> and
# <html> tags opened in `header`.
footer = "\n</body>\n</html>\n"
00052 
class WikidotParser(MyParser):
    """Clean a page from www.wikidot.com, keeping only the interesting content.

    The parser is driven by a small state machine keyed on <div> tags.
    When a <div> carries an attribute listed in ``div_state_map`` the
    associated state is pushed on a stack; the state is popped again when
    the matching </div> is reached. The state on top of the stack decides
    where tags and text are routed: the page title, the table of contents,
    the breadcrumbs, the body (forwarded to MyParser), or nowhere at all.
    """

    def __init__(self):
        """Initialize internal variables"""
        MyParser.__init__(self)
        # Nesting depth of the <div> currently being parsed
        self.div_level = 0
        # Stack of the div levels at which a state was pushed.
        # -1 is a sentinel paired with the base "none" state.
        self.div_bookmark = [-1]
        # Stack of states; the base entry "none" is never popped
        self.state = ["none"]
        # Cached copy of the top of the state stack
        self.current_state = "none"
        # Map for div tag attribute -> state, as
        # (attribute name, substring looked for in the value, state).
        # The first matching entry wins, so 'toc-action-bar' has to stay
        # in front of 'toc', which is a substring of it.
        self.div_state_map = [
            ('id', 'page-title', 'title'),
            ('id', 'breadcrumbs', 'breadcrumbs'),
            ('id', 'page-content', 'body'),
            ('id', 'toc-action-bar', 'useless'),
            ('id', 'toc', 'toc'),
            ('style', 'position:absolute', 'useless'),
        ]
        self.page_title = ""
        self.toc = ""
        self.links = OrderedSet()
        self.breadcrumbs = list()

    # Public interface
    def get_doc(self):
        """Retrieve the parsed and cleaned document.

        Returns the header template (filled with the page title and the
        TOC, the latter wrapped in a table when not empty), followed by
        ``self.out_doc`` and the footer.

        The document is assembled in local variables, so neither
        ``self.toc`` nor ``self.out_doc`` is modified and calling this
        method several times always returns the same text.
        NOTE(review): ``out_doc`` is not defined in this class; presumably
        it is the output buffer filled by MyParser - confirm.
        """
        toc = self.toc
        if toc != "":
            # Format the TOC
            toc = ('<table id="toc-table" summary="TOC"><tr><td>'
                   + toc
                   + "</td></tr></table>")
        page_head = Template(header).substitute(title=self.page_title, toc=toc)
        return page_head + self.out_doc + footer

    def get_links(self):
        """Retrieve the links embedded in the page (including images)"""
        return self.links

    def get_title(self):
        """Retrieve the page title collected while in the "title" state"""
        return self.page_title

    def get_breadcrumbs(self):
        """Retrieve the list of href values found in the breadcrumbs div"""
        return self.breadcrumbs

    # Inherited functions
    def handle_starttag(self, tag, attrs):
        """Overridden - Called when a start tag is parsed

        The heart of this function is the state machine.
        When a <div> tag is detected, the attributes are compared with
        a map of the form (name,value) -> state. If a match occurs,
        the state is pushed on top of the stack.

        Depending on the current state, the start tag is queued for output,
        or not.

        tag   -- name of the tag
        attrs -- list of (name, value) pairs; may be modified in place
                 by the body-specific fix-ups before being output
        """
        # Debug
        if wikidot.debug.ENABLE_DEBUG:
            sys.stderr.write("<{}> {}\n".format(tag, attrs))

        # Update the state machine
        state_changed = self.__update_state_machine_start__(tag, attrs)

        if self.current_state == "body":
            if state_changed:
                # We have just entered the body, don't output this <div> tag
                return
            # Handle special tags
            self.__handle_body_tag__(tag, attrs)
            # Add the tag to output
            MyParser.handle_starttag(self, tag, attrs)
        elif self.current_state == "toc":
            # Handle the content of the TOC
            self.toc += MyParser.format_start_tag(self, tag, attrs)
        elif self.current_state == "breadcrumbs" and tag == 'a':
            # Register the breadcrumbs (first href of the link only)
            for name, value in attrs:
                if name == 'href':
                    self.breadcrumbs.append(value)
                    break

    def handle_endtag(self, tag):
        """Overridden - Called when an end tag is parsed

        The state machine is updated when a </div> tag is encountered.
        Depending on the current state, the end tag is queued for output,
        or not."""
        if self.current_state == "toc":
            # Add the tag to the TOC. This happens before the state update
            # so that the </div> closing the TOC itself is kept, mirroring
            # the opening <div> recorded by handle_starttag.
            self.toc += MyParser.format_end_tag(self, tag)

        # Update the state machine
        if self.__update_state_machine_end__(tag):
            # This </div> closed a state: its opening tag was not sent to
            # the body output, so neither is the closing one.
            return

        if self.current_state == "body":
            # Add the tag to output
            MyParser.handle_endtag(self, tag)

    def handle_data(self, data):
        """Overridden - Called when some data is parsed

        Depending on the current state, the data is queued for output,
        or not."""
        if self.current_state == "title":
            # Register the title
            self.page_title += data.strip()
        elif self.current_state == "body":
            # Add data to the output
            MyParser.handle_data(self, data)
        elif self.current_state == "toc":
            # Add data to the TOC
            self.toc += data

    def handle_charref(self, name):
        """Overridden - Called when a charref (&#xyz) is parsed

        Depending on the current state, the charref is queued for output,
        or not."""
        if self.current_state == "title":
            # Add charref to the title
            self.page_title += "&#" + name + ";"
        elif self.current_state == "body":
            # Add charref to the output
            MyParser.handle_charref(self, name)
        elif self.current_state == "toc":
            # Add charref to the TOC
            self.toc += "&#" + name + ";"

    def handle_entityref(self, name):
        """Overridden - Called when an entityref (&xyz) tag is parsed

        Depending on the current state, the entityref is queued for output,
        or not."""
        if self.current_state == "title":
            # Add the entityref to the title
            self.page_title += "&" + name + ";"
        elif self.current_state == "body":
            # Add the entityref to the output
            MyParser.handle_entityref(self, name)
        elif self.current_state == "toc":
            # Add the entityref to the TOC
            self.toc += "&" + name + ";"

    def handle_decl(self, decl):
        """Overridden - Called when a SGML declaration (<!) is parsed

        Depending on the current state, the declaration is queued for output,
        or not."""
        if self.current_state == "body":
            # Add the SGML declaration to the output
            MyParser.handle_decl(self, decl)

    # Private functions
    def _match_div_state(self, attrs):
        """Return the state selected by the attributes of a <div>, or None.

        Attributes are examined in order; for each one the entries of
        ``div_state_map`` are tried in order and the first hit is returned.
        Attributes without a value (reported as None) never match.
        """
        for name, value in attrs:
            if value is None:
                continue
            for map_name, map_value, map_state in self.div_state_map:
                if map_name == name and map_value in value:
                    return map_state
        return None

    def __update_state_machine_start__(self, tag, attrs):
        """Update the state machine for a start tag.

        For a <div>, push the matching state (if any) together with the
        current div level, then increase the div level. At most one state
        is pushed per <div>, because the single matching </div> can only
        pop one.

        Returns True when a new state was pushed, False otherwise.
        """
        state_changed = False

        if tag == 'div':
            if wikidot.debug.ENABLE_DEBUG:
                sys.stderr.write(
                    "{} {}\n".format(self.state, self.div_bookmark))
            # Look for the id = xyz attribute
            new_state = self._match_div_state(attrs)
            if new_state is not None:
                # Match !
                self.state.append(new_state)
                self.div_bookmark.append(self.div_level)
                state_changed = True
            # Increment div level
            self.div_level += 1

        # Update the current state
        self.current_state = self.__get_current_state__()
        return state_changed

    def __update_state_machine_end__(self, tag):
        """Update the state machine for an end tag.

        For a </div>, decrease the div level and pop the current state
        when the level is back to the one recorded when that state was
        pushed. The base "none" state is never popped, so unbalanced
        </div> tags cannot empty the stack.

        Returns True when a state was popped, False otherwise.
        """
        state_changed = False

        if tag == 'div':
            if wikidot.debug.ENABLE_DEBUG:
                sys.stderr.write(
                    "{} {}\n".format(self.state, self.div_bookmark))
            self.div_level -= 1
            if (len(self.state) > 1
                    and self.div_level == self.div_bookmark[-1]):
                # Matching closing </div> tag -> pop the state
                self.state.pop()
                self.div_bookmark.pop()
                state_changed = True

        # Update the current state
        self.current_state = self.__get_current_state__()
        return state_changed

    def __get_current_state__(self):
        """Return the state on top of the state stack"""
        return self.state[-1]

    def __handle_body_tag__(self, tag, attrs):
        """Apply the body-specific processing to a start tag.

        Links (<a href>) and images (<img src>) are registered in
        ``self.links``. The width attribute of images is rewritten in
        place in ``attrs`` so that the caller outputs the fixed value.
        """
        # Special case 1: links
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    # Register the link (first href only)
                    self.links.add(value)
                    break
        # Special case 2: images
        elif tag == 'img':
            for index, (name, value) in enumerate(attrs):
                if name == 'src':
                    # Register the link
                    self.links.add(value)
                elif name == 'width' and value is not None:
                    # Fix the width=xx attribute
                    # Wikidot gives width="600px", instead of width=600
                    pos = value.find('px')
                    if pos >= 0:
                        attrs[index] = (name, value[0:pos])
00275 


aseba
Author(s): Stéphane Magnenat
autogenerated on Sun Oct 5 2014 23:46:38