00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 import sys
00021 from myparser import MyParser
00022 from string import Template
00023
00024
00025 import wikidot.debug
00026 from wikidot.orderedset import OrderedSet
00027
# XHTML prologue emitted before the cleaned page body.
# Used as a string.Template: ${title} is substituted twice (in <title> and in
# the <h1> heading) and ${toc} receives the table-of-contents markup.
header = """
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">

<head>
<link rel='stylesheet' type='text/css' href='aseba.css' />
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<title>${title}</title>
</head >

<body>
<h1 class="title">${title}</h1>
${toc}
"""
00046
# Closing markup appended after the cleaned page body; it closes the <body>
# and <html> elements opened by the header template.
footer = """
</body>
</html>
"""
00052
class WikidotParser(MyParser):
    """Clean a page fetched from www.wikidot.com, keeping only the interesting content.

    The parser is driven by a small state machine keyed on <div> elements.
    When an opening <div> carries an attribute listed in ``div_state_map``,
    the associated state is pushed on a stack together with the nesting depth
    at which the <div> was opened; the matching </div> pops it again.
    The state on top of the stack decides what happens to every tag, text
    chunk and reference that is parsed:

    * ``title``       - text is accumulated in ``page_title``
    * ``body``        - content is forwarded to MyParser (the output document)
    * ``toc``         - markup is accumulated in ``toc``
    * ``breadcrumbs`` - the href of every <a> is recorded in ``breadcrumbs``
    * ``useless`` / ``none`` - content is dropped
    """

    def __init__(self):
        """Initialize internal variables."""
        MyParser.__init__(self)
        # Current <div> nesting depth.
        self.div_level = 0
        # Depth at which each stacked state was entered; -1 is the sentinel
        # paired with the initial "none" state.
        self.div_bookmark = [-1]
        # Stack of states; the last element is the active one.
        self.state = ["none"]
        self.current_state = "none"
        # (attribute name, substring of the attribute value, state to enter).
        # Order matters: matching is done by substring, so 'toc-action-bar'
        # must be listed before 'toc'.
        self.div_state_map = [
            ('id', 'page-title', 'title'),
            ('id', 'breadcrumbs', 'breadcrumbs'),
            ('id', 'page-content', 'body'),
            ('id', 'toc-action-bar', 'useless'),
            ('id', 'toc', 'toc'),
            ('style', 'position:absolute', 'useless'),
        ]
        self.page_title = ""
        self.toc = ""
        self.links = OrderedSet()
        self.breadcrumbs = list()

    def get_doc(self):
        """Retrieve the parsed and cleaned document.

        The result is the header template (with the page title and the table
        of contents substituted), followed by the body accumulated in
        ``self.out_doc`` and the footer. A non-empty table of contents is
        wrapped in a single-cell table.

        The parser's attributes are not modified, so calling this method
        several times returns the same document each time.
        """
        toc_markup = self.toc
        if toc_markup != "":
            toc_markup = ("""<table id="toc-table" summary="TOC"><tr><td>"""
                          + toc_markup
                          + "</td></tr></table>")

        head = Template(header).substitute(title=self.page_title, toc=toc_markup)
        return head + self.out_doc + footer

    def get_links(self):
        """Retrieve the links embedded in the page (including images)."""
        return self.links

    def get_title(self):
        """Retrieve the page title collected from the 'page-title' <div>."""
        return self.page_title

    def get_breadcrumbs(self):
        """Retrieve the list of hrefs collected from the 'breadcrumbs' <div>."""
        return self.breadcrumbs

    def handle_starttag(self, tag, attrs):
        """Overridden - Called when a start tag is parsed.

        The heart of this function is the state machine.
        When a <div> tag is detected, the attributes are compared with
        a map of the form (name, value) -> state. If a match occurs,
        the state is pushed on top of the stack.

        Depending on the current state, the start tag is queued for output,
        or not.

        tag   -- name of the tag
        attrs -- list of (name, value) pairs; may be modified in place
                 (see __handle_body_tag__)
        """
        if wikidot.debug.ENABLE_DEBUG:
            sys.stderr.write("<{}> {}\n".format(tag, attrs))

        state_changed = self.__update_state_machine_start__(tag, attrs)

        if state_changed and self.current_state == "body":
            # The <div> that opens the body is a container only: skip it.
            return

        if self.current_state == "body":
            # Collect links / fix attributes, then hand the tag to MyParser.
            self.__handle_body_tag__(tag, attrs)
            MyParser.handle_starttag(self, tag, attrs)
        elif self.current_state == "toc":
            self.toc += MyParser.format_start_tag(self, tag, attrs)
        elif self.current_state == "breadcrumbs" and tag == 'a':
            # Only the first href of the anchor is of interest.
            for attr in attrs:
                if attr[0] == 'href':
                    self.breadcrumbs.append(attr[1])
                    break

    def handle_endtag(self, tag):
        """Overridden - Called when an end tag is parsed.

        The state machine is updated when a </div> tag is encountered.
        Depending on the current state, the end tag is queued for output,
        or not.
        """
        if self.current_state == "toc":
            # Done before updating the state machine, so that the </div>
            # closing the table of contents is still part of it.
            self.toc += MyParser.format_end_tag(self, tag)

        if self.__update_state_machine_end__(tag):
            # This </div> closed a state-defining <div>: do not output it.
            return

        if self.current_state == "body":
            MyParser.handle_endtag(self, tag)

    def handle_data(self, data):
        """Overridden - Called when some data is parsed.

        Depending on the current state, the data is queued for output,
        or not.
        """
        if self.current_state == "title":
            self.page_title += data.strip()
        elif self.current_state == "body":
            MyParser.handle_data(self, data)
        elif self.current_state == "toc":
            self.toc += data

    def handle_charref(self, name):
        """Overridden - Called when a charref (&#xyz) is parsed.

        Depending on the current state, the charref is queued for output,
        or not.
        """
        if self.current_state == "title":
            self.page_title += ("&#" + name + ";")
        elif self.current_state == "body":
            MyParser.handle_charref(self, name)
        elif self.current_state == "toc":
            self.toc += ("&#" + name + ";")

    def handle_entityref(self, name):
        """Overridden - Called when an entityref (&xyz) tag is parsed.

        Depending on the current state, the entityref is queued for output,
        or not.
        """
        if self.current_state == "title":
            self.page_title += ("&" + name + ";")
        elif self.current_state == "body":
            MyParser.handle_entityref(self, name)
        elif self.current_state == "toc":
            self.toc += ("&" + name + ";")

    def handle_decl(self, decl):
        """Overridden - Called when a SGML declaration (<!) is parsed.

        Depending on the current state, the declaration is queued for output,
        or not.
        """
        if self.current_state == "body":
            MyParser.handle_decl(self, decl)

    def __match_div_state__(self, attrs):
        """Return the state associated with a <div>'s attributes, or None.

        Attributes are examined in document order and, for each of them, the
        entries of ``div_state_map`` in list order; the first hit wins.
        An entry matches when the attribute names are equal and the entry's
        value is a substring of the attribute's value.
        """
        for attr_name, attr_value in attrs:
            if attr_value is None:
                # Attribute without a value (e.g. <div hidden>): nothing to match.
                continue
            for map_name, map_value, map_state in self.div_state_map:
                if map_name == attr_name and map_value in attr_value:
                    return map_state
        return None

    def __update_state_machine_start__(self, tag, attrs):
        """Update the state machine for a start tag.

        Only <div> tags are relevant. At most one state is pushed per <div>,
        because the matching </div> pops exactly one.

        Returns True if a new state was pushed, False otherwise.
        """
        state_changed = False

        if tag == 'div':
            if wikidot.debug.ENABLE_DEBUG:
                sys.stderr.write("{} {}\n".format(self.state, self.div_bookmark))

            new_state = self.__match_div_state__(attrs)
            if new_state is not None:
                self.state.append(new_state)
                # Remember the depth, to recognise the matching </div>.
                self.div_bookmark.append(self.div_level)
                state_changed = True

            self.div_level += 1

        self.current_state = self.__get_current_state__()
        return state_changed

    def __update_state_machine_end__(self, tag):
        """Update the state machine for an end tag.

        Only </div> tags are relevant. When the nesting depth falls back to
        the depth bookmarked for the active state, that state is popped.

        Returns True if a state was popped, False otherwise.
        """
        state_changed = False

        if tag == 'div':
            if wikidot.debug.ENABLE_DEBUG:
                sys.stderr.write("{} {}\n".format(self.state, self.div_bookmark))
            self.div_level -= 1
            # The initial "none" state is a sentinel and is never popped, so
            # that an unbalanced </div> cannot empty the stack.
            if len(self.state) > 1 and self.div_level == self.div_bookmark[-1]:
                self.state.pop()
                self.div_bookmark.pop()
                state_changed = True

        self.current_state = self.__get_current_state__()
        return state_changed

    def __get_current_state__(self):
        """Return the state on top of the stack."""
        return self.state[-1]

    def __handle_body_tag__(self, tag, attrs):
        """Process the tags of the body that need special care.

        <a>   -- the first href is registered in ``links``
        <img> -- every src is registered in ``links``, and a width given as
                 "<n>px" is rewritten in place (in ``attrs``) as "<n>"

        Other tags are left alone.
        """
        if tag == 'a':
            for attr_name, attr_value in attrs:
                if attr_name == 'href':
                    self.links.add(attr_value)
                    break
        elif tag == 'img':
            for index, (attr_name, attr_value) in enumerate(attrs):
                if attr_name == 'src':
                    self.links.add(attr_value)
                elif attr_name == 'width' and attr_value is not None:
                    # Keep only what precedes the 'px' unit.
                    pos = attr_value.find('px')
                    if pos >= 0:
                        attrs[index] = (attr_name, attr_value[0:pos])