Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 import os
00021 import os.path
00022 import sys
00023 import urllib2
00024 import mimetypes
00025 import subprocess
00026
00027
00028 import wikidot.debug
00029 import wikidot.structure
00030 from wikidot.urltoname import urltoname
00031 from wikidot.parser import WikidotParser
00032 import wikidot.tex2png
00033
00034 def __fix_breadcrumbs__(breadcrumbs, toplevel):
00035
00036
00037 if toplevel == '':
00038 return breadcrumbs
00039 output = list()
00040 output_enable = False
00041 for bc in breadcrumbs:
00042 if output_enable:
00043 output.append(bc)
00044 if toplevel in bc:
00045 output_enable = True
00046 return output
00047
00048 def fetchurl(page, offline_name, breadcrumbs = ''):
00049 """Given a wikidot URL, fetch it, convert it and store it locally.
00050
00051 Inputs:
00052 page: URL of the wikidot page
00053 offline_name Local file name for storage purpose
00054 breadcrumbs (Optional) Name of the root page. If given,
00055 checks that the breadcrumbs reference this page
00056 (child page). If not, page is discarded
00057 Output:
00058 Links to other pages / images."""
00059 try:
00060
00061 print >> sys.stderr, "Connecting to {}...".format(page)
00062 response = urllib2.urlopen(page)
00063 except urllib2.HTTPError, e:
00064 print >> sys.stderr, e.code
00065 except urllib2.URLError, e:
00066 print >> sys.stderr, e.reason
00067 else:
00068 retval = dict()
00069 retval['links'] = set()
00070 retval['breadcrumbs'] = list()
00071
00072 mime = mimetypes.guess_type(page)[0]
00073 if (mime == None) or ('html' in mime):
00074
00075
00076 print >> sys.stderr, "Parsing..."
00077 parser = WikidotParser()
00078 parser.feed(response.read())
00079
00080 page_breadcrumbs = parser.get_breadcrumbs()
00081 page_breadcrumbs = __fix_breadcrumbs__(page_breadcrumbs, breadcrumbs)
00082 retval['breadcrumbs'].extend(page_breadcrumbs)
00083 if (breadcrumbs == '') or (len(page_breadcrumbs) > 0):
00084
00085 wikidot.structure.insert(parser.get_title(), page, page_breadcrumbs if breadcrumbs != '' else set())
00086 data = parser.get_doc()
00087 retval['links'] = parser.get_links()
00088 else:
00089
00090 print >> sys.stderr, "*** Page is not part of the documentation. Page skipped."
00091 return retval
00092 elif ('image' in mime):
00093
00094 data = response.read()
00095 else:
00096
00097 if wikidot.debug.ENABLE_DEBUG == True:
00098 print >> sys.stderr, "*** This is not a supported type of file. File skipped."
00099 return retval
00100
00101 print >> sys.stderr, "Saving the result to {}...".format(offline_name)
00102 f = open(offline_name, 'w')
00103 f.write(data)
00104 f.close()
00105 if wikidot.debug.ENABLE_DEBUG == True:
00106 print >> sys.stderr, "***DEBUG: links: ", reval['links']
00107 return retval
00108
def tidy(directory):
    """Given a directory, clean up every HTML file in it with HTML Tidy.

    Inputs:
      directory: Directory containing the HTML files (processed in place).

    Output:
      No output."""
    html_files = [x for x in os.listdir(directory) if '.html' in x]
    print >> sys.stderr, "\nCleaning HTML files..."
    for x in html_files:
        filename = os.path.join(directory, x)
        print >> sys.stderr, "Processing ", filename
        retcode = subprocess.call(["tidy","-config", "wikidot/tidy.config", "-q", "-o", filename, filename])
        # tidy exits 1/2 for warnings/errors, which is routine; a negative
        # value means it was killed by a signal — report that instead of
        # silently discarding the return code.
        if retcode < 0:
            print >> sys.stderr, "*** tidy was terminated by signal", -retcode
00116
def fix_latex(directory):
    """Given a directory, convert LaTeX code to PNG images for every HTML file.

    Inputs:
      directory: Source directory

    Output:
      No output."""
    print >> sys.stderr, "\nConverting LaTeX equations embedded in HTML files..."
    for entry in os.listdir(directory):
        # Only HTML files can embed LaTeX equations.
        if '.html' not in entry:
            continue
        path = os.path.join(directory, entry)
        print >> sys.stderr, "Processing ", path
        # Render equations into PNGs named after the file (extension dropped).
        wikidot.tex2png.from_html(path, os.path.splitext(path)[0])
00132