# Aseba - an event-based framework for distributed robot control
# Copyright (C) 2007--2011:
#         Stephane Magnenat <stephane at magnenat dot net>
#         (http://stephane.magnenat.net)
#         and other contributors, see authors.txt for details
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# System lib
import os
import os.path
import sys
import urllib2
import mimetypes
import subprocess

# Custom lib
import wikidot.debug
import wikidot.structure
from wikidot.urltoname import urltoname
from wikidot.parser import WikidotParser
import wikidot.tex2png

def __fix_breadcrumbs__(breadcrumbs, toplevel):
    # Check if toplevel is part of the breadcrumbs,
    # and rebase the breadcrumbs on this level.
    if toplevel == '':
        return breadcrumbs
    output = list()
    output_enable = False
    for bc in breadcrumbs:
        if output_enable:
            output.append(bc)
        if toplevel in bc:
            output_enable = True
    return output

def fetchurl(page, offline_name, breadcrumbs=''):
    """Given a wikidot URL, fetch it, convert it and store it locally.

    Inputs:
        page:          URL of the wikidot page
        offline_name:  Local file name for storage purposes
        breadcrumbs:   (Optional) Name of the root page. If given,
                       checks that the breadcrumbs reference this page
                       (child page). If not, the page is discarded.

    Output:
        Links to other pages / images."""
    try:
        # Get the page
        print >> sys.stderr, "Connecting to {}...".format(page)
        response = urllib2.urlopen(page)
    except urllib2.HTTPError, e:
        print >> sys.stderr, e.code
    except urllib2.URLError, e:
        print >> sys.stderr, e.reason
    else:
        retval = dict()
        retval['links'] = set()
        retval['breadcrumbs'] = list()
        # Check the MIME type
        mime = mimetypes.guess_type(page)[0]
        if (mime is None) or ('html' in mime):
            # HTML or unknown type:
            # convert the wikidot page and check the breadcrumbs
            print >> sys.stderr, "Parsing..."
            parser = WikidotParser()
            parser.feed(response.read())
            # Test if the breadcrumbs link to the main page
            page_breadcrumbs = parser.get_breadcrumbs()
            page_breadcrumbs = __fix_breadcrumbs__(page_breadcrumbs, breadcrumbs)
            retval['breadcrumbs'].extend(page_breadcrumbs)
            if (breadcrumbs == '') or (len(page_breadcrumbs) > 0):
                # OK, the page is valid
                wikidot.structure.insert(parser.get_title(), page, page_breadcrumbs if breadcrumbs != '' else set())
                data = parser.get_doc()
                retval['links'] = parser.get_links()
            else:
                # The page is not linked to the main page
                print >> sys.stderr, "*** Page is not part of the documentation. Page skipped."
                return retval
        elif 'image' in mime:
            # Image
            data = response.read()
        else:
            # The type is not supported
            if wikidot.debug.ENABLE_DEBUG:
                print >> sys.stderr, "*** This is not a supported type of file. File skipped."
            return retval
        # Save the result; write in binary mode so that image data is not corrupted
        print >> sys.stderr, "Saving the result to {}...".format(offline_name)
        f = open(offline_name, 'wb')
        f.write(data)
        f.close()
        if wikidot.debug.ENABLE_DEBUG:
            print >> sys.stderr, "***DEBUG: links: ", retval['links']
        return retval
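# A minimal usage sketch (an assumption, not part of the original module):
# fetchurl() returns the set of links found on each page, so a caller can
# crawl the documentation iteratively. The start URL and root page name
# below are hypothetical examples; note that fetchurl() returns None on
# network errors, hence the guard before dereferencing the result.
#
#     to_visit = set(['http://aseba.wikidot.com/en:start'])
#     visited = set()
#     while to_visit:
#         url = to_visit.pop()
#         visited.add(url)
#         result = fetchurl(url, urltoname(url), 'en:start')
#         if result:
#             to_visit |= (result['links'] - visited)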
def tidy(directory):
    """Given a directory, run HTML Tidy on every HTML file to clean it up."""
    html_files = [x for x in os.listdir(directory) if '.html' in x]
    print >> sys.stderr, "\nCleaning HTML files..."
    for x in html_files:
        filename = os.path.join(directory, x)
        print >> sys.stderr, "Processing ", filename
        retcode = subprocess.call(["tidy", "-config", "wikidot/tidy.config", "-q", "-o", filename, filename])

def fix_latex(directory):
    """Given a directory, convert LaTeX code to PNG images for every HTML file.

    Inputs:
        directory:  Source directory

    Output:
        No output."""
    html_files = [x for x in os.listdir(directory) if '.html' in x]
    print >> sys.stderr, "\nConverting LaTeX equations embedded in HTML files..."
    for x in html_files:
        filename = os.path.join(directory, x)
        print >> sys.stderr, "Processing ", filename
        wikidot.tex2png.from_html(filename, os.path.splitext(filename)[0])
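# Hypothetical driver (a sketch, assuming the crawled pages were saved into a
# local 'doc' directory): once the crawl is done, clean the downloaded HTML
# with HTML Tidy and render the embedded LaTeX equations to PNG images.
#
#     if __name__ == '__main__':
#         tidy('doc')
#         fix_latex('doc')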