#! /usr/bin/env python

# Aseba - an event-based framework for distributed robot control
# Copyright (C) 2007--2011:
# Stephane Magnenat <stephane at magnenat dot net>
# (http://stephane.magnenat.net)
# and other contributors, see authors.txt for details
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# System lib
import sys
import os.path
import urlparse

# Custom lib
from wikidot.tools import fetchurl
from wikidot.fixurl import fixurls
from wikidot.tools import tidy
from wikidot.tools import fix_latex
from wikidot.urltoname import urltoname
from wikidot.orderedset import OrderedSet

def fetchwikidot(starturl, outputdir):
    # Create the output directory, if needed
    try:
        os.stat(outputdir)
    except OSError:
        print >> sys.stderr, "Creating output directory:", outputdir
        os.mkdir(outputdir)

    # Fetch the root page
    output = os.path.join(outputdir, urltoname(starturl))
    retval = fetchurl(starturl, output)
    newlinks = retval['links']
    breadcrumbs = retval['breadcrumbs']
    # Keep only the last element of the breadcrumbs list
    if len(breadcrumbs) > 0:
        breadcrumbs = breadcrumbs[-1]
    else:
        breadcrumbs = ''

    # Create a set of already-fetched links (avoid loops); seed it with the
    # start URL itself (assumes OrderedSet takes an iterable, like set())
    links = OrderedSet([starturl])

    # Iterate on the links, and recursively download / convert
    fetchlinks = newlinks
    while len(fetchlinks) > 0:
        newlinks = OrderedSet()
        for url in fetchlinks:
            url = urlparse.urljoin(starturl, url)
            output = os.path.join(outputdir, urltoname(url))
            print >> sys.stderr, "\nProcessing", url
            # Link on the same server?
            if urlparse.urlparse(url).netloc == urlparse.urlparse(starturl).netloc:
                retval = fetchurl(url, output, breadcrumbs)
                newlinks.update(retval['links'])
            else:
                print >> sys.stderr, "*** {} is not on the same server. Link skipped.".format(url)
        # Update the sets of links
        links.update(fetchlinks)
        fetchlinks = newlinks - links

    # Fix local URLs in the files of the output directory
    fixurls(outputdir, starturl)

    # Clean the HTML code
    tidy(outputdir)

    # Convert equations to PNG
    fix_latex(outputdir)

    # We are done
    print >> sys.stderr, "\nDone!"


# When executed from the command line
if __name__ == "__main__":
    # Input arguments
    if len(sys.argv) == 3:
        starturl = sys.argv[1]
        outputdir = sys.argv[2]
    else:
        print >> sys.stderr, "Wrong number of arguments.\n"
        print >> sys.stderr, "Usage:"
        print >> sys.stderr, " {} root_page output_dir".format(sys.argv[0])
        sys.exit(1)

    fetchwikidot(starturl, outputdir)
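
# Example invocation (a sketch; the script name, URL and output directory are
# hypothetical, and the "wikidot" helper package imported above is assumed to
# be importable, e.g. from the same directory):
#
#   python fetchwikidot.py http://example.wikidot.com ./mirror
#
# This mirrors the start page plus every same-server page reachable from it
# into ./mirror, then fixes the local URLs, tidies the HTML, and renders the
# LaTeX equations to PNG in place. The same can be done programmatically:
#
#   from fetchwikidot import fetchwikidot
#   fetchwikidot('http://example.wikidot.com', './mirror')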