fetch.py
Go to the documentation of this file.
00001 #! /usr/bin/env python
00002 
00003 #   Aseba - an event-based framework for distributed robot control
00004 #   Copyright (C) 2007--2011:
00005 #           Stephane Magnenat <stephane at magnenat dot net>
00006 #           (http://stephane.magnenat.net)
00007 #           and other contributors, see authors.txt for details
00008 #
00009 #   This program is free software: you can redistribute it and/or modify
00010 #   it under the terms of the GNU Lesser General Public License as published
00011 #   by the Free Software Foundation, version 3 of the License.
00012 #
00013 #   This program is distributed in the hope that it will be useful,
00014 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016 #   GNU Lesser General Public License for more details.
00017 #
00018 #   You should have received a copy of the GNU Lesser General Public License
00019 #   along with this program. If not, see <http://www.gnu.org/licenses/>.
00020 
00021 # System lib
00022 import sys
00023 import os.path
00024 import urlparse
00025 
00026 # Custom lib
00027 from wikidot.tools import fetchurl
00028 from wikidot.fixurl import fixurls
00029 from wikidot.tools import tidy
00030 from wikidot.tools import fix_latex
00031 from wikidot.urltoname import urltoname
00032 from wikidot.orderedset import OrderedSet
00033 
def fetchwikidot(starturl, outputdir):
    """Download a wikidot page and every same-server page reachable from it.

    The root page is fetched first. The links it reports are then followed
    round by round: each round fetches the pending links, collects the links
    found on those pages, and keeps only the ones not seen in earlier rounds.
    Links pointing to another server are reported on stderr and skipped.
    When no new link remains, the files in the output directory are
    post-processed (local URL fixing, HTML clean-up, equation conversion).

    Args:
        starturl: URL of the root page. Links found on pages are resolved
            against it with urlparse.urljoin.
        outputdir: Directory receiving the downloaded files. It is created
            if it does not exist yet.

    Progress and diagnostics are written to sys.stderr. Nothing is returned.
    """
    # Create the output directory, if needed
    if not os.path.exists(outputdir):
        print >> sys.stderr, "Creating output directory; ", outputdir
        os.mkdir(outputdir)

    # Fetch root page
    output = os.path.join(outputdir, urltoname(starturl))
    retval = fetchurl(starturl, output)
    fetchlinks = retval['links']
    # Only the last breadcrumb of the root page is handed to the sub-pages
    trail = retval['breadcrumbs']
    breadcrumbs = trail[-1] if trail else ''

    # Set of already fetched links (avoid loops...).
    # The URL is wrapped in a list so that it is stored as ONE element:
    # a bare string would be iterated character by character.
    # NOTE(review): assumes OrderedSet takes an iterable, like set() does --
    # confirm against wikidot.orderedset.
    links = OrderedSet([starturl])

    # The server of the root page does not change: compute it only once
    start_netloc = urlparse.urlparse(starturl).netloc

    # Iterate on the links, and recursively download / convert
    while len(fetchlinks) > 0:
        newlinks = OrderedSet()
        for url in fetchlinks:
            url = urlparse.urljoin(starturl, url)
            output = os.path.join(outputdir, urltoname(url))
            print >> sys.stderr, "\nProcessing ", url
            # Link on the same server?
            if urlparse.urlparse(url).netloc == start_netloc:
                retval = fetchurl(url, output, breadcrumbs)
                newlinks.update(retval['links'])
            else:
                print >> sys.stderr, "*** {} is not on the same server. Link skipped.".format(url)
        # Everything from this round is now known; the next round only
        # handles links that have never been seen before
        links.update(fetchlinks)
        fetchlinks = newlinks - links

    # Fix local urls for the files of the output directory
    fixurls(outputdir, starturl)

    # Clean HTML code
    tidy(outputdir)

    # Convert equations to PNG
    fix_latex(outputdir)

    # We are done
    print >> sys.stderr, "\nDone!"
00085 
00086 
00087 # When executed from the command line
if __name__ == "__main__":
    # Exactly two arguments are expected: the root page and the output
    # directory. Anything else prints the usage on stderr and aborts.
    if len(sys.argv) != 3:
        print >> sys.stderr, "Wrong number of arguments.\n"
        print >> sys.stderr, "Usage:"
        print >> sys.stderr, "  {} root_page output_dir".format(sys.argv[0])
        # sys.exit() instead of the bare exit(): the latter is only injected
        # by the site module and is missing when it is not loaded
        sys.exit(1)

    starturl = sys.argv[1]
    outputdir = sys.argv[2]

    fetchwikidot(starturl, outputdir)


aseba
Author(s): Stéphane Magnenat
autogenerated on Thu Jan 2 2014 11:17:16