fixurl.py
Go to the documentation of this file.
00001 #   Aseba - an event-based framework for distributed robot control
00002 #   Copyright (C) 2007--2011:
00003 #           Stephane Magnenat <stephane at magnenat dot net>
00004 #           (http://stephane.magnenat.net)
00005 #           and other contributors, see authors.txt for details
00006 #
00007 #   This program is free software: you can redistribute it and/or modify
00008 #   it under the terms of the GNU Lesser General Public License as published
00009 #   by the Free Software Foundation, version 3 of the License.
00010 #
00011 #   This program is distributed in the hope that it will be useful,
00012 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 #   GNU Lesser General Public License for more details.
00015 #
00016 #   You should have received a copy of the GNU Lesser General Public License
00017 #   along with this program. If not, see <http://www.gnu.org/licenses/>.
00018 
00019 # System lib
00020 import os
00021 import os.path
00022 import sys
00023 import urlparse
00024 
00025 # Custom lib
00026 from wikidot.myparser import MyParser
00027 from wikidot.urltoname import urltoname
00028 
00029 
class FixURL(MyParser):
    """Fix HTML links (as well as images), so they point to the local files.

    The ``href`` of every ``<a>`` tag and the ``src`` of every ``<img>`` tag
    are inspected. If the name computed by ``urltoname()`` for the link is
    one of the available local files, the link is replaced by that name.
    Otherwise, the full link to the remote file is built by joining the
    link with the remote host. Links starting with ``#toc`` are kept as is.

    The list of available local files is given at initialization."""

    def __init__(self, links, host):
        """Initialization.

        links: collection of available local file names
               (only used for membership tests)
        host: full path to remote host, used as the base of remote links"""
        # Explicit base-class calls are kept on purpose: whether MyParser is
        # a new-style class (required by super()) is not visible from here.
        MyParser.__init__(self)
        self.local_links = links
        self.remote_host = host
        self.reset()

    def reset(self):
        """Reset the parser and forget the links recorded so far."""
        MyParser.reset(self)
        self.local_set = set()      # Local links written by __fix_link__
        self.remote_set = set()     # Remote links written by __fix_link__

    # Public functions
    def get_local_links(self):
        """Return the set of local links produced since the last reset."""
        return self.local_set

    def get_remote_links(self):
        """Return the set of remote links produced since the last reset."""
        return self.remote_set

    # Private functions
    def __is_link_local__(self, link):
        """Private - Tell if a link matches a local file.

        Output:
            True if urltoname(link) is one of the local files
            False otherwise"""
        return urltoname(link) in self.local_links

    def __is_link_toc__(self, link):
        """Private - Tell if the link is part of the Table of Content.

        Output:
            True if the link is of the form #tocXYZ
            False otherwise"""
        return link.startswith('#toc')

    def __fix_link__(self, link):
        """Private - Take a link and convert it, either to a local link,
        or to a link pointing to the remote host.

        The converted link is recorded in the local or remote set.

        Output:
            the link itself if it is None or a #toc anchor
            the local file name if a local file matches the link
            the link joined with the remote host otherwise"""
        if link is None:
            # Attribute without a value (e.g. <a href>): there is nothing to
            # convert, and the string tests below would fail on None.
            # NOTE(review): assumes MyParser reports such attributes as None,
            # as the standard HTMLParser does - confirm.
            return link

        if self.__is_link_toc__(link):
            # don't touch it!
            return link

        if self.__is_link_local__(link):
            # Convert link
            new_link = urltoname(link)
            self.local_set.add(new_link)
        else:
            # Remote link
            new_link = urlparse.urljoin(self.remote_host, link)
            self.remote_set.add(new_link)
        return new_link

    # Inherited functions
    def handle_starttag(self, tag, attrs):
        """Overridden - Parse links and convert them.

        <a> and <img> tags are looked for links: only the first 'href'
        (resp. 'src') attribute is converted, in place, in attrs. The tag
        is then handed over to MyParser.handle_starttag()."""
        if tag == 'a':
            # Special case 1: links
            wanted = 'href'
        elif tag == 'img':
            # Special case 2: images
            wanted = 'src'
        else:
            wanted = None

        if wanted is not None:
            for index, (name, value) in enumerate(attrs):
                if name == wanted:
                    attrs[index] = name, self.__fix_link__(value)
                    break

        MyParser.handle_starttag(self, tag, attrs)
00120 
00121 
00122 
def fixurls(directory, base_url):
    """Iterate over the files of a directory, and fix the links to point to
    local files.

    Every file whose name contains '.html' is parsed with FixURL and
    overwritten in place with the fixed document. Progress, and the sorted
    lists of local and remote links, are printed on stderr.

    directory: path of the directory holding the files
    base_url: full path to the remote host, used to build remote links"""
    # List all files, then HTML files to be fixed
    files = os.listdir(directory)
    # NOTE(review): substring match, so e.g. 'page.html.bak' is processed
    # too - confirm whether only names ending with '.html' are intended.
    html_files = [name for name in files if '.html' in name]
    # Create the 'fixer'; a set gives constant-time lookups of local files
    fix = FixURL(set(files), base_url)
    print >> sys.stderr, "\nFixing URLs..."
    local_set = set()
    remote_set = set()
    for name in html_files:
        file_name = os.path.join(directory, name)
        print >> sys.stderr, "Processing ", file_name
        # Parse the file
        with open(file_name, 'r') as f:
            fix.feed(f.read())
        # Build the result before reopening the file: opening with 'w'
        # truncates it, so a failure in get_doc() must happen first.
        fixed_doc = fix.get_doc()
        # Write result to the file
        with open(file_name, 'w') as f:
            f.write(fixed_doc)
        # Collect the links of this file, then reset the parser
        local_set.update(fix.get_local_links())
        remote_set.update(fix.get_remote_links())
        fix.reset()

    print >> sys.stderr, "\nUpdated local URLs: "
    for link in sorted(local_set):
        print >> sys.stderr, "  ", link
    print >> sys.stderr, "\nRemote URLs: "
    for link in sorted(remote_set):
        print >> sys.stderr, "  ", link
00156 
00157 


aseba
Author(s): Stéphane Magnenat
autogenerated on Sun Oct 5 2014 23:46:38