Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 import os
00021 import os.path
00022 import sys
00023 import urlparse
00024
00025
00026 from wikidot.myparser import MyParser
00027 from wikidot.urltoname import urltoname
00028
00029
class FixURL(MyParser):
    """Fix HTML links (as well as images), so they point to the local files.

    If a local file is not available for a link, the full link to the
    remote file is built instead.

    The list of available local files is given at initialization.
    The links rewritten since the last reset() are recorded and can be
    read back with get_local_links() and get_remote_links()."""

    # Tags whose links are rewritten, mapped to the attribute holding the link.
    _LINK_ATTRS = {'a': 'href', 'img': 'src'}

    def __init__(self, links, host):
        """Initialization.

        links: iterable of available local file names
        host: full path to remote host, used as the base when a link
              has to point to the remote site"""
        MyParser.__init__(self)
        # Stored as a set so that each link lookup is a hash test rather
        # than a scan (callers may pass a plain list, e.g. os.listdir()).
        self.local_links = set(links)
        self.remote_host = host
        self.reset()

    def reset(self):
        """Reset the parser and forget the links recorded so far."""
        MyParser.reset(self)
        self.local_set = set()
        self.remote_set = set()

    def get_local_links(self):
        """Return the set of local names that links were rewritten to."""
        return self.local_set

    def get_remote_links(self):
        """Return the set of remote URLs that links were rewritten to."""
        return self.remote_set

    def __is_link_local__(self, link):
        """Private - Tell if a link matches a local file.

        Output:
            True if urltoname(link) is one of the available local files
            False otherwise"""
        return urltoname(link) in self.local_links

    def __is_link_toc__(self, link):
        """Private - Tell if the link is part of the Table of Contents.

        Output:
            True if the link is of the form #tocXYZ
            False otherwise"""
        return link.startswith('#toc')

    def __fix_link__(self, link):
        """Private - Take a link and convert it, either to a local link
        or to a link pointing to the remote host.

        Table of Contents anchors (#tocXYZ) are returned unchanged and are
        not recorded in either link set."""
        if self.__is_link_toc__(link):
            # In-page anchor: nothing to rewrite.
            return link

        if self.__is_link_local__(link):
            # A local copy exists: point to it.
            new_link = urltoname(link)
            self.local_set.add(new_link)
        else:
            # No local copy: build the full link to the remote host.
            new_link = urlparse.urljoin(self.remote_host, link)
            self.remote_set.add(new_link)
        return new_link

    def handle_starttag(self, tag, attrs):
        """Overridden - Parse links and convert them.

        <a> tags are searched for 'href' and <img> tags for 'src'. Only
        the first matching attribute of a tag is rewritten, in place in
        attrs, before the tag is handed over to MyParser."""
        wanted = self._LINK_ATTRS.get(tag)
        if wanted is not None:
            for index, attr in enumerate(attrs):
                if attr[0] == wanted:
                    # A valueless attribute (e.g. <a href>) carries None
                    # instead of a string; there is no link to convert.
                    if attr[1] is not None:
                        attrs[index] = attr[0], self.__fix_link__(attr[1])
                    break

        MyParser.handle_starttag(self, tag, attrs)
00120
00121
00122
def fixurls(directory, base_url):
    """Iterate over the files of a directory, and fix the links to point to
    local files.

    directory: directory whose HTML files are rewritten in place
    base_url: remote host used to build links that have no local file

    Progress, the local names used and the remote URLs produced are
    reported on stderr."""

    files = os.listdir(directory)
    # NOTE(review): this is a substring test, so names such as
    # 'page.html.bak' are processed too - confirm whether that is intended.
    html_files = [x for x in files if '.html' in x]

    # Every file of the directory counts as an available local target.
    fix = FixURL(files, base_url)
    print >> sys.stderr, "\nFixing URLs..."
    local_set = set()
    remote_set = set()
    for x in html_files:
        file_name = os.path.join(directory, x)
        print >> sys.stderr, "Processing ", file_name

        f = open(file_name, 'r')
        try:
            fix.feed(f.read())
        finally:
            f.close()

        # Build the new content BEFORE reopening the file: opening with 'w'
        # truncates it, so a failure in get_doc() would otherwise lose
        # the original content.
        doc = fix.get_doc()

        f = open(file_name, 'w')
        try:
            f.write(doc)
        finally:
            f.close()

        local_set.update(fix.get_local_links())
        remote_set.update(fix.get_remote_links())
        # Start the next file with a clean parser and empty link sets.
        fix.reset()

    print >> sys.stderr, "\nUpdated local URLs: "
    for x in sorted(local_set):
        print >> sys.stderr, "  ", x
    print >> sys.stderr, "\nRemote URLs: "
    for x in sorted(remote_set):
        print >> sys.stderr, "  ", x
00156
00157