import os
import re
import sys
import time
import logging
import threading
import urllib.request
import xml.dom.minidom
from xml.dom.minidom import Document
from threading import Thread


################### Some helper functions and classes ##########################


# Downloads the content of a web page and decodes it as ISO-8859-1
def get_page_content(page):
    url = urllib.request.urlopen(page)
    raw_content = url.read()
    url.close()
    return raw_content.decode('iso-8859-1', 'ignore')

# Checks if a directory exists and if not creates it
def ensure_directory(dir):
    if not os.path.exists(dir):
        os.makedirs(dir)

# Extracts the file name of a link, without its extension.
# For example get_id_from_url("products/test.htm") == "test"
def get_id_from_url(page):
    return page[page.rfind('/') + 1 : page.rfind('.')]


################### A class for the crawler threads ##########################

class ProcessorThread(Thread):
    def __init__(self, crawler, item, page, data_directory, xml_node):
        Thread.__init__(self)
        self.crawler = crawler
        self.item = item
        self.page = page
        self.data_directory = data_directory
        self.xml_node = xml_node

    def run(self):
        product = self.crawler.process_product_page(self.page)
        if product['image_url']:
            self.crawler.save_product(self.item, product['image_url'], self.data_directory)

        product_element = self.crawler.doc.createElement('product')
        product_element.setAttribute('description', product['image_description'])
        product_element.setAttribute('perishability', product['perishability'])
        product_element.setAttribute('weight', product['info_weight'])
        product_element.setAttribute('brand', product['info_brand'])
        product_element.setAttribute('country_of_origin', product['info_countryoforigin'])
        product_element.setAttribute('sale_price', product['info_sale_price'])
        product_element.setAttribute('product_code', product['info_code'])
        product_element.setAttribute('link', self.page)
        product_element.setAttribute('id', self.item)
        product_element.setAttribute('location', self.data_directory)

        # minidom is not thread-safe, so serialize modifications of the shared document
        with self.crawler.doc_lock:
            self.xml_node.appendChild(product_element)


################### The main crawler class ##########################

class GermanDeliCrawler:

    def __init__(self, data_directory, max_threads=999999, base_url="http://www.germandeli.com/"):
        self.base_url = base_url
        self.max_threads = max_threads
        self.data_directory = data_directory
        self.doc = Document()
        self.doc_lock = threading.Lock()
        root = self.doc.createElement('group')
        self.doc.appendChild(root)

        # the log file lives inside the data directory, so make sure it exists
        ensure_directory(self.data_directory)

        self.logger = logging.getLogger('GermanDeliCrawler')
        file_handler = logging.FileHandler(os.path.join(self.data_directory, 'crawler.log'))
        console_handler = logging.StreamHandler(sys.stdout)

        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        file_handler.setFormatter(formatter)
        console_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)
        self.logger.addHandler(console_handler)
        self.logger.setLevel(logging.INFO)

    # Extracts the structure of the Germandeli.com site and stores it in an XML file
    def extract_germandeli_structure(self):
        self.log("Extracting site structure...")
        content = get_page_content(self.base_url)

        nav_start = '<ul id="nav">'
        nav_end = '\n</div>\n<div id="bodyshell" class="clear">'

        nav_start_id = content.find(nav_start)
        nav_end_id = content.find(nav_end)

        # escape bare ampersands and drop <BR> tags so the menu parses as XML
        nav_string = content[nav_start_id:nav_end_id].replace("&", "&amp;").replace("<BR>", "")

        structure = xml.dom.minidom.parseString(nav_string.encode("utf-8"))

        self.trace_structure_node_to_xml(structure.firstChild, self.doc.firstChild)

        self.save_document()

    # Helper function for the extraction of the structure of the site
    def trace_structure_node_to_xml(self, node, xml_node):
        # skip text and comment nodes, which have no tagName
        if node.nodeType != node.ELEMENT_NODE:
            return

        if node.tagName == 'a':
            if node.firstChild is not None and node.firstChild.nodeValue is not None:
                # the page is really UTF-8, so undo the ISO-8859-1 decoding
                name = node.firstChild.nodeValue.encode('iso-8859-1').decode('utf-8', 'ignore')
                xml_node.setAttribute('name', name)
            xml_node.setAttribute('link', node.getAttribute('href'))
            return

        if node.tagName == 'li':
            new_node = self.doc.createElement('page')
            xml_node.appendChild(new_node)
            xml_node.tagName = 'group'
            xml_node = new_node

        for child in node.childNodes:
            self.trace_structure_node_to_xml(child, xml_node)

    # Loads the structure of the Germandeli.com site from an XML file
    def load_structure_from_file(self):
        self.log("Loading site structure from file...")
        self.doc = xml.dom.minidom.parse(os.path.join(self.data_directory, 'structure.xml'))

    # Saves the structure XML file
    def save_document(self):
        f = open(os.path.join(self.data_directory, 'structure.xml'), 'wb')
        f.write(self.doc.toprettyxml('\t', '\n', 'utf-8'))
        f.close()

    # Processes a node from the XML description of the site structure
    def process_node(self, node, data_directory):
        if node.nodeType != node.ELEMENT_NODE:
            return

        # if the node is of type page we should get all products listed there
        if node.tagName == 'page':
            page_directory = os.path.join(data_directory, get_id_from_url(node.getAttribute('link')))
            ensure_directory(page_directory)
            self.process_page(node.getAttribute('link'), node, page_directory)

        # if the node is of type group, we should create a new directory and process its children
        elif node.tagName == 'group':
            group_directory = os.path.join(data_directory, get_id_from_url(node.getAttribute('link')))
            ensure_directory(group_directory)
            for child in node.childNodes:
                self.process_node(child, group_directory)

        # if the node is unknown just process its children
        else:
            for child in node.childNodes:
                self.process_node(child, data_directory)

    # Processes a page with products
    def process_page(self, page, xml_node, data_directory):
        if xml_node.getAttribute('traced') == '1':
            self.log("Skipping category: " + page)
            return

        self.log("Processing category: " + page)

        content = get_page_content(self.base_url + page)
        head_content = content[:content.find('<body')]

        items = re.findall(r'window\.item[0-9].+?pagingItem\("(.+?)"', head_content)

        item_count = len(items)
        for item_number, item in enumerate(items, 1):
            product_page = self.base_url + item + ".html"
            self.log("\tProcessing product (%d/%d): %s" % (item_number, item_count, product_page))

            # start a new thread, which downloads the picture and description of a single product
            thread = ProcessorThread(self, item, product_page, data_directory, xml_node)
            thread.start()

            if item_number % self.max_threads == 0:
                self.wait_for_threads()

        # wait for all threads to finish
        self.wait_for_threads()

        # mark the node as traced
        xml_node.setAttribute('traced', '1')
        self.save_document()

    # Polls until only the main thread is left running
    def wait_for_threads(self):
        sys.stdout.write(str(threading.active_count()))
        while threading.active_count() > 1:
            time.sleep(0.5)
            sys.stdout.write(', ' + str(threading.active_count()))
            sys.stdout.flush()

        sys.stdout.write('\n')

    # Processes a single product
    def process_product_page(self, product_page):
        content = get_page_content(product_page)

        result = {}

        # image information
        image_div_start = content.find('<div class="item-images">')
        image_div_end = content.find('</div>', image_div_start)

        image_content = content[image_div_start:image_div_end]

        m = re.search(r'<img.*?src="(.*?)".*?alt="(.*?)"', image_content, re.S)
        result['image_url'] = m.group(1) if m else ''
        result['image_description'] = m.group(2) if m else ''

        # perishability
        m = re.search(r'<div.*?class="perishable.*?>.*?<img.*?alt="(.*?)".*?>', content, re.S)
        result['perishability'] = m.group(1) if m else ''

        # info table
        m = re.search(r'<table.*?id="product-info-table".*?>(.*?)</table>', content, re.S)
        info_content = m.group(1) if m else ''

        # look for brand, country of origin, code, weight and sale price in the info table
        for key, row_class in [('info_brand', 'brand'),
                               ('info_countryoforigin', 'countryoforigin'),
                               ('info_code', 'code'),
                               ('info_weight', 'weight'),
                               ('info_sale_price', 'sale-price')]:
            m = re.search(r'<tr.*?class="%s".*?>.*?<td>(.*?)</td>' % row_class, info_content, re.S)
            result[key] = m.group(1) if m else ''

        return result

    # Saves the product information (currently just the picture)
    def save_product(self, product_id, picture_url, data_directory):
        urllib.request.urlretrieve(picture_url, os.path.join(data_directory, product_id + ".jpg"))

    def start(self):
        self.process_node(self.doc.firstChild, self.data_directory)

    def log(self, message):
        self.logger.info(message)
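For completeness, here is how the class is meant to be driven, pieced together from its public methods: extract the structure once (or reload it from a previous run), then start crawling. This is a minimal sketch; the directory name and thread limit are arbitrary example values, not taken from the original code.

# A minimal usage sketch. 'data' and max_threads=10 are example values.
if __name__ == '__main__':
    crawler = GermanDeliCrawler('data', max_threads=10)
    crawler.extract_germandeli_structure()  # or crawler.load_structure_from_file()
    crawler.start()

Because process_page marks each category node with traced='1' and saves structure.xml after every category, an interrupted crawl can be resumed by calling load_structure_from_file() instead of re-extracting the structure.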