crawler.py
import urllib.request
import xml.dom.minidom
import re
import os
import logging
import sys
import time
from xml.dom.minidom import Document
from threading import Thread
import threading


###################  Some helper functions and classes ##########################


# Downloads the content of a web page and decodes it as ISO-8859-1
def get_page_content(page):
    url = urllib.request.urlopen(page)
    raw_content = url.read()
    url.close()
    return raw_content.decode('iso-8859-1', 'ignore')

# Checks if a directory exists and creates it if not
def ensure_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

# Extracts the id from a link by stripping the path and the extension.
# For example get_id_from_url("products/test.htm") == "test"
def get_id_from_url(page):
    return page[page.rfind('/')+1 : page.rfind('.')]


###################  A class for the crawler threads  ##########################

# A worker thread that downloads the picture of a single product and appends
# a <product> element with its details to the given XML node
class ProcessorThread(Thread):
    def __init__(self, crawler, item, page, data_directory, xml_node):
        Thread.__init__(self)
        self.crawler = crawler
        self.item = item
        self.page = page
        self.data_directory = data_directory
        self.xml_node = xml_node

    def run(self):
        product = self.crawler.process_product_page(self.page)
        # only download the picture if an image URL was actually found
        if product['image_url']:
            self.crawler.save_product(self.item, product['image_url'], self.data_directory)

        product_element = self.crawler.doc.createElement('product')
        product_element.setAttribute('description', product['image_description'])
        product_element.setAttribute('perishability', product['perishability'])
        product_element.setAttribute('weight', product['info_weight'])
        product_element.setAttribute('brand', product['info_brand'])
        product_element.setAttribute('country_of_origin', product['info_countryoforigin'])
        product_element.setAttribute('sale_price', product['info_sale_price'])
        product_element.setAttribute('product_code', product['info_code'])
        product_element.setAttribute('link', self.page)
        product_element.setAttribute('id', self.item)
        product_element.setAttribute('location', self.data_directory)
        self.xml_node.appendChild(product_element)
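
    # Note (not part of the original code): minidom makes no thread-safety
    # guarantees, so concurrent appendChild calls from several worker threads
    # can race on the shared document. A shared lock would be one way to
    # guard the append, e.g. with a hypothetical self.crawler.doc_lock:
    #
    #     with self.crawler.doc_lock:      # threading.Lock created by the crawler
    #         self.xml_node.appendChild(product_element)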


###################  The main crawler class  ##########################

class GermanDeliCrawler:

    def __init__(self, data_directory, max_threads = 999999, base_url = "http://www.germandeli.com/"):
        self.base_url = base_url
        self.max_threads = max_threads
        self.data_directory = data_directory
        self.doc = Document()
        root = self.doc.createElement('group')
        self.doc.appendChild(root)

        self.logger = logging.getLogger('GermanDeliCrawler')
        file_handler = logging.FileHandler(os.path.join(self.data_directory, 'crawler.log'))
        console_handler = logging.StreamHandler(sys.stdout)

        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        file_handler.setFormatter(formatter)
        console_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)
        self.logger.addHandler(console_handler)
        self.logger.setLevel(logging.INFO)

    # Extracts the structure of the Germandeli.com site and stores it in an XML file
    def extract_germandeli_structure(self):
        self.log("Extracting site structure...")
        url = urllib.request.urlopen('http://www.germandeli.com')
        content = url.read()
        url.close()

        nav_start = '<ul id="nav">'.encode()
        nav_end = '\n</div>\n<div id="bodyshell" class="clear">'.encode()

        nav_start_id = content.find(nav_start)
        nav_end_id = content.find(nav_end)

        # escape bare ampersands (without breaking existing entities) so the
        # navigation snippet can be parsed as XML
        nav_string = content[nav_start_id : nav_end_id].decode('iso-8859-1', 'ignore').replace("<BR>", "")
        nav_string = re.sub(r'&(?![a-zA-Z]+;|#)', '&amp;', nav_string)

        structure = xml.dom.minidom.parseString(nav_string.encode("utf-8"))

        self.trace_structure_node_to_xml(structure.firstChild, self.doc.firstChild)

        self.save_document()

    # Helper function for the extraction of the structure of the site
    def trace_structure_node_to_xml(self, node, xml_node):
        # skip text and comment nodes, which have no tagName
        if node.nodeType != node.ELEMENT_NODE:
            return

        if node.tagName == 'a':
            if node.firstChild.nodeValue is not None:
                xml_node.setAttribute('name', node.firstChild.nodeValue.encode('iso-8859-1').decode('utf8', 'ignore'))
                xml_node.setAttribute('link', node.getAttribute('href'))
            return

        if node.tagName == 'li':
            new_node = self.doc.createElement('page')
            xml_node.appendChild(new_node)
            xml_node.tagName = 'group'
            xml_node = new_node

        for child in node.childNodes:
            self.trace_structure_node_to_xml(child, xml_node)

    # Loads the structure of the Germandeli.com site from an XML file
    def load_structure_from_file(self):
        self.log("Loading site structure from file...")
        self.doc = xml.dom.minidom.parse(os.path.join(self.data_directory, 'structure.xml'))

    # Saves the structure XML file
    def save_document(self):
        with open(os.path.join(self.data_directory, 'structure.xml'), 'wb') as f:
            f.write(self.doc.toprettyxml('\t', '\n', 'utf-8'))

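    # For reference, a sketch of the structure.xml layout that this class
    # writes and reads back (element and attribute names are taken from the
    # code in this file; the values shown are placeholders, not real site data):
    #
    #   <group>
    #     <group name="..." link="...">
    #       <page name="..." link="..." traced="1">
    #         <product id="..." link="..." location="..." description="..."
    #                  brand="..." country_of_origin="..." weight="..."
    #                  sale_price="..." product_code="..." perishability="..."/>
    #       </page>
    #     </group>
    #   </group>
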
    # Processes a node from the XML description of the site structure
    def process_node(self, node, data_directory):
        if node.nodeType != node.ELEMENT_NODE:
            return

        # if the node is of type page we should get all products listed there
        if node.tagName == 'page':
            page_directory = os.path.join(data_directory, get_id_from_url(node.getAttribute('link')))
            ensure_directory(page_directory)
            self.process_page(node.getAttribute('link'), node, page_directory)

        # if the node is of type group, we should create a new directory and process its children
        elif node.tagName == 'group':
            group_directory = os.path.join(data_directory, get_id_from_url(node.getAttribute('link')))
            ensure_directory(group_directory)
            for child in node.childNodes:
                self.process_node(child, group_directory)

        # if the node is unknown just process its children
        else:
            for child in node.childNodes:
                self.process_node(child, data_directory)

    # Processes a page with products
    def process_page(self, page, xml_node, data_directory):
        if xml_node.getAttribute('traced') == '1':
            self.log("Skipping category:   " + page)
            return

        self.log("Processing category:   " + page)

        content = get_page_content(self.base_url + page)
        head_content = content[:content.find('<body')]

        items = re.findall(r'window\.item[0-9].+?pagingItem\("(.+?)"', head_content)

        item_number = 0
        item_count = len(items)
        for item in items:
            item_number += 1

            product_page = self.base_url + item + ".html"
            self.log("\tProcessing product (%d/%d):   %s" % (item_number, item_count, product_page))

            # start a new thread, which should download the picture and description for a single product
            thread = ProcessorThread(self, item, product_page, data_directory, xml_node)
            thread.start()

            # throttle: once max_threads products are in flight, wait for them to finish
            if item_number % self.max_threads == 0:
                self.wait_for_threads()

        # Wait for all threads to finish
        self.wait_for_threads()

        # mark the node as traced so that a restarted crawl can skip it
        xml_node.setAttribute('traced', '1')
        self.save_document()

    # Busy-waits until only the main thread is left running, printing the
    # number of active threads as progress
    def wait_for_threads(self):
        sys.stdout.write(str(threading.active_count()))
        while threading.active_count() > 1:
            time.sleep(0.5)
            sys.stdout.write(', ' + str(threading.active_count()))
            sys.stdout.flush()

        sys.stdout.write('\n')

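    # Note (not part of the original code): polling active_count() assumes the
    # crawler owns every non-main thread in the process. A join-based variant
    # that tracks the spawned threads explicitly would avoid the busy-wait:
    #
    #     def wait_for_thread_list(self, threads):    # hypothetical helper
    #         for thread in threads:
    #             thread.join()
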
    # Processes a single product page and extracts the picture and the fields
    # of the product info table
    def process_product_page(self, product_page):
        content = get_page_content(product_page)

        result = {}

        # image information (re.DOTALL lets .*? cross line breaks in the HTML)
        image_div_start_string = '<div class="item-images">'

        image_div_start = content.find(image_div_start_string)
        image_div_end = content.find('</div>', image_div_start)

        image_content = content[image_div_start : image_div_end]

        m = re.search('<img.*?src="(.*?)".*?alt="(.*?)"', image_content, re.DOTALL)

        result['image_url'] = m.group(1) if m else ''
        result['image_description'] = m.group(2) if m else ''

        # perishability
        m = re.search('<div.*?class="perishable.*?>.*?<img.*?alt="(.*?)".*?>', content, re.DOTALL)
        result['perishability'] = m.group(1) if m else ''

        # info table
        m = re.search('<table.*?id="product-info-table".*?>(.*?)</table>', content, re.DOTALL)
        info_content = m.group(1) if m else ''

        # look for brand
        m = re.search('<tr.*?class="brand".*?>.*?<td>(.*?)</td>', info_content, re.DOTALL)
        result['info_brand'] = m.group(1) if m else ''

        # look for countryoforigin
        m = re.search('<tr.*?class="countryoforigin".*?>.*?<td>(.*?)</td>', info_content, re.DOTALL)
        result['info_countryoforigin'] = m.group(1) if m else ''

        # look for code
        m = re.search('<tr.*?class="code".*?>.*?<td>(.*?)</td>', info_content, re.DOTALL)
        result['info_code'] = m.group(1) if m else ''

        # look for weight
        m = re.search('<tr.*?class="weight".*?>.*?<td>(.*?)</td>', info_content, re.DOTALL)
        result['info_weight'] = m.group(1) if m else ''

        # look for sale-price
        m = re.search('<tr.*?class="sale-price".*?>.*?<td>(.*?)</td>', info_content, re.DOTALL)
        result['info_sale_price'] = m.group(1) if m else ''

        return result

    # Saves the product information (currently just the picture)
    def save_product(self, product_id, picture_url, data_directory):
        urllib.request.urlretrieve(picture_url, os.path.join(data_directory, product_id + ".jpg"))


    # Starts the crawl from the root of the structure document
    def start(self):
        self.process_node(self.doc.firstChild, self.data_directory)

    def log(self, message):
        self.logger.info(message)

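
# A minimal usage sketch, not part of the original file. The data directory
# name and the first-run/re-run logic below are assumptions for illustration.
if __name__ == '__main__':
    data_directory = 'germandeli_data'   # hypothetical output location
    ensure_directory(data_directory)

    crawler = GermanDeliCrawler(data_directory, max_threads = 20)

    # build structure.xml on the first run; reuse it on later runs
    if os.path.exists(os.path.join(data_directory, 'structure.xml')):
        crawler.load_structure_from_file()
    else:
        crawler.extract_germandeli_structure()

    crawler.start()
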


germandeli_crawler
Author(s): Vladimir Haltakov, Dejan Pangercic