import urllib.request
import xml.dom.minidom
import re
import os
import logging
import sys
import time
from xml.dom.minidom import Document
from threading import Thread
import threading


def get_page_content(page):
    # Fetch a page and decode it, dropping characters outside ISO-8859-1.
    url = urllib.request.urlopen(page)
    raw_content = url.read()
    url.close()
    return raw_content.decode('iso-8859-1', 'ignore')


def ensure_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)


def get_id_from_url(page):
    # 'categories/Wurst.html' -> 'Wurst'
    return page[page.rfind('/') + 1:page.rfind('.')]


class ProcessorThread(Thread):
    """Fetches one product page and appends a <product> element with the result."""

    def __init__(self, crawler, item, page, data_directory, xml_node):
        Thread.__init__(self)
        self.crawler = crawler
        self.item = item
        self.page = page
        self.data_directory = data_directory
        self.xml_node = xml_node

    def run(self):
        product = self.crawler.process_product_page(self.page)
        if product['image_url']:
            self.crawler.save_product(self.item, product['image_url'], self.data_directory)

        product_element = self.crawler.doc.createElement('product')
        product_element.setAttribute('description', product['image_description'])
        product_element.setAttribute('perishability', product['perishability'])
        product_element.setAttribute('weight', product['info_weight'])
        product_element.setAttribute('brand', product['info_brand'])
        product_element.setAttribute('country_of_origin', product['info_countryoforigin'])
        product_element.setAttribute('sale_price', product['info_sale_price'])
        product_element.setAttribute('product_code', product['info_code'])
        product_element.setAttribute('link', self.page)
        product_element.setAttribute('id', self.item)
        product_element.setAttribute('location', self.data_directory)
        self.xml_node.appendChild(product_element)


class GermanDeliCrawler:
    """Walks the germandeli.com category tree, saving product data and images."""

    def __init__(self, data_directory, max_threads=999999, base_url="http://www.germandeli.com/"):
        self.base_url = base_url
        self.max_threads = max_threads
        self.data_directory = data_directory
        ensure_directory(self.data_directory)

        self.doc = Document()
        root = self.doc.createElement('group')
        self.doc.appendChild(root)

        self.logger = logging.getLogger('GermanDeliCrawler')
        file_handler = logging.FileHandler(os.path.join(self.data_directory, 'crawler.log'))
        console_handler = logging.StreamHandler(sys.stdout)

        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        file_handler.setFormatter(formatter)
        console_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)
        self.logger.addHandler(console_handler)
        self.logger.setLevel(logging.INFO)

    def extract_germandeli_structure(self):
        """Parse the site's navigation menu into the structure document."""
        self.log("Extracting site structure...")
        url = urllib.request.urlopen('http://www.germandeli.com')
        content = url.read()
        url.close()

        nav_start = '<ul id="nav">'.encode()
        nav_end = '\n</div>\n<div id="bodyshell" class="clear">'.encode()

        nav_start_id = content.find(nav_start)
        nav_end_id = content.find(nav_end)

        # Escape bare ampersands and drop <BR> tags so the fragment parses as XML.
        nav_string = content[nav_start_id:nav_end_id].decode('iso-8859-1', 'ignore').replace("&", "&amp;").replace("<BR>", "")

        structure = xml.dom.minidom.parseString(nav_string.encode("utf-8"))

        self.trace_structure_node_to_xml(structure.firstChild, self.doc.firstChild)

        self.save_document()

    def trace_structure_node_to_xml(self, node, xml_node):
        """Recursively convert the <ul>/<li>/<a> menu into <group>/<page> elements."""
        if node.nodeType != node.ELEMENT_NODE:
            return

        if node.tagName == 'a':
            if node.firstChild is not None and node.firstChild.nodeValue is not None:
                # Undo the Latin-1 round trip for names that are really UTF-8.
                xml_node.setAttribute('name', node.firstChild.nodeValue.encode('iso-8859-1').decode('utf8', 'ignore'))
            xml_node.setAttribute('link', node.getAttribute('href'))
            return

        if node.tagName == 'li':
            # An <li> that nests further lists is a category group, not a leaf page.
            new_node = self.doc.createElement('page')
            xml_node.appendChild(new_node)
            xml_node.tagName = 'group'
            xml_node = new_node

        for child in node.childNodes:
            self.trace_structure_node_to_xml(child, xml_node)

    def load_structure_from_file(self):
        self.log("Loading site structure from file...")
        self.doc = xml.dom.minidom.parse(os.path.join(self.data_directory, 'structure.xml'))

    def save_document(self):
        with open(os.path.join(self.data_directory, 'structure.xml'), 'wb') as f:
            f.write(self.doc.toprettyxml('\t', '\n', 'utf-8'))

    def process_node(self, node, data_directory):
        """Crawl <page> leaves, recurse into <group> nodes, skip everything else."""
        if node.nodeType != node.ELEMENT_NODE:
            return

        if node.tagName == 'page':
            page_directory = os.path.join(data_directory, get_id_from_url(node.getAttribute('link')))
            ensure_directory(page_directory)
            self.process_page(node.getAttribute('link'), node, page_directory)

        elif node.tagName == 'group':
            group_directory = os.path.join(data_directory, get_id_from_url(node.getAttribute('link')))
            ensure_directory(group_directory)
            for child in node.childNodes:
                self.process_node(child, group_directory)

        else:
            for child in node.childNodes:
                self.process_node(child, data_directory)

    def process_page(self, page, xml_node, data_directory):
        # Categories already marked 'traced' were finished in an earlier run.
        if xml_node.getAttribute('traced') == '1':
            self.log("Skipping category: " + page)
            return

        self.log("Processing category: " + page)

        content = get_page_content(self.base_url + page)
        head_content = content[:content.find('<body')]

        # Product IDs are listed in the page header as pagingItem("...") calls.
        items = re.findall(r'window\.item[0-9].+?pagingItem\("(.+?)"', head_content)

        item_number = 0
        item_count = len(items)
        for item in items:
            item_number += 1

            product_page = self.base_url + item + ".html"
            self.log("\tProcessing product (%d/%d): %s" % (item_number, item_count, product_page))

            thread = ProcessorThread(self, item, product_page, data_directory, xml_node)
            thread.start()

            # Throttle: let the current batch finish before spawning more threads.
            if item_number % self.max_threads == 0:
                self.wait_for_threads()

        self.wait_for_threads()

        # Mark the category as done so an interrupted crawl can resume.
        xml_node.setAttribute('traced', '1')
        self.save_document()

    def wait_for_threads(self):
        """Block until only the main thread remains, printing the live thread count."""
        sys.stdout.write(str(threading.active_count()))
        while threading.active_count() > 1:
            time.sleep(0.5)
            sys.stdout.write(', ' + str(threading.active_count()))
            sys.stdout.flush()

        sys.stdout.write('\n')

    def process_product_page(self, product_page):
        """Scrape image, perishability and product-info fields from a product page."""
        content = get_page_content(product_page)

        result = {}

        # Product image and its alt-text description.
        image_div_start = content.find('<div class="item-images">')
        image_div_end = content.find('</div>', image_div_start)
        image_content = content[image_div_start:image_div_end]

        m = re.search(r'<img.*?src="(.*?)".*?alt="(.*?)"', image_content, re.S)
        result['image_url'] = m.group(1) if m else ''
        result['image_description'] = m.group(2) if m else ''

        m = re.search(r'<div.*?class="perishable.*?>.*?<img.*?alt="(.*?)".*?>', content, re.S)
        result['perishability'] = m.group(1) if m else ''

        # The remaining fields live in rows of the product-info table.
        m = re.search(r'<table.*?id="product-info-table".*?>(.*?)</table>', content, re.S)
        info_content = m.group(1) if m else ''

        m = re.search(r'<tr.*?class="brand".*?>.*?<td>(.*?)</td>', info_content, re.S)
        result['info_brand'] = m.group(1) if m else ''

        m = re.search(r'<tr.*?class="countryoforigin".*?>.*?<td>(.*?)</td>', info_content, re.S)
        result['info_countryoforigin'] = m.group(1) if m else ''

        m = re.search(r'<tr.*?class="code".*?>.*?<td>(.*?)</td>', info_content, re.S)
        result['info_code'] = m.group(1) if m else ''

        m = re.search(r'<tr.*?class="weight".*?>.*?<td>(.*?)</td>', info_content, re.S)
        result['info_weight'] = m.group(1) if m else ''

        m = re.search(r'<tr.*?class="sale-price".*?>.*?<td>(.*?)</td>', info_content, re.S)
        result['info_sale_price'] = m.group(1) if m else ''

        return result

    def save_product(self, product_id, picture_url, data_directory):
        # Store the product image as <data_directory>/<product_id>.jpg.
        urllib.request.urlretrieve(picture_url, os.path.join(data_directory, product_id + ".jpg"))

    def start(self):
        self.process_node(self.doc.firstChild, self.data_directory)

    def log(self, message):
        self.logger.info(message)
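

# A minimal usage sketch, not part of the original script: 'crawl_data' is a
# hypothetical output directory and max_threads=10 an arbitrary batch size.
# A first run builds structure.xml and crawls every category; a resumed run
# could call load_structure_from_file() instead of extract_germandeli_structure().
if __name__ == '__main__':
    crawler = GermanDeliCrawler('crawl_data', max_threads=10)
    crawler.extract_germandeli_structure()
    crawler.start()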