germandeli_crawler: start_extracting.py Source File

Go to the documentation of this file.
00001 from crawler import GermanDeliCrawler
00002 import os
00003 import sys
00004 
00005 
if __name__ == '__main__':
    # Entry point: expects the data directory as the first command-line
    # argument, prepares the crawler's site structure and starts crawling.
    cli_args = sys.argv[1:]
    if not cli_args:
        print('Data directory not specified! Usage start_extracting.py <data_directory>')
        sys.exit(1)

    data_directory = cli_args[0]

    # Only the data directory is passed; every other constructor
    # parameter of the crawler is left at its default.
    crawler = GermanDeliCrawler(data_directory)

    # The crawler works from a structure.xml file kept in its data directory.
    structure_file = os.path.join(crawler.data_directory, 'structure.xml')

    if not os.path.exists(structure_file):
        # No structure.xml yet: extract the structure of the site again
        # (which creates the structure.xml file).
        crawler.extract_germandeli_structure()
    else:
        # structure.xml already exists: load it and proceed with the crawling.
        crawler.load_structure_from_file()

    # Debug aid (disabled): dump the loaded structure document.
    #print(crawler.doc.toprettyxml('   ', '\n', 'utf-8').decode())

    # Start crawling.
    crawler.start()
00030