diff --git a/dynamic_scraper/utils/task_utils.py b/dynamic_scraper/utils/task_utils.py
index b32a1a72..b534b85a 100644
--- a/dynamic_scraper/utils/task_utils.py
+++ b/dynamic_scraper/utils/task_utils.py
@@ -1,5 +1,8 @@
 import datetime, json
 import urllib, urllib2, httplib
+from multiprocessing import Process
+from scrapy import log
+from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
 settings = get_project_settings()
 from dynamic_scraper.models import Scraper
@@ -73,4 +76,54 @@
     def run_checker_tests(self):
         scraper_list = Scraper.objects.filter(checker_x_path__isnull=False, checker_x_path_result__isnull=False, checker_x_path_ref_url__isnull=False)
         for scraper in scraper_list:
-            self._run_spider(id=scraper.id, spider='checker_test', run_type='TASK', do_action='yes')
\ No newline at end of file
+            self._run_spider(id=scraper.id, spider='checker_test', run_type='TASK', do_action='yes')
+
+
+class ProcessBasedUtils(TaskUtils):
+
+    # settings are defined in the manage.py file
+    # set the SCRAPY_SETTINGS_MODULE path in manage.py
+    # Ex:
+    # os.environ.setdefault("DJANGO_SETTINGS_MODULE", "scrapy_test.settings.dev")
+    # os.environ.setdefault("SCRAPY_SETTINGS_MODULE", "scrapy_test.apps.web_scraper.settings") <-- IMPORTANT
+
+    # how to get settings: http://stackoverflow.com/questions/15564844/locally-run-all-of-the-spiders-in-scrapy
+
+    def _run_crawl_process(self, **kwargs):
+        # region How to run a crawler in-process
+        # examples on how to get this stuff:
+        # http://stackoverflow.com/questions/14777910/scrapy-crawl-from-script-always-blocks-script-execution-after-scraping?lq=1
+        # http://stackoverflow.com/questions/13437402/how-to-run-scrapy-from-within-a-python-script
+        # http://stackoverflow.com/questions/7993680/running-scrapy-tasks-in-python
+        # http://stackoverflow.com/questions/15564844/locally-run-all-of-the-spiders-in-scrapy
+        # https://groups.google.com/forum/#!topic/scrapy-users/d4axj6nPVDw
+        # endregion
+
+        crawler = CrawlerProcess(settings)
+        crawler.install()
+        crawler.configure()
+        spider = crawler.spiders.create(kwargs['spider'], **kwargs)
+        crawler.crawl(spider)
+
+        log.start()
+        log.msg('Spider started...')
+        crawler.start()
+        log.msg('Spider stopped.')
+        crawler.stop()
+
+    def _run_spider(self, **kwargs):
+        param_dict = {
+            'project': 'default',
+            'spider': kwargs['spider'],
+            'id': kwargs['id'],
+            'run_type': kwargs['run_type'],
+            'do_action': kwargs['do_action']
+        }
+
+        p = Process(target=self._run_crawl_process, kwargs=param_dict)
+        p.start()
+        p.join()
+
+    def _pending_jobs(self, spider):
+        # don't worry about scheduling new jobs if there are still pending jobs for same spider
+        return False
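
Note on the approach: `ProcessBasedUtils._run_spider` overrides the HTTP/scrapyd-based `_run_spider` in `TaskUtils` and instead launches each crawl in its own `multiprocessing.Process`. The Twisted reactor is then started and torn down inside the child process, so the parent (e.g. a Django or Celery worker) never has to restart a reactor, which Twisted does not support within a single process. Below is a minimal usage sketch under those assumptions; the helper name `run_single_checker_test` is hypothetical and only mirrors the keyword arguments `run_checker_tests()` already passes in the patch.

```python
# Hypothetical usage sketch -- not part of the patch.
# Assumes the Django and Scrapy settings modules are exported before this
# code runs, e.g. in manage.py as the ProcessBasedUtils comments describe:
#   os.environ.setdefault("DJANGO_SETTINGS_MODULE", "scrapy_test.settings.dev")
#   os.environ.setdefault("SCRAPY_SETTINGS_MODULE", "scrapy_test.apps.web_scraper.settings")
from dynamic_scraper.utils.task_utils import ProcessBasedUtils


def run_single_checker_test(scraper_id):
    """Run one checker_test spider in an isolated child process."""
    utils = ProcessBasedUtils()
    # Same keyword arguments TaskUtils.run_checker_tests() passes per scraper.
    # _run_spider() blocks (p.join()) until the child process and its Twisted
    # reactor have exited, so it can be called repeatedly from one parent.
    utils._run_spider(id=scraper_id, spider='checker_test',
                      run_type='TASK', do_action='yes')
```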