Resolves #106 - updated scrapy req after successful test run. #109

Open: wants to merge 2 commits into master
70 changes: 35 additions & 35 deletions dynamic_scraper/models.py
@@ -7,7 +7,7 @@
import datetime
from django.db import models
from django.db.models import Q

+from django.utils import timezone

@python_2_unicode_compatible
class ScrapedObjClass(models.Model):
@@ -25,7 +25,7 @@ class ScrapedObjClass(models.Model):
"ZERO_ACTIONS_FACTOR_CHANGE": 5,\n\
"FACTOR_CHANGE_FACTOR": 1.3,\n')
comments = models.TextField(blank=True)

def __str__(self):
return self.name

@@ -50,10 +50,10 @@ class ScrapedObjAttr(models.Model):
attr_type = models.CharField(max_length=1, choices=ATTR_TYPE_CHOICES)
id_field = models.BooleanField(default=False)
save_to_db = models.BooleanField(default=True)

def __str__(self):
return self.name + " (" + str(self.obj_class) + ")"

class Meta(object):
ordering = ['order',]

@@ -112,28 +112,28 @@ class Scraper(models.Model):
pagination_type = models.CharField(max_length=1, choices=PAGINATION_TYPE, default='N')
pagination_on_start = models.BooleanField(default=False)
pagination_append_str = models.CharField(max_length=200, blank=True, help_text="Syntax: /somepartofurl/{page}/moreurlstuff.html")
pagination_page_replace = models.TextField(blank=True,
help_text="RANGE_FUNCT: uses Python range funct., syntax: [start], stop[, step], FREE_LIST: 'Replace text 1', 'Some other text 2', 'Maybe a number 3', ...")
help_text = "Optional, follow links from a single non-paginated or all statically paginated (RANGE_FUNCT, FREE_LIST) main pages"
follow_pages_url_xpath = models.TextField(blank=True, help_text=help_text)
help_text = "Optional additional XPath for the page number, can be used in {follow_page} placeholder."
follow_pages_page_xpath = models.TextField(blank=True, help_text=help_text)
help_text = "Optionally limit number of pages to follow (default: follow until XPath fails)"
num_pages_follow = models.IntegerField(blank=True, null=True, help_text=help_text)
last_scraper_save_alert_period = models.CharField(max_length=5, blank=True,
help_text="Optional, used for scraper monitoring with 'check_last_scraper_saves' management cmd, \
syntax: [HOURS]h or [DAYS]d or [WEEKS]w (e.g. '6h', '5d', '2w')")
next_last_scraper_save_alert = models.DateTimeField(default=datetime.datetime.now,
help_text="Next time the last scraper save will be alerted, normally set on management cmd run.",)
last_checker_delete_alert_period = models.CharField(max_length=5, blank=True,
help_text="Optional, used for scraper monitoring with 'check_last_checker_deletes' management cmd, \
syntax: [HOURS]h or [DAYS]d or [WEEKS]w (e.g. '6h', '5d', '2w')")
next_last_checker_delete_alert = models.DateTimeField(default=datetime.datetime.now,
help_text="Next time the last checker delete will be alerted, normally set on management cmd run.",)
comments = models.TextField(blank=True)
last_scraper_save = models.DateTimeField(null=True, blank=True)
last_checker_delete = models.DateTimeField(null=True, blank=True)

def get_alert_period_timedelta(self, attribute_str):
if getattr(self, attribute_str) and len(getattr(self, attribute_str)) >= 2:
period_str = getattr(self, attribute_str)[-1]
@@ -153,16 +153,16 @@ def get_alert_period_timedelta(self, attribute_str):
return None
else:
return None

def get_last_scraper_save_alert_period_timedelta(self):
return self.get_alert_period_timedelta('last_scraper_save_alert_period')

def get_last_checker_delete_alert_period_timedelta(self):
return self.get_alert_period_timedelta('last_checker_delete_alert_period')

def get_main_page_rpt(self):
return self.requestpagetype_set.get(page_type='MP')

def get_follow_page_rpts(self):
return self.requestpagetype_set.filter(page_type='FP')

@@ -177,16 +177,16 @@ def get_rpt_for_scraped_obj_attr(self, soa):

def get_base_elems(self):
return self.scraperelem_set.filter(scraped_obj_attr__attr_type='B')

def get_base_elem(self):
return self.scraperelem_set.get(scraped_obj_attr__attr_type='B')

def get_detail_page_url_elems(self):
return self.scraperelem_set.filter(scraped_obj_attr__attr_type='U')

def get_detail_page_url_id_elems(self):
return self.scraperelem_set.filter(scraped_obj_attr__attr_type='U', scraped_obj_attr__id_field=True)

def get_standard_elems(self):
q1 = Q(scraped_obj_attr__attr_type='S')
q2 = Q(scraped_obj_attr__attr_type='T')
@@ -204,33 +204,33 @@ def get_standard_update_elems(self):

def get_standard_update_elems_from_detail_pages(self):
return self.scraperelem_set.filter(scraped_obj_attr__attr_type='T').filter(~Q(request_page_type='MP'))

def get_image_elems(self):
return self.scraperelem_set.filter(scraped_obj_attr__attr_type='I')

def get_image_elem(self):
return self.scraperelem_set.get(scraped_obj_attr__attr_type='I')

def get_scrape_elems(self):
q1 = Q(scraped_obj_attr__attr_type='S')
q2 = Q(scraped_obj_attr__attr_type='T')
q3 = Q(scraped_obj_attr__attr_type='U')
q4 = Q(scraped_obj_attr__attr_type='I')
return self.scraperelem_set.filter(q1 | q2 | q3 | q4)

def get_mandatory_scrape_elems(self):
q1 = Q(scraped_obj_attr__attr_type='S')
q2 = Q(scraped_obj_attr__attr_type='T')
q3 = Q(scraped_obj_attr__attr_type='U')
q4 = Q(scraped_obj_attr__attr_type='I')
return self.scraperelem_set.filter(q1 | q2 | q3 | q4).filter(mandatory=True)

def get_from_detail_pages_scrape_elems(self):
return self.scraperelem_set.filter(~Q(request_page_type='MP'))

def __str__(self):
return self.name + " (" + self.scraped_obj_class.name + ")"

class Meta(object):
ordering = ['name', 'scraped_obj_class',]

@@ -287,17 +287,17 @@ class Checker(models.Model):
checker_x_path_result = models.TextField(blank=True)
checker_ref_url = models.URLField(max_length=500, blank=True)
comments = models.TextField(blank=True)

def __str__(self):
return str(self.scraped_obj_attr) + ' > ' + self.get_checker_type_display()


@python_2_unicode_compatible
class ScraperElem(models.Model):
REQUEST_PAGE_TYPE_CHOICES = tuple([("MP", "Main Page")] + [("DP{n}".format(n=str(n)), "Detail Page {n}".format(n=str(n))) for n in list(range(1, 26))])
help_text = "The different attributes to be scraped, exactly one attribute of type BASE necessary."
scraped_obj_attr = models.ForeignKey(ScrapedObjAttr, help_text=help_text)
scraper = models.ForeignKey(Scraper)
x_path = models.TextField(blank=True, help_text='XPath or JSONPath expression, leave blank on "static" processor use.')
reg_exp = models.TextField(blank=True, help_text="Optional filtering by regular expression (e.g. 'Scrape only (.*) the text in between').")
help_text = "Corresponding Request Page Types created for this scraper."
@@ -310,16 +310,16 @@ class ScraperElem(models.Model):
proc_ctxt = models.TextField(blank=True, help_text=help_text)
help_text = "Drop item if attribute could not be scraped."
mandatory = models.BooleanField(default=True, help_text=help_text)

def __str__(self):
return '{s} > {soa} Attribute ({rpt})'.format(
s=str(self.scraper),
soa=self.scraped_obj_attr.name,
rpt=self.get_request_page_type_display())

class Meta(object):
ordering = ['scraped_obj_attr__order',]



@python_2_unicode_compatible
@@ -329,13 +329,13 @@ class SchedulerRuntime(models.Model):
('C', 'CHECKER'),
)
runtime_type = models.CharField(max_length=1, choices=TYPE, default='P')
-next_action_time = models.DateTimeField(default=datetime.datetime.now)
+next_action_time = models.DateTimeField(default=timezone.now())
next_action_factor = models.FloatField(blank=True, null=True)
num_zero_actions = models.IntegerField(default=0)

def __str__(self):
return str(self.id)

class Meta(object):
ordering = ['next_action_time',]

@@ -347,7 +347,7 @@ class LogMarker(models.Model):
('IM', 'Important'),
('IG', 'Ignore'),
('MI', 'Miscellaneous'),
('CU', 'Custom'),
)
message_contains = models.CharField(max_length=255)
help_text = "Use the string format from the log messages"
@@ -374,14 +374,14 @@ class Log(models.Model):
spider_name = models.CharField(max_length=200)
scraper = models.ForeignKey(Scraper, blank=True, null=True)
date = models.DateTimeField(default=datetime.datetime.now)

@staticmethod
def numeric_level(level):
numeric_level = 0
for choice in Log.LEVEL_CHOICES:
if choice[1] == level:
numeric_level = choice[0]
return numeric_level

class Meta(object):
ordering = ['-date']
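
For context on the next_action_time default changed above: Django evaluates a callable default each time a row is created, whereas a call expression such as timezone.now() runs once, when the class body is executed, so every new row would share that single import-time timestamp. A minimal sketch of the three variants (model and field names are hypothetical):

import datetime

from django.db import models
from django.utils import timezone

class ExampleRuntime(models.Model):
    # Callable reference: evaluated per instance, returns an aware datetime.
    created_aware = models.DateTimeField(default=timezone.now)
    # Call result: evaluated once at class-definition time; every row
    # created afterwards inherits this same frozen timestamp.
    created_frozen = models.DateTimeField(default=timezone.now())
    # Naive callable: evaluated per instance, but returns a naive datetime,
    # which Django warns about when USE_TZ = True.
    created_naive = models.DateTimeField(default=datetime.datetime.now)
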
3 changes: 1 addition & 2 deletions requirements.txt
@@ -1,5 +1,5 @@
Django>=1.8,<1.12
-Scrapy>=1.4,<1.5
+Scrapy>=1.5,<1.6
scrapy-djangoitem>=1.1.1,<1.2
scrapy-splash>=0.7,<0.8
scrapyd>=1.2,<1.3
@@ -9,4 +9,3 @@ Celery==3.1.25
django-celery==3.2.1
future>=0.15,<0.16
-pillow>=3.0,<4.0
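
A quick way to confirm that a local environment satisfies the updated Scrapy pin, as a sketch using pkg_resources from setuptools (assumes Scrapy is installed):

import pkg_resources

# Parse the new requirement range from requirements.txt.
requirement = pkg_resources.Requirement.parse("Scrapy>=1.5,<1.6")
installed = pkg_resources.get_distribution("Scrapy")

# Requirement.__contains__ accepts a plain version string.
print(installed.version, installed.version in requirement)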