arxiv: avoid duplicated records (cross-set)
Signed-off-by: David Caro <[email protected]>
david-caro committed Jan 23, 2018
1 parent e115cbf commit cb85cc8
Showing 6 changed files with 361 additions and 275 deletions.
4 changes: 4 additions & 0 deletions hepcrawl/spiders/arxiv_spider.py
@@ -122,6 +122,10 @@ def parse_record(self, selector):
 
         return parsed_item
 
+    def get_record_identifier(self, record):
+        """Extracts a unique identifier from a sickle record."""
+        return record.header.identifier
+
     def _get_authors_or_collaboration(self, node):
         """Parse authors, affiliations; extract collaboration"""
         author_selectors = node.xpath('.//authors//author')
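The arXiv spider implements the new hook by returning the record's OAI header identifier, which arXiv keeps stable across sets; that stability is what makes it usable as a cross-set deduplication key. A minimal sketch of what the hook sees and returns (the identifier value and the SimpleNamespace stand-in for a sickle record are illustrative, not part of the commit):

# Illustrative stand-in for a sickle record, mimicking the
# record.header.identifier attribute path the new hook reads.
from types import SimpleNamespace

record = SimpleNamespace(
    header=SimpleNamespace(identifier='oai:arXiv.org:1801.12345'),
)

# Free-function equivalent of the spider's get_record_identifier()
def get_record_identifier(record):
    return record.header.identifier

assert get_record_identifier(record) == 'oai:arXiv.org:1801.12345'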
31 changes: 29 additions & 2 deletions hepcrawl/spiders/common/oaipmh_spider.py
@@ -65,6 +65,7 @@ def __init__(
         self.sets = sets
         self.from_date = from_date
         self.until_date = until_date
+        self._crawled_records = {}
 
     def start_requests(self):
         started_at = datetime.utcnow()
@@ -116,19 +117,33 @@ def start_requests(self):
                 )
             )
 
-        LOGGER.info("Harvesting completed.")
+        LOGGER.info(
+            "Harvesting completed, harvested %s records.",
+            len(self._crawled_records),
+        )
 
     @abc.abstractmethod
     def parse_record(self, record):
         """
-        This method need to be reimplemented in order to provide special
+        This method needs to be reimplemented in order to provide special
         parsing.
         Args:
             record (scrapy.selector.Selector): selector on the parsed record
         """
         raise NotImplementedError()
 
+    @abc.abstractmethod
+    def get_record_identifier(self, record):
+        """
+        This method needs to be reimplemented in order to extract a unique
+        identifier from the record to avoid cross-set reharvesting.
+        Args:
+            record (sickle.models.Record): sickle record response
+        """
+        raise NotImplementedError()
+
     def parse(self, response):
         sickle = Sickle(self.url)
         params = {
@@ -153,6 +168,18 @@ def parse(self, response):
             params,
         )
         for record in records:
+            rec_identifier = self.get_record_identifier(record)
+            if rec_identifier in self._crawled_records:
+                # avoid cross-set repeated records
+                LOGGER.info('Skipping duplicated record %s', rec_identifier)
+                continue
+
+            LOGGER.debug(
+                'Not skipping non-duplicated record %s',
+                rec_identifier,
+            )
+
+            self._crawled_records[rec_identifier] = record
             response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
             selector = Selector(response, type='xml')
             yield self.parse_record(selector)
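Taken together, parse() now consults the spider-level _crawled_records dict before parsing each record, so a record cross-listed in several arXiv sets is parsed exactly once per run. A minimal, self-contained sketch of the pattern (plain dicts stand in for sickle records; the dedup helper and the identifier values are illustrative):

# Illustrative sketch of the cross-set deduplication added above.
crawled_records = {}

def dedup(records):
    # Yield each record at most once, keyed by its OAI identifier.
    for record in records:
        rec_identifier = record['identifier']
        if rec_identifier in crawled_records:
            # Same record reached through another set: skip it.
            continue
        crawled_records[rec_identifier] = record
        yield record

physics_set = [{'identifier': 'oai:arXiv.org:1801.00001'}]
cs_set = [
    {'identifier': 'oai:arXiv.org:1801.00001'},  # cross-listed
    {'identifier': 'oai:arXiv.org:1801.00002'},
]

harvested = list(dedup(physics_set)) + list(dedup(cs_set))
assert len(harvested) == 2  # the cross-listed record appears once

Note that the commit stores the full record as the dict value; for the membership test alone, a set of identifiers would suffice and use less memory on long harvests.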
