Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added --citations-only option. It prints all the articles that cite the queried one #83

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 42 additions & 5 deletions scholar.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@
import re
import sys
import warnings
import time

try:
# Try importing for Python 3
Expand Down Expand Up @@ -879,7 +880,7 @@ class ScholarSettings(object):

def __init__(self):
# Set up the default settings; nothing is marked as configured yet.
self.citform = 0 # Citation format, default none
# NOTE(review): this text is a diff view, so the next two lines look like
# the old (None) and new (10) default for results per page; read as
# plain code, the later assignment (10) is the one that takes effect.
self.per_page_results = None
self.per_page_results = 10
# Starts False; set_per_page_results() flips it to True.
self._is_configured = False

def set_citation_format(self, citform):
Expand All @@ -893,8 +894,7 @@ def set_citation_format(self, citform):
def set_per_page_results(self, per_page_results):
# Store the requested number of results per page.
# The value first goes through ScholarUtils.ensure_int (defined
# elsewhere; presumably it validates/converts to int and uses the given
# message on failure -- confirm against its definition).
self.per_page_results = ScholarUtils.ensure_int(
per_page_results, 'page results must be integer')
# Cap the value at ScholarConf.MAX_PAGE_RESULTS.
# NOTE(review): the cap appears twice (wrapped and single-line form),
# which looks like the old and new side of a diff; applying it twice
# gives the same result as applying it once.
self.per_page_results = min(
self.per_page_results, ScholarConf.MAX_PAGE_RESULTS)
self.per_page_results = min(self.per_page_results, ScholarConf.MAX_PAGE_RESULTS)
# Record that a setting was explicitly provided.
self._is_configured = True

def is_configured(self):
Expand Down Expand Up @@ -1026,6 +1026,38 @@ def send_query(self, query):

self.parse(html)

def get_citations(self, query, delay=1):
    """
    Retrieve the articles that cite the first article matching a query.

    This works in two steps: the query is sent as usual, then the
    citations URL of its first result is fetched page by page. Every
    page is handed to self.parse(), so the citing articles accumulate
    in self.articles.

    query -- the query object passed on to self.send_query().
    delay -- seconds to wait between successive page requests
             (defaults to 1, the pause that used to be hard-coded).

    Returns None. The method returns early, leaving self.articles with
    whatever was collected so far, when the query yields no article,
    the first article has no citations URL, a page request fails, or a
    page adds no new articles.
    """
    self.send_query(query)

    if not self.articles:
        return
    first = self.articles[0]
    citations_url = first['url_citations']
    if citations_url is None:
        return
    # A missing count is treated as zero, i.e. only the first page is read.
    citations_num = first['num_citations'] or 0
    self.clear_articles()

    fetched = 0  # number of articles collected before the current request
    while True:
        # The first page is the bare citations URL; later pages carry the
        # offset of the next article to fetch.
        url = citations_url
        if fetched:
            url += '&start=' + str(fetched)

        html = self._get_http_response(url=url,
                                       log_msg='dump of query response HTML',
                                       err_msg='results retrieval failed')
        if html is None:
            return
        self.parse(html)

        collected = len(self.articles)
        if collected >= citations_num:
            return
        if collected <= fetched:
            # The page contributed nothing (the advertised count can be
            # larger than what is actually retrievable). Asking again
            # for the same offset would loop forever, so stop here.
            return
        fetched = collected
        # Pause between requests so the pages are not fetched back to back.
        time.sleep(delay)

def get_citation_data(self, article):
"""
Given an article, retrieves citation link. Note, this requires that
Expand All @@ -1043,7 +1075,6 @@ def get_citation_data(self, article):
err_msg='requesting citation data failed')
if data is None:
return False

article.set_citation_data(data)
return True

Expand Down Expand Up @@ -1187,6 +1218,8 @@ def main():
help='Do not include patents in results')
group.add_option('--no-citations', action='store_true', default=False,
help='Do not include citations in results')
group.add_option('--citations-only', action='store_true', default=False,
help='Prints only the citations list in results')
group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None,
help='Do not search, just use articles in given cluster ID')
group.add_option('-c', '--count', type='int', default=None,
Expand Down Expand Up @@ -1290,7 +1323,11 @@ def main():
options.count = min(options.count, ScholarConf.MAX_PAGE_RESULTS)
query.set_num_page_results(options.count)

querier.send_query(query)

if options.citations_only:
querier.get_citations(query)
else:
querier.send_query(query)

if options.csv:
csv(querier)
Expand Down