diff --git a/scraper/code_gov/__init__.py b/scraper/code_gov/__init__.py index f88220b..4a83f4e 100644 --- a/scraper/code_gov/__init__.py +++ b/scraper/code_gov/__init__.py @@ -3,13 +3,20 @@ import json import logging +import os +import re +import tempfile import github3 import gitlab -# import stashy +import requests + +from scraper.util import execute logger = logging.getLogger(__name__) +EFFORT_REGEX = re.compile(r'Effort = ([\d\.]+) Person-months') + DOE_LAB_MAPPING = { 'AMES': 'Ames Laboratory (AMES)', 'ANL': 'Argonne National Laboratory (ANL)', @@ -113,6 +120,108 @@ def _prune_dict_null_str(dictionary): return dictionary +def git_repo_to_sloc(url): + """ + Given a Git repository URL, returns number of lines of code based on cloc + + Reference: + - cloc: https://github.com/AlDanial/cloc + + Sample cloc output: + { + "header": { + "cloc_url": "github.com/AlDanial/cloc", + "cloc_version": "1.74", + "elapsed_seconds": 0.195950984954834, + "n_files": 27, + "n_lines": 2435, + "files_per_second": 137.78956000769, + "lines_per_second": 12426.5769858787 + }, + "C++": { + "nFiles": 7, + "blank": 121, + "comment": 314, + "code": 371 + }, + "C/C++ Header": { + "nFiles": 8, + "blank": 107, + "comment": 604, + "code": 191 + }, + "CMake": { + "nFiles": 11, + "blank": 49, + "comment": 465, + "code": 165 + }, + "Markdown": { + "nFiles": 1, + "blank": 18, + "comment": 0, + "code": 30 + }, + "SUM": { + "blank": 295, + "comment": 1383, + "code": 757, + "nFiles": 27 + } + } + """ + + with tempfile.TemporaryDirectory() as tmp_dir: + logger.debug('Cloning: url=%s tmp_dir=%s', url, tmp_dir) + + tmp_clone = os.path.join(tmp_dir, 'clone-dir') + + cmd = ['git', 'clone', '--depth=1', url, tmp_clone] + execute(cmd) + + cmd = ['cloc', '--json', tmp_clone] + out, _ = execute(cmd) + + try: + cloc_json = json.loads(out[1:].replace('\\n', '').replace('\'', '')) + sloc = cloc_json['SUM']['code'] + except json.decoder.JSONDecodeError: + logger.debug('Error Decoding: url=%s, out=%s', url, out) + 
sloc = 0 + + logger.debug('SLOC: url=%s, sloc=%d', url, sloc) + + return sloc + + + def compute_labor_hours(sloc): + """ + Compute the labor hours, given a count of source lines of code + + The intention is to use the COCOMO II model to compute this value. + + References: + - http://csse.usc.edu/tools/cocomoii.php + - http://docs.python-guide.org/en/latest/scenarios/scrape/ + """ + # (40 Hours / week) * (52 weeks / year) / (12 months / year) ~= 173.33 + HOURS_PER_PERSON_MONTH = 40.0 * 52 / 12 + + cocomo_url = 'http://csse.usc.edu/tools/cocomoii.php' + page = requests.post(cocomo_url, data={'new_size': sloc}) + + try: + person_months = float(EFFORT_REGEX.search(page.text).group(1)) + except AttributeError: + # If there is no match, and .search(..) returns None + person_months = 0 + + labor_hours = person_months * HOURS_PER_PERSON_MONTH + logger.debug('sloc=%d labor_hours=%d', sloc, labor_hours) + + return labor_hours + + + class CodeGovMetadata(dict): """ Defines the entire contents of a Code.gov 's code.json file @@ -297,8 +406,9 @@ def from_github3(klass, repository, organization=None): project['permissions']['licenses'] = None project['permissions']['usageType'] = 'openSource' - # TODO: Compute from git repo - project['laborHours'] = 0 + sum_sloc = git_repo_to_sloc(project['repositoryURL']) + laborHours = compute_labor_hours(sum_sloc) + project['laborHours'] = laborHours # TODO: Compute from GitHub project['tags'] = ['github'] diff --git a/scraper/gen_code_gov_json.py b/scraper/gen_code_gov_json.py index 7d5e9e0..fcbe532 100755 --- a/scraper/gen_code_gov_json.py +++ b/scraper/gen_code_gov_json.py @@ -10,10 +10,10 @@ import github3 import stashy -import requests from scraper.code_gov import CodeGovMetadata, CodeGovProject from scraper.code_gov.doe import to_doe_csv +from scraper.github import gov_orgs logger = logging.getLogger(__name__) @@ -42,7 +42,7 @@ def _configure_logging(verbose=False): logger.addHandler(handler) -def _check_api_limits(min_requests_remaining=250, 
sleep_time=15): +def _check_api_limits(gh_session, min_requests_remaining=250, sleep_time=15): """ Simplified check for API limits @@ -52,18 +52,18 @@ def _check_api_limits(gh_session, min_requests_remaining=250, sleep_time=15): See: https://developer.github.com/v3/#rate-limiting """ - api_rates = gh.rate_limit() + api_rates = gh_session.rate_limit() api_remaining = api_rates['rate']['remaining'] api_reset = api_rates['rate']['reset'] - logger.info('Rate Limit - %d requests remaining', api_remaining) + logger.debug('Rate Limit - %d requests remaining', api_remaining) if api_remaining > min_requests_remaining: return now_time = time.time() time_to_reset = int(api_reset - now_time) - logger.info('Rate Limit - Need to sleep for %d seconds', time_to_reset) + logger.warning('Rate Limit Depleted - Sleeping for %d seconds', time_to_reset) while now_time < api_reset: time.sleep(10) @@ -83,7 +83,7 @@ def process_organization(org_name): WIGGLE_ROOM = 100 num_requests_needed = 2 * num_repos + WIGGLE_ROOM - _check_api_limits(min_requests_remaining=num_requests_needed) + _check_api_limits(gh, min_requests_remaining=num_requests_needed) logger.info('Processing GitHub Org: %s (%d public repos)', org_name, num_repos) @@ -136,23 +136,6 @@ def process_doecode(doecode_json_filename): return projects -def government_at_github(): - """ - Returns a list of US Government GitHub orgs - - Based on: https://government.github.com/community/ - """ - us_gov_github_orgs = set() - - gov_orgs = requests.get('https://government.github.com/organizations.json').json() - - us_gov_github_orgs.update(gov_orgs['governments']['U.S. Federal']) - us_gov_github_orgs.update(gov_orgs['governments']['U.S. Military and Intelligence']) - us_gov_github_orgs.update(gov_orgs['research']['U.S. 
Research Labs']) - - return list(us_gov_github_orgs) - - def main(): parser = argparse.ArgumentParser(description='Scrape code repositories for Code.gov / DOECode') @@ -205,7 +188,7 @@ def main(): logger.debug('GitHub.com Organizations: %s', github_orgs) if args.github_gov_orgs: - github_orgs.extend(government_at_github()) + github_orgs.extend(gov_orgs()) github_repos = config_json.get('github_repos', []) github_repos.extend(args.github_repos) diff --git a/scraper/github.py b/scraper/github.py new file mode 100644 index 0000000..5051f4a --- /dev/null +++ b/scraper/github.py @@ -0,0 +1,21 @@ +#! /usr/bin/env python +# -*- coding: UTF-8 -*- + +import requests + + +def gov_orgs(): + """ + Returns a list of US Government GitHub orgs + + Based on: https://government.github.com/community/ + """ + us_gov_github_orgs = set() + + gov_orgs = requests.get('https://government.github.com/organizations.json').json() + + us_gov_github_orgs.update(gov_orgs['governments']['U.S. Federal']) + us_gov_github_orgs.update(gov_orgs['governments']['U.S. Military and Intelligence']) + us_gov_github_orgs.update(gov_orgs['research']['U.S. 
Research Labs']) + + return list(us_gov_github_orgs) diff --git a/scraper/util.py b/scraper/util.py new file mode 100644 index 0000000..e1901b8 --- /dev/null +++ b/scraper/util.py @@ -0,0 +1,24 @@ +import logging +import os + +from subprocess import Popen, PIPE, STDOUT # nosec + +logger = logging.getLogger(__name__) + + +def execute(command, cwd=None): + logger.debug('Forking command: %s', command) + + if cwd is None: + cwd = os.getcwd() + elif not os.path.isdir(cwd): + raise ValueError('path does not exist: %s', cwd) + + process = Popen( + command, + cwd=cwd, + stdout=PIPE, + stderr=STDOUT, + shell=False) # nosec + out, err = process.communicate() + return str(out), str(err) diff --git a/scripts/codegov_compute_hours.py b/scripts/codegov_compute_hours.py new file mode 100755 index 0000000..e7e4eef --- /dev/null +++ b/scripts/codegov_compute_hours.py @@ -0,0 +1,29 @@ +#! /usr/bin/env python3 + +import argparse +import json + +from scraper import code_gov + + +parser = argparse.ArgumentParser(description='Scrape code repositories for Code.gov / DOECode') +parser.add_argument('filename', type=str, help='Path to locally stored `code.json` file') +args = parser.parse_args() + +code_gov_json = json.load(open(args.filename)) +releases = code_gov_json['releases'] + +repo_urls = { + release['repositoryURL'].rstrip('/') + for release in releases + if release.get('vcs', '') == 'git' +} + +for url in repo_urls: + # print(url) + + sloc = code_gov.git_repo_to_sloc(url) + # print(sloc) + + hours = code_gov.compute_labor_hours(sloc) + print('-- url=%s, sloc=%d, hours=%d' % (url, sloc, hours)) diff --git a/setup.py b/setup.py index b2adfed..e5f197a 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ 'scraper = scraper.gen_code_gov_json:main', ] }, + scripts=['scripts/codegov_compute_hours.py'], classifiers=[ 'Development Status :: 3 - Alpha', 'Intended Audience :: Developers',