Cloc cocomo #5

Merged (16 commits, Mar 23, 2018)
116 changes: 113 additions & 3 deletions scraper/code_gov/__init__.py
@@ -3,13 +3,20 @@

import json
import logging
import os
import re
import tempfile

import github3
import gitlab
# import stashy
import requests

from scraper.util import execute

logger = logging.getLogger(__name__)

EFFORT_REGEX = re.compile(r'Effort = ([\d\.]+) Person-months')

DOE_LAB_MAPPING = {
'AMES': 'Ames Laboratory (AMES)',
'ANL': 'Argonne National Laboratory (ANL)',
@@ -113,6 +120,108 @@ def _prune_dict_null_str(dictionary):
return dictionary


def git_repo_to_sloc(url):
"""
    Given a Git repository URL, return the number of source lines of code as counted by cloc

Reference:
- cloc: https://github.com/AlDanial/cloc

Sample cloc output:
{
"header": {
"cloc_url": "github.com/AlDanial/cloc",
"cloc_version": "1.74",
"elapsed_seconds": 0.195950984954834,
"n_files": 27,
"n_lines": 2435,
"files_per_second": 137.78956000769,
"lines_per_second": 12426.5769858787
},
"C++": {
"nFiles": 7,
"blank": 121,
"comment": 314,
"code": 371
},
"C/C++ Header": {
"nFiles": 8,
"blank": 107,
"comment": 604,
"code": 191
},
"CMake": {
"nFiles": 11,
"blank": 49,
"comment": 465,
"code": 165
},
"Markdown": {
"nFiles": 1,
"blank": 18,
"comment": 0,
"code": 30
},
"SUM": {
"blank": 295,
"comment": 1383,
"code": 757,
"nFiles": 27
}
}
"""

with tempfile.TemporaryDirectory() as tmp_dir:
logger.debug('Cloning: url=%s tmp_dir=%s', url, tmp_dir)

tmp_clone = os.path.join(tmp_dir, 'clone-dir')

cmd = ['git', 'clone', '--depth=1', url, tmp_clone]
execute(cmd)

cmd = ['cloc', '--json', tmp_clone]
out, _ = execute(cmd)

try:
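            # str() on cloc's bytes output yields the literal "b'...'" wrapper;
            # drop the leading "b" and strip quotes and escaped newlines before parsing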
cloc_json = json.loads(out[1:].replace('\\n', '').replace('\'', ''))
sloc = cloc_json['SUM']['code']
except json.decoder.JSONDecodeError:
logger.debug('Error Decoding: url=%s, out=%s', url, out)
sloc = 0

        logger.debug('SLOC: url=%s, sloc=%d', url, sloc)

return sloc


def compute_labor_hours(sloc):
"""
    Estimate labor hours from a count of source lines of code

    The estimate uses the COCOMO II model: the size is posted to the USC
    COCOMO II web calculator and the resulting effort figure is scraped
    from the response.

References:
- http://csse.usc.edu/tools/cocomoii.php
- http://docs.python-guide.org/en/latest/scenarios/scrape/
"""
# (40 Hours / week) * (52 weeks / year) / (12 months / year) ~= 173.33
HOURS_PER_PERSON_MONTH = 40.0 * 52 / 12

cocomo_url = 'http://csse.usc.edu/tools/cocomoii.php'
page = requests.post(cocomo_url, data={'new_size': sloc})

try:
person_months = float(EFFORT_REGEX.search(page.text).group(1))
except AttributeError:
        # No match: .search() returned None, so .group() raised AttributeError
person_months = 0

labor_hours = person_months * HOURS_PER_PERSON_MONTH
logger.debug('sloc=%d labor_hours=%d', sloc, labor_hours)

return labor_hours


class CodeGovMetadata(dict):
"""
    Defines the entire contents of a Code.gov code.json file
@@ -297,8 +406,9 @@ def from_github3(klass, repository, organization=None):
project['permissions']['licenses'] = None
project['permissions']['usageType'] = 'openSource'

-        # TODO: Compute from git repo
-        project['laborHours'] = 0
+        sum_sloc = git_repo_to_sloc(project['repositoryURL'])
+        laborHours = compute_labor_hours(sum_sloc)
+        project['laborHours'] = laborHours

# TODO: Compute from GitHub
project['tags'] = ['github']
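Side note on the model: compute_labor_hours() posts the SLOC figure to the USC COCOMO II form and scrapes the "Effort = N Person-months" value out of the HTML. For readers who want the arithmetic without the network round trip, here is a minimal offline sketch using the published COCOMO II.2000 nominal coefficients (A = 2.94, exponent E = 1.0997 with all scale factors and effort multipliers left nominal); these constants are assumptions taken from the published model, not values returned by the web form:

# Offline COCOMO II nominal-case sketch: PM = A * KSLOC**E.
# A = 2.94 and E = 1.0997 are the model's nominal constants (assumed here),
# so results will differ from whatever the USC form returns.
HOURS_PER_PERSON_MONTH = 40.0 * 52 / 12  # ~173.33, as in compute_labor_hours


def estimate_labor_hours(sloc):
    person_months = 2.94 * (sloc / 1000.0) ** 1.0997
    return person_months * HOURS_PER_PERSON_MONTH


print(estimate_labor_hours(757))  # 757 is SUM.code from the sample cloc output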
31 changes: 7 additions & 24 deletions scraper/gen_code_gov_json.py
@@ -10,10 +10,10 @@

import github3
import stashy
-import requests

from scraper.code_gov import CodeGovMetadata, CodeGovProject
from scraper.code_gov.doe import to_doe_csv
+from scraper.github import gov_orgs

logger = logging.getLogger(__name__)

@@ -42,7 +42,7 @@ def _configure_logging(verbose=False):
logger.addHandler(handler)


-def _check_api_limits(min_requests_remaining=250, sleep_time=15):
+def _check_api_limits(gh_session, min_requests_remaining=250, sleep_time=15):
"""
Simplified check for API limits

@@ -52,18 +52,18 @@

See: https://developer.github.com/v3/#rate-limiting
"""
-    api_rates = gh.rate_limit()
+    api_rates = gh_session.rate_limit()

api_remaining = api_rates['rate']['remaining']
api_reset = api_rates['rate']['reset']
-    logger.info('Rate Limit - %d requests remaining', api_remaining)
+    logger.debug('Rate Limit - %d requests remaining', api_remaining)

if api_remaining > min_requests_remaining:
return

now_time = time.time()
time_to_reset = int(api_reset - now_time)
-    logger.info('Rate Limit - Need to sleep for %d seconds', time_to_reset)
+    logger.warn('Rate Limit Depleted - Sleeping for %d seconds', time_to_reset)

while now_time < api_reset:
time.sleep(10)
@@ -83,7 +83,7 @@ def process_organization(org_name):
WIGGLE_ROOM = 100
num_requests_needed = 2 * num_repos + WIGGLE_ROOM

-    _check_api_limits(min_requests_remaining=num_requests_needed)
+    _check_api_limits(gh, min_requests_remaining=num_requests_needed)

logger.info('Processing GitHub Org: %s (%d public repos)', org_name, num_repos)

@@ -136,23 +136,6 @@ def process_doecode(doecode_json_filename):
return projects


-def government_at_github():
-    """
-    Returns a list of US Government GitHub orgs
-
-    Based on: https://government.github.com/community/
-    """
-    us_gov_github_orgs = set()
-
-    gov_orgs = requests.get('https://government.github.com/organizations.json').json()
-
-    us_gov_github_orgs.update(gov_orgs['governments']['U.S. Federal'])
-    us_gov_github_orgs.update(gov_orgs['governments']['U.S. Military and Intelligence'])
-    us_gov_github_orgs.update(gov_orgs['research']['U.S. Research Labs'])
-
-    return list(us_gov_github_orgs)


def main():
parser = argparse.ArgumentParser(description='Scrape code repositories for Code.gov / DOECode')

@@ -205,7 +188,7 @@ def main():
logger.debug('GitHub.com Organizations: %s', github_orgs)

if args.github_gov_orgs:
-        github_orgs.extend(government_at_github())
+        github_orgs.extend(gov_orgs())

github_repos = config_json.get('github_repos', [])
github_repos.extend(args.github_repos)
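For context, _check_api_limits() now receives the github3 session explicitly instead of touching a module-level gh. A sketch of how a caller wires it up (github3.login() and rate_limit() are real github3.py calls; the GITHUB_API_TOKEN environment variable name is an illustrative assumption):

import os

import github3

gh = github3.login(token=os.environ['GITHUB_API_TOKEN'])  # token var name assumed
_check_api_limits(gh, min_requests_remaining=500)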
21 changes: 21 additions & 0 deletions scraper/github.py
@@ -0,0 +1,21 @@
#! /usr/bin/env python
# -*- coding: UTF-8 -*-

import requests


def gov_orgs():
"""
Returns a list of US Government GitHub orgs

Based on: https://government.github.com/community/
"""
us_gov_github_orgs = set()

gov_orgs = requests.get('https://government.github.com/organizations.json').json()

us_gov_github_orgs.update(gov_orgs['governments']['U.S. Federal'])
us_gov_github_orgs.update(gov_orgs['governments']['U.S. Military and Intelligence'])
us_gov_github_orgs.update(gov_orgs['research']['U.S. Research Labs'])

return list(us_gov_github_orgs)
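A quick way to smoke-test the relocated helper from a REPL (the sample output is illustrative, not asserted):

from scraper.github import gov_orgs

orgs = gov_orgs()
print(len(orgs))         # count of US government GitHub org logins
print(sorted(orgs)[:5])  # first few names, e.g. ['18F', ...] (sample only)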
24 changes: 24 additions & 0 deletions scraper/util.py
@@ -0,0 +1,24 @@
import logging
import os

from subprocess import Popen, PIPE, STDOUT # nosec

logger = logging.getLogger(__name__)


def execute(command, cwd=None):
logger.debug('Forking command: %s', command)

if cwd is None:
cwd = os.getcwd()
elif not os.path.isdir(cwd):
        raise ValueError('path does not exist: %s' % cwd)

process = Popen(
command,
cwd=cwd,
stdout=PIPE,
stderr=STDOUT,
shell=False) # nosec
out, err = process.communicate()
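    # out is raw bytes (err is None because stderr is merged into stdout);
    # str() below yields the literal "b'...'" wrapper that git_repo_to_sloc strips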
return str(out), str(err)
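A possible variant that decodes stdout and surfaces the exit code (an alternative sketch, not what this PR ships):

import os
from subprocess import Popen, PIPE, STDOUT  # nosec


def execute_decoded(command, cwd=None):
    """Like execute(), but returns decoded stdout plus the process exit code."""
    process = Popen(command, cwd=cwd or os.getcwd(),
                    stdout=PIPE, stderr=STDOUT, shell=False)  # nosec
    out, _ = process.communicate()
    return out.decode('utf-8', errors='replace'), process.returncode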
29 changes: 29 additions & 0 deletions scripts/codegov_compute_hours.py
@@ -0,0 +1,29 @@
#! /usr/bin/env python3

import argparse
import json

from scraper import code_gov


parser = argparse.ArgumentParser(description='Scrape code repositories for Code.gov / DOECode')
parser.add_argument('filename', type=str, help='Path to locally stored `code.json` file')
args = parser.parse_args()

with open(args.filename) as fp:
    code_gov_json = json.load(fp)
releases = code_gov_json['releases']

repo_urls = {
release['repositoryURL'].rstrip('/')
for release in releases
if release.get('vcs', '') == 'git'
}

for url in repo_urls:
# print(url)

sloc = code_gov.git_repo_to_sloc(url)
# print(sloc)

hours = code_gov.compute_labor_hours(sloc)
print('-- url=%s, sloc=%d, hours=%d' % (url, sloc, hours))
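With the scripts= entry added to setup.py below, this file is installed onto PATH; a typical run is `codegov_compute_hours.py code.json` against a locally saved Code.gov catalog (the filename is an example).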
1 change: 1 addition & 0 deletions setup.py
@@ -22,6 +22,7 @@
'scraper = scraper.gen_code_gov_json:main',
]
},
scripts=['scripts/codegov_compute_hours.py'],
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Developers',