From a0ad4a82a66571d30897d0374094d1a384019d7c Mon Sep 17 00:00:00 2001 From: Matt Fullerton Date: Tue, 21 Nov 2017 09:40:15 +0100 Subject: [PATCH] Experimental code to test #28 --- harvesters/eiti/scripts/extract_summary.py | 207 ++++++++++++++++++--- 1 file changed, 178 insertions(+), 29 deletions(-) diff --git a/harvesters/eiti/scripts/extract_summary.py b/harvesters/eiti/scripts/extract_summary.py index 8e8fa1a..9935392 100644 --- a/harvesters/eiti/scripts/extract_summary.py +++ b/harvesters/eiti/scripts/extract_summary.py @@ -3,6 +3,7 @@ import json import requests import unicodedata +import unicodecsv as csv API_ENDPOINT = "https://eiti.org/api/v1.0/" @@ -51,17 +52,21 @@ def writeCsv(name, company_or_govt, data): for l in data: f.write(l.encode('utf-8') + '\n') +#From Python 3.5, https://stackoverflow.com/questions/5595425/what-is-the-best-way-to-compare-floats-for-almost-equality-in-python#33024979 +def isclose(a, b, rel_tol=1e-09, abs_tol=0.0): + return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) -def write(meta, data, company_or_govt): +def write(meta, data, company_or_govt_or_combined): countryName = meta['country']['label'] sanitizedCountryName = sanitizeCountryName(countryName) - writeCsv(sanitizedCountryName, company_or_govt, data) + writeCsv(sanitizedCountryName, company_or_govt_or_combined, data) dataset_title = "EITI Summary data table for %s" % countryName dataset_name = "eiti-summary-data-table-for-%s" % sanitizedCountryName resource_title_company = "Company payments - %s" % countryName resource_title_government = "Revenues received by government agencies - %s" % countryName + resource_title_combined = "Matched transactions - %s" % countryName if not (dataset_name in tracking): dataset = { @@ -78,8 +83,10 @@ def write(meta, data, company_or_govt): "category": ["Precept 2: Accountability and Transparency"], "filename_company": './out/company/%s-company.csv' % sanitizedCountryName, "filename_government": 
'./out/government/%s-government.csv' % sanitizedCountryName, + "filename_combined": './out/combined/%s-combined.csv' % sanitizedCountryName, "resource_title_company": resource_title_company, - "resource_title_government": resource_title_government + "resource_title_government": resource_title_government, + "resource_title_combined": resource_title_combined } datasets[dataset_name] = dataset tracking.append(dataset_name) @@ -87,9 +94,9 @@ def write(meta, data, company_or_govt): else: yearsofar = datasets[dataset_name]['year'] ysf = set(yearsofar) - theyear = meta['label'][-4:] - ysf.add(theyear) - allyears.add(theyear) + myyear = meta['label'][-4:] + ysf.add(myyear) + allyears.add(myyear) yearsofar = list(ysf) yearsofar.sort() datasets[dataset_name]['year'] = yearsofar @@ -100,16 +107,22 @@ def sanitizeCountryName(countryName): return re.sub('[^a-z]', '-', asciiCountryName) def getSummaryData(): + offline = True page = 1 done = False data = [] - - while (done is False): - d = requests.get(API_ENDPOINT + 'summary_data?page=%s' % page).json()['data'] - if len(d) == 0: - done = True - data.extend(d) - page += 1 + if (offline): + with open ("rawdata.json", "rb") as jsonfile: + data = json.loads(jsonfile.read()) + else: + while (done is False): + d = requests.get(API_ENDPOINT + 'summary_data?page=%s' % page).json()['data'] + if len(d) == 0: + done = True + data.extend(d) + page += 1 + #with open ("rawdata.json", "wb") as jsonfile: + # jsonfile.write(json.dumps(data)) return data @@ -131,15 +144,27 @@ def getLineForRevenue(d, company, company_or_govt): cid = company['organisation_id'] companyurl = API_ENDPOINT + 'organisation/' + cid + if cid not in organisations: - j = requests.get(companyurl).json() - organisations[cid] = j['data'][0] + try: + j = requests.get(companyurl).json() + organisations[cid] = j['data'][0] + except: + print "Error: " + companyurl + " failed" + organisations[cid] = {} + organisations[cid]['label'] = 'Unknown' rid = company['id'] revurl =
API_ENDPOINT + 'revenue/' + rid if rid not in revenues: - j = requests.get(revurl).json() - revenues[rid] = j['data'][0] + try: + j = requests.get(revurl).json() + revenues[rid] = j['data'][0] + except: + print "Error: " + revurl + " failed" + revenues[rid] = {} + revenues[rid]['label'] = 'Unknown' + gfscode = gfs[gid]['code'] gfsdesc = gfs[gid]['label'] @@ -157,11 +182,20 @@ def getLineForRevenue(d, company, company_or_govt): valreportedusd = company['revenue'] + #Weird bug in API that we probably shouldn't completely ignore; may not happen in online version + if valreported == '0' or valreportedusd == '0' or valreported == None or valreportedusd == None: + valreported = '0' + valreportedusd = '0' + stream_name = revenues[rid]['label'] currency_code = company['original_currency'] - currency_rate = d['country']['metadata'][year]['currency_rate'] + if type(d['country']['metadata']) == dict: + currency_rate = d['country']['metadata'][year]['currency_rate'] + else: + print "Error: Metadata is missing,can't get currency rate" + currency_rate = "Unknown" #Split files https://github.com/NRGI/resourcedata.org/issues/13 returnstring = ( @@ -211,6 +245,25 @@ def getLineForRevenue(d, company, company_or_govt): year = d['label'][-4:] + disaggregated = {} + + if type(d['country']['metadata']) == dict: + for theyear, report in dict.items(d['country']['metadata']): + if report['disaggregated']['revenue_stream'] == "1" and report['disaggregated']['company'] == "1": + disaggregated[theyear] = True + #elif report['disaggregated']['revenue_stream'] == "1": + # print "Error: rev stream is disag but company not" + # exit() + #elif report['disaggregated']['company'] == "1": + # print "Error: company is disag but rev stream not" + # exit() + else: + disaggregated[theyear] = False + else: + print "Error: Metadata is missing, assuming not disaggregated" + for y in range(1990, 2020): + disaggregated[str(y)] = False + if (d['revenue_company'] or d['revenue_government']): print "%s/%s %s %s" 
% (i, total_len, country, year) @@ -219,25 +272,121 @@ def getLineForRevenue(d, company, company_or_govt): revcompany = [] if 'revenue_government' in d and not (d['revenue_government'] is None): - revgovt.extend(d['revenue_government']) + revgovt = d['revenue_government'] if 'revenue_company' in d and not (d['revenue_company'] is None): - revcompany.extend(d['revenue_company']) + revcompany = d['revenue_company'] for revenue in revgovt: - try: - out_government.append(getLineForRevenue(d, revenue, 'government')) - except Exception: - continue + out_government.append(getLineForRevenue(d, revenue, 'government')) for revenue in revcompany: - try: - out_company.append(getLineForRevenue(d, revenue, 'company')) - except Exception: - continue - + out_company.append(getLineForRevenue(d, revenue, 'company')) + #Split files https://github.com/NRGI/resourcedata.org/issues/13 write(d, out_government, 'government') write(d, out_company, 'company') + + #Join up where possible + #https://github.com/NRGI/resourcedata.org/issues/13 + #May seem crazy to read them back in etc. 
after splitting and writing out, but we + #can only do it for some files/rows, so this seem's smart enough for now + matched = 0 + to_match = len(out_company) + + #Regenerate the country name so we know what file to read in + countryName = d['country']['label'] + sanitizedCountryName = sanitizeCountryName(countryName) + + comp_rows = [] + gmt_rows = [] + totalled_government_rows = [] + totalled_government_rows_indexed = {} + matches = [] + + #Read in company payments + with open('./out/company/' + sanitizedCountryName + '-company.csv', 'rb') as csvfile: + csvreader = csv.DictReader(csvfile) + for row in csvreader: + comp_rows.append(row) + + #Read in government receipts + with open('./out/government/' + sanitizedCountryName + '-government.csv', 'rb') as csvfile: + csvreader = csv.DictReader(csvfile) + for row in csvreader: + gmt_rows.append(row) + + def dosum(srows): + total = 0 + for srow in srows: + total += float(srow['value_reported']) + return total + + #Generate government receipt -totals- + for row in gmt_rows: + #print "Row has year " + row['year'] + not_found = True + if disaggregated[row['year']]: + #Group related stuff together - this would probably be a great place to use dataframes (pandas) #TODO + if row['year'] not in totalled_government_rows_indexed: + totalled_government_rows_indexed[row['year']] = {} + if row['gfs_code'] not in totalled_government_rows_indexed[row['year']]: + totalled_government_rows_indexed[row['year']][row['gfs_code']] = {} + if row['name_of_revenue_stream'].lower() not in totalled_government_rows_indexed[row['year']][row['gfs_code']]: + totalled_government_rows_indexed[row['year']][row['gfs_code']][row['name_of_revenue_stream'].lower()] = {} + #We're going to track these three. Everything else should be equal. We could add some checks for that later. 
+ totalled_government_rows_indexed[row['year']][row['gfs_code']][row['name_of_revenue_stream'].lower()]['local_total'] = [] + totalled_government_rows_indexed[row['year']][row['gfs_code']][row['name_of_revenue_stream'].lower()]['us_total'] = [] + totalled_government_rows_indexed[row['year']][row['gfs_code']][row['name_of_revenue_stream'].lower()]['local_total'].append(float(row['value_reported'])) + totalled_government_rows_indexed[row['year']][row['gfs_code']][row['name_of_revenue_stream'].lower()]['us_total'].append(float(row['value_reported_as_USD'])) + #Copy of data for later + totalled_government_rows_indexed[row['year']][row['gfs_code']][row['name_of_revenue_stream'].lower()]['orig_row'] = row + + #Now convert back to simple rows + for year_in in totalled_government_rows_indexed: + for gfs_code_in in totalled_government_rows_indexed[year_in]: + for rev_str_in in totalled_government_rows_indexed[year_in][gfs_code_in]: + newrow = totalled_government_rows_indexed[year_in][gfs_code_in][rev_str_in]['orig_row'].copy() + newrow['value_reported'] = sum(totalled_government_rows_indexed[year_in][gfs_code_in][rev_str_in]['local_total']) + newrow['value_reported_as_USD'] = sum(totalled_government_rows_indexed[year_in][gfs_code_in][rev_str_in]['us_total']) + totalled_government_rows.append(newrow) + + #For each g'ment sum, look for companies paying in + for row in totalled_government_rows: + not_found = True + #No income, no payments + if row['value_reported'] == 0: + continue + search_gfs = row['gfs_code'] + search_st = row['name_of_revenue_stream'].lower() + search_amount = float(row['value_reported']) + search_year = row['year'] + print "Looking for gfs: " + search_gfs + " / str: " + search_st + " / amount: " + str(row['value_reported']) + " / year: " + search_year + ind_matches = [] + #Look for matching rows in companies; store for summing + for grow in comp_rows: + if search_gfs == grow['gfs_code'] and search_st == grow['name_of_revenue_stream'].lower() and 
search_year == grow['year']: + not_found = False + print "Found a matching company payment, accumulating..." + ind_matches.append(grow.copy()) + del grow #Prevent reuse + #Check sum for equality with government sum + if isclose(dosum(ind_matches), search_amount): + print "The total company payments match the total government receipts" + #Now we have diff. info for each company, so keep them as sep. rows and add in the government columns without duplicating columns + for match in ind_matches: + nc = match.copy() + nc.update(row) #Merge in government info (same for every row and mostly overlapping info) + matches.append(nc) + matched += 1 + else: + print "Error: sum should be: " + str(search_amount) + " but sum is: " + str(dosum(ind_matches)) + if not_found: + print "Error: despite disaggregated info, there are no company contributions for the government receipt" + print "Matched " + str(matched) + " of " + str(to_match) + " company rows to government receipts" + + #Write out results + #write(d, matches, 'combined') + else: print "%s/%s %s %s - No revenue_company or revenue_government" % (i, total_len, country, year)