From a0ad4a82a66571d30897d0374094d1a384019d7c Mon Sep 17 00:00:00 2001 From: Matt Fullerton Date: Tue, 21 Nov 2017 09:40:15 +0100 Subject: [PATCH] Experimental code to test #28 --- harvesters/eiti/scripts/extract_summary.py | 207 ++++++++++++++++++--- 1 file changed, 178 insertions(+), 29 deletions(-) diff --git a/harvesters/eiti/scripts/extract_summary.py b/harvesters/eiti/scripts/extract_summary.py index 8e8fa1a..9935392 100644 --- a/harvesters/eiti/scripts/extract_summary.py +++ b/harvesters/eiti/scripts/extract_summary.py @@ -3,6 +3,7 @@ import json import requests import unicodedata +import unicodecsv as csv API_ENDPOINT = "https://eiti.org/api/v1.0/" @@ -51,17 +52,21 @@ def writeCsv(name, company_or_govt, data): for l in data: f.write(l.encode('utf-8') + '\n') +#From Python 3.5, https://stackoverflow.com/questions/5595425/what-is-the-best-way-to-compare-floats-for-almost-equality-in-python#33024979 +def isclose(a, b, rel_tol=1e-09, abs_tol=0.0): + return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) -def write(meta, data, company_or_govt): +def write(meta, data, company_or_govt_or_combined): countryName = meta['country']['label'] sanitizedCountryName = sanitizeCountryName(countryName) - writeCsv(sanitizedCountryName, company_or_govt, data) + writeCsv(sanitizedCountryName, company_or_govt_or_combined, data) dataset_title = "EITI Summary data table for %s" % countryName dataset_name = "eiti-summary-data-table-for-%s" % sanitizedCountryName resource_title_company = "Company payments - %s" % countryName resource_title_government = "Revenues received by government agencies - %s" % countryName + resource_title_combined = "Matched transactions - %s" % countryName if not (dataset_name in tracking): dataset = { @@ -78,8 +83,10 @@ def write(meta, data, company_or_govt): "category": ["Precept 2: Accountability and Transparency"], "filename_company": './out/company/%s-company.csv' % sanitizedCountryName, "filename_government": 
'./out/government/%s-government.csv' % sanitizedCountryName, + "filename_combined": './out/combined/%s-combined.csv' % sanitizedCountryName, "resource_title_company": resource_title_company, - "resource_title_government": resource_title_government + "resource_title_government": resource_title_government, + "resource_title_combined": resource_title_combined } datasets[dataset_name] = dataset tracking.append(dataset_name) @@ -87,9 +94,9 @@ def write(meta, data, company_or_govt): else: yearsofar = datasets[dataset_name]['year'] ysf = set(yearsofar) - theyear = meta['label'][-4:] - ysf.add(theyear) - allyears.add(theyear) + myyear = meta['label'][-4:] + ysf.add(myyear) + allyears.add(myyear) yearsofar = list(ysf) yearsofar.sort() datasets[dataset_name]['year'] = yearsofar @@ -100,16 +107,22 @@ def sanitizeCountryName(countryName): return re.sub('[^a-z]', '-', asciiCountryName) def getSummaryData(): + offline = True page = 1 done = False data = [] - - while (done is False): - d = requests.get(API_ENDPOINT + 'summary_data?page=%s' % page).json()['data'] - if len(d) == 0: - done = True - data.extend(d) - page += 1 + if (offline): + with open ("rawdata.json", "rb") as jsonfile: + data = json.loads(jsonfile.read()) + else: + while (done is False): + d = requests.get(API_ENDPOINT + 'summary_data?page=%s' % page).json()['data'] + if len(d) == 0: + done = True + data.extend(d) + page += 1 + #with open ("rawdata.json", "wb") as jsonfile: + # jsonfile.write(json.dumps(data)) return data @@ -131,15 +144,27 @@ def getLineForRevenue(d, company, company_or_govt): cid = company['organisation_id'] companyurl = API_ENDPOINT + 'organisation/' + cid + if cid not in organisations: - j = requests.get(companyurl).json() - organisations[cid] = j['data'][0] + try: + j = requests.get(companyurl).json() + organisations[cid] = j['data'][0] + except: + print "Error: " + companyurl + " failed" + organisations[cid] = {} + organisations[cid]['label'] = 'Unknown' rid = company['id'] revurl =
API_ENDPOINT + 'revenue/' + rid if rid not in revenues: - j = requests.get(revurl).json() - revenues[rid] = j['data'][0] + try: + j = requests.get(revurl).json() + revenues[rid] = j['data'][0] + except: + print "Error: " + revurl + " failed" + revenues[rid] = {} + revenues[rid]['label'] = 'Unknown' + gfscode = gfs[gid]['code'] gfsdesc = gfs[gid]['label'] @@ -157,11 +182,20 @@ def getLineForRevenue(d, company, company_or_govt): valreportedusd = company['revenue'] + #Weird bug in API that we probably shouldn't completely ignore; may not happen in online version + if valreported == '0' or valreportedusd == '0' or valreported == None or valreportedusd == None: + valreported = '0' + valreportedusd = '0' + stream_name = revenues[rid]['label'] currency_code = company['original_currency'] - currency_rate = d['country']['metadata'][year]['currency_rate'] + if type(d['country']['metadata']) == dict: + currency_rate = d['country']['metadata'][year]['currency_rate'] + else: + print "Error: Metadata is missing,can't get currency rate" + currency_rate = "Unknown" #Split files https://github.com/NRGI/resourcedata.org/issues/13 returnstring = ( @@ -211,6 +245,25 @@ def getLineForRevenue(d, company, company_or_govt): year = d['label'][-4:] + disaggregated = {} + + if type(d['country']['metadata']) == dict: + for theyear, report in dict.items(d['country']['metadata']): + if report['disaggregated']['revenue_stream'] == "1" and report['disaggregated']['company'] == "1": + disaggregated[theyear] = True + #elif report['disaggregated']['revenue_stream'] == "1": + # print "Error: rev stream is disag but company not" + # exit() + #elif report['disaggregated']['company'] == "1": + # print "Error: company is disag but rev stream not" + # exit() + else: + disaggregated[theyear] = False + else: + print "Error: Metadata is missing, assuming not disaggregated" + for y in range(1990, 2020): + disaggregated[str(y)] = False + if (d['revenue_company'] or d['revenue_government']): print "%s/%s %s %s" 
% (i, total_len, country, year) @@ -219,25 +272,121 @@ def getLineForRevenue(d, company, company_or_govt): revcompany = [] if 'revenue_government' in d and not (d['revenue_government'] is None): - revgovt.extend(d['revenue_government']) + revgovt = d['revenue_government'] if 'revenue_company' in d and not (d['revenue_company'] is None): - revcompany.extend(d['revenue_company']) + revcompany = d['revenue_company'] for revenue in revgovt: - try: - out_government.append(getLineForRevenue(d, revenue, 'government')) - except Exception: - continue + out_government.append(getLineForRevenue(d, revenue, 'government')) for revenue in revcompany: - try: - out_company.append(getLineForRevenue(d, revenue, 'company')) - except Exception: - continue - + out_company.append(getLineForRevenue(d, revenue, 'company')) + #Split files https://github.com/NRGI/resourcedata.org/issues/13 write(d, out_government, 'government') write(d, out_company, 'company') + + #Join up where possible + #https://github.com/NRGI/resourcedata.org/issues/13 + #May seem crazy to read them back in etc. 
after splitting and writing out, but we + #can only do it for some files/rows, so this seem's smart enough for now + matched = 0 + to_match = len(out_company) + + #Regenerate the country name so we know what file to read in + countryName = d['country']['label'] + sanitizedCountryName = sanitizeCountryName(countryName) + + comp_rows = [] + gmt_rows = [] + totalled_government_rows = [] + totalled_government_rows_indexed = {} + matches = [] + + #Read in company payments + with open('./out/company/' + sanitizedCountryName + '-company.csv', 'rb') as csvfile: + csvreader = csv.DictReader(csvfile) + for row in csvreader: + comp_rows.append(row) + + #Read in government receipts + with open('./out/government/' + sanitizedCountryName + '-government.csv', 'rb') as csvfile: + csvreader = csv.DictReader(csvfile) + for row in csvreader: + gmt_rows.append(row) + + def dosum(srows): + total = 0 + for srow in srows: + total += float(srow['value_reported']) + return total + + #Generate government receipt -totals- + for row in gmt_rows: + #print "Row has year " + row['year'] + not_found = True + if disaggregated[row['year']]: + #Group related stuff together - this would probably be a great place to use dataframes (pandas) #TODO + if row['year'] not in totalled_government_rows_indexed: + totalled_government_rows_indexed[row['year']] = {} + if row['gfs_code'] not in totalled_government_rows_indexed[row['year']]: + totalled_government_rows_indexed[row['year']][row['gfs_code']] = {} + if row['name_of_revenue_stream'].lower() not in totalled_government_rows_indexed[row['year']][row['gfs_code']]: + totalled_government_rows_indexed[row['year']][row['gfs_code']][row['name_of_revenue_stream'].lower()] = {} + #We're going to track these three. Everything else should be equal. We could add some checks for that later. 
+ totalled_government_rows_indexed[row['year']][row['gfs_code']][row['name_of_revenue_stream'].lower()]['local_total'] = [] + totalled_government_rows_indexed[row['year']][row['gfs_code']][row['name_of_revenue_stream'].lower()]['us_total'] = [] + totalled_government_rows_indexed[row['year']][row['gfs_code']][row['name_of_revenue_stream'].lower()]['local_total'].append(float(row['value_reported'])) + totalled_government_rows_indexed[row['year']][row['gfs_code']][row['name_of_revenue_stream'].lower()]['us_total'].append(float(row['value_reported_as_USD'])) + #Copy of data for later + totalled_government_rows_indexed[row['year']][row['gfs_code']][row['name_of_revenue_stream'].lower()]['orig_row'] = row + + #Now convert back to simple rows + for year_in in totalled_government_rows_indexed: + for gfs_code_in in totalled_government_rows_indexed[year_in]: + for rev_str_in in totalled_government_rows_indexed[year_in][gfs_code_in]: + newrow = totalled_government_rows_indexed[year_in][gfs_code_in][rev_str_in]['orig_row'].copy() + newrow['value_reported'] = sum(totalled_government_rows_indexed[year_in][gfs_code_in][rev_str_in]['local_total']) + newrow['value_reported_as_USD'] = sum(totalled_government_rows_indexed[year_in][gfs_code_in][rev_str_in]['us_total']) + totalled_government_rows.append(newrow) + + #For each g'ment sum, look for companies paying in + for row in totalled_government_rows: + not_found = True + #No income, no payments + if row['value_reported'] == 0: + continue + search_gfs = row['gfs_code'] + search_st = row['name_of_revenue_stream'].lower() + search_amount = float(row['value_reported']) + search_year = row['year'] + print "Looking for gfs: " + search_gfs + " / str: " + search_st + " / amount: " + str(row['value_reported']) + " / year: " + search_year + ind_matches = [] + #Look for matching rows in companies; store for summing + for grow in comp_rows: + if search_gfs == grow['gfs_code'] and search_st == grow['name_of_revenue_stream'].lower() and 
search_year == grow['year']: + not_found = False + print "Found a matching company payment, accumulating..." + ind_matches.append(grow.copy()) + del grow #Prevent reuse + #Check sum for equality with government sum + if isclose(dosum(ind_matches), search_amount): + print "The total company payments match the total government receipts" + #Now we have diff. info for each company, so keep them as sep. rows and add in the government columns without duplicating columns + for match in ind_matches: + nc = match.copy() + nc.update(row) #Merge in government info (same for every row and mostly overlapping info) + matches.append(nc) + matched += 1 + else: + print "Error: sum should be: " + str(search_amount) + " but sum is: " + str(dosum(ind_matches)) + if not_found: + print "Error: despite disaggregated info, there are no company contributions for the government receipt" + print "Matched " + str(matched) + " of " + str(to_match) + " company rows to government receipts" + + #Write out results + #write(d, matches, 'combined') + else: print "%s/%s %s %s - No revenue_company or revenue_government" % (i, total_len, country, year)