#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 15 15:48:38 2018
@author: ericgrant
"""
import argparse
from googleapiclient.discovery import build
import httplib2
from oauth2client import client
from oauth2client import file
from oauth2client import tools
import pandas as pd
from pandas import json_normalize  # pandas.io.json.json_normalize is deprecated
from omnisci_utils import get_credentials
from omnisci_utils import wake_and_connect_to_mapd
from omnisci_utils import drop_table_mapd
from omnisci_utils import disconnect_mapd
from parsing_utils import format_date_cols
from parsing_utils import format_int_col
from parsing_utils import format_str_col
from parsing_utils import format_flt_col
file_path = '/Users/ericgrant/Downloads/OKR_Dashboards/xfer/'
file_geocodes = file_path + 'AdWords API Location Criteria 2018-09-04.csv'
# parameters for OmniSci Cloud
mapdhost = 'use2-api.mapd.cloud'
mapdport = 443
mapdprotocol = 'https'
mapddbname = 'mapd'
mapduser = 'mapd'
omnisci_keyfile = file_path + 'omnisci_keys.json'
wait_interval = 25
# parameters for Google API
SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
DISCOVERY_URI = 'https://analyticsreporting.googleapis.com/$discovery/rest'
VIEW_ID = '93521025'
CLIENT_SECRETS_PATH = file_path + 'client_secrets.json' # path to client_secrets.json file.
start_date = '2017-04-01'
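# Each tables_and_files entry pairs an OmniSci table with the GA report that fills it:
# (table_name, csv_path, date_cols, int_cols, float_cols, str_cols, rename_cols,
#  timestamp_format, CREATE TABLE statement, GA report request body)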
tables_and_files = [
    # blog post views
    ('techsup_ga_blogvisits', file_path + 'techsup_ga_blogvisits.csv', {'c1_timestamp'}, set(), {'time_on_page', 'unique_pageviews'}, set(), {'geo_city_code'}, "%Y%m%d",
'CREATE TABLE IF NOT EXISTS techsup_ga_blogvisits (blog_title TEXT ENCODING DICT(8), blog_url TEXT ENCODING DICT(8), referral_path TEXT ENCODING DICT(8), c1_timestamp TIMESTAMP, unique_pageviews FLOAT, time_on_page FLOAT, source TEXT ENCODING DICT(8), city_name TEXT ENCODING DICT(8), city_canonical_name TEXT ENCODING DICT(8), country_code TEXT ENCODING DICT(8));',
{
'reportRequests': [
{
'viewId': VIEW_ID,
'dateRanges': [
{'startDate': start_date, 'endDate': 'today'}
],
'pageSize': 10000,
'metrics': [
{'expression': 'ga:uniquePageviews'},
{'expression': 'ga:timeOnPage'}
],
'dimensions': [
{'name': 'ga:pageTitle'},
{'name': 'ga:pagePath'},
{'name': 'ga:referralPath'},
{'name': 'ga:date'},
{'name': 'ga:cityID'}
],
'dimensionFilterClauses': [
{'filters': [
{'dimensionName': 'ga:pageTitle', 'operator': 'PARTIAL', 'expressions': ['blog']}
]}
]}
]}
)
]
# GOOGLE ANALYTICS FUNCTIONS
def initialize_analyticsreporting():
"""Initializes the analyticsreporting service object.
Returns:
analytics an authorized analyticsreporting service object.
"""
    # Build oauth2client's standard flags; parse_args([]) deliberately ignores
    # sys.argv so the auth flow runs with its defaults.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[tools.argparser])
    flags = parser.parse_args([])
# Set up a Flow object to be used if we need to authenticate.
flow = client.flow_from_clientsecrets(
CLIENT_SECRETS_PATH, scope=SCOPES,
message=tools.message_if_missing(CLIENT_SECRETS_PATH))
# Prepare credentials, and authorize HTTP object with them.
# If the credentials don't exist or are invalid run through the native client
# flow. The Storage object will ensure that if successful the good
# credentials will get written back to a file.
storage = file.Storage('analyticsreporting.dat')
credentials = storage.get()
if credentials is None or credentials.invalid:
credentials = tools.run_flow(flow, storage, flags)
http = credentials.authorize(http=httplib2.Http())
# Build the service object.
analytics = build('analytics', 'v4', http=http, discoveryServiceUrl=DISCOVERY_URI)
return analytics
def get_report(analytics, bodycontent):
# Use the Analytics Service Object to query the Analytics Reporting API V4.
return analytics.reports().batchGet(
body=bodycontent).execute()
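# For reference, a batchGet response is shaped roughly like this
# (only the fields consumed below are shown):
# {'reports': [{'columnHeader': {'dimensions': [...],
#                                'metricHeader': {'metricHeaderEntries': [...]}},
#               'data': {'rows': [{'dimensions': [...],
#                                  'metrics': [{'values': [...]}]}]}}]}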
def print_response(response):
"""Parses and prints the Analytics Reporting API V4 response"""
for report in response.get('reports', []):
columnHeader = report.get('columnHeader', {})
dimensionHeaders = columnHeader.get('dimensions', [])
metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
rows = report.get('data', {}).get('rows', [])
for row in rows:
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
            for header, dimension in zip(dimensionHeaders, dimensions):
                print(header + ': ' + dimension)
            for i, values in enumerate(dateRangeValues):
                print('Date range (' + str(i) + ')')
                for metricHeader, value in zip(metricHeaders, values.get('values')):
                    print(metricHeader.get('name') + ': ' + value)
def format_data(response):
reports = response['reports'][0]
    # the API's own header names (e.g. 'ga:pageTitle') aren't needed here;
    # the DataFrame columns are renamed below to match the OmniSci table schema
data = json_normalize(reports['data']['rows'])
data_dimensions = pd.DataFrame(data['dimensions'].tolist())
data_metrics = pd.DataFrame(data['metrics'].tolist())
data_metrics = data_metrics.applymap(lambda x: x['values'])
data_metrics = pd.DataFrame(data_metrics[0].tolist())
result = pd.concat([data_dimensions, data_metrics], axis=1, ignore_index=True)
result.columns = ["blog_title", "blog_url", "referral_path", "c1_timestamp", "geo_city_code", "unique_pageviews", "time_on_page"] # set the column names
    return result
def output_to_csv(df, fn):
df.to_csv(fn, index=False)
# OMNISCI FUNCTIONS
def source(url):
if 'tag' in url:
source = 'tag'
elif 'author' in url:
source = 'author'
elif url == 'www.mapd.com/blog':
source = 'mapd blog landing page'
elif url == 'www.mapd.com/blog/':
source = 'mapd blog landing page'
elif url == 'www.omnisci.com/blog':
source = 'omnisci blog landing page'
    elif url == 'www.omnisci.com/blog/':
        source = 'omnisci blog landing page'
elif 'community.mapd.com' in url:
source = 'mapd community forum'
elif 'community.omnisci.com' in url:
source = 'omnisci community forum'
else:
source = 'other / direct'
    return source
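# A few illustrative (hypothetical) inputs and the labels source() assigns them:
#   source('www.mapd.com/blog/tag/gpu')    -> 'tag'
#   source('www.omnisci.com/blog')         -> 'omnisci blog landing page'
#   source('www.mapd.com/blog/some-post')  -> 'other / direct'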
def parse_source(df):
df['source'] = df['blog_url'].apply(source)
return df
def parse_city(df):
dfcity = pd.read_csv(file_geocodes)
dfcity.columns = ['geo_city_code', 'city_name', 'city_canonical_name', 'city_parent_code', 'country_code', 'city_target_type', 'city_status'] # set the column names
format_str_col(dfcity, {'geo_city_code'})
df = pd.merge(df, dfcity, on=['geo_city_code'], how='left')
return df
def parse_geo_data(df):
df = parse_source(df)
df = parse_city(df)
return df
# Load CSV to dataframe
def parse_data(csvfile, dtcols, intcols, floatcols, strcols, renamings, tfrmt):
    # note: 'renamings' is accepted for consistency with the tables_and_files
    # tuple layout, but is not currently used inside this function
    df = pd.read_csv(csvfile)
    df.reset_index(drop=True, inplace=True)
    format_date_cols(df, dtcols, tfrmt)  # recast the datetime column(s) from strings to timestamps
    format_int_col(df, intcols)
    format_str_col(df, strcols)
    format_flt_col(df, floatcols)
df = parse_geo_data(df)
    # drop the geocode helper columns that aren't part of the OmniSci table
    df = df.drop(columns=['geo_city_code', 'city_parent_code', 'city_target_type', 'city_status'])
return df
def wake_up_omnisci():
# get OmniSci credentials
    dfcreds = get_credentials(omnisci_keyfile)
# connect to OmniSci, allowing time for the instance to wake
connection = wake_and_connect_to_mapd(dfcreds['write_key_name'], dfcreds['write_key_secret'], mapdhost, mapddbname)
return connection
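# wake_and_connect_to_mapd is assumed to return the sentinel string "RETRY"
# when the instance cannot be woken; main() checks for that below.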
# MAIN
def main():
# connect to omnisci
connection = wake_up_omnisci()
if connection != "RETRY":
# loop through tables and reports
for os_table, csv_file, dt_cols, int_cols, float_cols, str_cols, rename_cols, time_format, creationstring, reportbody in tables_and_files:
# connect to Google Analytics
analytics = initialize_analyticsreporting()
response = get_report(analytics, reportbody)
# format the data into the columnar tables OmniSci wants
df = format_data(response)
# save the dataframe to a file
output_to_csv(df, csv_file)
# create the new dataframe from the file contents
df = parse_data(csv_file, dt_cols, int_cols, float_cols, str_cols, rename_cols, time_format)
            print('loading dataframe into table ' + os_table)
drop_table_mapd(connection, os_table) #drop the old table
connection.execute(creationstring)
connection.load_table(os_table, df) #load the new table into OmniSci
# disconnect from OmniSci
disconnect_mapd(connection)
else:
print('could not wake OmniSci; exiting')
if __name__ == '__main__':
main()
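# On the first run, tools.run_flow() walks through Google's OAuth consent flow in a
# browser and caches the credentials in analyticsreporting.dat, so later runs of
# `python OKR_techsup_ga.py` should proceed without prompting.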