Update summary dataframe and data list (#41)
maxrjones authored Aug 24, 2023
1 parent ee5aa1d commit 895edb5
Showing 6 changed files with 221 additions and 752 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -30,7 +30,7 @@ options:
--timeout TIMEOUT Timeout limit in milliseconds
--detect-provider Detect provider
--approach APPROACH Approach to use. Must be one of: ['dynamic-client']
- --dataset DATASET dataset name. Must be one of: ['pyramids-v3-sharded-4326-1MB', 'pyramids-v3-sharded-4326-5MB']
+ --dataset DATASET dataset name.
--variable VARIABLE Variable name. Must be one of: ['tasmax']
--non-headless Run in non-headless mode
--s3-bucket S3_BUCKET
1 change: 1 addition & 0 deletions binder/environment.yml
@@ -22,3 +22,4 @@ dependencies:
- cloud-detect
- opencv-python-headless
- git+https://github.com/carbonplan/benchmark-maps
+ - git+https://github.com/scalableminds/zarrita.git@b88ae5ca02917e24e716855f15ffaf020b95bb1a
103 changes: 93 additions & 10 deletions carbonplan_benchmarks/analysis/processing.py
@@ -5,10 +5,12 @@
import fsspec
import numpy as np
import pandas as pd
+import zarrita

from .parsing import extract_event_type, extract_frame_data, extract_request_data

pd.options.plotting.backend = 'holoviews'
+pd.options.mode.chained_assignment = None


def base64_to_img(base64jpeg):
@@ -30,7 +32,7 @@ def base64_to_img(base64jpeg):
return cv.imdecode(arr, cv.IMREAD_COLOR)
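
For reference, a minimal round-trip sketch of what `base64_to_img` does: encode a frame to JPEG, base64 it, then decode it back into an OpenCV array. The dummy frame here is an assumption for illustration, not part of the commit.

```python
import base64

import cv2 as cv
import numpy as np

# Encode a dummy frame to JPEG/base64, then decode it the way base64_to_img does.
frame = np.zeros((4, 4, 3), dtype=np.uint8)  # assumed test frame
ok, buf = cv.imencode('.jpg', frame)
b64 = base64.b64encode(buf.tobytes())

arr = np.frombuffer(base64.b64decode(b64), dtype=np.uint8)
decoded = cv.imdecode(arr, cv.IMREAD_COLOR)
assert decoded.shape == frame.shape
```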


-def calculate_snapshot_rmse(*, trace_events, snapshots, metadata):
+def calculate_snapshot_rmse(*, trace_events, snapshots, metadata, xstart: int = 133):
"""
Extract screenshots from a list of Chromium trace events.
@@ -57,7 +59,7 @@ def calculate_rmse(predictions, targets):
var = f'rmse_snapshot_{zoom_level}'
for ind, row in screenshots.iterrows():
frame = base64_to_img(row['args.snapshot'])
-screenshots.loc[ind, var] = calculate_rmse(frame, snapshot)
+screenshots.loc[ind, var] = calculate_rmse(frame[:, xstart:], snapshot[:, xstart:])
return screenshots
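
A minimal sketch of the comparison above: RMSE between a trace frame and a reference snapshot, ignoring the columns left of `xstart`. That those columns hold static page chrome that would otherwise inflate the error is an assumption; the sample arrays are made up.

```python
import numpy as np

def calculate_rmse(predictions, targets):
    # Root-mean-square error over all pixels and channels.
    return np.sqrt(np.mean((predictions.astype(float) - targets.astype(float)) ** 2))

rng = np.random.default_rng(0)
frame = rng.integers(0, 256, (480, 640, 3), dtype=np.uint8)
snapshot = rng.integers(0, 256, (480, 640, 3), dtype=np.uint8)
xstart = 133  # columns left of this are excluded from the comparison
score = calculate_rmse(frame[:, xstart:], snapshot[:, xstart:])
```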


@@ -118,6 +120,28 @@ def load_data(*, metadata_path: str, run: int):
metadata['full_trace_path'] = trace_path
with fs.open(trace_path) as f:
trace_events = json.loads(f.read())['traceEvents']
event_types = [
'ResourceSendRequest',
'ResourceFinish',
'BeginFrame',
'DrawFrame',
'DroppedFrame',
'Commit',
'Screenshot',
'benchmark-initial-load:start',
'benchmark-initial-load:end',
'benchmark-zoom_in-level-0:start',
'benchmark-zoom_in-level-1:start',
'benchmark-zoom_in-level-2:start',
'benchmark-zoom_in-level-0:end',
'benchmark-zoom_in-level-1:end',
'benchmark-zoom_in-level-2:end',
]
trace_events = [
event
for event in trace_events
if event['name'] in event_types or 'benchmark-zoom' in event['name']
]
return metadata, trace_events
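
A sketch of the keep/drop rule applied above, on a made-up trace event. The event shape follows Chromium's trace-event format; the specific name and timestamp are assumptions.

```python
# Events are kept if their name is enumerated, or if it contains 'benchmark-zoom'.
event_types = ['ResourceSendRequest', 'Screenshot']
event = {'name': 'benchmark-zoom_in-level-3:start', 'ts': 1}
keep = event['name'] in event_types or 'benchmark-zoom' in event['name']
assert keep  # zoom markers beyond the enumerated levels still pass the filter
```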


@@ -141,16 +165,60 @@ def load_snapshots(*, snapshot_path: str):
return snapshots


def get_chunk_size(URI, zarr_version, sharded, var='tasmax'):
"""
Get the chunk size in megabytes, based on the zoom level 0 array.
"""
source_store = zarrita.RemoteStore(URI)
if zarr_version == 2:
source_array = zarrita.ArrayV2.open(source_store / '0' / var)
chunks = source_array.metadata.chunks
itemsize = source_array.metadata.dtype.itemsize
else:
source_array = zarrita.Array.open(source_store / '0' / var)
if sharded:
chunks = source_array.metadata.codecs[0].configuration.chunk_shape
else:
chunks = source_array.metadata.chunk_grid.configuration.chunk_shape
itemsize = source_array.metadata.dtype.itemsize
chunk_size = np.prod(chunks) * itemsize * 1e-6
return chunk_size
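
The size arithmetic in `get_chunk_size` is elements per chunk times bytes per element, scaled to megabytes. The chunk shape and dtype below are assumptions for illustration.

```python
import numpy as np

chunks = (1, 128, 256)                    # assumed chunk shape
itemsize = np.dtype('float32').itemsize   # 4 bytes per element
chunk_size_mb = np.prod(chunks) * itemsize * 1e-6  # 32768 * 4 * 1e-6 ≈ 0.131 MB
```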


def add_chunk_size(
summary: pd.DataFrame,
*,
root_path: str = 's3://carbonplan-benchmarks/data/NEX-GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/tasmax/tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn',
):
"""
Add a column to the summary DataFrame containing the chunk size.
"""
datasets = summary[
['zarr_version', 'dataset', 'shard_size', 'target_chunk_size']
].drop_duplicates()
datasets['URI'] = root_path + '/' + datasets['dataset']
datasets['actual_chunk_size'] = datasets.apply(
lambda x: get_chunk_size(x['URI'], x['zarr_version'], x['shard_size']), axis=1
)
datasets = datasets[['dataset', 'actual_chunk_size']]
return summary.set_index('dataset').join(datasets.set_index('dataset'))
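
The join `add_chunk_size` performs has this shape, with toy values standing in for the real summary frame (the dataset names and sizes are assumptions):

```python
import pandas as pd

summary = pd.DataFrame({'dataset': ['a', 'a', 'b'], 'zoom': [0, 1, 0]})
sizes = pd.DataFrame({'dataset': ['a', 'b'], 'actual_chunk_size': [1.05, 5.24]})
# Every summary row for dataset 'a' now carries actual_chunk_size == 1.05.
joined = summary.set_index('dataset').join(sizes.set_index('dataset'))
```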


def create_summary(*, metadata: pd.DataFrame, data: dict, url_filter: str = None):
"""
Create summary DataFrame for a given run
"""
summary = pd.concat(
[pd.DataFrame(metadata, index=[0])] * (metadata['zoom_level'] + 1), ignore_index=True
)
summary['metadata_path'] = metadata['metadata_path']
summary['trace_path'] = metadata['trace_path']
summary['zarr_version'] = summary['dataset'].apply(lambda x: int(x.split('-')[1][1]))
summary['chunk_size'] = summary['dataset'].apply(lambda x: int(x.split('-')[5]))
summary['projection'] = summary['dataset'].apply(lambda x: int(x.split('-')[2]))
summary['pixels_per_tile'] = summary['dataset'].apply(lambda x: int(x.split('-')[4]))
summary['target_chunk_size'] = summary['dataset'].apply(lambda x: int(x.split('-')[5]))
summary['shard_orientation'] = summary['dataset'].apply(lambda x: x.split('-')[6])
summary['shard_size'] = summary['dataset'].apply(lambda x: int(x.split('-')[7]))
frames_data = data['frames_data']
request_data = data['request_data']

@@ -164,16 +232,30 @@ def create_summary(*, metadata: pd.DataFrame, data: dict, url_filter: str = None
(request_data['request_start'] > actions.loc[zoom, 'start_time'])
& (request_data['request_start'] <= actions.loc[zoom, 'action_end_time'])
]
-summary['total_requests'] = len(requests)
+summary.loc[zoom, 'total_requests'] = len(requests)
if url_filter:
requests = requests[requests['url'].str.contains(url_filter)]
-summary['filtered_requests'] = len(requests)
-if requests['request_start'].max() > actions.loc[zoom, 'action_end_time']:
-raise Warning(f'Request for zoom level {zoom} started after timeout')
-if requests['response_end'].max() > actions.loc[zoom, 'action_end_time']:
-raise Warning(f'Response duration for zoom level {zoom} exceeded timeout')
+summary.loc[zoom, 'filtered_requests'] = len(requests)
summary.loc[zoom, 'filtered_requests_average_encoded_data_length'] = requests[
'encoded_data_length'
].mean()
summary.loc[zoom, 'filtered_requests_maximum_encoded_data_length'] = requests[
'encoded_data_length'
].max()
summary.loc[zoom, 'zoom'] = zoom
summary.loc[zoom, 'duration'] = actions.loc[zoom, 'duration']
summary.loc[zoom, 'timeout'] = False
if requests['request_start'].max() > actions.loc[zoom, 'action_end_time']:
actions.loc[zoom, 'action_end_time'] = np.nan
summary.loc[zoom, 'duration'] = metadata['timeout']
summary.loc[zoom, 'timeout'] = True
if requests['response_end'].max() > actions.loc[zoom, 'action_end_time']:
actions.loc[zoom, 'action_end_time'] = np.nan
summary.loc[zoom, 'duration'] = metadata['timeout']
summary.loc[zoom, 'timeout'] = True
if summary.loc[zoom, 'duration'] > metadata['timeout']:
summary.loc[zoom, 'duration'] = metadata['timeout']
summary.loc[zoom, 'timeout'] = True
summary.loc[zoom, 'fps'] = len(frames) / (actions.loc[zoom, 'duration'] * 1e-3)
if requests.empty:
summary.loc[zoom, 'request_duration'] = 0
@@ -183,6 +265,7 @@ def create_summary(*, metadata: pd.DataFrame, data: dict, url_filter: str = None
)
summary['request_percent'] = summary['request_duration'] / summary['duration'] * 100
summary['non_request_duration'] = summary['duration'] - summary['request_duration']
summary = add_chunk_size(summary)

return summary
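
As a reading aid for the split-based parsing near the top of `create_summary`, here is how a dataset ID breaks into fields. This sample name is hypothetical; it only follows the eight-field pattern the code indexes into.

```python
name = 'pyramids-v3-4326-pyramid-128-5-spatial-10'  # hypothetical dataset ID
parts = name.split('-')
zarr_version = int(parts[1][1])     # 3   (from 'v3')
projection = int(parts[2])          # 4326
pixels_per_tile = int(parts[4])     # 128
target_chunk_size = int(parts[5])   # 5
shard_orientation = parts[6]        # 'spatial'
shard_size = int(parts[7])          # 10
```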

