# External modules
import hvplot.pandas  # noqa: F401 — imported for its side effect: registers .hvplot on pandas
import holoviews as hv
import json
import warnings

import pandas as pd

# Route pandas' .plot() calls through holoviews/hvplot instead of matplotlib.
pd.options.plotting.backend = 'holoviews'
# Silence library warnings so notebook output stays readable during benchmark runs.
warnings.filterwarnings('ignore')

# Local modules (the helpers live one directory up from this notebook)
import sys; sys.path.append('..')
import helpers.dataframe as dataframe_helpers
import helpers.eodc_hub_role as eodc_hub_role
from xarray_tile_test import XarrayTileTest
# Tile Generation Benchmarks for Varied Chunk Sizes

## Explanation

In this notebook we compare the performance of tiling artificially generated Zarr data with different chunk sizes. The CMIP6 data provides an excellent real-world dataset, but it is relatively low resolution. In order to study the impact of higher-resolution data, we artificially generated Zarr stores to explore the relationship between tile generation time and chunk size.

## Setup
= eodc_hub_role.fetch_and_set_credentials() credentials
Load the fake datasets which have increasingly fine spatial resolution and thus increasingly large chunk size.
# Run this many iterations of each setting (see `iterations` below).
iterations = 5
# Benchmark web-map zoom levels 0 through 5.
zooms = range(6)
# Load the catalog of artificially generated datasets, then keep only the
# single-chunk stores for this comparison.
with open('../01-generate-datasets/fake-datasets.json') as f:
    all_zarr_datasets = json.load(f)
zarr_datasets = {k: v for k, v in all_zarr_datasets.items() if 'single_chunk' in k}
Run Tests
results = []

# For each dataset, run a batch of tile requests at every zoom level and
# persist the timing results to S3.
for zarr_dataset_id, zarr_dataset in zarr_datasets.items():
    zarr_tile_test = XarrayTileTest(
        dataset_id=zarr_dataset_id,
        **zarr_dataset,
    )
    # Run `iterations` (5) requests for each zoom level.
    for zoom in zooms:
        zarr_tile_test.run_batch({'zoom': zoom}, batch_size=iterations)
    # store_results returns the S3 URI of the written results file.
    results.append(zarr_tile_test.store_results(credentials))
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230919222720_XarrayTileTest_single_chunk_store_lat1024_lon2048.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230919222734_XarrayTileTest_single_chunk_store_lat1448_lon2896.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230919222759_XarrayTileTest_single_chunk_store_lat2048_lon4096.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230919222845_XarrayTileTest_single_chunk_store_lat2896_lon5792.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230919223015_XarrayTileTest_single_chunk_store_lat4096_lon8192.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230919223021_XarrayTileTest_single_chunk_store_lat512_lon1024.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230919223027_XarrayTileTest_single_chunk_store_lat724_lon1448.zarr.json
## Read and Plot Results
# Pull the stored result files back from S3 into one dataframe, expand the
# per-request timing lists into rows, and order by chunk size for plotting.
all_df = dataframe_helpers.load_all_into_dataframe(credentials, results)
expanded_df = dataframe_helpers.expand_timings(all_df)
expanded_df = expanded_df.sort_values('chunk_size_mb')
= ["#E66100", "#5D3A9B"]
cmap = {"width": 400, "height": 300}
plt_opts
= []
plts
for zoom_level in zooms:
= expanded_df[expanded_df["zoom"] == zoom_level]
df_level
plts.append(
df_level.hvplot.box(="time",
y=["chunk_size_mb"],
by="chunk_size_mb",
c=cmap,
cmap="Time to render (ms)",
ylabel="Chunk size (MB)",
xlabel=False,
legend=f"Zoom level {zoom_level}",
title**plt_opts)
).opts(
)2) hv.Layout(plts).cols(
'results-csvs/03-chunk-size-results.csv') expanded_df.to_csv(