Tile Generation Benchmarks for Spatial Chunk Variations

Explanation

In this notebook we compare the performance of tiling artificially generated Zarr datasets in which the chunk size is held constant, so that increasing the spatial resolution increases the number of chunks.

Setup

# External modules
# hvplot.pandas is imported for its side effect only: the `.hvplot` accessor
# used on DataFrames in the plotting cell comes from this import.
import hvplot.pandas
import holoviews as hv
import json
import pandas as pd
# Route pandas' own plotting API through holoviews as well.
pd.options.plotting.backend = 'holoviews'
import warnings
# Silence every warning so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Local modules
# The parent directory is appended to the import path before the imports below;
# presumably `helpers` and `xarray_tile_test` live there -- confirm against the
# repository layout.
import sys; sys.path.append('..')
import helpers.eodc_hub_role as eodc_hub_role
import helpers.dataframe as dataframe_helpers
from xarray_tile_test import XarrayTileTest

# The returned credentials are passed to store_results() and
# load_all_into_dataframe() further down.
# NOTE(review): the helper's name suggests it also sets the credentials
# somewhere (e.g. the environment) -- confirm in helpers/eodc_hub_role.
credentials = eodc_hub_role.fetch_and_set_credentials()

Load the fake datasets which have increasing numbers of chunks (but all at the same chunk size, 32MB).

# Benchmark settings: every (dataset, zoom level) combination is run
# `iterations` times, for zoom levels 0 through 5.
iterations = 10
zooms = range(6)

# Read the dataset catalogue inside a context manager so the file handle is
# closed as soon as parsing finishes instead of being left to the garbage
# collector.
with open('../01-generate-datasets/fake-datasets.json') as datasets_file:
    all_zarr_datasets = json.load(datasets_file)

# Only the datasets whose key contains 'with_chunks' take part in this benchmark.
zarr_datasets = {k: v for k, v in all_zarr_datasets.items() if 'with_chunks' in k}

Run Tests

# Return values of store_results(), one per dataset; they are handed to
# load_all_into_dataframe() when the results are read back.
results = []

for dataset_id, dataset_kwargs in zarr_datasets.items():
    tile_test = XarrayTileTest(dataset_id=dataset_id, **dataset_kwargs)

    # One batch of `iterations` runs per zoom level
    for zoom_level in zooms:
        tile_test.run_batch({'zoom': zoom_level}, batch_size=iterations)

    # Persist this dataset's measurements before moving on to the next dataset
    stored = tile_test.store_results(credentials)
    results.append(stored)

Wrote instance data to s3://nasa-eodc-data-store/test-results/20230907011424_XarrayTileTest_with_chunks_store_lat1448_lon2896.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230907011449_XarrayTileTest_with_chunks_store_lat2048_lon4096.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230907011520_XarrayTileTest_with_chunks_store_lat2896_lon5792.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230907011556_XarrayTileTest_with_chunks_store_lat4096_lon8192.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230907011648_XarrayTileTest_with_chunks_store_lat5793_lon11586.zarr.json

Read and Plot Results

# Read back what the test runs stored (the `results` list) into `all_df`.
all_df = dataframe_helpers.load_all_into_dataframe(credentials, results)
# NOTE(review): expand_timings is an opaque project helper. The plotting cell
# reads the "zoom", "time" and "number_of_spatial_chunks" columns from its
# output, so presumably it yields one row per timing measurement -- confirm
# against helpers/dataframe.
expanded_df = dataframe_helpers.expand_timings(all_df)

# Colours passed to every box plot
cmap = ["#FEFE62", "#D35FB7"]

# Common panel size applied to each plot
plt_opts = dict(width=400, height=300)

# One box plot per zoom level, showing the "time" column grouped by the number
# of spatial chunks in the dataset.
plts = []
for level in zooms:
    at_level = expanded_df["zoom"] == level
    box = expanded_df[at_level].hvplot.box(
        y="time",
        by=["number_of_spatial_chunks"],
        c="number_of_spatial_chunks",
        cmap=cmap,
        ylabel="Time to render (ms)",
        xlabel="Number of spatial chunks",
        legend=False,
        title=f"Zoom level {level}",
    )
    plts.append(box.opts(**plt_opts))

# Final expression of the cell: arrange the panels two per row for display
hv.Layout(plts).cols(2)

# Save the expanded timings to CSV.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists first (a no-op when it is already there).
import os
os.makedirs('results-csvs', exist_ok=True)
expanded_df.to_csv('results-csvs/04-number-of-spatial-chunks-results.csv')