Tile Generation Benchmarks across Data Formats

Explanation

In this notebook we compare the performance of tiling CMIP6 data stored as COG, NetCDF, and Zarr. To tile the NetCDF data, we use a kerchunk reference file. The ZarrReader can read NetCDF directly, but it can only read one file at a time, which makes that approach incomparable with the pgSTAC+COG and Zarr methods.
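
For context, the kerchunk approach reads the original NetCDF bytes through xarray's zarr engine via an fsspec reference filesystem. The sketch below shows that pattern only for illustration; the reference path is hypothetical, and the actual reference JSON is generated in 01-generate-datasets.

import xarray as xr

# Hypothetical reference path -- the real kerchunk JSON is produced in
# ../01-generate-datasets and pointed to by cmip6-kerchunk-dataset.json.
reference_json = "s3://nasa-eodc-data-store/cmip6-kerchunk-reference.json"

ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "consolidated": False,
        "storage_options": {
            "fo": reference_json,              # the kerchunk reference file
            "remote_protocol": "s3",           # where the original NetCDF bytes live
            "remote_options": {"anon": False},
        },
    },
)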

Setup

# External modules
import hvplot
import hvplot.pandas
import holoviews as hv
import json
import pandas as pd
pd.options.plotting.backend = 'holoviews'
import rioxarray
import warnings
warnings.filterwarnings('ignore')

# Local modules
import sys; sys.path.append('..')
from cog_tile_test import CogTileTest
import helpers.dataframe as dataframe_helpers
import helpers.eodc_hub_role as eodc_hub_role
from xarray_tile_test import XarrayTileTest
credentials = eodc_hub_role.fetch_and_set_credentials()

Below we load only the CMIP6 Zarr dataset that has the same chunk structure as the original NetCDF data.

# Run 10 iterations of each setting at each zoom level
iterations = 10
zooms = range(6)
cog_dataset_id, cog_dataset = list(json.loads(open('../01-generate-datasets/cmip6-pgstac/cog-datasets.json').read()).items())[0]
kerchunk_dataset_id, kerchunk_dataset = list(json.loads(open('../01-generate-datasets/cmip6-kerchunk-dataset.json').read()).items())[0]
zarr_datasets = json.loads(open('../01-generate-datasets/cmip6-zarr-datasets.json').read())
filtered_dict = {k: v for k, v in zarr_datasets.items() if '600_1440_1' in k}
zarr_dataset_id, zarr_dataset = list(filtered_dict.items())[0]
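
As a quick sanity check (illustrative, not part of the benchmark), the chunk layout of the selected Zarr store can be inspected with xarray. The "dataset_url" key and the "tas" variable name below are assumptions based on the dataset IDs used in this notebook.

import xarray as xr

# Illustrative check that the selected store uses the chunking implied by its
# "600_1440_1" name; "dataset_url" and "tas" are assumed names.
check_ds = xr.open_zarr(zarr_dataset["dataset_url"], consolidated=True)
print(check_ds["tas"].encoding.get("chunks"))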

Run Tests

COG Tests

# Based on our findings in 01-cog-gdal-tests, we run these tests with set_gdal_vars set to True.
cog_tile_test = CogTileTest(
    dataset_id=cog_dataset_id,
    lat_extent=[-59, 89],
    lon_extent=[-179, 179],
    extra_args={
        'query': cog_dataset['example_query'],
        'set_gdal_vars': True,
        'credentials': credentials
    }
)
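
The exact environment variables applied by set_gdal_vars are defined in the test harness (see 01-cog-gdal-tests). The snippet below is only a representative illustration of GDAL settings commonly used to speed up COG reads over S3/HTTP, not necessarily the exact set used here.

import os

# Representative GDAL settings for remote COG reads (illustrative only).
os.environ.update({
    "GDAL_DISABLE_READDIR_ON_OPEN": "EMPTY_DIR",   # skip sibling-file listings
    "GDAL_HTTP_MERGE_CONSECUTIVE_RANGES": "YES",   # coalesce range requests
    "GDAL_CACHEMAX": "200",                        # raster block cache, MB
    "VSI_CACHE": "TRUE",                           # enable VSI file caching
    "VSI_CACHE_SIZE": "5000000",                   # VSI cache size, bytes
})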

# Run 10 iterations for each zoom level
for zoom in zooms:
    cog_tile_test.run_batch({'zoom': zoom}, batch_size=iterations)

cog_results = cog_tile_test.store_results(credentials)
Caught exception: An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 35.93.112.139/32, TCP, from port: 5432, to port: 5432, ALLOW" already exists
Connected to database
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230907003859_CogTileTest_CMIP6_daily_GISS-E2-1-G_tas.json

Kerchunk Tests

kerchunk_tile_test = XarrayTileTest(
    dataset_id=kerchunk_dataset_id,
    **kerchunk_dataset
)

# Run 10 iterations for each zoom level
for zoom in zooms:
    kerchunk_tile_test.run_batch({'zoom': zoom}, batch_size=iterations)

kerchunk_results = kerchunk_tile_test.store_results(credentials)
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230907003910_XarrayTileTest_cmip6-kerchunk.json
Zarr Tests

zarr_tile_test = XarrayTileTest(
    dataset_id=zarr_dataset_id,
    **zarr_dataset
)

# Run 10 iterations for each zoom level
for zoom in zooms:
    zarr_tile_test.run_batch({'zoom': zoom}, batch_size=iterations)

zarr_results = zarr_tile_test.store_results(credentials)
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230907003922_XarrayTileTest_600_1440_1_CMIP6_daily_GISS-E2-1-G_tas.zarr.json

Read and Plot Results

all_urls = [cog_results, zarr_results, kerchunk_results]
all_df = dataframe_helpers.load_all_into_dataframe(credentials, all_urls)
expanded_df = dataframe_helpers.expand_timings(all_df)
expanded_df['data_format'] = 'Unknown'
# Define the conditions
expanded_df.loc[expanded_df['dataset_id'] == cog_dataset_id, 'data_format'] = 'COG'
expanded_df.loc[expanded_df['dataset_id'] == zarr_dataset_id, 'data_format'] = 'Zarr'
expanded_df.loc[expanded_df['dataset_id'] == kerchunk_dataset_id, 'data_format'] = 'Kerchunk'
cmap = ["#E1BE6A", "#40B0A6", "#5D3A9B"]
plt_opts = {"width": 300, "height": 250}

plts = []

for zoom_level in zooms:
    df_level = expanded_df[expanded_df["zoom"] == zoom_level]
    plts.append(
        df_level.hvplot.box(
            y="time",
            by=["data_format"],
            c="data_format",
            cmap=cmap,
            ylabel="Time to render (ms)",
            xlabel="Data Format",
            legend=False,
            title=f"Zoom level {zoom_level}",
        ).opts(**plt_opts)
    )

hv.Layout(plts).cols(2)
expanded_df.to_csv('results-csvs/02-cog-kerchunk-zarr-results.csv')