Dispersed metadata¶
The primary reason to avoid dispersing metadata throughout a file, or across many files, is that it increases the number and/or size of the requests an application must make to open the file and understand its contents, which slows performance and increases costs.
The cloud native geospatial formats guide provides more details about how metadata is organized in different file formats.
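For example, Zarr can consolidate the metadata of every array in a group into the root of the store, so a reader can discover the full hierarchy with a single request instead of one request per array. As a minimal sketch of the difference (the store path here is hypothetical; use_consolidated is zarr-python's flag for controlling this behavior):
import zarr

# With consolidated metadata: one request to the root fetches the metadata
# of every member of the group.
group = zarr.open_group("example.zarr", use_consolidated=True)

# Without it: each member's metadata document must be fetched separately,
# so the number of requests grows with the number of arrays.
group = zarr.open_group("example.zarr", use_consolidated=False)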
import datacube_benchmark
import zarr
import pandas as pd
import hvplot.pandas # noqa
import warnings
from azure.identity import DefaultAzureCredential
from obstore.auth.azure import AzureCredentialProvider
# Benchmark configuration: ~50 MB target arrays, Azure credentials via
# obstore, and one warmup sample per measurement.
config = datacube_benchmark.Config
config.target_array_size = "50 megabyte"
config.credential_provider = AzureCredentialProvider(
    credential=DefaultAzureCredential()
)
config.warmup_samples = 1
config.create_data = True
# Match Zarr's async concurrency to the benchmark configuration.
zarr.config.set({"async.concurrency": config.zarr_concurrency})
Demonstrating performance inefficiencies of dispersed metadata¶
Create (or reuse) a Blosc-compressed array with consolidated metadata
url_for_consolidated_metadata = "https://datacubeguide.blob.core.windows.net/performance-testing/consolidated-metadata.zarr"
consolidated_store = datacube_benchmark.create_or_open_zarr_store(
    url_for_consolidated_metadata,
    target_chunk_size="25 megabyte",
    config=config,
    consolidated_metadata=True,
)
Create (or reuse) a Blosc-compressed array without consolidated metadata
url_for_unconsolidated_metadata = "https://datacubeguide.blob.core.windows.net/performance-testing/unconsolidated-metadata.zarr"
unconsolidated_store = datacube_benchmark.create_or_open_zarr_store(
    url_for_unconsolidated_metadata,
    target_chunk_size="25 megabyte",
    config=config,
    consolidated_metadata=False,
)
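create_or_open_zarr_store is a helper from the datacube_benchmark package used throughout this guide. Its internals are not shown here, but a rough sketch of the relevant steps, assuming zarr-python's built-in Blosc codec and illustrative array sizes, might look like:
from zarr.codecs import BloscCodec

def create_blosc_array_sketch(store, consolidated: bool):
    # Hypothetical stand-in for create_or_open_zarr_store, for illustration
    # only: write one Blosc-compressed array, then optionally consolidate.
    group = zarr.open_group(store, mode="w")
    arr = group.create_array(
        name="data",
        shape=(2048, 2048),  # illustrative; the real helper sizes arrays
        chunks=(512, 512),   # from target_array_size / target_chunk_size
        dtype="float32",
        compressors=BloscCodec(cname="zstd"),
    )
    arr[:] = 42
    if consolidated:
        zarr.consolidate_metadata(store)
    return group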
Add extra arrays, since consolidated metadata is most impactful for datacubes with many arrays: each additional array otherwise adds another metadata document that must be fetched separately
# Reuse the original array's shape, chunking, dtype, and dimension names.
arr = zarr.open_array(consolidated_store, path="data")
shape = arr.shape
chunks = arr.chunks
dtype = arr.dtype
dimension_names = arr.metadata.dimension_names
n_extra_arrays = 50
for store in [consolidated_store, unconsolidated_store]:
    for n in range(n_extra_arrays):
        arr = zarr.create_array(
            store=store,
            name=f"data_{n}",
            shape=shape,
            chunks=chunks,
            dtype=dtype,
            dimension_names=dimension_names,
        )
        arr[:] = 42
Reconsolidate the metadata in the consolidated store so that the newly added arrays are included
zarr.consolidate_metadata(consolidated_store)
<Group object_store://AzureStore(container_name="performance-testing", account_name="datacubeguide", prefix="performance-testing/consolidated-metadata.zarr")>
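As a quick sanity check (not part of the benchmark), the consolidated entries can be inspected on the group's metadata object; in zarr-python 3 they are written to the root zarr.json under a consolidated_metadata key:
group = zarr.open_group(consolidated_store, use_consolidated=True)
# The consolidated metadata maps each member path to its metadata document.
print(len(group.metadata.consolidated_metadata.metadata))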
Measure the time required to open each Zarr store using Xarray
# Opening the unconsolidated store emits an expected RuntimeWarning;
# suppress it so the benchmark output stays readable.
warnings.filterwarnings(
    "ignore",
    message="Failed to open Zarr store with consolidated metadata, but successfully read with non-consolidated metadata",
    category=RuntimeWarning,
)
unconsolidated_results = datacube_benchmark.benchmark_dataset_open(
    unconsolidated_store,
    num_samples=config.num_samples,
    warmup_samples=config.warmup_samples,
)
consolidated_results = datacube_benchmark.benchmark_dataset_open(
    consolidated_store,
    num_samples=config.num_samples,
    warmup_samples=config.warmup_samples,
)
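benchmark_dataset_open also comes from the datacube_benchmark helpers and is not shown here; a minimal stand-in using only Xarray and the standard library could look like this:
import time
import xarray as xr

def time_dataset_open_sketch(store, num_samples=5):
    # Hypothetical stand-in for benchmark_dataset_open: time how long
    # xr.open_zarr takes to fetch metadata and build the (lazy) Dataset.
    durations = []
    for _ in range(num_samples):
        start = time.perf_counter()
        ds = xr.open_zarr(store)  # lazy open: only metadata is read here
        durations.append(time.perf_counter() - start)
        ds.close()
    return sum(durations) / len(durations)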
df = pd.concat([consolidated_results.T, unconsolidated_results.T])
# Convert the pint quantities in mean_time to plain floats for plotting.
df["mean_time"] = df["mean_time"].apply(lambda t: float(t.magnitude))
# Shorten the store reprs to readable labels.
df["zarr_store"] = df["zarr_store"].replace(
    {
        'object_store://AzureStore(container_name="performance-testing", account_name="datacubeguide", prefix="performance-testing/unconsolidated-metadata.zarr")': "Unconsolidated",
        'object_store://AzureStore(container_name="performance-testing", account_name="datacubeguide", prefix="performance-testing/consolidated-metadata.zarr")': "Consolidated",
    }
)
title = "Duration to open dataset using Xarray"
plt = df.hvplot.bar(
x="zarr_store",
y="mean_time",
width=1000,
rot=45,
title=title,
ylabel="Duration (s)",
xlabel="Metadata structure",
)
plt
Placing all of the metadata in a single location greatly reduces the time required to open the dataset.
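To get the same benefit when producing your own datacubes, consolidate metadata at write time. Xarray does this by default when writing Zarr stores, and the behavior can be requested explicitly:
import xarray as xr

ds = xr.Dataset({"data": (("y", "x"), [[1.0, 2.0], [3.0, 4.0]])})
# consolidated=True asks Zarr to write all metadata to a single location
# after the arrays are written; it is also Xarray's default behavior.
ds.to_zarr("example-output.zarr", mode="w", consolidated=True)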