Dispersed metadata¶
The primary reason to avoid dispersing metadata throughout a file, or across many files, is that it increases the number and/or size of the requests an application must make to open the file and understand its contents, which slows performance and increases costs.
The cloud native geospatial formats guide provides more details about how metadata is organized in different file formats.
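For example, Zarr can consolidate the metadata of every array in a group into the root of the store, so a reader can discover the full hierarchy with a single request instead of one request per array. As a minimal sketch of the difference (the store path here is hypothetical; use_consolidated is zarr-python's flag for controlling this behavior):
import zarr

# With consolidated metadata: one request to the root fetches the metadata
# of every member of the group.
group = zarr.open_group("example.zarr", use_consolidated=True)

# Without it: each member's metadata document must be fetched separately,
# so the number of requests grows with the number of arrays.
group = zarr.open_group("example.zarr", use_consolidated=False)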
import datacube_benchmark
import zarr
import pandas as pd
import hvplot.pandas # noqa
import warnings
from azure.identity import DefaultAzureCredential
from obstore.auth.azure import AzureCredentialProvider
# Benchmark configuration: ~50 MB target arrays, Azure credentials via
# obstore, and one warmup sample per measurement.
config = datacube_benchmark.Config
config.target_array_size = "50 megabyte"
config.credential_provider = AzureCredentialProvider(
    credential=DefaultAzureCredential()
)
config.warmup_samples = 1
config.create_data = True
# Match Zarr's async concurrency to the benchmark configuration.
zarr.config.set({"async.concurrency": config.zarr_concurrency})
Demonstrating performance inefficiencies of dispersed metadata¶
Create (or reuse) a Blosc-compressed array with consolidated metadata
url_for_consolidated_metadata = "https://datacubeguide.blob.core.windows.net/performance-testing/consolidated-metadata.zarr"
consolidated_store = datacube_benchmark.create_or_open_zarr_store(
    url_for_consolidated_metadata,
    target_chunk_size="25 megabyte",
    config=config,
    consolidated_metadata=True,
)
Create (or reuse) a Blosc-compressed array without consolidated metadata
url_for_unconsolidated_metadata = "https://datacubeguide.blob.core.windows.net/performance-testing/unconsolidated-metadata.zarr"
unconsolidated_store = datacube_benchmark.create_or_open_zarr_store(
    url_for_unconsolidated_metadata,
    target_chunk_size="25 megabyte",
    config=config,
    consolidated_metadata=False,
)
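create_or_open_zarr_store is a helper from the datacube_benchmark package used throughout this guide. Its internals are not shown here, but a rough sketch of the relevant steps, assuming zarr-python's built-in Blosc codec and illustrative array sizes, might look like:
from zarr.codecs import BloscCodec

def create_blosc_array_sketch(store, consolidated: bool):
    # Hypothetical stand-in for create_or_open_zarr_store, for illustration
    # only: write one Blosc-compressed array, then optionally consolidate.
    group = zarr.open_group(store, mode="w")
    arr = group.create_array(
        name="data",
        shape=(2048, 2048),  # illustrative; the real helper sizes arrays
        chunks=(512, 512),   # from target_array_size / target_chunk_size
        dtype="float32",
        compressors=BloscCodec(cname="zstd"),
    )
    arr[:] = 42
    if consolidated:
        zarr.consolidate_metadata(store)
    return group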
Add extra arrays, since consolidated metadata is most impactful for datacubes with many arrays: each additional array otherwise adds another metadata document that must be fetched separately
# Reuse the original array's shape, chunking, dtype, and dimension names.
arr = zarr.open_array(consolidated_store, path="data")
shape = arr.shape
chunks = arr.chunks
dtype = arr.dtype
dimension_names = arr.metadata.dimension_names
n_extra_arrays = 50
for store in [consolidated_store, unconsolidated_store]:
    for n in range(n_extra_arrays):
        arr = zarr.create_array(
            store=store,
            name=f"data_{n}",
            shape=shape,
            chunks=chunks,
            dtype=dtype,
            dimension_names=dimension_names,
        )
        arr[:] = 42
Reconsolidate the metadata in the consolidated store so that the newly added arrays are included
zarr.consolidate_metadata(consolidated_store)
<Group object_store://AzureStore(container_name="performance-testing", account_name="datacubeguide", prefix="performance-testing/consolidated-metadata.zarr")>
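As a quick sanity check (not part of the benchmark), the consolidated entries can be inspected on the group's metadata object; in zarr-python 3 they are written to the root zarr.json under a consolidated_metadata key:
group = zarr.open_group(consolidated_store, use_consolidated=True)
# The consolidated metadata maps each member path to its metadata document.
print(len(group.metadata.consolidated_metadata.metadata))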
Measure the time required to open each Zarr store using Xarray
# Opening the unconsolidated store emits an expected RuntimeWarning;
# suppress it so the benchmark output stays readable.
warnings.filterwarnings(
    "ignore",
    message="Failed to open Zarr store with consolidated metadata, but successfully read with non-consolidated metadata",
    category=RuntimeWarning,
)
unconsolidated_results = datacube_benchmark.benchmark_dataset_open(
    unconsolidated_store,
    num_samples=config.num_samples,
    warmup_samples=config.warmup_samples,
)
consolidated_results = datacube_benchmark.benchmark_dataset_open(
    consolidated_store,
    num_samples=config.num_samples,
    warmup_samples=config.warmup_samples,
)
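benchmark_dataset_open also comes from the datacube_benchmark helpers and is not shown here; a minimal stand-in using only Xarray and the standard library could look like this:
import time
import xarray as xr

def time_dataset_open_sketch(store, num_samples=5):
    # Hypothetical stand-in for benchmark_dataset_open: time how long
    # xr.open_zarr takes to fetch metadata and build the (lazy) Dataset.
    durations = []
    for _ in range(num_samples):
        start = time.perf_counter()
        ds = xr.open_zarr(store)  # lazy open: only metadata is read here
        durations.append(time.perf_counter() - start)
        ds.close()
    return sum(durations) / len(durations)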
df = pd.concat([consolidated_results.T, unconsolidated_results.T])
# Convert the pint quantities in mean_time to plain floats for plotting.
df["mean_time"] = df["mean_time"].apply(lambda t: float(t.magnitude))
# Shorten the store reprs to readable labels.
df["zarr_store"] = df["zarr_store"].replace(
    {
        'object_store://AzureStore(container_name="performance-testing", account_name="datacubeguide", prefix="performance-testing/unconsolidated-metadata.zarr")': "Unconsolidated",
        'object_store://AzureStore(container_name="performance-testing", account_name="datacubeguide", prefix="performance-testing/consolidated-metadata.zarr")': "Consolidated",
    }
)
title = "Duration to open dataset using Xarray"
plt = df.hvplot.bar(
x="zarr_store",
y="mean_time",
width=1000,
rot=45,
title=title,
ylabel="Duration (s)",
xlabel="Metadata structure",
)
plt
Placing all of the metadata in a single location greatly reduces the time required to open the dataset.
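To get the same benefit when producing your own datacubes, consolidate metadata at write time. Xarray does this by default when writing Zarr stores, and the behavior can be requested explicitly:
import xarray as xr

ds = xr.Dataset({"data": (("y", "x"), [[1.0, 2.0], [3.0, 4.0]])})
# consolidated=True asks Zarr to write all metadata to a single location
# after the arrays are written; it is also Xarray's default behavior.
ds.to_zarr("example-output.zarr", mode="w", consolidated=True)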