Create Zarr Stores with Different Chunk Shapes

In this notebook, we create Zarr stores for the CMIP6 TAS (near-surface air temperature) daily data available as NetCDF on S3. The method reads the NetCDF files with fsspec and xarray and writes Zarr stores with different chunk shapes directly with to_zarr.

The test datasets produced are three Zarr stores of the same two years of data, each with a different chunk shape:

1. chunks of (time: 29, lat: 600, lon: 1440), optimized for analysis across time
2. chunks of (time: 1, lat: 600, lon: 1440), optimized for visualizing a single time step
3. chunks of (time: 365, lat: 262, lon: 262), optimized for time-series access at a point or small region

1.1 Install and import libraries

%%capture
!pip install loguru
import fsspec
import s3fs
import xarray as xr
import sys; sys.path.append('..')
from helpers.profiler import Timer
import helpers.eodc_hub_role as eodc_hub_role
credentials = eodc_hub_role.fetch_and_set_credentials()
bucket = 'nasa-eodc-data-store'
zarr_directory = 'test-data/cmip6-zarr'

Note: This is adapted from https://github.com/carbonplan/benchmark-maps/blob/datasets/stores/01b_cmip6_netcdf_to_zarr.ipynb.

1.2 Set parameters

# Parameters
model = "GISS-E2-1-G"
variable = "tas"
anon = True
# Initiate fsspec filesystems for reading and writing
s3_path = f"s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/{model}/historical/r1i1p1*/{variable}/*"
fs_read = fsspec.filesystem("s3", anon=anon, skip_instance_cache=False)
fs_write = fsspec.filesystem("")
# Retrieve list of available months
files_paths = fs_read.glob(s3_path)
print(f"{len(files_paths)} discovered from {s3_path}")
65 discovered from s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1*/tas/*
files_paths[0]
'nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950.nc'

1.3 Test we can open the files

fs_s3 = s3fs.S3FileSystem(anon=True)
filepath = f's3://{files_paths[0]}'
f = fs_s3.open(filepath, mode='rb')
ds = xr.open_dataset(f)
ds
<xarray.Dataset>
Dimensions:  (time: 365, lat: 600, lon: 1440)
Coordinates:
  * time     (time) object 1950-01-01 12:00:00 ... 1950-12-31 12:00:00
  * lat      (lat) float64 -59.88 -59.62 -59.38 -59.12 ... 89.38 89.62 89.88
  * lon      (lon) float64 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9
Data variables:
    tas      (time, lat, lon) float32 ...
Attributes: (12/23)
    downscalingModel:      BCSD
    activity:              NEX-GDDP-CMIP6
    contact:               Dr. Rama Nemani: [email protected], Dr. Bridget...
    Conventions:           CF-1.7
    creation_date:         2021-10-04T18:41:40.796912+00:00
    frequency:             day
    ...                    ...
    history:               2021-10-04T18:41:40.796912+00:00: install global a...
    disclaimer:            This data is considered provisional and subject to...
    external_variables:    areacella
    cmip6_source_id:       GISS-E2-1-G
    cmip6_institution_id:  NASA-GISS
    cmip6_license:         CC-BY-SA 4.0
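Before picking target chunks, it can be useful to look at how the source NetCDF is chunked on disk. xarray records this in the variable's encoding; the exact keys vary by backend engine, so .get() is used to avoid a KeyError. A quick check on the dataset just opened:

# Native on-disk chunking of the source variable, as recorded by the backend;
# None means the variable is stored contiguously.
print(ds[variable].encoding.get('chunksizes'))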

2: Set up the destination

s3_fs = s3fs.S3FileSystem(
    key=credentials['aws_access_key_id'],
    secret=credentials['aws_secret_access_key'],
    token=credentials['aws_session_token'], 
    anon=False
)
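Optionally, we can sanity-check that these credentials can reach the destination prefix before starting a long write. A minimal sketch; the prefix may not exist yet on a first run:

try:
    # Any response (even an empty listing) confirms the credentials work
    print(s3_fs.ls(f"{bucket}/{zarr_directory}"))
except FileNotFoundError:
    # A missing prefix is fine on a first run; it is created on write
    print("Destination prefix does not exist yet; it will be created on write.")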

3: Set different target chunks

For each set of chunks, generate a Zarr store in the destination bucket.

chunk_sets = []
# Optimized for analysis
temporal_target_chunks = { 'lat': ds.lat.shape[0], 'lon': ds.lon.shape[0], 'time': 29 }
chunk_sets.append(temporal_target_chunks)
# Optimized for visualization at a single time step
global_target_chunks = { 'lat': ds.lat.shape[0], 'lon': ds.lon.shape[0], 'time': 1 }
chunk_sets.append(global_target_chunks)
# Optimized for time series
# The carbonplan notebook this is adapted from computes these with its
# calc_auspicious_chunks_dict helper:
# spatial_target_chunks = calc_auspicious_chunks_dict(ds[variable], chunk_dims=('lat','lon',))
spatial_target_chunks = {'time': 365, 'lat': 262, 'lon': 262}
chunk_sets.append(spatial_target_chunks)
chunk_sets
[{'lat': 600, 'lon': 1440, 'time': 29},
 {'lat': 600, 'lon': 1440, 'time': 1},
 {'time': 365, 'lat': 262, 'lon': 262}]
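As a sanity check on these shapes, we can estimate the uncompressed size of a single chunk; tas is float32, so each value is 4 bytes. A minimal sketch:

import numpy as np
# Uncompressed bytes per chunk = product of chunk lengths x 4 (float32)
for chunks in chunk_sets:
    size_mb = np.prod(list(chunks.values())) * 4 / 1e6
    print(chunks, f"~{size_mb:.1f} MB per chunk")

The analysis and time-series layouts both land near 100 MB per uncompressed chunk, a commonly recommended size for object storage, while the single-time-step layout is about 3.5 MB per chunk.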
timings = {}
# Open the first two remote files as file-like objects for xarray
fileset = [s3_fs.open(file) for file in files_paths[0:2]]
for chunk_set in chunk_sets:
    chunk_prefix = "_".join(map(str, chunk_set.values()))
    store_name = f"{zarr_directory}/{chunk_prefix}_CMIP6_daily_{model}_{variable}.zarr"
    with Timer() as t:
        # Lazily combine the files along time, rechunk, and write to S3
        data = xr.open_mfdataset(fileset, combine='by_coords')
        data_chunked = data.chunk(chunk_set)
        store = s3fs.S3Map(root=f"{bucket}/{store_name}", s3=s3_fs, check=False)
        data_chunked.to_zarr(store, mode='w')
    timings[chunk_prefix] = round(t.elapsed * 1000, 2)
timings
{'600_1440_29': 68548.07, '600_1440_1': 46144.38, '365_262_262': 33111.03}
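For scale, two years of daily tas at this resolution is about 2.5 GB uncompressed (730 × 600 × 1440 values × 4 bytes), so the timings above (in milliseconds) imply rough end-to-end rates of tens of MB/s:

uncompressed_bytes = 730 * 600 * 1440 * 4  # two years of float32 values
for prefix, ms in timings.items():
    # Implied end-to-end rate, ignoring compression and overlapping I/O
    print(prefix, f"~{uncompressed_bytes / (ms / 1000) / 1e6:.0f} MB/s")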

4: Check it worked

for chunk_set in chunk_sets:
    chunk_prefix = "_".join(map(str, chunk_set.values()))
    store_name = f"{zarr_directory}/{chunk_prefix}_CMIP6_daily_{model}_{variable}.zarr"    
    store = s3fs.S3Map(root=f"{bucket}/{store_name}", s3=s3_fs, check=True)
    ds = xr.open_zarr(store, consolidated=True)
    display(ds)
<xarray.Dataset>
Dimensions:  (lat: 600, lon: 1440, time: 730)
Coordinates:
  * lat      (lat) float64 -59.88 -59.62 -59.38 -59.12 ... 89.38 89.62 89.88
  * lon      (lon) float64 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9
  * time     (time) object 1950-01-01 12:00:00 ... 1951-12-31 12:00:00
Data variables:
    tas      (time, lat, lon) float32 dask.array<chunksize=(29, 600, 1440), meta=np.ndarray>
Attributes: (12/23)
    Conventions:           CF-1.7
    activity:              NEX-GDDP-CMIP6
    cmip6_institution_id:  NASA-GISS
    cmip6_license:         CC-BY-SA 4.0
    cmip6_source_id:       GISS-E2-1-G
    contact:               Dr. Rama Nemani: [email protected], Dr. Bridget...
    ...                    ...
    scenario:              historical
    source:                BCSD
    title:                 GISS-E2-1-G, r1i1p1f2, historical, global downscal...
    tracking_id:           25d6baa3-0404-4eba-a3f1-afddbf69d4cc
    variant_label:         r1i1p1f2
    version:               1.0
<xarray.Dataset>
Dimensions:  (lat: 600, lon: 1440, time: 730)
Coordinates:
  * lat      (lat) float64 -59.88 -59.62 -59.38 -59.12 ... 89.38 89.62 89.88
  * lon      (lon) float64 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9
  * time     (time) object 1950-01-01 12:00:00 ... 1951-12-31 12:00:00
Data variables:
    tas      (time, lat, lon) float32 dask.array<chunksize=(1, 600, 1440), meta=np.ndarray>
Attributes: (12/23)
    Conventions:           CF-1.7
    activity:              NEX-GDDP-CMIP6
    cmip6_institution_id:  NASA-GISS
    cmip6_license:         CC-BY-SA 4.0
    cmip6_source_id:       GISS-E2-1-G
    contact:               Dr. Rama Nemani: [email protected], Dr. Bridget...
    ...                    ...
    scenario:              historical
    source:                BCSD
    title:                 GISS-E2-1-G, r1i1p1f2, historical, global downscal...
    tracking_id:           25d6baa3-0404-4eba-a3f1-afddbf69d4cc
    variant_label:         r1i1p1f2
    version:               1.0
<xarray.Dataset>
Dimensions:  (lat: 600, lon: 1440, time: 730)
Coordinates:
  * lat      (lat) float64 -59.88 -59.62 -59.38 -59.12 ... 89.38 89.62 89.88
  * lon      (lon) float64 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9
  * time     (time) object 1950-01-01 12:00:00 ... 1951-12-31 12:00:00
Data variables:
    tas      (time, lat, lon) float32 dask.array<chunksize=(365, 262, 262), meta=np.ndarray>
Attributes: (12/23)
    Conventions:           CF-1.7
    activity:              NEX-GDDP-CMIP6
    cmip6_institution_id:  NASA-GISS
    cmip6_license:         CC-BY-SA 4.0
    cmip6_source_id:       GISS-E2-1-G
    contact:               Dr. Rama Nemani: [email protected], Dr. Bridget...
    ...                    ...
    scenario:              historical
    source:                BCSD
    title:                 GISS-E2-1-G, r1i1p1f2, historical, global downscal...
    tracking_id:           25d6baa3-0404-4eba-a3f1-afddbf69d4cc
    variant_label:         r1i1p1f2
    version:               1.0
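Beyond eyeballing the reprs, a lightweight assertion can confirm each store round-tripped with the intended chunk shape. A sketch using the Zarr chunking that xarray records in the variable's encoding when opening a Zarr store:

for chunk_set in chunk_sets:
    chunk_prefix = "_".join(map(str, chunk_set.values()))
    store_name = f"{zarr_directory}/{chunk_prefix}_CMIP6_daily_{model}_{variable}.zarr"
    store = s3fs.S3Map(root=f"{bucket}/{store_name}", s3=s3_fs, check=True)
    ds = xr.open_zarr(store, consolidated=True)
    # Reorder the requested chunk dict to match the on-disk dimension order
    expected = tuple(chunk_set[dim] for dim in ds[variable].dims)
    assert tuple(ds[variable].encoding['chunks']) == expected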
# Write the dataset metadata to a JSON file
import json
datasets = {}
for chunk_set in chunk_sets:
    chunk_prefix = "_".join(map(str, chunk_set.values()))
    dataset_id = f"{chunk_prefix}_CMIP6_daily_{model}_{variable}.zarr"
    dataset_url = f"s3://{bucket}/{zarr_directory}/{dataset_id}"
    datasets[dataset_id] = {
        "dataset_url": dataset_url,
        "variable": variable
    }

with open("cmip6-zarr-datasets.json", "w") as f:
    json.dump(datasets, f)
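How the manifest is consumed downstream is outside this notebook, but as a quick check that the file is well-formed, it can be read back:

# Read the manifest back and list the dataset IDs it records
with open("cmip6-zarr-datasets.json") as f:
    manifest = json.load(f)
print(list(manifest.keys()))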