Create Zarr Stores with Different Chunk Shapes

In this notebook, we create Zarr stores for the CMIP6 TAS (near-surface air temperature) daily data available as NetCDF on S3. The method reads the NetCDF files with fsspec and xarray and writes Zarr stores with different chunk shapes directly with to_zarr.

The test datasets produced are three Zarr stores of the same two years of data, each with a different chunk shape:

1. chunks of (time: 29, lat: 600, lon: 1440), optimized for analysis across time
2. chunks of (time: 1, lat: 600, lon: 1440), optimized for visualizing a single time step
3. chunks of (time: 365, lat: 262, lon: 262), optimized for time-series access at a point or small region

1.1 Install and import libraries

%%capture
!pip install loguru
import fsspec
import s3fs
import xarray as xr
import sys; sys.path.append('..')
from helpers.profiler import Timer
import helpers.eodc_hub_role as eodc_hub_role
credentials = eodc_hub_role.fetch_and_set_credentials()
bucket = 'nasa-eodc-data-store'
zarr_directory = 'test-data/cmip6-zarr'

Note: This is adapted from https://github.com/carbonplan/benchmark-maps/blob/datasets/stores/01b_cmip6_netcdf_to_zarr.ipynb.

1.2 Set parameters

# Parameters
model = "GISS-E2-1-G"
variable = "tas"
anon = True
# Initiate fsspec filesystems for reading and writing
s3_path = f"s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/{model}/historical/r1i1p1*/{variable}/*"
fs_read = fsspec.filesystem("s3", anon=anon, skip_instance_cache=False)
fs_write = fsspec.filesystem("")
# Retrieve list of available months
files_paths = fs_read.glob(s3_path)
print(f"{len(files_paths)} discovered from {s3_path}")
65 discovered from s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1*/tas/*
files_paths[0]
'nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950.nc'

1.3 Test we can open the files

fs_s3 = s3fs.S3FileSystem(anon=True)
filepath = f's3://{files_paths[0]}'
f = fs_s3.open(filepath, mode='rb')
ds = xr.open_dataset(f)
ds
<xarray.Dataset>
Dimensions:  (time: 365, lat: 600, lon: 1440)
Coordinates:
  * time     (time) object 1950-01-01 12:00:00 ... 1950-12-31 12:00:00
  * lat      (lat) float64 -59.88 -59.62 -59.38 -59.12 ... 89.38 89.62 89.88
  * lon      (lon) float64 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9
Data variables:
    tas      (time, lat, lon) float32 ...
Attributes: (12/23)
    downscalingModel:      BCSD
    activity:              NEX-GDDP-CMIP6
    contact:               Dr. Rama Nemani: [email protected], Dr. Bridget...
    Conventions:           CF-1.7
    creation_date:         2021-10-04T18:41:40.796912+00:00
    frequency:             day
    ...                    ...
    history:               2021-10-04T18:41:40.796912+00:00: install global a...
    disclaimer:            This data is considered provisional and subject to...
    external_variables:    areacella
    cmip6_source_id:       GISS-E2-1-G
    cmip6_institution_id:  NASA-GISS
    cmip6_license:         CC-BY-SA 4.0
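Before picking target chunks, it can be useful to look at how the source NetCDF is chunked on disk. xarray records this in the variable's encoding; the exact keys vary by backend engine, so .get() is used to avoid a KeyError. A quick check on the dataset just opened:

# Native on-disk chunking of the source variable, as recorded by the backend;
# None means the variable is stored contiguously.
print(ds[variable].encoding.get('chunksizes'))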

2: Set up the destination

s3_fs = s3fs.S3FileSystem(
    key=credentials['aws_access_key_id'],
    secret=credentials['aws_secret_access_key'],
    token=credentials['aws_session_token'], 
    anon=False
)
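Optionally, we can sanity-check that these credentials can reach the destination prefix before starting a long write. A minimal sketch; the prefix may not exist yet on a first run:

try:
    # Any response (even an empty listing) confirms the credentials work
    print(s3_fs.ls(f"{bucket}/{zarr_directory}"))
except FileNotFoundError:
    # A missing prefix is fine on a first run; it is created on write
    print("Destination prefix does not exist yet; it will be created on write.")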

3: Set different target chunks

For each set of chunks, generate a Zarr store in the destination bucket.

chunk_sets = []
# Optimized for analysis
temporal_target_chunks = { 'lat': ds.lat.shape[0], 'lon': ds.lon.shape[0], 'time': 29 }
chunk_sets.append(temporal_target_chunks)
# Optimized for visualization at a single time step
global_target_chunks = { 'lat': ds.lat.shape[0], 'lon': ds.lon.shape[0], 'time': 1 }
chunk_sets.append(global_target_chunks)
# Optimized for time series
# The carbonplan notebook this is adapted from computes these with its
# calc_auspicious_chunks_dict helper:
# spatial_target_chunks = calc_auspicious_chunks_dict(ds[variable], chunk_dims=('lat','lon',))
spatial_target_chunks = {'time': 365, 'lat': 262, 'lon': 262}
chunk_sets.append(spatial_target_chunks)
chunk_sets
[{'lat': 600, 'lon': 1440, 'time': 29},
 {'lat': 600, 'lon': 1440, 'time': 1},
 {'time': 365, 'lat': 262, 'lon': 262}]
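As a sanity check on these shapes, we can estimate the uncompressed size of a single chunk; tas is float32, so each value is 4 bytes. A minimal sketch:

import numpy as np
# Uncompressed bytes per chunk = product of chunk lengths x 4 (float32)
for chunks in chunk_sets:
    size_mb = np.prod(list(chunks.values())) * 4 / 1e6
    print(chunks, f"~{size_mb:.1f} MB per chunk")

The analysis and time-series layouts both land near 100 MB per uncompressed chunk, a commonly recommended size for object storage, while the single-time-step layout is about 3.5 MB per chunk.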
timings = {}
# Open the first two remote files as file-like objects for xarray
fileset = [s3_fs.open(file) for file in files_paths[0:2]]
for chunk_set in chunk_sets:
    chunk_prefix = "_".join(map(str, chunk_set.values()))
    store_name = f"{zarr_directory}/{chunk_prefix}_CMIP6_daily_{model}_{variable}.zarr"
    with Timer() as t:
        # Lazily combine the files along time, rechunk, and write to S3
        data = xr.open_mfdataset(fileset, combine='by_coords')
        data_chunked = data.chunk(chunk_set)
        store = s3fs.S3Map(root=f"{bucket}/{store_name}", s3=s3_fs, check=False)
        data_chunked.to_zarr(store, mode='w')
    timings[chunk_prefix] = round(t.elapsed * 1000, 2)
timings
{'600_1440_29': 68548.07, '600_1440_1': 46144.38, '365_262_262': 33111.03}
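For scale, two years of daily tas at this resolution is about 2.5 GB uncompressed (730 × 600 × 1440 values × 4 bytes), so the timings above (in milliseconds) imply rough end-to-end rates of tens of MB/s:

uncompressed_bytes = 730 * 600 * 1440 * 4  # two years of float32 values
for prefix, ms in timings.items():
    # Implied end-to-end rate, ignoring compression and overlapping I/O
    print(prefix, f"~{uncompressed_bytes / (ms / 1000) / 1e6:.0f} MB/s")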

4: Check it worked

for chunk_set in chunk_sets:
    chunk_prefix = "_".join(map(str, chunk_set.values()))
    store_name = f"{zarr_directory}/{chunk_prefix}_CMIP6_daily_{model}_{variable}.zarr"    
    store = s3fs.S3Map(root=f"{bucket}/{store_name}", s3=s3_fs, check=True)
    ds = xr.open_zarr(store, consolidated=True)
    display(ds)
<xarray.Dataset>
Dimensions:  (lat: 600, lon: 1440, time: 730)
Coordinates:
  * lat      (lat) float64 -59.88 -59.62 -59.38 -59.12 ... 89.38 89.62 89.88
  * lon      (lon) float64 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9
  * time     (time) object 1950-01-01 12:00:00 ... 1951-12-31 12:00:00
Data variables:
    tas      (time, lat, lon) float32 dask.array<chunksize=(29, 600, 1440), meta=np.ndarray>
Attributes: (12/23)
    Conventions:           CF-1.7
    activity:              NEX-GDDP-CMIP6
    cmip6_institution_id:  NASA-GISS
    cmip6_license:         CC-BY-SA 4.0
    cmip6_source_id:       GISS-E2-1-G
    contact:               Dr. Rama Nemani: [email protected], Dr. Bridget...
    ...                    ...
    scenario:              historical
    source:                BCSD
    title:                 GISS-E2-1-G, r1i1p1f2, historical, global downscal...
    tracking_id:           25d6baa3-0404-4eba-a3f1-afddbf69d4cc
    variant_label:         r1i1p1f2
    version:               1.0
<xarray.Dataset>
Dimensions:  (lat: 600, lon: 1440, time: 730)
Coordinates:
  * lat      (lat) float64 -59.88 -59.62 -59.38 -59.12 ... 89.38 89.62 89.88
  * lon      (lon) float64 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9
  * time     (time) object 1950-01-01 12:00:00 ... 1951-12-31 12:00:00
Data variables:
    tas      (time, lat, lon) float32 dask.array<chunksize=(1, 600, 1440), meta=np.ndarray>
Attributes: (12/23)
    Conventions:           CF-1.7
    activity:              NEX-GDDP-CMIP6
    cmip6_institution_id:  NASA-GISS
    cmip6_license:         CC-BY-SA 4.0
    cmip6_source_id:       GISS-E2-1-G
    contact:               Dr. Rama Nemani: [email protected], Dr. Bridget...
    ...                    ...
    scenario:              historical
    source:                BCSD
    title:                 GISS-E2-1-G, r1i1p1f2, historical, global downscal...
    tracking_id:           25d6baa3-0404-4eba-a3f1-afddbf69d4cc
    variant_label:         r1i1p1f2
    version:               1.0
<xarray.Dataset>
Dimensions:  (lat: 600, lon: 1440, time: 730)
Coordinates:
  * lat      (lat) float64 -59.88 -59.62 -59.38 -59.12 ... 89.38 89.62 89.88
  * lon      (lon) float64 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9
  * time     (time) object 1950-01-01 12:00:00 ... 1951-12-31 12:00:00
Data variables:
    tas      (time, lat, lon) float32 dask.array<chunksize=(365, 262, 262), meta=np.ndarray>
Attributes: (12/23)
    Conventions:           CF-1.7
    activity:              NEX-GDDP-CMIP6
    cmip6_institution_id:  NASA-GISS
    cmip6_license:         CC-BY-SA 4.0
    cmip6_source_id:       GISS-E2-1-G
    contact:               Dr. Rama Nemani: [email protected], Dr. Bridget...
    ...                    ...
    scenario:              historical
    source:                BCSD
    title:                 GISS-E2-1-G, r1i1p1f2, historical, global downscal...
    tracking_id:           25d6baa3-0404-4eba-a3f1-afddbf69d4cc
    variant_label:         r1i1p1f2
    version:               1.0
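Beyond eyeballing the reprs, a lightweight assertion can confirm each store round-tripped with the intended chunk shape. A sketch using the Zarr chunking that xarray records in the variable's encoding when opening a Zarr store:

for chunk_set in chunk_sets:
    chunk_prefix = "_".join(map(str, chunk_set.values()))
    store_name = f"{zarr_directory}/{chunk_prefix}_CMIP6_daily_{model}_{variable}.zarr"
    store = s3fs.S3Map(root=f"{bucket}/{store_name}", s3=s3_fs, check=True)
    ds = xr.open_zarr(store, consolidated=True)
    # Reorder the requested chunk dict to match the on-disk dimension order
    expected = tuple(chunk_set[dim] for dim in ds[variable].dims)
    assert tuple(ds[variable].encoding['chunks']) == expected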
# Write the dataset metadata to a JSON file
import json
datasets = {}
for chunk_set in chunk_sets:
    chunk_prefix = "_".join(map(str, chunk_set.values()))
    dataset_id = f"{chunk_prefix}_CMIP6_daily_{model}_{variable}.zarr"
    dataset_url = f"s3://{bucket}/{zarr_directory}/{dataset_id}"
    datasets[dataset_id] = {
        "dataset_url": dataset_url,
        "variable": variable
    }

with open("cmip6-zarr-datasets.json", "w") as f:
    json.dump(datasets, f)
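How the manifest is consumed downstream is outside this notebook, but as a quick check that the file is well-formed, it can be read back:

# Read the manifest back and list the dataset IDs it records
with open("cmip6-zarr-datasets.json") as f:
    manifest = json.load(f)
print(list(manifest.keys()))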