Generate Kerchunk Reference from CMIP6 NetCDF files

This notebook demonstrates how to create a kerchunk reference from NetCDF files on S3.

from tempfile import TemporaryDirectory
import boto3
import fsspec
import json
import os
import ujson
import xarray as xr
from kerchunk.combine import MultiZarrToZarr
from kerchunk.hdf import SingleHdf5ToZarr
from typing import Dict
import sys; sys.path.append('..')
import helpers.eodc_hub_role as eodc_hub_role
# Fetch temporary AWS credentials via the EODC hub role helper (used later for the boto3 upload).
credentials = eodc_hub_role.fetch_and_set_credentials()
# Configuration: CMIP6 model and variable to subset, plus the destination bucket for the result.
bucket_name = 'nasa-eodc-data-store'
model = "GISS-E2-1-G"
variable = "tas"
# The NEX-GDDP-CMIP6 bucket is public, so anonymous access suffices for reads.
anon = True
# Glob pattern over the historical r1i1p1* variant directories for the chosen variable.
s3_path = f"s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/{model}/historical/r1i1p1*/{variable}/*"
# Initiate fsspec filesystems for reading (S3) and writing (local)
fs_read = fsspec.filesystem("s3", anon=anon, skip_instance_cache=False)
fs_write = fsspec.filesystem("")
# Retrieve the list of NetCDF object paths matching the glob pattern
files_paths = fs_read.glob(s3_path)
print(f"{len(files_paths)} discovered from {s3_path}")
65 discovered from s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1*/tas/*
# Keep only the 1950 and 1951 files, prefixing the s3:// scheme so that
# downstream opens resolve through the s3 protocol.
subset_files = sorted(
    "s3://" + path
    for path in files_paths
    if "1950.nc" in path or "1951.nc" in path
)
print(f"{len(subset_files)} file paths were retrieved.")
subset_files
2 file paths were retrieved.
['s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950.nc',
 's3://nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1951.nc']
# fsspec open kwargs for the source objects: read-only binary access with the
# "first"-block cache, which suits reading HDF5 metadata headers.
so = {
    "mode": "rb",
    "anon": anon,
    "default_fill_cache": False,
    "default_cache_type": "first",
}
# Single-file .json reference files are staged in a temporary directory.
# Alternately, you could write these to cloud storage.
td = TemporaryDirectory()
temp_dir = td.name
print(f"Writing single file references to {temp_dir}")
Writing single file references to /tmp/tmp3pz0iio1
# Use Kerchunk's `SingleHdf5ToZarr` class to create a Kerchunk reference from a NetCDF file.
def generate_json_reference(u, temp_dir: str):
    """Create a single-file Kerchunk reference for one NetCDF object.

    Parameters
    ----------
    u : str
        Fully qualified s3:// URL of the NetCDF file.
    temp_dir : str
        Local directory in which to write the reference JSON.

    Returns
    -------
    str
        Path of the JSON reference file that was written.
    """
    with fs_read.open(u, **so) as infile:
        # Scan the HDF5 file and build the Zarr-style reference set; chunks
        # smaller than 300 bytes are inlined directly into the JSON.
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        # Derive the output name from the object's basename. Fix: the original
        # used str.strip(".nc"), which strips the CHARACTER SET {'.', 'n', 'c'}
        # from both ends of the name (not the suffix) and can mangle filenames
        # such as "conc.nc"; os.path.splitext removes only the extension.
        fname = os.path.splitext(u.split("/")[-1])[0]
        outf = os.path.join(temp_dir, f"{fname}.json")
        with open(outf, "wb") as f:
            f.write(ujson.dumps(h5chunks.translate()).encode())
        return outf
# Iterate through the file list to generate a Kerchunk reference per file. This is a good use case for `Dask` parallelization.
# Build a single-file Kerchunk reference for each NetCDF object.
output_files = [
    generate_json_reference(path, temp_dir) for path in subset_files
]
# combine individual references into single consolidated reference
# Merge the per-file references into one consolidated reference, concatenating
# along the time dimension (coordinates decoded via CF conventions, "cf:time").
combine_opts = dict(
    remote_protocol="s3",
    remote_options={"anon": anon},
    concat_dims=["time"],
    coo_map={"time": "cf:time"},
    inline_threshold=0,
)
mzz = MultiZarrToZarr(output_files, **combine_opts)
multi_kerchunk = mzz.translate()
# Write kerchunk .json record
# Persist the consolidated Kerchunk reference as a single JSON file.
output_fname = f"combined_CMIP6_daily_{model}_{variable}_kerchunk.json"
output_location = os.path.join(temp_dir, output_fname)
print(f"Writing combined kerchunk reference file {output_location}")
with open(output_location, "wb") as f:
    f.write(ujson.dumps(multi_kerchunk).encode())
Writing combined kerchunk reference file /tmp/tmplrgw8zf1/combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk.json
# open dataset as zarr object using fsspec reference file system and Xarray
# Open the consolidated reference as a Zarr store through fsspec's
# "reference" filesystem, then load it with Xarray to sanity-check the data.
fs = fsspec.filesystem(
    "reference",
    fo=multi_kerchunk,
    remote_protocol="s3",
    remote_options={"anon": anon},
)
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", backend_kwargs={"consolidated": False})
print(ds)
<xarray.Dataset>
Dimensions:  (lat: 600, lon: 1440, time: 730)
Coordinates:
  * lat      (lat) float64 -59.88 -59.62 -59.38 -59.12 ... 89.38 89.62 89.88
  * lon      (lon) float64 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9
  * time     (time) object 1950-01-01 12:00:00 ... 1951-12-31 12:00:00
Data variables:
    tas      (time, lat, lon) float32 ...
Attributes: (12/23)
    Conventions:           CF-1.7
    activity:              NEX-GDDP-CMIP6
    cmip6_institution_id:  NASA-GISS
    cmip6_license:         CC-BY-SA 4.0
    cmip6_source_id:       GISS-E2-1-G
    contact:               Dr. Rama Nemani: [email protected], Dr. Bridget...
    ...                    ...
    scenario:              historical
    source:                BCSD
    title:                 GISS-E2-1-G, r1i1p1f2, historical, global downscal...
    tracking_id:           25d6baa3-0404-4eba-a3f1-afddbf69d4cc
    variant_label:         r1i1p1f2
    version:               1.0
# Upload the combined reference to the EODC bucket using the temporary
# credentials fetched at the top of the notebook.
aws_keys = ("aws_access_key_id", "aws_secret_access_key", "aws_session_token")
s3 = boto3.client("s3", **{key: credentials[key] for key in aws_keys})
# upload_file returns None on success and raises on failure.
response = s3.upload_file(
    output_location, bucket_name, f"test-data/cmip6-kerchunk/{output_fname}"
)
print(f"Response uploading {output_fname} to {bucket_name} was {response}.")
Response uploading combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk.json to nasa-eodc-data-store was None.
# Record where the Kerchunk dataset landed so later notebooks can load it.
kdict = {
    "cmip6-kerchunk": {
        "dataset_url": f"s3://{bucket_name}/test-data/cmip6-kerchunk/{output_fname}",
        "variable": variable,
        # Signals downstream tooling to open the URL as a Kerchunk reference.
        "extra_args": {"reference": True},
    }
}
# Fix: the original called f.close() inside the `with` block, which is
# redundant — the context manager already closes the file on exit.
# json.dump streams directly to the file handle.
with open("cmip6-kerchunk-dataset.json", "w") as f:
    json.dump(kdict, f)