Generate Kerchunk Reference from CMIP6 NetCDF files

This notebook demonstrates how to create a Kerchunk reference from NetCDF files on S3.

from tempfile import TemporaryDirectory

import boto3
import fsspec
import json
import os
import ujson
import xarray as xr
from kerchunk.combine import MultiZarrToZarr
from kerchunk.hdf import SingleHdf5ToZarr

import sys; sys.path.append('..')
import helpers.eodc_hub_role as eodc_hub_role

credentials = eodc_hub_role.fetch_and_set_credentials()
# Specify the CMIP6 collection to use (daily or monthly)
bucket_name = 'nasa-eodc-data-store'
model = "GISS-E2-1-G"
variable = "tas"
anon = True
s3_path = f"s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/{model}/historical/r1i1p1*/{variable}/*"
# Initiate fsspec filesystems for reading and writing
fs_read = fsspec.filesystem("s3", anon=anon, skip_instance_cache=False)
fs_write = fsspec.filesystem("")
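As a quick, optional sanity check (not part of the original notebook), the anonymous read filesystem can be exercised directly before globbing; this is a minimal sketch assuming the NEX-GDDP-CMIP6 bucket remains publicly listable:

# List a few entries under the CMIP6 prefix to confirm anonymous access works
print(fs_read.ls("s3://nex-gddp-cmip6/NEX-GDDP-CMIP6")[:3])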
# Retrieve the list of available files
files_paths = fs_read.glob(s3_path)
print(f"{len(files_paths)} discovered from {s3_path}")

65 discovered from s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1*/tas/*
subset_files = sorted(["s3://" + f for f in files_paths if "1950.nc" in f or "1951.nc" in f])
print(f"{len(subset_files)} file paths were retrieved.")
subset_files

2 file paths were retrieved.

['s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950.nc',
 's3://nex-gddp-cmip6/NEX-GDDP-CMIP6/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1951.nc']
# Define fsspec storage options for reading the NetCDF files
so = dict(mode="rb", anon=anon, default_fill_cache=False, default_cache_type="first")
# We are creating a temporary directory to store the .json reference files.
# Alternatively, you could write these to cloud storage.
td = TemporaryDirectory()
temp_dir = td.name
print(f"Writing single file references to {temp_dir}")

Writing single file references to /tmp/tmp3pz0iio1
# Use Kerchunk's `SingleHdf5ToZarr` method to create a Kerchunk index from a NetCDF file.
def generate_json_reference(u, temp_dir: str):
    with fs_read.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        # removesuffix (rather than .strip, which removes characters) safely drops the extension
        fname = u.split("/")[-1].removesuffix(".nc")
        outf = os.path.join(temp_dir, f"{fname}.json")
        with open(outf, "wb") as f:
            f.write(ujson.dumps(h5chunks.translate()).encode())
        return outf
# Iterate through the file list to generate a Kerchunk reference for each file.
# This is a good candidate for `Dask` parallelization (see the sketch below).
output_files = []
for single_file in subset_files:
    out_file = generate_json_reference(single_file, temp_dir)
    output_files.append(out_file)
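The loop above runs serially. As a hedged sketch of the Dask approach mentioned in the comment (assuming `dask` is installed in the environment), each reference generation can be wrapped in a delayed task and computed in parallel:

import dask

# One lazy task per input file; nothing executes until dask.compute is called
tasks = [dask.delayed(generate_json_reference)(f, temp_dir) for f in subset_files]

# Run the tasks (in parallel where the scheduler allows) and collect output paths
output_files = list(dask.compute(*tasks))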
# Combine individual references into a single consolidated reference
mzz = MultiZarrToZarr(
    output_files,
    remote_protocol='s3',
    remote_options={'anon': anon},
    concat_dims=['time'],
    coo_map={"time": "cf:time"},
    inline_threshold=0
)

multi_kerchunk = mzz.translate()
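Before writing it out, the combined reference can be sanity-checked in memory. A minimal sketch, assuming the standard Kerchunk version-1 layout in which translate() returns a dict with a "refs" mapping from Zarr keys to inline values or byte ranges:

# Each "refs" entry is either inlined data or a [url, offset, length]
# byte range into one of the source NetCDF files.
print(len(multi_kerchunk["refs"]), "keys in the combined reference")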
# Write the combined kerchunk .json record
output_fname = f"combined_CMIP6_daily_{model}_{variable}_kerchunk.json"
output_location = os.path.join(temp_dir, output_fname)
with open(output_location, "wb") as f:
    print(f"Writing combined kerchunk reference file {output_location}")
    f.write(ujson.dumps(multi_kerchunk).encode())

Writing combined kerchunk reference file /tmp/tmplrgw8zf1/combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk.json
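One appeal of Kerchunk is that the reference stores only metadata and byte ranges, not the data itself, so it is small relative to the NetCDF files it indexes. A quick, optional check (not in the original notebook):

# Report the on-disk size of the combined reference file
print(f"{os.path.getsize(output_location) / 1e6:.2f} MB reference on disk")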
# Open the dataset as a Zarr object using the fsspec reference file system and Xarray
fs = fsspec.filesystem(
    "reference", fo=multi_kerchunk, remote_protocol="s3", remote_options={"anon": anon}
)
m = fs.get_mapper("")

# Check the data
ds = xr.open_dataset(m, engine="zarr", backend_kwargs=dict(consolidated=False))
print(ds)
<xarray.Dataset>
Dimensions: (lat: 600, lon: 1440, time: 730)
Coordinates:
* lat (lat) float64 -59.88 -59.62 -59.38 -59.12 ... 89.38 89.62 89.88
* lon (lon) float64 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9
* time (time) object 1950-01-01 12:00:00 ... 1951-12-31 12:00:00
Data variables:
tas (time, lat, lon) float32 ...
Attributes: (12/23)
Conventions: CF-1.7
activity: NEX-GDDP-CMIP6
cmip6_institution_id: NASA-GISS
cmip6_license: CC-BY-SA 4.0
cmip6_source_id: GISS-E2-1-G
contact: Dr. Rama Nemani: [email protected], Dr. Bridget...
... ...
scenario: historical
source: BCSD
title: GISS-E2-1-G, r1i1p1f2, historical, global downscal...
tracking_id: 25d6baa3-0404-4eba-a3f1-afddbf69d4cc
variant_label: r1i1p1f2
version: 1.0
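Because the reference maps Zarr keys to byte ranges in the original NetCDF files, the dataset above is lazy: only the chunks a computation touches are fetched from S3. A hedged sketch of a small read, using coordinate values visible in the printout above (string-based time selection assumes xarray's CFTimeIndex support):

# Select one day and compute its global mean near-surface air temperature;
# only the chunks covering this time step are downloaded.
one_day = ds["tas"].sel(time="1950-01-01")
print(float(one_day.mean()))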
# Upload the combined reference to S3 using the credentials fetched earlier
s3 = boto3.client(
    's3',
    aws_access_key_id=credentials['aws_access_key_id'],
    aws_secret_access_key=credentials['aws_secret_access_key'],
    aws_session_token=credentials['aws_session_token']
)
# Note: upload_file returns None on success and raises on failure,
# so the printed "response" below is expected to be None.
response = s3.upload_file(output_location, bucket_name, f'test-data/cmip6-kerchunk/{output_fname}')
print(f"Response uploading {output_fname} to {bucket_name} was {response}.")

Response uploading combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk.json to nasa-eodc-data-store was None.
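To confirm the uploaded reference is usable, it can be opened straight from its S3 location rather than from the in-memory dict. A hedged sketch, assuming the nasa-eodc-data-store bucket requires the credentials configured above (target_options applies to reading the reference JSON itself, remote_options to the NetCDF bytes it points at):

ref_url = f"s3://{bucket_name}/test-data/cmip6-kerchunk/{output_fname}"
fs_check = fsspec.filesystem(
    "reference",
    fo=ref_url,
    remote_protocol="s3",
    remote_options={"anon": anon},   # source NetCDF bucket is public
    target_options={"anon": False},  # credentials needed for the reference bucket
)
ds_check = xr.open_dataset(
    fs_check.get_mapper(""), engine="zarr", backend_kwargs=dict(consolidated=False)
)
print(ds_check.sizes)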
# Write dataset information to a json file
kdict = {
    "cmip6-kerchunk": {
        "dataset_url": f"s3://{bucket_name}/test-data/cmip6-kerchunk/{output_fname}",
        "variable": variable,
        "extra_args": {"reference": True}
    }
}
with open("cmip6-kerchunk-dataset.json", "w") as f:
    f.write(json.dumps(kdict))
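As a brief usage sketch (a hypothetical downstream step, not part of the original notebook), the record can be read back to recover the reference URL:

# Load the dataset record and pull out the reference URL
with open("cmip6-kerchunk-dataset.json") as f:
    info = json.load(f)
print(info["cmip6-kerchunk"]["dataset_url"])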