Finding NetCDF-4 collections
This notebook shows how to use earthaccess to discover NASA Earthdata collections that provide granules in netCDF-4 format. It then opens a representative netCDF-4 file from each collection and lists the available variable names.
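Opening files from Earthdata Cloud requires an Earthdata Login account. A minimal sketch, assuming credentials are available via a ~/.netrc file, environment variables, or an interactive prompt:

import earthaccess

# Authenticate once per session; earthaccess looks for credentials in the
# environment, a .netrc file, or prompts interactively.
auth = earthaccess.login()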
In [ ]:
import earthaccess
import pandas as pd
import numpy as np
from typing import Dict, Optional, Tuple, Any


# ----------------------------------------
# Helpers to parse metadata from earthaccess
# ----------------------------------------
def _parse_temporal(umm: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
    """Return the (begin, end) datetimes from a UMM-C TemporalExtents block."""
    temporal = umm.get("TemporalExtents", [])
    rng = (temporal or [{}])[0].get("RangeDateTimes", [])
    begin = (rng or [{}])[0].get("BeginningDateTime")
    end = (rng or [{}])[0].get("EndingDateTime")
    return begin, end


def _parse_bounds_from_spatial(
    umm: Dict[str, Any],
) -> Tuple[Optional[float], Optional[float], Optional[float], Optional[float]]:
    """Return (west, south, east, north) from a UMM-C SpatialExtent block."""
    spatial = umm.get("SpatialExtent", {}) or {}
    horiz = spatial.get("HorizontalSpatialDomain", {}) or {}
    geom = horiz.get("Geometry", {}) or {}

    # 1) Bounding rectangles
    rects = geom.get("BoundingRectangles") or []
    if rects:
        wests = [r.get("WestBoundingCoordinate") for r in rects if r]
        easts = [r.get("EastBoundingCoordinate") for r in rects if r]
        souths = [r.get("SouthBoundingCoordinate") for r in rects if r]
        norths = [r.get("NorthBoundingCoordinate") for r in rects if r]
        if all(len(lst) > 0 for lst in (wests, easts, souths, norths)):
            return (
                float(np.min(wests)),
                float(np.min(souths)),
                float(np.max(easts)),
                float(np.max(norths)),
            )

    # 2) GPolygons
    gpolys = geom.get("GPolygons") or []
    coords_w, coords_e, coords_s, coords_n = [], [], [], []
    for gp in gpolys:
        b = gp.get("Boundary", {})
        pts = b.get("Points", [])
        lons = [p.get("Longitude") for p in pts if p and p.get("Longitude") is not None]
        lats = [p.get("Latitude") for p in pts if p and p.get("Latitude") is not None]
        if lons and lats:
            coords_w.append(np.min(lons))
            coords_e.append(np.max(lons))
            coords_s.append(np.min(lats))
            coords_n.append(np.max(lats))
    if coords_w and coords_e and coords_s and coords_n:
        return (
            float(np.min(coords_w)),
            float(np.min(coords_s)),
            float(np.max(coords_e)),
            float(np.max(coords_n)),
        )

    return None, None, None, None
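To see what these helpers return, here is a quick check against a small, hand-made UMM-C fragment (the values below are hypothetical, not taken from a real collection):

# Hypothetical UMM-C fragment for illustration only
sample_umm = {
    "TemporalExtents": [
        {"RangeDateTimes": [{"BeginningDateTime": "2000-01-01T00:00:00Z"}]}
    ],
    "SpatialExtent": {
        "HorizontalSpatialDomain": {
            "Geometry": {
                "BoundingRectangles": [
                    {
                        "WestBoundingCoordinate": -180.0,
                        "SouthBoundingCoordinate": -90.0,
                        "EastBoundingCoordinate": 180.0,
                        "NorthBoundingCoordinate": 90.0,
                    }
                ]
            }
        }
    },
}

print(_parse_temporal(sample_umm))             # ('2000-01-01T00:00:00Z', None)
print(_parse_bounds_from_spatial(sample_umm))  # (-180.0, -90.0, 180.0, 90.0)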
First, let's find all collections that provide netCDF-4 files using the earthaccess library.
In [ ]:
%%time
# step 1-a: search collections whose granules are distributed as netCDF-4
query = earthaccess.DataCollections()
# Enable wildcard pattern matching on the granule_data_format parameter
query.params["granule_data_format"] = "*netcdf-4*"
query.option("granule_data_format", "pattern", True)

results = query.get_all()
print(f"Number of collections found: {len(results)}")
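Each result behaves like a dictionary with "meta" and "umm" keys, so you can peek at a few collections before parsing everything; a quick sketch:

# Inspect the first few matching collections
for rec in results[:3]:
    print(rec.get("meta", {}).get("concept-id"), "-", rec.get("umm", {}).get("ShortName"))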
Next, parse the metadata of each collection to extract its temporal and spatial extent, and save the results to a CSV file.
In [ ]:
%%time
# step 1-b: parse metadata to find temporal and spatial bounds and save to csv
import os

rows = []
for rec in results:
    meta = rec.get("meta", {}) or {}
    umm = rec.get("umm", {}) or {}

    concept_id = meta.get("concept-id") or meta.get("concept_id")
    short_name = umm.get("ShortName")
    entry_title = umm.get("EntryTitle")
    provider_id = meta.get("provider-id")

    begin, end = _parse_temporal(umm)
    west, south, east, north = _parse_bounds_from_spatial(umm)

    rows.append(
        {
            "concept_id": concept_id,
            "short_name": short_name,
            "entry_title": entry_title,
            "provider_id": provider_id,
            "begin_time": begin,
            "end_time": end,
            "west": west,
            "south": south,
            "east": east,
            "north": north,
        }
    )

df = pd.DataFrame(rows)
print(df.head())

concept_ids = [r["concept_id"] for r in rows if r["concept_id"]]

# Make sure the output directory exists before writing the CSV
os.makedirs("output", exist_ok=True)
out_csv = "output/cmr_collections_netcdf4.csv"
df.to_csv(out_csv, index=False)
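The saved CSV can now be filtered with plain pandas; for example, a rough sketch of selecting collections whose reported bounding box is (nearly) global:

# Rough filter: collections whose bounding box spans (almost) the whole globe
west = pd.to_numeric(df["west"], errors="coerce")
east = pd.to_numeric(df["east"], errors="coerce")
south = pd.to_numeric(df["south"], errors="coerce")
north = pd.to_numeric(df["north"], errors="coerce")

global_cols = df[(west <= -179) & (east >= 179) & (south <= -89) & (north >= 89)]
print(f"{len(global_cols)} of {len(df)} collections report near-global coverage")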
Next, open a representative netCDF-4 file from each collection and list its variable names.
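Before running the full parallel pipeline below, the core flow for a single collection looks roughly like this (a sketch using earthaccess.open; the concept ID is only a placeholder):

import xarray as xr

# Placeholder concept ID -- substitute any concept_id from the CSV above
granules = earthaccess.search_data(concept_id="C0000000000-PROVIDER", count=1)
if granules:
    files = earthaccess.open(granules)  # fsspec file-like objects
    with xr.open_dataset(files[0], engine="h5netcdf", decode_times=False) as ds:
        print(list(ds.data_vars))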
In [ ]:
import concurrent.futures
import earthaccess
from urllib.parse import urlparse
import pandas as pd
import xarray as xr
from datetime import datetime, timezone

df = pd.read_csv("output/cmr_collections_netcdf4.csv")
for col in ["links", "variables", "status", "error", "scheme"]:
    df[col] = None


def _pick_best_link(all_links):
    """Prefer S3; else HTTPS; else None."""
    https = [u for u in all_links if u.startswith("http")]
    s3 = [u for u in all_links if u.startswith("s3://")]
    if s3:
        return s3[0]
    if https:
        return https[0]
    return None


def _open_xarray_dataset(url):
    """Open a NetCDF URL that may be HTTPS or S3 and return (ds, scheme)."""
    scheme = urlparse(url).scheme.lower()
    if scheme in ("http", "https"):
        fs = earthaccess.get_fsspec_https_session()
        ds = xr.open_dataset(fs.open(url), engine="h5netcdf", decode_times=False)
        return ds, "https"
    elif scheme == "s3":
        s3 = earthaccess.get_s3fs_session()
        ds = xr.open_dataset(s3.open(url, "rb"), engine="h5netcdf", decode_times=False)
        return ds, "s3"
    else:
        raise ValueError(f"Unsupported URL scheme: {scheme}")


def process_row(i_row):
    i, row = i_row
    concept_id = row["concept_id"]
    begin = row["begin_time"]
    end = (
        row["end_time"]
        if pd.notna(row["end_time"])
        else datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    )

    logs = []
    logs.append(f"\n🔍 [{i}] Concept ID: {concept_id}")
    logs.append(f"   🚀 [{i}] Starting search for {concept_id}...")

    # Search for one representative granule, with a hard timeout
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as ex:
            fut = ex.submit(
                earthaccess.search_data,
                concept_id=concept_id,
                temporal=(begin, end),
                count=1,
            )
            results = fut.result(timeout=120)
    except concurrent.futures.TimeoutError:
        logs.append(f"   ⏳ [{i}] Timeout while searching {concept_id}")
        return {
            "i": i,
            "concept_id": concept_id,
            "links": None,
            "variables": None,
            "status": "timeout",
            "error": None,
            "scheme": None,
            "logs": logs,
        }
    except Exception as e:
        logs.append(f"   ❌ [{i}] Search failed for {concept_id}: {e}")
        return {
            "i": i,
            "concept_id": concept_id,
            "links": None,
            "variables": None,
            "status": "search_failed",
            "error": str(e),
            "scheme": None,
            "logs": logs,
        }

    if not results:
        logs.append(f"   ⚠️ [{i}] No granules for {concept_id}")
        return {
            "i": i,
            "concept_id": concept_id,
            "links": None,
            "variables": None,
            "status": "no_granules",
            "error": None,
            "scheme": None,
            "logs": logs,
        }

    try:
        all_links = results[0].data_links() or []
    except Exception as e:
        logs.append(f"   ⚠️ [{i}] Could not extract data_links: {e}")
        return {
            "i": i,
            "concept_id": concept_id,
            "links": None,
            "variables": None,
            "status": "no_links",
            "error": str(e),
            "scheme": None,
            "logs": logs,
        }

    netcdf_url = _pick_best_link(all_links)
    if not netcdf_url:
        logs.append(f"   ⚠️ [{i}] No usable HTTPS/S3 NetCDF links for {concept_id}")
        return {
            "i": i,
            "concept_id": concept_id,
            "links": None,
            "variables": None,
            "status": "no_links",
            "error": None,
            "scheme": None,
            "logs": logs,
        }

    logs.append(f"   🔗 [{i}] Link chosen: {netcdf_url}")

    # Open the granule and list its (root-group) variables
    try:
        ds, scheme = _open_xarray_dataset(netcdf_url)
        with ds:
            variables = list(ds.data_vars.keys())
        logs.append(f"   📊 [{i}] Variables ({len(variables)}): {variables}")
        logs.append(f"   ✅ [{i}] Result: ok, scheme: {scheme}")
        return {
            "i": i,
            "concept_id": concept_id,
            "links": netcdf_url,
            "variables": variables,
            "status": "ok",
            "error": None,
            "scheme": scheme,
            "logs": logs,
        }
    except Exception as e:
        logs.append(f"   ⚠️ [{i}] Failed to open dataset: {e}")
        return {
            "i": i,
            "concept_id": concept_id,
            "links": netcdf_url,
            "variables": [],
            "status": "open_failed",
            "error": str(e),
            "scheme": urlparse(netcdf_url).scheme or None,
            "logs": logs,
        }


# ----------------------------
# Run in parallel
# ----------------------------
rows = []
n = max(10, len(df))
print(f"\n🚀 Starting processing of {n} rows...", flush=True)

with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    futures = [executor.submit(process_row, item) for item in df.iloc[:n].iterrows()]
    for fut in concurrent.futures.as_completed(futures):
        res = fut.result()
        # Print all logs for this collection at once
        for log in res.get("logs", []):
            print(log, flush=True)
        rows.append({k: v for k, v in res.items() if k != "logs"})

out = pd.DataFrame(rows).set_index("i").sort_index()

# Merge back into original df (preserves all other columns)
df.loc[out.index, ["links", "variables", "status", "error", "scheme"]] = out[
    ["links", "variables", "status", "error", "scheme"]
]

print("\n📦 Merge complete. Sample:", flush=True)
print(df.loc[out.index, ["concept_id", "scheme", "links", "status"]].head(), flush=True)
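A quick way to see how the pipeline fared across collections is to tally the status column; a small sketch:

# Count outcomes per status (ok, timeout, no_granules, open_failed, ...)
print(df["status"].value_counts(dropna=False))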
In [ ]:
df_valid_vars = df.dropna(subset=["variables"])
df_valid_vars
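With the variables column populated, a pandas explode gives a quick view of the most common variable names across the collections that opened successfully; a sketch:

# Most frequent variable names across successfully opened collections
var_counts = df_valid_vars.explode("variables")["variables"].value_counts()
print(var_counts.head(20))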
In [ ]:
# Save result
df.to_csv("output/cmr_collections_netcdf4_updated_saved_all.csv", index=False)
print(f"\n✅ Updated CSV saved with {df['links'].notna().sum()} links populated.")
In [ ]:
# For grouped netCDF-4/HDF5 files, the pipeline above does not use datatree
# (because of the current mechanics of Titiler-CMR). Here we open one such
# file with xarray's open_datatree to inspect its group structure.
url = "https://data.laadsdaac.earthdatacloud.nasa.gov/prod-lads/VNP03IMG/VNP03IMG.A2012019.0000.002.2020318135750.nc"

fs = earthaccess.get_fsspec_https_session()
ds = xr.open_datatree(fs.open(url), engine="h5netcdf", decode_times=False)
ds
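To list the variables in every group of the tree (not just the root group, which is all that ds.data_vars in the pipeline above reports), you can walk the subtree; a sketch assuming a recent xarray with DataTree support:

# Walk all groups of the DataTree and list their variables
for node in ds.subtree:
    if node.data_vars:
        print(node.path, "->", list(node.data_vars))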