Finding NetCDF-4 collections
This notebook shows how to use earthaccess to discover NASA Earthdata collections that provide granules in netCDF-4 format. It then opens a representative netCDF-4 file from each collection and lists the available variable names.
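Opening files from Earthdata Cloud requires an Earthdata Login account. A minimal sketch, assuming credentials are available via a ~/.netrc file, environment variables, or an interactive prompt:

import earthaccess

# Authenticate once per session; earthaccess looks for credentials in the
# environment, a .netrc file, or prompts interactively.
auth = earthaccess.login()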
In [ ]:
import earthaccess
import pandas as pd
import numpy as np
from typing import Dict, Optional, Tuple, Any


# ----------------------------------------
# Helpers to parse metadata from earthaccess
# ----------------------------------------
def _parse_temporal(umm: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
    """Return the (begin, end) datetimes from a UMM-C TemporalExtents block."""
    temporal = umm.get("TemporalExtents", [])
    rng = (temporal or [{}])[0].get("RangeDateTimes", [])
    begin = (rng or [{}])[0].get("BeginningDateTime")
    end = (rng or [{}])[0].get("EndingDateTime")
    return begin, end


def _parse_bounds_from_spatial(
    umm: Dict[str, Any],
) -> Tuple[Optional[float], Optional[float], Optional[float], Optional[float]]:
    """Return (west, south, east, north) from a UMM-C SpatialExtent block."""
    spatial = umm.get("SpatialExtent", {}) or {}
    horiz = spatial.get("HorizontalSpatialDomain", {}) or {}
    geom = horiz.get("Geometry", {}) or {}

    # 1) Bounding rectangles
    rects = geom.get("BoundingRectangles") or []
    if rects:
        wests = [r.get("WestBoundingCoordinate") for r in rects if r]
        easts = [r.get("EastBoundingCoordinate") for r in rects if r]
        souths = [r.get("SouthBoundingCoordinate") for r in rects if r]
        norths = [r.get("NorthBoundingCoordinate") for r in rects if r]
        if all(len(lst) > 0 for lst in (wests, easts, souths, norths)):
            return (
                float(np.min(wests)),
                float(np.min(souths)),
                float(np.max(easts)),
                float(np.max(norths)),
            )

    # 2) GPolygons
    gpolys = geom.get("GPolygons") or []
    coords_w, coords_e, coords_s, coords_n = [], [], [], []
    for gp in gpolys:
        b = gp.get("Boundary", {})
        pts = b.get("Points", [])
        lons = [p.get("Longitude") for p in pts if p and p.get("Longitude") is not None]
        lats = [p.get("Latitude") for p in pts if p and p.get("Latitude") is not None]
        if lons and lats:
            coords_w.append(np.min(lons))
            coords_e.append(np.max(lons))
            coords_s.append(np.min(lats))
            coords_n.append(np.max(lats))
    if coords_w and coords_e and coords_s and coords_n:
        return (
            float(np.min(coords_w)),
            float(np.min(coords_s)),
            float(np.max(coords_e)),
            float(np.max(coords_n)),
        )

    return None, None, None, None
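To see what these helpers return, here is a quick check against a small, hand-made UMM-C fragment (the values below are hypothetical, not taken from a real collection):

# Hypothetical UMM-C fragment for illustration only
sample_umm = {
    "TemporalExtents": [
        {"RangeDateTimes": [{"BeginningDateTime": "2000-01-01T00:00:00Z"}]}
    ],
    "SpatialExtent": {
        "HorizontalSpatialDomain": {
            "Geometry": {
                "BoundingRectangles": [
                    {
                        "WestBoundingCoordinate": -180.0,
                        "SouthBoundingCoordinate": -90.0,
                        "EastBoundingCoordinate": 180.0,
                        "NorthBoundingCoordinate": 90.0,
                    }
                ]
            }
        }
    },
}

print(_parse_temporal(sample_umm))             # ('2000-01-01T00:00:00Z', None)
print(_parse_bounds_from_spatial(sample_umm))  # (-180.0, -90.0, 180.0, 90.0)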
First, let's find all collections that provide netCDF-4 files using the earthaccess library.
In [ ]:
%%time
# step 1-a: search collections whose granules are distributed as netCDF-4
query = earthaccess.DataCollections()
# Enable wildcard pattern matching on the granule_data_format parameter
query.params["granule_data_format"] = "*netcdf-4*"
query.option("granule_data_format", "pattern", True)

results = query.get_all()
print(f"Number of collections found: {len(results)}")
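Each result behaves like a dictionary with "meta" and "umm" keys, so you can peek at a few collections before parsing everything; a quick sketch:

# Inspect the first few matching collections
for rec in results[:3]:
    print(rec.get("meta", {}).get("concept-id"), "-", rec.get("umm", {}).get("ShortName"))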
Next, parse the metadata of each collection to extract its temporal and spatial extent, and save the results to a CSV file.
In [ ]:
%%time
# step 1-b: parse metadata to find temporal and spatial bounds and save to csv
import os

rows = []
for rec in results:
    meta = rec.get("meta", {}) or {}
    umm = rec.get("umm", {}) or {}

    concept_id = meta.get("concept-id") or meta.get("concept_id")
    short_name = umm.get("ShortName")
    entry_title = umm.get("EntryTitle")
    provider_id = meta.get("provider-id")

    begin, end = _parse_temporal(umm)
    west, south, east, north = _parse_bounds_from_spatial(umm)

    rows.append(
        {
            "concept_id": concept_id,
            "short_name": short_name,
            "entry_title": entry_title,
            "provider_id": provider_id,
            "begin_time": begin,
            "end_time": end,
            "west": west,
            "south": south,
            "east": east,
            "north": north,
        }
    )

df = pd.DataFrame(rows)
print(df.head())

concept_ids = [r["concept_id"] for r in rows if r["concept_id"]]

# Make sure the output directory exists before writing the CSV
os.makedirs("output", exist_ok=True)
out_csv = "output/cmr_collections_netcdf4.csv"
df.to_csv(out_csv, index=False)
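The saved CSV can now be filtered with plain pandas; for example, a rough sketch of selecting collections whose reported bounding box is (nearly) global:

# Rough filter: collections whose bounding box spans (almost) the whole globe
west = pd.to_numeric(df["west"], errors="coerce")
east = pd.to_numeric(df["east"], errors="coerce")
south = pd.to_numeric(df["south"], errors="coerce")
north = pd.to_numeric(df["north"], errors="coerce")

global_cols = df[(west <= -179) & (east >= 179) & (south <= -89) & (north >= 89)]
print(f"{len(global_cols)} of {len(df)} collections report near-global coverage")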
Next, open a representative netCDF-4 file from each collection and list its variable names.
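Before running the full parallel pipeline below, the core flow for a single collection looks roughly like this (a sketch using earthaccess.open; the concept ID is only a placeholder):

import xarray as xr

# Placeholder concept ID -- substitute any concept_id from the CSV above
granules = earthaccess.search_data(concept_id="C0000000000-PROVIDER", count=1)
if granules:
    files = earthaccess.open(granules)  # fsspec file-like objects
    with xr.open_dataset(files[0], engine="h5netcdf", decode_times=False) as ds:
        print(list(ds.data_vars))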
In [ ]:
import concurrent.futures
import earthaccess
from urllib.parse import urlparse
import pandas as pd
import xarray as xr
from datetime import datetime, timezone

df = pd.read_csv("output/cmr_collections_netcdf4.csv")
for col in ["links", "variables", "status", "error", "scheme"]:
    df[col] = None


def _pick_best_link(all_links):
    """Prefer S3; else HTTPS; else None."""
    https = [u for u in all_links if u.startswith("http")]
    s3 = [u for u in all_links if u.startswith("s3://")]
    if s3:
        return s3[0]
    if https:
        return https[0]
    return None


def _open_xarray_dataset(url):
    """Open a NetCDF URL that may be HTTPS or S3 and return (ds, scheme)."""
    scheme = urlparse(url).scheme.lower()
    if scheme in ("http", "https"):
        fs = earthaccess.get_fsspec_https_session()
        ds = xr.open_dataset(fs.open(url), engine="h5netcdf", decode_times=False)
        return ds, "https"
    elif scheme == "s3":
        s3 = earthaccess.get_s3fs_session()
        ds = xr.open_dataset(s3.open(url, "rb"), engine="h5netcdf", decode_times=False)
        return ds, "s3"
    else:
        raise ValueError(f"Unsupported URL scheme: {scheme}")


def process_row(i_row):
    i, row = i_row
    concept_id = row["concept_id"]
    begin = row["begin_time"]
    end = (
        row["end_time"]
        if pd.notna(row["end_time"])
        else datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    )

    logs = []
    logs.append(f"\n🔍 [{i}] Concept ID: {concept_id}")
    logs.append(f"   🚀 [{i}] Starting search for {concept_id}...")

    # Search for one representative granule, with a hard timeout
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as ex:
            fut = ex.submit(
                earthaccess.search_data,
                concept_id=concept_id,
                temporal=(begin, end),
                count=1,
            )
            results = fut.result(timeout=120)
    except concurrent.futures.TimeoutError:
        logs.append(f"   ⏳ [{i}] Timeout while searching {concept_id}")
        return {
            "i": i,
            "concept_id": concept_id,
            "links": None,
            "variables": None,
            "status": "timeout",
            "error": None,
            "scheme": None,
            "logs": logs,
        }
    except Exception as e:
        logs.append(f"   ❌ [{i}] Search failed for {concept_id}: {e}")
        return {
            "i": i,
            "concept_id": concept_id,
            "links": None,
            "variables": None,
            "status": "search_failed",
            "error": str(e),
            "scheme": None,
            "logs": logs,
        }

    if not results:
        logs.append(f"   ⚠️ [{i}] No granules for {concept_id}")
        return {
            "i": i,
            "concept_id": concept_id,
            "links": None,
            "variables": None,
            "status": "no_granules",
            "error": None,
            "scheme": None,
            "logs": logs,
        }

    try:
        all_links = results[0].data_links() or []
    except Exception as e:
        logs.append(f"   ⚠️ [{i}] Could not extract data_links: {e}")
        return {
            "i": i,
            "concept_id": concept_id,
            "links": None,
            "variables": None,
            "status": "no_links",
            "error": str(e),
            "scheme": None,
            "logs": logs,
        }

    netcdf_url = _pick_best_link(all_links)
    if not netcdf_url:
        logs.append(f"   ⚠️ [{i}] No usable HTTPS/S3 NetCDF links for {concept_id}")
        return {
            "i": i,
            "concept_id": concept_id,
            "links": None,
            "variables": None,
            "status": "no_links",
            "error": None,
            "scheme": None,
            "logs": logs,
        }

    logs.append(f"   🔗 [{i}] Link chosen: {netcdf_url}")

    # Open the granule and list its (root-group) variables
    try:
        ds, scheme = _open_xarray_dataset(netcdf_url)
        with ds:
            variables = list(ds.data_vars.keys())
        logs.append(f"   📊 [{i}] Variables ({len(variables)}): {variables}")
        logs.append(f"   ✅ [{i}] Result: ok, scheme: {scheme}")
        return {
            "i": i,
            "concept_id": concept_id,
            "links": netcdf_url,
            "variables": variables,
            "status": "ok",
            "error": None,
            "scheme": scheme,
            "logs": logs,
        }
    except Exception as e:
        logs.append(f"   ⚠️ [{i}] Failed to open dataset: {e}")
        return {
            "i": i,
            "concept_id": concept_id,
            "links": netcdf_url,
            "variables": [],
            "status": "open_failed",
            "error": str(e),
            "scheme": urlparse(netcdf_url).scheme or None,
            "logs": logs,
        }


# ----------------------------
# Run in parallel
# ----------------------------
rows = []
n = max(10, len(df))
print(f"\n🚀 Starting processing of {n} rows...", flush=True)

with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    futures = [executor.submit(process_row, item) for item in df.iloc[:n].iterrows()]
    for fut in concurrent.futures.as_completed(futures):
        res = fut.result()
        # Print all logs for this collection at once
        for log in res.get("logs", []):
            print(log, flush=True)
        rows.append({k: v for k, v in res.items() if k != "logs"})

out = pd.DataFrame(rows).set_index("i").sort_index()

# Merge back into original df (preserves all other columns)
df.loc[out.index, ["links", "variables", "status", "error", "scheme"]] = out[
    ["links", "variables", "status", "error", "scheme"]
]

print("\n📦 Merge complete. Sample:", flush=True)
print(df.loc[out.index, ["concept_id", "scheme", "links", "status"]].head(), flush=True)
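A quick way to see how the pipeline fared across collections is to tally the status column; a small sketch:

# Count outcomes per status (ok, timeout, no_granules, open_failed, ...)
print(df["status"].value_counts(dropna=False))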
In [ ]:
df_valid_vars = df.dropna(subset=["variables"])
df_valid_vars
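With the variables column populated, a pandas explode gives a quick view of the most common variable names across the collections that opened successfully; a sketch:

# Most frequent variable names across successfully opened collections
var_counts = df_valid_vars.explode("variables")["variables"].value_counts()
print(var_counts.head(20))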
In [ ]:
# Save result
df.to_csv("output/cmr_collections_netcdf4_updated_saved_all.csv", index=False)
print(f"\n✅ Updated CSV saved with {df['links'].notna().sum()} links populated.")
In [ ]:
# For grouped netCDF-4/HDF5 files, the pipeline above does not use datatree
# (because of the current mechanics of Titiler-CMR). Here we open one such
# file with xarray's open_datatree to inspect its group structure.
url = "https://data.laadsdaac.earthdatacloud.nasa.gov/prod-lads/VNP03IMG/VNP03IMG.A2012019.0000.002.2020318135750.nc"

fs = earthaccess.get_fsspec_https_session()
ds = xr.open_datatree(fs.open(url), engine="h5netcdf", decode_times=False)
ds
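To list the variables in every group of the tree (not just the root group, which is all that ds.data_vars in the pipeline above reports), you can walk the subtree; a sketch assuming a recent xarray with DataTree support:

# Walk all groups of the DataTree and list their variables
for node in ds.subtree:
    if node.data_vars:
        print(node.path, "->", list(node.data_vars))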