Working with CMR STAC¶
NASA's Common Metadata Repository (CMR) can be accessed via DAAC-specific STAC APIs. This opens up a huge amount of data to users but there are some limitations to the STAC item metadata that is served by these APIs. This notebook shows how to work around the "asset keys are not consistent among items in a collection" problem.
import json
import math
import os
from pathlib import Path
from urllib.parse import urlparse
import requests
import rustac
from async_geotiff import GeoTIFF
from obstore.store import HTTPStore
from pyproj import Transformer
import lazycogs
LPCLOUD_STAC_API_URL = "https://cmr.earthdata.nasa.gov/stac/LPCLOUD"
# we will query for STAC records that intersect a lon/lat coordinate pair
lon, lat = -92, 46
Start by querying CMR STAC. We would normally cache this directly to a stac-geoparquet file but since the asset keys are different for every single item, the table schema winds up with ~13k columns which is not ideal. Instead we will load the items into memory here so we can fix them up before writing to stac-geoparquet.
raw_items = await rustac.search(
LPCLOUD_STAC_API_URL,
collections="ECO_L2T_STARS_002",
intersects={"type": "Point", "coordinates": [lon, lat]},
limit=500,
)
Look at the asset keys for these items - they look like filenames :(
print(json.dumps(list(raw_items[0]["assets"].keys()), indent=2))
[ "browse", "thumbnail_0", "thumbnail_1", "thumbnail_2", "thumbnail_3", "thumbnail_4", "thumbnail_5", "thumbnail_6", "thumbnail_7", "thumbnail_8", "thumbnail_9", "002/ECOv002_L2T_STARS_15TWL_20180729_0712_01/ECOv002_L2T_STARS_15TWL_20180729_0712_01_NDVI", "002/ECOv002_L2T_STARS_15TWL_20180729_0712_01/ECOv002_L2T_STARS_15TWL_20180729_0712_01_NDVI-UQ", "002/ECOv002_L2T_STARS_15TWL_20180729_0712_01/ECOv002_L2T_STARS_15TWL_20180729_0712_01_albedo", "002/ECOv002_L2T_STARS_15TWL_20180729_0712_01/ECOv002_L2T_STARS_15TWL_20180729_0712_01_albedo-UQ", "s3_002/ECOv002_L2T_STARS_15TWL_20180729_0712_01/ECOv002_L2T_STARS_15TWL_20180729_0712_01_NDVI", "s3_002/ECOv002_L2T_STARS_15TWL_20180729_0712_01/ECOv002_L2T_STARS_15TWL_20180729_0712_01_NDVI-UQ", "s3_002/ECOv002_L2T_STARS_15TWL_20180729_0712_01/ECOv002_L2T_STARS_15TWL_20180729_0712_01_albedo", "s3_002/ECOv002_L2T_STARS_15TWL_20180729_0712_01/ECOv002_L2T_STARS_15TWL_20180729_0712_01_albedo-UQ", "metadata" ]
Then iterate through the items and map the geotiff assets to keys that should be consistent across all items. Cloud-hosted CMR collections will have pairs of https/s3 hrefs for each actual asset. If you are running this in the same AWS region as the data are stored (probably us-west-2) you could use the S3 hrefs instead of the https hrefs.
items = []
for item in raw_items:
assets = {}
for key, asset in item["assets"].items():
if asset["href"].endswith(".tif") and asset["href"]:
variable = Path(key).stem.split("_")[-1]
if asset["href"].startswith("s3://"):
variable += "-s3"
assets[variable] = asset
item["assets"] = assets
items.append(item)
sample_item = items[0]
print(json.dumps(list(sample_item["assets"].keys()), indent=2))
[ "NDVI", "NDVI-UQ", "albedo", "albedo-UQ", "NDVI-s3", "NDVI-UQ-s3", "albedo-s3", "albedo-UQ-s3" ]
Write the modified assets to a separate parquet file.
FIXED_PARQUET = f"data/ECO_L2T_STARS_002_{lon}_{lat}_fixed.parquet"
rustac.write_sync(
FIXED_PARQUET,
items,
)
{'e_tag': '3fcd2d-651b331819d3c-28779', 'version': None}
Now configure an HTTPStore with your Earthdata Login token so you lazycogs can access the tif assets.
def get_earthdata_token(username: str, password: str) -> str:
r = requests.post(
"https://urs.earthdata.nasa.gov/api/users/find_or_create_token",
auth=(username, password),
timeout=10,
)
r.raise_for_status()
return r.json()["access_token"]
token = get_earthdata_token(
os.getenv("EARTHDATA_USERNAME"),
os.getenv("EARTHDATA_PASSWORD"),
)
store_prefix = "https://data.lpdaac.earthdatacloud.nasa.gov"
store = HTTPStore(
store_prefix,
client_options={
"default_headers": {
"Authorization": f"Bearer {token}",
},
},
)
Inspect one of the geotiffs assets to get some information about the extent/resolution of the underlying data.
geotiff = await GeoTIFF.open(
urlparse(sample_item["assets"]["NDVI"]["href"]).path,
store=store,
)
crs = geotiff.crs
print("crs:", crs)
print("transform:", geotiff.transform)
print("shape:", geotiff.shape)
print("bounds:", geotiff.bounds)
print("nodata value:", geotiff.nodata)
print("data type:", geotiff.dtype)
resolutions = [geotiff.res[0]]
print("full resolution:", geotiff.res)
for i, overview in enumerate(geotiff.overviews, start=1):
resolutions.append(overview.res[0])
print("overview", i, overview.res)
crs: {"type": "ProjectedCRS", "$schema": "https://proj.org/schemas/v0.7/projjson.schema.json", "name": "WGS 84 / UTM zone 15N", "base_crs": {"type": "GeographicCRS", "$schema": "https://proj.org/schemas/v0.7/projjson.schema.json", "name": "WGS 84", "datum": {"type": "GeodeticReferenceFrame", "name": "World Geodetic System 1984", "ellipsoid": {"name": "User-defined", "semi_major_axis": 6378137.0, "inverse_flattening": 298.257223563}, "prime_meridian": {"name": "Greenwich", "longitude": 0.0}}, "coordinate_system": {"subtype": "ellipsoidal", "axis": [{"name": "Latitude", "abbreviation": "lat", "direction": "north", "unit": "degree"}, {"name": "Longitude", "abbreviation": "lon", "direction": "east", "unit": "degree"}]}}, "conversion": {"type": "Conversion", "name": "UTM zone 15N", "method": {"name": "Transverse Mercator", "id": {"authority": "EPSG", "code": 9807}}, "parameters": [{"name": "Latitude of natural origin", "value": 0, "unit": "degree", "id": {"authority": "EPSG", "code": 8801}}, {"name": "Longitude of natural origin", "value": -93, "unit": "degree", "id": {"authority": "EPSG", "code": 8802}}, {"name": "Scale factor at natural origin", "value": 0.9996, "unit": "unity", "id": {"authority": "EPSG", "code": 8805}}, {"name": "False easting", "value": 500000, "unit": "metre", "id": {"authority": "EPSG", "code": 8806}}, {"name": "False northing", "value": 0, "unit": "metre", "id": {"authority": "EPSG", "code": 8807}}], "id": {"authority": "EPSG", "code": 16015}}, "coordinate_system": {"subtype": "Cartesian", "axis": [{"name": "Easting", "abbreviation": "E", "direction": "east", "unit": "metre"}, {"name": "Northing", "abbreviation": "N", "direction": "north", "unit": "metre"}]}}
transform: | 70.00, 0.00, 499980.00|
| 0.00,-70.00, 5100000.00|
| 0.00, 0.00, 1.00|
shape: (1568, 1568)
bounds: BoundingBox(left=499980.0, bottom=4990240.0, right=609740.0, top=5100000.0)
nodata value: nan
data type: float32
full resolution: (70.0, 70.0)
Define a target grid in a CRS that can reasonably represent the underlying assets. If all of the assets share a single CRS, you could use that instead but for this example I am picking a projected CRS that covers all of CONUS.
bboxes = {tuple(item["bbox"]) for item in items}
sample_hrefs = {}
while len(sample_hrefs) < len(bboxes):
for item in items:
if tuple(item["bbox"]) not in sample_hrefs:
sample_hrefs[tuple(item["bbox"])] = item["assets"]["NDVI"]["href"]
bbox = None
for _bbox in bboxes:
if not bbox:
bbox = _bbox
else:
xmin, ymin, xmax, ymax = bbox
_xmin, _ymin, _xmax, _ymax = _bbox
bbox = [min(xmin, _xmin), min(ymin, _ymin), max(xmax, _xmax), max(ymax, _ymax)]
dst_crs = "epsg:5070"
resolution = resolutions[0]
transformer = Transformer.from_crs("epsg:4326", dst_crs, always_xy=True)
_dst_bbox = transformer.transform_bounds(*bbox)
dst_bbox = (
math.floor(_dst_bbox[0] / resolution) * resolution,
math.floor(_dst_bbox[1] / resolution) * resolution,
math.ceil(_dst_bbox[2] / resolution) * resolution,
math.ceil(_dst_bbox[3] / resolution) * resolution,
)
dst_bbox
(229320.0, 2455740.0, 349370.0, 2670570.0)
Now open the array
da = lazycogs.open(
href=FIXED_PARQUET,
crs=dst_crs,
bbox=dst_bbox,
resolution=resolution,
time_period="P1D", # one time coordinate per day
bands=["NDVI", "albedo"],
dtype=geotiff.dtype,
nodata=geotiff.nodata,
store=store,
)
da
<xarray.DataArray (band: 2, time: 869, y: 3069, x: 1715)> Size: 37GB
[9147676230 values with dtype=float32]
Coordinates:
* band (band) <U6 48B 'NDVI' 'albedo'
* time (time) datetime64[s] 7kB 2018-07-28 2018-07-30 ... 2026-02-24
* y (y) float64 25kB 2.671e+06 2.67e+06 ... 2.456e+06 2.456e+06
* x (x) float64 14kB 2.294e+05 2.294e+05 ... 3.493e+05 3.493e+05
spatial_ref int64 8B 0
Indexes:
┌ x RasterIndex (crs=EPSG:5070)
└ y
Attributes:
grid_mapping: spatial_ref
zarr_conventions: [{'schema_url': 'https://raw.githubusercontent.c...
spatial:dimensions: ['y', 'x']
spatial:bbox: (229320.0, 2455740.0, 349370.0, 2670570.0)
spatial:transform_type: affine
spatial:transform: [70.0, 0.0, 229320.0, 0.0, -70.0, 2670570.0]
spatial:shape: [3069, 1715]
spatial:registration: pixel
_stac_backend: MultiBandStacBackendArray(bands=['NDVI', 'albedo...
_stac_time_coords: 2018-07-28 … 2026-02-24 (n=869)
proj:code: EPSG:5070Select the values that intersect the provided point coordinate
transformer = Transformer.from_crs("EPSG:4326", dst_crs, always_xy=True)
x, y = transformer.transform(lon, lat)
pt_extraction = da.sel(x=x, y=y, method="nearest")
pt_extraction
<xarray.DataArray (band: 2, time: 869)> Size: 7kB
[1738 values with dtype=float32]
Coordinates:
* band (band) <U6 48B 'NDVI' 'albedo'
* time (time) datetime64[s] 7kB 2018-07-28 2018-07-30 ... 2026-02-24
spatial_ref int64 8B 0
x float64 8B 3.102e+05
y float64 8B 2.563e+06
Attributes:
grid_mapping: spatial_ref
zarr_conventions: [{'schema_url': 'https://raw.githubusercontent.c...
spatial:dimensions: ['y', 'x']
spatial:bbox: (229320.0, 2455740.0, 349370.0, 2670570.0)
spatial:transform_type: affine
spatial:transform: [70.0, 0.0, 229320.0, 0.0, -70.0, 2670570.0]
spatial:shape: [3069, 1715]
spatial:registration: pixel
_stac_backend: MultiBandStacBackendArray(bands=['NDVI', 'albedo...
_stac_time_coords: 2018-07-28 … 2026-02-24 (n=869)
proj:code: EPSG:5070vals = await pt_extraction.isel(time=slice(0, 20)).load_async()
vals
<xarray.DataArray (band: 2, time: 20)> Size: 160B
array([[ nan, nan, 0.8179467 , 0.81781656, 0.81838804,
0.7948556 , 0.76919585, nan, 0.8192896 , 0.81656504,
0.8191934 , 0.80539215, 0.8055197 , 0.8045705 , 0.8024133 ,
0.8048437 , 0.81053853, 0.81357896, 0.80615324, 0.81571937],
[ nan, nan, 0.0636671 , 0.06439627, 0.06531847,
0.06322999, 0.06255466, nan, 0.05723622, 0.05787302,
0.05673463, 0.02479203, 0.02479203, 0.0651263 , 0.06567526,
0.06563522, 0.02452473, 0.05779882, 0.05661042, 0.05655375]],
dtype=float32)
Coordinates:
* band (band) <U6 48B 'NDVI' 'albedo'
* time (time) datetime64[s] 160B 2018-07-28 2018-07-30 ... 2018-08-27
spatial_ref int64 8B 0
x float64 8B 3.102e+05
y float64 8B 2.563e+06
Attributes:
grid_mapping: spatial_ref
zarr_conventions: [{'schema_url': 'https://raw.githubusercontent.c...
spatial:dimensions: ['y', 'x']
spatial:bbox: (229320.0, 2455740.0, 349370.0, 2670570.0)
spatial:transform_type: affine
spatial:transform: [70.0, 0.0, 229320.0, 0.0, -70.0, 2670570.0]
spatial:shape: [3069, 1715]
spatial:registration: pixel
_stac_backend: MultiBandStacBackendArray(bands=['NDVI', 'albedo...
_stac_time_coords: 2018-07-28 … 2026-02-24 (n=869)
proj:code: EPSG:5070