Spatial index
Create Zarr store¶
In [1]:
Copied!
import numpy as np
import zarr
from obstore.store import LocalStore
root = zarr.open_group("spatial_index.zarr", mode="w", zarr_format=3)
store = LocalStore("spatial_index.zarr")
import numpy as np
import zarr
from obstore.store import LocalStore
root = zarr.open_group("spatial_index.zarr", mode="w", zarr_format=3)
store = LocalStore("spatial_index.zarr")
Search a STAC API and ingest the results into the zarr-datafusion-search metadata schema in the Zarr store¶
In [2]:
Copied!
from zarr_datafusion_search import ingest_stac_search
rows = await ingest_stac_search(
url="https://earth-search.aws.element84.com/v1",
store=store,
collections=["landsat-c2-l2"],
bbox=(-124.4096, 32.5343, -114.1312, 42.0095),
datetime="2025-10-01/2025-10-05",
asset_hrefs=["red", "green", "blue"]
)
rows
from zarr_datafusion_search import ingest_stac_search
rows = await ingest_stac_search(
url="https://earth-search.aws.element84.com/v1",
store=store,
collections=["landsat-c2-l2"],
bbox=(-124.4096, 32.5343, -114.1312, 42.0095),
datetime="2025-10-01/2025-10-05",
asset_hrefs=["red", "green", "blue"]
)
rows
/var/folders/mp/33cxt8xj36bbxj0jqdwbfwyc0000gn/T/ipykernel_21718/632695111.py:2: RuntimeWarning: Successfully reconstructed a store defined in another Python module. Connection pooling will not be shared across store instances. rows = await ingest_stac_search(
Out[2]:
30
Register Zarr store with custom TableProvider¶
In [3]:
Copied!
from datafusion import SessionContext
from zarr_datafusion_search import ZarrTable
zarr_table = await ZarrTable.from_obstore(store, "/meta")
ctx = SessionContext()
ctx.register_table("spatial_index", zarr_table)
from datafusion import SessionContext
from zarr_datafusion_search import ZarrTable
zarr_table = await ZarrTable.from_obstore(store, "/meta")
ctx = SessionContext()
ctx.register_table("spatial_index", zarr_table)
/var/folders/mp/33cxt8xj36bbxj0jqdwbfwyc0000gn/T/ipykernel_21718/1603298212.py:4: RuntimeWarning: Successfully reconstructed a store defined in another Python module. Connection pooling will not be shared across store instances. zarr_table = await ZarrTable.from_obstore(store, "/meta")
Do a spatial query with no spatial index¶
In [4]:
Copied!
from geodatafusion import register_all
register_all(ctx)
df = ctx.sql("""
SELECT id, datetime
FROM spatial_index
WHERE ST_Intersects(
bbox,
ST_GeomFromText('POLYGON((-118.5 33.9, -118.1 33.9, -118.1 34.2, -118.5 34.2, -118.5 33.9))')
)
ORDER BY id
""")
df.show()
from geodatafusion import register_all
register_all(ctx)
df = ctx.sql("""
SELECT id, datetime
FROM spatial_index
WHERE ST_Intersects(
bbox,
ST_GeomFromText('POLYGON((-118.5 33.9, -118.1 33.9, -118.1 34.2, -118.5 34.2, -118.5 33.9))')
)
ORDER BY id
""")
df.show()
DataFrame() +---------------------------------+-------------------------+ | id | datetime | +---------------------------------+-------------------------+ | LC08_L2SP_041036_20251005_02_T1 | 2025-10-05T18:28:34.608 | | LC08_L2SP_041037_20251005_02_T1 | 2025-10-05T18:28:58.499 | +---------------------------------+-------------------------+
Create a spatial index and store it in the indexes group¶
In [5]:
Copied!
from geoindex_rs import rtree as rt
import shapely
meta = root["/meta"]
geoms = shapely.from_wkb(meta["bbox"][:]) # vectorized, returns ndarray of geometries
bounds = shapely.bounds(geoms)
min_x = np.ascontiguousarray(bounds[:, 0])
min_y = np.ascontiguousarray(bounds[:, 1])
max_x = np.ascontiguousarray(bounds[:, 2])
max_y = np.ascontiguousarray(bounds[:, 3])
builder = rt.RTreeBuilder(num_items=len(meta["bbox"][:]))
builder.add(min_x, min_y, max_x, max_y)
tree = builder.finish()
from geoindex_rs import rtree as rt
import shapely
meta = root["/meta"]
geoms = shapely.from_wkb(meta["bbox"][:]) # vectorized, returns ndarray of geometries
bounds = shapely.bounds(geoms)
min_x = np.ascontiguousarray(bounds[:, 0])
min_y = np.ascontiguousarray(bounds[:, 1])
max_x = np.ascontiguousarray(bounds[:, 2])
max_y = np.ascontiguousarray(bounds[:, 3])
builder = rt.RTreeBuilder(num_items=len(meta["bbox"][:]))
builder.add(min_x, min_y, max_x, max_y)
tree = builder.finish()
In [6]:
Copied!
indexes = root.create_group("indexes")
tree_bytes = bytes(tree)
indexes.create_array(
"bbox",
data=np.frombuffer(tree_bytes, dtype=np.uint8)
)
indexes = root.create_group("indexes")
tree_bytes = bytes(tree)
indexes.create_array(
"bbox",
data=np.frombuffer(tree_bytes, dtype=np.uint8)
)
Out[6]:
<Array file://spatial_index.zarr/indexes/bbox shape=(1130,) dtype=uint8>
The query returns the same result when using the index¶
In [7]:
Copied!
df = ctx.sql("""
SELECT id, datetime
FROM spatial_index
WHERE ST_Intersects(
bbox,
ST_GeomFromText('POLYGON((-118.5 33.9, -118.1 33.9, -118.1 34.2, -118.5 34.2, -118.5 33.9))')
)
ORDER BY id
""")
df.show()
df = ctx.sql("""
SELECT id, datetime
FROM spatial_index
WHERE ST_Intersects(
bbox,
ST_GeomFromText('POLYGON((-118.5 33.9, -118.1 33.9, -118.1 34.2, -118.5 34.2, -118.5 33.9))')
)
ORDER BY id
""")
df.show()
DataFrame() +---------------------------------+-------------------------+ | id | datetime | +---------------------------------+-------------------------+ | LC08_L2SP_041036_20251005_02_T1 | 2025-10-05T18:28:34.608 | | LC08_L2SP_041037_20251005_02_T1 | 2025-10-05T18:28:58.499 | +---------------------------------+-------------------------+