Quickstart
In [1]:
Copied!
# Standard library
import warnings

# Third-party
import numpy as np
import shapely
import zarr
from zarr.dtype import VariableLengthBytes, VariableLengthUTF8
from zarr.errors import UnstableSpecificationWarning

# Suppress zarr's UnstableSpecificationWarning for the rest of the session
# so it does not clutter the cell outputs.
warnings.filterwarnings("ignore", category=UnstableSpecificationWarning)
Create a Zarr store with a meta group
In [2]:
Copied!
# Sample records. The three sequences are aligned by position: element i of
# each one describes the same item.
dates = np.array(["2023-01-01", "2023-01-02", "2023-01-03"], dtype="datetime64[ms]")
collection_names = ["sentinel-2", "sentinel-2", "landsat-8"]
bbox_geoms = [
    shapely.box(-10.0, -10.0, 10.0, 10.0),
    shapely.box(-20.0, -20.0, 20.0, 20.0),
    shapely.box(30.0, 30.0, 50.0, 50.0),
]
# Every array in the group must have the same length, so fail early if the
# sample data is edited inconsistently.
assert len(dates) == len(collection_names) == len(bbox_geoms)

# Open a Zarr v3 store in write mode and put all metadata arrays under the
# "meta" group.
root = zarr.open_group("my_store.zarr", mode="w", zarr_format=3)
meta = root.create_group("meta")

# Timestamps
meta.create_array("date", data=dates)

# String metadata
# The array is allocated with an explicit variable-length string dtype and
# then filled; its length is taken from the data instead of being hard-coded.
collection = meta.create_array(
    "collection",
    shape=(len(collection_names),),
    dtype=VariableLengthUTF8(),
)
collection[:] = collection_names

# Bounding boxes stored as WKB
# Each geometry is serialized to WKB bytes and stored in a variable-length
# bytes array.
bbox = meta.create_array(
    "bbox",
    shape=(len(bbox_geoms),),
    dtype=VariableLengthBytes(),
)
bbox[:] = shapely.to_wkb(bbox_geoms)
Register the Zarr store with a custom TableProvider
In [3]:
Copied!
from datafusion import SessionContext
from obstore.store import LocalStore
from zarr_datafusion_search import ZarrTable
store = LocalStore("my_store.zarr")
zarr_table = await ZarrTable.from_obstore(store, "/meta")
ctx = SessionContext()
ctx.register_table("my_data", zarr_table)
from datafusion import SessionContext
from obstore.store import LocalStore
from zarr_datafusion_search import ZarrTable
store = LocalStore("my_store.zarr")
zarr_table = await ZarrTable.from_obstore(store, "/meta")
ctx = SessionContext()
ctx.register_table("my_data", zarr_table)
/var/folders/mp/33cxt8xj36bbxj0jqdwbfwyc0000gn/T/ipykernel_93099/1827052559.py:6: RuntimeWarning: Successfully reconstructed a store defined in another Python module. Connection pooling will not be shared across store instances. zarr_table = await ZarrTable.from_obstore(store, "/meta")
Query metadata
In [4]:
Copied!
# Select every column, print the resulting schema, and render all rows as a
# text table.
df = ctx.sql("SELECT * FROM my_data")
print(df.schema())
df.show()

# Filter by date
# The cell ends with the bare DataFrame so the notebook displays it as the
# cell's output.
df = ctx.sql("""
SELECT date, collection
FROM my_data
WHERE date >= '2023-01-02'
""")
df
bbox: binary_view not null
-- field metadata --
ARROW:extension:name: 'geoarrow.wkb'
ARROW:extension:metadata: '{"crs":"EPSG:4326","crs_type":"authority_cod' + 3
collection: string_view not null
date: timestamp[ms] not null
DataFrame()
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+---------------------+
| bbox | collection | date |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+---------------------+
| 01030000000100000005000000000000000000244000000000000024c00000000000002440000000000000244000000000000024c0000000000000244000000000000024c000000000000024c0000000000000244000000000000024c0 | sentinel-2 | 2023-01-01T00:00:00 |
| 01030000000100000005000000000000000000344000000000000034c00000000000003440000000000000344000000000000034c0000000000000344000000000000034c000000000000034c0000000000000344000000000000034c0 | sentinel-2 | 2023-01-02T00:00:00 |
| 0103000000010000000500000000000000000049400000000000003e40000000000000494000000000000049400000000000003e4000000000000049400000000000003e400000000000003e4000000000000049400000000000003e40 | landsat-8 | 2023-01-03T00:00:00 |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+---------------------+
Out[4]:
| date | collection |
|---|---|
| 2023-01-02 00:00:00 | sentinel-2 |
| 2023-01-03 00:00:00 | landsat-8 |
Query spatial metadata
In [6]:
Copied!
from geodatafusion import register_all

# Register geodatafusion's functions on the session; the query below uses
# ST_Intersects and ST_GeomFromText.
register_all(ctx)

# Keep only the rows whose stored bbox intersects the query polygon, a square
# spanning -15..15 on both axes.
df = ctx.sql("""
SELECT date, collection
FROM my_data
WHERE ST_Intersects(
bbox,
ST_GeomFromText('POLYGON((-15 -15, -15 15, 15 15, 15 -15, -15 -15))')
)
""")
df
Out[6]:
| date | collection |
|---|---|
| 2023-01-01 00:00:00 | sentinel-2 |
| 2023-01-02 00:00:00 | sentinel-2 |