#!pip install rio-tiler==4.1.11 loguru
Generate fake data
In this notebook, we generate multiple data stores of increasingly fine resolution, so that the total spatial size of each dataset grows by a factor of 2 from one store to the next. This gives us variation in chunk size and number of chunks, so we can understand the relationship between chunk size, chunk count, and tiling performance.
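Concretely, doubling the total spatial size at each step means each dimension grows by a factor of √2, with the longitude dimension kept at twice the latitude dimension. A small illustration (not the helper's actual code) of the resulting grid sizes, which match the store names generated in Part 1 below:
import numpy as np

ydim, xdim = 512, 1024   # starting grid
multiple = 2             # total spatial size doubles each step
for _ in range(7):
    print(f"lat{ydim}_lon{xdim}")
    ydim = round(ydim * np.sqrt(multiple))  # each dimension grows by sqrt(2)
    xdim = 2 * ydim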
Setup 1: Load the necessary libraries
%load_ext autoreload
%autoreload
import json
import xarray as xr
import numpy as np
import os
import s3fs
import sys; sys.path.append('..')
import helpers.eodc_hub_role as eodc_hub_role
import helpers.zarr_helpers as zarr_helpers
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
Setup 2: Set up data storage
Store the data in the fake data directory of the project S3 bucket. Two subdirectories will be written: single_chunk (Part 1) and with_chunks (Part 2).
credentials = eodc_hub_role.fetch_and_set_credentials()
bucket = 'nasa-eodc-data-store'
fake_data_dir = 'test-data/fake-data'
s3_fs = s3fs.S3FileSystem(
    key=credentials['aws_access_key_id'],
    secret=credentials['aws_secret_access_key'],
    token=credentials['aws_session_token'],
    anon=False
)
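A quick optional sanity check that the credentials and filesystem work before writing anything:
# List the target prefix; this may raise FileNotFoundError if nothing has been written there yet.
s3_fs.ls(f'{bucket}/{fake_data_dir}')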
Fake Data Generation Part 1: Generate data stores with a single chunk
These data stores will have varying chunk sizes, since we generate them at varying resolutions with no spatial chunking: each store is written as a single chunk.
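zarr_helpers.generate_multiple_datastores is a small helper in this repository whose implementation is not shown here. As a rough, hypothetical sketch of what writing one single-chunk store could look like (assumed names and values for illustration, not the helper's actual code, and written locally rather than to S3):
import numpy as np
import xarray as xr

ydim, xdim = 512, 1024  # smallest store in this series
ds = xr.Dataset(
    {"data": (("time", "lat", "lon"), np.random.rand(1, ydim, xdim))},
    coords={"time": [0], "lat": np.linspace(-90, 90, ydim), "lon": np.linspace(-180, 180, xdim)},
)
# "No spatial chunking": one chunk spans the entire array, so chunk size == store size.
ds.chunk({"time": 1, "lat": ydim, "lon": xdim}).to_zarr("store_lat512_lon1024.zarr", mode="w")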
# Define starting conditions
time_steps = 1
ydim = 512
xdim = 1024
multiple = 2 # how much do you want the dataset to grow by each iteration
n_multiples = 7
data_path = 'single_chunk'
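Assuming 8-byte (float64) values, as in the Part 2 calculation below, the uncompressed size of each store can be estimated from its dimensions (illustrative only):
# Estimated uncompressed size of the smallest store, assuming float64 (8 bytes per value).
size_mb = (time_steps * ydim * xdim * 8) / (1024 * 1024)
print(f"Smallest store: ~{size_mb:.0f} MB; each subsequent store is ~{multiple}x larger.")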
# If you are updating this data, remove anything that is there
#!aws s3 rm --recursive s3://{bucket}/{data_path}/
# generate and store data
zarr_helpers.generate_multiple_datastores(
    n_multiples,
    xdim,
    ydim,
    f'{bucket}/{fake_data_dir}/{data_path}',
    s3_fs
)
Writing to nasa-eodc-data-store/fake_data/single_chunk/store_lat512_lon1024.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk/store_lat724_lon1448.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk/store_lat1024_lon2048.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk/store_lat1448_lon2896.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk/store_lat2048_lon4096.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk/store_lat2896_lon5792.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk/store_lat4096_lon8192.zarr
Check that it worked
directories = s3_fs.ls(f'{bucket}/{fake_data_dir}/{data_path}')
for path in directories:
    try:
        # Attempt to open the Zarr store using xarray
        store = s3fs.S3Map(root=path, s3=s3_fs, check=False)
        ds = xr.open_zarr(store)
    except Exception as e:
        # Print an error message if unable to open the Zarr store
        print(f"Could not open {path} as a Zarr store. Error: {e}")
directories
['nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat1024_lon2048.zarr', 'nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat1448_lon2896.zarr', 'nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat2048_lon4096.zarr', 'nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat2896_lon5792.zarr', 'nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat4096_lon8192.zarr', 'nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat512_lon1024.zarr', 'nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat724_lon1448.zarr']
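Beyond simply opening each store, we can confirm the single-chunk layout by inspecting the Zarr chunk shape that xarray records in the variable's encoding (the data variable is named "data", as used at the end of this notebook):
for path in directories:
    store = s3fs.S3Map(root=path, s3=s3_fs, check=False)
    ds = xr.open_zarr(store)
    # encoding['chunks'] is the on-disk Zarr chunk shape; for these stores it should equal the array shape.
    print(path, ds["data"].encoding.get("chunks"), ds["data"].shape)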
Fake Data Generation Part 2: Generate data stores with multiple chunks of a fixed size
Part 2 Step 1: Define starting conditions for generating data with the same chunk size and shape, but a varied number of chunks
The following are set as variables so tests can be modified easily for different starting conditions. For example, we might want to test a different target chunk size.
# Define starting conditions
# variable: target size of chunks in mb
target_size = 32
# not variable: used to convert mb to bytes
onemb = 1024 # onemb * onemb = bytes per mb
# number of data values per chunk
data_values_per_chunk = (target_size * onemb * onemb)/8 # 8 bytes for each data value
# since there are half as many latitudes as longitudes, calculate the y dimension to be half the x dimension
ydim = round(np.sqrt(data_values_per_chunk/2))
xdim = 2*ydim
target_chunks = {'time': 1, 'lat': ydim, 'lon': xdim}
print(f"Each dataset will have chunks of the following dimensions {target_chunks}.")
# timesteps are 1 for now
time_steps = 1
# how much do you want the dataset to grow by each iteration
multiple = 2
# how many datasets we want to test
n_multiples = 5
print(f"We will generate {n_multiples} datasets, each being {multiple} times larger.")
data_path = 'with_chunks'
Each dataset will have chunks of the following dimensions {'time': 1, 'lat': 1448, 'lon': 2896}.
We will generate 5 datasets, each being 2 times larger.
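As a quick sanity check on the arithmetic above, one 1 x 1448 x 2896 chunk of 8-byte values comes out just under the 32 MB target:
chunk_mb = (1 * 1448 * 2896 * 8) / (1024 * 1024)
print(f"{chunk_mb:.2f} MB per chunk")  # ~31.99 MB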
Part 2 Step 2: Generate Datastores
# If necessary, remove anything that is there
#!aws s3 rm --recursive s3://{bucket}/{data_path}/
zarr_helpers.generate_multiple_datastores(
    n_multiples,
    xdim,
    ydim,
    f'{bucket}/{fake_data_dir}/{data_path}',
    s3_fs,
    target_chunks
)
Writing to nasa-eodc-data-store/fake_data/with_chunks/store_lat1448_lon2896.zarr
Writing to nasa-eodc-data-store/fake_data/with_chunks/store_lat2048_lon4096.zarr
Writing to nasa-eodc-data-store/fake_data/with_chunks/store_lat2896_lon5792.zarr
Writing to nasa-eodc-data-store/fake_data/with_chunks/store_lat4096_lon8192.zarr
Writing to nasa-eodc-data-store/fake_data/with_chunks/store_lat5793_lon11586.zarr
Part 2 Step 3 (Optional): Check that it worked
# List all items in the directory
directories = s3_fs.ls(f'{bucket}/{fake_data_dir}/{data_path}')
directories
['nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat1448_lon2896.zarr',
'nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat2048_lon4096.zarr',
'nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat2896_lon5792.zarr',
'nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat4096_lon8192.zarr',
'nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat5793_lon11586.zarr']
for path in directories:
    try:
        # Attempt to open the Zarr store using xarray
        store = s3fs.S3Map(root=path, s3=s3_fs, check=False)
        ds = xr.open_zarr(store)
    except Exception as e:
        # Print an error message if unable to open the Zarr store
        print(f"Could not open {path} as a Zarr store. Error: {e}")
Capture datasets
data_paths = ['single_chunk', 'with_chunks']
directories = s3_fs.ls(f'{bucket}/{fake_data_dir}/{data_paths[0]}')
directories.extend(s3_fs.ls(f'{bucket}/{fake_data_dir}/{data_paths[1]}'))
# Write output to json file
datasets = {}
variable = "data"
for directory in directories:
    dataset_id = '_'.join(directory.split('/')[-2:])
    dataset_url = f"s3://{directory}"
    datasets[dataset_id] = {
        "dataset_url": dataset_url,
        "variable": variable
    }
with open("fake-datasets.json", "w") as f:
    f.write(json.dumps(datasets))
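The resulting fake-datasets.json maps each dataset id (the last two path components joined with an underscore) to its S3 URL and variable name. Reading it back shows entries like the one sketched in the comment below:
with open("fake-datasets.json") as f:
    print(json.dumps(json.load(f), indent=2))
# Example entry (for the smallest single-chunk store):
# "single_chunk_store_lat512_lon1024.zarr": {
#   "dataset_url": "s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat512_lon1024.zarr",
#   "variable": "data"
# }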