#!pip install rio-tiler==4.1.11 loguru
Generate fake data
In this notebook, we generate multiple data stores of increasingly fine resolution, so that the total spatial size of each dataset grows by a factor of 2 from one store to the next. This gives us variation in chunk size and number of chunks, so we can understand the relationship between chunk size, chunk count, and tiling performance.
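Concretely, doubling the total spatial size at each step means each dimension grows by a factor of √2, with the longitude dimension kept at twice the latitude dimension. A small illustration (not the helper's actual code) of the resulting grid sizes, which match the store names generated in Part 1 below:
import numpy as np

ydim, xdim = 512, 1024   # starting grid
multiple = 2             # total spatial size doubles each step
for _ in range(7):
    print(f"lat{ydim}_lon{xdim}")
    ydim = round(ydim * np.sqrt(multiple))  # each dimension grows by sqrt(2)
    xdim = 2 * ydim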
Setup 1: Load the necessary libraries
%load_ext autoreload
%autoreload
import json
import xarray as xr
import numpy as np
import os
import s3fs
import sys; sys.path.append('..')
import helpers.eodc_hub_role as eodc_hub_role
import helpers.zarr_helpers as zarr_helpers
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
Setup 2: Set up data storage
Store the data in the fake data directory of the project S3 bucket. Two subdirectories will be written: single_chunk (Part 1) and with_chunks (Part 2).
credentials = eodc_hub_role.fetch_and_set_credentials()
bucket = 'nasa-eodc-data-store'
fake_data_dir = 'test-data/fake-data'
s3_fs = s3fs.S3FileSystem(
    key=credentials['aws_access_key_id'],
    secret=credentials['aws_secret_access_key'],
    token=credentials['aws_session_token'],
    anon=False
)
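A quick optional sanity check that the credentials and filesystem work before writing anything:
# List the target prefix; this may raise FileNotFoundError if nothing has been written there yet.
s3_fs.ls(f'{bucket}/{fake_data_dir}')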
Fake Data Generation Part 1: Generate data stores with a single chunk
These data stores will have varying chunk sizes, since we generate them at varying resolutions with no spatial chunking: each store is written as a single chunk.
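zarr_helpers.generate_multiple_datastores is a small helper in this repository whose implementation is not shown here. As a rough, hypothetical sketch of what writing one single-chunk store could look like (assumed names and values for illustration, not the helper's actual code, and written locally rather than to S3):
import numpy as np
import xarray as xr

ydim, xdim = 512, 1024  # smallest store in this series
ds = xr.Dataset(
    {"data": (("time", "lat", "lon"), np.random.rand(1, ydim, xdim))},
    coords={"time": [0], "lat": np.linspace(-90, 90, ydim), "lon": np.linspace(-180, 180, xdim)},
)
# "No spatial chunking": one chunk spans the entire array, so chunk size == store size.
ds.chunk({"time": 1, "lat": ydim, "lon": xdim}).to_zarr("store_lat512_lon1024.zarr", mode="w")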
# Define starting conditions
time_steps = 1
ydim = 512
xdim = 1024
multiple = 2 # how much do you want the dataset to grow by each iteration
n_multiples = 7
data_path = 'single_chunk'
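Assuming 8-byte (float64) values, as in the Part 2 calculation below, the uncompressed size of each store can be estimated from its dimensions (illustrative only):
# Estimated uncompressed size of the smallest store, assuming float64 (8 bytes per value).
size_mb = (time_steps * ydim * xdim * 8) / (1024 * 1024)
print(f"Smallest store: ~{size_mb:.0f} MB; each subsequent store is ~{multiple}x larger.")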
# If you are updating this data, remove anything that is there
#!aws s3 rm --recursive s3://{bucket}/{data_path}/
# generate and store data
zarr_helpers.generate_multiple_datastores(
    n_multiples,
    xdim,
    ydim,
    f'{bucket}/{fake_data_dir}/{data_path}',
    s3_fs
)
Writing to nasa-eodc-data-store/fake_data/single_chunk/store_lat512_lon1024.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk/store_lat724_lon1448.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk/store_lat1024_lon2048.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk/store_lat1448_lon2896.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk/store_lat2048_lon4096.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk/store_lat2896_lon5792.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk/store_lat4096_lon8192.zarr
Check that it worked
directories = s3_fs.ls(f'{bucket}/{fake_data_dir}/{data_path}')
for path in directories:
    try:
        # Attempt to open the Zarr store using xarray
        store = s3fs.S3Map(root=path, s3=s3_fs, check=False)
        ds = xr.open_zarr(store)
    except Exception as e:
        # Print an error message if unable to open the Zarr store
        print(f"Could not open {path} as a Zarr store. Error: {e}")
directories
['nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat1024_lon2048.zarr', 'nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat1448_lon2896.zarr', 'nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat2048_lon4096.zarr', 'nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat2896_lon5792.zarr', 'nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat4096_lon8192.zarr', 'nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat512_lon1024.zarr', 'nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat724_lon1448.zarr']
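Beyond simply opening each store, we can confirm the single-chunk layout by inspecting the Zarr chunk shape that xarray records in the variable's encoding (the data variable is named "data", as used at the end of this notebook):
for path in directories:
    store = s3fs.S3Map(root=path, s3=s3_fs, check=False)
    ds = xr.open_zarr(store)
    # encoding['chunks'] is the on-disk Zarr chunk shape; for these stores it should equal the array shape.
    print(path, ds["data"].encoding.get("chunks"), ds["data"].shape)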
Fake Data Generation Part 2: Generate data stores with multiple chunks of a fixed size
Part 2 Step 1: Define starting conditions for generating data with the same chunk size and shape, but a varied number of chunks
The following are set as variables so tests can be modified easily for different starting conditions. For example, we might want to test a different target chunk size.
# Define starting conditions
# variable: target size of chunks in mb
target_size = 32
# not variable: used to convert mb to bytes
onemb = 1024 # onemb * onemb = bytes per mb
# number of data values per chunk
data_values_per_chunk = (target_size * onemb * onemb)/8 # 8 bytes for each data value
# since there are half as many latitudes as longitudes, calculate the y dimension to be half the x dimension
ydim = round(np.sqrt(data_values_per_chunk/2))
xdim = 2*ydim
target_chunks = {'time': 1, 'lat': ydim, 'lon': xdim}
print(f"Each dataset will have chunks of the following dimensions {target_chunks}.")
# timesteps are 1 for now
time_steps = 1
# how much do you want the dataset to grow by each iteration
multiple = 2
# how many datasets we want to test
n_multiples = 5
print(f"We will generate {n_multiples} datasets, each being {multiple} times larger.")
data_path = 'with_chunks'
Each dataset will have chunks of the following dimensions {'time': 1, 'lat': 1448, 'lon': 2896}.
We will generate 5 datasets, each being 2 times larger.
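As a quick sanity check on the arithmetic above, one 1 x 1448 x 2896 chunk of 8-byte values comes out just under the 32 MB target:
chunk_mb = (1 * 1448 * 2896 * 8) / (1024 * 1024)
print(f"{chunk_mb:.2f} MB per chunk")  # ~31.99 MB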
Part 2 Step 2: Generate Datastores
# If necessary, remove anything that is there
#!aws s3 rm --recursive s3://{bucket}/{data_path}/
zarr_helpers.generate_multiple_datastores(
    n_multiples,
    xdim,
    ydim,
    f'{bucket}/{fake_data_dir}/{data_path}',
    s3_fs,
    target_chunks
)
Writing to nasa-eodc-data-store/fake_data/with_chunks/store_lat1448_lon2896.zarr
Writing to nasa-eodc-data-store/fake_data/with_chunks/store_lat2048_lon4096.zarr
Writing to nasa-eodc-data-store/fake_data/with_chunks/store_lat2896_lon5792.zarr
Writing to nasa-eodc-data-store/fake_data/with_chunks/store_lat4096_lon8192.zarr
Writing to nasa-eodc-data-store/fake_data/with_chunks/store_lat5793_lon11586.zarr
Part 2 Step 3 (Optional): Check that it worked
# List all items in the directory
directories = s3_fs.ls(f'{bucket}/{fake_data_dir}/{data_path}')
directories
['nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat1448_lon2896.zarr',
'nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat2048_lon4096.zarr',
'nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat2896_lon5792.zarr',
'nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat4096_lon8192.zarr',
'nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat5793_lon11586.zarr']
for path in directories:
    try:
        # Attempt to open the Zarr store using xarray
        store = s3fs.S3Map(root=path, s3=s3_fs, check=False)
        ds = xr.open_zarr(store)
    except Exception as e:
        # Print an error message if unable to open the Zarr store
        print(f"Could not open {path} as a Zarr store. Error: {e}")
Capture datasets
data_paths = ['single_chunk', 'with_chunks']
directories = s3_fs.ls(f'{bucket}/{fake_data_dir}/{data_paths[0]}')
directories.extend(s3_fs.ls(f'{bucket}/{fake_data_dir}/{data_paths[1]}'))
# Write output to json file
datasets = {}
variable = "data"
for directory in directories:
    dataset_id = '_'.join(directory.split('/')[-2:])
    dataset_url = f"s3://{directory}"
    datasets[dataset_id] = {
        "dataset_url": dataset_url,
        "variable": variable
    }
with open("fake-datasets.json", "w") as f:
    f.write(json.dumps(datasets))
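The resulting fake-datasets.json maps each dataset id (the last two path components joined with an underscore) to its S3 URL and variable name. Reading it back shows entries like the one sketched in the comment below:
with open("fake-datasets.json") as f:
    print(json.dumps(json.load(f), indent=2))
# Example entry (for the smallest single-chunk store):
# "single_chunk_store_lat512_lon1024.zarr": {
#   "dataset_url": "s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat512_lon1024.zarr",
#   "variable": "data"
# }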