Comparing prod and dev

This notebook compares results between prod and dev titiler deployments. Running end-to-end benchmarks is documented in https://github.com/developmentseed/tile-benchmarking/tree/main/03-e2e/README.md.

# Import libraries
import os
import pandas as pd
import hvplot.pandas  # imported for its side effect: registers the .hvplot accessor on pandas objects
import holoviews as hv
pd.options.plotting.backend = 'holoviews'  # route DataFrame.plot() through hvplot/holoviews
import warnings
warnings.filterwarnings('ignore')  # keep library warnings out of the rendered notebook
import sys
sys.path.append('../helpers/')  # make the local eodc_hub_role helper importable
import eodc_hub_role
# Assume the EODC hub role; presumably this sets AWS credentials in the
# environment so the `aws s3 cp` calls below are authorized — TODO confirm.
credentials = eodc_hub_role.fetch_and_set_credentials()
%%capture
# Replace any stale local copies with the timestamped benchmark-run folders from S3.
# NOTE(review): the two timestamps below pin specific dev/prod runs — update per run.
!rm -rf downloaded_*_results/
!aws s3 cp --recursive s3://nasa-eodc-data-store/tile-benchmarking-results/2023-11-22_17-09-28/ downloaded_dev_results/
!aws s3 cp --recursive s3://nasa-eodc-data-store/tile-benchmarking-results/2023-11-22_17-11-37/ downloaded_prod_results/

Parse the per-dataset stats CSVs downloaded for each environment and merge them into a single DataFrame.

def _load_env_stats(directory_path, suffix="_urls_stats.csv"):
    """Read every Locust stats CSV ('*<suffix>') under ``directory_path``.

    Returns a single DataFrame with a 'file' column (source path) and a
    'dataset' column (file name with the suffix stripped). Raises
    ValueError when no matching files exist, instead of the opaque
    error ``pd.concat`` would raise on an empty list.
    """
    all_files = os.listdir(directory_path)
    matches = [os.path.join(directory_path, f) for f in all_files if f.endswith(suffix)]
    if not matches:
        raise ValueError(f"No '*{suffix}' files found in {directory_path}")
    frames = []
    for path in matches:
        frame = pd.read_csv(path)
        frame['file'] = path
        frames.append(frame)
    # ignore_index: each CSV carries its own 0..n index; without this the
    # combined frame has duplicate index labels.
    combined = pd.concat(frames, ignore_index=True)
    # Derive the dataset name from the file's basename rather than
    # split('/')[1], which breaks on nested directories or non-POSIX paths.
    combined['dataset'] = [os.path.basename(p).replace(suffix, '') for p in combined['file']]
    return combined

def _add_failure_rates(merged, envs=('Prod', 'Dev')):
    """Add a 'Failure Rate <env>' column (percent of failed requests) per env."""
    for env in envs:
        merged[f'Failure Rate {env}'] = (
            merged[f'Failure Count {env}'] / merged[f'Request Count {env}'] * 100
        )
    return merged

results = {'prod': {}, 'dev': {}}
for env in results:
    env_stats = _load_env_stats(f"downloaded_{env}_results/")
    results[env]['all'] = env_stats
    # The "Aggregated" rows summarize stats across all tile endpoints.
    results[env][f'Aggregated {env}'] = env_stats[env_stats['Name'] == 'Aggregated']

prod_df = results['prod']['Aggregated prod']
dev_df = results['dev']['Aggregated dev']

# Align prod and dev on dataset so response times and failure rates compare row-wise.
merged_df = pd.merge(prod_df, dev_df, on='dataset', suffixes=(' Prod', ' Dev'))
merged_df = _add_failure_rates(merged_df)
merged_df[['Median Response Time Prod', 'Failure Rate Prod',
           'Median Response Time Dev', 'Failure Rate Dev',
           'dataset']].sort_values('Median Response Time Dev')
Median Response Time Prod Failure Rate Prod Median Response Time Dev Failure Rate Dev dataset
2 460.0 0.0 200.0 0.0 pr_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc
5 100.0 100.0 210.0 0.0 3B42_Daily.19980101.7.nc4
0 100.0 100.0 240.0 0.0 GLDAS_NOAH025_3H.A20230731.2100.021.nc4
4 500.0 0.0 290.0 0.0 power_901_monthly_meteorology_utc.zarr
3 470.0 0.0 420.0 0.0 combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk....
6 580.0 0.0 440.0 0.0 cmip6-pds_GISS-E2-1-G_historical_tas
1 110.0 100.0 640.0 0.0 3B-DAY.MS.MRG.3IMERG.20000601-S000000-E235959....
7 830.0 0.0 690.0 0.0 20231107090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v...