import sys
import hvplot.pandas # noqa
import pandas as pd
"..")
sys.path.append(import subprocess
from datetime import datetime
from processing import process_locust_results
from tile import generate_locust_urls
Comparing titiler-cmr usage against AWS Lambda concurrency limits
This notebook compares tile generation using titiler-cmr on AWS Lambda under different Lambda concurrency limits.
Specify file containing results from VEDA JupyterHub tests
We’ll run the tests twice, first with the default unreserved concurrency limits and second with a reserved concurrency limit of 50.
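Reserved concurrency is applied to the Lambda function itself between the two runs. As a rough sketch (the function name below is a placeholder, not the actual titiler-cmr deployment), it can be set and removed with the AWS CLI:

# Hypothetical example: cap the titiler-cmr Lambda at 50 concurrent executions
# before the second test run. The function name is a placeholder.
function_name = "titiler-cmr-dev"  # placeholder
subprocess.run(
    [
        "aws", "lambda", "put-function-concurrency",
        "--function-name", function_name,
        "--reserved-concurrent-executions", "50",
    ]
)
# To restore the default (shared account-level) limit afterwards:
# subprocess.run(["aws", "lambda", "delete-function-concurrency", "--function-name", function_name])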
# Define URIs for results from running tile generation tests on the VEDA Hub
gpm_imerg = {
    "uri": "s3://nasa-eodc-data-store/test-results/20240816190529_CMRTileTest_C2723754850-GES_DISC.json",
    "kwargs": {"rescale": "0,455", "colormap_name": "coolwarm", "output_format": "png"},
}
mur_sst = {
    "uri": "s3://nasa-eodc-data-store/test-results/20240816191458_CMRTileTest_C1996881146-POCLOUD.json",
    "kwargs": {
        "rescale": "271,305",
        "colormap_name": "coolwarm",
        "output_format": "png",
    },
}

test_id_suffix = "_reserved_concurrency"
for ds in [gpm_imerg, mur_sst]:
    ds["test_id"] = ds["uri"].split("/")[-1].split(".")[0]
    ds["urls_output_file"] = f"urls/{ds['test_id']}{test_id_suffix}.csv"
    ds["results_output"] = f"results/{ds['test_id']}{test_id_suffix}"
    ds["df"] = generate_locust_urls(
        ds["uri"], ds["urls_output_file"], **ds["kwargs"], subset=slice(0, 50)
    )
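As a quick sanity check (a sketch only; the exact column layout depends on generate_locust_urls), the generated URL files can be previewed with pandas:

# Preview the first few tile URLs written for GPM IMERG.
# The columns depend on generate_locust_urls, so this is inspection only.
pd.read_csv(gpm_imerg["urls_output_file"]).head()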
Use locust to time tile generation
for ds in [gpm_imerg, mur_sst]:
    command = [
        "locust",
        "-f",
        "locust_titiler_cmr.py",
        "--headless",
        "--users",
        "100",
        "--iterations",
        "100",
        "--csv",
        ds["results_output"],
        "--urls-file",
        ds["urls_output_file"],
        "--csv-full-history",
        "--host",
        "https://dev-titiler-cmr.delta-backend.com",
    ]
    subprocess.run(command)
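locust_titiler_cmr.py lives alongside this notebook and is not shown here. A minimal sketch of what such a locustfile could look like, assuming the custom --urls-file option is registered through Locust's command-line parser hook, is:

# Hypothetical sketch of locust_titiler_cmr.py; the real file in this repo may differ.
import csv
import random

from locust import HttpUser, events, task


@events.init_command_line_parser.add_listener
def add_urls_file_arg(parser):
    # Registers the custom --urls-file option passed in the command above.
    parser.add_argument("--urls-file", type=str, help="CSV of tile URLs to request")


class TitilerCmrUser(HttpUser):
    def on_start(self):
        # Load the pre-generated tile URLs once per simulated user.
        with open(self.environment.parsed_options.urls_file) as f:
            self.urls = [row[0] for row in csv.reader(f) if row]

    @task
    def request_tile(self):
        # Request a random tile; name= groups Locust's stats by URL in the CSV output.
        url = random.choice(self.urls)
        self.client.get(url, name=url)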
Process locust results
for ds in [gpm_imerg, mur_sst]:
    df_reserved = process_locust_results(
        f"results/{ds['test_id']}_reserved_concurrency",
        run_id="reserved",
        split_aggregated=False,
    )
    df_unreserved = process_locust_results(
        f"results/{ds['test_id']}_unreserved_concurrency",
        run_id="unreserved",
        split_aggregated=False,
    )
    combined_df = pd.concat([df_reserved, df_unreserved], axis=0).reset_index()
    columns = [
        "Average Response Time",
        "Median Response Time",
        "Min Response Time",
        "Max Response Time",
    ]
    for ind, c in enumerate(columns):
        combined_df[f"{c} (s)"] = combined_df[c] * 1e-3
        columns[ind] = f"{c} (s)"
    combined_df["concept_id"] = combined_df.apply(
        lambda x: x["url"].split("?")[1].split("&")[0].split("=")[1], axis=1
    )
    combined_df.to_csv(f"{ds['results_output']}_combined.csv")
    ds["df"] = combined_df[
        [
            "url",
            "method",
            "tile",
            "zoom",
            "run_id",
            "concept_id",
            "Request Count",
            "Failure Count",
            *columns,
            "Average Content Size",
        ]
    ]
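process_locust_results is defined in processing.py and isn't reproduced in this notebook. A rough, hypothetical approximation, assuming it reads the <prefix>_stats.csv file Locust writes for --csv and tags rows with a run identifier (the split_aggregated handling is omitted here), might be:

def process_locust_results_sketch(prefix: str, run_id: str) -> pd.DataFrame:
    # Locust's --csv <prefix> option writes <prefix>_stats.csv with one row per
    # requested URL plus an "Aggregated" total row.
    df = pd.read_csv(f"{prefix}_stats.csv")
    df = df[df["Name"] != "Aggregated"].rename(columns={"Name": "url"})
    df["run_id"] = run_id
    # The real helper presumably also derives columns such as "tile", "zoom",
    # and "method" from each URL before returning the DataFrame.
    return df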
Upload results to S3
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
results_store = (
    f"s3://nasa-eodc-data-store/tile-benchmarking-results/{current_datetime}/"
)
# Upload the result CSVs for both datasets rather than relying on the leftover
# loop variable from the previous cell.
for ds in [gpm_imerg, mur_sst]:
    subprocess.run(
        [
            "aws",
            "s3",
            "cp",
            "results/",
            results_store,
            "--recursive",
            "--exclude",
            "*",
            "--include",
            f"{ds['test_id']}{test_id_suffix}*.csv",
        ]
    )
Plot results
Show failure rates for unreserved concurrency (~1000) versus reserved concurrency (50) for GPM IMERG
These results show that, for a simulation of 100 users each requesting one tile at a time, roughly half of the requests fail when the AWS Lambda function is configured with a reserved concurrency of 50. By default, AWS Lambda allows 1,000 concurrent executions shared across all functions in an account in an AWS region, although a quota increase can be requested. This concurrency limit bounds the scalability of titiler-cmr.
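The current account-level limit can be confirmed with the Lambda API, for example (a sketch; requires AWS credentials with permission to call GetAccountSettings):

# Print the account-wide concurrent execution limit that unreserved functions share.
subprocess.run(["aws", "lambda", "get-account-settings"])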
summary_df = (
    gpm_imerg["df"].groupby(["run_id"])[["Request Count", "Failure Count"]].agg("sum")
)
summary_df["Failure Rate (%)"] = (
    summary_df["Failure Count"] / summary_df["Request Count"] * 100
)
summary_df.hvplot.bar(
    x="run_id",
    y="Failure Rate (%)",
    xlabel="Lambda concurrency setting",
    ylim=(0, 100),
    title="GPM IMERG",
)
Show failure rates for unreserved concurrency (~1000) versus reserved concurrency (50) for MUR SST
summary_df = (
    mur_sst["df"].groupby(["run_id"])[["Request Count", "Failure Count"]].agg("sum")
)
summary_df["Failure Rate (%)"] = (
    summary_df["Failure Count"] / summary_df["Request Count"] * 100
)
summary_df.hvplot.bar(
    x="run_id",
    y="Failure Rate (%)",
    xlabel="Lambda concurrency setting",
    ylim=(0, 100),
    title="MUR SST",
)