Skip to content

Commit 332b2ec

Browse files
author
Jonathan Makunga
committed
Pretty Print JS TGI Benchmark results
1 parent a2be460 commit 332b2ec

File tree

2 files changed

+43
-5
lines changed


src/sagemaker/serve/builder/jumpstart_builder.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
from sagemaker.serve.utils.local_hardware import _get_nb_instance, _get_ram_usage_mb
3535
from sagemaker.serve.utils.telemetry_logger import _capture_telemetry
3636
from sagemaker.serve.utils.tuning import _serial_benchmark, _concurrent_benchmark, _more_performant, \
37-
_pretty_print_results_tgi
37+
_pretty_print_results_tgi, _pretty_print_results_tgi_js
3838
from sagemaker.serve.utils.types import ModelServer
3939
from sagemaker.base_predictor import PredictorBase
4040
from sagemaker.jumpstart.model import JumpStartModel
@@ -392,7 +392,7 @@ def tune_for_djl_jumpstart(self, max_tuning_duration: int = 1800):
392392
logger.debug(
393393
"Failed to gather any tuning results. "
394394
"Please inspect the stack trace emitted from live logging for more details. "
395-
"Falling back to default serving.properties: %s",
395+
"Falling back to default model environment variable configurations: %s",
396396
self.pysdk_model.env,
397397
)
398398

@@ -517,7 +517,8 @@ def tune_for_tgi_jumpstart(self, max_tuning_duration: int = 1800):
517517
str(e),
518518
)
519519
break
520-
except Exception:
520+
except Exception as e:
521+
logger.exception(e)
521522
logger.exception(
522523
"Deployment unsuccessful with SM_NUM_GPUS: %s. "
523524
"with uncovered exception",
@@ -530,7 +531,7 @@ def tune_for_tgi_jumpstart(self, max_tuning_duration: int = 1800):
530531
"SM_NUM_GPUS": str(best_tuned_combination[1])
531532
})
532533

533-
_pretty_print_results_tgi(benchmark_results)
534+
_pretty_print_results_tgi_js(benchmark_results)
534535
logger.info(
535536
"Model Configuration: %s was most performant with avg latency: %s, "
536537
"p90 latency: %s, average tokens per second: %s, throughput/s: %s, "
@@ -547,7 +548,7 @@ def tune_for_tgi_jumpstart(self, max_tuning_duration: int = 1800):
547548
logger.debug(
548549
"Failed to gather any tuning results. "
549550
"Please inspect the stack trace emitted from live logging for more details. "
550-
"Falling back to default serving.properties: %s",
551+
"Falling back to default model environment variable configurations: %s",
551552
self.pysdk_model.env,
552553
)
553554

src/sagemaker/serve/utils/tuning.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,43 @@ def _pretty_print_results_tgi(results: dict):
9898
)
9999

100100

101+
def _pretty_print_results_tgi_js(results: dict):
    """Log a table of TGI JumpStart tuning benchmark results at INFO level.

    Args:
        results (dict): Maps average serial latency to a sequence of
            benchmark data for one tested configuration, laid out as:
            index 0 — env-var dict containing "SM_NUM_GPUS",
            index 1 — p90 latency (serial),
            index 2 — average tokens per second (serial),
            index 3 — throughput per second (concurrent),
            index 4 — standard deviation of response (concurrent).
            (Layout inferred from the indexing below — confirm against the
            caller that assembles ``benchmark_results``.)

    The rows are sorted ascending by the dict key (average latency), so the
    most performant configuration appears first, and the table is rendered
    with pandas between banner lines.
    """
    avg_latencies = []
    sm_num_gpus = []
    p90s = []
    avg_tokens_per_seconds = []
    throughput_per_seconds = []
    standard_deviations = []
    # Sort by key (average serial latency) so the fastest config is listed first.
    ordered = collections.OrderedDict(sorted(results.items()))

    for key, value in ordered.items():
        avg_latencies.append(key)
        sm_num_gpus.append(value[0]["SM_NUM_GPUS"])
        p90s.append(value[1])
        avg_tokens_per_seconds.append(value[2])
        throughput_per_seconds.append(value[3])
        standard_deviations.append(value[4])

    df = pd.DataFrame(
        {
            "AverageLatency (Serial)": avg_latencies,
            "P90_Latency (Serial)": p90s,
            "AverageTokensPerSecond (Serial)": avg_tokens_per_seconds,
            "ThroughputPerSecond (Concurrent)": throughput_per_seconds,
            "StandardDeviationResponse (Concurrent)": standard_deviations,
            "SM_NUM_GPUS": sm_num_gpus,
        }
    )
    # Lazy %-style args so the (potentially large) table is only rendered
    # when INFO logging is enabled.
    logger.info(
        "\n================================================================== Benchmark "
        "Results ==================================================================\n%s"
        "\n============================================================================"
        "===========================================================================\n",
        df.to_string(),
    )
136+
137+
101138
def _tokens_per_second(generated_text: str, max_token_length: int, latency: float) -> int:
102139
"""Placeholder docstring"""
103140
est_tokens = (_tokens_from_chars(generated_text) + _tokens_from_words(generated_text)) / 2

0 commit comments

Comments
 (0)