documentation: Add dynamic model tables

Raghav Dhall · Raghav Dhall · commit 468dadd7d5f9 · 2022-08-22T14:09:40.000Z
diff --git a/doc/doc_utils/jumpstart_doc_utils.py b/doc/doc_utils/jumpstart_doc_utils.py
@@ -57,6 +57,19 @@ class ProblemTypes(str, Enum):
     TABULAR_REGRESSION = "Regression"
     TABULAR_CLASSIFICATION = "Classification"
 
+class Frameworks(str, Enum):
+    """Framework/source labels for JumpStart models; values match the keys of TO_FRAMEWORK."""
+
+    TENSORFLOW = "Tensorflow Hub"
+    PYTORCH = "Pytorch Hub"
+    HUGGINGFACE = "HuggingFace"
+    CATBOOST = "Catboost"
+    GLUONCV = "GluonCV"
+    LIGHTGBM = "LightGBM"
+    XGBOOST = "XGBoost"
+    SCIKIT_LEARN = "ScikitLearn"
+    SOURCE = "Source"
+
 
 JUMPSTART_REGION = "eu-west-2"
 SDK_MANIFEST_FILE = "models_manifest.json"
@@ -82,6 +95,40 @@ class ProblemTypes(str, Enum):
     Tasks.TABULAR_CLASSIFICATION: ProblemTypes.TABULAR_CLASSIFICATION,
 }
 
+TO_FRAMEWORK = {  # source label (as produced by get_model_source) -> Frameworks member
+    "Tensorflow Hub": Frameworks.TENSORFLOW,
+    "Pytorch Hub": Frameworks.PYTORCH,
+    "HuggingFace": Frameworks.HUGGINGFACE,
+    "Catboost": Frameworks.CATBOOST,
+    "GluonCV": Frameworks.GLUONCV,
+    "LightGBM": Frameworks.LIGHTGBM,
+    "XGBoost": Frameworks.XGBOOST,
+    "ScikitLearn": Frameworks.SCIKIT_LEARN,
+    "Source": Frameworks.SOURCE
+}
+
+
+MODALITY_MAP = {  # (task, framework) -> RST file that the matching models' table rows are appended to
+    (Tasks.IC, Frameworks.PYTORCH): "algorithms/vision/image_classification_pytorch.rst",
+    (Tasks.IC, Frameworks.TENSORFLOW): "algorithms/vision/image_classification_tensorflow.rst",
+    (Tasks.IC_EMBEDDING, Frameworks.TENSORFLOW): "algorithms/vision/image_embedding_tensorflow.rst",
+    (Tasks.IS, Frameworks.GLUONCV): "algorithms/vision/instance_segmentation_mxnet.rst",
+    (Tasks.OD, Frameworks.GLUONCV): "algorithms/vision/object_detection_mxnet.rst",
+    (Tasks.OD, Frameworks.PYTORCH): "algorithms/vision/object_detection_pytorch.rst",
+    (Tasks.OD, Frameworks.TENSORFLOW): "algorithms/vision/object_detection_tensorflow.rst",
+    (Tasks.SEMSEG, Frameworks.GLUONCV): "algorithms/vision/semantic_segmentation_mxnet.rst",
+    (Tasks.TRANSLATION, Frameworks.HUGGINGFACE): "algorithms/text/machine_translation_hugging_face.rst",
+    (Tasks.NER, Frameworks.HUGGINGFACE): "algorithms/text/named_entity_recognition_hugging_face.rst",
+    (Tasks.EQA, Frameworks.PYTORCH): "algorithms/text/question_answering_pytorch.rst",
+    (Tasks.SPC, Frameworks.HUGGINGFACE): "algorithms/text/sentence_pair_classification_hugging_face.rst",
+    (Tasks.SPC, Frameworks.TENSORFLOW): "algorithms/text/sentence_pair_classification_tensorflow.rst",
+    (Tasks.TC, Frameworks.TENSORFLOW): "algorithms/text/text_classification_tensorflow.rst",
+    (Tasks.TC_EMBEDDING, Frameworks.GLUONCV): "algorithms/text/text_embedding_tensorflow_mxnet.rst",
+    (Tasks.TC_EMBEDDING, Frameworks.TENSORFLOW): "algorithms/text/text_embedding_tensorflow_mxnet.rst",
+    (Tasks.TEXT_GENERATION, Frameworks.HUGGINGFACE): "algorithms/text/text_generation_hugging_face.rst",
+    (Tasks.SUMMARIZATION, Frameworks.HUGGINGFACE): "algorithms/text/text_summarization_hugging_face.rst",
+}
+
 
 def get_jumpstart_sdk_manifest():
     url = "{}/{}".format(JUMPSTART_BUCKET_BASE_URL, SDK_MANIFEST_FILE)
@@ -102,6 +149,10 @@ def get_model_task(id):
     return TASK_MAP[task_short] if task_short in TASK_MAP else "Source"
 
 
+def get_string_model_task(id):
+    return id.split("-")[1]  # second hyphen-separated token of the model ID; used as the task half of MODALITY_MAP keys
+
+
 def get_model_source(url):
     if "tfhub" in url:
         return "Tensorflow Hub"
@@ -113,8 +164,6 @@ def get_model_source(url):
         return "Catboost"
     if "gluon" in url:
         return "GluonCV"
-    if "catboost" in url:
-        return "Catboost"
     if "lightgbm" in url:
         return "LightGBM"
     if "xgboost" in url:
@@ -138,58 +187,94 @@ def create_jumpstart_model_table():
             ) < Version(model["version"]):
                 sdk_manifest_top_versions_for_models[model["model_id"]] = model
 
-    file_content = []
+    file_content_intro = []
 
-    file_content.append(".. _all-pretrained-models:\n\n")
-    file_content.append(".. |external-link| raw:: html\n\n")
-    file_content.append('   <i class="fa fa-external-link"></i>\n\n')
+    file_content_intro.append(".. _all-pretrained-models:\n\n")
+    file_content_intro.append(".. |external-link| raw:: html\n\n")
+    file_content_intro.append('   <i class="fa fa-external-link"></i>\n\n')
 
-    file_content.append("================================================\n")
-    file_content.append("Built-in Algorithms with pre-trained Model Table\n")
-    file_content.append("================================================\n")
-    file_content.append(
+    file_content_intro.append("================================================\n")
+    file_content_intro.append("Built-in Algorithms with pre-trained Model Table\n")
+    file_content_intro.append("================================================\n")
+    file_content_intro.append(
         """
     The SageMaker Python SDK uses model IDs and model versions to access the necessary
     utilities for pre-trained models. This table serves to provide the core material plus
     some extra information that can be useful in selecting the correct model ID and
     corresponding parameters.\n"""
     )
-    file_content.append(
+    file_content_intro.append(
         """
     If you want to automatically use the latest version of the model, use "*" for the `model_version` attribute.
     We highly suggest pinning an exact model version however.\n"""
     )
-    file_content.append(
+    file_content_intro.append(
         """
     These models are also available through the
     `JumpStart UI in SageMaker Studio <https://docs.aws.amazon.com/sagemaker/latest/dg/studio-jumpstart.html>`__\n"""
     )
-    file_content.append("\n")
-    file_content.append(".. list-table:: Available Models\n")
-    file_content.append("   :widths: 50 20 20 20 30 20\n")
-    file_content.append("   :header-rows: 1\n")
-    file_content.append("   :class: datatable\n")
-    file_content.append("\n")
-    file_content.append("   * - Model ID\n")
-    file_content.append("     - Fine Tunable?\n")
-    file_content.append("     - Latest Version\n")
-    file_content.append("     - Min SDK Version\n")
-    file_content.append("     - Problem Type\n")
-    file_content.append("     - Source\n")
+    file_content_intro.append("\n")
+    file_content_intro.append(".. list-table:: Available Models\n")
+    file_content_intro.append("   :widths: 50 20 20 20 30 20\n")
+    file_content_intro.append("   :header-rows: 1\n")
+    file_content_intro.append("   :class: datatable\n")
+    file_content_intro.append("\n")
+    file_content_intro.append("   * - Model ID\n")
+    file_content_intro.append("     - Fine Tunable?\n")
+    file_content_intro.append("     - Latest Version\n")
+    file_content_intro.append("     - Min SDK Version\n")
+    file_content_intro.append("     - Problem Type\n")
+    file_content_intro.append("     - Source\n")
+
+    dynamic_table_files = []
+    file_content_entries = []
 
     for model in sdk_manifest_top_versions_for_models.values():
         model_spec = get_jumpstart_sdk_spec(model["spec_key"])
         model_task = get_model_task(model_spec["model_id"])
+        string_model_task = get_string_model_task(model_spec["model_id"])
         model_source = get_model_source(model_spec["url"])
-        file_content.append("   * - {}\n".format(model_spec["model_id"]))
-        file_content.append("     - {}\n".format(model_spec["training_supported"]))
-        file_content.append("     - {}\n".format(model["version"]))
-        file_content.append("     - {}\n".format(model["min_version"]))
-        file_content.append("     - {}\n".format(model_task))
-        file_content.append(
+        file_content_entries.append("   * - {}\n".format(model_spec["model_id"]))
+        file_content_entries.append("     - {}\n".format(model_spec["training_supported"]))
+        file_content_entries.append("     - {}\n".format(model["version"]))
+        file_content_entries.append("     - {}\n".format(model["min_version"]))
+        file_content_entries.append("     - {}\n".format(model_task))
+        file_content_entries.append(
             "     - `{} <{}>`__ |external-link|\n".format(model_source, model_spec["url"])
         )
 
-    f = open("doc_utils/pretrainedmodels.rst", "w")
-    f.writelines(file_content)
+        if (string_model_task, TO_FRAMEWORK[model_source]) in MODALITY_MAP:
+            file_content_single_entry = []
+
+            if MODALITY_MAP[(string_model_task, TO_FRAMEWORK[model_source])] not in dynamic_table_files:
+                file_content_single_entry.append("\n")
+                file_content_single_entry.append(".. list-table:: Available Models\n")
+                file_content_single_entry.append("   :widths: 50 20 20 20 30 20\n")
+                file_content_single_entry.append("   :header-rows: 1\n")
+                file_content_single_entry.append("   :class: datatable\n")
+                file_content_single_entry.append("\n")
+                file_content_single_entry.append("   * - Model ID\n")
+                file_content_single_entry.append("     - Fine Tunable?\n")
+                file_content_single_entry.append("     - Latest Version\n")
+                file_content_single_entry.append("     - Min SDK Version\n")
+                file_content_single_entry.append("     - Problem Type\n")
+                file_content_single_entry.append("     - Source\n")
+                
+                dynamic_table_files.append(MODALITY_MAP[(string_model_task, TO_FRAMEWORK[model_source])])
+
+            file_content_single_entry.append("   * - {}\n".format(model_spec["model_id"]))
+            file_content_single_entry.append("     - {}\n".format(model_spec["training_supported"]))
+            file_content_single_entry.append("     - {}\n".format(model["version"]))
+            file_content_single_entry.append("     - {}\n".format(model["min_version"]))
+            file_content_single_entry.append("     - {}\n".format(model_task))
+            file_content_single_entry.append(
+                "     - `{} <{}>`__ \n".format(model_source, model_spec["url"])
+            )
+            f = open(MODALITY_MAP[(string_model_task, TO_FRAMEWORK[model_source])], "a")
+            f.writelines(file_content_single_entry)
+            f.close()
+
+    f = open("doc_utils/pretrainedmodels.rst", "a")
+    f.writelines(file_content_intro)
+    f.writelines(file_content_entries)
     f.close()