Skip to content

Commit eccb5b3

Browse files
PaulZhang12 authored and facebook-github-bot committed
Fix inference output dtype, use FP16 (#2537)
Summary: Pull Request resolved: #2537 This diff supports specifying the output type of the TBE during model processing in AIMP with TorchRec eager mode. Part of the ~30% QPS gain optimization for SNN on APS. Reviewed By: ZhengkaiZ Differential Revision: D65445160 fbshipit-source-id: d16226c1856486916e83192fb79730641f70fc7c
1 parent 936998d commit eccb5b3

File tree

1 file changed

+1
-2
lines changed

1 file changed

+1
-2
lines changed

torchrec/inference/modules.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,6 @@ def _quantize_fp_module(
418418
model: torch.nn.Module,
419419
fp_module: FeatureProcessedEmbeddingBagCollection,
420420
fp_module_fqn: str,
421-
activation_dtype: torch.dtype = torch.float,
422421
weight_dtype: torch.dtype = DEFAULT_QUANTIZATION_DTYPE,
423422
per_fp_table_weight_dtype: Optional[Dict[str, torch.dtype]] = None,
424423
) -> None:
@@ -428,7 +427,7 @@ def _quantize_fp_module(
428427

429428
quant_prep_enable_register_tbes(model, [FeatureProcessedEmbeddingBagCollection])
430429
fp_module.qconfig = QuantConfig(
431-
activation=quant.PlaceholderObserver.with_args(dtype=activation_dtype),
430+
activation=quant.PlaceholderObserver.with_args(dtype=output_dtype),
432431
weight=quant.PlaceholderObserver.with_args(dtype=weight_dtype),
433432
per_table_weight_dtype=per_fp_table_weight_dtype,
434433
)

0 commit comments

Comments (0)