 import logging
 from enum import Enum, auto
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import torch
 from torch._decomp import register_decomposition
@@ -423,6 +423,137 @@ def instance_norm_decomposition(
     )
 
 
+@register_torch_trt_decomposition(
+    aten.scaled_dot_product_attention, registry=TORCH_TRT_DECOMPOSITIONS
+)
+def scaled_dot_product_attention_decomposition(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attn_mask: Optional[torch.Tensor] = None,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
+    *,
+    scale: Optional[float] = None,
+    enable_gqa: bool = False,
+) -> torch.Tensor:
+    # Note: dropout_p is ignored; this decomposition targets inference.
+    L, S = query.size(-2), key.size(-2)
+    device = query.device
+    attn_bias = torch.zeros(L, S, dtype=query.dtype, device=device)
+
+    if is_causal:
+        assert attn_mask is None, "attn_mask must be None when is_causal=True"
+        temp_mask = torch.ones(L, S, dtype=torch.bool, device=device).tril(diagonal=0)
+        attn_bias = attn_bias.masked_fill(temp_mask.logical_not(), float("-inf"))
+
+    if attn_mask is not None:
+        if attn_mask.dtype == torch.bool:
+            attn_bias = attn_bias.masked_fill(attn_mask.logical_not(), float("-inf"))
+        else:
+            attn_bias = attn_mask + attn_bias
+
+    if enable_gqa:
+        # Grouped-query attention: repeat key/value heads to match query heads.
+        key = key.repeat_interleave(query.size(-3) // key.size(-3), -3)
+        value = value.repeat_interleave(query.size(-3) // value.size(-3), -3)
+
+    attn_weight = query @ key.transpose(-2, -1)
+
+    if scale is None:
+        # Default scaling: divide the scores by sqrt(E), E being the head dimension.
+        scale = torch.sqrt(torch.scalar_tensor(query.size(-1), dtype=torch.int))
+        attn_weight = attn_weight / scale
+    else:
+        attn_weight = attn_weight * scale
+
+    attn_weight = attn_weight + attn_bias
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    return attn_weight @ value
+
+
+@register_torch_trt_decomposition(
+    aten._scaled_dot_product_flash_attention, registry=TORCH_TRT_DECOMPOSITIONS
+)
+def scaled_dot_product_flash_attention_decomposition(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
+    return_debug_mask: bool = False,
+    *,
+    scale: Optional[float] = None,
+) -> Tuple[
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.SymInt,
+    torch.SymInt,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+]:
+    attn = scaled_dot_product_attention_decomposition(
+        query, key, value, None, dropout_p, is_causal, scale=scale
+    )
+    # Only the attention output is computed; the remaining outputs (logsumexp,
+    # seq lengths, RNG state, debug mask) are placeholders matching the ATen signature.
+    return attn, None, None, None, 0, 0, None, None, None
+
+
+@register_torch_trt_decomposition(
+    aten._scaled_dot_product_efficient_attention, registry=TORCH_TRT_DECOMPOSITIONS
+)
+def scaled_dot_product_efficient_attention_decomposition(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attn_bias: Optional[torch.Tensor],
+    compute_log_sumexp: bool,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
+    *,
+    scale: Optional[float] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    attn = scaled_dot_product_attention_decomposition(
+        query, key, value, attn_bias, dropout_p, is_causal, scale=scale
+    )
+    return attn, None, None, None
+
+
+@register_torch_trt_decomposition(
+    aten._scaled_dot_product_cudnn_attention, registry=TORCH_TRT_DECOMPOSITIONS
+)
+def scaled_dot_product_cudnn_attention_decomposition(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attn_bias: Optional[torch.Tensor],
+    compute_log_sumexp: bool,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
+    return_debug_mask: bool = False,
+    *,
+    scale: Optional[float] = None,
+) -> Tuple[
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.SymInt,
+    torch.SymInt,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+]:
+    attn = scaled_dot_product_attention_decomposition(
+        query, key, value, attn_bias, dropout_p, is_causal, scale=scale
+    )
+    return attn, None, None, None, 0, 0, None, None, None
+
+
 def get_decompositions(
     enable_experimental_decompositions: bool = False,
 ) -> Dict[OpOverload, Callable[[Any], Any]]:
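Once registered in TORCH_TRT_DECOMPOSITIONS, the new decompositions are returned by get_decompositions() and applied when Dynamo lowers a graph for TensorRT. A rough illustration of checking the registry (the import path and overload key are assumptions):

    import torch
    from torch_tensorrt.dynamo.lowering import get_decompositions  # path assumed

    decomps = get_decompositions()
    # The registry is keyed by ATen op overloads (key name assumed)
    assert torch.ops.aten.scaled_dot_product_attention.default in decomps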