Skip to content

Commit 442d2ab

Browse files
Hakon-Buggejfvogel
authored and committed
IB/mlx5: Disable BME on panic
There are two reasons for disabling BME on an RDMA capable device on a panic. The first one is Exadata's FNDD logic, the other is quite simple; we do not want a vmcore to be generated whilst there are incoming RDMA Write/Atomics, modifying the image while it is generated. Exadata has implemented Fast Node Death Detection (FNDD) by means of posting RDMA operation between nodes. Now, since the RDMA responder is handled by the remote HCA, the RDMA requests will be satisfied, even though the host OS is crashing, generating vmcore, or other. It will do so until the PCIe PRST signal (PCI Reset) has been raised. Exadata has tried to circumvent this situation by having a process at the target, regularly incrementing a variable in the MR used by FNDD, then performing RDMA Read to read the variable and make sure it increments. This mechanism has proven to give too many false negatives, as the process incrementing the variable may be suspended for several seconds, due to a very high number of processes. Hence, the idea is to revoke the HCA's ability to perform host memory accesses, by simply resetting the Bus Master Enable (BME) bit, when the host OS panics or reboots. Here is an excerpt of the PCI Express specification: <quote> Bus Master Enable - Controls the ability of a PCI Express Endpoint to issue Memory and I/O Read/Write Requests, and the ability of a Root or Switch Port to forward Memory and I/O Read/Write Requests in the Upstream direction Endpoints: When this bit is Set, the PCI Express Function is allowed to issue Memory or I/O Requests. When this bit is Clear, the PCI Express Function is not allowed to issue any Memory or I/O Requests. Note that as MSI/MSI-X interrupt Messages are in-band memory writes, setting the Bus Master Enable bit to 0b disables MSI/MSI-X interrupt Messages as well. Requests other than Memory or I/O Requests are not controlled by this bit. Default value of this bit is 0b. 
This bit is hardwired to 0b if a Function does not generate Memory or I/O Requests. </quote> To accommodate Exadata's requirement here, we install a panic-notifier that, when invoked, revokes BME for the function. Orabug: 31556128 UEK5 => UEK6 (cherry picked from commit 6baf337) cherry-pick-repo=UEK/production/linux-uek.git Conflicts: drivers/infiniband/hw/mlx5/mlx5_ib.h drivers/infiniband/hw/mlx5/main.c * The conflict arose because in UEK5, the device struct * contained an array of mlx5_roce structs, whereas in UEK6, the * mlx5_roce struct is subordinate to the device's port * array. Signed-off-by: Håkon Bugge <[email protected]> Reviewed-by: Sharath Srinivasan <[email protected]> Signed-off-by: Aron Silverton <[email protected]> Orabug: 31556129 UEK6 => LUCI (cherry picked from commit 363dada) cherry-pick-repo=UEK/production/linux-uek.git Conflicts: drivers/infiniband/hw/mlx5/main.c Conflicts due to refactored mlx5_ib_roce_init/cleanup(). Also fix compilation issue by including <linux/notifier.h>. Signed-off-by: Sharath Srinivasan <[email protected]> Reviewed-by: Qing Huang <[email protected]> Signed-off-by: Aron Silverton <[email protected]>
1 parent a8c4720 commit 442d2ab

File tree

2 files changed

+89
-0
lines changed

2 files changed

+89
-0
lines changed

drivers/infiniband/hw/mlx5/main.c

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
#include <rdma/mlx5_user_ioctl_verbs.h>
4848
#include <rdma/mlx5_user_ioctl_cmds.h>
4949
#include <rdma/ib_umem_odp.h>
50+
#include <linux/panic_notifier.h>
5051

5152
#define UVERBS_MODULE_NAME mlx5_ib
5253
#include <rdma/uverbs_named_ioctl.h>
@@ -3038,6 +3039,75 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
30383039
}
30393040
}
30403041

3042+
#ifndef WITHOUT_ORACLE_EXTENSIONS
3043+
static int mlx5_ib_panic_handler(struct notifier_block *nb,
3044+
unsigned long action, void *unused)
3045+
{
3046+
struct mlx5_ib_dev *dev;
3047+
3048+
list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) {
3049+
struct pci_dev *pdev = dev->mdev->pdev;
3050+
3051+
pci_crit(pdev, "Revoke Bus_Mastership_Enable (BME)\n");
3052+
pci_clear_master(pdev);
3053+
}
3054+
3055+
return NOTIFY_DONE;
3056+
}
3057+
3058+
static bool panic_handler_added;
3059+
static int mlx5_ib_add_panic_notifier(struct mlx5_ib_dev *dev, u8 port_num)
3060+
{
3061+
int err;
3062+
3063+
/* We only need to add the handler once, as it iterates
3064+
* through all the devices
3065+
*/
3066+
if (panic_handler_added)
3067+
return 0;
3068+
3069+
dev->port[port_num].roce.panic_nb.notifier_call = mlx5_ib_panic_handler;
3070+
err = atomic_notifier_chain_register(&panic_notifier_list,
3071+
&dev->port[port_num].roce.panic_nb);
3072+
if (err) {
3073+
pci_crit(dev->mdev->pdev,
3074+
"Failed registering panic handler for port %d, err %d\n",
3075+
port_num + 1, err);
3076+
dev->port[port_num].roce.panic_nb.notifier_call = NULL;
3077+
return err;
3078+
}
3079+
3080+
pci_crit(dev->mdev->pdev,
3081+
"Successfully registered panic handler for port %d\n", port_num + 1);
3082+
3083+
panic_handler_added = true;
3084+
3085+
return 0;
3086+
}
3087+
3088+
/*
 * Remove the panic handler from panic_notifier_list if this port's
 * notifier block is the one that was registered.
 *
 * A non-NULL notifier_call marks the block as the registered one; ports
 * that never registered are skipped silently. On successful unregistration
 * the global panic_handler_added flag is cleared so that a later
 * mlx5_ib_add_panic_notifier() call can register again. The block's
 * notifier_call is reset to NULL whether or not unregistration succeeded.
 *
 * NOTE(review): this drops the handler even when other devices remain on
 * mlx5_ib_dev_list; they stay uncovered until another add call registers a
 * new handler - confirm this is acceptable.
 *
 * @dev:      device owning the port notifier block
 * @port_num: zero-based port index into dev->port[]; logged as port_num + 1
 */
static void mlx5_ib_remove_panic_notifier(struct mlx5_ib_dev *dev, u8 port_num)
{
	struct notifier_block *panic_nb = &dev->port[port_num].roce.panic_nb;
	int err;

	if (!panic_nb->notifier_call)
		return;

	err = atomic_notifier_chain_unregister(&panic_notifier_list, panic_nb);
	if (err) {
		pci_crit(dev->mdev->pdev,
			 "Unregistering the panic handler for port %d failed, err %d\n",
			 port_num + 1, err);
	} else {
		/* Terminate with '\n' like every other message here. */
		pci_crit(dev->mdev->pdev,
			 "Successfully unregistered panic handler for port %d\n",
			 port_num + 1);
		panic_handler_added = false;
	}

	panic_nb->notifier_call = NULL;
}
3109+
#endif /* !WITHOUT_ORACLE_EXTENSIONS */
3110+
30413111
static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u32 port_num)
30423112
{
30433113
int err;
@@ -3957,12 +4027,25 @@ static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev)
39574027
if (err)
39584028
return err;
39594029

4030+
#ifndef WITHOUT_ORACLE_EXTENSIONS
4031+
err = mlx5_ib_add_panic_notifier(dev, port_num);
4032+
if (err)
4033+
goto cleanup;
4034+
#endif /* !WITHOUT_ORACLE_EXTENSIONS */
39604035
err = mlx5_enable_eth(dev);
39614036
if (err)
4037+
#ifndef WITHOUT_ORACLE_EXTENSIONS
4038+
goto err_unregister_panic_handler;
4039+
#else /* !WITHOUT_ORACLE_EXTENSIONS */
39624040
goto cleanup;
4041+
#endif /* !WITHOUT_ORACLE_EXTENSIONS */
39634042
}
39644043

39654044
return 0;
4045+
#ifndef WITHOUT_ORACLE_EXTENSIONS
4046+
err_unregister_panic_handler:
4047+
mlx5_ib_remove_panic_notifier(dev, port_num);
4048+
#endif /* !WITHOUT_ORACLE_EXTENSIONS */
39664049
cleanup:
39674050
mlx5_remove_netdev_notifier(dev, port_num);
39684051
return err;
@@ -3982,6 +4065,9 @@ static void mlx5_ib_roce_cleanup(struct mlx5_ib_dev *dev)
39824065
mlx5_disable_eth(dev);
39834066

39844067
port_num = mlx5_core_native_port_num(dev->mdev) - 1;
4068+
#ifndef WITHOUT_ORACLE_EXTENSIONS
4069+
mlx5_ib_remove_panic_notifier(dev, port_num);
4070+
#endif /* !WITHOUT_ORACLE_EXTENSIONS */
39854071
mlx5_remove_netdev_notifier(dev, port_num);
39864072
}
39874073
}

drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -824,6 +824,9 @@ struct mlx5_roce {
824824
rwlock_t netdev_lock;
825825
struct net_device *netdev;
826826
struct notifier_block nb;
827+
#ifndef WITHOUT_ORACLE_EXTENSIONS
828+
struct notifier_block panic_nb;
829+
#endif /* !WITHOUT_ORACLE_EXTENSIONS */
827830
atomic_t tx_port_affinity;
828831
enum ib_port_state last_port_state;
829832
struct mlx5_ib_dev *dev;

0 commit comments

Comments
 (0)