Skip to content

Commit 363dada

Browse files
Hakon-Buggejfvogel
authored andcommitted
IB/mlx5: Disable BME on panic
There are two reasons for disabling BME on an RDMA-capable device on a panic. The first is Exadata's FNDD logic; the other is quite simple: we do not want a vmcore to be generated while incoming RDMA Writes/Atomics are modifying the image as it is being generated. Exadata has implemented Fast Node Death Detection (FNDD) by means of posting RDMA operations between nodes. Now, since the RDMA responder is handled by the remote HCA, the RDMA requests will be satisfied even though the host OS is crashing, generating a vmcore, or otherwise incapacitated. It will do so until the PCIe PERST# signal (PCI reset) has been asserted. Exadata has tried to circumvent this situation by having a process at the target regularly increment a variable in the MR used by FNDD, and then performing an RDMA Read to read the variable and make sure it increments. This mechanism has proven to give too many false negatives, as the process incrementing the variable may be suspended for several seconds due to a very high number of processes. Hence, the idea is to revoke the HCA's ability to perform host memory accesses, by simply clearing the Bus Master Enable (BME) bit, when the host OS panics or reboots. Here is an excerpt of the PCI Express specification: <quote> Bus Master Enable - Controls the ability of a PCI Express Endpoint to issue Memory and I/O Read/Write Requests, and the ability of a Root or Switch Port to forward Memory and I/O Read/Write Requests in the Upstream direction Endpoints: When this bit is Set, the PCI Express Function is allowed to issue Memory or I/O Requests. When this bit is Clear, the PCI Express Function is not allowed to issue any Memory or I/O Requests. Note that as MSI/MSI-X interrupt Messages are in-band memory writes, setting the Bus Master Enable bit to 0b disables MSI/MSI-X interrupt Messages as well. Requests other than Memory or I/O Requests are not controlled by this bit. Default value of this bit is 0b. 
This bit is hardwired to 0b if a Function does not generate Memory or I/O Requests. </quote> To accommodate Exadata's requirement here, we install a panic notifier that, when invoked, revokes BME for the function. Orabug: 31556128 UEK5 => UEK6 (cherry picked from commit 6baf337) cherry-pick-repo=UEK/production/linux-uek.git Conflicts: drivers/infiniband/hw/mlx5/mlx5_ib.h drivers/infiniband/hw/mlx5/main.c * The conflict arose because in UEK5, the device struct * contained an array of mlx5_roce structs, whereas in UEK6, the * mlx5_roce struct is subordinate to the device's port * array. Signed-off-by: Håkon Bugge <[email protected]> Reviewed-by: Sharath Srinivasan <[email protected]> Signed-off-by: Aron Silverton <[email protected]>
1 parent 43d4ec7 commit 363dada

File tree

2 files changed

+95
-1
lines changed

2 files changed

+95
-1
lines changed

drivers/infiniband/hw/mlx5/main.c

Lines changed: 92 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5281,6 +5281,75 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
52815281
}
52825282
}
52835283

5284+
#ifndef WITHOUT_ORACLE_EXTENSIONS
5285+
static int mlx5_ib_panic_handler(struct notifier_block *nb,
5286+
unsigned long action, void *unused)
5287+
{
5288+
struct mlx5_ib_dev *dev;
5289+
5290+
list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) {
5291+
struct pci_dev *pdev = dev->mdev->pdev;
5292+
5293+
pci_crit(pdev, "Revoke Bus_Mastership_Enable (BME)\n");
5294+
pci_clear_master(pdev);
5295+
}
5296+
5297+
return NOTIFY_DONE;
5298+
}
5299+
5300+
static bool panic_handler_added;
5301+
static int mlx5_ib_add_panic_notifier(struct mlx5_ib_dev *dev, u8 port_num)
5302+
{
5303+
int err;
5304+
5305+
/* We only need to add the handler once, as it iterates
5306+
* through all the devices
5307+
*/
5308+
if (panic_handler_added)
5309+
return 0;
5310+
5311+
dev->port[port_num].roce.panic_nb.notifier_call = mlx5_ib_panic_handler;
5312+
err = atomic_notifier_chain_register(&panic_notifier_list,
5313+
&dev->port[port_num].roce.panic_nb);
5314+
if (err) {
5315+
pci_crit(dev->mdev->pdev,
5316+
"Failed registering panic handler for port %d, err %d\n",
5317+
port_num + 1, err);
5318+
dev->port[port_num].roce.panic_nb.notifier_call = NULL;
5319+
return err;
5320+
}
5321+
5322+
pci_crit(dev->mdev->pdev,
5323+
"Successfully registered panic handler for port %d\n", port_num + 1);
5324+
5325+
panic_handler_added = true;
5326+
5327+
return 0;
5328+
}
5329+
5330+
static void mlx5_ib_remove_panic_notifier(struct mlx5_ib_dev *dev, u8 port_num)
5331+
{
5332+
int err = 0;
5333+
5334+
if (dev->port[port_num].roce.panic_nb.notifier_call) {
5335+
err = atomic_notifier_chain_unregister(&panic_notifier_list,
5336+
&dev->port[port_num].roce.panic_nb);
5337+
if (err) {
5338+
pci_crit(dev->mdev->pdev,
5339+
"Unregistering the panic handler for port %d failed, err %d\n",
5340+
port_num + 1, err);
5341+
} else {
5342+
pci_crit(dev->mdev->pdev,
5343+
"Successfully unregistered panic handler for port %d",
5344+
port_num + 1);
5345+
panic_handler_added = false;
5346+
}
5347+
5348+
dev->port[port_num].roce.panic_nb.notifier_call = NULL;
5349+
}
5350+
}
5351+
#endif /* !WITHOUT_ORACLE_EXTENSIONS */
5352+
52845353
static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
52855354
{
52865355
int err;
@@ -6569,6 +6638,9 @@ static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = {
65696638
static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev)
65706639
{
65716640
u8 port_num;
6641+
#ifndef WITHOUT_ORACLE_EXTENSIONS
6642+
int err;
6643+
#endif /* !WITHOUT_ORACLE_EXTENSIONS */
65726644

65736645
dev->ib_dev.uverbs_ex_cmd_mask |=
65746646
(1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
@@ -6581,13 +6653,32 @@ static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev)
65816653
port_num = mlx5_core_native_port_num(dev->mdev) - 1;
65826654

65836655
/* Register only for native ports */
6584-
return mlx5_add_netdev_notifier(dev, port_num);
6656+
#ifdef WITHOUT_ORACLE_EXTENSIONS
6657+
return mlx5_add_netdev_notifier(dev, port_num)
6658+
#else
6659+
err = mlx5_add_netdev_notifier(dev, port_num);
6660+
if (err)
6661+
return err;
6662+
6663+
err = mlx5_ib_add_panic_notifier(dev, port_num);
6664+
if (err)
6665+
goto err_remove_netdev_notifier;
6666+
6667+
return 0;
6668+
6669+
err_remove_netdev_notifier:
6670+
mlx5_remove_netdev_notifier(dev, port_num);
6671+
return err;
6672+
#endif /* WITHOUT_ORACLE_EXTENSIONS */
65856673
}
65866674

65876675
static void mlx5_ib_stage_common_roce_cleanup(struct mlx5_ib_dev *dev)
65886676
{
65896677
u8 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
65906678

6679+
#ifndef WITHOUT_ORACLE_EXTENSIONS
6680+
mlx5_ib_remove_panic_notifier(dev, port_num);
6681+
#endif /* !WITHOUT_ORACLE_EXTENSIONS */
65916682
mlx5_remove_netdev_notifier(dev, port_num);
65926683
}
65936684

drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,9 @@ struct mlx5_roce {
744744
rwlock_t netdev_lock;
745745
struct net_device *netdev;
746746
struct notifier_block nb;
747+
#ifndef WITHOUT_ORACLE_EXTENSIONS
748+
struct notifier_block panic_nb;
749+
#endif /* !WITHOUT_ORACLE_EXTENSIONS */
747750
atomic_t tx_port_affinity;
748751
enum ib_port_state last_port_state;
749752
struct mlx5_ib_dev *dev;

0 commit comments

Comments
 (0)