Skip to content

Commit c6e77aa

Browse files
shayshyi authored and kuba-moo committed
net/mlx5: Register devlink first under devlink lock
If the device has a non-fatal FW error during probe, the driver will report the error to the user via devlink. This will trigger a WARN_ON, since mlx5 calls devlink_register() last. In order to avoid the WARN_ON [1], change mlx5 to invoke devl_register() first, under the devlink lock.

[1]
WARNING: CPU: 5 PID: 227 at net/devlink/health.c:483 devlink_recover_notify.constprop.0+0xb8/0xc0
CPU: 5 PID: 227 Comm: kworker/u16:3 Not tainted 6.4.0-rc5_for_upstream_min_debug_2023_06_12_12_38 #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Workqueue: mlx5_health0000:08:00.0 mlx5_fw_reporter_err_work [mlx5_core]
RIP: 0010:devlink_recover_notify.constprop.0+0xb8/0xc0
Call Trace:
 <TASK>
 ? __warn+0x79/0x120
 ? devlink_recover_notify.constprop.0+0xb8/0xc0
 ? report_bug+0x17c/0x190
 ? handle_bug+0x3c/0x60
 ? exc_invalid_op+0x14/0x70
 ? asm_exc_invalid_op+0x16/0x20
 ? devlink_recover_notify.constprop.0+0xb8/0xc0
 devlink_health_report+0x4a/0x1c0
 mlx5_fw_reporter_err_work+0xa4/0xd0 [mlx5_core]
 process_one_work+0x1bb/0x3c0
 ? process_one_work+0x3c0/0x3c0
 worker_thread+0x4d/0x3c0
 ? process_one_work+0x3c0/0x3c0
 kthread+0xc6/0xf0
 ? kthread_complete_and_exit+0x20/0x20
 ret_from_fork+0x1f/0x30
 </TASK>

Fixes: cf53021 ("devlink: Notify users when objects are accessible")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20240409190820.227554-3-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 0553e75 commit c6e77aa

2 files changed

Lines changed: 20 additions & 18 deletions

File tree

drivers/net/ethernet/mellanox/mlx5/core/main.c

Lines changed: 20 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1480,6 +1480,14 @@ int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev)
14801480
if (err)
14811481
goto err_register;
14821482

1483+
err = mlx5_crdump_enable(dev);
1484+
if (err)
1485+
mlx5_core_err(dev, "mlx5_crdump_enable failed with error code %d\n", err);
1486+
1487+
err = mlx5_hwmon_dev_register(dev);
1488+
if (err)
1489+
mlx5_core_err(dev, "mlx5_hwmon_dev_register failed with error code %d\n", err);
1490+
14831491
mutex_unlock(&dev->intf_state_mutex);
14841492
return 0;
14851493

@@ -1505,7 +1513,10 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
15051513
int err;
15061514

15071515
devl_lock(devlink);
1516+
devl_register(devlink);
15081517
err = mlx5_init_one_devl_locked(dev);
1518+
if (err)
1519+
devl_unregister(devlink);
15091520
devl_unlock(devlink);
15101521
return err;
15111522
}
@@ -1517,6 +1528,8 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev)
15171528
devl_lock(devlink);
15181529
mutex_lock(&dev->intf_state_mutex);
15191530

1531+
mlx5_hwmon_dev_unregister(dev);
1532+
mlx5_crdump_disable(dev);
15201533
mlx5_unregister_device(dev);
15211534

15221535
if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
@@ -1534,6 +1547,7 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev)
15341547
mlx5_function_teardown(dev, true);
15351548
out:
15361549
mutex_unlock(&dev->intf_state_mutex);
1550+
devl_unregister(devlink);
15371551
devl_unlock(devlink);
15381552
}
15391553

@@ -1680,16 +1694,20 @@ int mlx5_init_one_light(struct mlx5_core_dev *dev)
16801694
}
16811695

16821696
devl_lock(devlink);
1697+
devl_register(devlink);
1698+
16831699
err = mlx5_devlink_params_register(priv_to_devlink(dev));
1684-
devl_unlock(devlink);
16851700
if (err) {
16861701
mlx5_core_warn(dev, "mlx5_devlink_param_reg err = %d\n", err);
16871702
goto query_hca_caps_err;
16881703
}
16891704

1705+
devl_unlock(devlink);
16901706
return 0;
16911707

16921708
query_hca_caps_err:
1709+
devl_unregister(devlink);
1710+
devl_unlock(devlink);
16931711
mlx5_function_disable(dev, true);
16941712
out:
16951713
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
@@ -1702,6 +1720,7 @@ void mlx5_uninit_one_light(struct mlx5_core_dev *dev)
17021720

17031721
devl_lock(devlink);
17041722
mlx5_devlink_params_unregister(priv_to_devlink(dev));
1723+
devl_unregister(devlink);
17051724
devl_unlock(devlink);
17061725
if (dev->state != MLX5_DEVICE_STATE_UP)
17071726
return;
@@ -1943,16 +1962,7 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
19431962
goto err_init_one;
19441963
}
19451964

1946-
err = mlx5_crdump_enable(dev);
1947-
if (err)
1948-
dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err);
1949-
1950-
err = mlx5_hwmon_dev_register(dev);
1951-
if (err)
1952-
mlx5_core_err(dev, "mlx5_hwmon_dev_register failed with error code %d\n", err);
1953-
19541965
pci_save_state(pdev);
1955-
devlink_register(devlink);
19561966
return 0;
19571967

19581968
err_init_one:
@@ -1973,16 +1983,9 @@ static void remove_one(struct pci_dev *pdev)
19731983
struct devlink *devlink = priv_to_devlink(dev);
19741984

19751985
set_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state);
1976-
/* mlx5_drain_fw_reset() and mlx5_drain_health_wq() are using
1977-
* devlink notify APIs.
1978-
* Hence, we must drain them before unregistering the devlink.
1979-
*/
19801986
mlx5_drain_fw_reset(dev);
19811987
mlx5_drain_health_wq(dev);
1982-
devlink_unregister(devlink);
19831988
mlx5_sriov_disable(pdev, false);
1984-
mlx5_hwmon_dev_unregister(dev);
1985-
mlx5_crdump_disable(dev);
19861989
mlx5_uninit_one(dev);
19871990
mlx5_pci_close(dev);
19881991
mlx5_mdev_uninit(dev);

drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,6 @@ static void mlx5_sf_dev_remove(struct auxiliary_device *adev)
101101
devlink = priv_to_devlink(mdev);
102102
set_bit(MLX5_BREAK_FW_WAIT, &mdev->intf_state);
103103
mlx5_drain_health_wq(mdev);
104-
devlink_unregister(devlink);
105104
if (mlx5_dev_is_lightweight(mdev))
106105
mlx5_uninit_one_light(mdev);
107106
else

0 commit comments

Comments
 (0)