Skip to content

Commit 7997bca

Browse files
committed
Merge branch 'mlx5-misc-fixes-2026-02-18'
Tariq Toukan says:

====================
mlx5 misc fixes 2026-02-18

This patchset provides misc bug fixes from the team to the mlx5 core and Eth drivers.
====================

Link: https://patch.msgid.link/20260218072904.1764634-1-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2 parents e6834a4 + 57a94d4 commit 7997bca

9 files changed

Lines changed: 78 additions & 85 deletions

File tree

drivers/net/ethernet/mellanox/mlx5/core/en.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -180,7 +180,8 @@ static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size)
180180
}
181181

182182
/* Use this function to get max num channels (rxqs/txqs) only to create netdev */
183-
static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
183+
static inline unsigned int
184+
mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
184185
{
185186
return is_kdump_kernel() ?
186187
MLX5E_MIN_NUM_CHANNELS :

drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c

Lines changed: 0 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -457,22 +457,8 @@ static void mlx5e_ptpsq_unhealthy_work(struct work_struct *work)
457457
{
458458
struct mlx5e_ptpsq *ptpsq =
459459
container_of(work, struct mlx5e_ptpsq, report_unhealthy_work);
460-
struct mlx5e_txqsq *sq = &ptpsq->txqsq;
461-
462-
/* Recovering the PTP SQ means re-enabling NAPI, which requires the
463-
* netdev instance lock. However, SQ closing has to wait for this work
464-
* task to finish while also holding the same lock. So either get the
465-
* lock or find that the SQ is no longer enabled and thus this work is
466-
* not relevant anymore.
467-
*/
468-
while (!netdev_trylock(sq->netdev)) {
469-
if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
470-
return;
471-
msleep(20);
472-
}
473460

474461
mlx5e_reporter_tx_ptpsq_unhealthy(ptpsq);
475-
netdev_unlock(sq->netdev);
476462
}
477463

478464
static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn,

drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
// SPDX-License-Identifier: GPL-2.0
22
// Copyright (c) 2019 Mellanox Technologies.
33

4+
#include <net/netdev_lock.h>
5+
46
#include "health.h"
57
#include "params.h"
68
#include "txrx.h"
@@ -177,6 +179,16 @@ static int mlx5e_rx_reporter_timeout_recover(void *ctx)
177179
rq = ctx;
178180
priv = rq->priv;
179181

182+
/* Acquire netdev instance lock to synchronize with channel close and
183+
* reopen flows. Either successfully obtain the lock, or detect that
184+
* channels are closing for another reason, making this work no longer
185+
* necessary.
186+
*/
187+
while (!netdev_trylock(rq->netdev)) {
188+
if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state))
189+
return 0;
190+
msleep(20);
191+
}
180192
mutex_lock(&priv->state_lock);
181193

182194
eq = rq->cq.mcq.eq;
@@ -186,6 +198,7 @@ static int mlx5e_rx_reporter_timeout_recover(void *ctx)
186198
clear_bit(MLX5E_SQ_STATE_ENABLED, &rq->icosq->state);
187199

188200
mutex_unlock(&priv->state_lock);
201+
netdev_unlock(rq->netdev);
189202

190203
return err;
191204
}

drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c

Lines changed: 48 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
/* SPDX-License-Identifier: GPL-2.0 */
22
/* Copyright (c) 2019 Mellanox Technologies. */
33

4+
#include <net/netdev_lock.h>
5+
46
#include "health.h"
57
#include "en/ptp.h"
68
#include "en/devlink.h"
@@ -79,6 +81,18 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
7981
if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
8082
return 0;
8183

84+
/* Recovering queues means re-enabling NAPI, which requires the netdev
85+
* instance lock. However, SQ closing flows have to wait for work tasks
86+
* to finish while also holding the netdev instance lock. So either get
87+
* the lock or find that the SQ is no longer enabled and thus this work
88+
* is not relevant anymore.
89+
*/
90+
while (!netdev_trylock(dev)) {
91+
if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
92+
return 0;
93+
msleep(20);
94+
}
95+
8296
err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
8397
if (err) {
8498
netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
@@ -114,9 +128,11 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
114128
else
115129
mlx5e_trigger_napi_sched(sq->cq.napi);
116130

131+
netdev_unlock(dev);
117132
return 0;
118133
out:
119134
clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
135+
netdev_unlock(dev);
120136
return err;
121137
}
122138

@@ -137,26 +153,41 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx)
137153
sq = to_ctx->sq;
138154
eq = sq->cq.mcq.eq;
139155
priv = sq->priv;
156+
157+
/* Recovering the TX queues implies re-enabling NAPI, which requires
158+
* the netdev instance lock.
159+
* However, channel closing flows have to wait for this work to finish
160+
* while holding the same lock. So either get the lock or find that
161+
* channels are being closed for other reason and this work is not
162+
* relevant anymore.
163+
*/
164+
while (!netdev_trylock(sq->netdev)) {
165+
if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state))
166+
return 0;
167+
msleep(20);
168+
}
169+
140170
err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats);
141171
if (!err) {
142172
to_ctx->status = 0; /* this sq recovered */
143-
return err;
173+
goto out;
144174
}
145175

146176
mutex_lock(&priv->state_lock);
147177
err = mlx5e_safe_reopen_channels(priv);
148178
mutex_unlock(&priv->state_lock);
149179
if (!err) {
150180
to_ctx->status = 1; /* all channels recovered */
151-
return err;
181+
goto out;
152182
}
153183

154184
to_ctx->status = err;
155185
clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
156186
netdev_err(priv->netdev,
157187
"mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
158188
err);
159-
189+
out:
190+
netdev_unlock(sq->netdev);
160191
return err;
161192
}
162193

@@ -173,10 +204,22 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)
173204
return 0;
174205

175206
priv = ptpsq->txqsq.priv;
207+
netdev = priv->netdev;
208+
209+
/* Recovering the PTP SQ means re-enabling NAPI, which requires the
210+
* netdev instance lock. However, SQ closing has to wait for this work
211+
* task to finish while also holding the same lock. So either get the
212+
* lock or find that the SQ is no longer enabled and thus this work is
213+
* not relevant anymore.
214+
*/
215+
while (!netdev_trylock(netdev)) {
216+
if (!test_bit(MLX5E_SQ_STATE_ENABLED, &ptpsq->txqsq.state))
217+
return 0;
218+
msleep(20);
219+
}
176220

177221
mutex_lock(&priv->state_lock);
178222
chs = &priv->channels;
179-
netdev = priv->netdev;
180223

181224
carrier_ok = netif_carrier_ok(netdev);
182225
netif_carrier_off(netdev);
@@ -193,6 +236,7 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)
193236
netif_carrier_on(netdev);
194237

195238
mutex_unlock(&priv->state_lock);
239+
netdev_unlock(netdev);
196240

197241
return err;
198242
}

drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
22
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33

4+
#include <linux/iopoll.h>
45
#include <linux/math64.h>
56
#include "lib/aso.h"
67
#include "en/tc/post_act.h"
@@ -115,7 +116,6 @@ mlx5e_tc_meter_modify(struct mlx5_core_dev *mdev,
115116
struct mlx5e_flow_meters *flow_meters;
116117
u8 cir_man, cir_exp, cbs_man, cbs_exp;
117118
struct mlx5_aso_wqe *aso_wqe;
118-
unsigned long expires;
119119
struct mlx5_aso *aso;
120120
u64 rate, burst;
121121
u8 ds_cnt;
@@ -187,12 +187,8 @@ mlx5e_tc_meter_modify(struct mlx5_core_dev *mdev,
187187
mlx5_aso_post_wqe(aso, true, &aso_wqe->ctrl);
188188

189189
/* With newer FW, the wait for the first ASO WQE is more than 2us, put the wait 10ms. */
190-
expires = jiffies + msecs_to_jiffies(10);
191-
do {
192-
err = mlx5_aso_poll_cq(aso, true);
193-
if (err)
194-
usleep_range(2, 10);
195-
} while (err && time_is_after_jiffies(expires));
190+
read_poll_timeout(mlx5_aso_poll_cq, err, !err, 10, 10 * USEC_PER_MSEC,
191+
false, aso, true);
196192
mutex_unlock(&flow_meters->aso_lock);
197193

198194
return err;

drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c

Lines changed: 5 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
#include <linux/mlx5/mlx5_ifc.h>
66
#include <linux/xarray.h>
77
#include <linux/if_vlan.h>
8+
#include <linux/iopoll.h>
89

910
#include "en.h"
1011
#include "lib/aso.h"
@@ -1385,7 +1386,8 @@ static int macsec_aso_set_arm_event(struct mlx5_core_dev *mdev, struct mlx5e_mac
13851386
MLX5_ACCESS_ASO_OPC_MOD_MACSEC);
13861387
macsec_aso_build_ctrl(aso, &aso_wqe->aso_ctrl, in);
13871388
mlx5_aso_post_wqe(maso, false, &aso_wqe->ctrl);
1388-
err = mlx5_aso_poll_cq(maso, false);
1389+
read_poll_timeout(mlx5_aso_poll_cq, err, !err, 10, 10 * USEC_PER_MSEC,
1390+
false, maso, false);
13891391
mutex_unlock(&aso->aso_lock);
13901392

13911393
return err;
@@ -1397,7 +1399,6 @@ static int macsec_aso_query(struct mlx5_core_dev *mdev, struct mlx5e_macsec *mac
13971399
struct mlx5e_macsec_aso *aso;
13981400
struct mlx5_aso_wqe *aso_wqe;
13991401
struct mlx5_aso *maso;
1400-
unsigned long expires;
14011402
int err;
14021403

14031404
aso = &macsec->aso;
@@ -1411,12 +1412,8 @@ static int macsec_aso_query(struct mlx5_core_dev *mdev, struct mlx5e_macsec *mac
14111412
macsec_aso_build_wqe_ctrl_seg(aso, &aso_wqe->aso_ctrl, NULL);
14121413

14131414
mlx5_aso_post_wqe(maso, false, &aso_wqe->ctrl);
1414-
expires = jiffies + msecs_to_jiffies(10);
1415-
do {
1416-
err = mlx5_aso_poll_cq(maso, false);
1417-
if (err)
1418-
usleep_range(2, 10);
1419-
} while (err && time_is_after_jiffies(expires));
1415+
read_poll_timeout(mlx5_aso_poll_cq, err, !err, 10, 10 * USEC_PER_MSEC,
1416+
false, maso, false);
14201417

14211418
if (err)
14221419
goto err_out;

drivers/net/ethernet/mellanox/mlx5/core/en_main.c

Lines changed: 0 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -631,19 +631,7 @@ static void mlx5e_rq_timeout_work(struct work_struct *timeout_work)
631631
struct mlx5e_rq,
632632
rx_timeout_work);
633633

634-
/* Acquire netdev instance lock to synchronize with channel close and
635-
* reopen flows. Either successfully obtain the lock, or detect that
636-
* channels are closing for another reason, making this work no longer
637-
* necessary.
638-
*/
639-
while (!netdev_trylock(rq->netdev)) {
640-
if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state))
641-
return;
642-
msleep(20);
643-
}
644-
645634
mlx5e_reporter_rx_timeout(rq);
646-
netdev_unlock(rq->netdev);
647635
}
648636

649637
static int mlx5e_alloc_mpwqe_rq_drop_page(struct mlx5e_rq *rq)
@@ -1952,20 +1940,7 @@ void mlx5e_tx_err_cqe_work(struct work_struct *recover_work)
19521940
struct mlx5e_txqsq *sq = container_of(recover_work, struct mlx5e_txqsq,
19531941
recover_work);
19541942

1955-
/* Recovering queues means re-enabling NAPI, which requires the netdev
1956-
* instance lock. However, SQ closing flows have to wait for work tasks
1957-
* to finish while also holding the netdev instance lock. So either get
1958-
* the lock or find that the SQ is no longer enabled and thus this work
1959-
* is not relevant anymore.
1960-
*/
1961-
while (!netdev_trylock(sq->netdev)) {
1962-
if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
1963-
return;
1964-
msleep(20);
1965-
}
1966-
19671943
mlx5e_reporter_tx_err_cqe(sq);
1968-
netdev_unlock(sq->netdev);
19691944
}
19701945

19711946
static struct dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
@@ -5115,19 +5090,6 @@ static void mlx5e_tx_timeout_work(struct work_struct *work)
51155090
struct net_device *netdev = priv->netdev;
51165091
int i;
51175092

5118-
/* Recovering the TX queues implies re-enabling NAPI, which requires
5119-
* the netdev instance lock.
5120-
* However, channel closing flows have to wait for this work to finish
5121-
* while holding the same lock. So either get the lock or find that
5122-
* channels are being closed for other reason and this work is not
5123-
* relevant anymore.
5124-
*/
5125-
while (!netdev_trylock(netdev)) {
5126-
if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state))
5127-
return;
5128-
msleep(20);
5129-
}
5130-
51315093
for (i = 0; i < netdev->real_num_tx_queues; i++) {
51325094
struct netdev_queue *dev_queue =
51335095
netdev_get_tx_queue(netdev, i);
@@ -5140,8 +5102,6 @@ static void mlx5e_tx_timeout_work(struct work_struct *work)
51405102
/* break if tried to reopened channels */
51415103
break;
51425104
}
5143-
5144-
netdev_unlock(netdev);
51455105
}
51465106

51475107
static void mlx5e_tx_timeout(struct net_device *dev, unsigned int txqueue)

drivers/net/ethernet/mellanox/mlx5/core/wc.c

Lines changed: 5 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33

44
#include <linux/io.h>
5+
#include <linux/iopoll.h>
56
#include <linux/mlx5/transobj.h>
67
#include "lib/clock.h"
78
#include "mlx5_core.h"
@@ -15,7 +16,7 @@
1516
#define TEST_WC_NUM_WQES 255
1617
#define TEST_WC_LOG_CQ_SZ (order_base_2(TEST_WC_NUM_WQES))
1718
#define TEST_WC_SQ_LOG_WQ_SZ TEST_WC_LOG_CQ_SZ
18-
#define TEST_WC_POLLING_MAX_TIME_JIFFIES msecs_to_jiffies(100)
19+
#define TEST_WC_POLLING_MAX_TIME_USEC (100 * USEC_PER_MSEC)
1920

2021
struct mlx5_wc_cq {
2122
/* data path - accessed per cqe */
@@ -359,7 +360,6 @@ static int mlx5_wc_poll_cq(struct mlx5_wc_sq *sq)
359360
static void mlx5_core_test_wc(struct mlx5_core_dev *mdev)
360361
{
361362
unsigned int offset = 0;
362-
unsigned long expires;
363363
struct mlx5_wc_sq *sq;
364364
int i, err;
365365

@@ -389,13 +389,9 @@ static void mlx5_core_test_wc(struct mlx5_core_dev *mdev)
389389

390390
mlx5_wc_post_nop(sq, &offset, true);
391391

392-
expires = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES;
393-
do {
394-
err = mlx5_wc_poll_cq(sq);
395-
if (err)
396-
usleep_range(2, 10);
397-
} while (mdev->wc_state == MLX5_WC_STATE_UNINITIALIZED &&
398-
time_is_after_jiffies(expires));
392+
poll_timeout_us(mlx5_wc_poll_cq(sq),
393+
mdev->wc_state != MLX5_WC_STATE_UNINITIALIZED, 10,
394+
TEST_WC_POLLING_MAX_TIME_USEC, false);
399395

400396
mlx5_wc_destroy_sq(sq);
401397

include/linux/mlx5/driver.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1282,12 +1282,12 @@ static inline bool mlx5_rl_is_supported(struct mlx5_core_dev *dev)
12821282
static inline int mlx5_core_is_mp_slave(struct mlx5_core_dev *dev)
12831283
{
12841284
return MLX5_CAP_GEN(dev, affiliate_nic_vport_criteria) &&
1285-
MLX5_CAP_GEN(dev, num_vhca_ports) <= 1;
1285+
MLX5_CAP_GEN_MAX(dev, num_vhca_ports) <= 1;
12861286
}
12871287

12881288
static inline int mlx5_core_is_mp_master(struct mlx5_core_dev *dev)
12891289
{
1290-
return MLX5_CAP_GEN(dev, num_vhca_ports) > 1;
1290+
return MLX5_CAP_GEN_MAX(dev, num_vhca_ports) > 1;
12911291
}
12921292

12931293
static inline int mlx5_core_mp_enabled(struct mlx5_core_dev *dev)

0 commit comments

Comments (0)