11/* SPDX-License-Identifier: GPL-2.0 */
22/* Copyright (c) 2019 Mellanox Technologies. */
33
4+ #include <net/netdev_lock.h>
5+
46#include "health.h"
57#include "en/ptp.h"
68#include "en/devlink.h"
@@ -79,6 +81,18 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
7981 if (!test_bit (MLX5E_SQ_STATE_RECOVERING , & sq -> state ))
8082 return 0 ;
8183
84+ /* Recovering queues means re-enabling NAPI, which requires the netdev
85+ * instance lock. However, SQ closing flows have to wait for work tasks
86+ * to finish while also holding the netdev instance lock. So either get
87+ * the lock or find that the SQ is no longer enabled and thus this work
88+ * is not relevant anymore.
89+ */
90+ while (!netdev_trylock (dev )) {
91+ if (!test_bit (MLX5E_SQ_STATE_ENABLED , & sq -> state ))
92+ return 0 ;
93+ msleep (20 );
94+ }
95+
8296 err = mlx5_core_query_sq_state (mdev , sq -> sqn , & state );
8397 if (err ) {
8498 netdev_err (dev , "Failed to query SQ 0x%x state. err = %d\n" ,
@@ -114,9 +128,11 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
114128 else
115129 mlx5e_trigger_napi_sched (sq -> cq .napi );
116130
131+ netdev_unlock (dev );
117132 return 0 ;
118133out :
119134 clear_bit (MLX5E_SQ_STATE_RECOVERING , & sq -> state );
135+ netdev_unlock (dev );
120136 return err ;
121137}
122138
@@ -137,26 +153,41 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx)
137153 sq = to_ctx -> sq ;
138154 eq = sq -> cq .mcq .eq ;
139155 priv = sq -> priv ;
156+
157+ /* Recovering the TX queues implies re-enabling NAPI, which requires
158+ * the netdev instance lock.
159+ * However, channel closing flows have to wait for this work to finish
160+ * while holding the same lock. So either get the lock or find that
161+ * channels are being closed for another reason and this work is not
162+ * relevant anymore.
163+ */
164+ while (!netdev_trylock (sq -> netdev )) {
165+ if (!test_bit (MLX5E_STATE_CHANNELS_ACTIVE , & priv -> state ))
166+ return 0 ;
167+ msleep (20 );
168+ }
169+
140170 err = mlx5e_health_channel_eq_recover (sq -> netdev , eq , sq -> cq .ch_stats );
141171 if (!err ) {
142172 to_ctx -> status = 0 ; /* this sq recovered */
143- return err ;
173+ goto out ;
144174 }
145175
146176 mutex_lock (& priv -> state_lock );
147177 err = mlx5e_safe_reopen_channels (priv );
148178 mutex_unlock (& priv -> state_lock );
149179 if (!err ) {
150180 to_ctx -> status = 1 ; /* all channels recovered */
151- return err ;
181+ goto out ;
152182 }
153183
154184 to_ctx -> status = err ;
155185 clear_bit (MLX5E_SQ_STATE_ENABLED , & sq -> state );
156186 netdev_err (priv -> netdev ,
157187 "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n" ,
158188 err );
159-
189+ out :
190+ netdev_unlock (sq -> netdev );
160191 return err ;
161192}
162193
@@ -173,10 +204,22 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)
173204 return 0 ;
174205
175206 priv = ptpsq -> txqsq .priv ;
207+ netdev = priv -> netdev ;
208+
209+ /* Recovering the PTP SQ means re-enabling NAPI, which requires the
210+ * netdev instance lock. However, SQ closing has to wait for this work
211+ * task to finish while also holding the same lock. So either get the
212+ * lock or find that the SQ is no longer enabled and thus this work is
213+ * not relevant anymore.
214+ */
215+ while (!netdev_trylock (netdev )) {
216+ if (!test_bit (MLX5E_SQ_STATE_ENABLED , & ptpsq -> txqsq .state ))
217+ return 0 ;
218+ msleep (20 );
219+ }
176220
177221 mutex_lock (& priv -> state_lock );
178222 chs = & priv -> channels ;
179- netdev = priv -> netdev ;
180223
181224 carrier_ok = netif_carrier_ok (netdev );
182225 netif_carrier_off (netdev );
@@ -193,6 +236,7 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)
193236 netif_carrier_on (netdev );
194237
195238 mutex_unlock (& priv -> state_lock );
239+ netdev_unlock (netdev );
196240
197241 return err ;
198242}
0 commit comments