Skip to content

Commit b0bc615

Browse files
msanalla authored and Saeed Mahameed committed
net/mlx5: Add vnic devlink health reporter to PFs/VFs
Create a vnic devlink health reporter for PFs/VFs interfaces. The reporter's diagnose callback displays the values of vNIC/vport transport debug counters of PFs/VFs, as follows: $ devlink health diagnose pci/0000:08:00.0 reporter vnic vNIC env counters: total_error_queues: 0 send_queue_priority_update_flow: 0 comp_eq_overrun: 0 async_eq_overrun: 0 cq_overrun: 0 invalid_command: 0 quota_exceeded_command: 0 nic_receive_steering_discard: 0 Moreover, add documentation on the reporter functionality and the counters description. While at it, expose the vNIC counters diagnose function to be used by the downstream patch, which will reveal the counters for representor interfaces. Signed-off-by: Maher Sanalla <msanalla@nvidia.com> Reviewed-by: Moshe Shemesh <moshe@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
1 parent 0a43141 commit b0bc615

6 files changed

Lines changed: 177 additions & 1 deletion

File tree

Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,3 +257,33 @@ User commands examples:
257257
$ devlink health dump show pci/0000:82:00.1 reporter fw_fatal
258258

259259
NOTE: This command can run only on PF.
260+
261+
vnic reporter
262+
-------------
263+
The vnic reporter implements only the `diagnose` callback.
264+
It is responsible for querying the vnic diagnostic counters from fw and displaying
265+
them in realtime.
266+
267+
Description of the vnic counters:
268+
total_error_queues: number of queues in an error state due to
269+
an async error or errored command.
270+
send_queue_priority_update_flow: number of QP/SQ priority/SL update
271+
events.
272+
cq_overrun: number of times CQ entered an error state due to an
273+
overflow.
274+
async_eq_overrun: number of times an EQ mapped to async events was
275+
overrun.
276+
comp_eq_overrun: number of times an EQ mapped to completion events was
277+
overrun.
278+
quota_exceeded_command: number of commands issued and failed due to quota
279+
exceeded.
280+
invalid_command: number of commands issued and failed due to any reason
281+
other than quota exceeded.
282+
nic_receive_steering_discard: number of packets that completed RX flow
283+
steering but were discarded due to a mismatch in flow table.
284+
285+
User commands examples:
286+
- Diagnose PF/VF vnic counters
287+
$ devlink health diagnose pci/0000:82:00.1 reporter vnic
288+
289+
NOTE: This command can run only on PF/VF ports.

drivers/net/ethernet/mellanox/mlx5/core/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
1616
transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \
1717
fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o events.o wq.o lib/gid.o \
1818
lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \
19-
diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o \
19+
diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o diag/reporter_vnic.o \
2020
fw_reset.o qos.o lib/tout.o lib/aso.o
2121

2222
#
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2+
/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. */
3+
4+
#include "reporter_vnic.h"
5+
#include "devlink.h"
6+
7+
#define VNIC_ENV_GET64(vnic_env_stats, c) \
8+
MLX5_GET64(query_vnic_env_out, (vnic_env_stats)->query_vnic_env_out, \
9+
vport_env.c)
10+
11+
struct mlx5_vnic_diag_stats {
12+
__be64 query_vnic_env_out[MLX5_ST_SZ_QW(query_vnic_env_out)];
13+
};
14+
15+
int mlx5_reporter_vnic_diagnose_counters(struct mlx5_core_dev *dev,
16+
struct devlink_fmsg *fmsg,
17+
u16 vport_num, bool other_vport)
18+
{
19+
u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {};
20+
struct mlx5_vnic_diag_stats vnic;
21+
int err;
22+
23+
MLX5_SET(query_vnic_env_in, in, opcode, MLX5_CMD_OP_QUERY_VNIC_ENV);
24+
MLX5_SET(query_vnic_env_in, in, vport_number, vport_num);
25+
MLX5_SET(query_vnic_env_in, in, other_vport, !!other_vport);
26+
27+
err = mlx5_cmd_exec_inout(dev, query_vnic_env, in, &vnic.query_vnic_env_out);
28+
if (err)
29+
return err;
30+
31+
err = devlink_fmsg_pair_nest_start(fmsg, "vNIC env counters");
32+
if (err)
33+
return err;
34+
35+
err = devlink_fmsg_obj_nest_start(fmsg);
36+
if (err)
37+
return err;
38+
39+
err = devlink_fmsg_u64_pair_put(fmsg, "total_error_queues",
40+
VNIC_ENV_GET64(&vnic, total_error_queues));
41+
if (err)
42+
return err;
43+
44+
err = devlink_fmsg_u64_pair_put(fmsg, "send_queue_priority_update_flow",
45+
VNIC_ENV_GET64(&vnic, send_queue_priority_update_flow));
46+
if (err)
47+
return err;
48+
49+
err = devlink_fmsg_u64_pair_put(fmsg, "comp_eq_overrun",
50+
VNIC_ENV_GET64(&vnic, comp_eq_overrun));
51+
if (err)
52+
return err;
53+
54+
err = devlink_fmsg_u64_pair_put(fmsg, "async_eq_overrun",
55+
VNIC_ENV_GET64(&vnic, async_eq_overrun));
56+
if (err)
57+
return err;
58+
59+
err = devlink_fmsg_u64_pair_put(fmsg, "cq_overrun",
60+
VNIC_ENV_GET64(&vnic, cq_overrun));
61+
if (err)
62+
return err;
63+
64+
err = devlink_fmsg_u64_pair_put(fmsg, "invalid_command",
65+
VNIC_ENV_GET64(&vnic, invalid_command));
66+
if (err)
67+
return err;
68+
69+
err = devlink_fmsg_u64_pair_put(fmsg, "quota_exceeded_command",
70+
VNIC_ENV_GET64(&vnic, quota_exceeded_command));
71+
if (err)
72+
return err;
73+
74+
err = devlink_fmsg_u64_pair_put(fmsg, "nic_receive_steering_discard",
75+
VNIC_ENV_GET64(&vnic, nic_receive_steering_discard));
76+
if (err)
77+
return err;
78+
79+
err = devlink_fmsg_obj_nest_end(fmsg);
80+
if (err)
81+
return err;
82+
83+
err = devlink_fmsg_pair_nest_end(fmsg);
84+
if (err)
85+
return err;
86+
87+
return 0;
88+
}
89+
90+
static int mlx5_reporter_vnic_diagnose(struct devlink_health_reporter *reporter,
91+
struct devlink_fmsg *fmsg,
92+
struct netlink_ext_ack *extack)
93+
{
94+
struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
95+
96+
return mlx5_reporter_vnic_diagnose_counters(dev, fmsg, 0, false);
97+
}
98+
99+
static const struct devlink_health_reporter_ops mlx5_reporter_vnic_ops = {
100+
.name = "vnic",
101+
.diagnose = mlx5_reporter_vnic_diagnose,
102+
};
103+
104+
void mlx5_reporter_vnic_create(struct mlx5_core_dev *dev)
105+
{
106+
struct mlx5_core_health *health = &dev->priv.health;
107+
struct devlink *devlink = priv_to_devlink(dev);
108+
109+
health->vnic_reporter =
110+
devlink_health_reporter_create(devlink,
111+
&mlx5_reporter_vnic_ops,
112+
0, dev);
113+
if (IS_ERR(health->vnic_reporter))
114+
mlx5_core_warn(dev,
115+
"Failed to create vnic reporter, err = %ld\n",
116+
PTR_ERR(health->vnic_reporter));
117+
}
118+
119+
void mlx5_reporter_vnic_destroy(struct mlx5_core_dev *dev)
120+
{
121+
struct mlx5_core_health *health = &dev->priv.health;
122+
123+
if (!IS_ERR_OR_NULL(health->vnic_reporter))
124+
devlink_health_reporter_destroy(health->vnic_reporter);
125+
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2+
* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.
3+
*/
4+
#ifndef __MLX5_REPORTER_VNIC_H
5+
#define __MLX5_REPORTER_VNIC_H
6+
7+
#include "mlx5_core.h"
8+
9+
void mlx5_reporter_vnic_create(struct mlx5_core_dev *dev);
10+
void mlx5_reporter_vnic_destroy(struct mlx5_core_dev *dev);
11+
12+
int mlx5_reporter_vnic_diagnose_counters(struct mlx5_core_dev *dev,
13+
struct devlink_fmsg *fmsg,
14+
u16 vport_num, bool other_vport);
15+
16+
#endif /* __MLX5_REPORTER_VNIC_H */

drivers/net/ethernet/mellanox/mlx5/core/health.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#include "lib/pci_vsc.h"
4343
#include "lib/tout.h"
4444
#include "diag/fw_tracer.h"
45+
#include "diag/reporter_vnic.h"
4546

4647
enum {
4748
MAX_MISSES = 3,
@@ -898,6 +899,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev)
898899

899900
cancel_delayed_work_sync(&health->update_fw_log_ts_work);
900901
destroy_workqueue(health->wq);
902+
mlx5_reporter_vnic_destroy(dev);
901903
mlx5_fw_reporters_destroy(dev);
902904
}
903905

@@ -907,6 +909,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
907909
char *name;
908910

909911
mlx5_fw_reporters_create(dev);
912+
mlx5_reporter_vnic_create(dev);
910913

911914
health = &dev->priv.health;
912915
name = kmalloc(64, GFP_KERNEL);
@@ -926,6 +929,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
926929
return 0;
927930

928931
out_err:
932+
mlx5_reporter_vnic_destroy(dev);
929933
mlx5_fw_reporters_destroy(dev);
930934
return -ENOMEM;
931935
}

include/linux/mlx5/driver.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,7 @@ struct mlx5_core_health {
439439
struct work_struct report_work;
440440
struct devlink_health_reporter *fw_reporter;
441441
struct devlink_health_reporter *fw_fatal_reporter;
442+
struct devlink_health_reporter *vnic_reporter;
442443
struct delayed_work update_fw_log_ts_work;
443444
};
444445

0 commit comments

Comments
 (0)