Skip to content

Commit 571e4ab

Browse files
wenglianfarleon
authored and committed
RDMA/hns: Fix an AEQE overflow error caused by untimely update of eq_db_ci
eq_db_ci is updated only after all AEQEs are processed in the AEQ interrupt handler, which is not timely enough and may result in AEQ overflow. Two optimization methods are proposed: 1. Set an upper limit for AEQE processing. 2. Move time-consuming operations such as printings to the bottom half of the interrupt. cmd events and flush_cqe events are still fully processed in the top half to ensure timely handling. Fixes: a5073d6 ("RDMA/hns: Add eq support of hip08") Signed-off-by: wenglianfa <wenglianfa@huawei.com> Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com> Link: https://patch.msgid.link/20241024124000.2931869-2-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky <leon@kernel.org>
1 parent 427b1f3 commit 571e4ab

4 files changed

Lines changed: 91 additions & 44 deletions

File tree

drivers/infiniband/hw/hns/hns_roce_device.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1289,6 +1289,7 @@ void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn);
12891289
void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type);
12901290
void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp);
12911291
void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type);
1292+
void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn);
12921293
void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type);
12931294
void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev);
12941295
int hns_roce_init(struct hns_roce_dev *hr_dev);

drivers/infiniband/hw/hns/hns_roce_hw_v2.c

Lines changed: 50 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5967,11 +5967,10 @@ static int hns_roce_v2_query_mpt(struct hns_roce_dev *hr_dev, u32 key,
59675967
return ret;
59685968
}
59695969

5970-
static void hns_roce_irq_work_handle(struct work_struct *work)
5970+
static void dump_aeqe_log(struct hns_roce_work *irq_work)
59715971
{
5972-
struct hns_roce_work *irq_work =
5973-
container_of(work, struct hns_roce_work, work);
5974-
struct ib_device *ibdev = &irq_work->hr_dev->ib_dev;
5972+
struct hns_roce_dev *hr_dev = irq_work->hr_dev;
5973+
struct ib_device *ibdev = &hr_dev->ib_dev;
59755974

59765975
switch (irq_work->event_type) {
59775976
case HNS_ROCE_EVENT_TYPE_PATH_MIG:
@@ -6015,6 +6014,8 @@ static void hns_roce_irq_work_handle(struct work_struct *work)
60156014
case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW:
60166015
ibdev_warn(ibdev, "DB overflow.\n");
60176016
break;
6017+
case HNS_ROCE_EVENT_TYPE_MB:
6018+
break;
60186019
case HNS_ROCE_EVENT_TYPE_FLR:
60196020
ibdev_warn(ibdev, "function level reset.\n");
60206021
break;
@@ -6025,8 +6026,46 @@ static void hns_roce_irq_work_handle(struct work_struct *work)
60256026
ibdev_err(ibdev, "invalid xrceth error.\n");
60266027
break;
60276028
default:
6029+
ibdev_info(ibdev, "Undefined event %d.\n",
6030+
irq_work->event_type);
60286031
break;
60296032
}
6033+
}
6034+
6035+
static void hns_roce_irq_work_handle(struct work_struct *work)
6036+
{
6037+
struct hns_roce_work *irq_work =
6038+
container_of(work, struct hns_roce_work, work);
6039+
struct hns_roce_dev *hr_dev = irq_work->hr_dev;
6040+
int event_type = irq_work->event_type;
6041+
u32 queue_num = irq_work->queue_num;
6042+
6043+
switch (event_type) {
6044+
case HNS_ROCE_EVENT_TYPE_PATH_MIG:
6045+
case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED:
6046+
case HNS_ROCE_EVENT_TYPE_COMM_EST:
6047+
case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
6048+
case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
6049+
case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
6050+
case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
6051+
case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
6052+
case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION:
6053+
case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH:
6054+
hns_roce_qp_event(hr_dev, queue_num, event_type);
6055+
break;
6056+
case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
6057+
case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
6058+
hns_roce_srq_event(hr_dev, queue_num, event_type);
6059+
break;
6060+
case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
6061+
case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
6062+
hns_roce_cq_event(hr_dev, queue_num, event_type);
6063+
break;
6064+
default:
6065+
break;
6066+
}
6067+
6068+
dump_aeqe_log(irq_work);
60306069

60316070
kfree(irq_work);
60326071
}
@@ -6087,14 +6126,14 @@ static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq)
60876126
static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
60886127
struct hns_roce_eq *eq)
60896128
{
6090-
struct device *dev = hr_dev->dev;
60916129
struct hns_roce_aeqe *aeqe = next_aeqe_sw_v2(eq);
60926130
irqreturn_t aeqe_found = IRQ_NONE;
6131+
int num_aeqes = 0;
60936132
int event_type;
60946133
u32 queue_num;
60956134
int sub_type;
60966135

6097-
while (aeqe) {
6136+
while (aeqe && num_aeqes < HNS_AEQ_POLLING_BUDGET) {
60986137
/* Make sure we read AEQ entry after we have checked the
60996138
* ownership bit
61006139
*/
@@ -6105,38 +6144,20 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
61056144
queue_num = hr_reg_read(aeqe, AEQE_EVENT_QUEUE_NUM);
61066145

61076146
switch (event_type) {
6108-
case HNS_ROCE_EVENT_TYPE_PATH_MIG:
6109-
case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED:
6110-
case HNS_ROCE_EVENT_TYPE_COMM_EST:
6111-
case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
61126147
case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
6113-
case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
61146148
case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
61156149
case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
61166150
case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION:
61176151
case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH:
6118-
hns_roce_qp_event(hr_dev, queue_num, event_type);
6119-
break;
6120-
case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
6121-
case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
6122-
hns_roce_srq_event(hr_dev, queue_num, event_type);
6123-
break;
6124-
case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
6125-
case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
6126-
hns_roce_cq_event(hr_dev, queue_num, event_type);
6152+
hns_roce_flush_cqe(hr_dev, queue_num);
61276153
break;
61286154
case HNS_ROCE_EVENT_TYPE_MB:
61296155
hns_roce_cmd_event(hr_dev,
61306156
le16_to_cpu(aeqe->event.cmd.token),
61316157
aeqe->event.cmd.status,
61326158
le64_to_cpu(aeqe->event.cmd.out_param));
61336159
break;
6134-
case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW:
6135-
case HNS_ROCE_EVENT_TYPE_FLR:
6136-
break;
61376160
default:
6138-
dev_err(dev, "unhandled event %d on EQ %d at idx %u.\n",
6139-
event_type, eq->eqn, eq->cons_index);
61406161
break;
61416162
}
61426163

@@ -6150,6 +6171,7 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
61506171
hns_roce_v2_init_irq_work(hr_dev, eq, queue_num);
61516172

61526173
aeqe = next_aeqe_sw_v2(eq);
6174+
++num_aeqes;
61536175
}
61546176

61556177
update_eq_db(eq);
@@ -6699,6 +6721,9 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
66996721
int ret;
67006722
int i;
67016723

6724+
if (hr_dev->caps.aeqe_depth < HNS_AEQ_POLLING_BUDGET)
6725+
return -EINVAL;
6726+
67026727
other_num = hr_dev->caps.num_other_vectors;
67036728
comp_num = hr_dev->caps.num_comp_vectors;
67046729
aeq_num = hr_dev->caps.num_aeq_vectors;

drivers/infiniband/hw/hns/hns_roce_hw_v2.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,11 @@
8585

8686
#define HNS_ROCE_V2_TABLE_CHUNK_SIZE (1 << 18)
8787

88+
/* budget must be smaller than aeqe_depth to guarantee that we update
89+
* the ci before we have polled all the entries in the EQ.
90+
*/
91+
#define HNS_AEQ_POLLING_BUDGET 64
92+
8893
enum {
8994
HNS_ROCE_CMD_FLAG_IN = BIT(0),
9095
HNS_ROCE_CMD_FLAG_OUT = BIT(1),

drivers/infiniband/hw/hns/hns_roce_qp.c

Lines changed: 35 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,25 @@
3939
#include "hns_roce_device.h"
4040
#include "hns_roce_hem.h"
4141

42+
static struct hns_roce_qp *hns_roce_qp_lookup(struct hns_roce_dev *hr_dev,
43+
u32 qpn)
44+
{
45+
struct device *dev = hr_dev->dev;
46+
struct hns_roce_qp *qp;
47+
unsigned long flags;
48+
49+
xa_lock_irqsave(&hr_dev->qp_table_xa, flags);
50+
qp = __hns_roce_qp_lookup(hr_dev, qpn);
51+
if (qp)
52+
refcount_inc(&qp->refcount);
53+
xa_unlock_irqrestore(&hr_dev->qp_table_xa, flags);
54+
55+
if (!qp)
56+
dev_warn(dev, "async event for bogus QP %08x\n", qpn);
57+
58+
return qp;
59+
}
60+
4261
static void flush_work_handle(struct work_struct *work)
4362
{
4463
struct hns_roce_work *flush_work = container_of(work,
@@ -95,31 +114,28 @@ void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp)
95114

96115
/* Deliver an async event to the QP's event handler. The flush-CQE work
 * formerly done here was split out into hns_roce_flush_cqe() so the AEQ
 * top half can flush CQEs timely while deferring this notification to
 * the bottom half.
 */
void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type)
{
	struct hns_roce_qp *qp;

	qp = hns_roce_qp_lookup(hr_dev, qpn);
	if (!qp)
		return;

	qp->event(qp, (enum hns_roce_event)event_type);

	/* Drop the reference taken by hns_roce_qp_lookup(). */
	if (refcount_dec_and_test(&qp->refcount))
		complete(&qp->free);
}
129+
void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn)
130+
{
131+
struct hns_roce_qp *qp;
132+
133+
qp = hns_roce_qp_lookup(hr_dev, qpn);
134+
if (!qp)
135+
return;
136+
137+
qp->state = IB_QPS_ERR;
138+
flush_cqe(hr_dev, qp);
123139

124140
if (refcount_dec_and_test(&qp->refcount))
125141
complete(&qp->free);

0 commit comments

Comments
 (0)