Skip to content

Commit 856ef55

Browse files
committed
Merge branch 'for-6.4/cxl-poison' into for-6.4/cxl
Include the poison list and injection infrastructure from Alison for v6.4.
2 parents 267214a + 30a8a10 commit 856ef55

14 files changed

Lines changed: 1175 additions & 13 deletions

File tree

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
What: /sys/kernel/debug/cxl/memX/inject_poison
2+
Date: April, 2023
3+
KernelVersion: v6.4
4+
Contact: linux-cxl@vger.kernel.org
5+
Description:
6+
(WO) When a Device Physical Address (DPA) is written to this
7+
attribute, the memdev driver sends an inject poison command to
8+
the device for the specified address. The DPA must be 64-byte
9+
aligned and the length of the injected poison is 64 bytes. If
10+
successful, the device returns poison when the address is
11+
accessed through the CXL.mem bus. Injecting poison adds the
12+
address to the device's Poison List and the error source is set
13+
to Injected. In addition, the device adds a poison creation
14+
event to its internal Informational Event log, updates the
15+
Event Status register, and if configured, interrupts the host.
16+
It is not an error to inject poison into an address that
17+
already has poison present and no error is returned. The
18+
inject_poison attribute is only visible for devices supporting
19+
the capability.
20+
21+
22+
What: /sys/kernel/debug/cxl/memX/clear_poison
23+
Date: April, 2023
24+
KernelVersion: v6.4
25+
Contact: linux-cxl@vger.kernel.org
26+
Description:
27+
(WO) When a Device Physical Address (DPA) is written to this
28+
attribute, the memdev driver sends a clear poison command to
29+
the device for the specified address. Clearing poison removes
30+
the address from the device's Poison List and writes 0 (zero)
31+
for 64 bytes starting at address. It is not an error to clear
32+
poison from an address that does not have poison set. If the
33+
device cannot clear poison from the address, -ENXIO is returned.
34+
The clear_poison attribute is only visible for devices
35+
supporting the capability.

Documentation/ABI/testing/sysfs-bus-cxl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -415,3 +415,17 @@ Description:
415415
1), and checks that the hardware accepts the commit request.
416416
Reading this value indicates whether the region is committed or
417417
not.
418+
419+
420+
What: /sys/bus/cxl/devices/memX/trigger_poison_list
421+
Date: April, 2023
422+
KernelVersion: v6.4
423+
Contact: linux-cxl@vger.kernel.org
424+
Description:
425+
(WO) When a boolean 'true' is written to this attribute the
426+
memdev driver retrieves the poison list from the device. The
427+
list consists of addresses that are poisoned, or would result
428+
in poison if accessed, and the source of the poison. This
429+
attribute is only visible for devices supporting the
430+
capability. The retrieved errors are logged as kernel
431+
events when cxl_poison event tracing is enabled.

drivers/cxl/core/core.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,12 @@ void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled);
2525
#define CXL_DAX_REGION_TYPE(x) (&cxl_dax_region_type)
2626
int cxl_region_init(void);
2727
void cxl_region_exit(void);
28+
int cxl_get_poison_by_endpoint(struct cxl_port *port);
2829
#else
30+
/*
 * Fallback stub on the #else side of the surrounding preprocessor
 * conditional: ignores @port and always returns 0.
 * NOTE(review): the controlling #if is outside this hunk; presumably it is
 * the region-support config option -- confirm.
 */
static inline int cxl_get_poison_by_endpoint(struct cxl_port *port)
31+
{
32+
return 0;
33+
}
2934
static inline void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled)
3035
{
3136
}
@@ -64,4 +69,10 @@ int cxl_memdev_init(void);
6469
void cxl_memdev_exit(void);
6570
void cxl_mbox_init(void);
6671

72+
/*
 * Tag passed as the last argument of trace_cxl_poison() to identify which
 * operation produced the traced record. cxl_mem_get_poison() emits
 * CXL_POISON_TRACE_LIST; the INJECT and CLEAR values are presumably used by
 * the inject/clear paths, which are not visible here -- confirm.
 */
enum cxl_poison_trace_type {
73+
CXL_POISON_TRACE_LIST,
74+
CXL_POISON_TRACE_INJECT,
75+
CXL_POISON_TRACE_CLEAR,
76+
};
77+
6778
#endif /* __CXL_CORE_H__ */

drivers/cxl/core/mbox.c

Lines changed: 143 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
#include <linux/debugfs.h>
66
#include <linux/ktime.h>
77
#include <linux/mutex.h>
8+
#include <asm/unaligned.h>
9+
#include <cxlpci.h>
810
#include <cxlmem.h>
911
#include <cxl.h>
1012

@@ -61,12 +63,7 @@ static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = {
6163
CXL_CMD(SET_ALERT_CONFIG, 0xc, 0, 0),
6264
CXL_CMD(GET_SHUTDOWN_STATE, 0, 0x1, 0),
6365
CXL_CMD(SET_SHUTDOWN_STATE, 0x1, 0, 0),
64-
CXL_CMD(GET_POISON, 0x10, CXL_VARIABLE_PAYLOAD, 0),
65-
CXL_CMD(INJECT_POISON, 0x8, 0, 0),
66-
CXL_CMD(CLEAR_POISON, 0x48, 0, 0),
6766
CXL_CMD(GET_SCAN_MEDIA_CAPS, 0x10, 0x4, 0),
68-
CXL_CMD(SCAN_MEDIA, 0x11, 0, 0),
69-
CXL_CMD(GET_SCAN_MEDIA, 0, CXL_VARIABLE_PAYLOAD, 0),
7067
};
7168

7269
/*
@@ -87,6 +84,9 @@ static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = {
8784
*
8885
* CXL_MBOX_OP_[GET_]SCAN_MEDIA: The kernel provides a native error list that
8986
* is kept up to date with patrol notifications and error management.
87+
*
88+
* CXL_MBOX_OP_[GET_,INJECT_,CLEAR_]POISON: These commands require kernel
89+
* driver orchestration for safety.
9090
*/
9191
static u16 cxl_disabled_raw_commands[] = {
9292
CXL_MBOX_OP_ACTIVATE_FW,
@@ -95,6 +95,9 @@ static u16 cxl_disabled_raw_commands[] = {
9595
CXL_MBOX_OP_SET_SHUTDOWN_STATE,
9696
CXL_MBOX_OP_SCAN_MEDIA,
9797
CXL_MBOX_OP_GET_SCAN_MEDIA,
98+
CXL_MBOX_OP_GET_POISON,
99+
CXL_MBOX_OP_INJECT_POISON,
100+
CXL_MBOX_OP_CLEAR_POISON,
98101
};
99102

100103
/*
@@ -119,6 +122,43 @@ static bool cxl_is_security_command(u16 opcode)
119122
return false;
120123
}
121124

125+
/*
 * cxl_is_poison_command() - test whether @opcode is in the poison command set
 * @opcode: 16-bit mailbox opcode
 *
 * Return: true when the upper byte of @opcode equals 0x43
 * (CXL_MBOX_OP_POISON_CMDS), false otherwise.
 */
static bool cxl_is_poison_command(u16 opcode)
126+
{
127+
/* Command-set identifier carried in the high byte of the opcode */
#define CXL_MBOX_OP_POISON_CMDS 0x43
128+
129+
if ((opcode >> 8) == CXL_MBOX_OP_POISON_CMDS)
130+
return true;
131+
132+
return false;
133+
}
134+
135+
/*
 * cxl_set_poison_cmd_enabled() - record that a poison-related opcode is enabled
 * @poison: poison state whose enabled_cmds bitmap is updated
 * @opcode: mailbox opcode to record
 *
 * Sets the CXL_POISON_ENABLED_* bit that corresponds to @opcode in
 * @poison->enabled_cmds. Opcodes other than the six handled below are
 * silently ignored.
 */
static void cxl_set_poison_cmd_enabled(struct cxl_poison_state *poison,
136+
u16 opcode)
137+
{
138+
switch (opcode) {
139+
case CXL_MBOX_OP_GET_POISON:
140+
set_bit(CXL_POISON_ENABLED_LIST, poison->enabled_cmds);
141+
break;
142+
case CXL_MBOX_OP_INJECT_POISON:
143+
set_bit(CXL_POISON_ENABLED_INJECT, poison->enabled_cmds);
144+
break;
145+
case CXL_MBOX_OP_CLEAR_POISON:
146+
set_bit(CXL_POISON_ENABLED_CLEAR, poison->enabled_cmds);
147+
break;
148+
case CXL_MBOX_OP_GET_SCAN_MEDIA_CAPS:
149+
set_bit(CXL_POISON_ENABLED_SCAN_CAPS, poison->enabled_cmds);
150+
break;
151+
case CXL_MBOX_OP_SCAN_MEDIA:
152+
set_bit(CXL_POISON_ENABLED_SCAN_MEDIA, poison->enabled_cmds);
153+
break;
154+
case CXL_MBOX_OP_GET_SCAN_MEDIA:
155+
set_bit(CXL_POISON_ENABLED_SCAN_RESULTS, poison->enabled_cmds);
156+
break;
157+
default:
158+
break;
159+
}
160+
}
161+
122162
static struct cxl_mem_command *cxl_mem_find_command(u16 opcode)
123163
{
124164
struct cxl_mem_command *c;
@@ -634,13 +674,18 @@ static void cxl_walk_cel(struct cxl_dev_state *cxlds, size_t size, u8 *cel)
634674
u16 opcode = le16_to_cpu(cel_entry[i].opcode);
635675
struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
636676

637-
if (!cmd) {
677+
if (!cmd && !cxl_is_poison_command(opcode)) {
638678
dev_dbg(cxlds->dev,
639679
"Opcode 0x%04x unsupported by driver\n", opcode);
640680
continue;
641681
}
642682

643-
set_bit(cmd->info.id, cxlds->enabled_cmds);
683+
if (cmd)
684+
set_bit(cmd->info.id, cxlds->enabled_cmds);
685+
686+
if (cxl_is_poison_command(opcode))
687+
cxl_set_poison_cmd_enabled(&cxlds->poison, opcode);
688+
644689
dev_dbg(cxlds->dev, "Opcode 0x%04x enabled\n", opcode);
645690
}
646691
}
@@ -994,6 +1039,7 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
9941039
/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
9951040
struct cxl_mbox_identify id;
9961041
struct cxl_mbox_cmd mbox_cmd;
1042+
u32 val;
9971043
int rc;
9981044

9991045
mbox_cmd = (struct cxl_mbox_cmd) {
@@ -1017,6 +1063,11 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
10171063
cxlds->lsa_size = le32_to_cpu(id.lsa_size);
10181064
memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision));
10191065

1066+
if (test_bit(CXL_POISON_ENABLED_LIST, cxlds->poison.enabled_cmds)) {
1067+
val = get_unaligned_le24(id.poison_list_max_mer);
1068+
cxlds->poison.max_errors = min_t(u32, val, CXL_POISON_LIST_MAX);
1069+
}
1070+
10201071
return 0;
10211072
}
10221073
EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
@@ -1107,6 +1158,91 @@ int cxl_set_timestamp(struct cxl_dev_state *cxlds)
11071158
}
11081159
EXPORT_SYMBOL_NS_GPL(cxl_set_timestamp, CXL);
11091160

1161+
/*
 * cxl_mem_get_poison() - issue Get Poison List and trace every returned record
 * @cxlmd: memdev whose device state (cxlmd->cxlds) is queried
 * @offset: start of the range to query, sent to the device as little-endian
 * @len: length of the range; divided by CXL_POISON_LEN_MULT before being sent
 * @cxlr: region handed through unchanged to trace_cxl_poison()
 *
 * Holds cxlds->poison.lock for the duration and uses cxlds->poison.list_out
 * as the output payload buffer. The same command is re-sent while the device
 * sets CXL_POISON_FLAG_MORE in the output flags.
 *
 * Return: the non-zero result of mutex_lock_interruptible() if the lock was
 * not taken, otherwise the result of the last cxl_internal_send_cmd() call.
 */
int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
1162+
struct cxl_region *cxlr)
1163+
{
1164+
struct cxl_dev_state *cxlds = cxlmd->cxlds;
1165+
struct cxl_mbox_poison_out *po;
1166+
struct cxl_mbox_poison_in pi;
1167+
struct cxl_mbox_cmd mbox_cmd;
1168+
int nr_records = 0;
1169+
int rc;
1170+
1171+
rc = mutex_lock_interruptible(&cxlds->poison.lock);
1172+
if (rc)
1173+
return rc;
1174+
1175+
po = cxlds->poison.list_out;
1176+
pi.offset = cpu_to_le64(offset);
1177+
pi.length = cpu_to_le64(len / CXL_POISON_LEN_MULT);
1178+
1179+
/* Output may be as large as the mailbox payload; the minimum is a header with zero records */
mbox_cmd = (struct cxl_mbox_cmd) {
1180+
.opcode = CXL_MBOX_OP_GET_POISON,
1181+
.size_in = sizeof(pi),
1182+
.payload_in = &pi,
1183+
.size_out = cxlds->payload_size,
1184+
.payload_out = po,
1185+
.min_out = struct_size(po, record, 0),
1186+
};
1187+
1188+
do {
1189+
rc = cxl_internal_send_cmd(cxlds, &mbox_cmd);
1190+
if (rc)
1191+
break;
1192+
1193+
/*
 * NOTE(review): po->count comes from the device and is used as the
 * loop bound without being checked against the size of the
 * list_out buffer -- confirm the device is trusted here.
 */
for (int i = 0; i < le16_to_cpu(po->count); i++)
1194+
trace_cxl_poison(cxlmd, cxlr, &po->record[i],
1195+
po->flags, po->overflow_ts,
1196+
CXL_POISON_TRACE_LIST);
1197+
1198+
/* Protect against an uncleared _FLAG_MORE */
1199+
nr_records = nr_records + le16_to_cpu(po->count);
1200+
if (nr_records >= cxlds->poison.max_errors) {
1201+
dev_dbg(&cxlmd->dev, "Max Error Records reached: %d\n",
1202+
nr_records);
1203+
break;
1204+
}
1205+
} while (po->flags & CXL_POISON_FLAG_MORE);
1206+
1207+
mutex_unlock(&cxlds->poison.lock);
1208+
return rc;
1209+
}
1210+
EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison, CXL);
1211+
1212+
/*
 * Release callback registered by cxl_poison_alloc_buf() via
 * devm_add_action_or_reset(); frees @buf with kvfree().
 */
static void free_poison_buf(void *buf)
1213+
{
1214+
kvfree(buf);
1215+
}
1216+
1217+
/* Get Poison List output buffer is protected by cxlds->poison.lock */
1218+
/*
 * cxl_poison_alloc_buf() - allocate the Get Poison List output buffer
 * @cxlds: device state that receives the buffer in poison.list_out
 *
 * Allocates cxlds->payload_size bytes with kvmalloc() and registers
 * free_poison_buf() against cxlds->dev so the buffer is released by devres.
 *
 * Return: -ENOMEM if the allocation fails, otherwise the result of
 * devm_add_action_or_reset().
 */
static int cxl_poison_alloc_buf(struct cxl_dev_state *cxlds)
1219+
{
1220+
cxlds->poison.list_out = kvmalloc(cxlds->payload_size, GFP_KERNEL);
1221+
if (!cxlds->poison.list_out)
1222+
return -ENOMEM;
1223+
1224+
return devm_add_action_or_reset(cxlds->dev, free_poison_buf,
1225+
cxlds->poison.list_out);
1226+
}
1227+
1228+
/*
 * cxl_poison_state_init() - set up poison list retrieval state
 * @cxlds: device state to initialise
 *
 * Does nothing when CXL_POISON_ENABLED_LIST is not set in
 * cxlds->poison.enabled_cmds. Otherwise allocates the list output buffer
 * and initialises cxlds->poison.lock.
 *
 * Return: 0 on success or when the list command is not enabled; the error
 * from cxl_poison_alloc_buf() on failure.
 */
int cxl_poison_state_init(struct cxl_dev_state *cxlds)
1229+
{
1230+
int rc;
1231+
1232+
if (!test_bit(CXL_POISON_ENABLED_LIST, cxlds->poison.enabled_cmds))
1233+
return 0;
1234+
1235+
rc = cxl_poison_alloc_buf(cxlds);
1236+
if (rc) {
1237+
/* No buffer, so mark the list command as unavailable again */
clear_bit(CXL_POISON_ENABLED_LIST, cxlds->poison.enabled_cmds);
1238+
return rc;
1239+
}
1240+
1241+
mutex_init(&cxlds->poison.lock);
1242+
return 0;
1243+
}
1244+
EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
1245+
11101246
struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
11111247
{
11121248
struct cxl_dev_state *cxlds;

0 commit comments

Comments
 (0)