Skip to content

Commit 7f599fe

Browse files
Stanley.Yang authored and alexdeucher committed
drm/amdgpu: Add support EEPROM table v2.1
Add ras info to EEPROM table, app can analyse device ECC status without GPU driver through EEPROM table ras info. Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent b573cf8 commit 7f599fe

3 files changed

Lines changed: 203 additions & 15 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2314,7 +2314,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
23142314
atomic_set(&con->in_recovery, 0);
23152315
con->eeprom_control.bad_channel_bitmap = 0;
23162316

2317-
max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
2317+
max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
23182318
amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
23192319

23202320
/* Todo: During test the SMU might fail to read the eeprom through I2C

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

Lines changed: 191 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,20 @@
7272
/* Bad GPU tag ‘BADG’ */
7373
#define RAS_TABLE_HDR_BAD 0x42414447
7474

75+
/**
76+
* EEPROM Table structure v1
77+
* ---------------------------------
78+
* | |
79+
* | EEPROM TABLE HEADER |
80+
* | ( size 20 Bytes ) |
81+
* | |
82+
* ---------------------------------
83+
* | |
84+
* | BAD PAGE RECORD AREA |
85+
* | |
86+
* ---------------------------------
87+
*/
88+
7589
/* Assume 2-Mbit size EEPROM and take up the whole space. */
7690
#define RAS_TBL_SIZE_BYTES (256 * 1024)
7791
#define RAS_TABLE_START 0
@@ -80,6 +94,26 @@
8094
#define RAS_MAX_RECORD_COUNT ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
8195
/ RAS_TABLE_RECORD_SIZE)
8296

97+
/**
98+
* EEPROM Table structure v2.1
99+
* ---------------------------------
100+
* | |
101+
* | EEPROM TABLE HEADER |
102+
* | ( size 20 Bytes ) |
103+
* | |
104+
* ---------------------------------
105+
* | |
106+
* | EEPROM TABLE RAS INFO |
107+
* | (available info size 4 Bytes) |
108+
* | ( reserved size 252 Bytes ) |
109+
* | |
110+
* ---------------------------------
111+
* | |
112+
* | BAD PAGE RECORD AREA |
113+
* | |
114+
* ---------------------------------
115+
*/
116+
83117
/* EEPROM Table V2_1 */
84118
#define RAS_TABLE_V2_1_INFO_SIZE 256
85119
#define RAS_TABLE_V2_1_INFO_START RAS_TABLE_HEADER_SIZE
@@ -242,6 +276,69 @@ static int __write_table_header(struct amdgpu_ras_eeprom_control *control)
242276
return res;
243277
}
244278

279+
static void
280+
__encode_table_ras_info_to_buf(struct amdgpu_ras_eeprom_table_ras_info *rai,
281+
unsigned char *buf)
282+
{
283+
u32 *pp = (uint32_t *)buf;
284+
u32 tmp;
285+
286+
tmp = ((uint32_t)(rai->rma_status) & 0xFF) |
287+
(((uint32_t)(rai->health_percent) << 8) & 0xFF00) |
288+
(((uint32_t)(rai->ecc_page_threshold) << 16) & 0xFFFF0000);
289+
pp[0] = cpu_to_le32(tmp);
290+
}
291+
292+
static void
293+
__decode_table_ras_info_from_buf(struct amdgpu_ras_eeprom_table_ras_info *rai,
294+
unsigned char *buf)
295+
{
296+
u32 *pp = (uint32_t *)buf;
297+
u32 tmp;
298+
299+
tmp = le32_to_cpu(pp[0]);
300+
rai->rma_status = tmp & 0xFF;
301+
rai->health_percent = (tmp >> 8) & 0xFF;
302+
rai->ecc_page_threshold = (tmp >> 16) & 0xFFFF;
303+
}
304+
305+
static int __write_table_ras_info(struct amdgpu_ras_eeprom_control *control)
306+
{
307+
struct amdgpu_device *adev = to_amdgpu_device(control);
308+
u8 *buf;
309+
int res;
310+
311+
buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL);
312+
if (!buf) {
313+
DRM_ERROR("Failed to alloc buf to write table ras info\n");
314+
return -ENOMEM;
315+
}
316+
317+
__encode_table_ras_info_to_buf(&control->tbl_rai, buf);
318+
319+
/* i2c may be unstable in gpu reset */
320+
down_read(&adev->reset_domain->sem);
321+
res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus,
322+
control->i2c_address +
323+
control->ras_info_offset,
324+
buf, RAS_TABLE_V2_1_INFO_SIZE);
325+
up_read(&adev->reset_domain->sem);
326+
327+
if (res < 0) {
328+
DRM_ERROR("Failed to write EEPROM table ras info:%d", res);
329+
} else if (res < RAS_TABLE_V2_1_INFO_SIZE) {
330+
DRM_ERROR("Short write:%d out of %d\n",
331+
res, RAS_TABLE_V2_1_INFO_SIZE);
332+
res = -EIO;
333+
} else {
334+
res = 0;
335+
}
336+
337+
kfree(buf);
338+
339+
return res;
340+
}
341+
245342
static u8 __calc_hdr_byte_sum(const struct amdgpu_ras_eeprom_control *control)
246343
{
247344
int ii;
@@ -301,14 +398,27 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
301398
mutex_lock(&control->ras_tbl_mutex);
302399

303400
hdr->header = RAS_TABLE_HDR_VAL;
304-
hdr->version = RAS_TABLE_VER_V1;
305-
hdr->first_rec_offset = RAS_RECORD_START;
306-
hdr->tbl_size = RAS_TABLE_HEADER_SIZE;
401+
if (adev->umc.ras &&
402+
adev->umc.ras->set_eeprom_table_version)
403+
adev->umc.ras->set_eeprom_table_version(hdr);
404+
else
405+
hdr->version = RAS_TABLE_VER_V1;
406+
407+
if (hdr->version == RAS_TABLE_VER_V2_1) {
408+
hdr->first_rec_offset = RAS_RECORD_START_V2_1;
409+
hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
410+
RAS_TABLE_V2_1_INFO_SIZE;
411+
} else {
412+
hdr->first_rec_offset = RAS_RECORD_START;
413+
hdr->tbl_size = RAS_TABLE_HEADER_SIZE;
414+
}
307415

308416
csum = __calc_hdr_byte_sum(control);
309417
csum = -csum;
310418
hdr->checksum = csum;
311419
res = __write_table_header(control);
420+
if (!res && hdr->version > RAS_TABLE_VER_V1)
421+
res = __write_table_ras_info(control);
312422

313423
control->ras_num_recs = 0;
314424
control->ras_fri = 0;
@@ -587,9 +697,13 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
587697
control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
588698
}
589699

590-
control->tbl_hdr.version = RAS_TABLE_VER_V1;
591-
control->tbl_hdr.first_rec_offset = RAS_INDEX_TO_OFFSET(control, control->ras_fri);
592-
control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
700+
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
701+
control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
702+
RAS_TABLE_V2_1_INFO_SIZE +
703+
control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
704+
else
705+
control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
706+
control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
593707
control->tbl_hdr.checksum = 0;
594708

595709
buf_size = control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
@@ -629,6 +743,8 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
629743
csum = -csum;
630744
control->tbl_hdr.checksum = csum;
631745
res = __write_table_header(control);
746+
if (!res && control->tbl_hdr.version > RAS_TABLE_VER_V1)
747+
res = __write_table_ras_info(control);
632748
Out:
633749
kfree(buf);
634750
return res;
@@ -819,9 +935,12 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
819935
return res;
820936
}
821937

822-
uint32_t amdgpu_ras_eeprom_max_record_count(void)
938+
uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control)
823939
{
824-
return RAS_MAX_RECORD_COUNT;
940+
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
941+
return RAS_MAX_RECORD_COUNT_V2_1;
942+
else
943+
return RAS_MAX_RECORD_COUNT;
825944
}
826945

827946
static ssize_t
@@ -1063,8 +1182,14 @@ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control
10631182
int buf_size, res;
10641183
u8 csum, *buf, *pp;
10651184

1066-
buf_size = RAS_TABLE_HEADER_SIZE +
1067-
control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
1185+
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
1186+
buf_size = RAS_TABLE_HEADER_SIZE +
1187+
RAS_TABLE_V2_1_INFO_SIZE +
1188+
control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
1189+
else
1190+
buf_size = RAS_TABLE_HEADER_SIZE +
1191+
control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
1192+
10681193
buf = kzalloc(buf_size, GFP_KERNEL);
10691194
if (!buf) {
10701195
DRM_ERROR("Out of memory checking RAS table checksum.\n");
@@ -1092,6 +1217,39 @@ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control
10921217
return res < 0 ? res : csum;
10931218
}
10941219

1220+
static int __read_table_ras_info(struct amdgpu_ras_eeprom_control *control)
1221+
{
1222+
struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai;
1223+
struct amdgpu_device *adev = to_amdgpu_device(control);
1224+
unsigned char *buf;
1225+
int res;
1226+
1227+
buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL);
1228+
if (!buf) {
1229+
DRM_ERROR("Failed to alloc buf to read EEPROM table ras info\n");
1230+
return -ENOMEM;
1231+
}
1232+
1233+
/**
1234+
* EEPROM table V2_1 supports ras info,
1235+
* read EEPROM table ras info
1236+
*/
1237+
res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
1238+
control->i2c_address + control->ras_info_offset,
1239+
buf, RAS_TABLE_V2_1_INFO_SIZE);
1240+
if (res < RAS_TABLE_V2_1_INFO_SIZE) {
1241+
DRM_ERROR("Failed to read EEPROM table ras info, res:%d", res);
1242+
res = res >= 0 ? -EIO : res;
1243+
goto Out;
1244+
}
1245+
1246+
__decode_table_ras_info_from_buf(rai, buf);
1247+
1248+
Out:
1249+
kfree(buf);
1250+
return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
1251+
}
1252+
10951253
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
10961254
bool *exceed_err_limit)
10971255
{
@@ -1114,8 +1272,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
11141272
return -EINVAL;
11151273

11161274
control->ras_header_offset = RAS_HDR_START;
1117-
control->ras_record_offset = RAS_RECORD_START;
1118-
control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
1275+
control->ras_info_offset = RAS_TABLE_V2_1_INFO_START;
11191276
mutex_init(&control->ras_tbl_mutex);
11201277

11211278
/* Read the table header from EEPROM address */
@@ -1129,12 +1286,27 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
11291286

11301287
__decode_table_header_from_buf(hdr, buf);
11311288

1132-
control->ras_num_recs = RAS_NUM_RECS(hdr);
1289+
if (hdr->version == RAS_TABLE_VER_V2_1) {
1290+
control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
1291+
control->ras_record_offset = RAS_RECORD_START_V2_1;
1292+
control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
1293+
} else {
1294+
control->ras_num_recs = RAS_NUM_RECS(hdr);
1295+
control->ras_record_offset = RAS_RECORD_START;
1296+
control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
1297+
}
11331298
control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
11341299

11351300
if (hdr->header == RAS_TABLE_HDR_VAL) {
11361301
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
11371302
control->ras_num_recs);
1303+
1304+
if (hdr->version == RAS_TABLE_VER_V2_1) {
1305+
res = __read_table_ras_info(control);
1306+
if (res)
1307+
return res;
1308+
}
1309+
11381310
res = __verify_ras_table_checksum(control);
11391311
if (res)
11401312
DRM_ERROR("RAS table incorrect checksum or error:%d\n",
@@ -1148,6 +1320,12 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
11481320
ras->bad_page_cnt_threshold);
11491321
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
11501322
amdgpu_bad_page_threshold != 0) {
1323+
if (hdr->version == RAS_TABLE_VER_V2_1) {
1324+
res = __read_table_ras_info(control);
1325+
if (res)
1326+
return res;
1327+
}
1328+
11511329
res = __verify_ras_table_checksum(control);
11521330
if (res)
11531331
DRM_ERROR("RAS Table incorrect checksum or error:%d\n",

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,18 @@ struct amdgpu_ras_eeprom_table_header {
4646
uint32_t checksum;
4747
} __packed;
4848

49+
struct amdgpu_ras_eeprom_table_ras_info {
50+
u8 rma_status;
51+
u8 health_percent;
52+
u16 ecc_page_threshold;
53+
u32 padding[64 - 1];
54+
} __packed;
55+
4956
struct amdgpu_ras_eeprom_control {
5057
struct amdgpu_ras_eeprom_table_header tbl_hdr;
5158

59+
struct amdgpu_ras_eeprom_table_ras_info tbl_rai;
60+
5261
/* Base I2C EEPPROM 19-bit memory address,
5362
* where the table is located. For more information,
5463
* see top of amdgpu_eeprom.c.
@@ -61,6 +70,7 @@ struct amdgpu_ras_eeprom_control {
6170
* right after the header.
6271
*/
6372
u32 ras_header_offset;
73+
u32 ras_info_offset;
6474
u32 ras_record_offset;
6575

6676
/* Number of records in the table.
@@ -127,7 +137,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
127137
int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
128138
struct eeprom_table_record *records, const u32 num);
129139

130-
uint32_t amdgpu_ras_eeprom_max_record_count(void);
140+
uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control);
131141

132142
void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control);
133143

0 commit comments

Comments
 (0)