Skip to content

Commit a6b5a7a

Browse files
YiPeng Chai authored and alexdeucher committed
drm/amdgpu: query bad page info of ras module
Query the bad page info of the RAS module.

V2: Update the code to reuse the bad page output code.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 62902b8 commit a6b5a7a

1 file changed

Lines changed: 98 additions & 44 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 98 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -1782,7 +1782,9 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
17821782
/* sysfs begin */
17831783

17841784
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1785-
struct ras_badpage **bps, unsigned int *count);
1785+
struct ras_badpage *bps, uint32_t count, uint32_t start);
1786+
static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev,
1787+
struct ras_badpage *bps, uint32_t count, uint32_t start);
17861788

17871789
static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
17881790
{
@@ -1840,19 +1842,50 @@ static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
18401842
unsigned int end = div64_ul(ppos + count - 1, element_size);
18411843
ssize_t s = 0;
18421844
struct ras_badpage *bps = NULL;
1843-
unsigned int bps_count = 0;
1845+
int bps_count = 0, i, status;
1846+
uint64_t address;
18441847

18451848
memset(buf, 0, count);
18461849

1847-
if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
1850+
bps_count = end - start;
1851+
bps = kmalloc_array(bps_count, sizeof(*bps), GFP_KERNEL);
1852+
if (!bps)
1853+
return 0;
1854+
1855+
memset(bps, 0, sizeof(*bps) * bps_count);
1856+
1857+
if (amdgpu_uniras_enabled(adev))
1858+
bps_count = amdgpu_uniras_badpages_read(adev, bps, bps_count, start);
1859+
else
1860+
bps_count = amdgpu_ras_badpages_read(adev, bps, bps_count, start);
1861+
1862+
if (bps_count <= 0) {
1863+
kfree(bps);
18481864
return 0;
1865+
}
1866+
1867+
for (i = 0; i < bps_count; i++) {
1868+
address = ((uint64_t)bps[i].bp) << AMDGPU_GPU_PAGE_SHIFT;
1869+
if (amdgpu_ras_check_critical_address(adev, address))
1870+
continue;
1871+
1872+
bps[i].size = AMDGPU_GPU_PAGE_SIZE;
1873+
1874+
status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
1875+
address);
1876+
if (status == -EBUSY)
1877+
bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
1878+
else if (status == -ENOENT)
1879+
bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
1880+
else
1881+
bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED;
18491882

1850-
for (; start < end && start < bps_count; start++)
18511883
s += scnprintf(&buf[s], element_size + 1,
18521884
"0x%08x : 0x%08x : %1s\n",
1853-
bps[start].bp,
1854-
bps[start].size,
1855-
amdgpu_ras_badpage_flags_str(bps[start].flags));
1885+
bps[i].bp,
1886+
bps[i].size,
1887+
amdgpu_ras_badpage_flags_str(bps[i].flags));
1888+
}
18561889

18571890
kfree(bps);
18581891

@@ -2645,62 +2678,83 @@ static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
26452678
}
26462679
}
26472680

2648-
/* recovery begin */
2649-
2650-
/* return 0 on success.
2651-
* caller need free bps.
2652-
*/
26532681
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
2654-
struct ras_badpage **bps, unsigned int *count)
2682+
struct ras_badpage *bps, uint32_t count, uint32_t start)
26552683
{
26562684
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
26572685
struct ras_err_handler_data *data;
2658-
int i = 0;
2659-
int ret = 0, status;
2686+
int r = 0;
2687+
uint32_t i;
26602688

26612689
if (!con || !con->eh_data || !bps || !count)
26622690
return -EINVAL;
26632691

26642692
mutex_lock(&con->recovery_lock);
26652693
data = con->eh_data;
2666-
if (!data || data->count == 0) {
2667-
*bps = NULL;
2668-
ret = -EINVAL;
2669-
goto out;
2694+
if (start < data->count) {
2695+
for (i = start; i < data->count; i++) {
2696+
if (!data->bps[i].ts)
2697+
continue;
2698+
2699+
bps[r].bp = data->bps[i].retired_page;
2700+
r++;
2701+
if (r >= count)
2702+
break;
2703+
}
26702704
}
2705+
mutex_unlock(&con->recovery_lock);
26712706

2672-
*bps = kmalloc_array(data->count, sizeof(struct ras_badpage), GFP_KERNEL);
2673-
if (!*bps) {
2674-
ret = -ENOMEM;
2675-
goto out;
2676-
}
2707+
return r;
2708+
}
26772709

2678-
for (; i < data->count; i++) {
2679-
if (!data->bps[i].ts)
2680-
continue;
2710+
static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev,
2711+
struct ras_badpage *bps, uint32_t count, uint32_t start)
2712+
{
2713+
struct ras_cmd_bad_pages_info_req cmd_input;
2714+
struct ras_cmd_bad_pages_info_rsp *output;
2715+
uint32_t group, start_group, end_group;
2716+
uint32_t pos, pos_in_group;
2717+
int r = 0, i;
26812718

2682-
(*bps)[i] = (struct ras_badpage){
2683-
.bp = data->bps[i].retired_page,
2684-
.size = AMDGPU_GPU_PAGE_SIZE,
2685-
.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
2686-
};
2719+
if (!bps || !count)
2720+
return -EINVAL;
26872721

2688-
if (amdgpu_ras_check_critical_address(adev,
2689-
data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
2690-
continue;
2722+
output = kmalloc(sizeof(*output), GFP_KERNEL);
2723+
if (!output)
2724+
return -ENOMEM;
26912725

2692-
status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
2693-
data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT);
2694-
if (status == -EBUSY)
2695-
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
2696-
else if (status == -ENOENT)
2697-
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
2726+
memset(&cmd_input, 0, sizeof(cmd_input));
2727+
2728+
start_group = start / RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
2729+
end_group = (start + count + RAS_CMD_MAX_BAD_PAGES_PER_GROUP - 1) /
2730+
RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
2731+
2732+
pos = start;
2733+
for (group = start_group; group < end_group; group++) {
2734+
memset(output, 0, sizeof(*output));
2735+
cmd_input.group_index = group;
2736+
if (amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BAD_PAGES,
2737+
&cmd_input, sizeof(cmd_input), output, sizeof(*output)))
2738+
goto out;
2739+
2740+
if (pos >= output->bp_total_cnt)
2741+
goto out;
2742+
2743+
pos_in_group = pos - group * RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
2744+
for (i = pos_in_group; i < output->bp_in_group; i++, pos++) {
2745+
if (!output->records[i].ts)
2746+
continue;
2747+
2748+
bps[r].bp = output->records[i].retired_page;
2749+
r++;
2750+
if (r >= count)
2751+
goto out;
2752+
}
26982753
}
26992754

2700-
*count = con->bad_page_num;
27012755
out:
2702-
mutex_unlock(&con->recovery_lock);
2703-
return ret;
2756+
kfree(output);
2757+
return r;
27042758
}
27052759

27062760
static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,

0 commit comments

Comments
 (0)