@@ -1782,7 +1782,9 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
17821782/* sysfs begin */
17831783
17841784static int amdgpu_ras_badpages_read (struct amdgpu_device * adev ,
1785- struct ras_badpage * * bps , unsigned int * count );
1785+ struct ras_badpage * bps , uint32_t count , uint32_t start );
1786+ static int amdgpu_uniras_badpages_read (struct amdgpu_device * adev ,
1787+ struct ras_badpage * bps , uint32_t count , uint32_t start );
17861788
17871789static char * amdgpu_ras_badpage_flags_str (unsigned int flags )
17881790{
@@ -1840,19 +1842,50 @@ static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
18401842 unsigned int end = div64_ul (ppos + count - 1 , element_size );
18411843 ssize_t s = 0 ;
18421844 struct ras_badpage * bps = NULL ;
1843- unsigned int bps_count = 0 ;
1845+ int bps_count = 0 , i , status ;
1846+ uint64_t address ;
18441847
18451848 memset (buf , 0 , count );
18461849
1847- if (amdgpu_ras_badpages_read (adev , & bps , & bps_count ))
1850+ bps_count = end - start ;
1851+ bps = kmalloc_array (bps_count , sizeof (* bps ), GFP_KERNEL );
1852+ if (!bps )
1853+ return 0 ;
1854+
1855+ memset (bps , 0 , sizeof (* bps ) * bps_count );
1856+
1857+ if (amdgpu_uniras_enabled (adev ))
1858+ bps_count = amdgpu_uniras_badpages_read (adev , bps , bps_count , start );
1859+ else
1860+ bps_count = amdgpu_ras_badpages_read (adev , bps , bps_count , start );
1861+
1862+ if (bps_count <= 0 ) {
1863+ kfree (bps );
18481864 return 0 ;
1865+ }
1866+
1867+ for (i = 0 ; i < bps_count ; i ++ ) {
1868+ address = ((uint64_t )bps [i ].bp ) << AMDGPU_GPU_PAGE_SHIFT ;
1869+ if (amdgpu_ras_check_critical_address (adev , address ))
1870+ continue ;
1871+
1872+ bps [i ].size = AMDGPU_GPU_PAGE_SIZE ;
1873+
1874+ status = amdgpu_vram_mgr_query_page_status (& adev -> mman .vram_mgr ,
1875+ address );
1876+ if (status == - EBUSY )
1877+ bps [i ].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING ;
1878+ else if (status == - ENOENT )
1879+ bps [i ].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT ;
1880+ else
1881+ bps [i ].flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED ;
18491882
1850- for (; start < end && start < bps_count ; start ++ )
18511883 s += scnprintf (& buf [s ], element_size + 1 ,
18521884 "0x%08x : 0x%08x : %1s\n" ,
1853- bps [start ].bp ,
1854- bps [start ].size ,
1855- amdgpu_ras_badpage_flags_str (bps [start ].flags ));
1885+ bps [i ].bp ,
1886+ bps [i ].size ,
1887+ amdgpu_ras_badpage_flags_str (bps [i ].flags ));
1888+ }
18561889
18571890 kfree (bps );
18581891
@@ -2645,62 +2678,83 @@ static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
26452678 }
26462679}
26472680
2648- /* recovery begin */
2649-
2650- /* return 0 on success.
2651- * caller need free bps.
2652- */
26532681static int amdgpu_ras_badpages_read (struct amdgpu_device * adev ,
2654- struct ras_badpage * * bps , unsigned int * count )
2682+ struct ras_badpage * bps , uint32_t count , uint32_t start )
26552683{
26562684 struct amdgpu_ras * con = amdgpu_ras_get_context (adev );
26572685 struct ras_err_handler_data * data ;
2658- int i = 0 ;
2659- int ret = 0 , status ;
2686+ int r = 0 ;
2687+ uint32_t i ;
26602688
26612689 if (!con || !con -> eh_data || !bps || !count )
26622690 return - EINVAL ;
26632691
26642692 mutex_lock (& con -> recovery_lock );
26652693 data = con -> eh_data ;
2666- if (!data || data -> count == 0 ) {
2667- * bps = NULL ;
2668- ret = - EINVAL ;
2669- goto out ;
2694+ if (start < data -> count ) {
2695+ for (i = start ; i < data -> count ; i ++ ) {
2696+ if (!data -> bps [i ].ts )
2697+ continue ;
2698+
2699+ bps [r ].bp = data -> bps [i ].retired_page ;
2700+ r ++ ;
2701+ if (r >= count )
2702+ break ;
2703+ }
26702704 }
2705+ mutex_unlock (& con -> recovery_lock );
26712706
2672- * bps = kmalloc_array (data -> count , sizeof (struct ras_badpage ), GFP_KERNEL );
2673- if (!* bps ) {
2674- ret = - ENOMEM ;
2675- goto out ;
2676- }
2707+ return r ;
2708+ }
26772709
2678- for (; i < data -> count ; i ++ ) {
2679- if (!data -> bps [i ].ts )
2680- continue ;
2710+ static int amdgpu_uniras_badpages_read (struct amdgpu_device * adev ,
2711+ struct ras_badpage * bps , uint32_t count , uint32_t start )
2712+ {
2713+ struct ras_cmd_bad_pages_info_req cmd_input ;
2714+ struct ras_cmd_bad_pages_info_rsp * output ;
2715+ uint32_t group , start_group , end_group ;
2716+ uint32_t pos , pos_in_group ;
2717+ int r = 0 , i ;
26812718
2682- (* bps )[i ] = (struct ras_badpage ){
2683- .bp = data -> bps [i ].retired_page ,
2684- .size = AMDGPU_GPU_PAGE_SIZE ,
2685- .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED ,
2686- };
2719+ if (!bps || !count )
2720+ return - EINVAL ;
26872721
2688- if ( amdgpu_ras_check_critical_address ( adev ,
2689- data -> bps [ i ]. retired_page << AMDGPU_GPU_PAGE_SHIFT ) )
2690- continue ;
2722+ output = kmalloc ( sizeof ( * output ), GFP_KERNEL );
2723+ if (! output )
2724+ return - ENOMEM ;
26912725
2692- status = amdgpu_vram_mgr_query_page_status (& adev -> mman .vram_mgr ,
2693- data -> bps [i ].retired_page << AMDGPU_GPU_PAGE_SHIFT );
2694- if (status == - EBUSY )
2695- (* bps )[i ].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING ;
2696- else if (status == - ENOENT )
2697- (* bps )[i ].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT ;
2726+ memset (& cmd_input , 0 , sizeof (cmd_input ));
2727+
2728+ start_group = start / RAS_CMD_MAX_BAD_PAGES_PER_GROUP ;
2729+ end_group = (start + count + RAS_CMD_MAX_BAD_PAGES_PER_GROUP - 1 ) /
2730+ RAS_CMD_MAX_BAD_PAGES_PER_GROUP ;
2731+
2732+ pos = start ;
2733+ for (group = start_group ; group < end_group ; group ++ ) {
2734+ memset (output , 0 , sizeof (* output ));
2735+ cmd_input .group_index = group ;
2736+ if (amdgpu_ras_mgr_handle_ras_cmd (adev , RAS_CMD__GET_BAD_PAGES ,
2737+ & cmd_input , sizeof (cmd_input ), output , sizeof (* output )))
2738+ goto out ;
2739+
2740+ if (pos >= output -> bp_total_cnt )
2741+ goto out ;
2742+
2743+ pos_in_group = pos - group * RAS_CMD_MAX_BAD_PAGES_PER_GROUP ;
2744+ for (i = pos_in_group ; i < output -> bp_in_group ; i ++ , pos ++ ) {
2745+ if (!output -> records [i ].ts )
2746+ continue ;
2747+
2748+ bps [r ].bp = output -> records [i ].retired_page ;
2749+ r ++ ;
2750+ if (r >= count )
2751+ goto out ;
2752+ }
26982753 }
26992754
2700- * count = con -> bad_page_num ;
27012755out :
2702- mutex_unlock ( & con -> recovery_lock );
2703- return ret ;
2756+ kfree ( output );
2757+ return r ;
27042758}
27052759
27062760static void amdgpu_ras_set_fed_all (struct amdgpu_device * adev ,
0 commit comments