@@ -152,8 +152,9 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
152152
153153static int amdgpu_reserve_page_direct (struct amdgpu_device * adev , uint64_t address )
154154{
155- struct ras_err_data err_data = { 0 , 0 , 0 , NULL } ;
155+ struct ras_err_data err_data ;
156156 struct eeprom_table_record err_rec ;
157+ int ret ;
157158
158159 if ((address >= adev -> gmc .mc_vram_size ) ||
159160 (address >= RAS_UMC_INJECT_ADDR_LIMIT )) {
@@ -170,6 +171,10 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
170171 return 0 ;
171172 }
172173
174+ ret = amdgpu_ras_error_data_init (& err_data );
175+ if (ret )
176+ return ret ;
177+
173178 memset (& err_rec , 0x0 , sizeof (struct eeprom_table_record ));
174179 err_data .err_addr = & err_rec ;
175180 amdgpu_umc_fill_error_record (& err_data , address , address , 0 , 0 );
@@ -180,6 +185,8 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
180185 amdgpu_ras_save_bad_pages (adev , NULL );
181186 }
182187
188+ amdgpu_ras_error_data_fini (& err_data );
189+
183190 dev_warn (adev -> dev , "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n" );
184191 dev_warn (adev -> dev , "Clear EEPROM:\n" );
185192 dev_warn (adev -> dev , " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n" );
@@ -1015,25 +1022,127 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
10151022 }
10161023}
10171024
1025+ static void amdgpu_ras_error_print_error_data (struct amdgpu_device * adev ,
1026+ struct ras_query_if * query_if ,
1027+ struct ras_err_data * err_data ,
1028+ bool is_ue )
1029+ {
1030+ struct ras_manager * ras_mgr = amdgpu_ras_find_obj (adev , & query_if -> head );
1031+ const char * blk_name = get_ras_block_str (& query_if -> head );
1032+ struct amdgpu_smuio_mcm_config_info * mcm_info ;
1033+ struct ras_err_node * err_node ;
1034+ struct ras_err_info * err_info ;
1035+
1036+ if (is_ue )
1037+ dev_info (adev -> dev , "%ld uncorrectable hardware errors detected in %s block\n" ,
1038+ ras_mgr -> err_data .ue_count , blk_name );
1039+ else
1040+ dev_info (adev -> dev , "%ld correctable hardware errors detected in %s block\n" ,
1041+ ras_mgr -> err_data .ue_count , blk_name );
1042+
1043+ for_each_ras_error (err_node , err_data ) {
1044+ err_info = & err_node -> err_info ;
1045+ mcm_info = & err_info -> mcm_info ;
1046+ if (is_ue && err_info -> ue_count ) {
1047+ dev_info (adev -> dev , "socket: %d, die: %d "
1048+ "%lld uncorrectable hardware errors detected in %s block\n" ,
1049+ mcm_info -> socket_id ,
1050+ mcm_info -> die_id ,
1051+ err_info -> ue_count ,
1052+ blk_name );
1053+ } else if (!is_ue && err_info -> ce_count ) {
1054+ dev_info (adev -> dev , "socket: %d, die: %d "
1055+ "%lld correctable hardware errors detected in %s block\n" ,
1056+ mcm_info -> socket_id ,
1057+ mcm_info -> die_id ,
1058+ err_info -> ue_count ,
1059+ blk_name );
1060+ }
1061+ }
1062+ }
1063+
1064+ static void amdgpu_ras_error_generate_report (struct amdgpu_device * adev ,
1065+ struct ras_query_if * query_if ,
1066+ struct ras_err_data * err_data )
1067+ {
1068+ struct ras_manager * ras_mgr = amdgpu_ras_find_obj (adev , & query_if -> head );
1069+ const char * blk_name = get_ras_block_str (& query_if -> head );
1070+
1071+ if (err_data -> ce_count ) {
1072+ if (!list_empty (& err_data -> err_node_list )) {
1073+ amdgpu_ras_error_print_error_data (adev , query_if ,
1074+ err_data , false);
1075+ } else if (!adev -> aid_mask &&
1076+ adev -> smuio .funcs &&
1077+ adev -> smuio .funcs -> get_socket_id &&
1078+ adev -> smuio .funcs -> get_die_id ) {
1079+ dev_info (adev -> dev , "socket: %d, die: %d "
1080+ "%ld correctable hardware errors "
1081+ "detected in %s block, no user "
1082+ "action is needed.\n" ,
1083+ adev -> smuio .funcs -> get_socket_id (adev ),
1084+ adev -> smuio .funcs -> get_die_id (adev ),
1085+ ras_mgr -> err_data .ce_count ,
1086+ blk_name );
1087+ } else {
1088+ dev_info (adev -> dev , "%ld correctable hardware errors "
1089+ "detected in %s block, no user "
1090+ "action is needed.\n" ,
1091+ ras_mgr -> err_data .ce_count ,
1092+ blk_name );
1093+ }
1094+ }
1095+
1096+ if (err_data -> ue_count ) {
1097+ if (!list_empty (& err_data -> err_node_list )) {
1098+ amdgpu_ras_error_print_error_data (adev , query_if ,
1099+ err_data , true);
1100+ } else if (!adev -> aid_mask &&
1101+ adev -> smuio .funcs &&
1102+ adev -> smuio .funcs -> get_socket_id &&
1103+ adev -> smuio .funcs -> get_die_id ) {
1104+ dev_info (adev -> dev , "socket: %d, die: %d "
1105+ "%ld uncorrectable hardware errors "
1106+ "detected in %s block\n" ,
1107+ adev -> smuio .funcs -> get_socket_id (adev ),
1108+ adev -> smuio .funcs -> get_die_id (adev ),
1109+ ras_mgr -> err_data .ue_count ,
1110+ blk_name );
1111+ } else {
1112+ dev_info (adev -> dev , "%ld uncorrectable hardware errors "
1113+ "detected in %s block\n" ,
1114+ ras_mgr -> err_data .ue_count ,
1115+ blk_name );
1116+ }
1117+ }
1118+
1119+ }
1120+
10181121/* query/inject/cure begin */
10191122int amdgpu_ras_query_error_status (struct amdgpu_device * adev ,
10201123 struct ras_query_if * info )
10211124{
10221125 struct amdgpu_ras_block_object * block_obj = NULL ;
10231126 struct ras_manager * obj = amdgpu_ras_find_obj (adev , & info -> head );
1024- struct ras_err_data err_data = {0 , 0 , 0 , NULL };
1127+ struct ras_err_data err_data ;
1128+ int ret ;
10251129
10261130 if (!obj )
10271131 return - EINVAL ;
10281132
1133+ ret = amdgpu_ras_error_data_init (& err_data );
1134+ if (ret )
1135+ return ret ;
1136+
10291137 if (info -> head .block == AMDGPU_RAS_BLOCK__UMC ) {
10301138 amdgpu_ras_get_ecc_info (adev , & err_data );
10311139 } else {
10321140 block_obj = amdgpu_ras_get_ras_block (adev , info -> head .block , 0 );
10331141 if (!block_obj || !block_obj -> hw_ops ) {
10341142 dev_dbg_once (adev -> dev , "%s doesn't config RAS function\n" ,
10351143 get_ras_block_str (& info -> head ));
1036- return - EINVAL ;
1144+ ret = - EINVAL ;
1145+ goto out_fini_err_data ;
10371146 }
10381147
10391148 if (block_obj -> hw_ops -> query_ras_error_count )
@@ -1053,48 +1162,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
10531162 info -> ue_count = obj -> err_data .ue_count ;
10541163 info -> ce_count = obj -> err_data .ce_count ;
10551164
1056- if (err_data .ce_count ) {
1057- if (!adev -> aid_mask &&
1058- adev -> smuio .funcs &&
1059- adev -> smuio .funcs -> get_socket_id &&
1060- adev -> smuio .funcs -> get_die_id ) {
1061- dev_info (adev -> dev , "socket: %d, die: %d "
1062- "%ld correctable hardware errors "
1063- "detected in %s block, no user "
1064- "action is needed.\n" ,
1065- adev -> smuio .funcs -> get_socket_id (adev ),
1066- adev -> smuio .funcs -> get_die_id (adev ),
1067- obj -> err_data .ce_count ,
1068- get_ras_block_str (& info -> head ));
1069- } else {
1070- dev_info (adev -> dev , "%ld correctable hardware errors "
1071- "detected in %s block, no user "
1072- "action is needed.\n" ,
1073- obj -> err_data .ce_count ,
1074- get_ras_block_str (& info -> head ));
1075- }
1076- }
1077- if (err_data .ue_count ) {
1078- if (!adev -> aid_mask &&
1079- adev -> smuio .funcs &&
1080- adev -> smuio .funcs -> get_socket_id &&
1081- adev -> smuio .funcs -> get_die_id ) {
1082- dev_info (adev -> dev , "socket: %d, die: %d "
1083- "%ld uncorrectable hardware errors "
1084- "detected in %s block\n" ,
1085- adev -> smuio .funcs -> get_socket_id (adev ),
1086- adev -> smuio .funcs -> get_die_id (adev ),
1087- obj -> err_data .ue_count ,
1088- get_ras_block_str (& info -> head ));
1089- } else {
1090- dev_info (adev -> dev , "%ld uncorrectable hardware errors "
1091- "detected in %s block\n" ,
1092- obj -> err_data .ue_count ,
1093- get_ras_block_str (& info -> head ));
1094- }
1095- }
1165+ amdgpu_ras_error_generate_report (adev , info , & err_data );
10961166
1097- return 0 ;
1167+ out_fini_err_data :
1168+ amdgpu_ras_error_data_fini (& err_data );
1169+
1170+ return ret ;
10981171}
10991172
11001173int amdgpu_ras_reset_error_status (struct amdgpu_device * adev ,
@@ -1744,12 +1817,16 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
17441817 struct amdgpu_iv_entry * entry )
17451818{
17461819 struct ras_ih_data * data = & obj -> ih_data ;
1747- struct ras_err_data err_data = { 0 , 0 , 0 , NULL } ;
1820+ struct ras_err_data err_data ;
17481821 int ret ;
17491822
17501823 if (!data -> cb )
17511824 return ;
17521825
1826+ ret = amdgpu_ras_error_data_init (& err_data );
1827+ if (ret )
1828+ return ;
1829+
17531830 /* Let IP handle its data, maybe we need get the output
17541831 * from the callback to update the error type/count, etc
17551832 */
@@ -1766,6 +1843,8 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
17661843 obj -> err_data .ue_count += err_data .ue_count ;
17671844 obj -> err_data .ce_count += err_data .ce_count ;
17681845 }
1846+
1847+ amdgpu_ras_error_data_fini (& err_data );
17691848}
17701849
17711850static void amdgpu_ras_interrupt_handler (struct ras_manager * obj )
@@ -3383,3 +3462,128 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
33833462 WREG32 (err_status_hi_offset , 0 );
33843463 }
33853464}
3465+
3466+ int amdgpu_ras_error_data_init (struct ras_err_data * err_data )
3467+ {
3468+ memset (err_data , 0 , sizeof (* err_data ));
3469+
3470+ INIT_LIST_HEAD (& err_data -> err_node_list );
3471+
3472+ return 0 ;
3473+ }
3474+
3475+ static void amdgpu_ras_error_node_release (struct ras_err_node * err_node )
3476+ {
3477+ if (!err_node )
3478+ return ;
3479+
3480+ list_del (& err_node -> node );
3481+ kvfree (err_node );
3482+ }
3483+
3484+ void amdgpu_ras_error_data_fini (struct ras_err_data * err_data )
3485+ {
3486+ struct ras_err_node * err_node , * tmp ;
3487+
3488+ list_for_each_entry_safe (err_node , tmp , & err_data -> err_node_list , node ) {
3489+ amdgpu_ras_error_node_release (err_node );
3490+ list_del (& err_node -> node );
3491+ }
3492+ }
3493+
3494+ static struct ras_err_node * amdgpu_ras_error_find_node_by_id (struct ras_err_data * err_data ,
3495+ struct amdgpu_smuio_mcm_config_info * mcm_info )
3496+ {
3497+ struct ras_err_node * err_node ;
3498+ struct amdgpu_smuio_mcm_config_info * ref_id ;
3499+
3500+ if (!err_data || !mcm_info )
3501+ return NULL ;
3502+
3503+ for_each_ras_error (err_node , err_data ) {
3504+ ref_id = & err_node -> err_info .mcm_info ;
3505+ if ((mcm_info -> socket_id >= 0 && mcm_info -> socket_id != ref_id -> socket_id ) ||
3506+ (mcm_info -> die_id >= 0 && mcm_info -> die_id != ref_id -> die_id ))
3507+ continue ;
3508+
3509+ return err_node ;
3510+ }
3511+
3512+ return NULL ;
3513+ }
3514+
3515+ static struct ras_err_node * amdgpu_ras_error_node_new (void )
3516+ {
3517+ struct ras_err_node * err_node ;
3518+
3519+ err_node = kvzalloc (sizeof (* err_node ), GFP_KERNEL );
3520+ if (!err_node )
3521+ return NULL ;
3522+
3523+ INIT_LIST_HEAD (& err_node -> node );
3524+
3525+ return err_node ;
3526+ }
3527+
3528+ static struct ras_err_info * amdgpu_ras_error_get_info (struct ras_err_data * err_data ,
3529+ struct amdgpu_smuio_mcm_config_info * mcm_info )
3530+ {
3531+ struct ras_err_node * err_node ;
3532+
3533+ err_node = amdgpu_ras_error_find_node_by_id (err_data , mcm_info );
3534+ if (err_node )
3535+ return & err_node -> err_info ;
3536+
3537+ err_node = amdgpu_ras_error_node_new ();
3538+ if (!err_node )
3539+ return NULL ;
3540+
3541+ memcpy (& err_node -> err_info .mcm_info , mcm_info , sizeof (* mcm_info ));
3542+
3543+ err_data -> err_list_count ++ ;
3544+ list_add_tail (& err_node -> node , & err_data -> err_node_list );
3545+
3546+ return & err_node -> err_info ;
3547+ }
3548+
3549+ int amdgpu_ras_error_statistic_ue_count (struct ras_err_data * err_data ,
3550+ struct amdgpu_smuio_mcm_config_info * mcm_info , u64 count )
3551+ {
3552+ struct ras_err_info * err_info ;
3553+
3554+ if (!err_data || !mcm_info )
3555+ return - EINVAL ;
3556+
3557+ if (!count )
3558+ return 0 ;
3559+
3560+ err_info = amdgpu_ras_error_get_info (err_data , mcm_info );
3561+ if (!err_info )
3562+ return - EINVAL ;
3563+
3564+ err_info -> ue_count += count ;
3565+ err_data -> ue_count += count ;
3566+
3567+ return 0 ;
3568+ }
3569+
3570+ int amdgpu_ras_error_statistic_ce_count (struct ras_err_data * err_data ,
3571+ struct amdgpu_smuio_mcm_config_info * mcm_info , u64 count )
3572+ {
3573+ struct ras_err_info * err_info ;
3574+
3575+ if (!err_data || !mcm_info )
3576+ return - EINVAL ;
3577+
3578+ if (!count )
3579+ return 0 ;
3580+
3581+ err_info = amdgpu_ras_error_get_info (err_data , mcm_info );
3582+ if (!err_info )
3583+ return - EINVAL ;
3584+
3585+ err_info -> ce_count += count ;
3586+ err_data -> ce_count += count ;
3587+
3588+ return 0 ;
3589+ }
0 commit comments