@@ -471,3 +471,190 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = {
471471 .init_registers = nbio_v7_9_init_registers ,
472472 .get_pcie_replay_count = nbio_v7_9_get_pcie_replay_count ,
473473};
474+
/*
 * nbio_v7_9_query_ras_error_count - RAS error-count query hook for NBIO v7.9
 * @adev: amdgpu device (unused)
 * @ras_error_status: caller-supplied error-status buffer (left untouched)
 *
 * Placeholder implementation: no register is read and nothing is written to
 * @ras_error_status, so any counters the caller passes in keep their values.
 */
static void nbio_v7_9_query_ras_error_count(struct amdgpu_device *adev,
					    void *ras_error_status)
{
	/* Intentionally empty: nothing is harvested here. */
}
480+
481+ static void nbio_v7_9_handle_ras_controller_intr_no_bifring (struct amdgpu_device * adev )
482+ {
483+ uint32_t bif_doorbell_intr_cntl ;
484+ struct ras_manager * obj = amdgpu_ras_find_obj (adev , adev -> nbio .ras_if );
485+ struct ras_err_data err_data = {0 , 0 , 0 , NULL };
486+ struct amdgpu_ras * ras = amdgpu_ras_get_context (adev );
487+
488+ bif_doorbell_intr_cntl = RREG32_SOC15 (NBIO , 0 , regBIF_BX0_BIF_DOORBELL_INT_CNTL );
489+
490+ if (REG_GET_FIELD (bif_doorbell_intr_cntl ,
491+ BIF_BX0_BIF_DOORBELL_INT_CNTL , RAS_CNTLR_INTERRUPT_STATUS )) {
492+ /* driver has to clear the interrupt status when bif ring is disabled */
493+ bif_doorbell_intr_cntl = REG_SET_FIELD (bif_doorbell_intr_cntl ,
494+ BIF_BX0_BIF_DOORBELL_INT_CNTL ,
495+ RAS_CNTLR_INTERRUPT_CLEAR , 1 );
496+ WREG32_SOC15 (NBIO , 0 , regBIF_BX0_BIF_DOORBELL_INT_CNTL , bif_doorbell_intr_cntl );
497+
498+ if (!ras -> disable_ras_err_cnt_harvest ) {
499+ /*
500+ * clear error status after ras_controller_intr
501+ * according to hw team and count ue number
502+ * for query
503+ */
504+ nbio_v7_9_query_ras_error_count (adev , & err_data );
505+
506+ /* logging on error cnt and printing for awareness */
507+ obj -> err_data .ue_count += err_data .ue_count ;
508+ obj -> err_data .ce_count += err_data .ce_count ;
509+
510+ if (err_data .ce_count )
511+ dev_info (adev -> dev , "%ld correctable hardware "
512+ "errors detected in %s block, "
513+ "no user action is needed.\n" ,
514+ obj -> err_data .ce_count ,
515+ get_ras_block_str (adev -> nbio .ras_if ));
516+
517+ if (err_data .ue_count )
518+ dev_info (adev -> dev , "%ld uncorrectable hardware "
519+ "errors detected in %s block\n" ,
520+ obj -> err_data .ue_count ,
521+ get_ras_block_str (adev -> nbio .ras_if ));
522+ }
523+
524+ dev_info (adev -> dev , "RAS controller interrupt triggered "
525+ "by NBIF error\n" );
526+
527+ /* ras_controller_int is dedicated for nbif ras error,
528+ * not the global interrupt for sync flood
529+ */
530+ amdgpu_ras_reset_gpu (adev );
531+ }
532+ }
533+
534+ static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring (struct amdgpu_device * adev )
535+ {
536+ uint32_t bif_doorbell_intr_cntl ;
537+
538+ bif_doorbell_intr_cntl = RREG32_SOC15 (NBIO , 0 , regBIF_BX0_BIF_DOORBELL_INT_CNTL );
539+
540+ if (REG_GET_FIELD (bif_doorbell_intr_cntl ,
541+ BIF_BX0_BIF_DOORBELL_INT_CNTL , RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS )) {
542+ /* driver has to clear the interrupt status when bif ring is disabled */
543+ bif_doorbell_intr_cntl = REG_SET_FIELD (bif_doorbell_intr_cntl ,
544+ BIF_BX0_BIF_DOORBELL_INT_CNTL ,
545+ RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR , 1 );
546+
547+ WREG32_SOC15 (NBIO , 0 , regBIF_BX0_BIF_DOORBELL_INT_CNTL , bif_doorbell_intr_cntl );
548+
549+ amdgpu_ras_global_ras_isr (adev );
550+ }
551+ }
552+
553+ static int nbio_v7_9_set_ras_controller_irq_state (struct amdgpu_device * adev ,
554+ struct amdgpu_irq_src * src ,
555+ unsigned type ,
556+ enum amdgpu_interrupt_state state )
557+ {
558+ /* Dummy function, there is no initialization operation in driver */
559+
560+ return 0 ;
561+ }
562+
/*
 * nbio_v7_9_process_ras_controller_irq - .process callback for the RAS
 * controller interrupt source
 * @adev: amdgpu device (unused)
 * @source: interrupt source (unused)
 * @entry: interrupt vector entry (unused)
 *
 * The IH cookie for this interrupt is designed to be delivered through the
 * BIF ring rather than the general IV ring, but the BIF ring is disabled
 * because of a known hardware bug. This callback is therefore not expected
 * to be invoked and exists only as a placeholder.
 *
 * Return: always 0.
 */
static int nbio_v7_9_process_ras_controller_irq(struct amdgpu_device *adev,
						struct amdgpu_irq_src *source,
						struct amdgpu_iv_entry *entry)
{
	return 0;
}
574+
575+ static int nbio_v7_9_set_ras_err_event_athub_irq_state (struct amdgpu_device * adev ,
576+ struct amdgpu_irq_src * src ,
577+ unsigned type ,
578+ enum amdgpu_interrupt_state state )
579+ {
580+ /* Dummy function, there is no initialization operation in driver */
581+
582+ return 0 ;
583+ }
584+
/*
 * nbio_v7_9_process_err_event_athub_irq - .process callback for the ATHUB
 * error-event interrupt source
 * @adev: amdgpu device (unused)
 * @source: interrupt source (unused)
 * @entry: interrupt vector entry (unused)
 *
 * The IH cookie for this interrupt is designed to be delivered through the
 * BIF ring rather than the general IV ring, but the BIF ring is disabled
 * because of a known hardware bug. This callback is therefore not expected
 * to be invoked and exists only as a placeholder.
 *
 * Return: always 0.
 */
static int nbio_v7_9_process_err_event_athub_irq(struct amdgpu_device *adev,
						 struct amdgpu_irq_src *source,
						 struct amdgpu_iv_entry *entry)
{
	return 0;
}
596+
597+ static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_controller_irq_funcs = {
598+ .set = nbio_v7_9_set_ras_controller_irq_state ,
599+ .process = nbio_v7_9_process_ras_controller_irq ,
600+ };
601+
602+ static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_err_event_athub_irq_funcs = {
603+ .set = nbio_v7_9_set_ras_err_event_athub_irq_state ,
604+ .process = nbio_v7_9_process_err_event_athub_irq ,
605+ };
606+
607+ static int nbio_v7_9_init_ras_controller_interrupt (struct amdgpu_device * adev )
608+ {
609+ int r ;
610+
611+ /* init the irq funcs */
612+ adev -> nbio .ras_controller_irq .funcs =
613+ & nbio_v7_9_ras_controller_irq_funcs ;
614+ adev -> nbio .ras_controller_irq .num_types = 1 ;
615+
616+ /* register ras controller interrupt */
617+ r = amdgpu_irq_add_id (adev , SOC15_IH_CLIENTID_BIF ,
618+ NBIF_7_4__SRCID__RAS_CONTROLLER_INTERRUPT ,
619+ & adev -> nbio .ras_controller_irq );
620+
621+ return r ;
622+ }
623+
624+ static int nbio_v7_9_init_ras_err_event_athub_interrupt (struct amdgpu_device * adev )
625+ {
626+
627+ int r ;
628+
629+ /* init the irq funcs */
630+ adev -> nbio .ras_err_event_athub_irq .funcs =
631+ & nbio_v7_9_ras_err_event_athub_irq_funcs ;
632+ adev -> nbio .ras_err_event_athub_irq .num_types = 1 ;
633+
634+ /* register ras err event athub interrupt */
635+ r = amdgpu_irq_add_id (adev , SOC15_IH_CLIENTID_BIF ,
636+ NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT ,
637+ & adev -> nbio .ras_err_event_athub_irq );
638+
639+ return r ;
640+ }
641+
642+ const struct amdgpu_ras_block_hw_ops nbio_v7_9_ras_hw_ops = {
643+ .query_ras_error_count = nbio_v7_9_query_ras_error_count ,
644+ };
645+
646+ struct amdgpu_nbio_ras nbio_v7_9_ras = {
647+ .ras_block = {
648+ .ras_comm = {
649+ .name = "pcie_bif" ,
650+ .block = AMDGPU_RAS_BLOCK__PCIE_BIF ,
651+ .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE ,
652+ },
653+ .hw_ops = & nbio_v7_9_ras_hw_ops ,
654+ .ras_late_init = amdgpu_nbio_ras_late_init ,
655+ },
656+ .handle_ras_controller_intr_no_bifring = nbio_v7_9_handle_ras_controller_intr_no_bifring ,
657+ .handle_ras_err_event_athub_intr_no_bifring = nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring ,
658+ .init_ras_controller_interrupt = nbio_v7_9_init_ras_controller_interrupt ,
659+ .init_ras_err_event_athub_interrupt = nbio_v7_9_init_ras_err_event_athub_interrupt ,
660+ };
0 commit comments