Skip to content

Commit fbc2a09

Browse files
Farah-kassabri authored and ogabbay committed
accel/habanalabs: update device boot error check
Use a predefined mask that defines the device's critical boot errors. The driver will fail and stop loading only upon detecting at least one of the errors defined in this mask.

Signed-off-by: Farah Kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
1 parent f64fa33 commit fbc2a09

1 file changed

Lines changed: 32 additions & 83 deletions

File tree

drivers/accel/habanalabs/common/firmware_if.c

Lines changed: 32 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -646,141 +646,90 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
646646
return rc;
647647
}
648648

649-
static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
650-
u32 sts_val)
649+
static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, u32 sts_val)
651650
{
652651
bool err_exists = false;
653652

654653
if (!(err_val & CPU_BOOT_ERR0_ENABLED))
655654
return false;
656655

657-
if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) {
658-
dev_err(hdev->dev,
659-
"Device boot error - DRAM initialization failed\n");
660-
err_exists = true;
661-
}
656+
if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL)
657+
dev_err(hdev->dev, "Device boot error - DRAM initialization failed\n");
662658

663-
if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) {
659+
if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED)
664660
dev_err(hdev->dev, "Device boot error - FIT image corrupted\n");
665-
err_exists = true;
666-
}
667661

668-
if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) {
669-
dev_err(hdev->dev,
670-
"Device boot error - Thermal Sensor initialization failed\n");
671-
err_exists = true;
672-
}
662+
if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL)
663+
dev_err(hdev->dev, "Device boot error - Thermal Sensor initialization failed\n");
673664

674665
if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) {
675666
if (hdev->bmc_enable) {
676-
dev_err(hdev->dev,
677-
"Device boot error - Skipped waiting for BMC\n");
678-
err_exists = true;
667+
dev_err(hdev->dev, "Device boot error - Skipped waiting for BMC\n");
679668
} else {
680-
dev_info(hdev->dev,
681-
"Device boot message - Skipped waiting for BMC\n");
669+
dev_info(hdev->dev, "Device boot message - Skipped waiting for BMC\n");
682670
/* This is an info so we don't want it to disable the
683671
* device
684672
*/
685673
err_val &= ~CPU_BOOT_ERR0_BMC_WAIT_SKIPPED;
686674
}
687675
}
688676

689-
if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) {
690-
dev_err(hdev->dev,
691-
"Device boot error - Serdes data from BMC not available\n");
692-
err_exists = true;
693-
}
677+
if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY)
678+
dev_err(hdev->dev, "Device boot error - Serdes data from BMC not available\n");
694679

695-
if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) {
696-
dev_err(hdev->dev,
697-
"Device boot error - NIC F/W initialization failed\n");
698-
err_exists = true;
699-
}
680+
if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL)
681+
dev_err(hdev->dev, "Device boot error - NIC F/W initialization failed\n");
700682

701-
if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) {
702-
dev_err(hdev->dev,
703-
"Device boot warning - security not ready\n");
704-
err_exists = true;
705-
}
683+
if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY)
684+
dev_err(hdev->dev, "Device boot warning - security not ready\n");
706685

707-
if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) {
686+
if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL)
708687
dev_err(hdev->dev, "Device boot error - security failure\n");
709-
err_exists = true;
710-
}
711688

712-
if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) {
689+
if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL)
713690
dev_err(hdev->dev, "Device boot error - eFuse failure\n");
714-
err_exists = true;
715-
}
716691

717-
if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) {
692+
if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL)
718693
dev_err(hdev->dev, "Device boot error - Failed to load preboot secondary image\n");
719-
err_exists = true;
720-
}
721694

722-
if (err_val & CPU_BOOT_ERR0_PLL_FAIL) {
695+
if (err_val & CPU_BOOT_ERR0_PLL_FAIL)
723696
dev_err(hdev->dev, "Device boot error - PLL failure\n");
724-
err_exists = true;
725-
}
726697

727-
if (err_val & CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL) {
698+
if (err_val & CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL)
728699
dev_err(hdev->dev, "Device boot error - Failed to set threshold for temperature sensor\n");
729-
err_exists = true;
730-
}
731700

732701
if (err_val & CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL) {
733702
/* Ignore this bit, don't prevent driver loading */
734703
dev_dbg(hdev->dev, "device unusable status is set\n");
735704
err_val &= ~CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL;
736705
}
737706

738-
if (err_val & CPU_BOOT_ERR0_BINNING_FAIL) {
707+
if (err_val & CPU_BOOT_ERR0_BINNING_FAIL)
739708
dev_err(hdev->dev, "Device boot error - binning failure\n");
740-
err_exists = true;
741-
}
742709

743710
if (sts_val & CPU_BOOT_DEV_STS0_ENABLED)
744711
dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val);
745712

713+
if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED)
714+
dev_err(hdev->dev, "Device boot warning - Skipped DRAM initialization\n");
715+
716+
if (err_val & CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL)
717+
dev_err(hdev->dev, "Device boot error - ARC memory scrub failed\n");
718+
719+
/* All warnings should go here in order not to reach the unknown error validation */
746720
if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) {
747721
dev_err(hdev->dev, "Device boot error - EEPROM failure detected\n");
748722
err_exists = true;
749723
}
750724

751-
/* All warnings should go here in order not to reach the unknown error validation */
752-
if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) {
753-
dev_warn(hdev->dev,
754-
"Device boot warning - Skipped DRAM initialization\n");
755-
/* This is a warning so we don't want it to disable the
756-
* device
757-
*/
758-
err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED;
759-
}
725+
if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL)
726+
dev_warn(hdev->dev, "Device boot warning - Failed to load preboot primary image\n");
760727

761-
if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) {
762-
dev_warn(hdev->dev,
763-
"Device boot warning - Failed to load preboot primary image\n");
764-
/* This is a warning so we don't want it to disable the
765-
* device as we have a secondary preboot image
766-
*/
767-
err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL;
768-
}
728+
if (err_val & CPU_BOOT_ERR0_TPM_FAIL)
729+
dev_warn(hdev->dev, "Device boot warning - TPM failure\n");
769730

770-
if (err_val & CPU_BOOT_ERR0_TPM_FAIL) {
771-
dev_warn(hdev->dev,
772-
"Device boot warning - TPM failure\n");
773-
/* This is a warning so we don't want it to disable the
774-
* device
775-
*/
776-
err_val &= ~CPU_BOOT_ERR0_TPM_FAIL;
777-
}
778-
779-
if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) {
780-
dev_err(hdev->dev,
781-
"Device boot error - unknown ERR0 error 0x%08x\n", err_val);
731+
if (err_val & CPU_BOOT_ERR_FATAL_MASK)
782732
err_exists = true;
783-
}
784733

785734
/* return error only if it's in the predefined mask */
786735
if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) &

0 commit comments

Comments
 (0)