Skip to content

Commit d8b9cea

Browse files
ofirbittogabbay
authored and committed
accel/habanalabs: add pci health check during heartbeat
Currently, upon a heartbeat failure, we don't know whether the failure is due to a firmware hang or to a bad PCI link. Hence, we read a PCI config space register with a known value (the vendor ID) so that we will know which of the two possibilities caused the heartbeat failure.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
1 parent 3d21ec6 commit d8b9cea

3 files changed

Lines changed: 16 additions & 3 deletions

File tree

drivers/accel/habanalabs/common/device.c

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -981,6 +981,18 @@ static void device_early_fini(struct hl_device *hdev)
981981
hdev->asic_funcs->early_fini(hdev);
982982
}
983983

984+
static bool is_pci_link_healthy(struct hl_device *hdev)
985+
{
986+
u16 vendor_id;
987+
988+
if (!hdev->pdev)
989+
return false;
990+
991+
pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id);
992+
993+
return (vendor_id == PCI_VENDOR_ID_HABANALABS);
994+
}
995+
984996
static void hl_device_heartbeat(struct work_struct *work)
985997
{
986998
struct hl_device *hdev = container_of(work, struct hl_device,
@@ -995,7 +1007,8 @@ static void hl_device_heartbeat(struct work_struct *work)
9951007
goto reschedule;
9961008

9971009
if (hl_device_operational(hdev, NULL))
998-
dev_err(hdev->dev, "Device heartbeat failed!\n");
1010+
dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n",
1011+
is_pci_link_healthy(hdev) ? "healthy" : "broken");
9991012

10001013
info.err_type = HL_INFO_FW_HEARTBEAT_ERR;
10011014
info.event_mask = &event_mask;

drivers/accel/habanalabs/common/habanalabs.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
struct hl_device;
3737
struct hl_fpriv;
3838

39+
#define PCI_VENDOR_ID_HABANALABS 0x1da3
40+
3941
/* Use upper bits of mmap offset to store habana driver specific information.
4042
* bits[63:59] - Encode mmap type
4143
* bits[45:0] - mmap offset value

drivers/accel/habanalabs/common/habanalabs_drv.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,6 @@ module_param(boot_error_status_mask, ulong, 0444);
5454
MODULE_PARM_DESC(boot_error_status_mask,
5555
"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
5656

57-
#define PCI_VENDOR_ID_HABANALABS 0x1da3
58-
5957
#define PCI_IDS_GOYA 0x0001
6058
#define PCI_IDS_GAUDI 0x1000
6159
#define PCI_IDS_GAUDI_SEC 0x1010

0 commit comments

Comments
 (0)