Skip to content

Commit 67d19a2

Browse files
KobyElbaz authored and ogabbay committed
accel/habanalabs: poll for device status update following WFE cmd
Currently, we rely on COMMS protocol's ack to verify that WFE command has been acknowledged by the FW. However, this does not guarantee that the device status has been updated. Although unlikely, this could trigger a race since the driver expects the device to be halted at that stage, but it might not be. Therefore, we increase WFE's robustness by polling on the status register that will be updated once the device is actually halted.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
1 parent 3b9abb4 commit 67d19a2

1 file changed

Lines changed: 23 additions & 5 deletions

File tree

drivers/accel/habanalabs/common/firmware_if.c

Lines changed: 23 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1368,21 +1368,39 @@ void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev)
13681368

13691369
void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev)
13701370
{
1371-
struct static_fw_load_mgr *static_loader =
1372-
&hdev->fw_loader.static_loader;
1371+
struct fw_load_mgr *fw_loader = &hdev->fw_loader;
1372+
u32 status, cpu_boot_status_reg, cpu_timeout;
1373+
struct static_fw_load_mgr *static_loader;
1374+
struct pre_fw_load_props *pre_fw_load;
13731375
int rc;
13741376

13751377
if (hdev->device_cpu_is_halted)
13761378
return;
13771379

13781380
/* Stop device CPU to make sure nothing bad happens */
13791381
if (hdev->asic_prop.dynamic_fw_load) {
1382+
pre_fw_load = &fw_loader->pre_fw_load;
1383+
cpu_timeout = fw_loader->cpu_timeout;
1384+
cpu_boot_status_reg = pre_fw_load->cpu_boot_status_reg;
1385+
13801386
rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader,
1381-
COMMS_GOTO_WFE, 0, false,
1382-
hdev->fw_loader.cpu_timeout);
1383-
if (rc)
1387+
COMMS_GOTO_WFE, 0, false, cpu_timeout);
1388+
if (rc) {
13841389
dev_err(hdev->dev, "Failed sending COMMS_GOTO_WFE\n");
1390+
} else {
1391+
rc = hl_poll_timeout(
1392+
hdev,
1393+
cpu_boot_status_reg,
1394+
status,
1395+
status == CPU_BOOT_STATUS_IN_WFE,
1396+
hdev->fw_poll_interval_usec,
1397+
cpu_timeout);
1398+
if (rc)
1399+
dev_err(hdev->dev, "Current status=%u. Timed-out updating to WFE\n",
1400+
status);
1401+
}
13851402
} else {
1403+
static_loader = &hdev->fw_loader.static_loader;
13861404
WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE);
13871405
msleep(static_loader->cpu_reset_wait_msec);
13881406

0 commit comments

Comments
 (0)