Skip to content

Commit 0e7db50

Browse files
kwachowsjlawryno
authored and committed
accel/ivpu: Implement heartbeat-based TDR mechanism
Introduce a heartbeat-based Timeout Detection and Recovery (TDR) mechanism. The enhancement aims to improve the reliability of device hang detection by monitoring heartbeat updates. Each progressing inference updates a heartbeat counter, allowing the driver to monitor its progress. Limit the maximum number of reschedules to 30 when the heartbeat indicates progress. This increases the maximum running time of a single inference to about 60 seconds. The heartbeat mechanism provides a more robust method for detecting device hangs, potentially reducing false-positive recoveries due to long-running inferences.
Signed-off-by: Karol Wachowski <karol.wachowski@intel.com>
Signed-off-by: Maciej Falkowski <maciej.falkowski@linux.intel.com>
Reviewed-by: Jeff Hugo <jeff.hugo@oss.qualcomm.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Link: https://lore.kernel.org/r/20250416102555.384526-1-maciej.falkowski@linux.intel.com
1 parent 3a2b738 commit 0e7db50

4 files changed

Lines changed: 26 additions & 0 deletions

File tree

drivers/accel/ivpu/ivpu_drv.c

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -374,6 +374,9 @@ int ivpu_boot(struct ivpu_device *vdev)
374374
{
375375
int ret;
376376

377+
drm_WARN_ON(&vdev->drm, atomic_read(&vdev->job_timeout_counter));
378+
drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
379+
377380
/* Update boot params located at first 4KB of FW memory */
378381
ivpu_fw_boot_params_setup(vdev, ivpu_bo_vaddr(vdev->fw->mem));
379382

@@ -573,6 +576,7 @@ static int ivpu_dev_init(struct ivpu_device *vdev)
573576
vdev->context_xa_limit.min = IVPU_USER_CONTEXT_MIN_SSID;
574577
vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID;
575578
atomic64_set(&vdev->unique_id_counter, 0);
579+
atomic_set(&vdev->job_timeout_counter, 0);
576580
xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
577581
xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1);
578582
xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1);

drivers/accel/ivpu/ivpu_drv.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -154,6 +154,7 @@ struct ivpu_device {
154154
struct mutex submitted_jobs_lock; /* Protects submitted_jobs */
155155
struct xarray submitted_jobs_xa;
156156
struct ivpu_ipc_consumer job_done_consumer;
157+
atomic_t job_timeout_counter;
157158

158159
atomic64_t unique_id_counter;
159160

drivers/accel/ivpu/ivpu_fw.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@ struct ivpu_fw_info {
3939
u64 read_only_addr;
4040
u32 read_only_size;
4141
u32 sched_mode;
42+
u64 last_heartbeat;
4243
};
4344

4445
int ivpu_fw_init(struct ivpu_device *vdev);

drivers/accel/ivpu/ivpu_pm.c

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
3434
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");
3535

3636
#define PM_RESCHEDULE_LIMIT 5
37+
#define PM_TDR_HEARTBEAT_LIMIT 30
3738

3839
static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
3940
{
@@ -44,6 +45,7 @@ static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
4445
ivpu_fw_log_reset(vdev);
4546
ivpu_fw_load(vdev);
4647
fw->entry_point = fw->cold_boot_entry_point;
48+
fw->last_heartbeat = 0;
4749
}
4850

4951
static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
@@ -189,7 +191,24 @@ static void ivpu_job_timeout_work(struct work_struct *work)
189191
{
190192
struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
191193
struct ivpu_device *vdev = pm->vdev;
194+
u64 heartbeat;
192195

196+
if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
197+
ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
198+
goto recovery;
199+
}
200+
201+
if (atomic_fetch_inc(&vdev->job_timeout_counter) > PM_TDR_HEARTBEAT_LIMIT) {
202+
ivpu_err(vdev, "Job timeout detected, heartbeat limit exceeded\n");
203+
goto recovery;
204+
}
205+
206+
vdev->fw->last_heartbeat = heartbeat;
207+
ivpu_start_job_timeout_detection(vdev);
208+
return;
209+
210+
recovery:
211+
atomic_set(&vdev->job_timeout_counter, 0);
193212
ivpu_pm_trigger_recovery(vdev, "TDR");
194213
}
195214

@@ -204,6 +223,7 @@ void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
204223
void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
205224
{
206225
cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
226+
atomic_set(&vdev->job_timeout_counter, 0);
207227
}
208228

209229
int ivpu_pm_suspend_cb(struct device *dev)

0 commit comments

Comments
 (0)