Skip to content

Commit 58cde80

Browse files
Stanislaw Gruszka authored and jlawryno committed
accel/ivpu: Use dedicated work for job timeout detection
Change to use work for timeout detection. Needed for thread_irq conversion.

Signed-off-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231113170252.758137-5-jacek.lawrynowicz@linux.intel.com
1 parent b3c10b7 commit 58cde80

3 files changed

Lines changed: 43 additions & 15 deletions

File tree

drivers/accel/ivpu/ivpu_job.c

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,6 @@
2424
#define JOB_ID_CONTEXT_MASK GENMASK(31, 8)
2525
#define JOB_MAX_BUFFER_COUNT 65535
2626

27-
static unsigned int ivpu_tdr_timeout_ms;
28-
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, uint, 0644);
29-
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");
30-
3127
static void ivpu_cmdq_ring_db(struct ivpu_device *vdev, struct ivpu_cmdq *cmdq)
3228
{
3329
ivpu_hw_reg_db_set(vdev, cmdq->db_id);
@@ -342,6 +338,8 @@ static int ivpu_job_done(struct ivpu_device *vdev, u32 job_id, u32 job_status)
342338
ivpu_dbg(vdev, JOB, "Job complete: id %3u ctx %2d engine %d status 0x%x\n",
343339
job->job_id, job->file_priv->ctx.id, job->engine_idx, job_status);
344340

341+
ivpu_stop_job_timeout_detection(vdev);
342+
345343
job_put(job);
346344
return 0;
347345
}
@@ -357,6 +355,9 @@ static void ivpu_job_done_message(struct ivpu_device *vdev, void *msg)
357355
ret = ivpu_job_done(vdev, payload->job_id, payload->job_status);
358356
if (ret)
359357
ivpu_err(vdev, "Failed to finish job %d: %d\n", payload->job_id, ret);
358+
359+
if (!ret && !xa_empty(&vdev->submitted_jobs_xa))
360+
ivpu_start_job_timeout_detection(vdev);
360361
}
361362

362363
void ivpu_jobs_abort_all(struct ivpu_device *vdev)
@@ -400,6 +401,8 @@ static int ivpu_direct_job_submission(struct ivpu_job *job)
400401
if (ret)
401402
goto err_xa_erase;
402403

404+
ivpu_start_job_timeout_detection(vdev);
405+
403406
ivpu_dbg(vdev, JOB, "Job submitted: id %3u addr 0x%llx ctx %2d engine %d next %d\n",
404407
job->job_id, job->cmd_buf_vpu_addr, file_priv->ctx.id,
405408
job->engine_idx, cmdq->jobq->header.tail);
@@ -569,7 +572,6 @@ static int ivpu_job_done_thread(void *arg)
569572
struct ivpu_device *vdev = (struct ivpu_device *)arg;
570573
struct ivpu_ipc_consumer cons;
571574
struct vpu_jsm_msg jsm_msg;
572-
bool jobs_submitted;
573575
unsigned int timeout;
574576
int ret;
575577

@@ -578,18 +580,10 @@ static int ivpu_job_done_thread(void *arg)
578580
ivpu_ipc_consumer_add(vdev, &cons, VPU_IPC_CHAN_JOB_RET);
579581

580582
while (!kthread_should_stop()) {
581-
timeout = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
582-
jobs_submitted = !xa_empty(&vdev->submitted_jobs_xa);
583583
ret = ivpu_ipc_receive(vdev, &cons, NULL, &jsm_msg, timeout);
584-
if (!ret) {
584+
if (!ret)
585585
ivpu_job_done_message(vdev, &jsm_msg);
586-
} else if (ret == -ETIMEDOUT) {
587-
if (jobs_submitted && !xa_empty(&vdev->submitted_jobs_xa)) {
588-
ivpu_err(vdev, "TDR detected, timeout %d ms", timeout);
589-
ivpu_hw_diagnose_failure(vdev);
590-
ivpu_pm_schedule_recovery(vdev);
591-
}
592-
}
586+
593587
if (kthread_should_park()) {
594588
ivpu_dbg(vdev, JOB, "Parked %s\n", __func__);
595589
kthread_parkme();

drivers/accel/ivpu/ivpu_pm.c

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ static bool ivpu_disable_recovery;
2323
module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
2424
MODULE_PARM_DESC(disable_recovery, "Disables recovery when VPU hang is detected");
2525

26+
static unsigned long ivpu_tdr_timeout_ms;
27+
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
28+
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");
29+
2630
#define PM_RESCHEDULE_LIMIT 5
2731

2832
static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
@@ -141,6 +145,31 @@ void ivpu_pm_schedule_recovery(struct ivpu_device *vdev)
141145
}
142146
}
143147

148+
/*
 * Handler for the delayed work pm->job_timeout_work.
 *
 * Recovers the owning ivpu_pm_info (and through it the device) from the
 * work item, logs a TDR error with the effective timeout, calls
 * ivpu_hw_diagnose_failure() and then ivpu_pm_schedule_recovery().
 *
 * The timeout reported is the tdr_timeout_ms module parameter when it is
 * non-zero, otherwise the device default vdev->timeout.tdr.
 *
 * NOTE(review): the log format string below has no trailing "\n", unlike the
 * other ivpu_err() call visible in this commit -- confirm whether that is
 * intentional.
 */
static void ivpu_job_timeout_work(struct work_struct *work)
149+
{
150+
struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
151+
struct ivpu_device *vdev = pm->vdev;
152+
unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
153+
154+
ivpu_err(vdev, "TDR detected, timeout %lu ms", timeout_ms);
155+
ivpu_hw_diagnose_failure(vdev);
156+
157+
ivpu_pm_schedule_recovery(vdev);
158+
}
159+
160+
/*
 * Arm job timeout detection for @vdev.
 *
 * Queues pm->job_timeout_work on system_wq with a delay of the effective
 * timeout converted to jiffies. The effective timeout is the tdr_timeout_ms
 * module parameter when non-zero, otherwise vdev->timeout.tdr.
 */
void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
161+
{
162+
unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
163+
164+
/* No-op if already queued */
165+
queue_delayed_work(system_wq, &vdev->pm->job_timeout_work, msecs_to_jiffies(timeout_ms));
166+
}
167+
168+
/*
 * Disarm job timeout detection for @vdev by cancelling
 * pm->job_timeout_work with cancel_delayed_work_sync().
 *
 * NOTE(review): the _sync variant presumably waits for a running handler to
 * finish -- confirm every caller is in a context where that is allowed.
 */
void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
169+
{
170+
cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
171+
}
172+
144173
int ivpu_pm_suspend_cb(struct device *dev)
145174
{
146175
struct drm_device *drm = dev_get_drvdata(dev);
@@ -317,6 +346,7 @@ void ivpu_pm_init(struct ivpu_device *vdev)
317346

318347
atomic_set(&pm->in_reset, 0);
319348
INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
349+
INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);
320350

321351
if (ivpu_disable_recovery)
322352
delay = -1;
@@ -331,6 +361,7 @@ void ivpu_pm_init(struct ivpu_device *vdev)
331361

332362
void ivpu_pm_cancel_recovery(struct ivpu_device *vdev)
333363
{
364+
drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
334365
cancel_work_sync(&vdev->pm->recovery_work);
335366
}
336367

drivers/accel/ivpu/ivpu_pm.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ struct ivpu_device;
1212

1313
struct ivpu_pm_info {
1414
struct ivpu_device *vdev;
15+
struct delayed_work job_timeout_work;
1516
struct work_struct recovery_work;
1617
atomic_t in_reset;
1718
atomic_t reset_counter;
@@ -37,5 +38,7 @@ int __must_check ivpu_rpm_get_if_active(struct ivpu_device *vdev);
3738
void ivpu_rpm_put(struct ivpu_device *vdev);
3839

3940
void ivpu_pm_schedule_recovery(struct ivpu_device *vdev);
41+
void ivpu_start_job_timeout_detection(struct ivpu_device *vdev);
42+
void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev);
4043

4144
#endif /* __IVPU_PM_H__ */

0 commit comments

Comments
 (0)