Skip to content

Commit 50a0b12

Browse files
cwabbott0 authored and Rob Clark committed
drm/msm: Wait for MMU devcoredump when waiting for GMU
If there is a flood of faults then the MMU can become saturated while it waits for the kernel to process the first fault and resume it, so that the GMU becomes blocked. This is mainly a problem when the kernel reads the state of the GPU for a devcoredump, because this takes a while. If we time out waiting for the GMU, check if this has happened and retry after we're finished.

Signed-off-by: Connor Abbott <cwabbott0@gmail.com>
Patchwork: https://patchwork.freedesktop.org/patch/664685/
Signed-off-by: Rob Clark <robin.clark@oss.qualcomm.com>
1 parent a3a2237 commit 50a0b12

4 files changed

Lines changed: 48 additions & 6 deletions

File tree

drivers/gpu/drm/msm/adreno/a6xx_gmu.c

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -382,9 +382,23 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state)
382382
/* Trigger the requested OOB operation */
383383
gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, 1 << request);
384384

385-
/* Wait for the acknowledge interrupt */
386-
ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
387-
val & (1 << ack), 100, 10000);
385+
do {
386+
/* Wait for the acknowledge interrupt */
387+
ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
388+
val & (1 << ack), 100, 10000);
389+
390+
if (!ret)
391+
break;
392+
393+
if (completion_done(&a6xx_gpu->base.fault_coredump_done))
394+
break;
395+
396+
/* We may timeout because the GMU is temporarily wedged from
397+
* pending faults from the GPU and we are taking a devcoredump.
398+
* Wait until the MMU is resumed and try again.
399+
*/
400+
wait_for_completion(&a6xx_gpu->base.fault_coredump_done);
401+
} while (true);
388402

389403
if (ret)
390404
DRM_DEV_ERROR(gmu->dev,

drivers/gpu/drm/msm/adreno/a6xx_hfi.c

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,25 @@ static int a6xx_hfi_wait_for_msg_interrupt(struct a6xx_gmu *gmu, u32 id, u32 seq
105105
{
106106
int ret;
107107
u32 val;
108+
struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
109+
110+
do {
111+
/* Wait for a response */
112+
ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
113+
val & A6XX_GMU_GMU2HOST_INTR_INFO_MSGQ, 100, 1000000);
114+
115+
if (!ret)
116+
break;
108117

109-
/* Wait for a response */
110-
ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
111-
val & A6XX_GMU_GMU2HOST_INTR_INFO_MSGQ, 100, 1000000);
118+
if (completion_done(&a6xx_gpu->base.fault_coredump_done))
119+
break;
120+
121+
/* We may timeout because the GMU is temporarily wedged from
122+
* pending faults from the GPU and we are taking a devcoredump.
123+
* Wait until the MMU is resumed and try again.
124+
*/
125+
wait_for_completion(&a6xx_gpu->base.fault_coredump_done);
126+
} while (true);
112127

113128
if (ret) {
114129
DRM_DEV_ERROR(gmu->dev,

drivers/gpu/drm/msm/adreno/adreno_gpu.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,7 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
284284
struct adreno_smmu_fault_info *info, const char *block,
285285
u32 scratch[4])
286286
{
287+
struct adreno_gpu *adreno_gpu = container_of(gpu, struct adreno_gpu, base);
287288
struct msm_drm_private *priv = gpu->dev->dev_private;
288289
struct msm_mmu *mmu = to_msm_vm(gpu->vm)->mmu;
289290
const char *type = "UNKNOWN";
@@ -336,13 +337,20 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
336337
/* Turn off the hangcheck timer to keep it from bothering us */
337338
timer_delete(&gpu->hangcheck_timer);
338339

340+
/* Let any concurrent GMU transactions know that the MMU may be
341+
* blocked for a while and they should wait on us.
342+
*/
343+
reinit_completion(&adreno_gpu->fault_coredump_done);
344+
339345
fault_info.ttbr0 = info->ttbr0;
340346
fault_info.iova = iova;
341347
fault_info.flags = flags;
342348
fault_info.type = type;
343349
fault_info.block = block;
344350

345351
msm_gpu_fault_crashstate_capture(gpu, &fault_info);
352+
353+
complete_all(&adreno_gpu->fault_coredump_done);
346354
}
347355

348356
return 0;
@@ -1223,6 +1231,9 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
12231231
if (ret)
12241232
return ret;
12251233

1234+
init_completion(&adreno_gpu->fault_coredump_done);
1235+
complete_all(&adreno_gpu->fault_coredump_done);
1236+
12261237
pm_runtime_set_autosuspend_delay(dev,
12271238
adreno_gpu->info->inactive_period);
12281239
pm_runtime_use_autosuspend(dev);

drivers/gpu/drm/msm/adreno/adreno_gpu.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,8 @@ struct adreno_gpu {
180180
uint16_t speedbin;
181181
const struct adreno_gpu_funcs *funcs;
182182

183+
struct completion fault_coredump_done;
184+
183185
/* interesting register offsets to dump: */
184186
const unsigned int *registers;
185187

0 commit comments

Comments
 (0)