Skip to content

Commit 9edc529

Browse files
robclarkRob Clark
authored andcommitted
drm/msm: Add VM logging for VM_BIND updates
When userspace opts in to VM_BIND, the submit no longer holds references keeping the VMA alive. This makes it difficult to distinguish between UMD/KMD/app bugs. So add a debug option for logging the most recent VM updates and capturing these in GPU devcoredumps. The submitqueue id is also captured, a value of zero means the operation did not go via a submitqueue (ie. comes from msm_gem_vm_close() tearing down the remaining mappings when the device file is closed. Signed-off-by: Rob Clark <robdclark@chromium.org> Signed-off-by: Rob Clark <robin.clark@oss.qualcomm.com> Tested-by: Antonino Maniscalco <antomani103@gmail.com> Reviewed-by: Antonino Maniscalco <antomani103@gmail.com> Patchwork: https://patchwork.freedesktop.org/patch/661518/
1 parent 2e6a8a1 commit 9edc529

5 files changed

Lines changed: 202 additions & 13 deletions

File tree

drivers/gpu/drm/msm/adreno/adreno_gpu.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -833,6 +833,7 @@ void adreno_gpu_state_destroy(struct msm_gpu_state *state)
833833
for (i = 0; state->bos && i < state->nr_bos; i++)
834834
kvfree(state->bos[i].data);
835835

836+
kfree(state->vm_logs);
836837
kfree(state->bos);
837838
kfree(state->comm);
838839
kfree(state->cmd);
@@ -973,6 +974,16 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
973974
info->ptes[0], info->ptes[1], info->ptes[2], info->ptes[3]);
974975
}
975976

977+
if (state->vm_logs) {
978+
drm_puts(p, "vm-log:\n");
979+
for (i = 0; i < state->nr_vm_logs; i++) {
980+
struct msm_gem_vm_log_entry *e = &state->vm_logs[i];
981+
drm_printf(p, " - %s:%d: 0x%016llx-0x%016llx\n",
982+
e->op, e->queue_id, e->iova,
983+
e->iova + e->range);
984+
}
985+
}
986+
976987
drm_printf(p, "rbbm-status: 0x%08x\n", state->rbbm_status);
977988

978989
drm_puts(p, "ringbuffer:\n");

drivers/gpu/drm/msm/msm_gem.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,20 @@
2424
#define MSM_BO_STOLEN 0x10000000 /* try to use stolen/splash memory */
2525
#define MSM_BO_MAP_PRIV 0x20000000 /* use IOMMU_PRIV when mapping */
2626

27+
/**
28+
* struct msm_gem_vm_log_entry - An entry in the VM log
29+
*
30+
* For userspace managed VMs, a log of recent VM updates is tracked and
31+
* captured in GPU devcore dumps, to aid debugging issues caused by (for
32+
* example) incorrectly synchronized VM updates
33+
*/
34+
struct msm_gem_vm_log_entry {
35+
const char *op;
36+
uint64_t iova;
37+
uint64_t range;
38+
int queue_id;
39+
};
40+
2741
/**
2842
* struct msm_gem_vm - VM object
2943
*
@@ -85,6 +99,15 @@ struct msm_gem_vm {
8599
/** @last_fence: Fence for last pending work scheduled on the VM */
86100
struct dma_fence *last_fence;
87101

102+
/** @log: A log of recent VM updates */
103+
struct msm_gem_vm_log_entry *log;
104+
105+
/** @log_shift: length of @log is (1 << @log_shift) */
106+
uint32_t log_shift;
107+
108+
/** @log_idx: index of next @log entry to write */
109+
uint32_t log_idx;
110+
88111
/** @faults: the number of GPU hangs associated with this address space */
89112
int faults;
90113

@@ -115,6 +138,7 @@ msm_gem_vm_create(struct drm_device *drm, struct msm_mmu *mmu, const char *name,
115138
u64 va_start, u64 va_size, bool managed);
116139

117140
void msm_gem_vm_close(struct drm_gpuvm *gpuvm);
141+
void msm_gem_vm_unusable(struct drm_gpuvm *gpuvm);
118142

119143
struct msm_fence_context;
120144

drivers/gpu/drm/msm/msm_gem_vma.c

Lines changed: 116 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@
1717

1818
#define vm_dbg(fmt, ...) pr_debug("%s:%d: "fmt"\n", __func__, __LINE__, ##__VA_ARGS__)
1919

20+
static uint vm_log_shift = 0;
21+
MODULE_PARM_DESC(vm_log_shift, "Length of VM op log");
22+
module_param_named(vm_log_shift, vm_log_shift, uint, 0600);
23+
2024
/**
2125
* struct msm_vm_map_op - create new pgtable mapping
2226
*/
@@ -31,6 +35,13 @@ struct msm_vm_map_op {
3135
struct sg_table *sgt;
3236
/** @prot: the mapping protection flags */
3337
int prot;
38+
39+
/**
40+
* @queue_id: The id of the submitqueue the operation is performed
41+
* on, or zero for (in particular) UNMAP ops triggered outside of
42+
* a submitqueue (ie. process cleanup)
43+
*/
44+
int queue_id;
3445
};
3546

3647
/**
@@ -41,6 +52,13 @@ struct msm_vm_unmap_op {
4152
uint64_t iova;
4253
/** @range: size of region to unmap */
4354
uint64_t range;
55+
56+
/**
57+
* @queue_id: The id of the submitqueue the operation is performed
58+
* on, or zero for (in particular) UNMAP ops triggered outside of
59+
* a submitqueue (ie. process cleanup)
60+
*/
61+
int queue_id;
4462
};
4563

4664
/**
@@ -144,27 +162,95 @@ msm_gem_vm_free(struct drm_gpuvm *gpuvm)
144162
vm->mmu->funcs->destroy(vm->mmu);
145163
dma_fence_put(vm->last_fence);
146164
put_pid(vm->pid);
165+
kfree(vm->log);
147166
kfree(vm);
148167
}
149168

169+
/**
170+
* msm_gem_vm_unusable() - Mark a VM as unusable
171+
* @vm: the VM to mark unusable
172+
*/
173+
void
174+
msm_gem_vm_unusable(struct drm_gpuvm *gpuvm)
175+
{
176+
struct msm_gem_vm *vm = to_msm_vm(gpuvm);
177+
uint32_t vm_log_len = (1 << vm->log_shift);
178+
uint32_t vm_log_mask = vm_log_len - 1;
179+
uint32_t nr_vm_logs;
180+
int first;
181+
182+
vm->unusable = true;
183+
184+
/* Bail if no log, or empty log: */
185+
if (!vm->log || !vm->log[0].op)
186+
return;
187+
188+
mutex_lock(&vm->mmu_lock);
189+
190+
/*
191+
* log_idx is the next entry to overwrite, meaning it is the oldest, or
192+
* first, entry (other than the special case handled below where the
193+
* log hasn't wrapped around yet)
194+
*/
195+
first = vm->log_idx;
196+
197+
if (!vm->log[first].op) {
198+
/*
199+
* If the next log entry has not been written yet, then only
200+
* entries 0 to idx-1 are valid (ie. we haven't wrapped around
201+
* yet)
202+
*/
203+
nr_vm_logs = MAX(0, first - 1);
204+
first = 0;
205+
} else {
206+
nr_vm_logs = vm_log_len;
207+
}
208+
209+
pr_err("vm-log:\n");
210+
for (int i = 0; i < nr_vm_logs; i++) {
211+
int idx = (i + first) & vm_log_mask;
212+
struct msm_gem_vm_log_entry *e = &vm->log[idx];
213+
pr_err(" - %s:%d: 0x%016llx-0x%016llx\n",
214+
e->op, e->queue_id, e->iova,
215+
e->iova + e->range);
216+
}
217+
218+
mutex_unlock(&vm->mmu_lock);
219+
}
220+
150221
static void
151-
vm_unmap_op(struct msm_gem_vm *vm, const struct msm_vm_unmap_op *op)
222+
vm_log(struct msm_gem_vm *vm, const char *op, uint64_t iova, uint64_t range, int queue_id)
152223
{
224+
int idx;
225+
153226
if (!vm->managed)
154227
lockdep_assert_held(&vm->mmu_lock);
155228

156-
vm_dbg("%p: %016llx %016llx", vm, op->iova, op->iova + op->range);
229+
vm_dbg("%s:%p:%d: %016llx %016llx", op, vm, queue_id, iova, iova + range);
230+
231+
if (!vm->log)
232+
return;
233+
234+
idx = vm->log_idx;
235+
vm->log[idx].op = op;
236+
vm->log[idx].iova = iova;
237+
vm->log[idx].range = range;
238+
vm->log[idx].queue_id = queue_id;
239+
vm->log_idx = (vm->log_idx + 1) & ((1 << vm->log_shift) - 1);
240+
}
241+
242+
static void
243+
vm_unmap_op(struct msm_gem_vm *vm, const struct msm_vm_unmap_op *op)
244+
{
245+
vm_log(vm, "unmap", op->iova, op->range, op->queue_id);
157246

158247
vm->mmu->funcs->unmap(vm->mmu, op->iova, op->range);
159248
}
160249

161250
static int
162251
vm_map_op(struct msm_gem_vm *vm, const struct msm_vm_map_op *op)
163252
{
164-
if (!vm->managed)
165-
lockdep_assert_held(&vm->mmu_lock);
166-
167-
vm_dbg("%p: %016llx %016llx", vm, op->iova, op->iova + op->range);
253+
vm_log(vm, "map", op->iova, op->range, op->queue_id);
168254

169255
return vm->mmu->funcs->map(vm->mmu, op->iova, op->sgt, op->offset,
170256
op->range, op->prot);
@@ -382,6 +468,7 @@ vma_from_op(struct op_arg *arg, struct drm_gpuva_op_map *op)
382468
static int
383469
msm_gem_vm_sm_step_map(struct drm_gpuva_op *op, void *arg)
384470
{
471+
struct msm_vm_bind_job *job = ((struct op_arg *)arg)->job;
385472
struct drm_gem_object *obj = op->map.gem.obj;
386473
struct drm_gpuva *vma;
387474
struct sg_table *sgt;
@@ -412,6 +499,7 @@ msm_gem_vm_sm_step_map(struct drm_gpuva_op *op, void *arg)
412499
.range = vma->va.range,
413500
.offset = vma->gem.offset,
414501
.prot = prot,
502+
.queue_id = job->queue->id,
415503
},
416504
.obj = vma->gem.obj,
417505
});
@@ -445,6 +533,7 @@ msm_gem_vm_sm_step_remap(struct drm_gpuva_op *op, void *arg)
445533
.unmap = {
446534
.iova = unmap_start,
447535
.range = unmap_range,
536+
.queue_id = job->queue->id,
448537
},
449538
.obj = orig_vma->gem.obj,
450539
});
@@ -506,6 +595,7 @@ msm_gem_vm_sm_step_remap(struct drm_gpuva_op *op, void *arg)
506595
static int
507596
msm_gem_vm_sm_step_unmap(struct drm_gpuva_op *op, void *arg)
508597
{
598+
struct msm_vm_bind_job *job = ((struct op_arg *)arg)->job;
509599
struct drm_gpuva *vma = op->unmap.va;
510600
struct msm_gem_vma *msm_vma = to_msm_vma(vma);
511601

@@ -520,6 +610,7 @@ msm_gem_vm_sm_step_unmap(struct drm_gpuva_op *op, void *arg)
520610
.unmap = {
521611
.iova = vma->va.addr,
522612
.range = vma->va.range,
613+
.queue_id = job->queue->id,
523614
},
524615
.obj = vma->gem.obj,
525616
});
@@ -584,7 +675,7 @@ msm_vma_job_run(struct drm_sched_job *_job)
584675
* now the VM is in an undefined state. Game over!
585676
*/
586677
if (ret)
587-
vm->unusable = true;
678+
msm_gem_vm_unusable(job->vm);
588679

589680
job_foreach_bo (obj, job) {
590681
msm_gem_lock(obj);
@@ -695,6 +786,23 @@ msm_gem_vm_create(struct drm_device *drm, struct msm_mmu *mmu, const char *name,
695786

696787
drm_mm_init(&vm->mm, va_start, va_size);
697788

789+
/*
790+
* We don't really need vm log for kernel managed VMs, as the kernel
791+
* is responsible for ensuring that GEM objs are mapped if they are
792+
* used by a submit. Furthermore we piggyback on mmu_lock to serialize
793+
* access to the log.
794+
*
795+
* Limit the max log_shift to 8 to prevent userspace from asking us
796+
* for an unreasonable log size.
797+
*/
798+
if (!managed)
799+
vm->log_shift = MIN(vm_log_shift, 8);
800+
801+
if (vm->log_shift) {
802+
vm->log = kmalloc_array(1 << vm->log_shift, sizeof(vm->log[0]),
803+
GFP_KERNEL | __GFP_ZERO);
804+
}
805+
698806
return &vm->base;
699807

700808
err_free_dummy:
@@ -1162,7 +1270,7 @@ vm_bind_job_prepare(struct msm_vm_bind_job *job)
11621270
* state the vm is in. So throw up our hands!
11631271
*/
11641272
if (i > 0)
1165-
vm->unusable = true;
1273+
msm_gem_vm_unusable(job->vm);
11661274
return ret;
11671275
}
11681276
}

drivers/gpu/drm/msm/msm_gpu.c

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -259,9 +259,6 @@ static void crashstate_get_bos(struct msm_gpu_state *state, struct msm_gem_submi
259259
{
260260
extern bool rd_full;
261261

262-
if (!submit)
263-
return;
264-
265262
if (msm_context_is_vmbind(submit->queue->ctx)) {
266263
struct drm_exec exec;
267264
struct drm_gpuva *vma;
@@ -318,6 +315,48 @@ static void crashstate_get_bos(struct msm_gpu_state *state, struct msm_gem_submi
318315
}
319316
}
320317

318+
static void crashstate_get_vm_logs(struct msm_gpu_state *state, struct msm_gem_vm *vm)
319+
{
320+
uint32_t vm_log_len = (1 << vm->log_shift);
321+
uint32_t vm_log_mask = vm_log_len - 1;
322+
int first;
323+
324+
/* Bail if no log, or empty log: */
325+
if (!vm->log || !vm->log[0].op)
326+
return;
327+
328+
mutex_lock(&vm->mmu_lock);
329+
330+
/*
331+
* log_idx is the next entry to overwrite, meaning it is the oldest, or
332+
* first, entry (other than the special case handled below where the
333+
* log hasn't wrapped around yet)
334+
*/
335+
first = vm->log_idx;
336+
337+
if (!vm->log[first].op) {
338+
/*
339+
* If the next log entry has not been written yet, then only
340+
* entries 0 to idx-1 are valid (ie. we haven't wrapped around
341+
* yet)
342+
*/
343+
state->nr_vm_logs = MAX(0, first - 1);
344+
first = 0;
345+
} else {
346+
state->nr_vm_logs = vm_log_len;
347+
}
348+
349+
state->vm_logs = kmalloc_array(
350+
state->nr_vm_logs, sizeof(vm->log[0]), GFP_KERNEL);
351+
for (int i = 0; i < state->nr_vm_logs; i++) {
352+
int idx = (i + first) & vm_log_mask;
353+
354+
state->vm_logs[i] = vm->log[idx];
355+
}
356+
357+
mutex_unlock(&vm->mmu_lock);
358+
}
359+
321360
static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
322361
struct msm_gem_submit *submit, struct msm_gpu_fault_info *fault_info,
323362
char *comm, char *cmd)
@@ -351,7 +390,10 @@ static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
351390
msm_iommu_pagetable_walk(mmu, info->iova, info->ptes);
352391
}
353392

354-
crashstate_get_bos(state, submit);
393+
if (submit) {
394+
crashstate_get_vm_logs(state, to_msm_vm(submit->vm));
395+
crashstate_get_bos(state, submit);
396+
}
355397

356398
/* Set the active crash state to be dumped on failure */
357399
gpu->crashstate = state;
@@ -452,7 +494,7 @@ static void recover_worker(struct kthread_work *work)
452494
* VM_BIND)
453495
*/
454496
if (!vm->managed)
455-
vm->unusable = true;
497+
msm_gem_vm_unusable(submit->vm);
456498
}
457499

458500
get_comm_cmdline(submit, &comm, &cmd);

drivers/gpu/drm/msm/msm_gpu.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "msm_gem.h"
2121

2222
struct msm_gem_submit;
23+
struct msm_gem_vm_log_entry;
2324
struct msm_gpu_perfcntr;
2425
struct msm_gpu_state;
2526
struct msm_context;
@@ -603,6 +604,9 @@ struct msm_gpu_state {
603604

604605
struct msm_gpu_fault_info fault_info;
605606

607+
int nr_vm_logs;
608+
struct msm_gem_vm_log_entry *vm_logs;
609+
606610
int nr_bos;
607611
struct msm_gpu_state_bo *bos;
608612
};

0 commit comments

Comments
 (0)