Skip to content

Commit dd29944

Browse files
mukjoshi authored and alexdeucher committed
drm/amdgpu: Rework retry fault removal
Rework retry fault removal from the software filter by storing an expiry timestamp for a fault that is being removed. When a new fault arrives and matches an entry in the sw filter, it is added as a new fault only if its timestamp is later than the expiry timestamp of the matching entry in the sw filter. This avoids stale faults being added back into the filter, which would otherwise prevent legitimate faults from being handled. Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Mukul Joshi <mukul.joshi@amd.com> Reviewed-by: Philip Yang <Philip.Yang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 318e431 commit dd29944

2 files changed

Lines changed: 34 additions & 3 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -395,8 +395,21 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
395395
while (fault->timestamp >= stamp) {
396396
uint64_t tmp;
397397

398-
if (atomic64_read(&fault->key) == key)
399-
return true;
398+
if (atomic64_read(&fault->key) == key) {
399+
/*
400+
* if we get a fault which is already present in
401+
* the fault_ring and the timestamp of
402+
* the fault is after the expired timestamp,
403+
* then this is a new fault that needs to be added
404+
* into the fault ring.
405+
*/
406+
if (fault->timestamp_expiry != 0 &&
407+
amdgpu_ih_ts_after(fault->timestamp_expiry,
408+
timestamp))
409+
break;
410+
else
411+
return true;
412+
}
400413

401414
tmp = fault->timestamp;
402415
fault = &gmc->fault_ring[fault->next];
@@ -432,15 +445,32 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
432445
{
433446
struct amdgpu_gmc *gmc = &adev->gmc;
434447
uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
448+
struct amdgpu_ih_ring *ih;
435449
struct amdgpu_gmc_fault *fault;
450+
uint32_t last_wptr;
451+
uint64_t last_ts;
436452
uint32_t hash;
437453
uint64_t tmp;
438454

455+
ih = adev->irq.retry_cam_enabled ? &adev->irq.ih_soft : &adev->irq.ih1;
456+
/* Get the WPTR of the last entry in IH ring */
457+
last_wptr = amdgpu_ih_get_wptr(adev, ih);
458+
/* Order wptr with ring data. */
459+
rmb();
460+
/* Get the timestamp of the last entry in IH ring */
461+
last_ts = amdgpu_ih_decode_iv_ts(adev, ih, last_wptr, -1);
462+
439463
hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
440464
fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
441465
do {
442-
if (atomic64_cmpxchg(&fault->key, key, 0) == key)
466+
if (atomic64_read(&fault->key) == key) {
467+
/*
468+
* Update the timestamp when this fault
469+
* expired.
470+
*/
471+
fault->timestamp_expiry = last_ts;
443472
break;
473+
}
444474

445475
tmp = fault->timestamp;
446476
fault = &gmc->fault_ring[fault->next];

drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ struct amdgpu_gmc_fault {
7070
uint64_t timestamp:48;
7171
uint64_t next:AMDGPU_GMC_FAULT_RING_ORDER;
7272
atomic64_t key;
73+
uint64_t timestamp_expiry:48;
7374
};
7475

7576
/*

0 commit comments

Comments
 (0)