Skip to content

Commit b7851f8

Browse files
jcornwallAMD authored and alexdeucher committed
drm/amdkfd: Trap handler support for expert scheduling mode
The trap may be entered with dependency checking disabled. Wait for dependency counters and save/restore scheduling mode.

v2: Use ttmp1 instead of ttmp11. ttmp11 is not zero-initialized. While the trap handler does zero this field before use, a user-mode second-level trap handler could not rely on this being zero when using an older kernel mode driver.

v3: Use ttmp11 primarily but copy to ttmp1 before jumping to the second-level trap handler. ttmp1 is inspectable by a debugger. Unexpected bits in the unused space may regress existing software.

Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
Reviewed-by: Lancelot Six <lancelot.six@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 4238888)
Cc: stable@vger.kernel.org
1 parent bf2084a commit b7851f8

2 files changed

Lines changed: 73 additions & 26 deletions

File tree

drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h

Lines changed: 36 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3644,14 +3644,18 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = {
36443644
};
36453645

36463646
static const uint32_t cwsr_trap_gfx12_hex[] = {
3647-
0xbfa00001, 0xbfa002a2,
3648-
0xb0804009, 0xb8f8f804,
3647+
0xbfa00001, 0xbfa002b2,
3648+
0xb0804009, 0xb8eef81a,
3649+
0xbf880000, 0xb980081a,
3650+
0x00000000, 0xb8f8f804,
3651+
0x9177ff77, 0x0c000000,
3652+
0x846e9a6e, 0x8c776e77,
36493653
0x9178ff78, 0x00008c00,
36503654
0xb8fbf811, 0x8b6eff78,
36513655
0x00004000, 0xbfa10008,
36523656
0x8b6eff7b, 0x00000080,
36533657
0xbfa20018, 0x8b6ea07b,
3654-
0xbfa20042, 0xbf830010,
3658+
0xbfa2004a, 0xbf830010,
36553659
0xb8fbf811, 0xbfa0fffb,
36563660
0x8b6eff7b, 0x00000bd0,
36573661
0xbfa20010, 0xb8eef812,
@@ -3662,28 +3666,32 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
36623666
0xf0000000, 0xbfa20005,
36633667
0x8b6fff6f, 0x00000200,
36643668
0xbfa20002, 0x8b6ea07b,
3665-
0xbfa2002c, 0xbefa4d82,
3669+
0xbfa20034, 0xbefa4d82,
36663670
0xbf8a0000, 0x84fa887a,
36673671
0xbf0d8f7b, 0xbfa10002,
36683672
0x8c7bff7b, 0xffff0000,
3669-
0xf4601bbd, 0xf8000010,
3670-
0xbf8a0000, 0x846e976e,
3671-
0x9177ff77, 0x00800000,
3672-
0x8c776e77, 0xf4603bbd,
3673-
0xf8000000, 0xbf8a0000,
3674-
0xf4603ebd, 0xf8000008,
3675-
0xbf8a0000, 0x8bee6e6e,
3676-
0xbfa10001, 0xbe80486e,
3677-
0x8b6eff6d, 0xf0000000,
3678-
0xbfa20009, 0xb8eef811,
3679-
0x8b6eff6e, 0x00000080,
3680-
0xbfa20007, 0x8c78ff78,
3681-
0x00004000, 0x80ec886c,
3682-
0x82ed806d, 0xbfa00002,
3683-
0x806c846c, 0x826d806d,
3684-
0x8b6dff6d, 0x0000ffff,
3685-
0x8bfe7e7e, 0x8bea6a6a,
3686-
0x85788978, 0xb9783244,
3673+
0x8b6eff77, 0x0c000000,
3674+
0x916dff6d, 0x0c000000,
3675+
0x8c6d6e6d, 0xf4601bbd,
3676+
0xf8000010, 0xbf8a0000,
3677+
0x846e976e, 0x9177ff77,
3678+
0x00800000, 0x8c776e77,
3679+
0xf4603bbd, 0xf8000000,
3680+
0xbf8a0000, 0xf4603ebd,
3681+
0xf8000008, 0xbf8a0000,
3682+
0x8bee6e6e, 0xbfa10001,
3683+
0xbe80486e, 0x8b6eff6d,
3684+
0xf0000000, 0xbfa20009,
3685+
0xb8eef811, 0x8b6eff6e,
3686+
0x00000080, 0xbfa20007,
3687+
0x8c78ff78, 0x00004000,
3688+
0x80ec886c, 0x82ed806d,
3689+
0xbfa00002, 0x806c846c,
3690+
0x826d806d, 0x8b6dff6d,
3691+
0x0000ffff, 0x8bfe7e7e,
3692+
0x8bea6a6a, 0x85788978,
3693+
0x936eff77, 0x0002001a,
3694+
0xb96ef81a, 0xb9783244,
36873695
0xbe804a6c, 0xb8faf802,
36883696
0xbf0d987a, 0xbfa10001,
36893697
0xbfb00000, 0x8b6dff6d,
@@ -3981,7 +3989,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
39813989
0x008ce800, 0x00000000,
39823990
0x807d817d, 0x8070ff70,
39833991
0x00000080, 0xbf0a7b7d,
3984-
0xbfa2fff7, 0xbfa0016e,
3992+
0xbfa2fff7, 0xbfa00171,
39853993
0xbef4007e, 0x8b75ff7f,
39863994
0x0000ffff, 0x8c75ff75,
39873995
0x00040000, 0xbef60080,
@@ -4163,12 +4171,14 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
41634171
0xf8000074, 0xbf8a0000,
41644172
0x8b6dff6d, 0x0000ffff,
41654173
0x8bfe7e7e, 0x8bea6a6a,
4166-
0xb97af804, 0xbe804ec2,
4167-
0xbf94fffe, 0xbe804a6c,
4174+
0x936eff77, 0x0002001a,
4175+
0xb96ef81a, 0xb97af804,
41684176
0xbe804ec2, 0xbf94fffe,
4169-
0xbfb10000, 0xbf9f0000,
4177+
0xbe804a6c, 0xbe804ec2,
4178+
0xbf94fffe, 0xbfb10000,
41704179
0xbf9f0000, 0xbf9f0000,
41714180
0xbf9f0000, 0xbf9f0000,
4181+
0xbf9f0000, 0x00000000,
41724182
};
41734183

41744184
static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {

drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,16 @@ var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL
7878
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE = SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
7979
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT
8080
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE = 32 - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT
81+
82+
var SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT = 0
83+
var SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE = 2
84+
8185
var BARRIER_STATE_SIGNAL_OFFSET = 16
8286
var BARRIER_STATE_VALID_OFFSET = 0
8387

88+
var TTMP11_SCHED_MODE_SHIFT = 26
89+
var TTMP11_SCHED_MODE_SIZE = 2
90+
var TTMP11_SCHED_MODE_MASK = 0xC000000
8491
var TTMP11_DEBUG_TRAP_ENABLED_SHIFT = 23
8592
var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000
8693

@@ -160,8 +167,19 @@ L_JUMP_TO_RESTORE:
160167
s_branch L_RESTORE
161168

162169
L_SKIP_RESTORE:
170+
// Assume most relaxed scheduling mode is set. Save and revert to normal mode.
171+
s_getreg_b32 ttmp2, hwreg(HW_REG_WAVE_SCHED_MODE)
172+
s_wait_alu 0
173+
s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, \
174+
SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT, SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE), 0
175+
163176
s_getreg_b32 s_save_state_priv, hwreg(HW_REG_WAVE_STATE_PRIV) //save STATUS since we will change SCC
164177

178+
// Save SCHED_MODE[1:0] into ttmp11[27:26].
179+
s_andn2_b32 ttmp11, ttmp11, TTMP11_SCHED_MODE_MASK
180+
s_lshl_b32 ttmp2, ttmp2, TTMP11_SCHED_MODE_SHIFT
181+
s_or_b32 ttmp11, ttmp11, ttmp2
182+
165183
// Clear SPI_PRIO: do not save with elevated priority.
166184
// Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
167185
s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK
@@ -238,6 +256,13 @@ L_FETCH_2ND_TRAP:
238256
s_cbranch_scc0 L_NO_SIGN_EXTEND_TMA
239257
s_or_b32 ttmp15, ttmp15, 0xFFFF0000
240258
L_NO_SIGN_EXTEND_TMA:
259+
#if ASIC_FAMILY == CHIP_GFX12
260+
// Move SCHED_MODE[1:0] from ttmp11 to unused bits in ttmp1[27:26] (return PC_HI).
261+
// The second-level trap will restore from ttmp1 for backwards compatibility.
262+
s_and_b32 ttmp2, ttmp11, TTMP11_SCHED_MODE_MASK
263+
s_andn2_b32 ttmp1, ttmp1, TTMP11_SCHED_MODE_MASK
264+
s_or_b32 ttmp1, ttmp1, ttmp2
265+
#endif
241266

242267
s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 scope:SCOPE_SYS // debug trap enabled flag
243268
s_wait_idle
@@ -287,6 +312,10 @@ L_EXIT_TRAP:
287312
// STATE_PRIV.BARRIER_COMPLETE may have changed since we read it.
288313
// Only restore fields which the trap handler changes.
289314
s_lshr_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT
315+
316+
// Assume relaxed scheduling mode after this point.
317+
restore_sched_mode(ttmp2)
318+
290319
s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \
291320
SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_state_priv
292321

@@ -1043,6 +1072,9 @@ L_SKIP_BARRIER_RESTORE:
10431072
s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
10441073
s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
10451074

1075+
// Assume relaxed scheduling mode after this point.
1076+
restore_sched_mode(s_restore_tmp)
1077+
10461078
s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV), s_restore_state_priv // SCC is included, which is changed by previous salu
10471079

10481080
// Make barrier and LDS state visible to all waves in the group.
@@ -1134,3 +1166,8 @@ function valu_sgpr_hazard
11341166
end
11351167
#endif
11361168
end
1169+
1170+
// Restore the wave's scheduling mode from the copy that the trap entry path
// stashed in ttmp11 (SCHED_MODE[1:0] is kept in ttmp11[27:26]).
//   s_tmp: scratch SGPR; overwritten with the extracted SCHED_MODE value.
// NOTE(review): s_bfe_u32 is an SALU op and presumably updates SCC; both call
// sites invoke this before STATE_PRIV (which carries SCC) is written back --
// confirm that ordering is kept if a call is ever moved.
function restore_sched_mode(s_tmp)
1171+
// Bitfield-extract operand: field offset in the low bits, field width placed at bit 16 (<< 0x10).
s_bfe_u32 s_tmp, ttmp11, (TTMP11_SCHED_MODE_SHIFT | (TTMP11_SCHED_MODE_SIZE << 0x10))
1172+
// Write the extracted value back to the SCHED_MODE hardware register.
s_setreg_b32 hwreg(HW_REG_WAVE_SCHED_MODE), s_tmp
1173+
end

0 commit comments

Comments
 (0)