Skip to content

Commit 7eba6a8

Browse files
tomaszlimwiniars
authored and committed
drm/xe/vf: Make multi-GT migration less error prone
There is a remote chance that after migration, some GTs will not send the MIGRATED interrupt, or due to current VF KMD state the interrupt will not lead to marking the GT for recovery. Requiring IRQs from all GTs before starting migration introduces the possibility that the process will get stalled due to one GuC. One could argue it is also waste of time to wait for all IRQs, but we should get them all IRQs as soon as VGPU starts, so that's not really an impactful argument. Still, not waiting for all GTs makes it easier to handle situations: * where one GuC IRQ is missing * where state before probe is unclean - getting MIGRATED IRQ as soon as interrupts are enabled * where multiple migrations happen close to each other To help with these cases, this patch alters the post-migration recovery so that recovery task is started as soon as one GuC IRQ is handled, and other GTs are included in recovery later as the subsequent IRQs are serviced. The post-migration recovery can now be called for any selection of GTs, and it will perform recovery on all GTs for which IRQs have arrived, even multiple times if necessary. v2: Typos and style fixes v3: Transferring gt_flags by value rather than reference to last function where it is used Signed-off-by: Tomasz Lis <tomasz.lis@intel.com> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com> Cc: Michal Winiarski <michal.winiarski@intel.com> Cc: Satyanarayana K V P <satyanarayana.k.v.p@intel.com> Acked-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com> Reviewed-by: Michal Winiarski <michal.winiarski@intel.com> Link: https://lore.kernel.org/r/20250630152155.195648-1-tomasz.lis@intel.com Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
1 parent 491b978 commit 7eba6a8

1 file changed

Lines changed: 77 additions & 90 deletions

File tree

drivers/gpu/drm/xe/xe_sriov_vf.c

Lines changed: 77 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -147,127 +147,113 @@ void xe_sriov_vf_init_early(struct xe_device *xe)
147147
xe_sriov_info(xe, "migration not supported by this module version\n");
148148
}
149149

150-
/**
151-
* vf_post_migration_requery_guc - Re-query GuC for current VF provisioning.
150+
static bool gt_vf_post_migration_needed(struct xe_gt *gt)
151+
{
152+
return test_bit(gt->info.id, &gt_to_xe(gt)->sriov.vf.migration.gt_flags);
153+
}
154+
155+
/*
156+
* Notify GuCs marked in flags about resource fixups apply finished.
152157
* @xe: the &xe_device struct instance
153-
*
154-
* After migration, we need to re-query all VF configuration to make sure
155-
* they match previous provisioning. Note that most of VF provisioning
156-
* shall be the same, except GGTT range, since GGTT is not virtualized per-VF.
157-
*
158-
* Returns: 0 if the operation completed successfully, or a negative error
159-
* code otherwise.
158+
* @gt_flags: flags marking to which GTs the notification shall be sent
160159
*/
161-
static int vf_post_migration_requery_guc(struct xe_device *xe)
160+
static int vf_post_migration_notify_resfix_done(struct xe_device *xe, unsigned long gt_flags)
162161
{
163162
struct xe_gt *gt;
164163
unsigned int id;
165-
int err, ret = 0;
164+
int err = 0;
166165

167166
for_each_gt(gt, xe, id) {
168-
err = xe_gt_sriov_vf_query_config(gt);
169-
ret = ret ?: err;
167+
if (!test_bit(id, &gt_flags))
168+
continue;
169+
/* skip asking GuC for RESFIX exit if new recovery request arrived */
170+
if (gt_vf_post_migration_needed(gt))
171+
continue;
172+
err = xe_gt_sriov_vf_notify_resfix_done(gt);
173+
if (err)
174+
break;
175+
clear_bit(id, &gt_flags);
170176
}
171177

172-
return ret;
178+
if (gt_flags && !err)
179+
drm_dbg(&xe->drm, "another recovery imminent, skipped some notifications\n");
180+
return err;
173181
}
174182

175-
static void vf_post_migration_fixup_ctb(struct xe_device *xe)
183+
static int vf_get_next_migrated_gt_id(struct xe_device *xe)
176184
{
177185
struct xe_gt *gt;
178186
unsigned int id;
179187

180-
xe_assert(xe, IS_SRIOV_VF(xe));
181-
182188
for_each_gt(gt, xe, id) {
183-
s32 shift = xe_gt_sriov_vf_ggtt_shift(gt);
184-
185-
xe_guc_ct_fixup_messages_with_ggtt(&gt->uc.guc.ct, shift);
189+
if (test_and_clear_bit(id, &xe->sriov.vf.migration.gt_flags))
190+
return id;
186191
}
192+
return -1;
187193
}
188194

189-
/*
190-
* vf_post_migration_imminent - Check if post-restore recovery is coming.
191-
* @xe: the &xe_device struct instance
195+
/**
196+
* Perform post-migration fixups on a single GT.
192197
*
193-
* Return: True if migration recovery worker will soon be running. Any worker currently
194-
* executing does not affect the result.
198+
* After migration, GuC needs to be re-queried for VF configuration to check
199+
* if it matches previous provisioning. Most of VF provisioning shall be the
200+
* same, except GGTT range, since GGTT is not virtualized per-VF. If GGTT
201+
* range has changed, we have to perform fixups - shift all GGTT references
202+
* used anywhere within the driver. After the fixups in this function succeed,
203+
* it is allowed to ask the GuC bound to this GT to continue normal operation.
204+
*
205+
* Returns: 0 if the operation completed successfully, or a negative error
206+
* code otherwise.
195207
*/
196-
static bool vf_post_migration_imminent(struct xe_device *xe)
208+
static int gt_vf_post_migration_fixups(struct xe_gt *gt)
197209
{
198-
return xe->sriov.vf.migration.gt_flags != 0 ||
199-
work_pending(&xe->sriov.vf.migration.worker);
200-
}
201-
202-
static bool vf_post_migration_fixup_ggtt_nodes(struct xe_device *xe)
203-
{
204-
bool need_fixups = false;
205-
struct xe_tile *tile;
206-
unsigned int id;
207-
208-
for_each_tile(tile, xe, id) {
209-
struct xe_gt *gt = tile->primary_gt;
210-
s64 shift;
211-
212-
shift = xe_gt_sriov_vf_ggtt_shift(gt);
213-
if (shift) {
214-
need_fixups = true;
215-
xe_tile_sriov_vf_fixup_ggtt_nodes(tile, shift);
216-
}
217-
}
218-
return need_fixups;
219-
}
210+
s64 shift;
211+
int err;
220212

221-
/*
222-
* Notify all GuCs about resource fixups apply finished.
223-
*/
224-
static void vf_post_migration_notify_resfix_done(struct xe_device *xe)
225-
{
226-
struct xe_gt *gt;
227-
unsigned int id;
213+
err = xe_gt_sriov_vf_query_config(gt);
214+
if (err)
215+
return err;
228216

229-
for_each_gt(gt, xe, id) {
230-
if (vf_post_migration_imminent(xe))
231-
goto skip;
232-
xe_gt_sriov_vf_notify_resfix_done(gt);
217+
shift = xe_gt_sriov_vf_ggtt_shift(gt);
218+
if (shift) {
219+
xe_tile_sriov_vf_fixup_ggtt_nodes(gt_to_tile(gt), shift);
220+
/* FIXME: add the recovery steps */
221+
xe_guc_ct_fixup_messages_with_ggtt(&gt->uc.guc.ct, shift);
233222
}
234-
return;
235-
236-
skip:
237-
drm_dbg(&xe->drm, "another recovery imminent, skipping notifications\n");
223+
return 0;
238224
}
239225

240226
static void vf_post_migration_recovery(struct xe_device *xe)
241227
{
242-
bool need_fixups;
243-
int err;
228+
unsigned long fixed_gts = 0;
229+
int id, err;
244230

245231
drm_dbg(&xe->drm, "migration recovery in progress\n");
246232
xe_pm_runtime_get(xe);
247-
err = vf_post_migration_requery_guc(xe);
248-
if (vf_post_migration_imminent(xe))
249-
goto defer;
250-
if (unlikely(err))
251-
goto fail;
233+
252234
if (!vf_migration_supported(xe)) {
253235
xe_sriov_err(xe, "migration not supported by this module version\n");
254236
err = -ENOTRECOVERABLE;
255237
goto fail;
256238
}
257239

258-
need_fixups = vf_post_migration_fixup_ggtt_nodes(xe);
259-
/* FIXME: add the recovery steps */
260-
if (need_fixups)
261-
vf_post_migration_fixup_ctb(xe);
240+
while (id = vf_get_next_migrated_gt_id(xe), id >= 0) {
241+
struct xe_gt *gt = xe_device_get_gt(xe, id);
242+
243+
err = gt_vf_post_migration_fixups(gt);
244+
if (err)
245+
goto fail;
246+
247+
set_bit(id, &fixed_gts);
248+
}
249+
250+
err = vf_post_migration_notify_resfix_done(xe, fixed_gts);
251+
if (err)
252+
goto fail;
262253

263-
vf_post_migration_notify_resfix_done(xe);
264254
xe_pm_runtime_put(xe);
265255
drm_notice(&xe->drm, "migration recovery ended\n");
266256
return;
267-
defer:
268-
xe_pm_runtime_put(xe);
269-
drm_dbg(&xe->drm, "migration recovery deferred\n");
270-
return;
271257
fail:
272258
xe_pm_runtime_put(xe);
273259
drm_err(&xe->drm, "migration recovery failed (%pe)\n", ERR_PTR(err));
@@ -282,18 +268,23 @@ static void migration_worker_func(struct work_struct *w)
282268
vf_post_migration_recovery(xe);
283269
}
284270

285-
static bool vf_ready_to_recovery_on_all_gts(struct xe_device *xe)
271+
/*
272+
* Check if post-restore recovery is coming on any of GTs.
273+
* @xe: the &xe_device struct instance
274+
*
275+
* Return: True if migration recovery worker will soon be running. Any worker currently
276+
* executing does not affect the result.
277+
*/
278+
static bool vf_ready_to_recovery_on_any_gts(struct xe_device *xe)
286279
{
287280
struct xe_gt *gt;
288281
unsigned int id;
289282

290283
for_each_gt(gt, xe, id) {
291-
if (!test_bit(id, &xe->sriov.vf.migration.gt_flags)) {
292-
xe_gt_sriov_dbg_verbose(gt, "still not ready to recover\n");
293-
return false;
294-
}
284+
if (test_bit(id, &xe->sriov.vf.migration.gt_flags))
285+
return true;
295286
}
296-
return true;
287+
return false;
297288
}
298289

299290
/**
@@ -308,13 +299,9 @@ void xe_sriov_vf_start_migration_recovery(struct xe_device *xe)
308299

309300
xe_assert(xe, IS_SRIOV_VF(xe));
310301

311-
if (!vf_ready_to_recovery_on_all_gts(xe))
302+
if (!vf_ready_to_recovery_on_any_gts(xe))
312303
return;
313304

314-
WRITE_ONCE(xe->sriov.vf.migration.gt_flags, 0);
315-
/* Ensure other threads see that no flags are set now. */
316-
smp_mb();
317-
318305
started = queue_work(xe->sriov.wq, &xe->sriov.vf.migration.worker);
319306
drm_info(&xe->drm, "VF migration recovery %s\n", started ?
320307
"scheduled" : "already in progress");

0 commit comments

Comments
 (0)