Skip to content

Commit fc889e9

Browse files
committed
fix(uvm): skip RM calls during cleanup when GPU is surprise removed
1 parent 6cb727c commit fc889e9

4 files changed

Lines changed: 50 additions & 10 deletions

File tree

kernel-open/nvidia-uvm/uvm_channel.c

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
#include "uvm_common.h"
2828
#include "uvm_global.h"
2929
#include "uvm_hal.h"
30+
#include "uvm_gpu.h"
31+
#include "uvm_gpu_isr.h"
3032
#include "uvm_procfs.h"
3133
#include "uvm_push.h"
3234
#include "uvm_gpu_semaphore.h"
@@ -2310,10 +2312,14 @@ static void channel_destroy(uvm_channel_pool_t *pool, uvm_channel_t *channel)
23102312
free_conf_computing_buffers(channel);
23112313
}
23122314

2313-
if (uvm_channel_is_proxy(channel))
2314-
uvm_rm_locked_call_void(nvUvmInterfacePagingChannelDestroy(channel->proxy.handle));
2315-
else
2316-
uvm_rm_locked_call_void(nvUvmInterfaceChannelDestroy(channel->handle));
2315+
// Skip RM calls if GPU has been surprise removed. Calling RM with stale
2316+
// handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors.
2317+
if (uvm_parent_gpu_is_accessible(pool->manager->gpu->parent)) {
2318+
if (uvm_channel_is_proxy(channel))
2319+
uvm_rm_locked_call_void(nvUvmInterfacePagingChannelDestroy(channel->proxy.handle));
2320+
else
2321+
uvm_rm_locked_call_void(nvUvmInterfaceChannelDestroy(channel->handle));
2322+
}
23172323

23182324
uvm_gpu_tracking_semaphore_free(&channel->tracking_sem);
23192325

@@ -2657,7 +2663,11 @@ static void tsg_destroy(uvm_channel_pool_t *pool, uvmGpuTsgHandle tsg_handle)
26572663
{
26582664
UVM_ASSERT(pool->num_tsgs > 0);
26592665

2660-
uvm_rm_locked_call_void(nvUvmInterfaceTsgDestroy(tsg_handle));
2666+
// Skip RM call if GPU has been surprise removed. Calling RM with stale
2667+
// handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors.
2668+
if (uvm_parent_gpu_is_accessible(pool->manager->gpu->parent))
2669+
uvm_rm_locked_call_void(nvUvmInterfaceTsgDestroy(tsg_handle));
2670+
26612671
pool->num_tsgs--;
26622672
}
26632673

kernel-open/nvidia-uvm/uvm_rm_mem.c

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
#include "uvm_rm_mem.h"
2525
#include "uvm_gpu.h"
26+
#include "uvm_gpu_isr.h"
2627
#include "uvm_global.h"
2728
#include "uvm_kvmalloc.h"
2829
#include "uvm_linux.h"
@@ -298,8 +299,11 @@ void uvm_rm_mem_unmap_cpu(uvm_rm_mem_t *rm_mem)
298299
if (!uvm_rm_mem_mapped_on_cpu(rm_mem))
299300
return;
300301

301-
uvm_rm_locked_call_void(nvUvmInterfaceMemoryCpuUnMap(rm_mem->gpu_owner->rm_address_space,
302-
uvm_rm_mem_get_cpu_va(rm_mem)));
302+
// Skip RM call if GPU has been surprise removed. Calling RM with stale
303+
// handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors.
304+
if (uvm_parent_gpu_is_accessible(rm_mem->gpu_owner->parent))
305+
uvm_rm_locked_call_void(nvUvmInterfaceMemoryCpuUnMap(rm_mem->gpu_owner->rm_address_space,
306+
uvm_rm_mem_get_cpu_va(rm_mem)));
303307

304308
rm_mem_clear_cpu_va(rm_mem);
305309
}
@@ -355,7 +359,12 @@ static void rm_mem_unmap_gpu(uvm_rm_mem_t *rm_mem, uvm_gpu_t *gpu)
355359
rm_mem_unmap_gpu_proxy(rm_mem, gpu);
356360

357361
va = uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu);
358-
uvm_rm_locked_call_void(nvUvmInterfaceMemoryFree(gpu->rm_address_space, va));
362+
363+
// Skip RM call if GPU has been surprise removed. Calling RM with stale
364+
// handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors.
365+
if (uvm_parent_gpu_is_accessible(gpu->parent))
366+
uvm_rm_locked_call_void(nvUvmInterfaceMemoryFree(gpu->rm_address_space, va));
367+
359368
rm_mem_clear_gpu_va(rm_mem, gpu);
360369
}
361370

kernel-open/nvidia-uvm/uvm_user_channel.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "uvm_kvmalloc.h"
3333
#include "uvm_api.h"
3434
#include "uvm_gpu.h"
35+
#include "uvm_gpu_isr.h"
3536
#include "uvm_tracker.h"
3637
#include "uvm_map_external.h"
3738
#include "nv_uvm_interface.h"
@@ -782,6 +783,14 @@ void uvm_user_channel_stop(uvm_user_channel_t *user_channel)
782783
// write mode.
783784
uvm_assert_rwsem_locked_read(&va_space->lock);
784785

786+
// Skip RM call if GPU has been surprise removed. Calling RM with stale
787+
// client handles will result in repeated NV_ERR_INVALID_OBJECT_HANDLE
788+
// errors during teardown.
789+
if (!uvm_parent_gpu_is_accessible(user_channel->gpu->parent)) {
790+
atomic_set(&user_channel->is_bound, 0);
791+
return;
792+
}
793+
785794
// TODO: Bug 1737765. This doesn't stop the user from putting the
786795
// channel back on the runlist, which could put stale instance
787796
// pointers back in the fault buffer.
@@ -854,7 +863,9 @@ void uvm_user_channel_destroy_detached(uvm_user_channel_t *user_channel)
854863
uvm_kvfree(user_channel->resources);
855864
}
856865

857-
if (user_channel->rm_retained_channel)
866+
// Skip RM call if GPU has been surprise removed. Calling RM with stale
867+
// handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors.
868+
if (user_channel->rm_retained_channel && uvm_parent_gpu_is_accessible(user_channel->gpu->parent))
858869
uvm_rm_locked_call_void(nvUvmInterfaceReleaseChannel(user_channel->rm_retained_channel));
859870

860871
uvm_user_channel_release(user_channel);

kernel-open/nvidia-uvm/uvm_va_space.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "uvm_tools.h"
3333
#include "uvm_thread_context.h"
3434
#include "uvm_hal.h"
35+
#include "uvm_gpu_isr.h"
3536
#include "uvm_map_external.h"
3637
#include "uvm_ats.h"
3738
#include "uvm_gpu_access_counters.h"
@@ -1436,6 +1437,13 @@ void uvm_gpu_va_space_unset_page_dir(uvm_gpu_va_space_t *gpu_va_space)
14361437
if (gpu_va_space->did_set_page_directory) {
14371438
NV_STATUS status;
14381439

1440+
// Skip RM call if GPU has been surprise removed. Calling RM with stale
1441+
// handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors.
1442+
if (!uvm_parent_gpu_is_accessible(gpu_va_space->gpu->parent)) {
1443+
gpu_va_space->did_set_page_directory = false;
1444+
return;
1445+
}
1446+
14391447
status = uvm_rm_locked_call(nvUvmInterfaceUnsetPageDirectory(gpu_va_space->duped_gpu_va_space));
14401448
UVM_ASSERT_MSG(status == NV_OK,
14411449
"nvUvmInterfaceUnsetPageDirectory() failed: %s, GPU %s\n",
@@ -1487,7 +1495,9 @@ static void destroy_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space)
14871495
if (gpu_va_space->page_tables.root)
14881496
uvm_page_tree_deinit(&gpu_va_space->page_tables);
14891497

1490-
if (gpu_va_space->duped_gpu_va_space)
1498+
// Skip RM call if GPU has been surprise removed. Calling RM with stale
1499+
// handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors.
1500+
if (gpu_va_space->duped_gpu_va_space && uvm_parent_gpu_is_accessible(gpu_va_space->gpu->parent))
14911501
uvm_rm_locked_call_void(nvUvmInterfaceAddressSpaceDestroy(gpu_va_space->duped_gpu_va_space));
14921502

14931503
// If the state is DEAD, then this GPU VA space is tracked in

0 commit comments

Comments
 (0)