@@ -540,26 +540,29 @@ enum kfd_smi_event {
540540 KFD_SMI_EVENT_ALL_PROCESS = 64
541541};
542542
543+ /* The reason of the page migration event */
543544enum KFD_MIGRATE_TRIGGERS {
544- KFD_MIGRATE_TRIGGER_PREFETCH ,
545- KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU ,
546- KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU ,
547- KFD_MIGRATE_TRIGGER_TTM_EVICTION
545+ KFD_MIGRATE_TRIGGER_PREFETCH , /* Prefetch to GPU VRAM or system memory */
546+ KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU , /* GPU page fault recover */
547+ KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU , /* CPU page fault recover */
548+ KFD_MIGRATE_TRIGGER_TTM_EVICTION /* TTM eviction */
548549};
549550
551+ /* The reason of user queue eviction event */
550552enum KFD_QUEUE_EVICTION_TRIGGERS {
551- KFD_QUEUE_EVICTION_TRIGGER_SVM ,
552- KFD_QUEUE_EVICTION_TRIGGER_USERPTR ,
553- KFD_QUEUE_EVICTION_TRIGGER_TTM ,
554- KFD_QUEUE_EVICTION_TRIGGER_SUSPEND ,
555- KFD_QUEUE_EVICTION_CRIU_CHECKPOINT ,
556- KFD_QUEUE_EVICTION_CRIU_RESTORE
553+ KFD_QUEUE_EVICTION_TRIGGER_SVM , /* SVM buffer migration */
554+ KFD_QUEUE_EVICTION_TRIGGER_USERPTR , /* userptr movement */
555+ KFD_QUEUE_EVICTION_TRIGGER_TTM , /* TTM move buffer */
556+ KFD_QUEUE_EVICTION_TRIGGER_SUSPEND , /* GPU suspend */
557+ KFD_QUEUE_EVICTION_CRIU_CHECKPOINT , /* CRIU checkpoint */
558+ KFD_QUEUE_EVICTION_CRIU_RESTORE /* CRIU restore */
557559};
558560
561+ /* The reason of unmap buffer from GPU event */
559562enum KFD_SVM_UNMAP_TRIGGERS {
560- KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY ,
561- KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE ,
562- KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU
563+ KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY , /* MMU notifier CPU buffer movement */
564+ KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE ,/* MMU notifier page migration */
565+ KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU /* Unmap to free the buffer */
563566};
564567
565568#define KFD_SMI_EVENT_MASK_FROM_INDEX (i ) (1ULL << ((i) - 1))
@@ -570,6 +573,77 @@ struct kfd_ioctl_smi_events_args {
570573 __u32 anon_fd ; /* from KFD */
571574};
572575
576+ /*
577+ * SVM event tracing via SMI system management interface
578+ *
579+ * Open event file descriptor
580+ * use ioctl AMDKFD_IOC_SMI_EVENTS, pass in gpuid and return an anonymous file
581+ * descriptor to receive SMI events.
582+ * If calling with sudo permission, then file descriptor can be used to receive
583+ * SVM events from all processes, otherwise, to only receive SVM events of same
584+ * process.
585+ *
586+ * To enable the SVM event
587+ * Write event file descriptor with KFD_SMI_EVENT_MASK_FROM_INDEX(event) bitmap
588+ * mask to start recording the event to the kfifo, use bitmap mask combination
589+ * for multiple events. New event mask will overwrite the previous event mask.
590+ * KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESS) bit requires sudo
591+ * permission to receive SVM events from all processes.
592+ *
593+ * To receive the event
594+ * Application can poll file descriptor to wait for the events, then read event
595+ * from the file into a buffer. Each event is one line string message, starting
596+ * with the event id, then the event specific information.
597+ *
598+ * To decode event information
599+ * The following event format string macro can be used with sscanf to decode
600+ * the specific event information.
601+ * event triggers: the reason to generate the event, defined as enum for unmap,
602+ * eviction and migrate events.
603+ * node, from, to, prefetch_loc, preferred_loc: GPU ID, or 0 for system memory.
604+ * addr: user mode address, in pages
605+ * size: in pages
606+ * pid: the process ID to generate the event
607+ * ns: timestamp in nanosecond-resolution, starts at system boot time but
608+ * stops during suspend
609+ * migrate_update: GPU page fault is recovered by 'M' for migrate, 'U' for update
610+ * rw: 'W' for write page fault, 'R' for read page fault
611+ * rescheduled: 'R' if the queue restore failed and rescheduled to try again
612+ */
613+ #define KFD_EVENT_FMT_UPDATE_GPU_RESET (reset_seq_num , reset_cause )\
614+ "%x %s\n", (reset_seq_num), (reset_cause)
615+
616+ #define KFD_EVENT_FMT_THERMAL_THROTTLING (bitmask , counter )\
617+ "%llx:%llx\n", (bitmask), (counter)
618+
619+ #define KFD_EVENT_FMT_VMFAULT (pid , task_name )\
620+ "%x:%s\n", (pid), (task_name)
621+
622+ #define KFD_EVENT_FMT_PAGEFAULT_START (ns , pid , addr , node , rw )\
623+ "%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (rw)
624+
625+ #define KFD_EVENT_FMT_PAGEFAULT_END (ns , pid , addr , node , migrate_update )\
626+ "%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (migrate_update)
627+
628+ #define KFD_EVENT_FMT_MIGRATE_START (ns , pid , start , size , from , to , prefetch_loc ,\
629+ preferred_loc , migrate_trigger )\
630+ "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", (ns), (pid), (start), (size),\
631+ (from), (to), (prefetch_loc), (preferred_loc), (migrate_trigger)
632+
633+ #define KFD_EVENT_FMT_MIGRATE_END (ns , pid , start , size , from , to , migrate_trigger )\
634+ "%lld -%d @%lx(%lx) %x->%x %d\n", (ns), (pid), (start), (size),\
635+ (from), (to), (migrate_trigger)
636+
637+ #define KFD_EVENT_FMT_QUEUE_EVICTION (ns , pid , node , evict_trigger )\
638+ "%lld -%d %x %d\n", (ns), (pid), (node), (evict_trigger)
639+
640+ #define KFD_EVENT_FMT_QUEUE_RESTORE (ns , pid , node , rescheduled )\
641+ "%lld -%d %x %c\n", (ns), (pid), (node), (rescheduled)
642+
643+ #define KFD_EVENT_FMT_UNMAP_FROM_GPU (ns , pid , addr , size , node , unmap_trigger )\
644+ "%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\
645+ (node), (unmap_trigger)
646+
573647/**************************************************************************************************
574648 * CRIU IOCTLs (Checkpoint Restore In Userspace)
575649 *
0 commit comments