Skip to content

Commit b7a5b53

Browse files
gormanmingomolnar
authored and committed
sched/numa: Complete scanning of partial VMAs regardless of PID activity
NUMA Balancing skips VMAs when the current task has not trapped a NUMA fault within the VMA. If the VMA is skipped then mm->numa_scan_offset advances, and a task that is trapping faults within the VMA may never fully update PTEs within the VMA.

Force tasks to update PTEs for partially scanned VMAs. The VMA will be tagged for NUMA hints by some task, but this removes some of the benefit of tracking PID activity within a VMA. A follow-on patch will mitigate this problem.

The test cases and machines evaluated did not trigger the corner case, so the performance results are neutral, with only small changes within the noise from normal test-to-test variance. However, the next patch makes the corner case easier to trigger.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Raghavendra K T <raghavendra.kt@amd.com>
Link: https://lore.kernel.org/r/20231010083143.19593-6-mgorman@techsingularity.net
1 parent 2e2675d commit b7a5b53

3 files changed

Lines changed: 18 additions & 4 deletions

File tree

include/linux/sched/numa_balancing.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ enum numa_vmaskip_reason {
2121
NUMAB_SKIP_INACCESSIBLE,
2222
NUMAB_SKIP_SCAN_DELAY,
2323
NUMAB_SKIP_PID_INACTIVE,
24+
NUMAB_SKIP_IGNORE_PID,
2425
};
2526

2627
#ifdef CONFIG_NUMA_BALANCING

include/trace/events/sched.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -670,7 +670,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
670670
EM( NUMAB_SKIP_SHARED_RO, "shared_ro" ) \
671671
EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \
672672
EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \
673-
EMe(NUMAB_SKIP_PID_INACTIVE, "pid_inactive" )
673+
EM( NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) \
674+
EMe(NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" )
674675

675676
/* Redefine for export. */
676677
#undef EM

kernel/sched/fair.c

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3113,7 +3113,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
31133113
p->mm->numa_scan_offset = 0;
31143114
}
31153115

3116-
static bool vma_is_accessed(struct vm_area_struct *vma)
3116+
static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
31173117
{
31183118
unsigned long pids;
31193119
/*
@@ -3126,7 +3126,19 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
31263126
return true;
31273127

31283128
pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
3129-
return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
3129+
if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
3130+
return true;
3131+
3132+
/*
3133+
* Complete a scan that has already started regardless of PID access, or
3134+
* some VMAs may never be scanned in multi-threaded applications:
3135+
*/
3136+
if (mm->numa_scan_offset > vma->vm_start) {
3137+
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
3138+
return true;
3139+
}
3140+
3141+
return false;
31303142
}
31313143

31323144
#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
@@ -3270,7 +3282,7 @@ static void task_numa_work(struct callback_head *work)
32703282
}
32713283

32723284
/* Do not scan the VMA if task has not accessed */
3273-
if (!vma_is_accessed(vma)) {
3285+
if (!vma_is_accessed(mm, vma)) {
32743286
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
32753287
continue;
32763288
}

0 commit comments

Comments
 (0)