
Commit e7bf7a4

Ricardo Koller authored and Oliver Upton committed
KVM: arm64: Split huge pages when dirty logging is enabled
Split huge pages eagerly when enabling dirty logging. The goal is to avoid doing it while faulting on write-protected pages, which negatively impacts guest performance.

A memslot marked for dirty logging is split in 1GB pieces at a time, both to release the mmu_lock periodically and give other kernel threads the opportunity to run, and to bound the allocation at enough pages to split a 1GB range worth of huge pages (or a single 1GB huge page). Note that these page allocations can fail, so eager page splitting is best-effort. This is not a correctness issue, though, as huge pages can still be split on write faults.

Eager page splitting only takes effect when the huge page mapping already exists in the stage-2 page table. Otherwise, the huge page is simply mapped as multiple non-huge pages at fault time.

The benefits of eager page splitting are the same as on x86, introduced by commit a3fe5db ("KVM: x86/mmu: Split huge pages mapped by the TDP MMU when dirty logging is enabled"). For example, when running dirty_log_perf_test with 64 virtual CPUs (Ampere Altra), 1GB per vCPU, 50% reads, and 2MB HugeTLB memory, the time it takes vCPUs to access all of their memory after dirty logging is enabled decreased by 44%, from 2.58s to 1.42s.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Link: https://lore.kernel.org/r/20230426172330.1439644-10-ricarkol@google.com
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
1 parent ce2b602 commit e7bf7a4

1 file changed: arch/arm64/kvm/mmu.c (123 additions & 4 deletions)
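Before the diff, one point from the commit message is worth quantifying: the "1GB pieces" determine how large the split cache must be. Here is a stand-alone sketch of the arithmetic, mirroring what kvm_mmu_split_nr_page_tables() below computes (illustrative only: the SZ_* and DIV_ROUND_UP helpers are redefined locally rather than taken from kernel headers, and 4KiB granules with 1GiB blocks allowed are assumed):

#include <stdio.h>

#define SZ_4K   (1UL << 12)
#define SZ_2M   (1UL << 21)
#define SZ_1G   (1UL << 30)
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long chunk = SZ_1G;    /* the 1GB "piece" from the commit message */
        unsigned long n = 0;

        /* one level-2 table per 1GiB block being split */
        n += DIV_ROUND_UP(chunk, SZ_1G);
        /* one level-3 table per 2MiB block being split */
        n += DIV_ROUND_UP(chunk, SZ_2M);

        /* prints: 513 page-table pages (2052 KiB) */
        printf("%lu page-table pages (%lu KiB)\n", n, n * SZ_4K / 1024);
        return 0;
}

In other words, a full top-up of the cache is 513 pages, roughly 2MiB, per 1GiB chunk: a modest, bounded allocation.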
@@ -31,14 +31,21 @@ static phys_addr_t __ro_after_init hyp_idmap_vector;
 
 static unsigned long __ro_after_init io_map_base;
 
-static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
+                                           phys_addr_t size)
 {
-        phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
         phys_addr_t boundary = ALIGN_DOWN(addr + size, size);
 
         return (boundary - 1 < end - 1) ? boundary : end;
 }
 
+static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+{
+        phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
+
+        return __stage2_range_addr_end(addr, end, size);
+}
+
 /*
  * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
  * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
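The clamping in __stage2_range_addr_end() deserves a note: ALIGN_DOWN(addr + size, size) yields the next size-aligned boundary after addr, and comparing boundary - 1 with end - 1 stays correct even if addr + size wraps past the top of the address space. A user-space mirror of the helper (an illustrative sketch, not kernel code; the local ALIGN_DOWN only handles power-of-two sizes) walks an unaligned range in 1GiB chunks:

#include <stdio.h>

#define ALIGN_DOWN(x, a)        ((x) & ~((a) - 1))

/* mirror of __stage2_range_addr_end(); power-of-two size only */
static unsigned long range_addr_end(unsigned long addr, unsigned long end,
                                    unsigned long size)
{
        unsigned long boundary = ALIGN_DOWN(addr + size, size);

        /* the -1s keep the comparison safe if addr + size wraps to 0 */
        return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
        unsigned long addr = 0x40100000UL;      /* deliberately unaligned */
        unsigned long end  = 0xc0000000UL;
        unsigned long size = 1UL << 30;         /* 1GiB chunks */
        unsigned long next;

        do {
                next = range_addr_end(addr, end, size);
                /* prints [0x40100000, 0x80000000), then [0x80000000, 0xc0000000) */
                printf("chunk [%#lx, %#lx)\n", addr, next);
        } while (addr = next, addr != end);

        return 0;
}

The first chunk is short, running only up to the first aligned boundary; every later chunk is a full size step, exactly the walk kvm_mmu_split_huge_pages() performs below.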
@@ -75,6 +82,79 @@ static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
 #define stage2_apply_range_resched(mmu, addr, end, fn)                  \
         stage2_apply_range(mmu, addr, end, fn, true)
 
+/*
+ * Get the maximum number of page-tables pages needed to split a range
+ * of blocks into PAGE_SIZE PTEs. It assumes the range is already
+ * mapped at level 2, or at level 1 if allowed.
+ */
+static int kvm_mmu_split_nr_page_tables(u64 range)
+{
+        int n = 0;
+
+        if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
+                n += DIV_ROUND_UP_ULL(range, PUD_SIZE);
+        n += DIV_ROUND_UP_ULL(range, PMD_SIZE);
+        return n;
+}
+
+static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
+{
+        struct kvm_mmu_memory_cache *cache;
+        u64 chunk_size, min;
+
+        if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
+                return true;
+
+        chunk_size = kvm->arch.mmu.split_page_chunk_size;
+        min = kvm_mmu_split_nr_page_tables(chunk_size);
+        cache = &kvm->arch.mmu.split_page_cache;
+        return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
+}
+
+static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
+                                    phys_addr_t end)
+{
+        struct kvm_mmu_memory_cache *cache;
+        struct kvm_pgtable *pgt;
+        int ret, cache_capacity;
+        u64 next, chunk_size;
+
+        lockdep_assert_held_write(&kvm->mmu_lock);
+
+        chunk_size = kvm->arch.mmu.split_page_chunk_size;
+        cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);
+
+        if (chunk_size == 0)
+                return 0;
+
+        cache = &kvm->arch.mmu.split_page_cache;
+
+        do {
+                if (need_split_memcache_topup_or_resched(kvm)) {
+                        write_unlock(&kvm->mmu_lock);
+                        cond_resched();
+                        /* Eager page splitting is best-effort. */
+                        ret = __kvm_mmu_topup_memory_cache(cache,
+                                                           cache_capacity,
+                                                           cache_capacity);
+                        write_lock(&kvm->mmu_lock);
+                        if (ret)
+                                break;
+                }
+
+                pgt = kvm->arch.mmu.pgt;
+                if (!pgt)
+                        return -EINVAL;
+
+                next = __stage2_range_addr_end(addr, end, chunk_size);
+                ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache);
+                if (ret)
+                        break;
+        } while (addr = next, addr != end);
+
+        return ret;
+}
+
 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
 {
         return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
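Two details of the loop above are easy to miss. The cache is topped up to full capacity, enough table pages for an entire chunk, only after dropping the mmu_lock, so no allocation ever happens under the write lock. And because eager splitting is best-effort, a failed top-up (or a failure from kvm_pgtable_stage2_split()) simply ends the walk early; whatever huge pages remain are still split lazily by the write-protection fault path.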
@@ -793,6 +873,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
 void kvm_uninit_stage2_mmu(struct kvm *kvm)
 {
         kvm_free_stage2_pgd(&kvm->arch.mmu);
+        kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
 }
 
 static void stage2_unmap_memslot(struct kvm *kvm,
@@ -1019,6 +1100,34 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
         stage2_wp_range(&kvm->arch.mmu, start, end);
 }
 
+/**
+ * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
+ *                                 pages for memory slot
+ * @kvm: The KVM pointer
+ * @slot: The memory slot to split
+ *
+ * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.
+ */
+static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
+{
+        struct kvm_memslots *slots;
+        struct kvm_memory_slot *memslot;
+        phys_addr_t start, end;
+
+        lockdep_assert_held(&kvm->slots_lock);
+
+        slots = kvm_memslots(kvm);
+        memslot = id_to_memslot(slots, slot);
+
+        start = memslot->base_gfn << PAGE_SHIFT;
+        end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+        write_lock(&kvm->mmu_lock);
+        kvm_mmu_split_huge_pages(kvm, start, end);
+        write_unlock(&kvm->mmu_lock);
+}
+
 /*
  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
  * dirty pages.
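The helper above converts the memslot's frame numbers into a guest-physical range (base_gfn << PAGE_SHIFT up to the end of the slot) and hands the whole range to kvm_mmu_split_huge_pages(); the chunked walk inside that function, not the caller, is what bounds the time spent under the write lock.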
@@ -1812,8 +1921,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                         return;
 
                 /*
-                 * Pages are write-protected on either of these two
-                 * cases:
+                 * Huge and normal pages are write-protected and split
+                 * on either of these two cases:
                  *
                  * 1. with initial-all-set: gradually with CLEAR ioctls,
                  */
@@ -1825,6 +1934,16 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                  *    enabling dirty logging.
                  */
                 kvm_mmu_wp_memory_region(kvm, new->id);
+                kvm_mmu_split_memory_region(kvm, new->id);
+        } else {
+                /*
+                 * Free any leftovers from the eager page splitting cache. Do
+                 * this when deleting, moving, disabling dirty logging, or
+                 * creating the memslot (a nop). Doing it for deletes makes
+                 * sure we don't leak memory, and there's no need to keep the
+                 * cache around for any of the other cases.
+                 */
+                kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
         }
 }
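Taken together with the kvm_uninit_stage2_mmu() hunk earlier, the split_page_cache now has a clear lifecycle: filled on demand while a memslot is being split, released as soon as dirty logging is disabled or the slot is deleted or moved, and freed unconditionally when the stage-2 MMU is torn down.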