Commit 141705b

Lai Jiangshan authored and sean-jc committed
KVM: x86/mmu: Track tail count in pte_list_desc to optimize guest fork()
Rework "struct pte_list_desc" and pte_list_{add|remove} to track the tail count, i.e. number of PTEs in non-head descriptors, and to always keep all tail descriptors full so that adding a new entry and counting the number of entries is done in constant time instead of linear time. No visible performace is changed in tests. But pte_list_add() is no longer shown in the perf result for the COWed pages even the guest forks millions of tasks. Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com> Link: https://lore.kernel.org/r/20230113122910.672417-1-jiangshanlai@gmail.com [sean: reword shortlog, tweak changelog, add lots of comments, add BUG_ON()] Signed-off-by: Sean Christopherson <seanjc@google.com>
1 parent 19ace7d commit 141705b

1 file changed: arch/x86/kvm/mmu/mmu.c (65 additions, 44 deletions)
@@ -125,17 +125,31 @@ module_param(dbg, bool, 0644);
 #define PTE_LIST_EXT 14
 
 /*
- * Slight optimization of cacheline layout, by putting `more' and `spte_count'
- * at the start; then accessing it will only use one single cacheline for
- * either full (entries==PTE_LIST_EXT) case or entries<=6.
+ * struct pte_list_desc is the core data structure used to implement a custom
+ * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
+ * given GFN when used in the context of rmaps.  Using a custom list allows KVM
+ * to optimize for the common case where many GFNs will have at most a handful
+ * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
+ * memory footprint, which in turn improves runtime performance by exploiting
+ * cache locality.
+ *
+ * A list is comprised of one or more pte_list_desc objects (descriptors).
+ * Each individual descriptor stores up to PTE_LIST_EXT SPTEs.  If a descriptor
+ * is full and a new SPTE needs to be added, a new descriptor is allocated and
+ * becomes the head of the list.  This means that, by definition, all tail
+ * descriptors are full.
+ *
+ * Note, the metadata fields are deliberately placed at the start of the
+ * structure to optimize the cacheline layout; accessing the descriptor will
+ * touch only a single cacheline so long as @spte_count<=6 (or if only the
+ * descriptor's metadata is accessed).
 */
 struct pte_list_desc {
 	struct pte_list_desc *more;
-	/*
-	 * Stores number of entries stored in the pte_list_desc.  No need to be
-	 * u64 but just for easier alignment.  When PTE_LIST_EXT, means full.
-	 */
-	u64 spte_count;
+	/* The number of PTEs stored in _this_ descriptor. */
+	u32 spte_count;
+	/* The number of PTEs stored in all tails of this descriptor. */
+	u32 tail_count;
 	u64 *sptes[PTE_LIST_EXT];
 };
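To make the invariant concrete, here is a minimal userspace model of the reworked descriptor (a hypothetical illustration, not part of the commit; the model_* names and stdint types are mine, and the kernel's rmap_head encodings, including the one-entry-without-descriptor case, are elided). Because every tail descriptor is kept full, the head alone carries enough information to count the whole list:

#include <stddef.h>
#include <stdint.h>

#define PTE_LIST_EXT 14

/* Userspace stand-in for the kernel's struct pte_list_desc. */
struct pte_list_desc {
	struct pte_list_desc *more;	/* next (tail) descriptor, or NULL */
	uint32_t spte_count;		/* PTEs stored in _this_ descriptor */
	uint32_t tail_count;		/* PTEs stored in all tail descriptors */
	uint64_t *sptes[PTE_LIST_EXT];
};

/* Tails are always full, so counting is one dereference, i.e. O(1). */
static unsigned int model_count(const struct pte_list_desc *head)
{
	return head ? head->spte_count + head->tail_count : 0;
}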

@@ -929,53 +943,70 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 		desc->sptes[0] = (u64 *)rmap_head->val;
 		desc->sptes[1] = spte;
 		desc->spte_count = 2;
+		desc->tail_count = 0;
 		rmap_head->val = (unsigned long)desc | 1;
 		++count;
 	} else {
 		rmap_printk("%p %llx many->many\n", spte, *spte);
 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-		while (desc->spte_count == PTE_LIST_EXT) {
-			count += PTE_LIST_EXT;
-			if (!desc->more) {
-				desc->more = kvm_mmu_memory_cache_alloc(cache);
-				desc = desc->more;
-				desc->spte_count = 0;
-				break;
-			}
-			desc = desc->more;
+		count = desc->tail_count + desc->spte_count;
+
+		/*
+		 * If the previous head is full, allocate a new head descriptor
+		 * as tail descriptors are always kept full.
+		 */
+		if (desc->spte_count == PTE_LIST_EXT) {
+			desc = kvm_mmu_memory_cache_alloc(cache);
+			desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+			desc->spte_count = 0;
+			desc->tail_count = count;
+			rmap_head->val = (unsigned long)desc | 1;
 		}
-		count += desc->spte_count;
 		desc->sptes[desc->spte_count++] = spte;
 	}
 	return count;
 }
 
 static void
 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
-			   struct pte_list_desc *desc, int i,
-			   struct pte_list_desc *prev_desc)
+			   struct pte_list_desc *desc, int i)
 {
-	int j = desc->spte_count - 1;
+	struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+	int j = head_desc->spte_count - 1;
 
-	desc->sptes[i] = desc->sptes[j];
-	desc->sptes[j] = NULL;
-	desc->spte_count--;
-	if (desc->spte_count)
+	/*
+	 * The head descriptor should never be empty.  A new head is added only
+	 * when adding an entry and the previous head is full, and heads are
+	 * removed (this flow) when they become empty.
+	 */
+	BUG_ON(j < 0);
+
+	/*
+	 * Replace the to-be-freed SPTE with the last valid entry from the head
+	 * descriptor to ensure that tail descriptors are full at all times.
+	 * Note, this also means that tail_count is stable for each descriptor.
+	 */
+	desc->sptes[i] = head_desc->sptes[j];
+	head_desc->sptes[j] = NULL;
+	head_desc->spte_count--;
+	if (head_desc->spte_count)
 		return;
-	if (!prev_desc && !desc->more)
+
+	/*
+	 * The head descriptor is empty.  If there are no tail descriptors,
+	 * nullify the rmap head to mark the list as empty, else point the rmap
+	 * head at the next descriptor, i.e. the new head.
+	 */
+	if (!head_desc->more)
 		rmap_head->val = 0;
 	else
-		if (prev_desc)
-			prev_desc->more = desc->more;
-		else
-			rmap_head->val = (unsigned long)desc->more | 1;
-	mmu_free_pte_list_desc(desc);
+		rmap_head->val = (unsigned long)head_desc->more | 1;
+	mmu_free_pte_list_desc(head_desc);
 }
 
 static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
 {
 	struct pte_list_desc *desc;
-	struct pte_list_desc *prev_desc;
 	int i;
 
 	if (!rmap_head->val) {
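Continuing the userspace model from the sketch above, here is what the new insertion path looks like (hypothetical; model_alloc() stands in for kvm_mmu_memory_cache_alloc(), allocation failure and the rmap_head tag bit are ignored). The key move is that a full head is never walked past; a new empty head is pushed whose tail_count absorbs the old total:

#include <stdlib.h>

/* Stand-in for kvm_mmu_memory_cache_alloc(); zeroed, like the kernel cache. */
static struct pte_list_desc *model_alloc(void)
{
	return calloc(1, sizeof(struct pte_list_desc));
}

/*
 * Mirror of the many->many path of pte_list_add(): when the head is
 * full, push a new empty head whose tail_count absorbs the old total.
 * Descriptors behind the head stay full, so no list walk is needed and
 * the pre-existing entry count is known in constant time.
 */
static unsigned int model_add(struct pte_list_desc **headp, uint64_t *spte)
{
	struct pte_list_desc *head = *headp;
	unsigned int count = head->tail_count + head->spte_count;

	if (head->spte_count == PTE_LIST_EXT) {
		struct pte_list_desc *desc = model_alloc();

		desc->more = head;
		desc->tail_count = count;	/* old head plus its (full) tails */
		*headp = head = desc;
	}
	head->sptes[head->spte_count++] = spte;
	return count;
}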
@@ -991,16 +1022,13 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
 	} else {
 		rmap_printk("%p many->many\n", spte);
 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-		prev_desc = NULL;
 		while (desc) {
 			for (i = 0; i < desc->spte_count; ++i) {
 				if (desc->sptes[i] == spte) {
-					pte_list_desc_remove_entry(rmap_head,
-								   desc, i, prev_desc);
+					pte_list_desc_remove_entry(rmap_head, desc, i);
 					return;
 				}
 			}
-			prev_desc = desc;
 			desc = desc->more;
 		}
 		pr_err("%s: %p many->many\n", __func__, spte);
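Removal in the same model always backfills the hole from the head descriptor, so tails remain full and each descriptor's tail_count never changes (again hypothetical; assert() plays the role of the kernel's BUG_ON(), and the caller finds desc and i by searching, as pte_list_remove() does above):

#include <assert.h>

/*
 * Mirror of pte_list_desc_remove_entry(): overwrite the dying entry
 * with the head's last entry, then shrink the head.  An emptied head
 * is popped, which is the only way a descriptor is ever freed.
 */
static void model_remove_entry(struct pte_list_desc **headp,
			       struct pte_list_desc *desc, int i)
{
	struct pte_list_desc *head = *headp;
	int j = head->spte_count - 1;

	assert(j >= 0);		/* the head is never empty while the list exists */

	desc->sptes[i] = head->sptes[j];
	head->sptes[j] = NULL;
	if (--head->spte_count)
		return;

	*headp = head->more;	/* NULL when the list becomes empty */
	free(head);
}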
@@ -1047,21 +1075,14 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
 {
 	struct pte_list_desc *desc;
-	unsigned int count = 0;
 
 	if (!rmap_head->val)
 		return 0;
 	else if (!(rmap_head->val & 1))
 		return 1;
 
 	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-
-	while (desc) {
-		count += desc->spte_count;
-		desc = desc->more;
-	}
-
-	return count;
+	return desc->tail_count + desc->spte_count;
}
 
 static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
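Putting the model pieces together, a quick smoke test (hypothetical, mine) shows the count staying available in constant time no matter how long the list grows, which is exactly what the new one-line pte_list_count() above relies on:

#include <stdio.h>

int main(void)
{
	static uint64_t fake_sptes[100];
	struct pte_list_desc *head = model_alloc();
	int i;

	for (i = 0; i < 100; i++)
		model_add(&head, &fake_sptes[i]);

	/* 100 entries land in 7 full tails plus a head holding 2. */
	printf("count = %u\n", model_count(head));	/* prints: count = 100 */

	model_remove_entry(&head, head->more, 3);	/* backfills from the head */
	printf("count = %u\n", model_count(head));	/* prints: count = 99 */
	return 0;
}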
