@@ -125,17 +125,31 @@ module_param(dbg, bool, 0644);
125125#define PTE_LIST_EXT 14
126126
127127/*
128- * Slight optimization of cacheline layout, by putting `more' and `spte_count'
129- * at the start; then accessing it will only use one single cacheline for
130- * either full (entries==PTE_LIST_EXT) case or entries<=6.
128+ * struct pte_list_desc is the core data structure used to implement a custom
129+ * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
130+ * given GFN when used in the context of rmaps. Using a custom list allows KVM
131+ * to optimize for the common case where many GFNs will have at most a handful
132+ * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
133+ * memory footprint, which in turn improves runtime performance by exploiting
134+ * cache locality.
135+ *
136+ * A list is comprised of one or more pte_list_desc objects (descriptors).
137+ * Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor
138+ * is full and a new SPTE needs to be added, a new descriptor is allocated and
139+ * becomes the head of the list. This means that by definition, all tail
140+ * descriptors are full.
141+ *
142+ * Note, the metadata fields are deliberately placed at the start of the
143+ * structure to optimize the cacheline layout; accessing the descriptor will
144+ * touch only a single cacheline so long as @spte_count<=6 (or if only the
145+ * descriptor's metadata is accessed).
131146 */
132147struct pte_list_desc {
133148 struct pte_list_desc * more ;
134- /*
135- * Stores number of entries stored in the pte_list_desc. No need to be
136- * u64 but just for easier alignment. When PTE_LIST_EXT, means full.
137- */
138- u64 spte_count ;
149+ /* The number of PTEs stored in _this_ descriptor. */
150+ u32 spte_count ;
151+ /* The number of PTEs stored in all tails of this descriptor. */
152+ u32 tail_count ;
139153 u64 * sptes [PTE_LIST_EXT ];
140154};
141155
@@ -929,53 +943,70 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
929943 desc -> sptes [0 ] = (u64 * )rmap_head -> val ;
930944 desc -> sptes [1 ] = spte ;
931945 desc -> spte_count = 2 ;
946+ desc -> tail_count = 0 ;
932947 rmap_head -> val = (unsigned long )desc | 1 ;
933948 ++ count ;
934949 } else {
935950 rmap_printk ("%p %llx many->many\n" , spte , * spte );
936951 desc = (struct pte_list_desc * )(rmap_head -> val & ~1ul );
937- while (desc -> spte_count == PTE_LIST_EXT ) {
938- count += PTE_LIST_EXT ;
939- if (!desc -> more ) {
940- desc -> more = kvm_mmu_memory_cache_alloc (cache );
941- desc = desc -> more ;
942- desc -> spte_count = 0 ;
943- break ;
944- }
945- desc = desc -> more ;
952+ count = desc -> tail_count + desc -> spte_count ;
953+
954+ /*
955+ * If the previous head is full, allocate a new head descriptor
956+ * as tail descriptors are always kept full.
957+ */
958+ if (desc -> spte_count == PTE_LIST_EXT ) {
959+ desc = kvm_mmu_memory_cache_alloc (cache );
960+ desc -> more = (struct pte_list_desc * )(rmap_head -> val & ~1ul );
961+ desc -> spte_count = 0 ;
962+ desc -> tail_count = count ;
963+ rmap_head -> val = (unsigned long )desc | 1 ;
946964 }
947- count += desc -> spte_count ;
948965 desc -> sptes [desc -> spte_count ++ ] = spte ;
949966 }
950967 return count ;
951968}
952969
953970static void
954971pte_list_desc_remove_entry (struct kvm_rmap_head * rmap_head ,
955- struct pte_list_desc * desc , int i ,
956- struct pte_list_desc * prev_desc )
972+ struct pte_list_desc * desc , int i )
957973{
958- int j = desc -> spte_count - 1 ;
974+ struct pte_list_desc * head_desc = (struct pte_list_desc * )(rmap_head -> val & ~1ul );
975+ int j = head_desc -> spte_count - 1 ;
959976
960- desc -> sptes [i ] = desc -> sptes [j ];
961- desc -> sptes [j ] = NULL ;
962- desc -> spte_count -- ;
963- if (desc -> spte_count )
977+ /*
978+ * The head descriptor should never be empty. A new head is added only
979+ * when adding an entry and the previous head is full, and heads are
980+ * removed (this flow) when they become empty.
981+ */
982+ BUG_ON (j < 0 );
983+
984+ /*
985+ * Replace the to-be-freed SPTE with the last valid entry from the head
986+ * descriptor to ensure that tail descriptors are full at all times.
987+ * Note, this also means that tail_count is stable for each descriptor.
988+ */
989+ desc -> sptes [i ] = head_desc -> sptes [j ];
990+ head_desc -> sptes [j ] = NULL ;
991+ head_desc -> spte_count -- ;
992+ if (head_desc -> spte_count )
964993 return ;
965- if (!prev_desc && !desc -> more )
994+
995+ /*
996+ * The head descriptor is empty. If there are no tail descriptors,
997+ * nullify the rmap head to mark the list as empty, else point the rmap
998+ * head at the next descriptor, i.e. the new head.
999+ */
1000+ if (!head_desc -> more )
9661001 rmap_head -> val = 0 ;
9671002 else
968- if (prev_desc )
969- prev_desc -> more = desc -> more ;
970- else
971- rmap_head -> val = (unsigned long )desc -> more | 1 ;
972- mmu_free_pte_list_desc (desc );
1003+ rmap_head -> val = (unsigned long )head_desc -> more | 1 ;
1004+ mmu_free_pte_list_desc (head_desc );
9731005}
9741006
9751007static void pte_list_remove (u64 * spte , struct kvm_rmap_head * rmap_head )
9761008{
9771009 struct pte_list_desc * desc ;
978- struct pte_list_desc * prev_desc ;
9791010 int i ;
9801011
9811012 if (!rmap_head -> val ) {
@@ -991,16 +1022,13 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
9911022 } else {
9921023 rmap_printk ("%p many->many\n" , spte );
9931024 desc = (struct pte_list_desc * )(rmap_head -> val & ~1ul );
994- prev_desc = NULL ;
9951025 while (desc ) {
9961026 for (i = 0 ; i < desc -> spte_count ; ++ i ) {
9971027 if (desc -> sptes [i ] == spte ) {
998- pte_list_desc_remove_entry (rmap_head ,
999- desc , i , prev_desc );
1028+ pte_list_desc_remove_entry (rmap_head , desc , i );
10001029 return ;
10011030 }
10021031 }
1003- prev_desc = desc ;
10041032 desc = desc -> more ;
10051033 }
10061034 pr_err ("%s: %p many->many\n" , __func__ , spte );
@@ -1047,21 +1075,14 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
10471075unsigned int pte_list_count (struct kvm_rmap_head * rmap_head )
10481076{
10491077 struct pte_list_desc * desc ;
1050- unsigned int count = 0 ;
10511078
10521079 if (!rmap_head -> val )
10531080 return 0 ;
10541081 else if (!(rmap_head -> val & 1 ))
10551082 return 1 ;
10561083
10571084 desc = (struct pte_list_desc * )(rmap_head -> val & ~1ul );
1058-
1059- while (desc ) {
1060- count += desc -> spte_count ;
1061- desc = desc -> more ;
1062- }
1063-
1064- return count ;
1085+ return desc -> tail_count + desc -> spte_count ;
10651086}
10661087
10671088static struct kvm_rmap_head * gfn_to_rmap (gfn_t gfn , int level ,
0 commit comments