@@ -176,7 +176,75 @@ void page_table_free_pgste(struct page *page)
176176#endif /* CONFIG_PGSTE */
177177
178178/*
179- * page table entry allocation/free routines.
179+ * A 2KB-pgtable is either upper or lower half of a normal page.
180+ * The second half of the page may be unused or used as another
181+ * 2KB-pgtable.
182+ *
183+ * Whenever possible the parent page for a new 2KB-pgtable is picked
184+ * from the list of partially allocated pages mm_context_t::pgtable_list.
185+ * In case the list is empty a new parent page is allocated and added to
186+ * the list.
187+ *
188+ * When a parent page gets fully allocated it contains 2KB-pgtables in both
189+ * upper and lower halves and is removed from mm_context_t::pgtable_list.
190+ *
191+ * When a 2KB-pgtable is freed from a fully allocated parent page, that
192+ * page turns partially allocated and added to mm_context_t::pgtable_list.
193+ *
194+ * If 2KB-pgtable is freed from the partially allocated parent page that
195+ * page turns unused and gets removed from mm_context_t::pgtable_list.
196+ * Furthermore, the unused parent page is released.
197+ *
198+ * As follows from the above, no unallocated or fully allocated parent
199+ * pages are contained in mm_context_t::pgtable_list.
200+ *
201+ * The upper byte (bits 24-31) of the parent page _refcount is used
202+ * for tracking contained 2KB-pgtables and has the following format:
203+ *
204+ * PP AA
205+ * 01234567 upper byte (bits 24-31) of struct page::_refcount
206+ * || ||
207+ * || |+--- upper 2KB-pgtable is allocated
208+ * || +---- lower 2KB-pgtable is allocated
209+ * |+------- upper 2KB-pgtable is pending for removal
210+ * +-------- lower 2KB-pgtable is pending for removal
211+ *
212+ * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
213+ * using _refcount is possible).
214+ *
215+ * When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
216+ * The parent page is either:
217+ * - added to mm_context_t::pgtable_list in case the second half of the
218+ * parent page is still unallocated;
219+ * - removed from mm_context_t::pgtable_list in case both halves of the
220+ * parent page are allocated;
221+ * These operations are protected with mm_context_t::lock.
222+ *
223+ * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0
224+ * and the corresponding PP bit is set to 1 in a single atomic operation.
225+ * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
226+ * exclusive and may never be both set to 1!
227+ * The parent page is either:
228+ * - added to mm_context_t::pgtable_list in case the second half of the
229+ * parent page is still allocated;
230+ * - removed from mm_context_t::pgtable_list in case the second half of
231+ * the parent page is unallocated;
232+ * These operations are protected with mm_context_t::lock.
233+ *
234+ * It is important to understand that mm_context_t::lock only protects
235+ * mm_context_t::pgtable_list and AA bits, but not the parent page itself
236+ * and PP bits.
237+ *
238+ * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
239+ * while both AA bits and the second PP bit are already unset. Then the
240+ * parent page does not contain any 2KB-pgtable fragment anymore, and it has
241+ * also been removed from mm_context_t::pgtable_list. It is safe to release
242+ * the page therefore.
243+ *
244+ * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
245+ * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
246+ * while the PP bits are never used, nor such a page is added to or removed
247+ * from mm_context_t::pgtable_list.
180248 */
181249unsigned long * page_table_alloc (struct mm_struct * mm )
182250{
@@ -192,14 +260,23 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
192260 page = list_first_entry (& mm -> context .pgtable_list ,
193261 struct page , lru );
194262 mask = atomic_read (& page -> _refcount ) >> 24 ;
195- mask = (mask | (mask >> 4 )) & 3 ;
196- if (mask != 3 ) {
263+ /*
264+ * The pending removal bits must also be checked.
265+ * Failure to do so might lead to an impossible
266+ * value (e.g. 0x13 or 0x23) being written to _refcount.
267+ * Such values violate the assumption that pending and
268+ * allocation bits are mutually exclusive, and the rest
269+ * of the code derails as a result. That could lead to
270+ * a whole bunch of races and corruptions.
271+ */
272+ mask = (mask | (mask >> 4 )) & 0x03U ;
273+ if (mask != 0x03U ) {
197274 table = (unsigned long * ) page_to_virt (page );
198275 bit = mask & 1 ; /* =1 -> second 2K */
199276 if (bit )
200277 table += PTRS_PER_PTE ;
201278 atomic_xor_bits (& page -> _refcount ,
202- 1U << (bit + 24 ));
279+ 0x01U << (bit + 24 ));
203280 list_del (& page -> lru );
204281 }
205282 }
@@ -220,12 +297,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
220297 table = (unsigned long * ) page_to_virt (page );
221298 if (mm_alloc_pgste (mm )) {
222299 /* Return 4K page table with PGSTEs */
223- atomic_xor_bits (& page -> _refcount , 3 << 24 );
300+ atomic_xor_bits (& page -> _refcount , 0x03U << 24 );
224301 memset64 ((u64 * )table , _PAGE_INVALID , PTRS_PER_PTE );
225302 memset64 ((u64 * )table + PTRS_PER_PTE , 0 , PTRS_PER_PTE );
226303 } else {
227304 /* Return the first 2K fragment of the page */
228- atomic_xor_bits (& page -> _refcount , 1 << 24 );
305+ atomic_xor_bits (& page -> _refcount , 0x01U << 24 );
229306 memset64 ((u64 * )table , _PAGE_INVALID , 2 * PTRS_PER_PTE );
230307 spin_lock_bh (& mm -> context .lock );
231308 list_add (& page -> lru , & mm -> context .pgtable_list );
@@ -244,19 +321,24 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
244321 /* Free 2K page table fragment of a 4K page */
245322 bit = ((unsigned long ) table & ~PAGE_MASK )/(PTRS_PER_PTE * sizeof (pte_t ));
246323 spin_lock_bh (& mm -> context .lock );
324+ /*
325+ * Mark the page for delayed release. The actual release
326+ * will happen outside of the critical section from this
327+ * function or from __tlb_remove_table()
328+ */
247329 mask = atomic_xor_bits (& page -> _refcount , 0x11U << (bit + 24 ));
248330 mask >>= 24 ;
249- if (mask & 3 )
331+ if (mask & 0x03U )
250332 list_add (& page -> lru , & mm -> context .pgtable_list );
251333 else
252334 list_del (& page -> lru );
253335 spin_unlock_bh (& mm -> context .lock );
254336 mask = atomic_xor_bits (& page -> _refcount , 0x10U << (bit + 24 ));
255337 mask >>= 24 ;
256- if (mask != 0 )
338+ if (mask != 0x00U )
257339 return ;
258340 } else {
259- atomic_xor_bits (& page -> _refcount , 3U << 24 );
341+ atomic_xor_bits (& page -> _refcount , 0x03U << 24 );
260342 }
261343
262344 pgtable_pte_page_dtor (page );
@@ -274,43 +356,48 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
274356 page = virt_to_page (table );
275357 if (mm_alloc_pgste (mm )) {
276358 gmap_unlink (mm , table , vmaddr );
277- table = (unsigned long * ) ((unsigned long )table | 3 );
359+ table = (unsigned long * ) ((unsigned long )table | 0x03U );
278360 tlb_remove_table (tlb , table );
279361 return ;
280362 }
281363 bit = ((unsigned long ) table & ~PAGE_MASK ) / (PTRS_PER_PTE * sizeof (pte_t ));
282364 spin_lock_bh (& mm -> context .lock );
365+ /*
366+ * Mark the page for delayed release. The actual release will happen
367+ * outside of the critical section from __tlb_remove_table() or from
368+ * page_table_free()
369+ */
283370 mask = atomic_xor_bits (& page -> _refcount , 0x11U << (bit + 24 ));
284371 mask >>= 24 ;
285- if (mask & 3 )
372+ if (mask & 0x03U )
286373 list_add_tail (& page -> lru , & mm -> context .pgtable_list );
287374 else
288375 list_del (& page -> lru );
289376 spin_unlock_bh (& mm -> context .lock );
290- table = (unsigned long * ) ((unsigned long ) table | (1U << bit ));
377+ table = (unsigned long * ) ((unsigned long ) table | (0x01U << bit ));
291378 tlb_remove_table (tlb , table );
292379}
293380
294381void __tlb_remove_table (void * _table )
295382{
296- unsigned int mask = (unsigned long ) _table & 3 ;
383+ unsigned int mask = (unsigned long ) _table & 0x03U ;
297384 void * table = (void * )((unsigned long ) _table ^ mask );
298385 struct page * page = virt_to_page (table );
299386
300387 switch (mask ) {
301- case 0 : /* pmd, pud, or p4d */
388+ case 0x00U : /* pmd, pud, or p4d */
302389 free_pages ((unsigned long ) table , 2 );
303390 break ;
304- case 1 : /* lower 2K of a 4K page table */
305- case 2 : /* higher 2K of a 4K page table */
391+ case 0x01U : /* lower 2K of a 4K page table */
392+ case 0x02U : /* higher 2K of a 4K page table */
306393 mask = atomic_xor_bits (& page -> _refcount , mask << (4 + 24 ));
307394 mask >>= 24 ;
308- if (mask != 0 )
395+ if (mask != 0x00U )
309396 break ;
310397 fallthrough ;
311- case 3 : /* 4K page table with pgstes */
312- if (mask & 3 )
313- atomic_xor_bits (& page -> _refcount , 3 << 24 );
398+ case 0x03U : /* 4K page table with pgstes */
399+ if (mask & 0x03U )
400+ atomic_xor_bits (& page -> _refcount , 0x03U << 24 );
314401 pgtable_pte_page_dtor (page );
315402 __free_page (page );
316403 break ;
0 commit comments