
Commit 1194372

Alexander Gordeev authored and hca committed
s390/mm: better annotate 2KB pagetable fragments handling
Explicitly encode the immediate value of the pending-removal nibble (bits 31-28) and the tracking nibble (bits 27-24) of the page refcount whenever these nibbles are tested or changed, for better readability. Also, add some comments describing how the fragments are handled.

Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
1 parent c2c2249 commit 1194372
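
For orientation, the two nibbles named in the commit message can be pictured with a tiny standalone C snippet. This is illustrative only; the macro names below are invented for this sketch, while the kernel code in the diff uses the raw constants (0x01U, 0x03U, 0x10U, 0x11U, ...) directly.

#include <stdio.h>

#define AA_NIBBLE 0x0fU  /* bits 27-24: "allocated" tracking bits */
#define PP_NIBBLE 0xf0U  /* bits 31-28: "pending removal" bits    */

int main(void)
{
        /* example upper byte of _refcount: one fragment allocated (0x01),
         * the other fragment pending removal (0x20)                       */
        unsigned int byte = 0x21;

        printf("tracking nibble: 0x%x\n", byte & AA_NIBBLE);        /* 0x1 */
        printf("pending nibble:  0x%x\n", (byte & PP_NIBBLE) >> 4); /* 0x2 */
        return 0;
}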

1 file changed: arch/s390/mm/pgalloc.c

Lines changed: 107 additions & 20 deletions
@@ -176,7 +176,75 @@ void page_table_free_pgste(struct page *page)
 #endif /* CONFIG_PGSTE */
 
 /*
- * page table entry allocation/free routines.
+ * A 2KB-pgtable is either upper or lower half of a normal page.
+ * The second half of the page may be unused or used as another
+ * 2KB-pgtable.
+ *
+ * Whenever possible the parent page for a new 2KB-pgtable is picked
+ * from the list of partially allocated pages mm_context_t::pgtable_list.
+ * In case the list is empty a new parent page is allocated and added to
+ * the list.
+ *
+ * When a parent page gets fully allocated it contains 2KB-pgtables in both
+ * upper and lower halves and is removed from mm_context_t::pgtable_list.
+ *
+ * When a 2KB-pgtable is freed from the fully allocated parent page that
+ * page turns partially allocated and is added to mm_context_t::pgtable_list.
+ *
+ * If a 2KB-pgtable is freed from the partially allocated parent page that
+ * page turns unused and gets removed from mm_context_t::pgtable_list.
+ * Furthermore, the unused parent page is released.
+ *
+ * As follows from the above, no unallocated or fully allocated parent
+ * pages are contained in mm_context_t::pgtable_list.
+ *
+ * The upper byte (bits 24-31) of the parent page _refcount is used
+ * for tracking contained 2KB-pgtables and has the following format:
+ *
+ *   PP  AA
+ * 01234567    upper byte (bits 24-31) of struct page::_refcount
+ *   ||  ||
+ *   ||  |+--- upper 2KB-pgtable is allocated
+ *   ||  +---- lower 2KB-pgtable is allocated
+ *   |+------- upper 2KB-pgtable is pending for removal
+ *   +-------- lower 2KB-pgtable is pending for removal
+ *
+ * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
+ * using _refcount is possible).
+ *
+ * When a 2KB-pgtable is allocated the corresponding AA bit is set to 1.
+ * The parent page is either:
+ *   - added to mm_context_t::pgtable_list in case the second half of the
+ *     parent page is still unallocated;
+ *   - removed from mm_context_t::pgtable_list in case both halves of the
+ *     parent page are allocated;
+ * These operations are protected with mm_context_t::lock.
+ *
+ * When a 2KB-pgtable is deallocated the corresponding AA bit is set to 0
+ * and the corresponding PP bit is set to 1 in a single atomic operation.
+ * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
+ * exclusive and may never be both set to 1!
+ * The parent page is either:
+ *   - added to mm_context_t::pgtable_list in case the second half of the
+ *     parent page is still allocated;
+ *   - removed from mm_context_t::pgtable_list in case the second half of
+ *     the parent page is unallocated;
+ * These operations are protected with mm_context_t::lock.
+ *
+ * It is important to understand that mm_context_t::lock only protects
+ * mm_context_t::pgtable_list and AA bits, but not the parent page itself
+ * and PP bits.
+ *
+ * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
+ * while both AA bits and the second PP bit are already unset. Then the
+ * parent page does not contain any 2KB-pgtable fragment anymore, and it
+ * has also been removed from mm_context_t::pgtable_list. It is therefore
+ * safe to release the page.
+ *
+ * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
+ * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
+ * while the PP bits are never used, nor is such a page added to or removed
+ * from mm_context_t::pgtable_list.
 */
 unsigned long *page_table_alloc(struct mm_struct *mm)
 {
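
The comment above, and all of the functions below, lean on the s390 atomic_xor_bits() helper, which xors the given bits into _refcount and, as the call sites rely on, yields the updated value. As a rough userspace stand-in, for illustration only (xor_bits() is a made-up name and a plain atomic_uint replaces struct page::_refcount):

#include <stdatomic.h>
#include <stdio.h>

/* sketch: atomically xor 'bits' into *v and return the new value */
static unsigned int xor_bits(atomic_uint *v, unsigned int bits)
{
        return atomic_fetch_xor(v, bits) ^ bits;
}

int main(void)
{
        atomic_uint refcount = 0x01U << 24;     /* one 2KB fragment allocated */

        /* free that fragment: clear its AA bit and set its PP bit at once */
        printf("upper byte after free: 0x%02x\n",
               xor_bits(&refcount, 0x11U << 24) >> 24);         /* 0x10 */
        return 0;
}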
@@ -192,14 +260,23 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
                         page = list_first_entry(&mm->context.pgtable_list,
                                                 struct page, lru);
                         mask = atomic_read(&page->_refcount) >> 24;
-                        mask = (mask | (mask >> 4)) & 3;
-                        if (mask != 3) {
+                        /*
+                         * The pending removal bits must also be checked.
+                         * Failure to do so might lead to an impossible
+                         * value (e.g. 0x13 or 0x23) being written to _refcount.
+                         * Such values violate the assumption that pending and
+                         * allocation bits are mutually exclusive, and the rest
+                         * of the code derails as a result. That could lead to
+                         * a whole bunch of races and corruptions.
+                         */
+                        mask = (mask | (mask >> 4)) & 0x03U;
+                        if (mask != 0x03U) {
                                 table = (unsigned long *) page_to_virt(page);
                                 bit = mask & 1;         /* =1 -> second 2K */
                                 if (bit)
                                         table += PTRS_PER_PTE;
                                 atomic_xor_bits(&page->_refcount,
-                                                1U << (bit + 24));
+                                                0x01U << (bit + 24));
                                 list_del(&page->lru);
                         }
                 }
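
The new comment in this hunk is the crux of the change: a half page that is pending removal is just as unavailable as one that is still allocated, so the PP nibble is folded onto the AA nibble before the check. A small standalone illustration of that folding (busy_mask() is a made-up helper name):

#include <stdio.h>

/* fold the pending nibble onto the allocation nibble, as in the hunk above */
static unsigned int busy_mask(unsigned int upper_byte)
{
        return (upper_byte | (upper_byte >> 4)) & 0x03U;
}

int main(void)
{
        /* example upper bytes: 0x01 = one fragment allocated,
         * 0x20 = the other fragment pending removal, 0x21 = both of those */
        unsigned int examples[] = { 0x00U, 0x01U, 0x20U, 0x21U, 0x03U };

        for (unsigned int i = 0; i < sizeof(examples) / sizeof(examples[0]); i++)
                printf("upper byte 0x%02x -> busy mask 0x%x%s\n",
                       examples[i], busy_mask(examples[i]),
                       busy_mask(examples[i]) == 0x03U ? " (page is skipped)" : "");
        return 0;
}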
@@ -220,12 +297,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
         table = (unsigned long *) page_to_virt(page);
         if (mm_alloc_pgste(mm)) {
                 /* Return 4K page table with PGSTEs */
-                atomic_xor_bits(&page->_refcount, 3 << 24);
+                atomic_xor_bits(&page->_refcount, 0x03U << 24);
                 memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
                 memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
         } else {
                 /* Return the first 2K fragment of the page */
-                atomic_xor_bits(&page->_refcount, 1 << 24);
+                atomic_xor_bits(&page->_refcount, 0x01U << 24);
                 memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
                 spin_lock_bh(&mm->context.lock);
                 list_add(&page->lru, &mm->context.pgtable_list);
@@ -244,19 +321,24 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
                 /* Free 2K page table fragment of a 4K page */
                 bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
                 spin_lock_bh(&mm->context.lock);
+                /*
+                 * Mark the page for delayed release. The actual release
+                 * will happen outside of the critical section from this
+                 * function or from __tlb_remove_table()
+                 */
                 mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
                 mask >>= 24;
-                if (mask & 3)
+                if (mask & 0x03U)
                         list_add(&page->lru, &mm->context.pgtable_list);
                 else
                         list_del(&page->lru);
                 spin_unlock_bh(&mm->context.lock);
                 mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
                 mask >>= 24;
-                if (mask != 0)
+                if (mask != 0x00U)
                         return;
         } else {
-                atomic_xor_bits(&page->_refcount, 3U << 24);
+                atomic_xor_bits(&page->_refcount, 0x03U << 24);
         }
 
         pgtable_pte_page_dtor(page);
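
For illustration, the delayed release described by the new comment can be modeled outside the kernel: the first xor clears the AA bit and raises the PP bit under the list lock, the second xor drops the PP bit again, and only a completely clear upper byte lets the parent page go. In the sketch below, xor_bits() is again a stand-in for atomic_xor_bits() and free_fragment() is a made-up name.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static unsigned int xor_bits(atomic_uint *v, unsigned int bits)
{
        return atomic_fetch_xor(v, bits) ^ bits;
}

/* model of the 2K-fragment path of page_table_free(); bit is 0 or 1 */
static bool free_fragment(atomic_uint *refcount, unsigned int bit)
{
        unsigned int mask;

        /* step 1 (under mm_context_t::lock): AA bit -> 0, PP bit -> 1 */
        mask = xor_bits(refcount, 0x11U << (bit + 24)) >> 24;
        /* (mask & 0x03U) would now decide list_add() vs. list_del()   */

        /* step 2 (outside the lock): drop our PP bit again */
        mask = xor_bits(refcount, 0x10U << (bit + 24)) >> 24;
        return mask == 0x00U;   /* true -> parent page may be released */
}

int main(void)
{
        atomic_uint refcount = 0x03U << 24;     /* both fragments allocated */

        printf("release after first free:  %d\n", free_fragment(&refcount, 0));
        printf("release after second free: %d\n", free_fragment(&refcount, 1));
        return 0;
}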
@@ -274,43 +356,48 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
         page = virt_to_page(table);
         if (mm_alloc_pgste(mm)) {
                 gmap_unlink(mm, table, vmaddr);
-                table = (unsigned long *) ((unsigned long)table | 3);
+                table = (unsigned long *) ((unsigned long)table | 0x03U);
                 tlb_remove_table(tlb, table);
                 return;
         }
         bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
         spin_lock_bh(&mm->context.lock);
+        /*
+         * Mark the page for delayed release. The actual release will happen
+         * outside of the critical section from __tlb_remove_table() or from
+         * page_table_free()
+         */
         mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
         mask >>= 24;
-        if (mask & 3)
+        if (mask & 0x03U)
                 list_add_tail(&page->lru, &mm->context.pgtable_list);
         else
                 list_del(&page->lru);
         spin_unlock_bh(&mm->context.lock);
-        table = (unsigned long *) ((unsigned long) table | (1U << bit));
+        table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
         tlb_remove_table(tlb, table);
 }
 
 void __tlb_remove_table(void *_table)
 {
-        unsigned int mask = (unsigned long) _table & 3;
+        unsigned int mask = (unsigned long) _table & 0x03U;
         void *table = (void *)((unsigned long) _table ^ mask);
         struct page *page = virt_to_page(table);
 
         switch (mask) {
-        case 0:         /* pmd, pud, or p4d */
+        case 0x00U:     /* pmd, pud, or p4d */
                 free_pages((unsigned long) table, 2);
                 break;
-        case 1:         /* lower 2K of a 4K page table */
-        case 2:         /* higher 2K of a 4K page table */
+        case 0x01U:     /* lower 2K of a 4K page table */
+        case 0x02U:     /* higher 2K of a 4K page table */
                 mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
                 mask >>= 24;
-                if (mask != 0)
+                if (mask != 0x00U)
                         break;
                 fallthrough;
-        case 3:         /* 4K page table with pgstes */
-                if (mask & 3)
-                        atomic_xor_bits(&page->_refcount, 3 << 24);
+        case 0x03U:     /* 4K page table with pgstes */
+                if (mask & 0x03U)
+                        atomic_xor_bits(&page->_refcount, 0x03U << 24);
                 pgtable_pte_page_dtor(page);
                 __free_page(page);
                 break;
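
Beyond the hex constants, the last two hunks also show how the kind of table is carried over to __tlb_remove_table(): page tables are at least 2KB aligned, so the two low bits of the pointer are free to hold a tag (0x00 pmd/pud/p4d, 0x01/0x02 lower/higher 2K half, 0x03 full 4K table with pgstes). A minimal sketch of that tagging, with a made-up example address:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t table = 0x100000800ULL;        /* hypothetical 2KB-aligned address  */
        uint64_t tagged = table | 0x02U;        /* tag: higher 2K of a 4K page table */

        unsigned int mask = tagged & 0x03U;     /* recover the tag ...               */
        uint64_t addr = tagged ^ mask;          /* ... and the untagged address      */

        printf("tag 0x%02x, table at 0x%llx\n", mask, (unsigned long long)addr);
        return 0;
}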
