@@ -10269,7 +10269,8 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
1026910269 * Serialization rules:
1027010270 *
1027110271 * mm::mm_cid::mutex: Serializes fork() and exit() and therefore
10272- * protects mm::mm_cid::users.
10272+ * protects mm::mm_cid::users and mode switch
10273+ * transitions
1027310274 *
1027410275 * mm::mm_cid::lock: Serializes mm_update_max_cids() and
1027510276 * mm_update_cpus_allowed(). Nests in mm_cid::mutex
@@ -10285,14 +10286,70 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
1028510286 *
1028610287 * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
1028710288 * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
10288- * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
10289- * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
10290- * task needs to drop the CID into the pool when scheduling out. Both bits
10291- * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
10292- * actually handed over to user space in the RSEQ memory.
10289+ * MM_CID_ONCPU bit set.
10290+ *
10291+ * During the transition of ownership mode, the MM_CID_TRANSIT bit is set
10292+ * on the CIDs. When this bit is set the tasks drop the CID back into the
10293+ * pool when scheduling out.
10294+ *
10295+ * Both bits (ONCPU and TRANSIT) are filtered out by task_cid() when the
10296+ * CID is actually handed over to user space in the RSEQ memory.
1029310297 *
1029410298 * Mode switching:
1029510299 *
10300+ * The ownership mode is per process and stored in mm::mm_cid::mode with the
10301+ * following possible states:
10302+ *
10303+ * 0: Per task ownership
10304+ * 0 | MM_CID_TRANSIT: Transition from per CPU to per task
10305+ * MM_CID_ONCPU: Per CPU ownership
10306+ * MM_CID_ONCPU | MM_CID_TRANSIT: Transition from per task to per CPU
10307+ *
10308+ * All transitions of ownership mode happen in two phases:
10309+ *
10310+ * 1) mm::mm_cid::mode has the MM_CID_TRANSIT bit set. This is OR'ed on the
10311+ * CIDs and denotes that the CID is only temporarily owned by a
10312+ * task. When the task schedules out it drops the CID back into the
10313+ * pool if this bit is set.
10314+ *
10315+ * 2) The initiating context walks the per CPU space or the tasks to fixup
10316+ * or drop the CIDs and after completion it clears MM_CID_TRANSIT in
10317+ * mm::mm_cid::mode. After that point the CIDs are strictly task or CPU
10318+ * owned again.
10319+ *
10320+ * This two phase transition is required to prevent CID space exhaustion
10321+ * during the transition as a direct transfer of ownership would fail:
10322+ *
10323+ * - On task to CPU mode switch if a task is scheduled in on one CPU and
10324+ * then migrated to another CPU before the fixup freed enough per task
10325+ * CIDs.
10326+ *
10327+ * - On CPU to task mode switch if two tasks are scheduled in on the same
10328+ * CPU before the fixup freed per CPU CIDs.
10329+ *
10330+ * Both scenarios can result in a live lock because sched_in() is invoked
10331+ * with runqueue lock held and loops in search of a CID and the fixup
10332+ * thread can't make progress freeing them up because it is stuck on the
10333+ * same runqueue lock.
10334+ *
10335+ * While MM_CID_TRANSIT is active during the transition phase the MM_CID
10336+ * bitmap can be contended, but that's a temporary contention bound to the
10337+ * transition period. After that everything goes back into steady state and
10338+ * nothing except fork() and exit() will touch the bitmap. This is an
10339+ * acceptable tradeoff as it completely avoids complex serialization,
10340+ * memory barriers and atomic operations for the common case.
10341+ *
10342+ * Aside from that, this mechanism also ensures RT compatibility:
10343+ *
10344+ * - The task which runs the fixup is fully preemptible except for the
10345+ * short runqueue lock held sections.
10346+ *
10347+ * - The transient impact of the bitmap contention is only problematic
10348+ * when there is a thundering herd scenario of tasks scheduling in and
10349+ * out concurrently. There is not much which can be done about that
10350+ * except for avoiding mode switching by a proper overall system
10351+ * configuration.
10352+ *
1029610353 * Switching to per CPU mode happens when the user count becomes greater
1029710354 * than the maximum number of CIDs, which is calculated by:
1029810355 *
@@ -10306,12 +10363,13 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
1030610363 *
1030710364 * At the point of switching to per CPU mode the new user is not yet
1030810365 * visible in the system, so the task which initiated the fork() runs the
10309- * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and
10310- * either transfers each tasks owned CID to the CPU the task runs on or
10311- * drops it into the CID pool if a task is not on a CPU at that point in
10312- * time. Tasks which schedule in before the task walk reaches them do the
10313- * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
10314- * it's guaranteed that no task related to that MM owns a CID anymore.
10366+ * fixup function. mm_cid_fixup_tasks_to_cpu() walks the thread list and
10367+ * either marks each task owned CID with MM_CID_TRANSIT if the task is
10368+ * running on a CPU or drops it into the CID pool if a task is not on a
10369+ * CPU. Tasks which schedule in before the task walk reaches them do the
10370+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus()
10371+ * completes it is guaranteed that no task related to that MM owns a CID
10372+ * anymore.
1031510373 *
1031610374 * Switching back to task mode happens when the user count goes below the
1031710375 * threshold which was recorded on the per CPU mode switch:
@@ -10327,28 +10385,11 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
1032710385 * run either in the deferred update function in context of a workqueue or
1032810386 * by a task which forks a new one or by a task which exits. Whatever
1032910387 * happens first. mm_cid_fixup_cpus_to_task() walks through the possible
10330- * CPUs and either transfers the CPU owned CIDs to a related task which
10331- * runs on the CPU or drops it into the pool. Tasks which schedule in on a
10332- * CPU which the walk did not cover yet do the handover themself.
10333- *
10334- * This transition from CPU to per task ownership happens in two phases:
10335- *
10336- * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task
10337- * CID and denotes that the CID is only temporarily owned by the
10338- * task. When it schedules out the task drops the CID back into the
10339- * pool if this bit is set.
10340- *
10341- * 2) The initiating context walks the per CPU space and after completion
10342- * clears mm:mm_cid.transit. So after that point the CIDs are strictly
10343- * task owned again.
10344- *
10345- * This two phase transition is required to prevent CID space exhaustion
10346- * during the transition as a direct transfer of ownership would fail if
10347- * two tasks are scheduled in on the same CPU before the fixup freed per
10348- * CPU CIDs.
10349- *
10350- * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
10351- * related to that MM is owned by a CPU anymore.
10388+ * CPUs and either marks the CPU owned CIDs with MM_CID_TRANSIT if a
10389+ * related task is running on the CPU or drops it into the pool. Tasks
10390+ * which are scheduled in before the fixup covered them do the handover
10391+ * themselves. When mm_cid_fixup_cpus_to_tasks() completes it is guaranteed
10392+ * that no CID related to that MM is owned by a CPU anymore.
1035210393 */
1035310394
1035410395/*
@@ -10379,6 +10420,7 @@ static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
1037910420static bool mm_update_max_cids (struct mm_struct * mm )
1038010421{
1038110422 struct mm_mm_cid * mc = & mm -> mm_cid ;
10423+ bool percpu = cid_on_cpu (mc -> mode );
1038210424
1038310425 lockdep_assert_held (& mm -> mm_cid .lock );
1038410426
@@ -10387,7 +10429,7 @@ static bool mm_update_max_cids(struct mm_struct *mm)
1038710429 __mm_update_max_cids (mc );
1038810430
1038910431 /* Check whether owner mode must be changed */
10390- if (!mc -> percpu ) {
10432+ if (!percpu ) {
1039110433 /* Enable per CPU mode when the number of users is above max_cids */
1039210434 if (mc -> users > mc -> max_cids )
1039310435 mc -> pcpu_thrs = mm_cid_calc_pcpu_thrs (mc );
@@ -10398,12 +10440,17 @@ static bool mm_update_max_cids(struct mm_struct *mm)
1039810440 }
1039910441
1040010442 /* Mode change required? */
10401- if (!! mc -> percpu == !!mc -> pcpu_thrs )
10443+ if (percpu == !!mc -> pcpu_thrs )
1040210444 return false;
10403- /* When switching back to per TASK mode, set the transition flag */
10404- if (!mc -> pcpu_thrs )
10405- WRITE_ONCE (mc -> transit , MM_CID_TRANSIT );
10406- WRITE_ONCE (mc -> percpu , !!mc -> pcpu_thrs );
10445+
10446+ /* Flip the mode and set the transition flag to bridge the transfer */
10447+ WRITE_ONCE (mc -> mode , mc -> mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU ));
10448+ /*
10449+ * Order the store against the subsequent fixups so that
10450+ * acquire(rq::lock) cannot be reordered by the CPU before the
10451+ * store.
10452+ */
10453+ smp_mb ();
1040710454 return true;
1040810455}
1040910456
@@ -10428,7 +10475,7 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
1042810475
1042910476 WRITE_ONCE (mc -> nr_cpus_allowed , weight );
1043010477 __mm_update_max_cids (mc );
10431- if (!mc -> percpu )
10478+ if (!cid_on_cpu ( mc -> mode ) )
1043210479 return ;
1043310480
1043410481 /* Adjust the threshold to the wider set */
@@ -10446,6 +10493,16 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
1044610493 irq_work_queue (& mc -> irq_work );
1044710494}
1044810495
10496+ static inline void mm_cid_complete_transit (struct mm_struct * mm , unsigned int mode )
10497+ {
10498+ /*
10499+ * Ensure that the store removing the TRANSIT bit cannot be
10500+ * reordered by the CPU before the fixups have been completed.
10501+ */
10502+ smp_mb ();
10503+ WRITE_ONCE (mm -> mm_cid .mode , mode );
10504+ }
10505+
1044910506static inline void mm_cid_transit_to_task (struct task_struct * t , struct mm_cid_pcpu * pcp )
1045010507{
1045110508 if (cid_on_cpu (t -> mm_cid .cid )) {
@@ -10489,14 +10546,13 @@ static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
1048910546 }
1049010547 }
1049110548 }
10492- /* Clear the transition bit */
10493- WRITE_ONCE (mm -> mm_cid .transit , 0 );
10549+ mm_cid_complete_transit (mm , 0 );
1049410550}
1049510551
10496- static inline void mm_cid_transfer_to_cpu (struct task_struct * t , struct mm_cid_pcpu * pcp )
10552+ static inline void mm_cid_transit_to_cpu (struct task_struct * t , struct mm_cid_pcpu * pcp )
1049710553{
1049810554 if (cid_on_task (t -> mm_cid .cid )) {
10499- t -> mm_cid .cid = cid_to_cpu_cid (t -> mm_cid .cid );
10555+ t -> mm_cid .cid = cid_to_transit_cid (t -> mm_cid .cid );
1050010556 pcp -> cid = t -> mm_cid .cid ;
1050110557 }
1050210558}
@@ -10509,18 +10565,17 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
1050910565 if (!t -> mm_cid .active )
1051010566 return false;
1051110567 if (cid_on_task (t -> mm_cid .cid )) {
10512- /* If running on the CPU, transfer the CID, otherwise drop it */
10568+ /* If running on the CPU, put the CID in transit mode, otherwise drop it */
1051310569 if (task_rq (t )-> curr == t )
10514- mm_cid_transfer_to_cpu (t , per_cpu_ptr (mm -> mm_cid .pcpu , task_cpu (t )));
10570+ mm_cid_transit_to_cpu (t , per_cpu_ptr (mm -> mm_cid .pcpu , task_cpu (t )));
1051510571 else
1051610572 mm_unset_cid_on_task (t );
1051710573 }
1051810574 return true;
1051910575}
1052010576
10521- static void mm_cid_fixup_tasks_to_cpus ( void )
10577+ static void mm_cid_do_fixup_tasks_to_cpus ( struct mm_struct * mm )
1052210578{
10523- struct mm_struct * mm = current -> mm ;
1052410579 struct task_struct * p , * t ;
1052510580 unsigned int users ;
1052610581
@@ -10558,6 +10613,14 @@ static void mm_cid_fixup_tasks_to_cpus(void)
1055810613 }
1055910614}
1056010615
10616+ static void mm_cid_fixup_tasks_to_cpus (void )
10617+ {
10618+ struct mm_struct * mm = current -> mm ;
10619+
10620+ mm_cid_do_fixup_tasks_to_cpus (mm );
10621+ mm_cid_complete_transit (mm , MM_CID_ONCPU );
10622+ }
10623+
1056110624static bool sched_mm_cid_add_user (struct task_struct * t , struct mm_struct * mm )
1056210625{
1056310626 t -> mm_cid .active = 1 ;
@@ -10586,17 +10649,17 @@ void sched_mm_cid_fork(struct task_struct *t)
1058610649 }
1058710650
1058810651 if (!sched_mm_cid_add_user (t , mm )) {
10589- if (!mm -> mm_cid .percpu )
10652+ if (!cid_on_cpu ( mm -> mm_cid .mode ) )
1059010653 t -> mm_cid .cid = mm_get_cid (mm );
1059110654 return ;
1059210655 }
1059310656
1059410657 /* Handle the mode change and transfer current's CID */
10595- percpu = !! mm -> mm_cid .percpu ;
10658+ percpu = cid_on_cpu ( mm -> mm_cid .mode ) ;
1059610659 if (!percpu )
1059710660 mm_cid_transit_to_task (current , pcp );
1059810661 else
10599- mm_cid_transfer_to_cpu (current , pcp );
10662+ mm_cid_transit_to_cpu (current , pcp );
1060010663 }
1060110664
1060210665 if (percpu ) {
@@ -10631,7 +10694,7 @@ static bool __sched_mm_cid_exit(struct task_struct *t)
1063110694 * affinity change increased the number of allowed CPUs and the
1063210695 * deferred fixup did not run yet.
1063310696 */
10634- if (WARN_ON_ONCE (mm -> mm_cid .percpu ))
10697+ if (WARN_ON_ONCE (cid_on_cpu ( mm -> mm_cid .mode ) ))
1063510698 return false;
1063610699 /*
1063710700 * A failed fork(2) cleanup never gets here, so @current must have
@@ -10664,8 +10727,14 @@ void sched_mm_cid_exit(struct task_struct *t)
1066410727 scoped_guard (raw_spinlock_irq , & mm -> mm_cid .lock ) {
1066510728 if (!__sched_mm_cid_exit (t ))
1066610729 return ;
10667- /* Mode change required. Transfer currents CID */
10668- mm_cid_transit_to_task (current , this_cpu_ptr (mm -> mm_cid .pcpu ));
10730+ /*
10731+ * Mode change. The task has the CID unset
10732+ * already. The CPU CID is still valid and
10733+ * does not have MM_CID_TRANSIT set as the
10734+ * mode change has just taken effect under
10735+ * mm::mm_cid::lock. Drop it.
10736+ */
10737+ mm_drop_cid_on_cpu (mm , this_cpu_ptr (mm -> mm_cid .pcpu ));
1066910738 }
1067010739 mm_cid_fixup_cpus_to_tasks (mm );
1067110740 return ;
@@ -10722,7 +10791,7 @@ static void mm_cid_work_fn(struct work_struct *work)
1072210791 if (!mm_update_max_cids (mm ))
1072310792 return ;
1072410793 /* Affinity changes can only switch back to task mode */
10725- if (WARN_ON_ONCE (mm -> mm_cid .percpu ))
10794+ if (WARN_ON_ONCE (cid_on_cpu ( mm -> mm_cid .mode ) ))
1072610795 return ;
1072710796 }
1072810797 mm_cid_fixup_cpus_to_tasks (mm );
@@ -10743,8 +10812,7 @@ static void mm_cid_irq_work(struct irq_work *work)
1074310812void mm_init_cid (struct mm_struct * mm , struct task_struct * p )
1074410813{
1074510814 mm -> mm_cid .max_cids = 0 ;
10746- mm -> mm_cid .percpu = 0 ;
10747- mm -> mm_cid .transit = 0 ;
10815+ mm -> mm_cid .mode = 0 ;
1074810816 mm -> mm_cid .nr_cpus_allowed = p -> nr_cpus_allowed ;
1074910817 mm -> mm_cid .users = 0 ;
1075010818 mm -> mm_cid .pcpu_thrs = 0 ;
0 commit comments