@@ -2369,6 +2369,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 }
 EXPORT_SYMBOL_GPL(task_cgroup_path);
 
+/**
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset) also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ */
+static void cgroup_attach_lock(bool lock_threadgroup)
+{
+        cpus_read_lock();
+        if (lock_threadgroup)
+                percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ */
+static void cgroup_attach_unlock(bool lock_threadgroup)
+{
+        if (lock_threadgroup)
+                percpu_up_write(&cgroup_threadgroup_rwsem);
+        cpus_read_unlock();
+}
+
 /**
  * cgroup_migrate_add_task - add a migration target task to a migration context
  * @task: target task
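To make the ordering rule above concrete, here is a minimal userspace analogue, not kernel code: pthread rwlocks stand in for the CPU hotplug lock and cgroup_threadgroup_rwsem, and all names (hotplug_lock, threadgroup_lock, cpu_up_path, attach_lock) are hypothetical. Because the attach path now takes the hotplug lock first, it uses the same lock order as the CPU bring-up path, so the ABBA inversion described in the comment cannot form:

/* Userspace sketch of the fixed lock order (hypothetical names). */
#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;     /* ~ cpus lock */
static pthread_rwlock_t threadgroup_lock = PTHREAD_RWLOCK_INITIALIZER; /* ~ threadgroup_rwsem */

/* CPU bring-up: hotplug (write) -> threadgroup (read, for task creation). */
static void cpu_up_path(void)
{
        pthread_rwlock_wrlock(&hotplug_lock);
        pthread_rwlock_rdlock(&threadgroup_lock);
        pthread_rwlock_unlock(&threadgroup_lock);
        pthread_rwlock_unlock(&hotplug_lock);
}

/* Attach path after the fix: hotplug (read) is always taken first, so
 * both paths agree on hotplug -> threadgroup and cannot deadlock. */
static void attach_lock(bool lock_threadgroup)
{
        pthread_rwlock_rdlock(&hotplug_lock);
        if (lock_threadgroup)
                pthread_rwlock_wrlock(&threadgroup_lock);
}

static void attach_unlock(bool lock_threadgroup)
{
        if (lock_threadgroup)
                pthread_rwlock_unlock(&threadgroup_lock);
        pthread_rwlock_unlock(&hotplug_lock);
}

int main(void)
{
        attach_lock(true);      /* safe against a concurrent cpu_up_path() */
        attach_unlock(true);
        cpu_up_path();
        return 0;
}

Under the old order, an attach that write-locked threadgroup_lock and then read-locked hotplug_lock could interleave with cpu_up_path() and block forever, which is the deadlock the linked report describes.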
@@ -2841,8 +2882,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 }
 
 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
-                                             bool *locked)
-        __acquires(&cgroup_threadgroup_rwsem)
+                                             bool *threadgroup_locked)
 {
         struct task_struct *tsk;
         pid_t pid;
@@ -2859,12 +2899,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
          * Therefore, we can skip the global lock.
          */
         lockdep_assert_held(&cgroup_mutex);
-        if (pid || threadgroup) {
-                percpu_down_write(&cgroup_threadgroup_rwsem);
-                *locked = true;
-        } else {
-                *locked = false;
-        }
+        *threadgroup_locked = pid || threadgroup;
+        cgroup_attach_lock(*threadgroup_locked);
 
         rcu_read_lock();
         if (pid) {
@@ -2895,26 +2931,23 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
                 goto out_unlock_rcu;
 
 out_unlock_threadgroup:
-        if (*locked) {
-                percpu_up_write(&cgroup_threadgroup_rwsem);
-                *locked = false;
-        }
+        cgroup_attach_unlock(*threadgroup_locked);
+        *threadgroup_locked = false;
 out_unlock_rcu:
         rcu_read_unlock();
         return tsk;
 }
 
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
-        __releases(&cgroup_threadgroup_rwsem)
+void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
 {
         struct cgroup_subsys *ss;
         int ssid;
 
         /* release reference from cgroup_procs_write_start() */
         put_task_struct(task);
 
-        if (locked)
-                percpu_up_write(&cgroup_threadgroup_rwsem);
+        cgroup_attach_unlock(threadgroup_locked);
+
         for_each_subsys(ss, ssid)
                 if (ss->post_attach)
                         ss->post_attach();
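The start/finish pair above threads the locking decision through a single boolean, and the error path both unlocks and clears the flag. A self-contained sketch of that contract, with hypothetical names and counters standing in for the real locks, shows why the clearing matters: it keeps a later unlock from running twice.

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* Counters model cpus_read_lock() and the threadgroup write lock. */
static int cpus_held, threadgroup_held;

static void attach_lock(bool lock_threadgroup)
{
        cpus_held++;                            /* always taken */
        if (lock_threadgroup)
                threadgroup_held++;
}

static void attach_unlock(bool lock_threadgroup)
{
        if (lock_threadgroup) {
                assert(threadgroup_held == 1);  /* catches a double unlock */
                threadgroup_held--;
        }
        assert(cpus_held > 0);
        cpus_held--;
}

struct task { int pid; };

static struct task *lookup_task(int pid)
{
        static struct task t;
        return pid >= 0 ? &t : NULL;            /* stub lookup */
}

static struct task *write_start(int pid, bool threadgroup,
                                bool *threadgroup_locked)
{
        struct task *tsk;

        /* Decide once; the same flag must reach the matching unlock. */
        *threadgroup_locked = pid || threadgroup;
        attach_lock(*threadgroup_locked);

        tsk = lookup_task(pid);
        if (!tsk) {
                /* Error path: unlock here and clear the flag so any
                 * later cleanup cannot unlock a second time. */
                attach_unlock(*threadgroup_locked);
                *threadgroup_locked = false;
        }
        return tsk;
}

static void write_finish(bool threadgroup_locked)
{
        attach_unlock(threadgroup_locked);
}

int main(void)
{
        bool threadgroup_locked;
        struct task *tsk = write_start(42, true, &threadgroup_locked);

        if (tsk)
                write_finish(threadgroup_locked);
        assert(cpus_held == 0 && threadgroup_held == 0);
        return 0;
}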
@@ -3000,8 +3033,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
          * write-locking can be skipped safely.
          */
         has_tasks = !list_empty(&mgctx.preloaded_src_csets);
-        if (has_tasks)
-                percpu_down_write(&cgroup_threadgroup_rwsem);
+        cgroup_attach_lock(has_tasks);
 
         /* NULL dst indicates self on default hierarchy */
         ret = cgroup_migrate_prepare_dst(&mgctx);
@@ -3022,8 +3054,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
         ret = cgroup_migrate_execute(&mgctx);
 out_finish:
         cgroup_migrate_finish(&mgctx);
-        if (has_tasks)
-                percpu_up_write(&cgroup_threadgroup_rwsem);
+        cgroup_attach_unlock(has_tasks);
         return ret;
 }
 
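cgroup_update_dfl_csses() uses the same helpers gated on has_tasks: the cheap cpus read lock now brackets the update unconditionally, keeping the lock order uniform, while the expensive threadgroup write lock is still skipped when there is nothing to migrate. A small standalone sketch of that shape, again with hypothetical userspace names:

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t threadgroup_lock = PTHREAD_RWLOCK_INITIALIZER;

static void update_dfl_csses(bool has_tasks)
{
        pthread_rwlock_rdlock(&hotplug_lock);             /* always */
        if (has_tasks)
                pthread_rwlock_wrlock(&threadgroup_lock); /* only if migrating */

        /* ... prepare and execute the migration ... */

        if (has_tasks)
                pthread_rwlock_unlock(&threadgroup_lock);
        pthread_rwlock_unlock(&hotplug_lock);
}

int main(void)
{
        update_dfl_csses(false);        /* no source css_sets: skip write lock */
        update_dfl_csses(true);
        return 0;
}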
@@ -4971,13 +5002,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
         struct task_struct *task;
         const struct cred *saved_cred;
         ssize_t ret;
-        bool locked;
+        bool threadgroup_locked;
 
         dst_cgrp = cgroup_kn_lock_live(of->kn, false);
         if (!dst_cgrp)
                 return -ENODEV;
 
-        task = cgroup_procs_write_start(buf, threadgroup, &locked);
+        task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
         ret = PTR_ERR_OR_ZERO(task);
         if (ret)
                 goto out_unlock;
@@ -5003,7 +5034,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
         ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
 
 out_finish:
-        cgroup_procs_write_finish(task, locked);
+        cgroup_procs_write_finish(task, threadgroup_locked);
 out_unlock:
         cgroup_kn_unlock(of->kn);
 