1 /*
2 * kernel/cpuset.c
3 *
4 * Processor and Memory placement constraints for sets of tasks.
5 *
6 * Copyright (C) 2003 BULL SA.
7 * Copyright (C) 2004-2007 Silicon Graphics, Inc.
8 * Copyright (C) 2006 Google, Inc
9 *
10 * Portions derived from Patrick Mochel's sysfs code.
11 * sysfs is Copyright (c) 2001-3 Patrick Mochel
12 *
13 * 2003-10-10 Written by Simon Derr.
14 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups
17 * 2008 Rework of the scheduler domains and CPU hotplug handling
18 * by Max Krasnyansky
19 *
20 * This file is subject to the terms and conditions of the GNU General Public
21 * License. See the file COPYING in the main directory of the Linux
22 * distribution for more details.
23 */
24 #include "cgroup-internal.h"
25 #include "cpuset-internal.h"
26
27 #include <linux/init.h>
28 #include <linux/interrupt.h>
29 #include <linux/kernel.h>
30 #include <linux/mempolicy.h>
31 #include <linux/mm.h>
32 #include <linux/memory.h>
33 #include <linux/export.h>
34 #include <linux/rcupdate.h>
35 #include <linux/sched.h>
36 #include <linux/sched/deadline.h>
37 #include <linux/sched/mm.h>
38 #include <linux/sched/task.h>
39 #include <linux/security.h>
40 #include <linux/oom.h>
41 #include <linux/sched/isolation.h>
42 #include <linux/wait.h>
43 #include <linux/workqueue.h>
44
45 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
46 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
47
48 /*
49 * There could be abnormal cpuset configurations for cpu or memory
50 * node binding; add this key to provide a quick, low-cost check for
51 * such a situation.
52 */
53 DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
54
55 static const char * const perr_strings[] = {
56 [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive",
57 [PERR_INVPARENT] = "Parent is an invalid partition root",
58 [PERR_NOTPART] = "Parent is not a partition root",
59 [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
60 [PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
61 [PERR_HOTPLUG] = "No cpu available due to hotplug",
62 [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
63 [PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
64 [PERR_ACCESS] = "Enable partition not permitted",
65 };
66
67 /*
68 * Exclusive CPUs distributed out to sub-partitions of top_cpuset
69 */
70 static cpumask_var_t subpartitions_cpus;
71
72 /*
73 * Exclusive CPUs in isolated partitions
74 */
75 static cpumask_var_t isolated_cpus;
76
77 /*
78 * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot
79 */
80 static cpumask_var_t boot_hk_cpus;
81 static bool have_boot_isolcpus;
82
83 /* List of remote partition root children */
84 static struct list_head remote_children;
85
86 /*
87 * A flag to force sched domain rebuild at the end of an operation.
88 * It can be set in
89 * - update_partition_sd_lb()
90 * - remote_partition_check()
91 * - update_cpumasks_hier()
92 * - cpuset_update_flag()
93 * - cpuset_hotplug_update_tasks()
94 * - cpuset_handle_hotplug()
95 *
96 * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
97 *
98 * Note that update_relax_domain_level() in cpuset-v1.c can still call
99 * rebuild_sched_domains_locked() directly without using this flag.
100 */
101 static bool force_sd_rebuild;
102
103 /*
104 * Partition root states:
105 *
106 * 0 - member (not a partition root)
107 * 1 - partition root
108 * 2 - partition root without load balancing (isolated)
109 * -1 - invalid partition root
110 * -2 - invalid isolated partition root
111 *
112 * There are 2 types of partitions - local or remote. Local partitions are
113 * those whose parents are partition roots themselves. Setting
114 * cpuset.cpus.exclusive is optional when setting up a local partition.
115 * Remote partitions are those whose parents are not partition roots. Passing
116 * down exclusive CPUs by setting cpuset.cpus.exclusive along the ancestor
117 * nodes is mandatory when creating a remote partition.
118 *
119 * For simplicity, a local partition can be created under a local or remote
120 * partition but a remote partition cannot have any partition root in its
121 * ancestor chain except the cgroup root.
122 */
123 #define PRS_MEMBER 0
124 #define PRS_ROOT 1
125 #define PRS_ISOLATED 2
126 #define PRS_INVALID_ROOT -1
127 #define PRS_INVALID_ISOLATED -2
128
129 static inline bool is_prs_invalid(int prs_state)
130 {
131 return prs_state < 0;
132 }
133
134 /*
135 * Temporary cpumasks for working with partitions that are passed among
136 * functions to avoid memory allocation in inner functions.
137 */
138 struct tmpmasks {
139 cpumask_var_t addmask, delmask; /* For partition root */
140 cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
141 };
142
143 void inc_dl_tasks_cs(struct task_struct *p)
144 {
145 struct cpuset *cs = task_cs(p);
146
147 cs->nr_deadline_tasks++;
148 }
149
150 void dec_dl_tasks_cs(struct task_struct *p)
151 {
152 struct cpuset *cs = task_cs(p);
153
154 cs->nr_deadline_tasks--;
155 }
156
157 static inline int is_partition_valid(const struct cpuset *cs)
158 {
159 return cs->partition_root_state > 0;
160 }
161
162 static inline int is_partition_invalid(const struct cpuset *cs)
163 {
164 return cs->partition_root_state < 0;
165 }
166
167 /*
168 * Callers should hold callback_lock to modify partition_root_state.
169 */
170 static inline void make_partition_invalid(struct cpuset *cs)
171 {
172 if (cs->partition_root_state > 0)
173 cs->partition_root_state = -cs->partition_root_state;
174 }
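/*
 * Illustrative note (not part of the original source): the signed PRS
 * encoding above means a valid state and its invalid counterpart share
 * the same magnitude, so invalidation is just a sign flip and the prior
 * valid state can be recovered by negating again:
 *
 *	PRS_ROOT (1)     <-> PRS_INVALID_ROOT (-1)
 *	PRS_ISOLATED (2) <-> PRS_INVALID_ISOLATED (-2)
 *
 *	make_partition_invalid(cs);		// e.g. 2 -> -2
 *	cs->partition_root_state = -cs->partition_root_state; // -2 -> 2
 */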
175
176 /*
177 * Send a notification event whenever partition_root_state changes.
178 */
179 static inline void notify_partition_change(struct cpuset *cs, int old_prs)
180 {
181 if (old_prs == cs->partition_root_state)
182 return;
183 cgroup_file_notify(&cs->partition_file);
184
185 /* Reset prs_err if not invalid */
186 if (is_partition_valid(cs))
187 WRITE_ONCE(cs->prs_err, PERR_NONE);
188 }
189
190 static struct cpuset top_cpuset = {
191 .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
192 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
193 .partition_root_state = PRS_ROOT,
194 .relax_domain_level = -1,
195 .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
196 };
197
198 /*
199 * There are two global locks guarding cpuset structures - cpuset_mutex and
200 * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
201 * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
202 * structures. Note that cpuset_mutex needs to be a mutex as it is used in
203 * paths that rely on priority inheritance (e.g. scheduler - on RT) for
204 * correctness.
205 *
206 * A task must hold both locks to modify cpusets. If a task holds
207 * cpuset_mutex, it blocks others, ensuring that it is the only task able to
208 * also acquire callback_lock and be able to modify cpusets. It can perform
209 * various checks on the cpuset structure first, knowing nothing will change.
210 * It can also allocate memory while just holding cpuset_mutex. While it is
211 * performing these checks, various callback routines can briefly acquire
212 * callback_lock to query cpusets. Once it is ready to make the changes, it
213 * takes callback_lock, blocking everyone else.
214 *
215 * Calls to the kernel memory allocator can not be made while holding
216 * callback_lock, as that would risk double tripping on callback_lock
217 * from one of the callbacks into the cpuset code from within
218 * __alloc_pages().
219 *
220 * If a task is only holding callback_lock, then it has read-only
221 * access to cpusets.
222 *
223 * Now, the task_struct fields mems_allowed and mempolicy may be changed
224 * by another task, so we use alloc_lock in the task_struct to protect
225 * them.
226 *
227 * The cpuset_common_seq_show() handlers only hold callback_lock across
228 * small pieces of code, such as when reading out possibly multi-word
229 * cpumasks and nodemasks.
230 */
231
232 static DEFINE_MUTEX(cpuset_mutex);
233
234 void cpuset_lock(void)
235 {
236 mutex_lock(&cpuset_mutex);
237 }
238
239 void cpuset_unlock(void)
240 {
241 mutex_unlock(&cpuset_mutex);
242 }
243
244 static DEFINE_SPINLOCK(callback_lock);
245
246 void cpuset_callback_lock_irq(void)
247 {
248 spin_lock_irq(&callback_lock);
249 }
250
251 void cpuset_callback_unlock_irq(void)
252 {
253 spin_unlock_irq(&callback_lock);
254 }
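/*
 * Minimal usage sketch (illustrative only; the surrounding caller is
 * hypothetical): per the locking rules above, an external subsystem
 * that only needs a stable view of cpuset data takes cpuset_mutex via
 * the wrappers, while interrupt-safe read-side peeks from callbacks use
 * callback_lock:
 *
 *	cpuset_lock();
 *	// ... read cpuset state; no other task can modify cpusets ...
 *	cpuset_unlock();
 *
 *	cpuset_callback_lock_irq();
 *	// ... short read-only access from a callback path ...
 *	cpuset_callback_unlock_irq();
 */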
255
256 static struct workqueue_struct *cpuset_migrate_mm_wq;
257
258 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
259
260 static inline void check_insane_mems_config(nodemask_t *nodes)
261 {
262 if (!cpusets_insane_config() &&
263 movable_only_nodes(nodes)) {
264 static_branch_enable(&cpusets_insane_config_key);
265 pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
266 "Cpuset allocations might fail even with a lot of memory available.\n",
267 nodemask_pr_args(nodes));
268 }
269 }
270
271 /*
272 * decrease cs->attach_in_progress.
273 * wake_up cpuset_attach_wq if cs->attach_in_progress==0.
274 */
275 static inline void dec_attach_in_progress_locked(struct cpuset *cs)
276 {
277 lockdep_assert_held(&cpuset_mutex);
278
279 cs->attach_in_progress--;
280 if (!cs->attach_in_progress)
281 wake_up(&cpuset_attach_wq);
282 }
283
284 static inline void dec_attach_in_progress(struct cpuset *cs)
285 {
286 mutex_lock(&cpuset_mutex);
287 dec_attach_in_progress_locked(cs);
288 mutex_unlock(&cpuset_mutex);
289 }
290
291 static inline bool cpuset_v2(void)
292 {
293 return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
294 cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
295 }
296
297 /*
298 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
299 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
300 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
301 * With v2 behavior, "cpus" and "mems" are always what the users have
302 * requested and won't be changed by hotplug events. Only the effective
303 * cpus or mems will be affected.
304 */
305 static inline bool is_in_v2_mode(void)
306 {
307 return cpuset_v2() ||
308 (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
309 }
310
311 /**
312 * partition_is_populated - check if partition has tasks
313 * @cs: partition root to be checked
314 * @excluded_child: a child cpuset to be excluded in task checking
315 * Return: true if there are tasks, false otherwise
316 *
317 * It is assumed that @cs is a valid partition root. @excluded_child should
318 * be non-NULL when this cpuset is going to become a partition itself.
319 */
320 static inline bool partition_is_populated(struct cpuset *cs,
321 struct cpuset *excluded_child)
322 {
323 struct cgroup_subsys_state *css;
324 struct cpuset *child;
325
326 if (cs->css.cgroup->nr_populated_csets)
327 return true;
328 if (!excluded_child && !cs->nr_subparts)
329 return cgroup_is_populated(cs->css.cgroup);
330
331 rcu_read_lock();
332 cpuset_for_each_child(child, css, cs) {
333 if (child == excluded_child)
334 continue;
335 if (is_partition_valid(child))
336 continue;
337 if (cgroup_is_populated(child->css.cgroup)) {
338 rcu_read_unlock();
339 return true;
340 }
341 }
342 rcu_read_unlock();
343 return false;
344 }
345
346 /*
347 * Return in pmask the portion of a task's cpuset's cpus_allowed that
348 * are online and are capable of running the task. If none are found,
349 * walk up the cpuset hierarchy until we find one that does have some
350 * appropriate cpus.
351 *
352 * One way or another, we guarantee to return some non-empty subset
353 * of cpu_online_mask.
354 *
355 * Call with callback_lock or cpuset_mutex held.
356 */
357 static void guarantee_online_cpus(struct task_struct *tsk,
358 struct cpumask *pmask)
359 {
360 const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
361 struct cpuset *cs;
362
363 if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
364 cpumask_copy(pmask, cpu_online_mask);
365
366 rcu_read_lock();
367 cs = task_cs(tsk);
368
369 while (!cpumask_intersects(cs->effective_cpus, pmask))
370 cs = parent_cs(cs);
371
372 cpumask_and(pmask, pmask, cs->effective_cpus);
373 rcu_read_unlock();
374 }
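/*
 * Worked example (hypothetical CPU numbers): if the task's cpuset has
 * effective_cpus = 4-7 but those CPUs have just gone offline while the
 * parent still has effective_cpus = 0-7, the loop above walks up one
 * level and pmask becomes 0-3, the online portion of the parent's mask.
 * The result is therefore never empty as long as cpu_online_mask isn't.
 */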
375
376 /*
377 * Return in *pmask the portion of a cpuset's mems_allowed that
378 * are online, with memory. If none are online with memory, walk
379 * up the cpuset hierarchy until we find one that does have some
380 * online mems. The top cpuset always has some mems online.
381 *
382 * One way or another, we guarantee to return some non-empty subset
383 * of node_states[N_MEMORY].
384 *
385 * Call with callback_lock or cpuset_mutex held.
386 */
387 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
388 {
389 while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
390 cs = parent_cs(cs);
391 nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
392 }
393
394 /**
395 * alloc_cpumasks - allocate four cpumasks for a cpuset (or three for @tmp)
396 * @cs: the cpuset that has cpumasks to be allocated.
397 * @tmp: the tmpmasks structure pointer
398 * Return: 0 if successful, -ENOMEM otherwise.
399 *
400 * Only one of the two input arguments should be non-NULL.
401 */
402 static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
403 {
404 cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4;
405
406 if (cs) {
407 pmask1 = &cs->cpus_allowed;
408 pmask2 = &cs->effective_cpus;
409 pmask3 = &cs->effective_xcpus;
410 pmask4 = &cs->exclusive_cpus;
411 } else {
412 pmask1 = &tmp->new_cpus;
413 pmask2 = &tmp->addmask;
414 pmask3 = &tmp->delmask;
415 pmask4 = NULL;
416 }
417
418 if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
419 return -ENOMEM;
420
421 if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
422 goto free_one;
423
424 if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
425 goto free_two;
426
427 if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
428 goto free_three;
429
430
431 return 0;
432
433 free_three:
434 free_cpumask_var(*pmask3);
435 free_two:
436 free_cpumask_var(*pmask2);
437 free_one:
438 free_cpumask_var(*pmask1);
439 return -ENOMEM;
440 }
441
442 /**
443 * free_cpumasks - free cpumasks in a cpuset and/or a tmpmasks structure
444 * @cs: the cpuset that has cpumasks to be freed.
445 * @tmp: the tmpmasks structure pointer
446 */
447 static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
448 {
449 if (cs) {
450 free_cpumask_var(cs->cpus_allowed);
451 free_cpumask_var(cs->effective_cpus);
452 free_cpumask_var(cs->effective_xcpus);
453 free_cpumask_var(cs->exclusive_cpus);
454 }
455 if (tmp) {
456 free_cpumask_var(tmp->new_cpus);
457 free_cpumask_var(tmp->addmask);
458 free_cpumask_var(tmp->delmask);
459 }
460 }
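/*
 * Typical pairing sketch for the helpers above (callers later in this
 * file follow this shape when they need scratch masks):
 *
 *	struct tmpmasks tmp;
 *
 *	if (alloc_cpumasks(NULL, &tmp))	// allocates new_cpus/addmask/delmask
 *		return -ENOMEM;
 *	// ... use tmp.new_cpus, tmp.addmask, tmp.delmask ...
 *	free_cpumasks(NULL, &tmp);	// frees only the three temp masks
 */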
461
462 /**
463 * alloc_trial_cpuset - allocate a trial cpuset
464 * @cs: the cpuset that the trial cpuset duplicates
465 */
466 static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
467 {
468 struct cpuset *trial;
469
470 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
471 if (!trial)
472 return NULL;
473
474 if (alloc_cpumasks(trial, NULL)) {
475 kfree(trial);
476 return NULL;
477 }
478
479 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
480 cpumask_copy(trial->effective_cpus, cs->effective_cpus);
481 cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
482 cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
483 return trial;
484 }
485
486 /**
487 * free_cpuset - free the cpuset
488 * @cs: the cpuset to be freed
489 */
490 static inline void free_cpuset(struct cpuset *cs)
491 {
492 free_cpumasks(cs, NULL);
493 kfree(cs);
494 }
495
496 /* Return user specified exclusive CPUs */
497 static inline struct cpumask *user_xcpus(struct cpuset *cs)
498 {
499 return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
500 : cs->exclusive_cpus;
501 }
502
503 static inline bool xcpus_empty(struct cpuset *cs)
504 {
505 return cpumask_empty(cs->cpus_allowed) &&
506 cpumask_empty(cs->exclusive_cpus);
507 }
508
509 /*
510 * cpusets_are_exclusive() - check if two cpusets are exclusive
511 *
512 * Return true if exclusive, false if not
513 */
514 static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
515 {
516 struct cpumask *xcpus1 = user_xcpus(cs1);
517 struct cpumask *xcpus2 = user_xcpus(cs2);
518
519 if (cpumask_intersects(xcpus1, xcpus2))
520 return false;
521 return true;
522 }
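/*
 * Example (hypothetical masks): if cs1->exclusive_cpus is empty and
 * cs1->cpus_allowed = 0-3, user_xcpus(cs1) falls back to 0-3. With
 * cs2->exclusive_cpus = 2-5, cpusets_are_exclusive(cs1, cs2) returns
 * false because CPUs 2-3 appear in both effective exclusive sets.
 */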
523
524 /*
525 * validate_change() - Used to validate that any proposed cpuset change
526 * follows the structural rules for cpusets.
527 *
528 * If we replaced the flag and mask values of the current cpuset
529 * (cur) with those values in the trial cpuset (trial), would
530 * our various subset and exclusive rules still be valid? Presumes
531 * cpuset_mutex held.
532 *
533 * 'cur' is the address of an actual, in-use cpuset. Operations
534 * such as list traversal that depend on the actual address of the
535 * cpuset in the list must use cur below, not trial.
536 *
537 * 'trial' is the address of bulk structure copy of cur, with
538 * perhaps one or more of the fields cpus_allowed, mems_allowed,
539 * or flags changed to new, trial values.
540 *
541 * Return 0 if valid, -errno if not.
542 */
543
544 static int validate_change(struct cpuset *cur, struct cpuset *trial)
545 {
546 struct cgroup_subsys_state *css;
547 struct cpuset *c, *par;
548 int ret = 0;
549
550 rcu_read_lock();
551
552 if (!is_in_v2_mode())
553 ret = cpuset1_validate_change(cur, trial);
554 if (ret)
555 goto out;
556
557 /* Remaining checks don't apply to root cpuset */
558 if (cur == &top_cpuset)
559 goto out;
560
561 par = parent_cs(cur);
562
563 /*
564 * Cpusets with tasks - existing or newly being attached - can't
565 * be changed to have empty cpus_allowed or mems_allowed.
566 */
567 ret = -ENOSPC;
568 if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
569 if (!cpumask_empty(cur->cpus_allowed) &&
570 cpumask_empty(trial->cpus_allowed))
571 goto out;
572 if (!nodes_empty(cur->mems_allowed) &&
573 nodes_empty(trial->mems_allowed))
574 goto out;
575 }
576
577 /*
578 * We can't shrink if we won't have enough room for SCHED_DEADLINE
579 * tasks. This check is not done when scheduling is disabled as the
580 * users should know what they are doing.
581 *
582 * For v1, effective_cpus == cpus_allowed & user_xcpus() returns
583 * cpus_allowed.
584 *
585 * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only
586 * for non-isolated partition root. At this point, the target
587 * effective_cpus isn't computed yet. user_xcpus() is the best
588 * approximation.
589 *
590 * TBD: May need to precompute the real effective_cpus here in case
591 * incorrect scheduling of SCHED_DEADLINE tasks in a partition
592 * becomes an issue.
593 */
594 ret = -EBUSY;
595 if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&
596 !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))
597 goto out;
598
599 /*
600 * If either I or some sibling (!= me) is exclusive, we can't
601 * overlap. exclusive_cpus cannot overlap with each other if set.
602 */
603 ret = -EINVAL;
604 cpuset_for_each_child(c, css, par) {
605 bool txset, cxset; /* Are exclusive_cpus set? */
606
607 if (c == cur)
608 continue;
609
610 txset = !cpumask_empty(trial->exclusive_cpus);
611 cxset = !cpumask_empty(c->exclusive_cpus);
612 if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) ||
613 (txset && cxset)) {
614 if (!cpusets_are_exclusive(trial, c))
615 goto out;
616 } else if (txset || cxset) {
617 struct cpumask *xcpus, *acpus;
618
619 /*
620 * When just one of the two exclusive_cpus is set,
621 * the cpus_allowed of the other cpuset, if set, cannot be
622 * a subset of it, or none of those CPUs will be
623 * available if these exclusive CPUs are activated.
624 */
625 if (txset) {
626 xcpus = trial->exclusive_cpus;
627 acpus = c->cpus_allowed;
628 } else {
629 xcpus = c->exclusive_cpus;
630 acpus = trial->cpus_allowed;
631 }
632 if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus))
633 goto out;
634 }
635 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
636 nodes_intersects(trial->mems_allowed, c->mems_allowed))
637 goto out;
638 }
639
640 ret = 0;
641 out:
642 rcu_read_unlock();
643 return ret;
644 }
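/*
 * Sketch of the trial-cpuset pattern that validate_change() expects
 * (illustrative only; new_mask and the error handling are hypothetical,
 * the real update paths elsewhere in this file follow the same shape):
 *
 *	struct cpuset *trial = alloc_trial_cpuset(cs);
 *
 *	if (!trial)
 *		return -ENOMEM;
 *	cpumask_copy(trial->cpus_allowed, new_mask); // propose the change
 *	ret = validate_change(cs, trial);	     // check subset/exclusivity
 *	if (!ret) {
 *		// commit trial's values to cs under callback_lock
 *	}
 *	free_cpuset(trial);
 */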
645
646 #ifdef CONFIG_SMP
647 /*
648 * Helper routine for generate_sched_domains().
649 * Do cpusets a, b have overlapping effective_cpus masks?
650 */
651 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
652 {
653 return cpumask_intersects(a->effective_cpus, b->effective_cpus);
654 }
655
656 static void
657 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
658 {
659 if (dattr->relax_domain_level < c->relax_domain_level)
660 dattr->relax_domain_level = c->relax_domain_level;
661 return;
662 }
663
664 static void update_domain_attr_tree(struct sched_domain_attr *dattr,
665 struct cpuset *root_cs)
666 {
667 struct cpuset *cp;
668 struct cgroup_subsys_state *pos_css;
669
670 rcu_read_lock();
671 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
672 /* skip the whole subtree if @cp doesn't have any CPU */
673 if (cpumask_empty(cp->cpus_allowed)) {
674 pos_css = css_rightmost_descendant(pos_css);
675 continue;
676 }
677
678 if (is_sched_load_balance(cp))
679 update_domain_attr(dattr, cp);
680 }
681 rcu_read_unlock();
682 }
683
684 /* Must be called with cpuset_mutex held. */
685 static inline int nr_cpusets(void)
686 {
687 /* jump label reference count + the top-level cpuset */
688 return static_key_count(&cpusets_enabled_key.key) + 1;
689 }
690
691 /*
692 * generate_sched_domains()
693 *
694 * This function builds a partial partition of the system's CPUs.
695 * A 'partial partition' is a set of non-overlapping subsets whose
696 * union is a subset of that set.
697 * The output of this function needs to be passed to kernel/sched/core.c
698 * partition_sched_domains() routine, which will rebuild the scheduler's
699 * load balancing domains (sched domains) as specified by that partial
700 * partition.
701 *
702 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
703 * for a background explanation of this.
704 *
705 * Does not return errors, on the theory that the callers of this
706 * routine would rather not worry about failures to rebuild sched
707 * domains when operating in the severe memory shortage situations
708 * that could cause allocation failures below.
709 *
710 * Must be called with cpuset_mutex held.
711 *
712 * The three key local variables below are:
713 * cp - cpuset pointer, used (together with pos_css) to perform a
714 * top-down scan of all cpusets. For our purposes, rebuilding
715 * the schedulers sched domains, we can ignore !is_sched_load_
716 * balance cpusets.
717 * csa - (for CpuSet Array) Array of pointers to all the cpusets
718 * that need to be load balanced, for convenient iterative
719 * access by the subsequent code that finds the best partition,
720 * i.e. the set of domains (subsets) of CPUs such that the
721 * cpus_allowed of every cpuset marked is_sched_load_balance
722 * is a subset of one of these domains, while there are as
723 * many such domains as possible, each as small as possible.
724 * doms - Conversion of 'csa' to an array of cpumasks, for passing to
725 * the kernel/sched/core.c routine partition_sched_domains() in a
726 * convenient format, that can be easily compared to the prior
727 * value to determine what partition elements (sched domains)
728 * were changed (added or removed.)
729 *
730 * Finding the best partition (set of domains):
731 * The double nested loops below over i, j scan over the load
732 * balanced cpusets (using the array of cpuset pointers in csa[])
733 * looking for pairs of cpusets that have overlapping cpus_allowed
734 * and merging them using a union-find algorithm.
735 *
736 * The union of the cpus_allowed masks from the set of all cpusets
737 * having the same root then forms one element of the partition
738 * (one sched domain) to be passed to partition_sched_domains().
739 *
740 */
741 static int generate_sched_domains(cpumask_var_t **domains,
742 struct sched_domain_attr **attributes)
743 {
744 struct cpuset *cp; /* top-down scan of cpusets */
745 struct cpuset **csa; /* array of all cpuset ptrs */
746 int csn; /* how many cpuset ptrs in csa so far */
747 int i, j; /* indices for partition finding loops */
748 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
749 struct sched_domain_attr *dattr; /* attributes for custom domains */
750 int ndoms = 0; /* number of sched domains in result */
751 int nslot; /* next empty doms[] struct cpumask slot */
752 struct cgroup_subsys_state *pos_css;
753 bool root_load_balance = is_sched_load_balance(&top_cpuset);
754 bool cgrpv2 = cpuset_v2();
755 int nslot_update;
756
757 doms = NULL;
758 dattr = NULL;
759 csa = NULL;
760
761 /* Special case for the 99% of systems with one, full, sched domain */
762 if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
763 single_root_domain:
764 ndoms = 1;
765 doms = alloc_sched_domains(ndoms);
766 if (!doms)
767 goto done;
768
769 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
770 if (dattr) {
771 *dattr = SD_ATTR_INIT;
772 update_domain_attr_tree(dattr, &top_cpuset);
773 }
774 cpumask_and(doms[0], top_cpuset.effective_cpus,
775 housekeeping_cpumask(HK_TYPE_DOMAIN));
776
777 goto done;
778 }
779
780 csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
781 if (!csa)
782 goto done;
783 csn = 0;
784
785 rcu_read_lock();
786 if (root_load_balance)
787 csa[csn++] = &top_cpuset;
788 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
789 if (cp == &top_cpuset)
790 continue;
791
792 if (cgrpv2)
793 goto v2;
794
795 /*
796 * v1:
797 * Continue traversing beyond @cp iff @cp has some CPUs and
798 * isn't load balancing. The former is obvious. The
799 * latter: All child cpusets contain a subset of the
800 * parent's cpus, so just skip them, and then we call
801 * update_domain_attr_tree() to calc relax_domain_level of
802 * the corresponding sched domain.
803 */
804 if (!cpumask_empty(cp->cpus_allowed) &&
805 !(is_sched_load_balance(cp) &&
806 cpumask_intersects(cp->cpus_allowed,
807 housekeeping_cpumask(HK_TYPE_DOMAIN))))
808 continue;
809
810 if (is_sched_load_balance(cp) &&
811 !cpumask_empty(cp->effective_cpus))
812 csa[csn++] = cp;
813
814 /* skip @cp's subtree */
815 pos_css = css_rightmost_descendant(pos_css);
816 continue;
817
818 v2:
819 /*
820 * Only valid partition roots that are not isolated and with
821 * non-empty effective_cpus will be saved into csa[].
822 */
823 if ((cp->partition_root_state == PRS_ROOT) &&
824 !cpumask_empty(cp->effective_cpus))
825 csa[csn++] = cp;
826
827 /*
828 * Skip @cp's subtree if not a partition root and has no
829 * exclusive CPUs to be granted to child cpusets.
830 */
831 if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
832 pos_css = css_rightmost_descendant(pos_css);
833 }
834 rcu_read_unlock();
835
836 /*
837 * If there are only isolated partitions underneath the cgroup root,
838 * we can optimize out unneeded sched domains scanning.
839 */
840 if (root_load_balance && (csn == 1))
841 goto single_root_domain;
842
843 for (i = 0; i < csn; i++)
844 uf_node_init(&csa[i]->node);
845
846 /* Merge overlapping cpusets */
847 for (i = 0; i < csn; i++) {
848 for (j = i + 1; j < csn; j++) {
849 if (cpusets_overlap(csa[i], csa[j])) {
850 /*
851 * Cgroup v2 shouldn't pass down overlapping
852 * partition root cpusets.
853 */
854 WARN_ON_ONCE(cgrpv2);
855 uf_union(&csa[i]->node, &csa[j]->node);
856 }
857 }
858 }
859
860 /* Count the total number of domains */
861 for (i = 0; i < csn; i++) {
862 if (uf_find(&csa[i]->node) == &csa[i]->node)
863 ndoms++;
864 }
865
866 /*
867 * Now we know how many domains to create.
868 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
869 */
870 doms = alloc_sched_domains(ndoms);
871 if (!doms)
872 goto done;
873
874 /*
875 * The rest of the code, including the scheduler, can deal with
876 * dattr==NULL case. No need to abort if alloc fails.
877 */
878 dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
879 GFP_KERNEL);
880
881 /*
882 * Cgroup v2 doesn't support domain attributes, just set all of them
883 * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
884 * subset of HK_TYPE_DOMAIN housekeeping CPUs.
885 */
886 if (cgrpv2) {
887 for (i = 0; i < ndoms; i++) {
888 /*
889 * The top cpuset may contain some boot time isolated
890 * CPUs that need to be excluded from the sched domain.
891 */
892 if (csa[i] == &top_cpuset)
893 cpumask_and(doms[i], csa[i]->effective_cpus,
894 housekeeping_cpumask(HK_TYPE_DOMAIN));
895 else
896 cpumask_copy(doms[i], csa[i]->effective_cpus);
897 if (dattr)
898 dattr[i] = SD_ATTR_INIT;
899 }
900 goto done;
901 }
902
903 for (nslot = 0, i = 0; i < csn; i++) {
904 nslot_update = 0;
905 for (j = i; j < csn; j++) {
906 if (uf_find(&csa[j]->node) == &csa[i]->node) {
907 struct cpumask *dp = doms[nslot];
908
909 if (i == j) {
910 nslot_update = 1;
911 cpumask_clear(dp);
912 if (dattr)
913 *(dattr + nslot) = SD_ATTR_INIT;
914 }
915 cpumask_or(dp, dp, csa[j]->effective_cpus);
916 cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
917 if (dattr)
918 update_domain_attr_tree(dattr + nslot, csa[j]);
919 }
920 }
921 if (nslot_update)
922 nslot++;
923 }
924 BUG_ON(nslot != ndoms);
925
926 done:
927 kfree(csa);
928
929 /*
930 * Fallback to the default domain if kmalloc() failed.
931 * See comments in partition_sched_domains().
932 */
933 if (doms == NULL)
934 ndoms = 1;
935
936 *domains = doms;
937 *attributes = dattr;
938 return ndoms;
939 }
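/*
 * Worked example (hypothetical v1 layout): three load-balanced cpusets
 * with effective_cpus A = 0-3, B = 2-5 and C = 6-7. A and B overlap, so
 * the union-find pass merges them under one root while C stays on its
 * own, giving ndoms = 2 and doms[] = { 0-5, 6-7 }. On cgroup v2 the
 * same result comes directly from non-overlapping partition roots,
 * which is why the WARN_ON_ONCE(cgrpv2) above fires if an overlap is
 * ever seen there.
 */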
940
941 static void dl_update_tasks_root_domain(struct cpuset *cs)
942 {
943 struct css_task_iter it;
944 struct task_struct *task;
945
946 if (cs->nr_deadline_tasks == 0)
947 return;
948
949 css_task_iter_start(&cs->css, 0, &it);
950
951 while ((task = css_task_iter_next(&it)))
952 dl_add_task_root_domain(task);
953
954 css_task_iter_end(&it);
955 }
956
957 void dl_rebuild_rd_accounting(void)
958 {
959 struct cpuset *cs = NULL;
960 struct cgroup_subsys_state *pos_css;
961 int cpu;
962 u64 cookie = ++dl_cookie;
963
964 lockdep_assert_held(&cpuset_mutex);
965 lockdep_assert_cpus_held();
966 lockdep_assert_held(&sched_domains_mutex);
967
968 rcu_read_lock();
969
970 for_each_possible_cpu(cpu) {
971 if (dl_bw_visited(cpu, cookie))
972 continue;
973
974 dl_clear_root_domain_cpu(cpu);
975 }
976
977 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
978
979 if (cpumask_empty(cs->effective_cpus)) {
980 pos_css = css_rightmost_descendant(pos_css);
981 continue;
982 }
983
984 css_get(&cs->css);
985
986 rcu_read_unlock();
987
988 dl_update_tasks_root_domain(cs);
989
990 rcu_read_lock();
991 css_put(&cs->css);
992 }
993 rcu_read_unlock();
994 }
995
996 static void
997 partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
998 struct sched_domain_attr *dattr_new)
999 {
1000 sched_domains_mutex_lock();
1001 partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
1002 sched_domains_mutex_unlock();
1003 }
1004
1005 /*
1006 * Rebuild scheduler domains.
1007 *
1008 * If the flag 'sched_load_balance' of any cpuset with non-empty
1009 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
1010 * which has that flag enabled, or if any cpuset with a non-empty
1011 * 'cpus' is removed, then call this routine to rebuild the
1012 * scheduler's dynamic sched domains.
1013 *
1014 * Call with cpuset_mutex and cpus_read_lock() held.
1015 */
1016 void rebuild_sched_domains_locked(void)
1017 {
1018 struct cgroup_subsys_state *pos_css;
1019 struct sched_domain_attr *attr;
1020 cpumask_var_t *doms;
1021 struct cpuset *cs;
1022 int ndoms;
1023
1024 lockdep_assert_cpus_held();
1025 lockdep_assert_held(&cpuset_mutex);
1026 force_sd_rebuild = false;
1027
1028 /*
1029 * If we have raced with CPU hotplug, return early to avoid
1030 * passing doms with offlined cpu to partition_sched_domains().
1031 * Anyway, cpuset_handle_hotplug() will rebuild the sched domains.
1032 *
1033 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
1034 * should be the same as the active CPUs, so checking only top_cpuset
1035 * is enough to detect racing CPU offlines.
1036 */
1037 if (cpumask_empty(subpartitions_cpus) &&
1038 !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
1039 return;
1040
1041 /*
1042 * With subpartition CPUs, however, the effective CPUs of a partition
1043 * root should be only a subset of the active CPUs. Since a CPU in any
1044 * partition root could be offlined, all must be checked.
1045 */
1046 if (!cpumask_empty(subpartitions_cpus)) {
1047 rcu_read_lock();
1048 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
1049 if (!is_partition_valid(cs)) {
1050 pos_css = css_rightmost_descendant(pos_css);
1051 continue;
1052 }
1053 if (!cpumask_subset(cs->effective_cpus,
1054 cpu_active_mask)) {
1055 rcu_read_unlock();
1056 return;
1057 }
1058 }
1059 rcu_read_unlock();
1060 }
1061
1062 /* Generate domain masks and attrs */
1063 ndoms = generate_sched_domains(&doms, &attr);
1064
1065 /* Have scheduler rebuild the domains */
1066 partition_and_rebuild_sched_domains(ndoms, doms, attr);
1067 }
1068 #else /* !CONFIG_SMP */
1069 void rebuild_sched_domains_locked(void)
1070 {
1071 }
1072 #endif /* CONFIG_SMP */
1073
1074 static void rebuild_sched_domains_cpuslocked(void)
1075 {
1076 mutex_lock(&cpuset_mutex);
1077 rebuild_sched_domains_locked();
1078 mutex_unlock(&cpuset_mutex);
1079 }
1080
1081 void rebuild_sched_domains(void)
1082 {
1083 cpus_read_lock();
1084 rebuild_sched_domains_cpuslocked();
1085 cpus_read_unlock();
1086 }
1087
1088 void cpuset_reset_sched_domains(void)
1089 {
1090 mutex_lock(&cpuset_mutex);
1091 partition_sched_domains(1, NULL, NULL);
1092 mutex_unlock(&cpuset_mutex);
1093 }
1094
1095 /**
1096 * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
1097 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
1098 * @new_cpus: the temp variable for the new effective_cpus mask
1099 *
1100 * Iterate through each task of @cs updating its cpus_allowed to the
1101 * effective cpuset's. As this function is called with cpuset_mutex held,
1102 * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
1103 * is used instead of effective_cpus to make sure all offline CPUs are also
1104 * included as hotplug code won't update cpumasks for tasks in top_cpuset.
1105 */
1106 void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
1107 {
1108 struct css_task_iter it;
1109 struct task_struct *task;
1110 bool top_cs = cs == &top_cpuset;
1111
1112 css_task_iter_start(&cs->css, 0, &it);
1113 while ((task = css_task_iter_next(&it))) {
1114 const struct cpumask *possible_mask = task_cpu_possible_mask(task);
1115
1116 if (top_cs) {
1117 /*
1118 * Percpu kthreads in top_cpuset are ignored
1119 */
1120 if (kthread_is_per_cpu(task))
1121 continue;
1122 cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
1123 } else {
1124 cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
1125 }
1126 set_cpus_allowed_ptr(task, new_cpus);
1127 }
1128 css_task_iter_end(&it);
1129 }
1130
1131 /**
1132 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
1133 * @new_cpus: the temp variable for the new effective_cpus mask
1134 * @cs: the cpuset that needs to recompute the new effective_cpus mask
1135 * @parent: the parent cpuset
1136 *
1137 * The result is valid only if the given cpuset isn't a partition root.
1138 */
1139 static void compute_effective_cpumask(struct cpumask *new_cpus,
1140 struct cpuset *cs, struct cpuset *parent)
1141 {
1142 cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
1143 }
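/*
 * Worked example (hypothetical masks): with cs->cpus_allowed = 2-7 and
 * parent->effective_cpus = 0-3, new_cpus becomes 2-3. As noted above,
 * this result is only meaningful when @cs is not a partition root;
 * partition roots derive their CPUs from effective_xcpus instead.
 */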
1144
1145 /*
1146 * Commands for update_parent_effective_cpumask
1147 */
1148 enum partition_cmd {
1149 partcmd_enable, /* Enable partition root */
1150 partcmd_enablei, /* Enable isolated partition root */
1151 partcmd_disable, /* Disable partition root */
1152 partcmd_update, /* Update parent's effective_cpus */
1153 partcmd_invalidate, /* Make partition invalid */
1154 };
1155
1156 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
1157 struct tmpmasks *tmp);
1158
1159 /*
1160 * Update partition exclusive flag
1161 *
1162 * Return: 0 if successful, an error code otherwise
1163 */
1164 static int update_partition_exclusive(struct cpuset *cs, int new_prs)
1165 {
1166 bool exclusive = (new_prs > PRS_MEMBER);
1167
1168 if (exclusive && !is_cpu_exclusive(cs)) {
1169 if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1))
1170 return PERR_NOTEXCL;
1171 } else if (!exclusive && is_cpu_exclusive(cs)) {
1172 /* Turning off CS_CPU_EXCLUSIVE will not return error */
1173 cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0);
1174 }
1175 return 0;
1176 }
1177
1178 /*
1179 * Update partition load balance flag and/or rebuild sched domain
1180 *
1181 * Changing load balance flag will automatically call
1182 * rebuild_sched_domains_locked().
1183 * This function is for cgroup v2 only.
1184 */
1185 static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
1186 {
1187 int new_prs = cs->partition_root_state;
1188 bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
1189 bool new_lb;
1190
1191 /*
1192 * If cs is not a valid partition root, the load balance state
1193 * will follow its parent.
1194 */
1195 if (new_prs > 0) {
1196 new_lb = (new_prs != PRS_ISOLATED);
1197 } else {
1198 new_lb = is_sched_load_balance(parent_cs(cs));
1199 }
1200 if (new_lb != !!is_sched_load_balance(cs)) {
1201 rebuild_domains = true;
1202 if (new_lb)
1203 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1204 else
1205 clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1206 }
1207
1208 if (rebuild_domains)
1209 cpuset_force_rebuild();
1210 }
1211
1212 /*
1213 * tasks_nocpu_error - Return true if tasks will have no effective_cpus
1214 */
1215 static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
1216 struct cpumask *xcpus)
1217 {
1218 /*
1219 * A populated partition (cs or parent) can't have empty effective_cpus
1220 */
1221 return (cpumask_subset(parent->effective_cpus, xcpus) &&
1222 partition_is_populated(parent, cs)) ||
1223 (!cpumask_intersects(xcpus, cpu_active_mask) &&
1224 partition_is_populated(cs, NULL));
1225 }
1226
1227 static void reset_partition_data(struct cpuset *cs)
1228 {
1229 struct cpuset *parent = parent_cs(cs);
1230
1231 if (!cpuset_v2())
1232 return;
1233
1234 lockdep_assert_held(&callback_lock);
1235
1236 cs->nr_subparts = 0;
1237 if (cpumask_empty(cs->exclusive_cpus)) {
1238 cpumask_clear(cs->effective_xcpus);
1239 if (is_cpu_exclusive(cs))
1240 clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
1241 }
1242 if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed))
1243 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1244 }
1245
1246 /*
1247 * partition_xcpus_newstate - Exclusive CPUs state change
1248 * @old_prs: old partition_root_state
1249 * @new_prs: new partition_root_state
1250 * @xcpus: exclusive CPUs with state change
1251 */
1252 static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
1253 {
1254 WARN_ON_ONCE(old_prs == new_prs);
1255 if (new_prs == PRS_ISOLATED)
1256 cpumask_or(isolated_cpus, isolated_cpus, xcpus);
1257 else
1258 cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
1259 }
1260
1261 /*
1262 * partition_xcpus_add - Add new exclusive CPUs to partition
1263 * @new_prs: new partition_root_state
1264 * @parent: parent cpuset
1265 * @xcpus: exclusive CPUs to be added
1266 * Return: true if isolated_cpus modified, false otherwise
1267 *
1268 * Remote partition if parent == NULL
1269 */
1270 static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
1271 struct cpumask *xcpus)
1272 {
1273 bool isolcpus_updated;
1274
1275 WARN_ON_ONCE(new_prs < 0);
1276 lockdep_assert_held(&callback_lock);
1277 if (!parent)
1278 parent = &top_cpuset;
1279
1280
1281 if (parent == &top_cpuset)
1282 cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
1283
1284 isolcpus_updated = (new_prs != parent->partition_root_state);
1285 if (isolcpus_updated)
1286 partition_xcpus_newstate(parent->partition_root_state, new_prs,
1287 xcpus);
1288
1289 cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
1290 return isolcpus_updated;
1291 }
1292
1293 /*
1294 * partition_xcpus_del - Remove exclusive CPUs from partition
1295 * @old_prs: old partition_root_state
1296 * @parent: parent cpuset
1297 * @xcpus: exclusive CPUs to be removed
1298 * Return: true if isolated_cpus modified, false otherwise
1299 *
1300 * Remote partition if parent == NULL
1301 */
1302 static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
1303 struct cpumask *xcpus)
1304 {
1305 bool isolcpus_updated;
1306
1307 WARN_ON_ONCE(old_prs < 0);
1308 lockdep_assert_held(&callback_lock);
1309 if (!parent)
1310 parent = &top_cpuset;
1311
1312 if (parent == &top_cpuset)
1313 cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
1314
1315 isolcpus_updated = (old_prs != parent->partition_root_state);
1316 if (isolcpus_updated)
1317 partition_xcpus_newstate(old_prs, parent->partition_root_state,
1318 xcpus);
1319
1320 cpumask_and(xcpus, xcpus, cpu_active_mask);
1321 cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
1322 return isolcpus_updated;
1323 }
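/*
 * Bookkeeping sketch (hypothetical masks): granting CPUs 4-5 to an
 * isolated partition directly under the top cpuset goes through
 * partition_xcpus_add(PRS_ISOLATED, ...), which adds 4-5 to both
 * subpartitions_cpus and isolated_cpus and removes them from
 * top_cpuset's effective_cpus. Tearing the partition down later calls
 * partition_xcpus_del(PRS_ISOLATED, ...), reversing each step and
 * returning the still-active CPUs to top_cpuset's effective_cpus.
 */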
1324
1325 static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
1326 {
1327 int ret;
1328
1329 lockdep_assert_cpus_held();
1330
1331 if (!isolcpus_updated)
1332 return;
1333
1334 ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
1335 WARN_ON_ONCE(ret < 0);
1336 }
1337
1338 /**
1339 * cpuset_cpu_is_isolated - Check if the given CPU is isolated
1340 * @cpu: the CPU number to be checked
1341 * Return: true if CPU is used in an isolated partition, false otherwise
1342 */
1343 bool cpuset_cpu_is_isolated(int cpu)
1344 {
1345 return cpumask_test_cpu(cpu, isolated_cpus);
1346 }
1347 EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
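/*
 * Usage sketch for the exported helper above (the caller, @wq and @work
 * are hypothetical): a subsystem that wants to keep housekeeping work
 * off isolated partition CPUs can check the CPU before targeting it:
 *
 *	if (!cpuset_cpu_is_isolated(cpu))
 *		queue_work_on(cpu, wq, work);
 */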
1348
1349 /*
1350 * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
1351 * @cs: cpuset
1352 * @xcpus: effective exclusive CPUs value to be set
1353 * Return: true if xcpus is not empty, false otherwise.
1354 *
1355 * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
1356 * it must be a subset of parent's effective_xcpus.
1357 */
1358 static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
1359 struct cpumask *xcpus)
1360 {
1361 struct cpuset *parent = parent_cs(cs);
1362
1363 if (!xcpus)
1364 xcpus = cs->effective_xcpus;
1365
1366 return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
1367 }
1368
1369 static inline bool is_remote_partition(struct cpuset *cs)
1370 {
1371 return !list_empty(&cs->remote_sibling);
1372 }
1373
1374 static inline bool is_local_partition(struct cpuset *cs)
1375 {
1376 return is_partition_valid(cs) && !is_remote_partition(cs);
1377 }
1378
1379 /*
1380 * remote_partition_enable - Enable current cpuset as a remote partition root
1381 * @cs: the cpuset to update
1382 * @new_prs: new partition_root_state
1383 * @tmp: temporary masks
1384 * Return: 0 if successful, errcode if error
1385 *
1386 * Enable the current cpuset to become a remote partition root taking CPUs
1387 * directly from the top cpuset. cpuset_mutex must be held by the caller.
1388 */
1389 static int remote_partition_enable(struct cpuset *cs, int new_prs,
1390 struct tmpmasks *tmp)
1391 {
1392 bool isolcpus_updated;
1393
1394 /*
1395 * The user must have sysadmin privilege.
1396 */
1397 if (!capable(CAP_SYS_ADMIN))
1398 return PERR_ACCESS;
1399
1400 /*
1401 * The requested exclusive_cpus must not be allocated to other
1402 * partitions and it can't use up all the root's effective_cpus.
1403 *
1404 * Note that if there is any local partition root above it or
1405 * remote partition root underneath it, its exclusive_cpus must
1406 * have overlapped with subpartitions_cpus.
1407 */
1408 compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
1409 if (cpumask_empty(tmp->new_cpus) ||
1410 cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
1411 cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
1412 return PERR_INVCPUS;
1413
1414 spin_lock_irq(&callback_lock);
1415 isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
1416 list_add(&cs->remote_sibling, &remote_children);
1417 spin_unlock_irq(&callback_lock);
1418 update_unbound_workqueue_cpumask(isolcpus_updated);
1419 cs->prs_err = 0;
1420
1421 /*
1422 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1423 */
1424 cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1425 update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1426 return 0;
1427 }
1428
1429 /*
1430 * remote_partition_disable - Remove current cpuset from remote partition list
1431 * @cs: the cpuset to update
1432 * @tmp: temporary masks
1433 *
1434 * The effective_cpus is also updated.
1435 *
1436 * cpuset_mutex must be held by the caller.
1437 */
1438 static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
1439 {
1440 bool isolcpus_updated;
1441
1442 compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
1443 WARN_ON_ONCE(!is_remote_partition(cs));
1444 WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
1445
1446 spin_lock_irq(&callback_lock);
1447 list_del_init(&cs->remote_sibling);
1448 isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
1449 NULL, tmp->new_cpus);
1450 if (cs->prs_err)
1451 cs->partition_root_state = -cs->partition_root_state;
1452 else
1453 cs->partition_root_state = PRS_MEMBER;
1454
1455 reset_partition_data(cs);
1456 spin_unlock_irq(&callback_lock);
1457 update_unbound_workqueue_cpumask(isolcpus_updated);
1458
1459 /*
1460 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1461 */
1462 cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1463 update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1464 }
1465
1466 /*
1467 * remote_cpus_update - cpus_exclusive change of remote partition
1468 * @cs: the cpuset to be updated
1469 * @newmask: the new effective_xcpus mask
1470 * @tmp: temporary masks
1471 *
1472 * top_cpuset and subpartitions_cpus will be updated or partition can be
1473 * invalidated.
1474 */
1475 static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
1476 struct tmpmasks *tmp)
1477 {
1478 bool adding, deleting;
1479 int prs = cs->partition_root_state;
1480 int isolcpus_updated = 0;
1481
1482 if (WARN_ON_ONCE(!is_remote_partition(cs)))
1483 return;
1484
1485 WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
1486
1487 if (cpumask_empty(newmask)) {
1488 cs->prs_err = PERR_CPUSEMPTY;
1489 goto invalidate;
1490 }
1491
1492 adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
1493 deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);
1494
1495 /*
1496 * Adding remote CPUs is only allowed if those CPUs are
1497 * not allocated to other partitions and there are effective_cpus
1498 * left in the top cpuset.
1499 */
1500 if (adding) {
1501 if (!capable(CAP_SYS_ADMIN))
1502 cs->prs_err = PERR_ACCESS;
1503 else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
1504 cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))
1505 cs->prs_err = PERR_NOCPUS;
1506 if (cs->prs_err)
1507 goto invalidate;
1508 }
1509
1510 spin_lock_irq(&callback_lock);
1511 if (adding)
1512 isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
1513 if (deleting)
1514 isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
1515 spin_unlock_irq(&callback_lock);
1516 update_unbound_workqueue_cpumask(isolcpus_updated);
1517
1518 /*
1519 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1520 */
1521 cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1522 update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1523 return;
1524
1525 invalidate:
1526 remote_partition_disable(cs, tmp);
1527 }
1528
1529 /*
1530 * remote_partition_check - check if a child remote partition needs update
1531 * @cs: the cpuset to be updated
1532 * @newmask: the new effective_xcpus mask
1533 * @delmask: temporary mask for deletion (not in tmp)
1534 * @tmp: temporary masks
1535 *
1536 * This should be called before the given cs has updated its cpus_allowed
1537 * and/or effective_xcpus.
1538 */
1539 static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
1540 struct cpumask *delmask, struct tmpmasks *tmp)
1541 {
1542 struct cpuset *child, *next;
1543 int disable_cnt = 0;
1544
1545 /*
1546 * Compute the effective exclusive CPUs that will be deleted.
1547 */
1548 if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
1549 !cpumask_intersects(delmask, subpartitions_cpus))
1550 return; /* No deletion of exclusive CPUs in partitions */
1551
1552 /*
1553 * Search the remote children list for those that will be impacted
1554 * by the deletion of exclusive CPUs.
1555 *
1556 * Since a cpuset must be removed from the remote children list
1557 * before it can go offline, and holding cpuset_mutex prevents any
1558 * change in cpuset status, the RCU read lock isn't needed.
1559 */
1560 lockdep_assert_held(&cpuset_mutex);
1561 list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
1562 if (cpumask_intersects(child->effective_cpus, delmask)) {
1563 remote_partition_disable(child, tmp);
1564 disable_cnt++;
1565 }
1566 if (disable_cnt)
1567 cpuset_force_rebuild();
1568 }
1569
1570 /*
1571 * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
1572 * @prstate: partition root state to be checked
1573 * @new_cpus: cpu mask
1574 * Return: true if there is conflict, false otherwise
1575 *
1576 * CPUs outside of boot_hk_cpus, if defined, can only be used in an
1577 * isolated partition.
1578 */
1579 static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
1580 {
1581 if (!have_boot_isolcpus)
1582 return false;
1583
1584 if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus))
1585 return true;
1586
1587 return false;
1588 }
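/*
 * Example (hypothetical boot setup): booting with "isolcpus=domain,4-7"
 * leaves boot_hk_cpus = 0-3. Requesting a regular (load-balanced)
 * partition that includes CPUs 6-7 then conflicts with the boot-time
 * housekeeping setup and fails with PERR_HKEEPING, while the same CPUs
 * remain acceptable for an isolated partition.
 */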
1589
1590 /**
1591 * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
1592 * @cs: The cpuset that requests change in partition root state
1593 * @cmd: Partition root state change command
1594 * @newmask: Optional new cpumask for partcmd_update
1595 * @tmp: Temporary addmask and delmask
1596 * Return: 0 or a partition root state error code
1597 *
1598 * For partcmd_enable*, the cpuset is being transformed from a non-partition
1599 * root to a partition root. The effective_xcpus (cpus_allowed if
1600 * effective_xcpus not set) mask of the given cpuset will be taken away from
1601 * parent's effective_cpus. The function returns 0 if all the CPUs listed
1602 * in effective_xcpus can be granted; otherwise an error code is returned.
1603 *
1604 * For partcmd_disable, the cpuset is being transformed from a partition
1605 * root back to a non-partition root. Any CPUs in effective_xcpus will be
1606 * given back to parent's effective_cpus. 0 will always be returned.
1607 *
1608 * For partcmd_update, if the optional newmask is specified, the cpu list is
1609 * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
1610 * assumed to remain the same. The cpuset should either be a valid or invalid
1611 * partition root. The partition root state may change from valid to invalid
1612 * or vice versa. An error code will be returned if transitioning from
1613 * invalid to valid violates the exclusivity rule.
1614 *
1615 * For partcmd_invalidate, the current partition will be made invalid.
1616 *
1617 * The partcmd_enable* and partcmd_disable commands are used by
1618 * update_prstate(). An error code may be returned and the caller will check
1619 * for error.
1620 *
1621 * The partcmd_update command is used by update_cpumasks_hier() with newmask
1622 * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
1623 * by update_cpumask() with NULL newmask. In both cases, the callers won't
1624 * check for error and so partition_root_state and prs_err will be updated
1625 * directly.
1626 */
1627 static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
1628 struct cpumask *newmask,
1629 struct tmpmasks *tmp)
1630 {
1631 struct cpuset *parent = parent_cs(cs);
1632 int adding; /* Adding cpus to parent's effective_cpus */
1633 int deleting; /* Deleting cpus from parent's effective_cpus */
1634 int old_prs, new_prs;
1635 int part_error = PERR_NONE; /* Partition error? */
1636 int subparts_delta = 0;
1637 struct cpumask *xcpus; /* cs effective_xcpus */
1638 int isolcpus_updated = 0;
1639 bool nocpu;
1640
1641 lockdep_assert_held(&cpuset_mutex);
1642
1643 /*
1644 * new_prs will only be changed for the partcmd_update and
1645 * partcmd_invalidate commands.
1646 */
1647 adding = deleting = false;
1648 old_prs = new_prs = cs->partition_root_state;
1649 xcpus = user_xcpus(cs);
1650
1651 if (cmd == partcmd_invalidate) {
1652 if (is_prs_invalid(old_prs))
1653 return 0;
1654
1655 /*
1656 * Make the current partition invalid.
1657 */
1658 if (is_partition_valid(parent))
1659 adding = cpumask_and(tmp->addmask,
1660 xcpus, parent->effective_xcpus);
1661 if (old_prs > 0) {
1662 new_prs = -old_prs;
1663 subparts_delta--;
1664 }
1665 goto write_error;
1666 }
1667
1668 /*
1669 * The parent must be a partition root.
1670 * The new cpumask, if present, or the current cpus_allowed must
1671 * not be empty.
1672 */
1673 if (!is_partition_valid(parent)) {
1674 return is_partition_invalid(parent)
1675 ? PERR_INVPARENT : PERR_NOTPART;
1676 }
1677 if (!newmask && xcpus_empty(cs))
1678 return PERR_CPUSEMPTY;
1679
1680 nocpu = tasks_nocpu_error(parent, cs, xcpus);
1681
1682 if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
1683 /*
1684 * Enabling partition root is not allowed if its
1685 * effective_xcpus is empty or doesn't overlap with
1686 * parent's effective_xcpus.
1687 */
1688 if (cpumask_empty(xcpus) ||
1689 !cpumask_intersects(xcpus, parent->effective_xcpus))
1690 return PERR_INVCPUS;
1691
1692 if (prstate_housekeeping_conflict(new_prs, xcpus))
1693 return PERR_HKEEPING;
1694
1695 /*
1696 * A parent can be left with no CPU as long as there is no
1697 * task directly associated with the parent partition.
1698 */
1699 if (nocpu)
1700 return PERR_NOCPUS;
1701
1702 deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_xcpus);
1703 if (deleting)
1704 subparts_delta++;
1705 new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
1706 } else if (cmd == partcmd_disable) {
1707 /*
1708 * May need to add cpus to parent's effective_cpus for
1709 * valid partition root.
1710 */
1711 adding = !is_prs_invalid(old_prs) &&
1712 cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
1713 if (adding)
1714 subparts_delta--;
1715 new_prs = PRS_MEMBER;
1716 } else if (newmask) {
1717 /*
1718 * Empty cpumask is not allowed
1719 */
1720 if (cpumask_empty(newmask)) {
1721 part_error = PERR_CPUSEMPTY;
1722 goto write_error;
1723 }
1724 /* Check newmask again to see whether CPUs are available for parent/cs */
1725 nocpu |= tasks_nocpu_error(parent, cs, newmask);
1726
1727 /*
1728 * partcmd_update with newmask:
1729 *
1730 * Compute add/delete mask to/from effective_cpus
1731 *
1732 * For valid partition:
1733 * addmask = exclusive_cpus & ~newmask
1734 * & parent->effective_xcpus
1735 * delmask = newmask & ~exclusive_cpus
1736 * & parent->effective_xcpus
1737 *
1738 * For invalid partition:
1739 * delmask = newmask & parent->effective_xcpus
1740 */
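/*
 * Illustrative sketch of the formulas above (not from the original
 * source; the CPU numbers are assumed): for a valid partition with
 * xcpus = 2-5, newmask = 4-7 and parent->effective_xcpus = 0-7,
 *
 *     addmask = 2-5 & ~(4-7) & 0-7 = 2-3   (returned to the parent)
 *     delmask = 4-7 & ~(2-5) & 0-7 = 6-7   (taken from the parent)
 *
 * i.e. only the CPUs that actually change ownership are touched.
 */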
1741 if (is_prs_invalid(old_prs)) {
1742 adding = false;
1743 deleting = cpumask_and(tmp->delmask,
1744 newmask, parent->effective_xcpus);
1745 } else {
1746 cpumask_andnot(tmp->addmask, xcpus, newmask);
1747 adding = cpumask_and(tmp->addmask, tmp->addmask,
1748 parent->effective_xcpus);
1749
1750 cpumask_andnot(tmp->delmask, newmask, xcpus);
1751 deleting = cpumask_and(tmp->delmask, tmp->delmask,
1752 parent->effective_xcpus);
1753 }
1754 /*
1755 * Make partition invalid if parent's effective_cpus could
1756 * become empty and there are tasks in the parent.
1757 */
1758 if (nocpu && (!adding ||
1759 !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
1760 part_error = PERR_NOCPUS;
1761 deleting = false;
1762 adding = cpumask_and(tmp->addmask,
1763 xcpus, parent->effective_xcpus);
1764 }
1765 } else {
1766 /*
1767 * partcmd_update w/o newmask
1768 *
1769 * delmask = effective_xcpus & parent->effective_cpus
1770 *
1771 * This can be called from:
1772 * 1) update_cpumasks_hier()
1773 * 2) cpuset_hotplug_update_tasks()
1774 *
1775 * Check to see if it can be transitioned from valid to
1776 * invalid partition or vice versa.
1777 *
1778 * A partition error happens when parent has tasks and all
1779 * its effective CPUs will have to be distributed out.
1780 */
1781 WARN_ON_ONCE(!is_partition_valid(parent));
1782 if (nocpu) {
1783 part_error = PERR_NOCPUS;
1784 if (is_partition_valid(cs))
1785 adding = cpumask_and(tmp->addmask,
1786 xcpus, parent->effective_xcpus);
1787 } else if (is_partition_invalid(cs) &&
1788 cpumask_subset(xcpus, parent->effective_xcpus)) {
1789 struct cgroup_subsys_state *css;
1790 struct cpuset *child;
1791 bool exclusive = true;
1792
1793 /*
1794 * Converting an invalid partition to a valid one has
1795 * to pass the cpu exclusivity test.
1796 */
1797 rcu_read_lock();
1798 cpuset_for_each_child(child, css, parent) {
1799 if (child == cs)
1800 continue;
1801 if (!cpusets_are_exclusive(cs, child)) {
1802 exclusive = false;
1803 break;
1804 }
1805 }
1806 rcu_read_unlock();
1807 if (exclusive)
1808 deleting = cpumask_and(tmp->delmask,
1809 xcpus, parent->effective_cpus);
1810 else
1811 part_error = PERR_NOTEXCL;
1812 }
1813 }
1814
1815 write_error:
1816 if (part_error)
1817 WRITE_ONCE(cs->prs_err, part_error);
1818
1819 if (cmd == partcmd_update) {
1820 /*
1821 * Check for possible transition between valid and invalid
1822 * partition root.
1823 */
1824 switch (cs->partition_root_state) {
1825 case PRS_ROOT:
1826 case PRS_ISOLATED:
1827 if (part_error) {
1828 new_prs = -old_prs;
1829 subparts_delta--;
1830 }
1831 break;
1832 case PRS_INVALID_ROOT:
1833 case PRS_INVALID_ISOLATED:
1834 if (!part_error) {
1835 new_prs = -old_prs;
1836 subparts_delta++;
1837 }
1838 break;
1839 }
1840 }
1841
1842 if (!adding && !deleting && (new_prs == old_prs))
1843 return 0;
1844
1845 /*
1846 * Transitioning between invalid to valid or vice versa may require
1847 * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,
1848 * validate_change() has already been successfully called and
1849 * CPU lists in cs haven't been updated yet. So defer it to later.
1850 */
1851 if ((old_prs != new_prs) && (cmd != partcmd_update)) {
1852 int err = update_partition_exclusive(cs, new_prs);
1853
1854 if (err)
1855 return err;
1856 }
1857
1858 /*
1859 * Change the parent's effective_cpus & effective_xcpus (top cpuset
1860 * only).
1861 *
1862 * Newly added CPUs will be removed from effective_cpus and
1863 * newly deleted ones will be added back to effective_cpus.
1864 */
1865 spin_lock_irq(&callback_lock);
1866 if (old_prs != new_prs) {
1867 cs->partition_root_state = new_prs;
1868 if (new_prs <= 0)
1869 cs->nr_subparts = 0;
1870 }
1871 /*
1872 * Adding CPUs to parent's effective_cpus means deleting them from cs
1873 * and vice versa.
1874 */
1875 if (adding)
1876 isolcpus_updated += partition_xcpus_del(old_prs, parent,
1877 tmp->addmask);
1878 if (deleting)
1879 isolcpus_updated += partition_xcpus_add(new_prs, parent,
1880 tmp->delmask);
1881
1882 if (is_partition_valid(parent)) {
1883 parent->nr_subparts += subparts_delta;
1884 WARN_ON_ONCE(parent->nr_subparts < 0);
1885 }
1886 spin_unlock_irq(&callback_lock);
1887 update_unbound_workqueue_cpumask(isolcpus_updated);
1888
1889 if ((old_prs != new_prs) && (cmd == partcmd_update))
1890 update_partition_exclusive(cs, new_prs);
1891
1892 if (adding || deleting) {
1893 cpuset_update_tasks_cpumask(parent, tmp->addmask);
1894 update_sibling_cpumasks(parent, cs, tmp);
1895 }
1896
1897 /*
1898 * For partcmd_update without newmask, it is being called from
1899 * cpuset_handle_hotplug(). Update the load balance flag and
1900 * scheduling domain accordingly.
1901 */
1902 if ((cmd == partcmd_update) && !newmask)
1903 update_partition_sd_lb(cs, old_prs);
1904
1905 notify_partition_change(cs, old_prs);
1906 return 0;
1907 }
1908
1909 /**
1910 * compute_partition_effective_cpumask - compute effective_cpus for partition
1911 * @cs: partition root cpuset
1912 * @new_ecpus: previously computed effective_cpus to be updated
1913 *
1914 * Compute the effective_cpus of a partition root by scanning effective_xcpus
1915 * of child partition roots and excluding their effective_xcpus.
1916 *
1917 * This has the side effect of invalidating valid child partition roots,
1918 * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
1919 * or update_cpumasks_hier() where parent and children are modified
1920 * successively, we don't need to call update_parent_effective_cpumask()
1921 * and the child's effective_cpus will be updated in later iterations.
1922 *
1923 * Note that rcu_read_lock() is assumed to be held.
1924 */
1925 static void compute_partition_effective_cpumask(struct cpuset *cs,
1926 struct cpumask *new_ecpus)
1927 {
1928 struct cgroup_subsys_state *css;
1929 struct cpuset *child;
1930 bool populated = partition_is_populated(cs, NULL);
1931
1932 /*
1933 * Check child partition roots to see if they should be
1934 * invalidated when
1935 * 1) child effective_xcpus not a subset of new
1936 * exclusive_cpus
1937 * 2) All the effective_cpus will be used up and cp
1938 * has tasks
1939 */
1940 compute_effective_exclusive_cpumask(cs, new_ecpus);
1941 cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
1942
1943 rcu_read_lock();
1944 cpuset_for_each_child(child, css, cs) {
1945 if (!is_partition_valid(child))
1946 continue;
1947
1948 child->prs_err = 0;
1949 if (!cpumask_subset(child->effective_xcpus,
1950 cs->effective_xcpus))
1951 child->prs_err = PERR_INVCPUS;
1952 else if (populated &&
1953 cpumask_subset(new_ecpus, child->effective_xcpus))
1954 child->prs_err = PERR_NOCPUS;
1955
1956 if (child->prs_err) {
1957 int old_prs = child->partition_root_state;
1958
1959 /*
1960 * Invalidate child partition
1961 */
1962 spin_lock_irq(&callback_lock);
1963 make_partition_invalid(child);
1964 cs->nr_subparts--;
1965 child->nr_subparts = 0;
1966 spin_unlock_irq(&callback_lock);
1967 notify_partition_change(child, old_prs);
1968 continue;
1969 }
1970 cpumask_andnot(new_ecpus, new_ecpus,
1971 child->effective_xcpus);
1972 }
1973 rcu_read_unlock();
1974 }
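/*
 * Illustrative sketch (values assumed, not from the original source):
 * if cs->effective_xcpus = 0-7 and two valid child partitions own
 * effective_xcpus 2-3 and 6, the loop above leaves
 * new_ecpus = 0-1,4-5,7 (further masked by cpu_active_mask).
 * A child whose effective_xcpus is no longer a subset of 0-7, or that
 * would consume every remaining CPU while cs has tasks, is invalidated
 * instead of being excluded.
 */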
1975
1976 /*
1977 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
1978 * @cs: the cpuset to consider
1979 * @tmp: temp variables for calculating effective_cpus & partition setup
1980 * @force: don't skip any descendant cpusets if set
1981 *
1982 * When configured cpumask is changed, the effective cpumasks of this cpuset
1983 * and all its descendants need to be updated.
1984 *
1985 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
1986 *
1987 * Called with cpuset_mutex held
1988 */
1989 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
1990 bool force)
1991 {
1992 struct cpuset *cp;
1993 struct cgroup_subsys_state *pos_css;
1994 bool need_rebuild_sched_domains = false;
1995 int old_prs, new_prs;
1996
1997 rcu_read_lock();
1998 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1999 struct cpuset *parent = parent_cs(cp);
2000 bool remote = is_remote_partition(cp);
2001 bool update_parent = false;
2002
2003 /*
2004 * Skip a descendant remote partition that acquires CPUs
2005 * directly from top cpuset unless it is cs.
2006 */
2007 if (remote && (cp != cs)) {
2008 pos_css = css_rightmost_descendant(pos_css);
2009 continue;
2010 }
2011
2012 /*
2013 * Update effective_xcpus if exclusive_cpus set.
2014 * The case when exclusive_cpus isn't set is handled later.
2015 */
2016 if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) {
2017 spin_lock_irq(&callback_lock);
2018 compute_effective_exclusive_cpumask(cp, NULL);
2019 spin_unlock_irq(&callback_lock);
2020 }
2021
2022 old_prs = new_prs = cp->partition_root_state;
2023 if (remote || (is_partition_valid(parent) &&
2024 is_partition_valid(cp)))
2025 compute_partition_effective_cpumask(cp, tmp->new_cpus);
2026 else
2027 compute_effective_cpumask(tmp->new_cpus, cp, parent);
2028
2029 /*
2030 * A partition with no effective_cpus is allowed as long as
2031 * there is no task associated with it. Call
2032 * update_parent_effective_cpumask() to check it.
2033 */
2034 if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
2035 update_parent = true;
2036 goto update_parent_effective;
2037 }
2038
2039 /*
2040 * If it becomes empty, inherit the effective mask of the
2041 * parent, which is guaranteed to have some CPUs unless
2042 * it is a partition root that has explicitly distributed
2043 * out all its CPUs.
2044 */
2045 if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus))
2046 cpumask_copy(tmp->new_cpus, parent->effective_cpus);
2047
2048 if (remote)
2049 goto get_css;
2050
2051 /*
2052 * Skip the whole subtree if
2053 * 1) the cpumask remains the same,
2054 * 2) it has no partition root state,
2055 * 3) the force flag is not set, and
2056 * 4) (for v2) its load balance state is the same as its parent's.
2057 */
2058 if (!cp->partition_root_state && !force &&
2059 cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
2060 (!cpuset_v2() ||
2061 (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
2062 pos_css = css_rightmost_descendant(pos_css);
2063 continue;
2064 }
2065
2066 update_parent_effective:
2067 /*
2068 * update_parent_effective_cpumask() should have been called
2069 * for cs already in update_cpumask(). We should also call
2070 * cpuset_update_tasks_cpumask() again for tasks in the parent
2071 * cpuset if the parent's effective_cpus changes.
2072 */
2073 if ((cp != cs) && old_prs) {
2074 switch (parent->partition_root_state) {
2075 case PRS_ROOT:
2076 case PRS_ISOLATED:
2077 update_parent = true;
2078 break;
2079
2080 default:
2081 /*
2082 * When parent is not a partition root or is
2083 * invalid, child partition roots become
2084 * invalid too.
2085 */
2086 if (is_partition_valid(cp))
2087 new_prs = -cp->partition_root_state;
2088 WRITE_ONCE(cp->prs_err,
2089 is_partition_invalid(parent)
2090 ? PERR_INVPARENT : PERR_NOTPART);
2091 break;
2092 }
2093 }
2094 get_css:
2095 if (!css_tryget_online(&cp->css))
2096 continue;
2097 rcu_read_unlock();
2098
2099 if (update_parent) {
2100 update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
2101 /*
2102 * The cpuset partition_root_state may become
2103 * invalid. Capture it.
2104 */
2105 new_prs = cp->partition_root_state;
2106 }
2107
2108 spin_lock_irq(&callback_lock);
2109 cpumask_copy(cp->effective_cpus, tmp->new_cpus);
2110 cp->partition_root_state = new_prs;
2111 /*
2112 * Make sure effective_xcpus is properly set for a valid
2113 * partition root.
2114 */
2115 if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
2116 cpumask_and(cp->effective_xcpus,
2117 cp->cpus_allowed, parent->effective_xcpus);
2118 else if (new_prs < 0)
2119 reset_partition_data(cp);
2120 spin_unlock_irq(&callback_lock);
2121
2122 notify_partition_change(cp, old_prs);
2123
2124 WARN_ON(!is_in_v2_mode() &&
2125 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
2126
2127 cpuset_update_tasks_cpumask(cp, cp->effective_cpus);
2128
2129 /*
2130 * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
2131 * from parent if current cpuset isn't a valid partition root
2132 * and their load balance states differ.
2133 */
2134 if (cpuset_v2() && !is_partition_valid(cp) &&
2135 (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
2136 if (is_sched_load_balance(parent))
2137 set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
2138 else
2139 clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
2140 }
2141
2142 /*
2143 * On legacy hierarchy, if the effective cpumask of any non-
2144 * empty cpuset is changed, we need to rebuild sched domains.
2145 * On default hierarchy, the cpuset needs to be a partition
2146 * root as well.
2147 */
2148 if (!cpumask_empty(cp->cpus_allowed) &&
2149 is_sched_load_balance(cp) &&
2150 (!cpuset_v2() || is_partition_valid(cp)))
2151 need_rebuild_sched_domains = true;
2152
2153 rcu_read_lock();
2154 css_put(&cp->css);
2155 }
2156 rcu_read_unlock();
2157
2158 if (need_rebuild_sched_domains)
2159 cpuset_force_rebuild();
2160 }
2161
2162 /**
2163 * update_sibling_cpumasks - Update siblings cpumasks
2164 * @parent: Parent cpuset
2165 * @cs: Current cpuset
2166 * @tmp: Temp variables
2167 */
2168 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
2169 struct tmpmasks *tmp)
2170 {
2171 struct cpuset *sibling;
2172 struct cgroup_subsys_state *pos_css;
2173
2174 lockdep_assert_held(&cpuset_mutex);
2175
2176 /*
2177 * Check all its siblings and call update_cpumasks_hier()
2178 * if their effective_cpus will need to be changed.
2179 *
2180 * It is possible a change in parent's effective_cpus
2181 * due to a change in a child partition's effective_xcpus will impact
2182 * its siblings even if they do not inherit parent's effective_cpus
2183 * directly.
2184 *
2185 * The update_cpumasks_hier() function may sleep. So we have to
2186 * release the RCU read lock before calling it.
2187 */
2188 rcu_read_lock();
2189 cpuset_for_each_child(sibling, pos_css, parent) {
2190 if (sibling == cs)
2191 continue;
2192 if (!is_partition_valid(sibling)) {
2193 compute_effective_cpumask(tmp->new_cpus, sibling,
2194 parent);
2195 if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
2196 continue;
2197 }
2198 if (!css_tryget_online(&sibling->css))
2199 continue;
2200
2201 rcu_read_unlock();
2202 update_cpumasks_hier(sibling, tmp, false);
2203 rcu_read_lock();
2204 css_put(&sibling->css);
2205 }
2206 rcu_read_unlock();
2207 }
2208
2209 /**
2210 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
2211 * @cs: the cpuset to consider
2212 * @trialcs: trial cpuset
2213 * @buf: buffer of cpu numbers written to this cpuset
2214 */
2215 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
2216 const char *buf)
2217 {
2218 int retval;
2219 struct tmpmasks tmp;
2220 struct cpuset *parent = parent_cs(cs);
2221 bool invalidate = false;
2222 bool force = false;
2223 int old_prs = cs->partition_root_state;
2224
2225 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
2226 if (cs == &top_cpuset)
2227 return -EACCES;
2228
2229 /*
2230 * An empty cpus_allowed is ok only if the cpuset has no tasks.
2231 * Since cpulist_parse() fails on an empty mask, we special case
2232 * that parsing. The validate_change() call ensures that cpusets
2233 * with tasks have cpus.
2234 */
2235 if (!*buf) {
2236 cpumask_clear(trialcs->cpus_allowed);
2237 if (cpumask_empty(trialcs->exclusive_cpus))
2238 cpumask_clear(trialcs->effective_xcpus);
2239 } else {
2240 retval = cpulist_parse(buf, trialcs->cpus_allowed);
2241 if (retval < 0)
2242 return retval;
2243
2244 if (!cpumask_subset(trialcs->cpus_allowed,
2245 top_cpuset.cpus_allowed))
2246 return -EINVAL;
2247
2248 /*
2249 * When exclusive_cpus isn't explicitly set, it is constrained
2250 * by cpus_allowed and parent's effective_xcpus. Otherwise,
2251 * trialcs->effective_xcpus is used as a temporary cpumask
2252 * for checking validity of the partition root.
2253 */
2254 if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
2255 compute_effective_exclusive_cpumask(trialcs, NULL);
2256 }
2257
2258 /* Nothing to do if the cpus didn't change */
2259 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
2260 return 0;
2261
2262 if (alloc_cpumasks(NULL, &tmp))
2263 return -ENOMEM;
2264
2265 if (old_prs) {
2266 if (is_partition_valid(cs) &&
2267 cpumask_empty(trialcs->effective_xcpus)) {
2268 invalidate = true;
2269 cs->prs_err = PERR_INVCPUS;
2270 } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
2271 invalidate = true;
2272 cs->prs_err = PERR_HKEEPING;
2273 } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
2274 invalidate = true;
2275 cs->prs_err = PERR_NOCPUS;
2276 }
2277 }
2278
2279 /*
2280 * Check all the descendants in update_cpumasks_hier() if
2281 * effective_xcpus is to be changed.
2282 */
2283 force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
2284
2285 retval = validate_change(cs, trialcs);
2286
2287 if ((retval == -EINVAL) && cpuset_v2()) {
2288 struct cgroup_subsys_state *css;
2289 struct cpuset *cp;
2290
2291 /*
2292 * The -EINVAL error code indicates that partition sibling
2293 * CPU exclusivity rule has been violated. We still allow
2294 * the cpumask change to proceed while invalidating the
2295 * partition. However, any conflicting sibling partitions
2296 * have to be marked as invalid too.
2297 */
2298 invalidate = true;
2299 rcu_read_lock();
2300 cpuset_for_each_child(cp, css, parent) {
2301 struct cpumask *xcpus = user_xcpus(trialcs);
2302
2303 if (is_partition_valid(cp) &&
2304 cpumask_intersects(xcpus, cp->effective_xcpus)) {
2305 rcu_read_unlock();
2306 update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
2307 rcu_read_lock();
2308 }
2309 }
2310 rcu_read_unlock();
2311 retval = 0;
2312 }
2313
2314 if (retval < 0)
2315 goto out_free;
2316
2317 if (is_partition_valid(cs) ||
2318 (is_partition_invalid(cs) && !invalidate)) {
2319 struct cpumask *xcpus = trialcs->effective_xcpus;
2320
2321 if (cpumask_empty(xcpus) && is_partition_invalid(cs))
2322 xcpus = trialcs->cpus_allowed;
2323
2324 /*
2325 * Call remote_cpus_update() to handle valid remote partition
2326 */
2327 if (is_remote_partition(cs))
2328 remote_cpus_update(cs, xcpus, &tmp);
2329 else if (invalidate)
2330 update_parent_effective_cpumask(cs, partcmd_invalidate,
2331 NULL, &tmp);
2332 else
2333 update_parent_effective_cpumask(cs, partcmd_update,
2334 xcpus, &tmp);
2335 } else if (!cpumask_empty(cs->exclusive_cpus)) {
2336 /*
2337 * Use trialcs->effective_cpus as a temp cpumask
2338 */
2339 remote_partition_check(cs, trialcs->effective_xcpus,
2340 trialcs->effective_cpus, &tmp);
2341 }
2342
2343 spin_lock_irq(&callback_lock);
2344 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
2345 cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
2346 if ((old_prs > 0) && !is_partition_valid(cs))
2347 reset_partition_data(cs);
2348 spin_unlock_irq(&callback_lock);
2349
2350 /* effective_cpus/effective_xcpus will be updated here */
2351 update_cpumasks_hier(cs, &tmp, force);
2352
2353 /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
2354 if (cs->partition_root_state)
2355 update_partition_sd_lb(cs, old_prs);
2356 out_free:
2357 free_cpumasks(NULL, &tmp);
2358 return retval;
2359 }
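/*
 * Illustrative usage from userspace (a sketch; the cgroup path and
 * CPU numbers are assumed):
 *
 *   # echo "0-3,8" > /sys/fs/cgroup/grp/cpuset.cpus
 *   # cat /sys/fs/cgroup/grp/cpuset.cpus.effective
 *   0-3,8
 *
 * The write reaches update_cpumask() via cpuset_write_resmask(); an
 * empty write clears cpus_allowed, which validate_change() only
 * accepts if no task would be left without CPUs.
 */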
2360
2361 /**
2362 * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
2363 * @cs: the cpuset to consider
2364 * @trialcs: trial cpuset
2365 * @buf: buffer of cpu numbers written to this cpuset
2366 *
2367 * The tasks' cpumask will be updated if cs is a valid partition root.
2368 */
2369 static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
2370 const char *buf)
2371 {
2372 int retval;
2373 struct tmpmasks tmp;
2374 struct cpuset *parent = parent_cs(cs);
2375 bool invalidate = false;
2376 bool force = false;
2377 int old_prs = cs->partition_root_state;
2378
2379 if (!*buf) {
2380 cpumask_clear(trialcs->exclusive_cpus);
2381 cpumask_clear(trialcs->effective_xcpus);
2382 } else {
2383 retval = cpulist_parse(buf, trialcs->exclusive_cpus);
2384 if (retval < 0)
2385 return retval;
2386 }
2387
2388 /* Nothing to do if the CPUs didn't change */
2389 if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
2390 return 0;
2391
2392 if (*buf)
2393 compute_effective_exclusive_cpumask(trialcs, NULL);
2394
2395 /*
2396 * Check all the descendants in update_cpumasks_hier() if
2397 * effective_xcpus is to be changed.
2398 */
2399 force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
2400
2401 retval = validate_change(cs, trialcs);
2402 if (retval)
2403 return retval;
2404
2405 if (alloc_cpumasks(NULL, &tmp))
2406 return -ENOMEM;
2407
2408 if (old_prs) {
2409 if (cpumask_empty(trialcs->effective_xcpus)) {
2410 invalidate = true;
2411 cs->prs_err = PERR_INVCPUS;
2412 } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
2413 invalidate = true;
2414 cs->prs_err = PERR_HKEEPING;
2415 } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
2416 invalidate = true;
2417 cs->prs_err = PERR_NOCPUS;
2418 }
2419
2420 if (is_remote_partition(cs)) {
2421 if (invalidate)
2422 remote_partition_disable(cs, &tmp);
2423 else
2424 remote_cpus_update(cs, trialcs->effective_xcpus,
2425 &tmp);
2426 } else if (invalidate) {
2427 update_parent_effective_cpumask(cs, partcmd_invalidate,
2428 NULL, &tmp);
2429 } else {
2430 update_parent_effective_cpumask(cs, partcmd_update,
2431 trialcs->effective_xcpus, &tmp);
2432 }
2433 } else if (!cpumask_empty(trialcs->exclusive_cpus)) {
2434 /*
2435 * Use trialcs->effective_cpus as a temp cpumask
2436 */
2437 remote_partition_check(cs, trialcs->effective_xcpus,
2438 trialcs->effective_cpus, &tmp);
2439 }
2440 spin_lock_irq(&callback_lock);
2441 cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
2442 cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
2443 if ((old_prs > 0) && !is_partition_valid(cs))
2444 reset_partition_data(cs);
2445 spin_unlock_irq(&callback_lock);
2446
2447 /*
2448 * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus
2449 * of the subtree when it is a valid partition root or effective_xcpus
2450 * is updated.
2451 */
2452 if (is_partition_valid(cs) || force)
2453 update_cpumasks_hier(cs, &tmp, force);
2454
2455 /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
2456 if (cs->partition_root_state)
2457 update_partition_sd_lb(cs, old_prs);
2458
2459 free_cpumasks(NULL, &tmp);
2460 return 0;
2461 }
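/*
 * Illustrative usage (a sketch; path and CPU numbers are assumed):
 *
 *   # echo "2-3" > /sys/fs/cgroup/grp/cpuset.cpus.exclusive
 *   # cat /sys/fs/cgroup/grp/cpuset.cpus.exclusive.effective
 *   2-3
 *
 * The exclusive CPUs are checked against siblings by validate_change()
 * here and only fully take effect once the cpuset is later turned into
 * a partition root.
 */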
2462
2463 /*
2464 * Migrate memory region from one set of nodes to another. This is
2465 * performed asynchronously as it can be called from the process migration path
2466 * holding locks involved in process management. All mm migrations are
2467 * performed in the queued order and can be waited for by flushing
2468 * cpuset_migrate_mm_wq.
2469 */
2470
2471 struct cpuset_migrate_mm_work {
2472 struct work_struct work;
2473 struct mm_struct *mm;
2474 nodemask_t from;
2475 nodemask_t to;
2476 };
2477
2478 static void cpuset_migrate_mm_workfn(struct work_struct *work)
2479 {
2480 struct cpuset_migrate_mm_work *mwork =
2481 container_of(work, struct cpuset_migrate_mm_work, work);
2482
2483 /* on a wq worker, no need to worry about %current's mems_allowed */
2484 do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
2485 mmput(mwork->mm);
2486 kfree(mwork);
2487 }
2488
2489 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
2490 const nodemask_t *to)
2491 {
2492 struct cpuset_migrate_mm_work *mwork;
2493
2494 if (nodes_equal(*from, *to)) {
2495 mmput(mm);
2496 return;
2497 }
2498
2499 mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
2500 if (mwork) {
2501 mwork->mm = mm;
2502 mwork->from = *from;
2503 mwork->to = *to;
2504 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
2505 queue_work(cpuset_migrate_mm_wq, &mwork->work);
2506 } else {
2507 mmput(mm);
2508 }
2509 }
2510
2511 static void cpuset_post_attach(void)
2512 {
2513 flush_workqueue(cpuset_migrate_mm_wq);
2514 }
2515
2516 /*
2517 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
2518 * @tsk: the task to change
2519 * @newmems: new nodes that the task will be set
2520 *
2521 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
2522 * and rebind the task's mempolicy, if any. If the task is allocating in
2523 * parallel, it might temporarily see an empty intersection, which results in
2524 * a seqlock check and retry before OOM or allocation failure.
2525 */
2526 static void cpuset_change_task_nodemask(struct task_struct *tsk,
2527 nodemask_t *newmems)
2528 {
2529 task_lock(tsk);
2530
2531 local_irq_disable();
2532 write_seqcount_begin(&tsk->mems_allowed_seq);
2533
2534 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
2535 mpol_rebind_task(tsk, newmems);
2536 tsk->mems_allowed = *newmems;
2537
2538 write_seqcount_end(&tsk->mems_allowed_seq);
2539 local_irq_enable();
2540
2541 task_unlock(tsk);
2542 }
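/*
 * Reader side, for context (a sketch, not part of this function):
 * allocators that walk mems_allowed typically pair with the writer
 * above roughly like
 *
 *   do {
 *       seq = read_seqcount_begin(&tsk->mems_allowed_seq);
 *       ...use tsk->mems_allowed...
 *   } while (read_seqcount_retry(&tsk->mems_allowed_seq, seq));
 *
 * so a concurrent update at worst forces a retry rather than letting
 * the reader act on a half-updated nodemask.
 */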
2543
2544 static void *cpuset_being_rebound;
2545
2546 /**
2547 * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
2548 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
2549 *
2550 * Iterate through each task of @cs updating its mems_allowed to the
2551 * effective cpuset's. As this function is called with cpuset_mutex held,
2552 * cpuset membership stays stable.
2553 */
2554 void cpuset_update_tasks_nodemask(struct cpuset *cs)
2555 {
2556 static nodemask_t newmems; /* protected by cpuset_mutex */
2557 struct css_task_iter it;
2558 struct task_struct *task;
2559
2560 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
2561
2562 guarantee_online_mems(cs, &newmems);
2563
2564 /*
2565 * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
2566 * take while holding tasklist_lock. Forks can happen - the
2567 * mpol_dup() cpuset_being_rebound check will catch such forks,
2568 * and rebind their vma mempolicies too. Because we still hold
2569 * the global cpuset_mutex, we know that no other rebind effort
2570 * will be contending for the global variable cpuset_being_rebound.
2571 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
2572 * is idempotent. Also migrate pages in each mm to new nodes.
2573 */
2574 css_task_iter_start(&cs->css, 0, &it);
2575 while ((task = css_task_iter_next(&it))) {
2576 struct mm_struct *mm;
2577 bool migrate;
2578
2579 cpuset_change_task_nodemask(task, &newmems);
2580
2581 mm = get_task_mm(task);
2582 if (!mm)
2583 continue;
2584
2585 migrate = is_memory_migrate(cs);
2586
2587 mpol_rebind_mm(mm, &cs->mems_allowed);
2588 if (migrate)
2589 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
2590 else
2591 mmput(mm);
2592 }
2593 css_task_iter_end(&it);
2594
2595 /*
2596 * All the tasks' nodemasks have been updated, update
2597 * cs->old_mems_allowed.
2598 */
2599 cs->old_mems_allowed = newmems;
2600
2601 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
2602 cpuset_being_rebound = NULL;
2603 }
2604
2605 /*
2606 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
2607 * @cs: the cpuset to consider
2608 * @new_mems: a temp variable for calculating new effective_mems
2609 *
2610 * When configured nodemask is changed, the effective nodemasks of this cpuset
2611 * and all its descendants need to be updated.
2612 *
2613 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
2614 *
2615 * Called with cpuset_mutex held
2616 */
2617 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
2618 {
2619 struct cpuset *cp;
2620 struct cgroup_subsys_state *pos_css;
2621
2622 rcu_read_lock();
2623 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
2624 struct cpuset *parent = parent_cs(cp);
2625
2626 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
2627
2628 /*
2629 * If it becomes empty, inherit the effective mask of the
2630 * parent, which is guaranteed to have some MEMs.
2631 */
2632 if (is_in_v2_mode() && nodes_empty(*new_mems))
2633 *new_mems = parent->effective_mems;
2634
2635 /* Skip the whole subtree if the nodemask remains the same. */
2636 if (nodes_equal(*new_mems, cp->effective_mems)) {
2637 pos_css = css_rightmost_descendant(pos_css);
2638 continue;
2639 }
2640
2641 if (!css_tryget_online(&cp->css))
2642 continue;
2643 rcu_read_unlock();
2644
2645 spin_lock_irq(&callback_lock);
2646 cp->effective_mems = *new_mems;
2647 spin_unlock_irq(&callback_lock);
2648
2649 WARN_ON(!is_in_v2_mode() &&
2650 !nodes_equal(cp->mems_allowed, cp->effective_mems));
2651
2652 cpuset_update_tasks_nodemask(cp);
2653
2654 rcu_read_lock();
2655 css_put(&cp->css);
2656 }
2657 rcu_read_unlock();
2658 }
2659
2660 /*
2661 * Handle user request to change the 'mems' memory placement
2662 * of a cpuset. Needs to validate the request, update the
2663 * cpuset's mems_allowed, and for each task in the cpuset,
2664 * update mems_allowed, rebind the task's mempolicy and any vma
2665 * mempolicies, and, if the cpuset is marked 'memory_migrate',
2666 * migrate the task's pages to the new memory.
2667 *
2668 * Call with cpuset_mutex held. May take callback_lock during call.
2669 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
2670 * lock each such task's mm->mmap_lock, scan its vmas and rebind
2671 * their mempolicies to the cpusets new mems_allowed.
2672 */
2673 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
2674 const char *buf)
2675 {
2676 int retval;
2677
2678 /*
2679 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
2680 * it's read-only
2681 */
2682 if (cs == &top_cpuset) {
2683 retval = -EACCES;
2684 goto done;
2685 }
2686
2687 /*
2688 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
2689 * Since nodelist_parse() fails on an empty mask, we special case
2690 * that parsing. The validate_change() call ensures that cpusets
2691 * with tasks have memory.
2692 */
2693 if (!*buf) {
2694 nodes_clear(trialcs->mems_allowed);
2695 } else {
2696 retval = nodelist_parse(buf, trialcs->mems_allowed);
2697 if (retval < 0)
2698 goto done;
2699
2700 if (!nodes_subset(trialcs->mems_allowed,
2701 top_cpuset.mems_allowed)) {
2702 retval = -EINVAL;
2703 goto done;
2704 }
2705 }
2706
2707 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
2708 retval = 0; /* Too easy - nothing to do */
2709 goto done;
2710 }
2711 retval = validate_change(cs, trialcs);
2712 if (retval < 0)
2713 goto done;
2714
2715 check_insane_mems_config(&trialcs->mems_allowed);
2716
2717 spin_lock_irq(&callback_lock);
2718 cs->mems_allowed = trialcs->mems_allowed;
2719 spin_unlock_irq(&callback_lock);
2720
2721 /* use trialcs->mems_allowed as a temp variable */
2722 update_nodemasks_hier(cs, &trialcs->mems_allowed);
2723 done:
2724 return retval;
2725 }
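/*
 * Illustrative usage (a sketch; path and node numbers are assumed):
 *
 *   # echo "0-1" > /sys/fs/cgroup/grp/cpuset.mems
 *   # cat /sys/fs/cgroup/grp/cpuset.mems.effective
 *   0-1
 *
 * If memory_migrate is enabled (the default on the v2 hierarchy, see
 * cpuset_css_alloc()), the pages of tasks in the cpuset are migrated
 * to the new nodes asynchronously via cpuset_migrate_mm_wq.
 */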
2726
2727 bool current_cpuset_is_being_rebound(void)
2728 {
2729 bool ret;
2730
2731 rcu_read_lock();
2732 ret = task_cs(current) == cpuset_being_rebound;
2733 rcu_read_unlock();
2734
2735 return ret;
2736 }
2737
2738 /*
2739 * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag
2740 * bit: the bit to update (see cpuset_flagbits_t)
2741 * cs: the cpuset to update
2742 * turning_on: whether the flag is being set or cleared
2743 *
2744 * Call with cpuset_mutex held.
2745 */
2746
2747 int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
2748 int turning_on)
2749 {
2750 struct cpuset *trialcs;
2751 int balance_flag_changed;
2752 int spread_flag_changed;
2753 int err;
2754
2755 trialcs = alloc_trial_cpuset(cs);
2756 if (!trialcs)
2757 return -ENOMEM;
2758
2759 if (turning_on)
2760 set_bit(bit, &trialcs->flags);
2761 else
2762 clear_bit(bit, &trialcs->flags);
2763
2764 err = validate_change(cs, trialcs);
2765 if (err < 0)
2766 goto out;
2767
2768 balance_flag_changed = (is_sched_load_balance(cs) !=
2769 is_sched_load_balance(trialcs));
2770
2771 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
2772 || (is_spread_page(cs) != is_spread_page(trialcs)));
2773
2774 spin_lock_irq(&callback_lock);
2775 cs->flags = trialcs->flags;
2776 spin_unlock_irq(&callback_lock);
2777
2778 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
2779 if (cpuset_v2())
2780 cpuset_force_rebuild();
2781 else
2782 rebuild_sched_domains_locked();
2783 }
2784
2785 if (spread_flag_changed)
2786 cpuset1_update_tasks_flags(cs);
2787 out:
2788 free_cpuset(trialcs);
2789 return err;
2790 }
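/*
 * Illustrative usage (a sketch; legacy (v1) hierarchy and the v1 file
 * name exposed by cpuset-v1.c are assumed):
 *
 *   # echo 0 > /sys/fs/cgroup/cpuset/grp/cpuset.sched_load_balance
 *
 * ends up here with bit == CS_SCHED_LOAD_BALANCE and turning_on == 0.
 * A change in the balance flag triggers a sched domain rebuild, while
 * a change in the spread flags only updates per-task spread state.
 */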
2791
2792 /**
2793 * update_prstate - update partition_root_state
2794 * @cs: the cpuset to update
2795 * @new_prs: new partition root state
2796 * Return: 0 if successful, != 0 if error
2797 *
2798 * Call with cpuset_mutex held.
2799 */
2800 static int update_prstate(struct cpuset *cs, int new_prs)
2801 {
2802 int err = PERR_NONE, old_prs = cs->partition_root_state;
2803 struct cpuset *parent = parent_cs(cs);
2804 struct tmpmasks tmpmask;
2805 bool new_xcpus_state = false;
2806
2807 if (old_prs == new_prs)
2808 return 0;
2809
2810 /*
2811 * Treat a previously invalid partition root as if it is a "member".
2812 */
2813 if (new_prs && is_prs_invalid(old_prs))
2814 old_prs = PRS_MEMBER;
2815
2816 if (alloc_cpumasks(NULL, &tmpmask))
2817 return -ENOMEM;
2818
2819 /*
2820 * Set up effective_xcpus if not properly set yet; it will be cleared
2821 * later if the partition becomes invalid.
2822 */
2823 if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
2824 spin_lock_irq(&callback_lock);
2825 cpumask_and(cs->effective_xcpus,
2826 cs->cpus_allowed, parent->effective_xcpus);
2827 spin_unlock_irq(&callback_lock);
2828 }
2829
2830 err = update_partition_exclusive(cs, new_prs);
2831 if (err)
2832 goto out;
2833
2834 if (!old_prs) {
2835 /*
2836 * cpus_allowed and exclusive_cpus cannot both be empty.
2837 */
2838 if (xcpus_empty(cs)) {
2839 err = PERR_CPUSEMPTY;
2840 goto out;
2841 }
2842
2843 /*
2844 * If the parent is a valid partition, enable a local partition.
2845 * Otherwise, enable a remote partition.
2846 */
2847 if (is_partition_valid(parent)) {
2848 enum partition_cmd cmd = (new_prs == PRS_ROOT)
2849 ? partcmd_enable : partcmd_enablei;
2850
2851 err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
2852 } else {
2853 err = remote_partition_enable(cs, new_prs, &tmpmask);
2854 }
2855 } else if (old_prs && new_prs) {
2856 /*
2857 * A change in load balance state only, no change in cpumasks.
2858 */
2859 new_xcpus_state = true;
2860 } else {
2861 /*
2862 * Switching back to member is always allowed even if it
2863 * disables child partitions.
2864 */
2865 if (is_remote_partition(cs))
2866 remote_partition_disable(cs, &tmpmask);
2867 else
2868 update_parent_effective_cpumask(cs, partcmd_disable,
2869 NULL, &tmpmask);
2870
2871 /*
2872 * Invalidation of child partitions will be done in
2873 * update_cpumasks_hier().
2874 */
2875 }
2876 out:
2877 /*
2878 * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
2879 * happens.
2880 */
2881 if (err) {
2882 new_prs = -new_prs;
2883 update_partition_exclusive(cs, new_prs);
2884 }
2885
2886 spin_lock_irq(&callback_lock);
2887 cs->partition_root_state = new_prs;
2888 WRITE_ONCE(cs->prs_err, err);
2889 if (!is_partition_valid(cs))
2890 reset_partition_data(cs);
2891 else if (new_xcpus_state)
2892 partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
2893 spin_unlock_irq(&callback_lock);
2894 update_unbound_workqueue_cpumask(new_xcpus_state);
2895
2896 /* Force update if switching back to member */
2897 update_cpumasks_hier(cs, &tmpmask, !new_prs);
2898
2899 /* Update sched domains and load balance flag */
2900 update_partition_sd_lb(cs, old_prs);
2901
2902 notify_partition_change(cs, old_prs);
2903 if (force_sd_rebuild)
2904 rebuild_sched_domains_locked();
2905 free_cpumasks(NULL, &tmpmask);
2906 return 0;
2907 }
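/*
 * Illustrative sketch of the transitions driven from here (the cgroup
 * path is assumed):
 *
 *   # echo root     > .../cpuset.cpus.partition    PRS_MEMBER -> PRS_ROOT
 *   # echo isolated > .../cpuset.cpus.partition    PRS_ROOT -> PRS_ISOLATED
 *   # echo member   > .../cpuset.cpus.partition    back to PRS_MEMBER
 *
 * If enabling fails (err != 0), new_prs is negated above, so the
 * cpuset becomes an invalid partition and the reason is reported by
 * sched_partition_show() as "<type> invalid (<reason>)".
 */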
2908
2909 static struct cpuset *cpuset_attach_old_cs;
2910
2911 /*
2912 * Check to see if a cpuset can accept a new task
2913 * For v1, cpus_allowed and mems_allowed can't be empty.
2914 * For v2, effective_cpus can't be empty.
2915 * Note that in v1, effective_cpus = cpus_allowed.
2916 */
2917 static int cpuset_can_attach_check(struct cpuset *cs)
2918 {
2919 if (cpumask_empty(cs->effective_cpus) ||
2920 (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
2921 return -ENOSPC;
2922 return 0;
2923 }
2924
2925 static void reset_migrate_dl_data(struct cpuset *cs)
2926 {
2927 cs->nr_migrate_dl_tasks = 0;
2928 cs->sum_migrate_dl_bw = 0;
2929 }
2930
2931 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
2932 static int cpuset_can_attach(struct cgroup_taskset *tset)
2933 {
2934 struct cgroup_subsys_state *css;
2935 struct cpuset *cs, *oldcs;
2936 struct task_struct *task;
2937 bool cpus_updated, mems_updated;
2938 int ret;
2939
2940 /* used later by cpuset_attach() */
2941 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
2942 oldcs = cpuset_attach_old_cs;
2943 cs = css_cs(css);
2944
2945 mutex_lock(&cpuset_mutex);
2946
2947 /* Check to see if task is allowed in the cpuset */
2948 ret = cpuset_can_attach_check(cs);
2949 if (ret)
2950 goto out_unlock;
2951
2952 cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
2953 mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
2954
2955 cgroup_taskset_for_each(task, css, tset) {
2956 ret = task_can_attach(task);
2957 if (ret)
2958 goto out_unlock;
2959
2960 /*
2961 * Skip the rights-over-task check in v2 when nothing changes;
2962 * migration permission derives from hierarchy ownership in
2963 * cgroup_procs_write_permission().
2964 */
2965 if (!cpuset_v2() || (cpus_updated || mems_updated)) {
2966 ret = security_task_setscheduler(task);
2967 if (ret)
2968 goto out_unlock;
2969 }
2970
2971 if (dl_task(task)) {
2972 cs->nr_migrate_dl_tasks++;
2973 cs->sum_migrate_dl_bw += task->dl.dl_bw;
2974 }
2975 }
2976
2977 if (!cs->nr_migrate_dl_tasks)
2978 goto out_success;
2979
2980 if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
2981 int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
2982
2983 if (unlikely(cpu >= nr_cpu_ids)) {
2984 reset_migrate_dl_data(cs);
2985 ret = -EINVAL;
2986 goto out_unlock;
2987 }
2988
2989 ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
2990 if (ret) {
2991 reset_migrate_dl_data(cs);
2992 goto out_unlock;
2993 }
2994 }
2995
2996 out_success:
2997 /*
2998 * Mark attach is in progress. This makes validate_change() fail
2999 * changes which zero cpus/mems_allowed.
3000 */
3001 cs->attach_in_progress++;
3002 out_unlock:
3003 mutex_unlock(&cpuset_mutex);
3004 return ret;
3005 }
3006
3007 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
3008 {
3009 struct cgroup_subsys_state *css;
3010 struct cpuset *cs;
3011
3012 cgroup_taskset_first(tset, &css);
3013 cs = css_cs(css);
3014
3015 mutex_lock(&cpuset_mutex);
3016 dec_attach_in_progress_locked(cs);
3017
3018 if (cs->nr_migrate_dl_tasks) {
3019 int cpu = cpumask_any(cs->effective_cpus);
3020
3021 dl_bw_free(cpu, cs->sum_migrate_dl_bw);
3022 reset_migrate_dl_data(cs);
3023 }
3024
3025 mutex_unlock(&cpuset_mutex);
3026 }
3027
3028 /*
3029 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
3030 * but we can't allocate it dynamically there. Define it globally and
3031 * allocate it in cpuset_init().
3032 */
3033 static cpumask_var_t cpus_attach;
3034 static nodemask_t cpuset_attach_nodemask_to;
3035
3036 static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
3037 {
3038 lockdep_assert_held(&cpuset_mutex);
3039
3040 if (cs != &top_cpuset)
3041 guarantee_online_cpus(task, cpus_attach);
3042 else
3043 cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
3044 subpartitions_cpus);
3045 /*
3046 * can_attach beforehand should guarantee that this doesn't
3047 * fail. TODO: have a better way to handle failure here
3048 */
3049 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
3050
3051 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
3052 cpuset1_update_task_spread_flags(cs, task);
3053 }
3054
3055 static void cpuset_attach(struct cgroup_taskset *tset)
3056 {
3057 struct task_struct *task;
3058 struct task_struct *leader;
3059 struct cgroup_subsys_state *css;
3060 struct cpuset *cs;
3061 struct cpuset *oldcs = cpuset_attach_old_cs;
3062 bool cpus_updated, mems_updated;
3063
3064 cgroup_taskset_first(tset, &css);
3065 cs = css_cs(css);
3066
3067 lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
3068 mutex_lock(&cpuset_mutex);
3069 cpus_updated = !cpumask_equal(cs->effective_cpus,
3070 oldcs->effective_cpus);
3071 mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
3072
3073 /*
3074 * In the default hierarchy, enabling cpuset in the child cgroups
3075 * will trigger a number of cpuset_attach() calls with no change
3076 * in effective cpus and mems. In that case, we can optimize out
3077 * by skipping the task iteration and update.
3078 */
3079 if (cpuset_v2() && !cpus_updated && !mems_updated) {
3080 cpuset_attach_nodemask_to = cs->effective_mems;
3081 goto out;
3082 }
3083
3084 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
3085
3086 cgroup_taskset_for_each(task, css, tset)
3087 cpuset_attach_task(cs, task);
3088
3089 /*
3090 * Change mm for all threadgroup leaders. This is expensive and may
3091 * sleep and should be moved outside migration path proper. Skip it
3092 * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
3093 * not set.
3094 */
3095 cpuset_attach_nodemask_to = cs->effective_mems;
3096 if (!is_memory_migrate(cs) && !mems_updated)
3097 goto out;
3098
3099 cgroup_taskset_for_each_leader(leader, css, tset) {
3100 struct mm_struct *mm = get_task_mm(leader);
3101
3102 if (mm) {
3103 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
3104
3105 /*
3106 * old_mems_allowed is the same as mems_allowed
3107 * here, except if this task is being moved
3108 * automatically due to hotplug. In that case
3109 * @mems_allowed has been updated and is empty, so
3110 * @old_mems_allowed is the right nodemask to
3111 * migrate the mm from.
3112 */
3113 if (is_memory_migrate(cs))
3114 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
3115 &cpuset_attach_nodemask_to);
3116 else
3117 mmput(mm);
3118 }
3119 }
3120
3121 out:
3122 cs->old_mems_allowed = cpuset_attach_nodemask_to;
3123
3124 if (cs->nr_migrate_dl_tasks) {
3125 cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
3126 oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
3127 reset_migrate_dl_data(cs);
3128 }
3129
3130 dec_attach_in_progress_locked(cs);
3131
3132 mutex_unlock(&cpuset_mutex);
3133 }
3134
3135 /*
3136 * Common handling for a write to a "cpus" or "mems" file.
3137 */
3138 ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
3139 char *buf, size_t nbytes, loff_t off)
3140 {
3141 struct cpuset *cs = css_cs(of_css(of));
3142 struct cpuset *trialcs;
3143 int retval = -ENODEV;
3144
3145 buf = strstrip(buf);
3146 cpus_read_lock();
3147 mutex_lock(&cpuset_mutex);
3148 if (!is_cpuset_online(cs))
3149 goto out_unlock;
3150
3151 trialcs = alloc_trial_cpuset(cs);
3152 if (!trialcs) {
3153 retval = -ENOMEM;
3154 goto out_unlock;
3155 }
3156
3157 switch (of_cft(of)->private) {
3158 case FILE_CPULIST:
3159 retval = update_cpumask(cs, trialcs, buf);
3160 break;
3161 case FILE_EXCLUSIVE_CPULIST:
3162 retval = update_exclusive_cpumask(cs, trialcs, buf);
3163 break;
3164 case FILE_MEMLIST:
3165 retval = update_nodemask(cs, trialcs, buf);
3166 break;
3167 default:
3168 retval = -EINVAL;
3169 break;
3170 }
3171
3172 free_cpuset(trialcs);
3173 if (force_sd_rebuild)
3174 rebuild_sched_domains_locked();
3175 out_unlock:
3176 mutex_unlock(&cpuset_mutex);
3177 cpus_read_unlock();
3178 flush_workqueue(cpuset_migrate_mm_wq);
3179 return retval ?: nbytes;
3180 }
3181
3182 /*
3183 * These ascii lists should be read in a single call, by using a user
3184 * buffer large enough to hold the entire map. If read in smaller
3185 * chunks, there is no guarantee of atomicity. Since the display format
3186 * used, list of ranges of sequential numbers, is variable length,
3187 * and since these maps can change value dynamically, one could read
3188 * gibberish by doing partial reads while a list was changing.
3189 */
3190 int cpuset_common_seq_show(struct seq_file *sf, void *v)
3191 {
3192 struct cpuset *cs = css_cs(seq_css(sf));
3193 cpuset_filetype_t type = seq_cft(sf)->private;
3194 int ret = 0;
3195
3196 spin_lock_irq(&callback_lock);
3197
3198 switch (type) {
3199 case FILE_CPULIST:
3200 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
3201 break;
3202 case FILE_MEMLIST:
3203 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
3204 break;
3205 case FILE_EFFECTIVE_CPULIST:
3206 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
3207 break;
3208 case FILE_EFFECTIVE_MEMLIST:
3209 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
3210 break;
3211 case FILE_EXCLUSIVE_CPULIST:
3212 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
3213 break;
3214 case FILE_EFFECTIVE_XCPULIST:
3215 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
3216 break;
3217 case FILE_SUBPARTS_CPULIST:
3218 seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
3219 break;
3220 case FILE_ISOLATED_CPULIST:
3221 seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
3222 break;
3223 default:
3224 ret = -EINVAL;
3225 }
3226
3227 spin_unlock_irq(&callback_lock);
3228 return ret;
3229 }
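/*
 * Example output (a sketch; the mask values are assumed): reading
 * these files yields the "%*pbl" range format, e.g.
 *
 *   # cat cpuset.cpus.effective
 *   0-3,8-11
 *   # cat cpuset.mems.effective
 *   0-1
 *
 * An empty mask is printed as a blank line.
 */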
3230
3231 static int sched_partition_show(struct seq_file *seq, void *v)
3232 {
3233 struct cpuset *cs = css_cs(seq_css(seq));
3234 const char *err, *type = NULL;
3235
3236 switch (cs->partition_root_state) {
3237 case PRS_ROOT:
3238 seq_puts(seq, "root\n");
3239 break;
3240 case PRS_ISOLATED:
3241 seq_puts(seq, "isolated\n");
3242 break;
3243 case PRS_MEMBER:
3244 seq_puts(seq, "member\n");
3245 break;
3246 case PRS_INVALID_ROOT:
3247 type = "root";
3248 fallthrough;
3249 case PRS_INVALID_ISOLATED:
3250 if (!type)
3251 type = "isolated";
3252 err = perr_strings[READ_ONCE(cs->prs_err)];
3253 if (err)
3254 seq_printf(seq, "%s invalid (%s)\n", type, err);
3255 else
3256 seq_printf(seq, "%s invalid\n", type);
3257 break;
3258 }
3259 return 0;
3260 }
3261
3262 static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
3263 size_t nbytes, loff_t off)
3264 {
3265 struct cpuset *cs = css_cs(of_css(of));
3266 int val;
3267 int retval = -ENODEV;
3268
3269 buf = strstrip(buf);
3270
3271 if (!strcmp(buf, "root"))
3272 val = PRS_ROOT;
3273 else if (!strcmp(buf, "member"))
3274 val = PRS_MEMBER;
3275 else if (!strcmp(buf, "isolated"))
3276 val = PRS_ISOLATED;
3277 else
3278 return -EINVAL;
3279
3280 css_get(&cs->css);
3281 cpus_read_lock();
3282 mutex_lock(&cpuset_mutex);
3283 if (!is_cpuset_online(cs))
3284 goto out_unlock;
3285
3286 retval = update_prstate(cs, val);
3287 out_unlock:
3288 mutex_unlock(&cpuset_mutex);
3289 cpus_read_unlock();
3290 css_put(&cs->css);
3291 return retval ?: nbytes;
3292 }
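/*
 * Illustrative round trip (a sketch): only the exact strings "root",
 * "member" and "isolated" are accepted above; anything else returns
 * -EINVAL before any locks are taken.
 *
 *   # echo isolated > cpuset.cpus.partition
 *   # cat cpuset.cpus.partition
 *   isolated
 */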
3293
3294 /*
3295 * This is currently a minimal set for the default hierarchy. It can be
3296 * expanded later on by migrating more features and control files from v1.
3297 */
3298 static struct cftype dfl_files[] = {
3299 {
3300 .name = "cpus",
3301 .seq_show = cpuset_common_seq_show,
3302 .write = cpuset_write_resmask,
3303 .max_write_len = (100U + 6 * NR_CPUS),
3304 .private = FILE_CPULIST,
3305 .flags = CFTYPE_NOT_ON_ROOT,
3306 },
3307
3308 {
3309 .name = "mems",
3310 .seq_show = cpuset_common_seq_show,
3311 .write = cpuset_write_resmask,
3312 .max_write_len = (100U + 6 * MAX_NUMNODES),
3313 .private = FILE_MEMLIST,
3314 .flags = CFTYPE_NOT_ON_ROOT,
3315 },
3316
3317 {
3318 .name = "cpus.effective",
3319 .seq_show = cpuset_common_seq_show,
3320 .private = FILE_EFFECTIVE_CPULIST,
3321 },
3322
3323 {
3324 .name = "mems.effective",
3325 .seq_show = cpuset_common_seq_show,
3326 .private = FILE_EFFECTIVE_MEMLIST,
3327 },
3328
3329 {
3330 .name = "cpus.partition",
3331 .seq_show = sched_partition_show,
3332 .write = sched_partition_write,
3333 .private = FILE_PARTITION_ROOT,
3334 .flags = CFTYPE_NOT_ON_ROOT,
3335 .file_offset = offsetof(struct cpuset, partition_file),
3336 },
3337
3338 {
3339 .name = "cpus.exclusive",
3340 .seq_show = cpuset_common_seq_show,
3341 .write = cpuset_write_resmask,
3342 .max_write_len = (100U + 6 * NR_CPUS),
3343 .private = FILE_EXCLUSIVE_CPULIST,
3344 .flags = CFTYPE_NOT_ON_ROOT,
3345 },
3346
3347 {
3348 .name = "cpus.exclusive.effective",
3349 .seq_show = cpuset_common_seq_show,
3350 .private = FILE_EFFECTIVE_XCPULIST,
3351 .flags = CFTYPE_NOT_ON_ROOT,
3352 },
3353
3354 {
3355 .name = "cpus.subpartitions",
3356 .seq_show = cpuset_common_seq_show,
3357 .private = FILE_SUBPARTS_CPULIST,
3358 .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
3359 },
3360
3361 {
3362 .name = "cpus.isolated",
3363 .seq_show = cpuset_common_seq_show,
3364 .private = FILE_ISOLATED_CPULIST,
3365 .flags = CFTYPE_ONLY_ON_ROOT,
3366 },
3367
3368 { } /* terminate */
3369 };
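/*
 * As seen from userspace (a sketch), these entries appear in a v2
 * cgroup directory with the subsystem prefix, e.g. cpuset.cpus,
 * cpuset.mems, cpuset.cpus.effective, cpuset.mems.effective,
 * cpuset.cpus.partition, cpuset.cpus.exclusive and
 * cpuset.cpus.exclusive.effective. cpuset.cpus.subpartitions is
 * root-only and debug; cpuset.cpus.isolated is root-only.
 */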
3370
3371
3372 /**
3373 * cpuset_css_alloc - Allocate a cpuset css
3374 * @parent_css: Parent css of the control group that the new cpuset will be
3375 * part of
3376 * Return: cpuset css on success, -ENOMEM on failure.
3377 *
3378 * Allocate and initialize a new cpuset css for a non-NULL @parent_css;
3379 * return the top cpuset css otherwise.
3380 */
3381 static struct cgroup_subsys_state *
3382 cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
3383 {
3384 struct cpuset *cs;
3385
3386 if (!parent_css)
3387 return &top_cpuset.css;
3388
3389 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
3390 if (!cs)
3391 return ERR_PTR(-ENOMEM);
3392
3393 if (alloc_cpumasks(cs, NULL)) {
3394 kfree(cs);
3395 return ERR_PTR(-ENOMEM);
3396 }
3397
3398 __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
3399 fmeter_init(&cs->fmeter);
3400 cs->relax_domain_level = -1;
3401 INIT_LIST_HEAD(&cs->remote_sibling);
3402
3403 /* Set CS_MEMORY_MIGRATE for default hierarchy */
3404 if (cpuset_v2())
3405 __set_bit(CS_MEMORY_MIGRATE, &cs->flags);
3406
3407 return &cs->css;
3408 }
3409
3410 static int cpuset_css_online(struct cgroup_subsys_state *css)
3411 {
3412 struct cpuset *cs = css_cs(css);
3413 struct cpuset *parent = parent_cs(cs);
3414 struct cpuset *tmp_cs;
3415 struct cgroup_subsys_state *pos_css;
3416
3417 if (!parent)
3418 return 0;
3419
3420 cpus_read_lock();
3421 mutex_lock(&cpuset_mutex);
3422
3423 set_bit(CS_ONLINE, &cs->flags);
3424 if (is_spread_page(parent))
3425 set_bit(CS_SPREAD_PAGE, &cs->flags);
3426 if (is_spread_slab(parent))
3427 set_bit(CS_SPREAD_SLAB, &cs->flags);
3428 /*
3429 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
3430 */
3431 if (cpuset_v2() && !is_sched_load_balance(parent))
3432 clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
3433
3434 cpuset_inc();
3435
3436 spin_lock_irq(&callback_lock);
3437 if (is_in_v2_mode()) {
3438 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
3439 cs->effective_mems = parent->effective_mems;
3440 }
3441 spin_unlock_irq(&callback_lock);
3442
3443 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
3444 goto out_unlock;
3445
3446 /*
3447 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
3448 * set. This flag handling is implemented in cgroup core for
3449 * historical reasons - the flag may be specified during mount.
3450 *
3451 * Currently, if any sibling cpusets have exclusive cpus or mem, we
3452 * refuse to clone the configuration - thereby refusing the task to
3453 * be entered, and as a result refusing the sys_unshare() or
3454 * clone() which initiated it. If this becomes a problem for some
3455 * users who wish to allow that scenario, then this could be
3456 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
3457 * (and likewise for mems) to the new cgroup.
3458 */
3459 rcu_read_lock();
3460 cpuset_for_each_child(tmp_cs, pos_css, parent) {
3461 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
3462 rcu_read_unlock();
3463 goto out_unlock;
3464 }
3465 }
3466 rcu_read_unlock();
3467
3468 spin_lock_irq(&callback_lock);
3469 cs->mems_allowed = parent->mems_allowed;
3470 cs->effective_mems = parent->mems_allowed;
3471 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
3472 cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
3473 spin_unlock_irq(&callback_lock);
3474 out_unlock:
3475 mutex_unlock(&cpuset_mutex);
3476 cpus_read_unlock();
3477 return 0;
3478 }
3479
3480 /*
3481 * If the cpuset being removed has its flag 'sched_load_balance'
3482 * enabled, then simulate turning sched_load_balance off, which
3483 * will call rebuild_sched_domains_locked(). That is not needed
3484 * in the default hierarchy where only changes in partition
3485 * will cause repartitioning.
3486 *
3487 * If the cpuset has the 'sched.partition' flag enabled, simulate
3488 * turning 'sched.partition' off.
3489 */
3490
3491 static void cpuset_css_offline(struct cgroup_subsys_state *css)
3492 {
3493 struct cpuset *cs = css_cs(css);
3494
3495 cpus_read_lock();
3496 mutex_lock(&cpuset_mutex);
3497
3498 if (!cpuset_v2() && is_sched_load_balance(cs))
3499 cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
3500
3501 cpuset_dec();
3502 clear_bit(CS_ONLINE, &cs->flags);
3503
3504 mutex_unlock(&cpuset_mutex);
3505 cpus_read_unlock();
3506 }
3507
3508 static void cpuset_css_killed(struct cgroup_subsys_state *css)
3509 {
3510 struct cpuset *cs = css_cs(css);
3511
3512 cpus_read_lock();
3513 mutex_lock(&cpuset_mutex);
3514
3515 /* Reset valid partition back to member */
3516 if (is_partition_valid(cs))
3517 update_prstate(cs, PRS_MEMBER);
3518
3519 mutex_unlock(&cpuset_mutex);
3520 cpus_read_unlock();
3522 }
3523
3524 static void cpuset_css_free(struct cgroup_subsys_state *css)
3525 {
3526 struct cpuset *cs = css_cs(css);
3527
3528 free_cpuset(cs);
3529 }
3530
3531 static void cpuset_bind(struct cgroup_subsys_state *root_css)
3532 {
3533 mutex_lock(&cpuset_mutex);
3534 spin_lock_irq(&callback_lock);
3535
3536 if (is_in_v2_mode()) {
3537 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
3538 cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
3539 top_cpuset.mems_allowed = node_possible_map;
3540 } else {
3541 cpumask_copy(top_cpuset.cpus_allowed,
3542 top_cpuset.effective_cpus);
3543 top_cpuset.mems_allowed = top_cpuset.effective_mems;
3544 }
3545
3546 spin_unlock_irq(&callback_lock);
3547 mutex_unlock(&cpuset_mutex);
3548 }
3549
3550 /*
3551 * In case the child is cloned into a cpuset different from its parent,
3552 * additional checks are done to see if the move is allowed.
3553 */
3554 static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
3555 {
3556 struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
3557 bool same_cs;
3558 int ret;
3559
3560 rcu_read_lock();
3561 same_cs = (cs == task_cs(current));
3562 rcu_read_unlock();
3563
3564 if (same_cs)
3565 return 0;
3566
3567 lockdep_assert_held(&cgroup_mutex);
3568 mutex_lock(&cpuset_mutex);
3569
3570 /* Check to see if task is allowed in the cpuset */
3571 ret = cpuset_can_attach_check(cs);
3572 if (ret)
3573 goto out_unlock;
3574
3575 ret = task_can_attach(task);
3576 if (ret)
3577 goto out_unlock;
3578
3579 ret = security_task_setscheduler(task);
3580 if (ret)
3581 goto out_unlock;
3582
3583 /*
3584 * Mark attach is in progress. This makes validate_change() fail
3585 * changes which zero cpus/mems_allowed.
3586 */
3587 cs->attach_in_progress++;
3588 out_unlock:
3589 mutex_unlock(&cpuset_mutex);
3590 return ret;
3591 }
3592
3593 static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
3594 {
3595 struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
3596 bool same_cs;
3597
3598 rcu_read_lock();
3599 same_cs = (cs == task_cs(current));
3600 rcu_read_unlock();
3601
3602 if (same_cs)
3603 return;
3604
3605 dec_attach_in_progress(cs);
3606 }
3607
3608 /*
3609  * Make sure the new task conforms to the current state of its parent,
3610 * which could have been changed by cpuset just after it inherits the
3611 * state from the parent and before it sits on the cgroup's task list.
3612 */
3613 static void cpuset_fork(struct task_struct *task)
3614 {
3615 struct cpuset *cs;
3616 bool same_cs;
3617
3618 rcu_read_lock();
3619 cs = task_cs(task);
3620 same_cs = (cs == task_cs(current));
3621 rcu_read_unlock();
3622
3623 if (same_cs) {
3624 if (cs == &top_cpuset)
3625 return;
3626
3627 set_cpus_allowed_ptr(task, current->cpus_ptr);
3628 task->mems_allowed = current->mems_allowed;
3629 return;
3630 }
3631
3632 /* CLONE_INTO_CGROUP */
3633 mutex_lock(&cpuset_mutex);
3634 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
3635 cpuset_attach_task(cs, task);
3636
3637 dec_attach_in_progress_locked(cs);
3638 mutex_unlock(&cpuset_mutex);
3639 }
3640
3641 struct cgroup_subsys cpuset_cgrp_subsys = {
3642 .css_alloc = cpuset_css_alloc,
3643 .css_online = cpuset_css_online,
3644 .css_offline = cpuset_css_offline,
3645 .css_killed = cpuset_css_killed,
3646 .css_free = cpuset_css_free,
3647 .can_attach = cpuset_can_attach,
3648 .cancel_attach = cpuset_cancel_attach,
3649 .attach = cpuset_attach,
3650 .post_attach = cpuset_post_attach,
3651 .bind = cpuset_bind,
3652 .can_fork = cpuset_can_fork,
3653 .cancel_fork = cpuset_cancel_fork,
3654 .fork = cpuset_fork,
3655 #ifdef CONFIG_CPUSETS_V1
3656 .legacy_cftypes = cpuset1_files,
3657 #endif
3658 .dfl_cftypes = dfl_files,
3659 .early_init = true,
3660 .threaded = true,
3661 };
3662
3663 /**
3664 * cpuset_init - initialize cpusets at system boot
3665 *
3666 * Description: Initialize top_cpuset
3667 **/
3668
3669 int __init cpuset_init(void)
3670 {
3671 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
3672 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
3673 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
3674 BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
3675 BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
3676 BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
3677
3678 cpumask_setall(top_cpuset.cpus_allowed);
3679 nodes_setall(top_cpuset.mems_allowed);
3680 cpumask_setall(top_cpuset.effective_cpus);
3681 cpumask_setall(top_cpuset.effective_xcpus);
3682 cpumask_setall(top_cpuset.exclusive_cpus);
3683 nodes_setall(top_cpuset.effective_mems);
3684
3685 fmeter_init(&top_cpuset.fmeter);
3686 INIT_LIST_HEAD(&remote_children);
3687
3688 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
3689
3690 have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN);
3691 if (have_boot_isolcpus) {
3692 BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL));
3693 cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN));
3694 cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus);
3695 }
3696
3697 return 0;
3698 }
3699
3700 static void
3701 hotplug_update_tasks(struct cpuset *cs,
3702 struct cpumask *new_cpus, nodemask_t *new_mems,
3703 bool cpus_updated, bool mems_updated)
3704 {
3705 /* A partition root is allowed to have empty effective cpus */
3706 if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
3707 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
3708 if (nodes_empty(*new_mems))
3709 *new_mems = parent_cs(cs)->effective_mems;
3710
3711 spin_lock_irq(&callback_lock);
3712 cpumask_copy(cs->effective_cpus, new_cpus);
3713 cs->effective_mems = *new_mems;
3714 spin_unlock_irq(&callback_lock);
3715
3716 if (cpus_updated)
3717 cpuset_update_tasks_cpumask(cs, new_cpus);
3718 if (mems_updated)
3719 cpuset_update_tasks_nodemask(cs);
3720 }
3721
3722 void cpuset_force_rebuild(void)
3723 {
3724 force_sd_rebuild = true;
3725 }
3726
3727 /**
3728 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
3729 * @cs: cpuset in interest
3730 * @tmp: the tmpmasks structure pointer
3731 *
3732 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
3733 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
3734 * all its tasks are moved to the nearest ancestor with both resources.
3735 */
3736 static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
3737 {
3738 static cpumask_t new_cpus;
3739 static nodemask_t new_mems;
3740 bool cpus_updated;
3741 bool mems_updated;
3742 bool remote;
3743 int partcmd = -1;
3744 struct cpuset *parent;
3745 retry:
3746 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
3747
3748 mutex_lock(&cpuset_mutex);
3749
3750 /*
3751 * We have raced with task attaching. We wait until attaching
3752 * is finished, so we won't attach a task to an empty cpuset.
3753 */
3754 if (cs->attach_in_progress) {
3755 mutex_unlock(&cpuset_mutex);
3756 goto retry;
3757 }
3758
3759 parent = parent_cs(cs);
3760 compute_effective_cpumask(&new_cpus, cs, parent);
3761 nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
3762
3763 if (!tmp || !cs->partition_root_state)
3764 goto update_tasks;
3765
3766 /*
3767 * Compute effective_cpus for valid partition root, may invalidate
3768 * child partition roots if necessary.
3769 */
3770 remote = is_remote_partition(cs);
3771 if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
3772 compute_partition_effective_cpumask(cs, &new_cpus);
3773
3774 if (remote && cpumask_empty(&new_cpus) &&
3775 partition_is_populated(cs, NULL)) {
3776 cs->prs_err = PERR_HOTPLUG;
3777 remote_partition_disable(cs, tmp);
3778 compute_effective_cpumask(&new_cpus, cs, parent);
3779 remote = false;
3780 cpuset_force_rebuild();
3781 }
3782
3783 /*
3784 * Force the partition to become invalid if either one of
3785 * the following conditions hold:
3786 * 1) empty effective cpus but not valid empty partition.
3787 * 2) parent is invalid or doesn't grant any cpus to child
3788 * partitions.
3789 */
3790 if (is_local_partition(cs) && (!is_partition_valid(parent) ||
3791 tasks_nocpu_error(parent, cs, &new_cpus)))
3792 partcmd = partcmd_invalidate;
3793 /*
3794 * On the other hand, an invalid partition root may be transitioned
3795 * back to a regular one.
3796 */
3797 else if (is_partition_valid(parent) && is_partition_invalid(cs))
3798 partcmd = partcmd_update;
3799
3800 if (partcmd >= 0) {
3801 update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
3802 if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
3803 compute_partition_effective_cpumask(cs, &new_cpus);
3804 cpuset_force_rebuild();
3805 }
3806 }
3807
3808 update_tasks:
3809 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
3810 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
3811 if (!cpus_updated && !mems_updated)
3812 goto unlock; /* Hotplug doesn't affect this cpuset */
3813
3814 if (mems_updated)
3815 check_insane_mems_config(&new_mems);
3816
3817 if (is_in_v2_mode())
3818 hotplug_update_tasks(cs, &new_cpus, &new_mems,
3819 cpus_updated, mems_updated);
3820 else
3821 cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems,
3822 cpus_updated, mems_updated);
3823
3824 unlock:
3825 mutex_unlock(&cpuset_mutex);
3826 }
3827
3828 /**
3829 * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
3830 *
3831 * This function is called after either CPU or memory configuration has
3832 * changed and updates cpuset accordingly. The top_cpuset is always
3833 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
3834  * order to make cpusets transparent (of no effect) on systems that are
3835 * actively using CPU hotplug but making no active use of cpusets.
3836 *
3837 * Non-root cpusets are only affected by offlining. If any CPUs or memory
3838 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
3839 * all descendants.
3840 *
3841 * Note that CPU offlining during suspend is ignored. We don't modify
3842 * cpusets across suspend/resume cycles at all.
3843 *
3844 * CPU / memory hotplug is handled synchronously.
3845 */
3846 static void cpuset_handle_hotplug(void)
3847 {
3848 static cpumask_t new_cpus;
3849 static nodemask_t new_mems;
3850 bool cpus_updated, mems_updated;
3851 bool on_dfl = is_in_v2_mode();
3852 struct tmpmasks tmp, *ptmp = NULL;
3853
3854 if (on_dfl && !alloc_cpumasks(NULL, &tmp))
3855 ptmp = &tmp;
3856
3857 lockdep_assert_cpus_held();
3858 mutex_lock(&cpuset_mutex);
3859
3860 /* fetch the available cpus/mems and find out which changed how */
3861 cpumask_copy(&new_cpus, cpu_active_mask);
3862 new_mems = node_states[N_MEMORY];
3863
3864 /*
3865 * If subpartitions_cpus is populated, it is likely that the check
3866 * below will produce a false positive on cpus_updated when the cpu
3867 * list isn't changed. It is extra work, but it is better to be safe.
3868 */
3869 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
3870 !cpumask_empty(subpartitions_cpus);
3871 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
3872
3873 /* For v1, synchronize cpus_allowed to cpu_active_mask */
3874 if (cpus_updated) {
3875 cpuset_force_rebuild();
3876 spin_lock_irq(&callback_lock);
3877 if (!on_dfl)
3878 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
3879 /*
3880 * Make sure that CPUs allocated to child partitions
3881 * do not show up in effective_cpus. If no CPU is left,
3882 * we clear the subpartitions_cpus & let the child partitions
3883 * fight for the CPUs again.
3884 */
3885 if (!cpumask_empty(subpartitions_cpus)) {
3886 if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
3887 top_cpuset.nr_subparts = 0;
3888 cpumask_clear(subpartitions_cpus);
3889 } else {
3890 cpumask_andnot(&new_cpus, &new_cpus,
3891 subpartitions_cpus);
3892 }
3893 }
3894 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
3895 spin_unlock_irq(&callback_lock);
3896 /* we don't mess with cpumasks of tasks in top_cpuset */
3897 }
3898
3899 /* synchronize mems_allowed to N_MEMORY */
3900 if (mems_updated) {
3901 spin_lock_irq(&callback_lock);
3902 if (!on_dfl)
3903 top_cpuset.mems_allowed = new_mems;
3904 top_cpuset.effective_mems = new_mems;
3905 spin_unlock_irq(&callback_lock);
3906 cpuset_update_tasks_nodemask(&top_cpuset);
3907 }
3908
3909 mutex_unlock(&cpuset_mutex);
3910
3911 /* if cpus or mems changed, we need to propagate to descendants */
3912 if (cpus_updated || mems_updated) {
3913 struct cpuset *cs;
3914 struct cgroup_subsys_state *pos_css;
3915
3916 rcu_read_lock();
3917 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
3918 if (cs == &top_cpuset || !css_tryget_online(&cs->css))
3919 continue;
3920 rcu_read_unlock();
3921
3922 cpuset_hotplug_update_tasks(cs, ptmp);
3923
3924 rcu_read_lock();
3925 css_put(&cs->css);
3926 }
3927 rcu_read_unlock();
3928 }
3929
3930 /* rebuild sched domains if necessary */
3931 if (force_sd_rebuild)
3932 rebuild_sched_domains_cpuslocked();
3933
3934 free_cpumasks(NULL, ptmp);
3935 }
3936
3937 void cpuset_update_active_cpus(void)
3938 {
3939 /*
3940 	 * We're inside the cpu hotplug critical region, which usually nests
3941 	 * inside cgroup synchronization, so cpus_read_lock is already held.
3942 	 * Hotplug processing is done synchronously here.
3943 */
3944 cpuset_handle_hotplug();
3945 }
3946
3947 /*
3948 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
3949 * Call this routine anytime after node_states[N_MEMORY] changes.
3950 * See cpuset_update_active_cpus() for CPU hotplug handling.
3951 */
3952 static int cpuset_track_online_nodes(struct notifier_block *self,
3953 unsigned long action, void *arg)
3954 {
3955 cpuset_handle_hotplug();
3956 return NOTIFY_OK;
3957 }
3958
3959 /**
3960 * cpuset_init_smp - initialize cpus_allowed
3961 *
3962 * Description: Finish top cpuset after cpu, node maps are initialized
3963 */
3964 void __init cpuset_init_smp(void)
3965 {
3966 /*
3967 	 * cpus_allowed/mems_allowed set to v2 values in the initial
3968 * cpuset_bind() call will be reset to v1 values in another
3969 * cpuset_bind() call when v1 cpuset is mounted.
3970 */
3971 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
3972
3973 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
3974 top_cpuset.effective_mems = node_states[N_MEMORY];
3975
3976 hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
3977
3978 cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
3979 BUG_ON(!cpuset_migrate_mm_wq);
3980 }
3981
3982 /**
3983  * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
3984 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
3985 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
3986 *
3987 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
3988 * attached to the specified @tsk. Guaranteed to return some non-empty
3989 * subset of cpu_online_mask, even if this means going outside the
3990  * task's cpuset, except when the task is in the top cpuset.
3991 **/
3992
3993 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
3994 {
3995 unsigned long flags;
3996 struct cpuset *cs;
3997
3998 spin_lock_irqsave(&callback_lock, flags);
3999 rcu_read_lock();
4000
4001 cs = task_cs(tsk);
4002 if (cs != &top_cpuset)
4003 guarantee_online_cpus(tsk, pmask);
4004 /*
4005 	 * Tasks in the top cpuset won't get updates to their cpumasks
4006 * when a hotplug online/offline event happens. So we include all
4007 * offline cpus in the allowed cpu list.
4008 */
4009 if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
4010 const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
4011
4012 /*
4013 * We first exclude cpus allocated to partitions. If there is no
4014 * allowable online cpu left, we fall back to all possible cpus.
4015 */
4016 cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
4017 if (!cpumask_intersects(pmask, cpu_online_mask))
4018 cpumask_copy(pmask, possible_mask);
4019 }
4020
4021 rcu_read_unlock();
4022 spin_unlock_irqrestore(&callback_lock, flags);
4023 }
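
/*
 * Illustrative sketch (not part of this file; the helper name is
 * hypothetical): one way a caller could use cpuset_cpus_allowed() to
 * confine a task to its cpuset.  Only the called APIs are real.
 */
static void __maybe_unused example_confine_to_cpuset(struct task_struct *tsk)
{
	cpumask_var_t mask;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return;

	/* Guaranteed non-empty, see the description above. */
	cpuset_cpus_allowed(tsk, mask);
	set_cpus_allowed_ptr(tsk, mask);

	free_cpumask_var(mask);
}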
4024
4025 /**
4026 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
4027 * @tsk: pointer to task_struct with which the scheduler is struggling
4028 *
4029 * Description: In the case that the scheduler cannot find an allowed cpu in
4030 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
4031 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
4032 * which will not contain a sane cpumask during cases such as cpu hotplugging.
4033 * This is the absolute last resort for the scheduler and it is only used if
4034 * _every_ other avenue has been traveled.
4035 *
4036 * Returns true if the affinity of @tsk was changed, false otherwise.
4037 **/
4038
4039 bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
4040 {
4041 const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
4042 const struct cpumask *cs_mask;
4043 bool changed = false;
4044
4045 rcu_read_lock();
4046 cs_mask = task_cs(tsk)->cpus_allowed;
4047 if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
4048 do_set_cpus_allowed(tsk, cs_mask);
4049 changed = true;
4050 }
4051 rcu_read_unlock();
4052
4053 /*
4054 * We own tsk->cpus_allowed, nobody can change it under us.
4055 *
4056 * But we used cs && cs->cpus_allowed lockless and thus can
4057 * race with cgroup_attach_task() or update_cpumask() and get
4058 * the wrong tsk->cpus_allowed. However, both cases imply the
4059 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
4060 * which takes task_rq_lock().
4061 *
4062 * If we are called after it dropped the lock we must see all
4063 	 * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
4064 * set any mask even if it is not right from task_cs() pov,
4065 * the pending set_cpus_allowed_ptr() will fix things.
4066 *
4067 	 * select_fallback_rq() will fix things up and set cpu_possible_mask
4068 * if required.
4069 */
4070 return changed;
4071 }
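
/*
 * Illustrative sketch (hypothetical helper, locking elided; this is not
 * the scheduler's actual fallback code): try the task's cpuset first via
 * cpuset_cpus_allowed_fallback() and only then widen to every CPU the
 * task could possibly run on.
 */
static void __maybe_unused example_fallback_affinity(struct task_struct *p)
{
	if (!cpuset_cpus_allowed_fallback(p))
		do_set_cpus_allowed(p, task_cpu_possible_mask(p));
}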
4072
4073 void __init cpuset_init_current_mems_allowed(void)
4074 {
4075 nodes_setall(current->mems_allowed);
4076 }
4077
4078 /**
4079  * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
4080 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
4081 *
4082 * Description: Returns the nodemask_t mems_allowed of the cpuset
4083 * attached to the specified @tsk. Guaranteed to return some non-empty
4084 * subset of node_states[N_MEMORY], even if this means going outside the
4085  * task's cpuset.
4086 **/
4087
4088 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
4089 {
4090 nodemask_t mask;
4091 unsigned long flags;
4092
4093 spin_lock_irqsave(&callback_lock, flags);
4094 rcu_read_lock();
4095 guarantee_online_mems(task_cs(tsk), &mask);
4096 rcu_read_unlock();
4097 spin_unlock_irqrestore(&callback_lock, flags);
4098
4099 return mask;
4100 }
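
/*
 * Illustrative sketch (hypothetical helper): check whether @tsk's cpuset
 * currently allows memory node @nid, using cpuset_mems_allowed() above.
 */
static bool __maybe_unused example_node_allowed_for_task(struct task_struct *tsk,
							 int nid)
{
	nodemask_t allowed = cpuset_mems_allowed(tsk);

	return node_isset(nid, allowed);
}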
4101
4102 /**
4103 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
4104 * @nodemask: the nodemask to be checked
4105 *
4106 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
4107 */
4108 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
4109 {
4110 return nodes_intersects(*nodemask, current->mems_allowed);
4111 }
4112
4113 /*
4114 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
4115 * mem_hardwall ancestor to the specified cpuset. Call holding
4116 * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
4117 * (an unusual configuration), then returns the root cpuset.
4118 */
4119 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
4120 {
4121 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
4122 cs = parent_cs(cs);
4123 return cs;
4124 }
4125
4126 /*
4127 * cpuset_node_allowed - Can we allocate on a memory node?
4128 * @node: is this an allowed node?
4129 * @gfp_mask: memory allocation flags
4130 *
4131 * If we're in interrupt, yes, we can always allocate. If @node is set in
4132 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
4133 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
4134 * yes. If current has access to memory reserves as an oom victim, yes.
4135 * Otherwise, no.
4136 *
4137 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
4138  * and do not allow allocations outside the current task's cpuset
4139 * unless the task has been OOM killed.
4140 * GFP_KERNEL allocations are not so marked, so can escape to the
4141 * nearest enclosing hardwalled ancestor cpuset.
4142 *
4143 * Scanning up parent cpusets requires callback_lock. The
4144 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
4145 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
4146  * current task's mems_allowed came up empty on the first pass over
4147 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
4148 * cpuset are short of memory, might require taking the callback_lock.
4149 *
4150 * The first call here from mm/page_alloc:get_page_from_freelist()
4151 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
4152 * so no allocation on a node outside the cpuset is allowed (unless
4153 * in interrupt, of course).
4154 *
4155 * The second pass through get_page_from_freelist() doesn't even call
4156 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
4157 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
4158 * in alloc_flags. That logic and the checks below have the combined
4159  * effect that:
4160 * in_interrupt - any node ok (current task context irrelevant)
4161 * GFP_ATOMIC - any node ok
4162 * tsk_is_oom_victim - any node ok
4163 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
4164  * GFP_USER - only nodes in the current task's mems_allowed ok.
4165 */
4166 bool cpuset_node_allowed(int node, gfp_t gfp_mask)
4167 {
4168 struct cpuset *cs; /* current cpuset ancestors */
4169 bool allowed; /* is allocation in zone z allowed? */
4170 unsigned long flags;
4171
4172 if (in_interrupt())
4173 return true;
4174 if (node_isset(node, current->mems_allowed))
4175 return true;
4176 /*
4177 * Allow tasks that have access to memory reserves because they have
4178 * been OOM killed to get memory anywhere.
4179 */
4180 if (unlikely(tsk_is_oom_victim(current)))
4181 return true;
4182 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
4183 return false;
4184
4185 if (current->flags & PF_EXITING) /* Let dying task have memory */
4186 return true;
4187
4188 /* Not hardwall and node outside mems_allowed: scan up cpusets */
4189 spin_lock_irqsave(&callback_lock, flags);
4190
4191 rcu_read_lock();
4192 cs = nearest_hardwall_ancestor(task_cs(current));
4193 allowed = node_isset(node, cs->mems_allowed);
4194 rcu_read_unlock();
4195
4196 spin_unlock_irqrestore(&callback_lock, flags);
4197 return allowed;
4198 }
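
/*
 * Illustrative sketch (hypothetical helper) of the __GFP_HARDWALL
 * distinction described above: GFP_USER carries __GFP_HARDWALL and is
 * confined to the task's own cpuset, while GFP_KERNEL may escape to the
 * nearest hardwalled ancestor.
 */
static void __maybe_unused example_report_node_policy(int nid)
{
	pr_debug("node %d: GFP_USER allowed %d, GFP_KERNEL allowed %d\n", nid,
		 cpuset_node_allowed(nid, GFP_USER),
		 cpuset_node_allowed(nid, GFP_KERNEL));
}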
4199
4200 /**
4201 * cpuset_spread_node() - On which node to begin search for a page
4202 * @rotor: round robin rotor
4203 *
4204 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
4205 * tasks in a cpuset with is_spread_page or is_spread_slab set),
4206 * and if the memory allocation used cpuset_mem_spread_node()
4207 * to determine on which node to start looking, as it will for
4208 * certain page cache or slab cache pages such as used for file
4209 * system buffers and inode caches, then instead of starting on the
4210 * local node to look for a free page, rather spread the starting
4211  * node around the task's mems_allowed nodes.
4212 *
4213 * We don't have to worry about the returned node being offline
4214 * because "it can't happen", and even if it did, it would be ok.
4215 *
4216 * The routines calling guarantee_online_mems() are careful to
4217 * only set nodes in task->mems_allowed that are online. So it
4218 * should not be possible for the following code to return an
4219 * offline node. But if it did, that would be ok, as this routine
4220 * is not returning the node where the allocation must be, only
4221 * the node where the search should start. The zonelist passed to
4222 * __alloc_pages() will include all nodes. If the slab allocator
4223 * is passed an offline node, it will fall back to the local node.
4224 * See kmem_cache_alloc_node().
4225 */
4226 static int cpuset_spread_node(int *rotor)
4227 {
4228 return *rotor = next_node_in(*rotor, current->mems_allowed);
4229 }
4230
4231 /**
4232 * cpuset_mem_spread_node() - On which node to begin search for a file page
4233 */
4234 int cpuset_mem_spread_node(void)
4235 {
4236 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
4237 current->cpuset_mem_spread_rotor =
4238 			node_random(&current->mems_allowed);
4239
4240 	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
4241 }
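
/*
 * Illustrative sketch (hypothetical helper, not how any particular
 * allocator is actually wired up): pick a starting node for a page-cache
 * style allocation, honouring PF_SPREAD_PAGE as described above.
 */
static int __maybe_unused example_spread_start_node(void)
{
	if (current->flags & PF_SPREAD_PAGE)
		return cpuset_mem_spread_node();

	return numa_node_id();
}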
4242
4243 /**
4244 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
4245 * @tsk1: pointer to task_struct of some task.
4246 * @tsk2: pointer to task_struct of some other task.
4247 *
4248 * Description: Return true if @tsk1's mems_allowed intersects the
4249 * mems_allowed of @tsk2. Used by the OOM killer to determine if
4250 * one of the task's memory usage might impact the memory available
4251 * to the other.
4252 **/
4253
4254 int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
4255 const struct task_struct *tsk2)
4256 {
4257 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
4258 }
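
/*
 * Illustrative sketch (hypothetical helper): the kind of check the OOM
 * killer performs - a candidate whose mems_allowed does not intersect
 * current's cannot free memory that current is able to use.
 */
static bool __maybe_unused example_oom_relevant(const struct task_struct *candidate)
{
	return cpuset_mems_allowed_intersects(current, candidate);
}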
4259
4260 /**
4261 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
4262 *
4263 * Description: Prints current's name, cpuset name, and cached copy of its
4264 * mems_allowed to the kernel log.
4265 */
4266 void cpuset_print_current_mems_allowed(void)
4267 {
4268 struct cgroup *cgrp;
4269
4270 rcu_read_lock();
4271
4272 cgrp = task_cs(current)->css.cgroup;
4273 pr_cont(",cpuset=");
4274 pr_cont_cgroup_name(cgrp);
4275 pr_cont(",mems_allowed=%*pbl",
4276 		nodemask_pr_args(&current->mems_allowed));
4277
4278 rcu_read_unlock();
4279 }
4280
4281 #ifdef CONFIG_PROC_PID_CPUSET
4282 /*
4283 * proc_cpuset_show()
4284 * - Print tasks cpuset path into seq_file.
4285 * - Used for /proc/<pid>/cpuset.
4286 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
4287 * doesn't really matter if tsk->cpuset changes after we read it,
4288 * and we take cpuset_mutex, keeping cpuset_attach() from changing it
4289 * anyway.
4290 */
4291 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
4292 struct pid *pid, struct task_struct *tsk)
4293 {
4294 char *buf;
4295 struct cgroup_subsys_state *css;
4296 int retval;
4297
4298 retval = -ENOMEM;
4299 buf = kmalloc(PATH_MAX, GFP_KERNEL);
4300 if (!buf)
4301 goto out;
4302
4303 rcu_read_lock();
4304 spin_lock_irq(&css_set_lock);
4305 css = task_css(tsk, cpuset_cgrp_id);
4306 retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
4307 current->nsproxy->cgroup_ns);
4308 spin_unlock_irq(&css_set_lock);
4309 rcu_read_unlock();
4310
4311 if (retval == -E2BIG)
4312 retval = -ENAMETOOLONG;
4313 if (retval < 0)
4314 goto out_free;
4315 seq_puts(m, buf);
4316 seq_putc(m, '\n');
4317 retval = 0;
4318 out_free:
4319 kfree(buf);
4320 out:
4321 return retval;
4322 }
4323 #endif /* CONFIG_PROC_PID_CPUSET */
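
/*
 * Illustrative userspace sketch (assumption: not kernel code, never built
 * here): reading the path that proc_cpuset_show() emits for the current
 * process.
 */
#if 0
#include <stdio.h>

int main(void)
{
	char path[4096];
	FILE *f = fopen("/proc/self/cpuset", "r");

	if (f && fgets(path, sizeof(path), f))
		printf("cpuset: %s", path);
	if (f)
		fclose(f);
	return 0;
}
#endif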
4324
4325 /* Display task mems_allowed in /proc/<pid>/status file. */
4326 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
4327 {
4328 seq_printf(m, "Mems_allowed:\t%*pb\n",
4329 nodemask_pr_args(&task->mems_allowed));
4330 seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
4331 nodemask_pr_args(&task->mems_allowed));
4332 }
4333