cgroup-v1.c - OpenGrok cross reference for /linux-6.14.4/kernel/cgroup/cgroup-v1.c

Lines Matching +full:early +full:- +full:to +full:- +full:mid
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include "cgroup-internal.h"
36  * pidlist destructions need to be flushed on cgroup destruction.  Use a
41 /* protects cgroup_subsys->release_agent_path */
51 	/* Check also dfl_cftypes for file-less controllers, i.e. perf_event */  in cgroup1_subsys_absent()
52 	return ss->legacy_cftypes == NULL && ss->dfl_cftypes;  in cgroup1_subsys_absent()
56  * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
57  * @from: attach to all cgroups of a given task
58  * @tsk: the task to be attached
88  * cgroup_transfer_tasks - move tasks from one cgroup to another
89  * @to: cgroup to which the tasks will be moved
94  * is guaranteed to be either visible in the source cgroup after the
100 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)  in cgroup_transfer_tasks()  argument
108 	if (cgroup_on_dfl(to))  in cgroup_transfer_tasks()
109 		return -EINVAL;  in cgroup_transfer_tasks()
111 	ret = cgroup_migrate_vet_dst(to);  in cgroup_transfer_tasks()
121 	list_for_each_entry(link, &from->cset_links, cset_link)  in cgroup_transfer_tasks()
122 		cgroup_migrate_add_src(link->cset, to, &mgctx);  in cgroup_transfer_tasks()
130 	 * Migrate tasks one-by-one until @from is empty.  This fails iff  in cgroup_transfer_tasks()
131 	 * ->can_attach() fails.  in cgroup_transfer_tasks()
134 		css_task_iter_start(&from->self, 0, &it);  in cgroup_transfer_tasks()
138 		} while (task && (task->flags & PF_EXITING));  in cgroup_transfer_tasks()
147 				TRACE_CGROUP_PATH(transfer_tasks, to, task, false);  in cgroup_transfer_tasks()
162  * *lots* of attached tasks. So it may need several calls to read(),
178  * to the cgroup.
182 	 * used to find which pidlist is wanted. doesn't change as long as
192 	/* pointer to the cgroup we belong to, for list removal purposes */
199  * Used to destroy all pidlists lingering waiting for destroy timer.  None
206 	mutex_lock(&cgrp->pidlist_mutex);  in cgroup1_pidlist_destroy_all()
207 	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)  in cgroup1_pidlist_destroy_all()
208 		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);  in cgroup1_pidlist_destroy_all()
209 	mutex_unlock(&cgrp->pidlist_mutex);  in cgroup1_pidlist_destroy_all()
212 	BUG_ON(!list_empty(&cgrp->pidlists));  in cgroup1_pidlist_destroy_all()
222 	mutex_lock(&l->owner->pidlist_mutex);  in cgroup_pidlist_destroy_work_fn()
229 		list_del(&l->links);  in cgroup_pidlist_destroy_work_fn()
230 		kvfree(l->list);  in cgroup_pidlist_destroy_work_fn()
231 		put_pid_ns(l->key.ns);  in cgroup_pidlist_destroy_work_fn()
235 	mutex_unlock(&l->owner->pidlist_mutex);  in cgroup_pidlist_destroy_work_fn()
240  * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
249 	 * edge cases first; no work needs to be done for either  in pidlist_uniq()
256 		while (list[src] == list[src-1]) {  in pidlist_uniq()
261 		/* dest always points to where the next unique element goes */  in pidlist_uniq()
270  * The two pid files - task and cgroup.procs - guaranteed that the result
273  * making it impossible to use, for example, single rbtree of member tasks
275  * per open file is dangerous, so cgroup had to implement shared pool of
280 	return *(pid_t *)a - *(pid_t *)b;  in cmppid()
290 	lockdep_assert_held(&cgrp->pidlist_mutex);  in cgroup_pidlist_find()
292 	list_for_each_entry(l, &cgrp->pidlists, links)  in cgroup_pidlist_find()
293 		if (l->key.type == type && l->key.ns == ns)  in cgroup_pidlist_find()
309 	lockdep_assert_held(&cgrp->pidlist_mutex);  in cgroup_pidlist_find_create()
320 	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);  in cgroup_pidlist_find_create()
321 	l->key.type = type;  in cgroup_pidlist_find_create()
323 	l->key.ns = get_pid_ns(task_active_pid_ns(current));  in cgroup_pidlist_find_create()
324 	l->owner = cgrp;  in cgroup_pidlist_find_create()
325 	list_add(&l->links, &cgrp->pidlists);  in cgroup_pidlist_find_create()
342 	lockdep_assert_held(&cgrp->pidlist_mutex);  in pidlist_array_load()
346 	 * enough space - tough.  This race is indistinguishable to the  in pidlist_array_load()
353 		return -ENOMEM;  in pidlist_array_load()
355 	css_task_iter_start(&cgrp->self, 0, &it);  in pidlist_array_load()
364 		if (pid > 0) /* make sure to only use valid results */  in pidlist_array_load()
376 		return -ENOMEM;  in pidlist_array_load()
380 	kvfree(l->list);  in pidlist_array_load()
381 	l->list = array;  in pidlist_array_load()
382 	l->length = length;  in pidlist_array_load()
389  * next pid to display; the seq_file iterator is a pointer to the pid
390  * in the cgroup->l->list array.
396 	 * Initially we receive a position value that corresponds to  in cgroup_pidlist_start()
398 	 * after a seek to the start). Use a binary-search to find the  in cgroup_pidlist_start()
399 	 * next pid to display, if any  in cgroup_pidlist_start()
401 	struct kernfs_open_file *of = s->private;  in cgroup_pidlist_start()
402 	struct cgroup_file_ctx *ctx = of->priv;  in cgroup_pidlist_start()
403 	struct cgroup *cgrp = seq_css(s)->cgroup;  in cgroup_pidlist_start()
405 	enum cgroup_filetype type = seq_cft(s)->private;  in cgroup_pidlist_start()
409 	mutex_lock(&cgrp->pidlist_mutex);  in cgroup_pidlist_start()
412 	 * !NULL @ctx->procs1.pidlist indicates that this isn't the first  in cgroup_pidlist_start()
414 	 * that. Look for it. Note that @ctx->procs1.pidlist can't be used  in cgroup_pidlist_start()
417 	if (ctx->procs1.pidlist)  in cgroup_pidlist_start()
418 		ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);  in cgroup_pidlist_start()
424 	if (!ctx->procs1.pidlist) {  in cgroup_pidlist_start()
425 		ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);  in cgroup_pidlist_start()
429 	l = ctx->procs1.pidlist;  in cgroup_pidlist_start()
432 		int end = l->length;  in cgroup_pidlist_start()
435 			int mid = (index + end) / 2;  in cgroup_pidlist_start()  local
436 			if (l->list[mid] == pid) {  in cgroup_pidlist_start()
437 				index = mid;  in cgroup_pidlist_start()
439 			} else if (l->list[mid] < pid)  in cgroup_pidlist_start()
440 				index = mid + 1;  in cgroup_pidlist_start()
442 				end = mid;  in cgroup_pidlist_start()
446 	if (index >= l->length)  in cgroup_pidlist_start()
448 	/* Update the abstract position to be the actual pid that we found */  in cgroup_pidlist_start()
449 	iter = l->list + index;  in cgroup_pidlist_start()
456 	struct kernfs_open_file *of = s->private;  in cgroup_pidlist_stop()
457 	struct cgroup_file_ctx *ctx = of->priv;  in cgroup_pidlist_stop()
458 	struct cgroup_pidlist *l = ctx->procs1.pidlist;  in cgroup_pidlist_stop()
461 		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,  in cgroup_pidlist_stop()
463 	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);  in cgroup_pidlist_stop()
468 	struct kernfs_open_file *of = s->private;  in cgroup_pidlist_next()
469 	struct cgroup_file_ctx *ctx = of->priv;  in cgroup_pidlist_next()
470 	struct cgroup_pidlist *l = ctx->procs1.pidlist;  in cgroup_pidlist_next()
472 	pid_t *end = l->list + l->length;  in cgroup_pidlist_next()
474 	 * Advance to the next pid in the array. If this goes off the  in cgroup_pidlist_next()
504 	cgrp = cgroup_kn_lock_live(of->kn, false);  in __cgroup1_procs_write()
506 		return -ENODEV;  in __cgroup1_procs_write()
515 	 * to check permissions on one of them. Check permissions using the  in __cgroup1_procs_write()
516 	 * credentials from file open to protect against inherited fd attacks.  in __cgroup1_procs_write()
518 	cred = of->file->f_cred;  in __cgroup1_procs_write()
520 	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&  in __cgroup1_procs_write()
521 	    !uid_eq(cred->euid, tcred->uid) &&  in __cgroup1_procs_write()
522 	    !uid_eq(cred->euid, tcred->suid))  in __cgroup1_procs_write()
523 		ret = -EACCES;  in __cgroup1_procs_write()
533 	cgroup_kn_unlock(of->kn);  in __cgroup1_procs_write()
556 	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);  in cgroup_release_agent_write()
560 	 * require capabilities to set release agent.  in cgroup_release_agent_write()
562 	ctx = of->priv;  in cgroup_release_agent_write()
563 	if ((ctx->ns->user_ns != &init_user_ns) ||  in cgroup_release_agent_write()
564 	    !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))  in cgroup_release_agent_write()
565 		return -EPERM;  in cgroup_release_agent_write()
567 	cgrp = cgroup_kn_lock_live(of->kn, false);  in cgroup_release_agent_write()
569 		return -ENODEV;  in cgroup_release_agent_write()
571 	strscpy(cgrp->root->release_agent_path, strstrip(buf),  in cgroup_release_agent_write()
572 		sizeof(cgrp->root->release_agent_path));  in cgroup_release_agent_write()
574 	cgroup_kn_unlock(of->kn);  in cgroup_release_agent_write()
580 	struct cgroup *cgrp = seq_css(seq)->cgroup;  in cgroup_release_agent_show()
583 	seq_puts(seq, cgrp->root->release_agent_path);  in cgroup_release_agent_show()
598 	return notify_on_release(css->cgroup);  in cgroup_read_notify_on_release()
605 		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);  in cgroup_write_notify_on_release()
607 		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);  in cgroup_write_notify_on_release()
614 	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);  in cgroup_clone_children_read()
621 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);  in cgroup_clone_children_write()
623 		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);  in cgroup_clone_children_write()
667 		.max_write_len = PATH_MAX - 1,
680 	 * Grab the subsystems state racily. No need to add avenue to  in proc_cgroupstats_show()
688 			   ss->legacy_name, ss->root->hierarchy_id,  in proc_cgroupstats_show()
689 			   atomic_read(&ss->root->nr_cgrps),  in proc_cgroupstats_show()
697  * cgroupstats_build - build and fill cgroupstats
698  * @stats: cgroupstats to fill information into
699  * @dentry: A dentry entry belonging to the cgroup for which stats have
702  * Build and fill cgroupstats so that taskstats can export it to user
714 	/* it should be kernfs_node belonging to cgroupfs and is a directory */  in cgroupstats_build()
715 	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||  in cgroupstats_build()
717 		return -EINVAL;  in cgroupstats_build()
721 	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),  in cgroupstats_build()
722 	 * @kn->priv is RCU safe.  Let's do the RCU dancing.  in cgroupstats_build()
725 	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);  in cgroupstats_build()
728 		return -ENOENT;  in cgroupstats_build()
732 	css_task_iter_start(&cgrp->self, 0, &it);  in cgroupstats_build()
734 		switch (READ_ONCE(tsk->__state)) {  in cgroupstats_build()
736 			stats->nr_running++;  in cgroupstats_build()
739 			stats->nr_sleeping++;  in cgroupstats_build()
742 			stats->nr_uninterruptible++;  in cgroupstats_build()
745 			stats->nr_stopped++;  in cgroupstats_build()
748 			if (tsk->in_iowait)  in cgroupstats_build()
749 				stats->nr_io_wait++;  in cgroupstats_build()
762 	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))  in cgroup1_check_for_release()
763 		schedule_work(&cgrp->release_agent_work);  in cgroup1_check_for_release()
769  * relative to the root of cgroup file system) as the argument.
771  * Most likely, this user command will try to rmdir this cgroup.
774  * attached to this cgroup before it is removed, or that some other
778  * to continue to serve a useful existence.  Next time it's released,
781  * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
785  * release agent task.  We don't bother to wait because the caller of
797 	/* snoop agent path and exit early if empty */  in cgroup1_release_agent()
798 	if (!cgrp->root->release_agent_path[0])  in cgroup1_release_agent()
808 	strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);  in cgroup1_release_agent()
833  * cgroup_rename - Only allow simple rename of directories in place.
838 	struct cgroup *cgrp = kn->priv;  in cgroup1_rename()
841 	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */  in cgroup1_rename()
843 		return -EINVAL;  in cgroup1_rename()
846 		return -ENOTDIR;  in cgroup1_rename()
847 	if (kn->parent != new_parent)  in cgroup1_rename()
848 		return -EIO;  in cgroup1_rename()
878 		if (root->subsys_mask & (1 << ssid))  in cgroup1_show_options()
879 			seq_show_option(seq, ss->legacy_name, NULL);  in cgroup1_show_options()
880 	if (root->flags & CGRP_ROOT_NOPREFIX)  in cgroup1_show_options()
882 	if (root->flags & CGRP_ROOT_XATTR)  in cgroup1_show_options()
884 	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)  in cgroup1_show_options()
886 	if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)  in cgroup1_show_options()
890 	if (strlen(root->release_agent_path))  in cgroup1_show_options()
892 				root->release_agent_path);  in cgroup1_show_options()
895 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))  in cgroup1_show_options()
897 	if (strlen(root->name))  in cgroup1_show_options()
898 		seq_show_option(seq, "name", root->name);  in cgroup1_show_options()
937 	if (opt == -ENOPARAM) {  in cgroup1_parse_param()
941 		if (ret != -ENOPARAM)  in cgroup1_parse_param()
944 			if (strcmp(param->key, ss->legacy_name) ||  in cgroup1_parse_param()
949 					       param->key);  in cgroup1_parse_param()
950 			ctx->subsys_mask |= (1 << i);  in cgroup1_parse_param()
953 		return invalfc(fc, "Unknown subsys name '%s'", param->key);  in cgroup1_parse_param()
961 		ctx->none = true;  in cgroup1_parse_param()
964 		ctx->all_ss = true;  in cgroup1_parse_param()
967 		ctx->flags |= CGRP_ROOT_NOPREFIX;  in cgroup1_parse_param()
970 		ctx->cpuset_clone_children = true;  in cgroup1_parse_param()
973 		ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;  in cgroup1_parse_param()
976 		ctx->flags |= CGRP_ROOT_XATTR;  in cgroup1_parse_param()
979 		ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;  in cgroup1_parse_param()
982 		ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;  in cgroup1_parse_param()
986 		if (ctx->release_agent)  in cgroup1_parse_param()
990 		 * require capabilities to set release agent.  in cgroup1_parse_param()
992 		if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))  in cgroup1_parse_param()
994 		ctx->release_agent = param->string;  in cgroup1_parse_param()
995 		param->string = NULL;  in cgroup1_parse_param()
1000 			return -ENOENT;  in cgroup1_parse_param()
1002 		if (!param->size)  in cgroup1_parse_param()
1004 		if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)  in cgroup1_parse_param()
1006 		/* Must match [\w.-]+ */  in cgroup1_parse_param()
1007 		for (i = 0; i < param->size; i++) {  in cgroup1_parse_param()
1008 			char c = param->string[i];  in cgroup1_parse_param()
1011 			if ((c == '.') || (c == '-') || (c == '_'))  in cgroup1_parse_param()
1016 		if (ctx->name)  in cgroup1_parse_param()
1018 		ctx->name = param->string;  in cgroup1_parse_param()
1019 		param->string = NULL;  in cgroup1_parse_param()
1041 	ctx->subsys_mask &= enabled;  in check_cgroupfs_options()
1045 	 * let's default to 'all'.  in check_cgroupfs_options()
1047 	if (!ctx->subsys_mask && !ctx->none && !ctx->name)  in check_cgroupfs_options()
1048 		ctx->all_ss = true;  in check_cgroupfs_options()
1050 	if (ctx->all_ss) {  in check_cgroupfs_options()
1052 		if (ctx->subsys_mask)  in check_cgroupfs_options()
1055 		ctx->subsys_mask = enabled;  in check_cgroupfs_options()
1059 	 * We either have to specify by name or by subsystems. (So all  in check_cgroupfs_options()
1062 	if (!ctx->subsys_mask && !ctx->name)  in check_cgroupfs_options()
1070 	if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))  in check_cgroupfs_options()
1074 	if (ctx->subsys_mask && ctx->none)  in check_cgroupfs_options()
1083 	struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);  in cgroup1_reconfigure()
1095 	if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)  in cgroup1_reconfigure()
1097 			task_tgid_nr(current), current->comm);  in cgroup1_reconfigure()
1099 	added_mask = ctx->subsys_mask & ~root->subsys_mask;  in cgroup1_reconfigure()
1100 	removed_mask = root->subsys_mask & ~ctx->subsys_mask;  in cgroup1_reconfigure()
1102 	/* Don't allow flags or name to change at remount */  in cgroup1_reconfigure()
1103 	if ((ctx->flags ^ root->flags) ||  in cgroup1_reconfigure()
1104 	    (ctx->name && strcmp(ctx->name, root->name))) {  in cgroup1_reconfigure()
1106 		       ctx->flags, ctx->name ?: "", root->flags, root->name);  in cgroup1_reconfigure()
1107 		ret = -EINVAL;  in cgroup1_reconfigure()
1112 	if (!list_empty(&root->cgrp.self.children)) {  in cgroup1_reconfigure()
1113 		ret = -EBUSY;  in cgroup1_reconfigure()
1123 	if (ctx->release_agent) {  in cgroup1_reconfigure()
1125 		strcpy(root->release_agent_path, ctx->release_agent);  in cgroup1_reconfigure()
1145  * The guts of cgroup1 mount - find or create cgroup_root to use.
1146  * Called with cgroup_mutex held; returns 0 on success, -E... on
1147  * error and positive - in case when the candidate is busy dying.
1148  * On success it stashes a reference to cgroup_root into given
1167 	 * dying subsystems.  We just need to ensure that the ones  in cgroup1_root_to_use()
1172 		if (!(ctx->subsys_mask & (1 << i)) ||  in cgroup1_root_to_use()
1173 		    ss->root == &cgrp_dfl_root)  in cgroup1_root_to_use()
1176 		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))  in cgroup1_root_to_use()
1178 		cgroup_put(&ss->root->cgrp);  in cgroup1_root_to_use()
1192 		if (ctx->name) {  in cgroup1_root_to_use()
1193 			if (strcmp(ctx->name, root->name))  in cgroup1_root_to_use()
1202 		if ((ctx->subsys_mask || ctx->none) &&  in cgroup1_root_to_use()
1203 		    (ctx->subsys_mask != root->subsys_mask)) {  in cgroup1_root_to_use()
1206 			return -EBUSY;  in cgroup1_root_to_use()
1209 		if (root->flags ^ ctx->flags)  in cgroup1_root_to_use()
1212 		ctx->root = root;  in cgroup1_root_to_use()
1221 	if (!ctx->subsys_mask && !ctx->none)  in cgroup1_root_to_use()
1225 	if (ctx->ns != &init_cgroup_ns)  in cgroup1_root_to_use()
1226 		return -EPERM;  in cgroup1_root_to_use()
1230 		return -ENOMEM;  in cgroup1_root_to_use()
1232 	ctx->root = root;  in cgroup1_root_to_use()
1235 	ret = cgroup_setup_root(root, ctx->subsys_mask);  in cgroup1_root_to_use()
1237 		cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);  in cgroup1_root_to_use()
1249 	/* Check if the caller has permission to mount. */  in cgroup1_get_tree()
1250 	if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))  in cgroup1_get_tree()
1251 		return -EPERM;  in cgroup1_get_tree()
1256 	if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))  in cgroup1_get_tree()
1264 	if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {  in cgroup1_get_tree()
1277  * task_get_cgroup1 - Acquires the associated cgroup of a task within a
1284  * We limit it to cgroup1 only.
1288 	struct cgroup *cgrp = ERR_PTR(-ENOENT);  in task_get_cgroup1()
1297 		if (root->hierarchy_id != hierarchy_id)  in task_get_cgroup1()
1302 			cgrp = ERR_PTR(-ENOENT);  in task_get_cgroup1()
1313 	 * Used to destroy pidlists and separate to serve as flush domain.  in cgroup1_wq_init()
1314 	 * Cap @max_active to 1 too.  in cgroup1_wq_init()
1344 			if (strcmp(token, ss->name) &&  in cgroup_no_v1()
1345 			    strcmp(token, ss->legacy_name))  in cgroup_no_v1()