cgroup-v1.c - OpenGrok cross reference for /linux/kernel/cgroup/cgroup-v1.c

Lines Matching +full:early +full:- +full:to +full:- +full:mid
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include "cgroup-internal.h"
40  * pidlist destructions need to be flushed on cgroup destruction.  Use a
45 /* protects cgroup_subsys->release_agent_path */
55 	/* Check also dfl_cftypes for file-less controllers, i.e. perf_event */  in cgroup1_subsys_absent()
56 	return ss->legacy_cftypes == NULL && ss->dfl_cftypes;  in cgroup1_subsys_absent()
60  * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
61  * @from: attach to all cgroups of a given task
62  * @tsk: the task to be attached
92  * cgroup_transfer_tasks - move tasks from one cgroup to another
93  * @to: cgroup to which the tasks will be moved
98  * is guaranteed to be either visible in the source cgroup after the
104 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)  in cgroup_transfer_tasks()  argument
112 	if (cgroup_on_dfl(to))  in cgroup_transfer_tasks()
113 		return -EINVAL;  in cgroup_transfer_tasks()
115 	ret = cgroup_migrate_vet_dst(to);  in cgroup_transfer_tasks()
125 	list_for_each_entry(link, &from->cset_links, cset_link)  in cgroup_transfer_tasks()
126 		cgroup_migrate_add_src(link->cset, to, &mgctx);  in cgroup_transfer_tasks()
134 	 * Migrate tasks one-by-one until @from is empty.  This fails iff  in cgroup_transfer_tasks()
135 	 * ->can_attach() fails.  in cgroup_transfer_tasks()
138 		css_task_iter_start(&from->self, 0, &it);  in cgroup_transfer_tasks()
142 		} while (task && (task->flags & PF_EXITING));  in cgroup_transfer_tasks()
151 				TRACE_CGROUP_PATH(transfer_tasks, to, task, false);  in cgroup_transfer_tasks()
166  * *lots* of attached tasks. So it may need several calls to read(),
182  * to the cgroup.
186 	 * used to find which pidlist is wanted. doesn't change as long as
196 	/* pointer to the cgroup we belong to, for list removal purposes */
203  * Used to destroy all pidlists lingering waiting for destroy timer.  None
210 	mutex_lock(&cgrp->pidlist_mutex);  in cgroup1_pidlist_destroy_all()
211 	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)  in cgroup1_pidlist_destroy_all()
212 		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);  in cgroup1_pidlist_destroy_all()
213 	mutex_unlock(&cgrp->pidlist_mutex);  in cgroup1_pidlist_destroy_all()
216 	BUG_ON(!list_empty(&cgrp->pidlists));  in cgroup1_pidlist_destroy_all()
226 	mutex_lock(&l->owner->pidlist_mutex);  in cgroup_pidlist_destroy_work_fn()
233 		list_del(&l->links);  in cgroup_pidlist_destroy_work_fn()
234 		kvfree(l->list);  in cgroup_pidlist_destroy_work_fn()
235 		put_pid_ns(l->key.ns);  in cgroup_pidlist_destroy_work_fn()
239 	mutex_unlock(&l->owner->pidlist_mutex);  in cgroup_pidlist_destroy_work_fn()
244  * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
253 	 * edge cases first; no work needs to be done for either  in pidlist_uniq()
260 		while (list[src] == list[src-1]) {  in pidlist_uniq()
265 		/* dest always points to where the next unique element goes */  in pidlist_uniq()
274  * The two pid files - task and cgroup.procs - guaranteed that the result
277  * making it impossible to use, for example, single rbtree of member tasks
279  * per open file is dangerous, so cgroup had to implement shared pool of
284 	return *(pid_t *)a - *(pid_t *)b;  in cmppid()
294 	lockdep_assert_held(&cgrp->pidlist_mutex);  in cgroup_pidlist_find()
296 	list_for_each_entry(l, &cgrp->pidlists, links)  in cgroup_pidlist_find()
297 		if (l->key.type == type && l->key.ns == ns)  in cgroup_pidlist_find()
313 	lockdep_assert_held(&cgrp->pidlist_mutex);  in cgroup_pidlist_find_create()
324 	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);  in cgroup_pidlist_find_create()
325 	l->key.type = type;  in cgroup_pidlist_find_create()
327 	l->key.ns = get_pid_ns(task_active_pid_ns(current));  in cgroup_pidlist_find_create()
328 	l->owner = cgrp;  in cgroup_pidlist_find_create()
329 	list_add(&l->links, &cgrp->pidlists);  in cgroup_pidlist_find_create()
346 	lockdep_assert_held(&cgrp->pidlist_mutex);  in pidlist_array_load()
350 	 * enough space - tough.  This race is indistinguishable to the  in pidlist_array_load()
357 		return -ENOMEM;  in pidlist_array_load()
359 	css_task_iter_start(&cgrp->self, 0, &it);  in pidlist_array_load()
368 		if (pid > 0) /* make sure to only use valid results */  in pidlist_array_load()
380 		return -ENOMEM;  in pidlist_array_load()
384 	kvfree(l->list);  in pidlist_array_load()
385 	l->list = array;  in pidlist_array_load()
386 	l->length = length;  in pidlist_array_load()
393  * next pid to display; the seq_file iterator is a pointer to the pid
394  * in the cgroup->l->list array.
400 	 * Initially we receive a position value that corresponds to  in cgroup_pidlist_start()
402 	 * after a seek to the start). Use a binary-search to find the  in cgroup_pidlist_start()
403 	 * next pid to display, if any  in cgroup_pidlist_start()
405 	struct kernfs_open_file *of = s->private;  in cgroup_pidlist_start()
406 	struct cgroup_file_ctx *ctx = of->priv;  in cgroup_pidlist_start()
407 	struct cgroup *cgrp = seq_css(s)->cgroup;  in cgroup_pidlist_start()
409 	enum cgroup_filetype type = seq_cft(s)->private;  in cgroup_pidlist_start()
413 	mutex_lock(&cgrp->pidlist_mutex);  in cgroup_pidlist_start()
416 	 * !NULL @ctx->procs1.pidlist indicates that this isn't the first  in cgroup_pidlist_start()
418 	 * that. Look for it. Note that @ctx->procs1.pidlist can't be used  in cgroup_pidlist_start()
421 	if (ctx->procs1.pidlist)  in cgroup_pidlist_start()
422 		ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);  in cgroup_pidlist_start()
428 	if (!ctx->procs1.pidlist) {  in cgroup_pidlist_start()
429 		ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);  in cgroup_pidlist_start()
433 	l = ctx->procs1.pidlist;  in cgroup_pidlist_start()
436 		int end = l->length;  in cgroup_pidlist_start()
439 			int mid = (index + end) / 2;  in cgroup_pidlist_start()  local
440 			if (l->list[mid] == pid) {  in cgroup_pidlist_start()
441 				index = mid;  in cgroup_pidlist_start()
443 			} else if (l->list[mid] < pid)  in cgroup_pidlist_start()
444 				index = mid + 1;  in cgroup_pidlist_start()
446 				end = mid;  in cgroup_pidlist_start()
450 	if (index >= l->length)  in cgroup_pidlist_start()
452 	/* Update the abstract position to be the actual pid that we found */  in cgroup_pidlist_start()
453 	iter = l->list + index;  in cgroup_pidlist_start()
460 	struct kernfs_open_file *of = s->private;  in cgroup_pidlist_stop()
461 	struct cgroup_file_ctx *ctx = of->priv;  in cgroup_pidlist_stop()
462 	struct cgroup_pidlist *l = ctx->procs1.pidlist;  in cgroup_pidlist_stop()
465 		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,  in cgroup_pidlist_stop()
467 	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);  in cgroup_pidlist_stop()
472 	struct kernfs_open_file *of = s->private;  in cgroup_pidlist_next()
473 	struct cgroup_file_ctx *ctx = of->priv;  in cgroup_pidlist_next()
474 	struct cgroup_pidlist *l = ctx->procs1.pidlist;  in cgroup_pidlist_next()
476 	pid_t *end = l->list + l->length;  in cgroup_pidlist_next()
478 	 * Advance to the next pid in the array. If this goes off the  in cgroup_pidlist_next()
508 	cgrp = cgroup_kn_lock_live(of->kn, false);  in __cgroup1_procs_write()
510 		return -ENODEV;  in __cgroup1_procs_write()
519 	 * to check permissions on one of them. Check permissions using the  in __cgroup1_procs_write()
520 	 * credentials from file open to protect against inherited fd attacks.  in __cgroup1_procs_write()
522 	cred = of->file->f_cred;  in __cgroup1_procs_write()
524 	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&  in __cgroup1_procs_write()
525 	    !uid_eq(cred->euid, tcred->uid) &&  in __cgroup1_procs_write()
526 	    !uid_eq(cred->euid, tcred->suid))  in __cgroup1_procs_write()
527 		ret = -EACCES;  in __cgroup1_procs_write()
537 	cgroup_kn_unlock(of->kn);  in __cgroup1_procs_write()
560 	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);  in cgroup_release_agent_write()
564 	 * require capabilities to set release agent.  in cgroup_release_agent_write()
566 	ctx = of->priv;  in cgroup_release_agent_write()
567 	if ((ctx->ns->user_ns != &init_user_ns) ||  in cgroup_release_agent_write()
568 	    !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))  in cgroup_release_agent_write()
569 		return -EPERM;  in cgroup_release_agent_write()
571 	cgrp = cgroup_kn_lock_live(of->kn, false);  in cgroup_release_agent_write()
573 		return -ENODEV;  in cgroup_release_agent_write()
575 	strscpy(cgrp->root->release_agent_path, strstrip(buf),  in cgroup_release_agent_write()
576 		sizeof(cgrp->root->release_agent_path));  in cgroup_release_agent_write()
578 	cgroup_kn_unlock(of->kn);  in cgroup_release_agent_write()
584 	struct cgroup *cgrp = seq_css(seq)->cgroup;  in cgroup_release_agent_show()
587 	seq_puts(seq, cgrp->root->release_agent_path);  in cgroup_release_agent_show()
602 	return notify_on_release(css->cgroup);  in cgroup_read_notify_on_release()
609 		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);  in cgroup_write_notify_on_release()
611 		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);  in cgroup_write_notify_on_release()
618 	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);  in cgroup_clone_children_read()
625 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);  in cgroup_clone_children_write()
627 		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);  in cgroup_clone_children_write()
671 		.max_write_len = PATH_MAX - 1,
685 	 * Grab the subsystems state racily. No need to add avenue to  in proc_cgroupstats_show()
690 		cgrp_v1_visible |= ss->root != &cgrp_dfl_root;  in proc_cgroupstats_show()
696 			   ss->legacy_name, ss->root->hierarchy_id,  in proc_cgroupstats_show()
697 			   atomic_read(&ss->root->nr_cgrps),  in proc_cgroupstats_show()
709  * cgroupstats_build - build and fill cgroupstats
710  * @stats: cgroupstats to fill information into
711  * @dentry: A dentry entry belonging to the cgroup for which stats have
714  * Build and fill cgroupstats so that taskstats can export it to user
726 	/* it should be kernfs_node belonging to cgroupfs and is a directory */  in cgroupstats_build()
727 	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||  in cgroupstats_build()
729 		return -EINVAL;  in cgroupstats_build()
733 	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),  in cgroupstats_build()
734 	 * @kn->priv is RCU safe.  Let's do the RCU dancing.  in cgroupstats_build()
737 	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);  in cgroupstats_build()
740 		return -ENOENT;  in cgroupstats_build()
744 	css_task_iter_start(&cgrp->self, 0, &it);  in cgroupstats_build()
746 		switch (READ_ONCE(tsk->__state)) {  in cgroupstats_build()
748 			stats->nr_running++;  in cgroupstats_build()
751 			stats->nr_sleeping++;  in cgroupstats_build()
754 			stats->nr_uninterruptible++;  in cgroupstats_build()
757 			stats->nr_stopped++;  in cgroupstats_build()
760 			if (tsk->in_iowait)  in cgroupstats_build()
761 				stats->nr_io_wait++;  in cgroupstats_build()
774 	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))  in cgroup1_check_for_release()
775 		schedule_work(&cgrp->release_agent_work);  in cgroup1_check_for_release()
781  * relative to the root of cgroup file system) as the argument.
783  * Most likely, this user command will try to rmdir this cgroup.
786  * attached to this cgroup before it is removed, or that some other
790  * to continue to serve a useful existence.  Next time it's released,
793  * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
797  * release agent task.  We don't bother to wait because the caller of
809 	/* snoop agent path and exit early if empty */  in cgroup1_release_agent()
810 	if (!cgrp->root->release_agent_path[0])  in cgroup1_release_agent()
820 	strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);  in cgroup1_release_agent()
845  * cgroup_rename - Only allow simple rename of directories in place.
850 	struct cgroup *cgrp = kn->priv;  in cgroup1_rename()
853 	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */  in cgroup1_rename()
855 		return -EINVAL;  in cgroup1_rename()
858 		return -ENOTDIR;  in cgroup1_rename()
859 	if (rcu_access_pointer(kn->__parent) != new_parent)  in cgroup1_rename()
860 		return -EIO;  in cgroup1_rename()
890 		if (root->subsys_mask & (1 << ssid))  in cgroup1_show_options()
891 			seq_show_option(seq, ss->legacy_name, NULL);  in cgroup1_show_options()
892 	if (root->flags & CGRP_ROOT_NOPREFIX)  in cgroup1_show_options()
894 	if (root->flags & CGRP_ROOT_XATTR)  in cgroup1_show_options()
896 	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)  in cgroup1_show_options()
898 	if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)  in cgroup1_show_options()
902 	if (strlen(root->release_agent_path))  in cgroup1_show_options()
904 				root->release_agent_path);  in cgroup1_show_options()
907 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))  in cgroup1_show_options()
909 	if (strlen(root->name))  in cgroup1_show_options()
910 		seq_show_option(seq, "name", root->name);  in cgroup1_show_options()
949 	if (opt == -ENOPARAM) {  in cgroup1_parse_param()
953 		if (ret != -ENOPARAM)  in cgroup1_parse_param()
956 			if (strcmp(param->key, ss->legacy_name) ||  in cgroup1_parse_param()
961 					       param->key);  in cgroup1_parse_param()
962 			ctx->subsys_mask |= (1 << i);  in cgroup1_parse_param()
965 		return invalfc(fc, "Unknown subsys name '%s'", param->key);  in cgroup1_parse_param()
973 		ctx->none = true;  in cgroup1_parse_param()
976 		ctx->all_ss = true;  in cgroup1_parse_param()
979 		ctx->flags |= CGRP_ROOT_NOPREFIX;  in cgroup1_parse_param()
982 		ctx->cpuset_clone_children = true;  in cgroup1_parse_param()
985 		ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;  in cgroup1_parse_param()
988 		ctx->flags |= CGRP_ROOT_XATTR;  in cgroup1_parse_param()
991 		ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;  in cgroup1_parse_param()
994 		ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;  in cgroup1_parse_param()
998 		if (ctx->release_agent)  in cgroup1_parse_param()
1002 		 * require capabilities to set release agent.  in cgroup1_parse_param()
1004 		if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))  in cgroup1_parse_param()
1006 		ctx->release_agent = param->string;  in cgroup1_parse_param()
1007 		param->string = NULL;  in cgroup1_parse_param()
1012 			return -ENOENT;  in cgroup1_parse_param()
1014 		if (!param->size)  in cgroup1_parse_param()
1016 		if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)  in cgroup1_parse_param()
1018 		/* Must match [\w.-]+ */  in cgroup1_parse_param()
1019 		for (i = 0; i < param->size; i++) {  in cgroup1_parse_param()
1020 			char c = param->string[i];  in cgroup1_parse_param()
1023 			if ((c == '.') || (c == '-') || (c == '_'))  in cgroup1_parse_param()
1028 		if (ctx->name)  in cgroup1_parse_param()
1030 		ctx->name = param->string;  in cgroup1_parse_param()
1031 		param->string = NULL;  in cgroup1_parse_param()
1053 	ctx->subsys_mask &= enabled;  in check_cgroupfs_options()
1057 	 * let's default to 'all'.  in check_cgroupfs_options()
1059 	if (!ctx->subsys_mask && !ctx->none && !ctx->name)  in check_cgroupfs_options()
1060 		ctx->all_ss = true;  in check_cgroupfs_options()
1062 	if (ctx->all_ss) {  in check_cgroupfs_options()
1064 		if (ctx->subsys_mask)  in check_cgroupfs_options()
1067 		ctx->subsys_mask = enabled;  in check_cgroupfs_options()
1071 	 * We either have to specify by name or by subsystems. (So all  in check_cgroupfs_options()
1074 	if (!ctx->subsys_mask && !ctx->name)  in check_cgroupfs_options()
1082 	if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))  in check_cgroupfs_options()
1086 	if (ctx->subsys_mask && ctx->none)  in check_cgroupfs_options()
1095 	struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);  in cgroup1_reconfigure()
1107 	if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)  in cgroup1_reconfigure()
1109 			task_tgid_nr(current), current->comm);  in cgroup1_reconfigure()
1111 	added_mask = ctx->subsys_mask & ~root->subsys_mask;  in cgroup1_reconfigure()
1112 	removed_mask = root->subsys_mask & ~ctx->subsys_mask;  in cgroup1_reconfigure()
1114 	/* Don't allow flags or name to change at remount */  in cgroup1_reconfigure()
1115 	if ((ctx->flags ^ root->flags) ||  in cgroup1_reconfigure()
1116 	    (ctx->name && strcmp(ctx->name, root->name))) {  in cgroup1_reconfigure()
1118 		       ctx->flags, ctx->name ?: "", root->flags, root->name);  in cgroup1_reconfigure()
1119 		ret = -EINVAL;  in cgroup1_reconfigure()
1124 	if (!list_empty(&root->cgrp.self.children)) {  in cgroup1_reconfigure()
1125 		ret = -EBUSY;  in cgroup1_reconfigure()
1135 	if (ctx->release_agent) {  in cgroup1_reconfigure()
1137 		strscpy(root->release_agent_path, ctx->release_agent);  in cgroup1_reconfigure()
1157  * The guts of cgroup1 mount - find or create cgroup_root to use.
1158  * Called with cgroup_mutex held; returns 0 on success, -E... on
1159  * error and positive - in case when the candidate is busy dying.
1160  * On success it stashes a reference to cgroup_root into given
1179 	 * dying subsystems.  We just need to ensure that the ones  in cgroup1_root_to_use()
1184 		if (!(ctx->subsys_mask & (1 << i)) ||  in cgroup1_root_to_use()
1185 		    ss->root == &cgrp_dfl_root)  in cgroup1_root_to_use()
1188 		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))  in cgroup1_root_to_use()
1190 		cgroup_put(&ss->root->cgrp);  in cgroup1_root_to_use()
1204 		if (ctx->name) {  in cgroup1_root_to_use()
1205 			if (strcmp(ctx->name, root->name))  in cgroup1_root_to_use()
1214 		if ((ctx->subsys_mask || ctx->none) &&  in cgroup1_root_to_use()
1215 		    (ctx->subsys_mask != root->subsys_mask)) {  in cgroup1_root_to_use()
1218 			return -EBUSY;  in cgroup1_root_to_use()
1221 		if (root->flags ^ ctx->flags)  in cgroup1_root_to_use()
1224 		ctx->root = root;  in cgroup1_root_to_use()
1233 	if (!ctx->subsys_mask && !ctx->none)  in cgroup1_root_to_use()
1237 	if (ctx->ns != &init_cgroup_ns)  in cgroup1_root_to_use()
1238 		return -EPERM;  in cgroup1_root_to_use()
1242 		return -ENOMEM;  in cgroup1_root_to_use()
1244 	ctx->root = root;  in cgroup1_root_to_use()
1247 	ret = cgroup_setup_root(root, ctx->subsys_mask);  in cgroup1_root_to_use()
1249 		cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);  in cgroup1_root_to_use()
1261 	/* Check if the caller has permission to mount. */  in cgroup1_get_tree()
1262 	if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))  in cgroup1_get_tree()
1263 		return -EPERM;  in cgroup1_get_tree()
1268 	if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))  in cgroup1_get_tree()
1276 	if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {  in cgroup1_get_tree()
1289  * task_get_cgroup1 - Acquires the associated cgroup of a task within a
1296  * We limit it to cgroup1 only.
1300 	struct cgroup *cgrp = ERR_PTR(-ENOENT);  in task_get_cgroup1()
1309 		if (root->hierarchy_id != hierarchy_id)  in task_get_cgroup1()
1314 			cgrp = ERR_PTR(-ENOENT);  in task_get_cgroup1()
1325 	 * Used to destroy pidlists and separate to serve as flush domain.  in cgroup1_wq_init()
1326 	 * Cap @max_active to 1 too.  in cgroup1_wq_init()
1356 			if (strcmp(token, ss->name) &&  in cgroup_no_v1()
1357 			    strcmp(token, ss->legacy_name))  in cgroup_no_v1()