xref: /linux/security/device_cgroup.c (revision 08ec212c0f92cbf30e3ecc7349f18151714041d6)
1 /*
2  * device_cgroup.c - device cgroup subsystem
3  *
4  * Copyright 2007 IBM Corp
5  */
6 
7 #include <linux/device_cgroup.h>
8 #include <linux/cgroup.h>
9 #include <linux/ctype.h>
10 #include <linux/list.h>
11 #include <linux/uaccess.h>
12 #include <linux/seq_file.h>
13 #include <linux/slab.h>
14 #include <linux/rcupdate.h>
15 #include <linux/mutex.h>
16 
/* Access bits kept in dev_exception_item.access */
#define ACC_MKNOD	(1 << 0)
#define ACC_READ	(1 << 1)
#define ACC_WRITE	(1 << 2)
#define ACC_MASK	(ACC_MKNOD | ACC_READ | ACC_WRITE)

/* Device classes kept in dev_exception_item.type */
#define DEV_BLOCK	(1 << 0)
#define DEV_CHAR	(1 << 1)
#define DEV_ALL		(1 << 2)  /* this represents all devices */
25 
/* Taken around every update of a dev_cgroup's policy or exception list. */
static DEFINE_MUTEX(devcgroup_mutex);

/*
 * exception list locking rules:
 * - writers must hold devcgroup_mutex;
 * - readers must hold either devcgroup_mutex or rcu_read_lock().
 */
33 
34 struct dev_exception_item {
35 	u32 major, minor;
36 	short type;
37 	short access;
38 	struct list_head list;
39 	struct rcu_head rcu;
40 };
41 
/*
 * Per-cgroup state of the devices controller: a default policy plus a list
 * of exceptions to that policy.
 */
struct dev_cgroup {
	struct cgroup_subsys_state css;
	/* list of struct dev_exception_item, see the locking rules above */
	struct list_head exceptions;
	/*
	 * true:  everything is denied unless an exception matches;
	 * false: everything is allowed unless an exception matches.
	 */
	bool deny_all;
};
47 
48 static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
49 {
50 	return container_of(s, struct dev_cgroup, css);
51 }
52 
53 static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup)
54 {
55 	return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id));
56 }
57 
58 static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
59 {
60 	return css_to_devcgroup(task_subsys_state(task, devices_subsys_id));
61 }
62 
/* Tentative declaration; the initialized definition appears later in this file. */
struct cgroup_subsys devices_subsys;
64 
65 static int devcgroup_can_attach(struct cgroup *new_cgrp,
66 				struct cgroup_taskset *set)
67 {
68 	struct task_struct *task = cgroup_taskset_first(set);
69 
70 	if (current != task && !capable(CAP_SYS_ADMIN))
71 		return -EPERM;
72 	return 0;
73 }
74 
75 /*
76  * called under devcgroup_mutex
77  */
78 static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig)
79 {
80 	struct dev_exception_item *ex, *tmp, *new;
81 
82 	list_for_each_entry(ex, orig, list) {
83 		new = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
84 		if (!new)
85 			goto free_and_exit;
86 		list_add_tail(&new->list, dest);
87 	}
88 
89 	return 0;
90 
91 free_and_exit:
92 	list_for_each_entry_safe(ex, tmp, dest, list) {
93 		list_del(&ex->list);
94 		kfree(ex);
95 	}
96 	return -ENOMEM;
97 }
98 
99 /*
100  * called under devcgroup_mutex
101  */
102 static int dev_exception_add(struct dev_cgroup *dev_cgroup,
103 			     struct dev_exception_item *ex)
104 {
105 	struct dev_exception_item *excopy, *walk;
106 
107 	excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
108 	if (!excopy)
109 		return -ENOMEM;
110 
111 	list_for_each_entry(walk, &dev_cgroup->exceptions, list) {
112 		if (walk->type != ex->type)
113 			continue;
114 		if (walk->major != ex->major)
115 			continue;
116 		if (walk->minor != ex->minor)
117 			continue;
118 
119 		walk->access |= ex->access;
120 		kfree(excopy);
121 		excopy = NULL;
122 	}
123 
124 	if (excopy != NULL)
125 		list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions);
126 	return 0;
127 }
128 
129 /*
130  * called under devcgroup_mutex
131  */
132 static void dev_exception_rm(struct dev_cgroup *dev_cgroup,
133 			     struct dev_exception_item *ex)
134 {
135 	struct dev_exception_item *walk, *tmp;
136 
137 	list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) {
138 		if (walk->type != ex->type)
139 			continue;
140 		if (walk->major != ex->major)
141 			continue;
142 		if (walk->minor != ex->minor)
143 			continue;
144 
145 		walk->access &= ~ex->access;
146 		if (!walk->access) {
147 			list_del_rcu(&walk->list);
148 			kfree_rcu(walk, rcu);
149 		}
150 	}
151 }
152 
153 /**
154  * dev_exception_clean - frees all entries of the exception list
155  * @dev_cgroup: dev_cgroup with the exception list to be cleaned
156  *
157  * called under devcgroup_mutex
158  */
159 static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
160 {
161 	struct dev_exception_item *ex, *tmp;
162 
163 	list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) {
164 		list_del(&ex->list);
165 		kfree(ex);
166 	}
167 }
168 
169 /*
170  * called from kernel/cgroup.c with cgroup_lock() held.
171  */
172 static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup)
173 {
174 	struct dev_cgroup *dev_cgroup, *parent_dev_cgroup;
175 	struct cgroup *parent_cgroup;
176 	int ret;
177 
178 	dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
179 	if (!dev_cgroup)
180 		return ERR_PTR(-ENOMEM);
181 	INIT_LIST_HEAD(&dev_cgroup->exceptions);
182 	parent_cgroup = cgroup->parent;
183 
184 	if (parent_cgroup == NULL)
185 		dev_cgroup->deny_all = false;
186 	else {
187 		parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup);
188 		mutex_lock(&devcgroup_mutex);
189 		ret = dev_exceptions_copy(&dev_cgroup->exceptions,
190 					  &parent_dev_cgroup->exceptions);
191 		dev_cgroup->deny_all = parent_dev_cgroup->deny_all;
192 		mutex_unlock(&devcgroup_mutex);
193 		if (ret) {
194 			kfree(dev_cgroup);
195 			return ERR_PTR(ret);
196 		}
197 	}
198 
199 	return &dev_cgroup->css;
200 }
201 
/*
 * devcgroup_destroy - release the devices-controller state of @cgroup:
 * empty its exception list, then free the dev_cgroup itself.
 */
static void devcgroup_destroy(struct cgroup *cgroup)
{
	struct dev_cgroup *devcg = cgroup_to_devcgroup(cgroup);

	dev_exception_clean(devcg);
	kfree(devcg);
}
210 
/* Identifies the control file being accessed (stored in cftype.private) */
#define DEVCG_ALLOW	1
#define DEVCG_DENY	2
#define DEVCG_LIST	3

/* Buffer sizes used when formatting an exception for the "list" file */
#define MAJMINLEN	13	/* major or minor as text */
#define ACCLEN		4	/* up to "rwm" plus the terminating NUL */
217 
218 static void set_access(char *acc, short access)
219 {
220 	int idx = 0;
221 	memset(acc, 0, ACCLEN);
222 	if (access & ACC_READ)
223 		acc[idx++] = 'r';
224 	if (access & ACC_WRITE)
225 		acc[idx++] = 'w';
226 	if (access & ACC_MKNOD)
227 		acc[idx++] = 'm';
228 }
229 
230 static char type_to_char(short type)
231 {
232 	if (type == DEV_ALL)
233 		return 'a';
234 	if (type == DEV_CHAR)
235 		return 'c';
236 	if (type == DEV_BLOCK)
237 		return 'b';
238 	return 'X';
239 }
240 
/*
 * set_majmin - render a major or minor number as text
 * @str: output buffer, large enough for a decimal unsigned plus NUL
 * @m: the number, or ~0 for the wildcard
 *
 * The wildcard is printed as "*", every other value in decimal.
 */
static void set_majmin(char *str, unsigned m)
{
	if (m != ~0U) {
		sprintf(str, "%u", m);
		return;
	}

	strcpy(str, "*");
}
248 
249 static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
250 				struct seq_file *m)
251 {
252 	struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
253 	struct dev_exception_item *ex;
254 	char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
255 
256 	rcu_read_lock();
257 	/*
258 	 * To preserve the compatibility:
259 	 * - Only show the "all devices" when the default policy is to allow
260 	 * - List the exceptions in case the default policy is to deny
261 	 * This way, the file remains as a "whitelist of devices"
262 	 */
263 	if (devcgroup->deny_all == false) {
264 		set_access(acc, ACC_MASK);
265 		set_majmin(maj, ~0);
266 		set_majmin(min, ~0);
267 		seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL),
268 			   maj, min, acc);
269 	} else {
270 		list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) {
271 			set_access(acc, ex->access);
272 			set_majmin(maj, ex->major);
273 			set_majmin(min, ex->minor);
274 			seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type),
275 				   maj, min, acc);
276 		}
277 	}
278 	rcu_read_unlock();
279 
280 	return 0;
281 }
282 
283 /**
284  * may_access - verifies if a new exception is part of what is allowed
285  *		by a dev cgroup based on the default policy +
286  *		exceptions. This is used to make sure a child cgroup
287  *		won't have more privileges than its parent or to
288  *		verify if a certain access is allowed.
289  * @dev_cgroup: dev cgroup to be tested against
290  * @refex: new exception
291  */
292 static int may_access(struct dev_cgroup *dev_cgroup,
293 		      struct dev_exception_item *refex)
294 {
295 	struct dev_exception_item *ex;
296 	bool match = false;
297 
298 	list_for_each_entry(ex, &dev_cgroup->exceptions, list) {
299 		if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
300 			continue;
301 		if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR))
302 			continue;
303 		if (ex->major != ~0 && ex->major != refex->major)
304 			continue;
305 		if (ex->minor != ~0 && ex->minor != refex->minor)
306 			continue;
307 		if (refex->access & (~ex->access))
308 			continue;
309 		match = true;
310 		break;
311 	}
312 
313 	/*
314 	 * In two cases we'll consider this new exception valid:
315 	 * - the dev cgroup has its default policy to allow + exception list:
316 	 *   the new exception should *not* match any of the exceptions
317 	 *   (!deny_all, !match)
318 	 * - the dev cgroup has its default policy to deny + exception list:
319 	 *   the new exception *should* match the exceptions
320 	 *   (deny_all, match)
321 	 */
322 	if (dev_cgroup->deny_all == match)
323 		return 1;
324 	return 0;
325 }
326 
327 /*
328  * parent_has_perm:
329  * when adding a new allow rule to a device exception list, the rule
330  * must be allowed in the parent device
331  */
332 static int parent_has_perm(struct dev_cgroup *childcg,
333 				  struct dev_exception_item *ex)
334 {
335 	struct cgroup *pcg = childcg->css.cgroup->parent;
336 	struct dev_cgroup *parent;
337 
338 	if (!pcg)
339 		return 1;
340 	parent = cgroup_to_devcgroup(pcg);
341 	return may_access(parent, ex);
342 }
343 
344 /*
345  * Modify the exception list using allow/deny rules.
346  * CAP_SYS_ADMIN is needed for this.  It's at least separate from CAP_MKNOD
347  * so we can give a container CAP_MKNOD to let it create devices but not
348  * modify the exception list.
349  * It seems likely we'll want to add a CAP_CONTAINER capability to allow
350  * us to also grant CAP_SYS_ADMIN to containers without giving away the
351  * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN
352  *
353  * Taking rules away is always allowed (given CAP_SYS_ADMIN).  Granting
354  * new access is only allowed if you're in the top-level cgroup, or your
355  * parent cgroup has the access you're asking for.
356  */
357 static int devcgroup_update_access(struct dev_cgroup *devcgroup,
358 				   int filetype, const char *buffer)
359 {
360 	const char *b;
361 	char *endp;
362 	int count;
363 	struct dev_exception_item ex;
364 
365 	if (!capable(CAP_SYS_ADMIN))
366 		return -EPERM;
367 
368 	memset(&ex, 0, sizeof(ex));
369 	b = buffer;
370 
371 	switch (*b) {
372 	case 'a':
373 		switch (filetype) {
374 		case DEVCG_ALLOW:
375 			if (!parent_has_perm(devcgroup, &ex))
376 				return -EPERM;
377 			dev_exception_clean(devcgroup);
378 			devcgroup->deny_all = false;
379 			break;
380 		case DEVCG_DENY:
381 			dev_exception_clean(devcgroup);
382 			devcgroup->deny_all = true;
383 			break;
384 		default:
385 			return -EINVAL;
386 		}
387 		return 0;
388 	case 'b':
389 		ex.type = DEV_BLOCK;
390 		break;
391 	case 'c':
392 		ex.type = DEV_CHAR;
393 		break;
394 	default:
395 		return -EINVAL;
396 	}
397 	b++;
398 	if (!isspace(*b))
399 		return -EINVAL;
400 	b++;
401 	if (*b == '*') {
402 		ex.major = ~0;
403 		b++;
404 	} else if (isdigit(*b)) {
405 		ex.major = simple_strtoul(b, &endp, 10);
406 		b = endp;
407 	} else {
408 		return -EINVAL;
409 	}
410 	if (*b != ':')
411 		return -EINVAL;
412 	b++;
413 
414 	/* read minor */
415 	if (*b == '*') {
416 		ex.minor = ~0;
417 		b++;
418 	} else if (isdigit(*b)) {
419 		ex.minor = simple_strtoul(b, &endp, 10);
420 		b = endp;
421 	} else {
422 		return -EINVAL;
423 	}
424 	if (!isspace(*b))
425 		return -EINVAL;
426 	for (b++, count = 0; count < 3; count++, b++) {
427 		switch (*b) {
428 		case 'r':
429 			ex.access |= ACC_READ;
430 			break;
431 		case 'w':
432 			ex.access |= ACC_WRITE;
433 			break;
434 		case 'm':
435 			ex.access |= ACC_MKNOD;
436 			break;
437 		case '\n':
438 		case '\0':
439 			count = 3;
440 			break;
441 		default:
442 			return -EINVAL;
443 		}
444 	}
445 
446 	switch (filetype) {
447 	case DEVCG_ALLOW:
448 		if (!parent_has_perm(devcgroup, &ex))
449 			return -EPERM;
450 		/*
451 		 * If the default policy is to allow by default, try to remove
452 		 * an matching exception instead. And be silent about it: we
453 		 * don't want to break compatibility
454 		 */
455 		if (devcgroup->deny_all == false) {
456 			dev_exception_rm(devcgroup, &ex);
457 			return 0;
458 		}
459 		return dev_exception_add(devcgroup, &ex);
460 	case DEVCG_DENY:
461 		/*
462 		 * If the default policy is to deny by default, try to remove
463 		 * an matching exception instead. And be silent about it: we
464 		 * don't want to break compatibility
465 		 */
466 		if (devcgroup->deny_all == true) {
467 			dev_exception_rm(devcgroup, &ex);
468 			return 0;
469 		}
470 		return dev_exception_add(devcgroup, &ex);
471 	default:
472 		return -EINVAL;
473 	}
474 	return 0;
475 }
476 
477 static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft,
478 				  const char *buffer)
479 {
480 	int retval;
481 
482 	mutex_lock(&devcgroup_mutex);
483 	retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp),
484 					 cft->private, buffer);
485 	mutex_unlock(&devcgroup_mutex);
486 	return retval;
487 }
488 
489 static struct cftype dev_cgroup_files[] = {
490 	{
491 		.name = "allow",
492 		.write_string  = devcgroup_access_write,
493 		.private = DEVCG_ALLOW,
494 	},
495 	{
496 		.name = "deny",
497 		.write_string = devcgroup_access_write,
498 		.private = DEVCG_DENY,
499 	},
500 	{
501 		.name = "list",
502 		.read_seq_string = devcgroup_seq_read,
503 		.private = DEVCG_LIST,
504 	},
505 	{ }	/* terminate */
506 };
507 
508 struct cgroup_subsys devices_subsys = {
509 	.name = "devices",
510 	.can_attach = devcgroup_can_attach,
511 	.create = devcgroup_create,
512 	.destroy = devcgroup_destroy,
513 	.subsys_id = devices_subsys_id,
514 	.base_cftypes = dev_cgroup_files,
515 
516 	/*
517 	 * While devices cgroup has the rudimentary hierarchy support which
518 	 * checks the parent's restriction, it doesn't properly propagates
519 	 * config changes in ancestors to their descendents.  A child
520 	 * should only be allowed to add more restrictions to the parent's
521 	 * configuration.  Fix it and remove the following.
522 	 */
523 	.broken_hierarchy = true,
524 };
525 
526 /**
527  * __devcgroup_check_permission - checks if an inode operation is permitted
528  * @dev_cgroup: the dev cgroup to be tested against
529  * @type: device type
530  * @major: device major number
531  * @minor: device minor number
532  * @access: combination of ACC_WRITE, ACC_READ and ACC_MKNOD
533  *
534  * returns 0 on success, -EPERM case the operation is not permitted
535  */
536 static int __devcgroup_check_permission(struct dev_cgroup *dev_cgroup,
537 					short type, u32 major, u32 minor,
538 				        short access)
539 {
540 	struct dev_exception_item ex;
541 	int rc;
542 
543 	memset(&ex, 0, sizeof(ex));
544 	ex.type = type;
545 	ex.major = major;
546 	ex.minor = minor;
547 	ex.access = access;
548 
549 	rcu_read_lock();
550 	rc = may_access(dev_cgroup, &ex);
551 	rcu_read_unlock();
552 
553 	if (!rc)
554 		return -EPERM;
555 
556 	return 0;
557 }
558 
559 int __devcgroup_inode_permission(struct inode *inode, int mask)
560 {
561 	struct dev_cgroup *dev_cgroup = task_devcgroup(current);
562 	short type, access = 0;
563 
564 	if (S_ISBLK(inode->i_mode))
565 		type = DEV_BLOCK;
566 	if (S_ISCHR(inode->i_mode))
567 		type = DEV_CHAR;
568 	if (mask & MAY_WRITE)
569 		access |= ACC_WRITE;
570 	if (mask & MAY_READ)
571 		access |= ACC_READ;
572 
573 	return __devcgroup_check_permission(dev_cgroup, type, imajor(inode),
574 					    iminor(inode), access);
575 }
576 
577 int devcgroup_inode_mknod(int mode, dev_t dev)
578 {
579 	struct dev_cgroup *dev_cgroup = task_devcgroup(current);
580 	short type;
581 
582 	if (!S_ISBLK(mode) && !S_ISCHR(mode))
583 		return 0;
584 
585 	if (S_ISBLK(mode))
586 		type = DEV_BLOCK;
587 	else
588 		type = DEV_CHAR;
589 
590 	return __devcgroup_check_permission(dev_cgroup, type, MAJOR(dev),
591 					    MINOR(dev), ACC_MKNOD);
592 
593 }
594