1 /* 2 * device_cgroup.c - device cgroup subsystem 3 * 4 * Copyright 2007 IBM Corp 5 */ 6 7 #include <linux/device_cgroup.h> 8 #include <linux/cgroup.h> 9 #include <linux/ctype.h> 10 #include <linux/list.h> 11 #include <linux/uaccess.h> 12 #include <linux/seq_file.h> 13 #include <linux/slab.h> 14 #include <linux/rcupdate.h> 15 #include <linux/mutex.h> 16 17 #define ACC_MKNOD 1 18 #define ACC_READ 2 19 #define ACC_WRITE 4 20 #define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE) 21 22 #define DEV_BLOCK 1 23 #define DEV_CHAR 2 24 #define DEV_ALL 4 /* this represents all devices */ 25 26 static DEFINE_MUTEX(devcgroup_mutex); 27 28 /* 29 * exception list locking rules: 30 * hold devcgroup_mutex for update/read. 31 * hold rcu_read_lock() for read. 32 */ 33 34 struct dev_exception_item { 35 u32 major, minor; 36 short type; 37 short access; 38 struct list_head list; 39 struct rcu_head rcu; 40 }; 41 42 struct dev_cgroup { 43 struct cgroup_subsys_state css; 44 struct list_head exceptions; 45 bool deny_all; 46 }; 47 48 static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) 49 { 50 return container_of(s, struct dev_cgroup, css); 51 } 52 53 static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup) 54 { 55 return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id)); 56 } 57 58 static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) 59 { 60 return css_to_devcgroup(task_subsys_state(task, devices_subsys_id)); 61 } 62 63 struct cgroup_subsys devices_subsys; 64 65 static int devcgroup_can_attach(struct cgroup *new_cgrp, 66 struct cgroup_taskset *set) 67 { 68 struct task_struct *task = cgroup_taskset_first(set); 69 70 if (current != task && !capable(CAP_SYS_ADMIN)) 71 return -EPERM; 72 return 0; 73 } 74 75 /* 76 * called under devcgroup_mutex 77 */ 78 static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig) 79 { 80 struct dev_exception_item *ex, *tmp, *new; 81 82 list_for_each_entry(ex, orig, list) { 83 new = kmemdup(ex, sizeof(*ex), GFP_KERNEL); 84 if (!new) 85 goto free_and_exit; 86 list_add_tail(&new->list, dest); 87 } 88 89 return 0; 90 91 free_and_exit: 92 list_for_each_entry_safe(ex, tmp, dest, list) { 93 list_del(&ex->list); 94 kfree(ex); 95 } 96 return -ENOMEM; 97 } 98 99 /* 100 * called under devcgroup_mutex 101 */ 102 static int dev_exception_add(struct dev_cgroup *dev_cgroup, 103 struct dev_exception_item *ex) 104 { 105 struct dev_exception_item *excopy, *walk; 106 107 excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL); 108 if (!excopy) 109 return -ENOMEM; 110 111 list_for_each_entry(walk, &dev_cgroup->exceptions, list) { 112 if (walk->type != ex->type) 113 continue; 114 if (walk->major != ex->major) 115 continue; 116 if (walk->minor != ex->minor) 117 continue; 118 119 walk->access |= ex->access; 120 kfree(excopy); 121 excopy = NULL; 122 } 123 124 if (excopy != NULL) 125 list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions); 126 return 0; 127 } 128 129 /* 130 * called under devcgroup_mutex 131 */ 132 static void dev_exception_rm(struct dev_cgroup *dev_cgroup, 133 struct dev_exception_item *ex) 134 { 135 struct dev_exception_item *walk, *tmp; 136 137 list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) { 138 if (walk->type != ex->type) 139 continue; 140 if (walk->major != ex->major) 141 continue; 142 if (walk->minor != ex->minor) 143 continue; 144 145 walk->access &= ~ex->access; 146 if (!walk->access) { 147 list_del_rcu(&walk->list); 148 kfree_rcu(walk, rcu); 149 } 150 } 151 } 152 153 /** 154 * dev_exception_clean - frees all entries of the exception list 155 * @dev_cgroup: dev_cgroup with the exception list to be cleaned 156 * 157 * called under devcgroup_mutex 158 */ 159 static void dev_exception_clean(struct dev_cgroup *dev_cgroup) 160 { 161 struct dev_exception_item *ex, *tmp; 162 163 list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) { 164 list_del(&ex->list); 165 kfree(ex); 166 } 167 } 168 169 /* 170 * called from kernel/cgroup.c with cgroup_lock() held. 171 */ 172 static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup) 173 { 174 struct dev_cgroup *dev_cgroup, *parent_dev_cgroup; 175 struct cgroup *parent_cgroup; 176 int ret; 177 178 dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL); 179 if (!dev_cgroup) 180 return ERR_PTR(-ENOMEM); 181 INIT_LIST_HEAD(&dev_cgroup->exceptions); 182 parent_cgroup = cgroup->parent; 183 184 if (parent_cgroup == NULL) 185 dev_cgroup->deny_all = false; 186 else { 187 parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup); 188 mutex_lock(&devcgroup_mutex); 189 ret = dev_exceptions_copy(&dev_cgroup->exceptions, 190 &parent_dev_cgroup->exceptions); 191 dev_cgroup->deny_all = parent_dev_cgroup->deny_all; 192 mutex_unlock(&devcgroup_mutex); 193 if (ret) { 194 kfree(dev_cgroup); 195 return ERR_PTR(ret); 196 } 197 } 198 199 return &dev_cgroup->css; 200 } 201 202 static void devcgroup_destroy(struct cgroup *cgroup) 203 { 204 struct dev_cgroup *dev_cgroup; 205 206 dev_cgroup = cgroup_to_devcgroup(cgroup); 207 dev_exception_clean(dev_cgroup); 208 kfree(dev_cgroup); 209 } 210 211 #define DEVCG_ALLOW 1 212 #define DEVCG_DENY 2 213 #define DEVCG_LIST 3 214 215 #define MAJMINLEN 13 216 #define ACCLEN 4 217 218 static void set_access(char *acc, short access) 219 { 220 int idx = 0; 221 memset(acc, 0, ACCLEN); 222 if (access & ACC_READ) 223 acc[idx++] = 'r'; 224 if (access & ACC_WRITE) 225 acc[idx++] = 'w'; 226 if (access & ACC_MKNOD) 227 acc[idx++] = 'm'; 228 } 229 230 static char type_to_char(short type) 231 { 232 if (type == DEV_ALL) 233 return 'a'; 234 if (type == DEV_CHAR) 235 return 'c'; 236 if (type == DEV_BLOCK) 237 return 'b'; 238 return 'X'; 239 } 240 241 static void set_majmin(char *str, unsigned m) 242 { 243 if (m == ~0) 244 strcpy(str, "*"); 245 else 246 sprintf(str, "%u", m); 247 } 248 249 static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft, 250 struct seq_file *m) 251 { 252 struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup); 253 struct dev_exception_item *ex; 254 char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; 255 256 rcu_read_lock(); 257 /* 258 * To preserve the compatibility: 259 * - Only show the "all devices" when the default policy is to allow 260 * - List the exceptions in case the default policy is to deny 261 * This way, the file remains as a "whitelist of devices" 262 */ 263 if (devcgroup->deny_all == false) { 264 set_access(acc, ACC_MASK); 265 set_majmin(maj, ~0); 266 set_majmin(min, ~0); 267 seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL), 268 maj, min, acc); 269 } else { 270 list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) { 271 set_access(acc, ex->access); 272 set_majmin(maj, ex->major); 273 set_majmin(min, ex->minor); 274 seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type), 275 maj, min, acc); 276 } 277 } 278 rcu_read_unlock(); 279 280 return 0; 281 } 282 283 /** 284 * may_access - verifies if a new exception is part of what is allowed 285 * by a dev cgroup based on the default policy + 286 * exceptions. This is used to make sure a child cgroup 287 * won't have more privileges than its parent or to 288 * verify if a certain access is allowed. 289 * @dev_cgroup: dev cgroup to be tested against 290 * @refex: new exception 291 */ 292 static int may_access(struct dev_cgroup *dev_cgroup, 293 struct dev_exception_item *refex) 294 { 295 struct dev_exception_item *ex; 296 bool match = false; 297 298 list_for_each_entry(ex, &dev_cgroup->exceptions, list) { 299 if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK)) 300 continue; 301 if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR)) 302 continue; 303 if (ex->major != ~0 && ex->major != refex->major) 304 continue; 305 if (ex->minor != ~0 && ex->minor != refex->minor) 306 continue; 307 if (refex->access & (~ex->access)) 308 continue; 309 match = true; 310 break; 311 } 312 313 /* 314 * In two cases we'll consider this new exception valid: 315 * - the dev cgroup has its default policy to allow + exception list: 316 * the new exception should *not* match any of the exceptions 317 * (!deny_all, !match) 318 * - the dev cgroup has its default policy to deny + exception list: 319 * the new exception *should* match the exceptions 320 * (deny_all, match) 321 */ 322 if (dev_cgroup->deny_all == match) 323 return 1; 324 return 0; 325 } 326 327 /* 328 * parent_has_perm: 329 * when adding a new allow rule to a device exception list, the rule 330 * must be allowed in the parent device 331 */ 332 static int parent_has_perm(struct dev_cgroup *childcg, 333 struct dev_exception_item *ex) 334 { 335 struct cgroup *pcg = childcg->css.cgroup->parent; 336 struct dev_cgroup *parent; 337 338 if (!pcg) 339 return 1; 340 parent = cgroup_to_devcgroup(pcg); 341 return may_access(parent, ex); 342 } 343 344 /* 345 * Modify the exception list using allow/deny rules. 346 * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD 347 * so we can give a container CAP_MKNOD to let it create devices but not 348 * modify the exception list. 349 * It seems likely we'll want to add a CAP_CONTAINER capability to allow 350 * us to also grant CAP_SYS_ADMIN to containers without giving away the 351 * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN 352 * 353 * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting 354 * new access is only allowed if you're in the top-level cgroup, or your 355 * parent cgroup has the access you're asking for. 356 */ 357 static int devcgroup_update_access(struct dev_cgroup *devcgroup, 358 int filetype, const char *buffer) 359 { 360 const char *b; 361 char *endp; 362 int count; 363 struct dev_exception_item ex; 364 365 if (!capable(CAP_SYS_ADMIN)) 366 return -EPERM; 367 368 memset(&ex, 0, sizeof(ex)); 369 b = buffer; 370 371 switch (*b) { 372 case 'a': 373 switch (filetype) { 374 case DEVCG_ALLOW: 375 if (!parent_has_perm(devcgroup, &ex)) 376 return -EPERM; 377 dev_exception_clean(devcgroup); 378 devcgroup->deny_all = false; 379 break; 380 case DEVCG_DENY: 381 dev_exception_clean(devcgroup); 382 devcgroup->deny_all = true; 383 break; 384 default: 385 return -EINVAL; 386 } 387 return 0; 388 case 'b': 389 ex.type = DEV_BLOCK; 390 break; 391 case 'c': 392 ex.type = DEV_CHAR; 393 break; 394 default: 395 return -EINVAL; 396 } 397 b++; 398 if (!isspace(*b)) 399 return -EINVAL; 400 b++; 401 if (*b == '*') { 402 ex.major = ~0; 403 b++; 404 } else if (isdigit(*b)) { 405 ex.major = simple_strtoul(b, &endp, 10); 406 b = endp; 407 } else { 408 return -EINVAL; 409 } 410 if (*b != ':') 411 return -EINVAL; 412 b++; 413 414 /* read minor */ 415 if (*b == '*') { 416 ex.minor = ~0; 417 b++; 418 } else if (isdigit(*b)) { 419 ex.minor = simple_strtoul(b, &endp, 10); 420 b = endp; 421 } else { 422 return -EINVAL; 423 } 424 if (!isspace(*b)) 425 return -EINVAL; 426 for (b++, count = 0; count < 3; count++, b++) { 427 switch (*b) { 428 case 'r': 429 ex.access |= ACC_READ; 430 break; 431 case 'w': 432 ex.access |= ACC_WRITE; 433 break; 434 case 'm': 435 ex.access |= ACC_MKNOD; 436 break; 437 case '\n': 438 case '\0': 439 count = 3; 440 break; 441 default: 442 return -EINVAL; 443 } 444 } 445 446 switch (filetype) { 447 case DEVCG_ALLOW: 448 if (!parent_has_perm(devcgroup, &ex)) 449 return -EPERM; 450 /* 451 * If the default policy is to allow by default, try to remove 452 * an matching exception instead. And be silent about it: we 453 * don't want to break compatibility 454 */ 455 if (devcgroup->deny_all == false) { 456 dev_exception_rm(devcgroup, &ex); 457 return 0; 458 } 459 return dev_exception_add(devcgroup, &ex); 460 case DEVCG_DENY: 461 /* 462 * If the default policy is to deny by default, try to remove 463 * an matching exception instead. And be silent about it: we 464 * don't want to break compatibility 465 */ 466 if (devcgroup->deny_all == true) { 467 dev_exception_rm(devcgroup, &ex); 468 return 0; 469 } 470 return dev_exception_add(devcgroup, &ex); 471 default: 472 return -EINVAL; 473 } 474 return 0; 475 } 476 477 static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft, 478 const char *buffer) 479 { 480 int retval; 481 482 mutex_lock(&devcgroup_mutex); 483 retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp), 484 cft->private, buffer); 485 mutex_unlock(&devcgroup_mutex); 486 return retval; 487 } 488 489 static struct cftype dev_cgroup_files[] = { 490 { 491 .name = "allow", 492 .write_string = devcgroup_access_write, 493 .private = DEVCG_ALLOW, 494 }, 495 { 496 .name = "deny", 497 .write_string = devcgroup_access_write, 498 .private = DEVCG_DENY, 499 }, 500 { 501 .name = "list", 502 .read_seq_string = devcgroup_seq_read, 503 .private = DEVCG_LIST, 504 }, 505 { } /* terminate */ 506 }; 507 508 struct cgroup_subsys devices_subsys = { 509 .name = "devices", 510 .can_attach = devcgroup_can_attach, 511 .create = devcgroup_create, 512 .destroy = devcgroup_destroy, 513 .subsys_id = devices_subsys_id, 514 .base_cftypes = dev_cgroup_files, 515 516 /* 517 * While devices cgroup has the rudimentary hierarchy support which 518 * checks the parent's restriction, it doesn't properly propagates 519 * config changes in ancestors to their descendents. A child 520 * should only be allowed to add more restrictions to the parent's 521 * configuration. Fix it and remove the following. 522 */ 523 .broken_hierarchy = true, 524 }; 525 526 /** 527 * __devcgroup_check_permission - checks if an inode operation is permitted 528 * @dev_cgroup: the dev cgroup to be tested against 529 * @type: device type 530 * @major: device major number 531 * @minor: device minor number 532 * @access: combination of ACC_WRITE, ACC_READ and ACC_MKNOD 533 * 534 * returns 0 on success, -EPERM case the operation is not permitted 535 */ 536 static int __devcgroup_check_permission(struct dev_cgroup *dev_cgroup, 537 short type, u32 major, u32 minor, 538 short access) 539 { 540 struct dev_exception_item ex; 541 int rc; 542 543 memset(&ex, 0, sizeof(ex)); 544 ex.type = type; 545 ex.major = major; 546 ex.minor = minor; 547 ex.access = access; 548 549 rcu_read_lock(); 550 rc = may_access(dev_cgroup, &ex); 551 rcu_read_unlock(); 552 553 if (!rc) 554 return -EPERM; 555 556 return 0; 557 } 558 559 int __devcgroup_inode_permission(struct inode *inode, int mask) 560 { 561 struct dev_cgroup *dev_cgroup = task_devcgroup(current); 562 short type, access = 0; 563 564 if (S_ISBLK(inode->i_mode)) 565 type = DEV_BLOCK; 566 if (S_ISCHR(inode->i_mode)) 567 type = DEV_CHAR; 568 if (mask & MAY_WRITE) 569 access |= ACC_WRITE; 570 if (mask & MAY_READ) 571 access |= ACC_READ; 572 573 return __devcgroup_check_permission(dev_cgroup, type, imajor(inode), 574 iminor(inode), access); 575 } 576 577 int devcgroup_inode_mknod(int mode, dev_t dev) 578 { 579 struct dev_cgroup *dev_cgroup = task_devcgroup(current); 580 short type; 581 582 if (!S_ISBLK(mode) && !S_ISCHR(mode)) 583 return 0; 584 585 if (S_ISBLK(mode)) 586 type = DEV_BLOCK; 587 else 588 type = DEV_CHAR; 589 590 return __devcgroup_check_permission(dev_cgroup, type, MAJOR(dev), 591 MINOR(dev), ACC_MKNOD); 592 593 } 594