1 /* 2 * device_cgroup.c - device cgroup subsystem 3 * 4 * Copyright 2007 IBM Corp 5 */ 6 7 #include <linux/device_cgroup.h> 8 #include <linux/cgroup.h> 9 #include <linux/ctype.h> 10 #include <linux/list.h> 11 #include <linux/uaccess.h> 12 #include <linux/seq_file.h> 13 #include <linux/slab.h> 14 #include <linux/rcupdate.h> 15 #include <linux/mutex.h> 16 17 #define ACC_MKNOD 1 18 #define ACC_READ 2 19 #define ACC_WRITE 4 20 #define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE) 21 22 #define DEV_BLOCK 1 23 #define DEV_CHAR 2 24 #define DEV_ALL 4 /* this represents all devices */ 25 26 static DEFINE_MUTEX(devcgroup_mutex); 27 28 enum devcg_behavior { 29 DEVCG_DEFAULT_NONE, 30 DEVCG_DEFAULT_ALLOW, 31 DEVCG_DEFAULT_DENY, 32 }; 33 34 /* 35 * exception list locking rules: 36 * hold devcgroup_mutex for update/read. 37 * hold rcu_read_lock() for read. 38 */ 39 40 struct dev_exception_item { 41 u32 major, minor; 42 short type; 43 short access; 44 struct list_head list; 45 struct rcu_head rcu; 46 }; 47 48 struct dev_cgroup { 49 struct cgroup_subsys_state css; 50 struct list_head exceptions; 51 enum devcg_behavior behavior; 52 }; 53 54 static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) 55 { 56 return s ? container_of(s, struct dev_cgroup, css) : NULL; 57 } 58 59 static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) 60 { 61 return css_to_devcgroup(task_css(task, devices_subsys_id)); 62 } 63 64 struct cgroup_subsys devices_subsys; 65 66 /* 67 * called under devcgroup_mutex 68 */ 69 static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig) 70 { 71 struct dev_exception_item *ex, *tmp, *new; 72 73 lockdep_assert_held(&devcgroup_mutex); 74 75 list_for_each_entry(ex, orig, list) { 76 new = kmemdup(ex, sizeof(*ex), GFP_KERNEL); 77 if (!new) 78 goto free_and_exit; 79 list_add_tail(&new->list, dest); 80 } 81 82 return 0; 83 84 free_and_exit: 85 list_for_each_entry_safe(ex, tmp, dest, list) { 86 list_del(&ex->list); 87 kfree(ex); 88 } 89 return -ENOMEM; 90 } 91 92 /* 93 * called under devcgroup_mutex 94 */ 95 static int dev_exception_add(struct dev_cgroup *dev_cgroup, 96 struct dev_exception_item *ex) 97 { 98 struct dev_exception_item *excopy, *walk; 99 100 lockdep_assert_held(&devcgroup_mutex); 101 102 excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL); 103 if (!excopy) 104 return -ENOMEM; 105 106 list_for_each_entry(walk, &dev_cgroup->exceptions, list) { 107 if (walk->type != ex->type) 108 continue; 109 if (walk->major != ex->major) 110 continue; 111 if (walk->minor != ex->minor) 112 continue; 113 114 walk->access |= ex->access; 115 kfree(excopy); 116 excopy = NULL; 117 } 118 119 if (excopy != NULL) 120 list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions); 121 return 0; 122 } 123 124 /* 125 * called under devcgroup_mutex 126 */ 127 static void dev_exception_rm(struct dev_cgroup *dev_cgroup, 128 struct dev_exception_item *ex) 129 { 130 struct dev_exception_item *walk, *tmp; 131 132 lockdep_assert_held(&devcgroup_mutex); 133 134 list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) { 135 if (walk->type != ex->type) 136 continue; 137 if (walk->major != ex->major) 138 continue; 139 if (walk->minor != ex->minor) 140 continue; 141 142 walk->access &= ~ex->access; 143 if (!walk->access) { 144 list_del_rcu(&walk->list); 145 kfree_rcu(walk, rcu); 146 } 147 } 148 } 149 150 static void __dev_exception_clean(struct dev_cgroup *dev_cgroup) 151 { 152 struct dev_exception_item *ex, *tmp; 153 154 list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) { 155 list_del_rcu(&ex->list); 156 kfree_rcu(ex, rcu); 157 } 158 } 159 160 /** 161 * dev_exception_clean - frees all entries of the exception list 162 * @dev_cgroup: dev_cgroup with the exception list to be cleaned 163 * 164 * called under devcgroup_mutex 165 */ 166 static void dev_exception_clean(struct dev_cgroup *dev_cgroup) 167 { 168 lockdep_assert_held(&devcgroup_mutex); 169 170 __dev_exception_clean(dev_cgroup); 171 } 172 173 static inline bool is_devcg_online(const struct dev_cgroup *devcg) 174 { 175 return (devcg->behavior != DEVCG_DEFAULT_NONE); 176 } 177 178 /** 179 * devcgroup_online - initializes devcgroup's behavior and exceptions based on 180 * parent's 181 * @css: css getting online 182 * returns 0 in case of success, error code otherwise 183 */ 184 static int devcgroup_online(struct cgroup_subsys_state *css) 185 { 186 struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); 187 struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css_parent(css)); 188 int ret = 0; 189 190 mutex_lock(&devcgroup_mutex); 191 192 if (parent_dev_cgroup == NULL) 193 dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; 194 else { 195 ret = dev_exceptions_copy(&dev_cgroup->exceptions, 196 &parent_dev_cgroup->exceptions); 197 if (!ret) 198 dev_cgroup->behavior = parent_dev_cgroup->behavior; 199 } 200 mutex_unlock(&devcgroup_mutex); 201 202 return ret; 203 } 204 205 static void devcgroup_offline(struct cgroup_subsys_state *css) 206 { 207 struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); 208 209 mutex_lock(&devcgroup_mutex); 210 dev_cgroup->behavior = DEVCG_DEFAULT_NONE; 211 mutex_unlock(&devcgroup_mutex); 212 } 213 214 /* 215 * called from kernel/cgroup.c with cgroup_lock() held. 216 */ 217 static struct cgroup_subsys_state * 218 devcgroup_css_alloc(struct cgroup_subsys_state *parent_css) 219 { 220 struct dev_cgroup *dev_cgroup; 221 222 dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL); 223 if (!dev_cgroup) 224 return ERR_PTR(-ENOMEM); 225 INIT_LIST_HEAD(&dev_cgroup->exceptions); 226 dev_cgroup->behavior = DEVCG_DEFAULT_NONE; 227 228 return &dev_cgroup->css; 229 } 230 231 static void devcgroup_css_free(struct cgroup_subsys_state *css) 232 { 233 struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); 234 235 __dev_exception_clean(dev_cgroup); 236 kfree(dev_cgroup); 237 } 238 239 #define DEVCG_ALLOW 1 240 #define DEVCG_DENY 2 241 #define DEVCG_LIST 3 242 243 #define MAJMINLEN 13 244 #define ACCLEN 4 245 246 static void set_access(char *acc, short access) 247 { 248 int idx = 0; 249 memset(acc, 0, ACCLEN); 250 if (access & ACC_READ) 251 acc[idx++] = 'r'; 252 if (access & ACC_WRITE) 253 acc[idx++] = 'w'; 254 if (access & ACC_MKNOD) 255 acc[idx++] = 'm'; 256 } 257 258 static char type_to_char(short type) 259 { 260 if (type == DEV_ALL) 261 return 'a'; 262 if (type == DEV_CHAR) 263 return 'c'; 264 if (type == DEV_BLOCK) 265 return 'b'; 266 return 'X'; 267 } 268 269 static void set_majmin(char *str, unsigned m) 270 { 271 if (m == ~0) 272 strcpy(str, "*"); 273 else 274 sprintf(str, "%u", m); 275 } 276 277 static int devcgroup_seq_show(struct seq_file *m, void *v) 278 { 279 struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m)); 280 struct dev_exception_item *ex; 281 char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; 282 283 rcu_read_lock(); 284 /* 285 * To preserve the compatibility: 286 * - Only show the "all devices" when the default policy is to allow 287 * - List the exceptions in case the default policy is to deny 288 * This way, the file remains as a "whitelist of devices" 289 */ 290 if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { 291 set_access(acc, ACC_MASK); 292 set_majmin(maj, ~0); 293 set_majmin(min, ~0); 294 seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL), 295 maj, min, acc); 296 } else { 297 list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) { 298 set_access(acc, ex->access); 299 set_majmin(maj, ex->major); 300 set_majmin(min, ex->minor); 301 seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type), 302 maj, min, acc); 303 } 304 } 305 rcu_read_unlock(); 306 307 return 0; 308 } 309 310 /** 311 * may_access - verifies if a new exception is part of what is allowed 312 * by a dev cgroup based on the default policy + 313 * exceptions. This is used to make sure a child cgroup 314 * won't have more privileges than its parent or to 315 * verify if a certain access is allowed. 316 * @dev_cgroup: dev cgroup to be tested against 317 * @refex: new exception 318 * @behavior: behavior of the exception 319 */ 320 static bool may_access(struct dev_cgroup *dev_cgroup, 321 struct dev_exception_item *refex, 322 enum devcg_behavior behavior) 323 { 324 struct dev_exception_item *ex; 325 bool match = false; 326 327 rcu_lockdep_assert(rcu_read_lock_held() || 328 lockdep_is_held(&devcgroup_mutex), 329 "device_cgroup::may_access() called without proper synchronization"); 330 331 list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) { 332 if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK)) 333 continue; 334 if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR)) 335 continue; 336 if (ex->major != ~0 && ex->major != refex->major) 337 continue; 338 if (ex->minor != ~0 && ex->minor != refex->minor) 339 continue; 340 if (refex->access & (~ex->access)) 341 continue; 342 match = true; 343 break; 344 } 345 346 if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) { 347 if (behavior == DEVCG_DEFAULT_ALLOW) { 348 /* the exception will deny access to certain devices */ 349 return true; 350 } else { 351 /* the exception will allow access to certain devices */ 352 if (match) 353 /* 354 * a new exception allowing access shouldn't 355 * match an parent's exception 356 */ 357 return false; 358 return true; 359 } 360 } else { 361 /* only behavior == DEVCG_DEFAULT_DENY allowed here */ 362 if (match) 363 /* parent has an exception that matches the proposed */ 364 return true; 365 else 366 return false; 367 } 368 return false; 369 } 370 371 /* 372 * parent_has_perm: 373 * when adding a new allow rule to a device exception list, the rule 374 * must be allowed in the parent device 375 */ 376 static int parent_has_perm(struct dev_cgroup *childcg, 377 struct dev_exception_item *ex) 378 { 379 struct dev_cgroup *parent = css_to_devcgroup(css_parent(&childcg->css)); 380 381 if (!parent) 382 return 1; 383 return may_access(parent, ex, childcg->behavior); 384 } 385 386 /** 387 * may_allow_all - checks if it's possible to change the behavior to 388 * allow based on parent's rules. 389 * @parent: device cgroup's parent 390 * returns: != 0 in case it's allowed, 0 otherwise 391 */ 392 static inline int may_allow_all(struct dev_cgroup *parent) 393 { 394 if (!parent) 395 return 1; 396 return parent->behavior == DEVCG_DEFAULT_ALLOW; 397 } 398 399 /** 400 * revalidate_active_exceptions - walks through the active exception list and 401 * revalidates the exceptions based on parent's 402 * behavior and exceptions. The exceptions that 403 * are no longer valid will be removed. 404 * Called with devcgroup_mutex held. 405 * @devcg: cgroup which exceptions will be checked 406 * 407 * This is one of the three key functions for hierarchy implementation. 408 * This function is responsible for re-evaluating all the cgroup's active 409 * exceptions due to a parent's exception change. 410 * Refer to Documentation/cgroups/devices.txt for more details. 411 */ 412 static void revalidate_active_exceptions(struct dev_cgroup *devcg) 413 { 414 struct dev_exception_item *ex; 415 struct list_head *this, *tmp; 416 417 list_for_each_safe(this, tmp, &devcg->exceptions) { 418 ex = container_of(this, struct dev_exception_item, list); 419 if (!parent_has_perm(devcg, ex)) 420 dev_exception_rm(devcg, ex); 421 } 422 } 423 424 /** 425 * propagate_exception - propagates a new exception to the children 426 * @devcg_root: device cgroup that added a new exception 427 * @ex: new exception to be propagated 428 * 429 * returns: 0 in case of success, != 0 in case of error 430 */ 431 static int propagate_exception(struct dev_cgroup *devcg_root, 432 struct dev_exception_item *ex) 433 { 434 struct cgroup_subsys_state *pos; 435 int rc = 0; 436 437 rcu_read_lock(); 438 439 css_for_each_descendant_pre(pos, &devcg_root->css) { 440 struct dev_cgroup *devcg = css_to_devcgroup(pos); 441 442 /* 443 * Because devcgroup_mutex is held, no devcg will become 444 * online or offline during the tree walk (see on/offline 445 * methods), and online ones are safe to access outside RCU 446 * read lock without bumping refcnt. 447 */ 448 if (pos == &devcg_root->css || !is_devcg_online(devcg)) 449 continue; 450 451 rcu_read_unlock(); 452 453 /* 454 * in case both root's behavior and devcg is allow, a new 455 * restriction means adding to the exception list 456 */ 457 if (devcg_root->behavior == DEVCG_DEFAULT_ALLOW && 458 devcg->behavior == DEVCG_DEFAULT_ALLOW) { 459 rc = dev_exception_add(devcg, ex); 460 if (rc) 461 break; 462 } else { 463 /* 464 * in the other possible cases: 465 * root's behavior: allow, devcg's: deny 466 * root's behavior: deny, devcg's: deny 467 * the exception will be removed 468 */ 469 dev_exception_rm(devcg, ex); 470 } 471 revalidate_active_exceptions(devcg); 472 473 rcu_read_lock(); 474 } 475 476 rcu_read_unlock(); 477 return rc; 478 } 479 480 static inline bool has_children(struct dev_cgroup *devcgroup) 481 { 482 struct cgroup *cgrp = devcgroup->css.cgroup; 483 484 return !list_empty(&cgrp->children); 485 } 486 487 /* 488 * Modify the exception list using allow/deny rules. 489 * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD 490 * so we can give a container CAP_MKNOD to let it create devices but not 491 * modify the exception list. 492 * It seems likely we'll want to add a CAP_CONTAINER capability to allow 493 * us to also grant CAP_SYS_ADMIN to containers without giving away the 494 * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN 495 * 496 * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting 497 * new access is only allowed if you're in the top-level cgroup, or your 498 * parent cgroup has the access you're asking for. 499 */ 500 static int devcgroup_update_access(struct dev_cgroup *devcgroup, 501 int filetype, const char *buffer) 502 { 503 const char *b; 504 char temp[12]; /* 11 + 1 characters needed for a u32 */ 505 int count, rc = 0; 506 struct dev_exception_item ex; 507 struct dev_cgroup *parent = css_to_devcgroup(css_parent(&devcgroup->css)); 508 509 if (!capable(CAP_SYS_ADMIN)) 510 return -EPERM; 511 512 memset(&ex, 0, sizeof(ex)); 513 b = buffer; 514 515 switch (*b) { 516 case 'a': 517 switch (filetype) { 518 case DEVCG_ALLOW: 519 if (has_children(devcgroup)) 520 return -EINVAL; 521 522 if (!may_allow_all(parent)) 523 return -EPERM; 524 dev_exception_clean(devcgroup); 525 devcgroup->behavior = DEVCG_DEFAULT_ALLOW; 526 if (!parent) 527 break; 528 529 rc = dev_exceptions_copy(&devcgroup->exceptions, 530 &parent->exceptions); 531 if (rc) 532 return rc; 533 break; 534 case DEVCG_DENY: 535 if (has_children(devcgroup)) 536 return -EINVAL; 537 538 dev_exception_clean(devcgroup); 539 devcgroup->behavior = DEVCG_DEFAULT_DENY; 540 break; 541 default: 542 return -EINVAL; 543 } 544 return 0; 545 case 'b': 546 ex.type = DEV_BLOCK; 547 break; 548 case 'c': 549 ex.type = DEV_CHAR; 550 break; 551 default: 552 return -EINVAL; 553 } 554 b++; 555 if (!isspace(*b)) 556 return -EINVAL; 557 b++; 558 if (*b == '*') { 559 ex.major = ~0; 560 b++; 561 } else if (isdigit(*b)) { 562 memset(temp, 0, sizeof(temp)); 563 for (count = 0; count < sizeof(temp) - 1; count++) { 564 temp[count] = *b; 565 b++; 566 if (!isdigit(*b)) 567 break; 568 } 569 rc = kstrtou32(temp, 10, &ex.major); 570 if (rc) 571 return -EINVAL; 572 } else { 573 return -EINVAL; 574 } 575 if (*b != ':') 576 return -EINVAL; 577 b++; 578 579 /* read minor */ 580 if (*b == '*') { 581 ex.minor = ~0; 582 b++; 583 } else if (isdigit(*b)) { 584 memset(temp, 0, sizeof(temp)); 585 for (count = 0; count < sizeof(temp) - 1; count++) { 586 temp[count] = *b; 587 b++; 588 if (!isdigit(*b)) 589 break; 590 } 591 rc = kstrtou32(temp, 10, &ex.minor); 592 if (rc) 593 return -EINVAL; 594 } else { 595 return -EINVAL; 596 } 597 if (!isspace(*b)) 598 return -EINVAL; 599 for (b++, count = 0; count < 3; count++, b++) { 600 switch (*b) { 601 case 'r': 602 ex.access |= ACC_READ; 603 break; 604 case 'w': 605 ex.access |= ACC_WRITE; 606 break; 607 case 'm': 608 ex.access |= ACC_MKNOD; 609 break; 610 case '\n': 611 case '\0': 612 count = 3; 613 break; 614 default: 615 return -EINVAL; 616 } 617 } 618 619 switch (filetype) { 620 case DEVCG_ALLOW: 621 if (!parent_has_perm(devcgroup, &ex)) 622 return -EPERM; 623 /* 624 * If the default policy is to allow by default, try to remove 625 * an matching exception instead. And be silent about it: we 626 * don't want to break compatibility 627 */ 628 if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { 629 dev_exception_rm(devcgroup, &ex); 630 return 0; 631 } 632 rc = dev_exception_add(devcgroup, &ex); 633 break; 634 case DEVCG_DENY: 635 /* 636 * If the default policy is to deny by default, try to remove 637 * an matching exception instead. And be silent about it: we 638 * don't want to break compatibility 639 */ 640 if (devcgroup->behavior == DEVCG_DEFAULT_DENY) 641 dev_exception_rm(devcgroup, &ex); 642 else 643 rc = dev_exception_add(devcgroup, &ex); 644 645 if (rc) 646 break; 647 /* we only propagate new restrictions */ 648 rc = propagate_exception(devcgroup, &ex); 649 break; 650 default: 651 rc = -EINVAL; 652 } 653 return rc; 654 } 655 656 static int devcgroup_access_write(struct cgroup_subsys_state *css, 657 struct cftype *cft, const char *buffer) 658 { 659 int retval; 660 661 mutex_lock(&devcgroup_mutex); 662 retval = devcgroup_update_access(css_to_devcgroup(css), 663 cft->private, buffer); 664 mutex_unlock(&devcgroup_mutex); 665 return retval; 666 } 667 668 static struct cftype dev_cgroup_files[] = { 669 { 670 .name = "allow", 671 .write_string = devcgroup_access_write, 672 .private = DEVCG_ALLOW, 673 }, 674 { 675 .name = "deny", 676 .write_string = devcgroup_access_write, 677 .private = DEVCG_DENY, 678 }, 679 { 680 .name = "list", 681 .seq_show = devcgroup_seq_show, 682 .private = DEVCG_LIST, 683 }, 684 { } /* terminate */ 685 }; 686 687 struct cgroup_subsys devices_subsys = { 688 .name = "devices", 689 .css_alloc = devcgroup_css_alloc, 690 .css_free = devcgroup_css_free, 691 .css_online = devcgroup_online, 692 .css_offline = devcgroup_offline, 693 .subsys_id = devices_subsys_id, 694 .base_cftypes = dev_cgroup_files, 695 }; 696 697 /** 698 * __devcgroup_check_permission - checks if an inode operation is permitted 699 * @dev_cgroup: the dev cgroup to be tested against 700 * @type: device type 701 * @major: device major number 702 * @minor: device minor number 703 * @access: combination of ACC_WRITE, ACC_READ and ACC_MKNOD 704 * 705 * returns 0 on success, -EPERM case the operation is not permitted 706 */ 707 static int __devcgroup_check_permission(short type, u32 major, u32 minor, 708 short access) 709 { 710 struct dev_cgroup *dev_cgroup; 711 struct dev_exception_item ex; 712 int rc; 713 714 memset(&ex, 0, sizeof(ex)); 715 ex.type = type; 716 ex.major = major; 717 ex.minor = minor; 718 ex.access = access; 719 720 rcu_read_lock(); 721 dev_cgroup = task_devcgroup(current); 722 rc = may_access(dev_cgroup, &ex, dev_cgroup->behavior); 723 rcu_read_unlock(); 724 725 if (!rc) 726 return -EPERM; 727 728 return 0; 729 } 730 731 int __devcgroup_inode_permission(struct inode *inode, int mask) 732 { 733 short type, access = 0; 734 735 if (S_ISBLK(inode->i_mode)) 736 type = DEV_BLOCK; 737 if (S_ISCHR(inode->i_mode)) 738 type = DEV_CHAR; 739 if (mask & MAY_WRITE) 740 access |= ACC_WRITE; 741 if (mask & MAY_READ) 742 access |= ACC_READ; 743 744 return __devcgroup_check_permission(type, imajor(inode), iminor(inode), 745 access); 746 } 747 748 int devcgroup_inode_mknod(int mode, dev_t dev) 749 { 750 short type; 751 752 if (!S_ISBLK(mode) && !S_ISCHR(mode)) 753 return 0; 754 755 if (S_ISBLK(mode)) 756 type = DEV_BLOCK; 757 else 758 type = DEV_CHAR; 759 760 return __devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), 761 ACC_MKNOD); 762 763 } 764