/*
 * device_cgroup.c - device cgroup subsystem
 *
 * Copyright 2007 IBM Corp
 */

#include <linux/device_cgroup.h>
#include <linux/cgroup.h>
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>

#define ACC_MKNOD 1
#define ACC_READ  2
#define ACC_WRITE 4
#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE)

#define DEV_BLOCK 1
#define DEV_CHAR  2
#define DEV_ALL   4  /* this represents all devices */

static DEFINE_MUTEX(devcgroup_mutex);

/*
 * exception list locking rules:
 * hold devcgroup_mutex for update/read.
 * hold rcu_read_lock() for read.
 */

struct dev_exception_item {
	u32 major, minor;
	short type;
	short access;
	struct list_head list;
	struct rcu_head rcu;
};

struct dev_cgroup {
	struct cgroup_subsys_state css;
	struct list_head exceptions;
	enum {
		DEVCG_DEFAULT_ALLOW,
		DEVCG_DEFAULT_DENY,
	} behavior;
};

static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
{
	return container_of(s, struct dev_cgroup, css);
}

static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup)
{
	return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id));
}

static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
{
	return css_to_devcgroup(task_subsys_state(task, devices_subsys_id));
}

struct cgroup_subsys devices_subsys;

static int devcgroup_can_attach(struct cgroup *new_cgrp,
				struct cgroup_taskset *set)
{
	struct task_struct *task = cgroup_taskset_first(set);

	if (current != task && !capable(CAP_SYS_ADMIN))
		return -EPERM;
	return 0;
}

/*
 * called under devcgroup_mutex
 */
static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig)
{
	struct dev_exception_item *ex, *tmp, *new;

	list_for_each_entry(ex, orig, list) {
		new = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
		if (!new)
			goto free_and_exit;
		list_add_tail(&new->list, dest);
	}

	return 0;

free_and_exit:
	list_for_each_entry_safe(ex, tmp, dest, list) {
		list_del(&ex->list);
		kfree(ex);
	}
	return -ENOMEM;
}

/*
 * called under devcgroup_mutex
 */
static int dev_exception_add(struct dev_cgroup *dev_cgroup,
			     struct dev_exception_item *ex)
{
	struct dev_exception_item *excopy, *walk;

	excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
	if (!excopy)
		return -ENOMEM;

	list_for_each_entry(walk, &dev_cgroup->exceptions, list) {
		if (walk->type != ex->type)
			continue;
		if (walk->major != ex->major)
			continue;
		if (walk->minor != ex->minor)
			continue;

		walk->access |= ex->access;
		kfree(excopy);
		excopy = NULL;
	}

	if (excopy != NULL)
		list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions);
	return 0;
}

/*
 * called under devcgroup_mutex
 */
static void dev_exception_rm(struct dev_cgroup *dev_cgroup,
			     struct dev_exception_item *ex)
{
	struct dev_exception_item *walk, *tmp;

	list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) {
		if (walk->type != ex->type)
			continue;
		if (walk->major != ex->major)
			continue;
		if (walk->minor != ex->minor)
			continue;

		walk->access &= ~ex->access;
		if (!walk->access) {
			list_del_rcu(&walk->list);
			kfree_rcu(walk, rcu);
		}
	}
}
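
/*
 * Note on the removal path above: readers walk the exception list under
 * rcu_read_lock() only, so entries are unlinked with list_del_rcu() and
 * freed with kfree_rcu(), which defers the kfree() until after an RCU
 * grace period.  Writers are serialized by devcgroup_mutex, as stated in
 * the locking rules at the top of this file.
 */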

/**
 * dev_exception_clean - frees all entries of the exception list
 * @dev_cgroup: dev_cgroup with the exception list to be cleaned
 *
 * called under devcgroup_mutex
 */
static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
{
	struct dev_exception_item *ex, *tmp;

	list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) {
		list_del_rcu(&ex->list);
		kfree_rcu(ex, rcu);
	}
}

/*
 * called from kernel/cgroup.c with cgroup_lock() held.
 */
static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup)
{
	struct dev_cgroup *dev_cgroup, *parent_dev_cgroup;
	struct cgroup *parent_cgroup;
	int ret;

	dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
	if (!dev_cgroup)
		return ERR_PTR(-ENOMEM);
	INIT_LIST_HEAD(&dev_cgroup->exceptions);
	parent_cgroup = cgroup->parent;

	if (parent_cgroup == NULL)
		dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW;
	else {
		parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup);
		mutex_lock(&devcgroup_mutex);
		ret = dev_exceptions_copy(&dev_cgroup->exceptions,
					  &parent_dev_cgroup->exceptions);
		dev_cgroup->behavior = parent_dev_cgroup->behavior;
		mutex_unlock(&devcgroup_mutex);
		if (ret) {
			kfree(dev_cgroup);
			return ERR_PTR(ret);
		}
	}

	return &dev_cgroup->css;
}

static void devcgroup_destroy(struct cgroup *cgroup)
{
	struct dev_cgroup *dev_cgroup;

	dev_cgroup = cgroup_to_devcgroup(cgroup);
	dev_exception_clean(dev_cgroup);
	kfree(dev_cgroup);
}

#define DEVCG_ALLOW 1
#define DEVCG_DENY  2
#define DEVCG_LIST  3

#define MAJMINLEN 13
#define ACCLEN    4

static void set_access(char *acc, short access)
{
	int idx = 0;
	memset(acc, 0, ACCLEN);
	if (access & ACC_READ)
		acc[idx++] = 'r';
	if (access & ACC_WRITE)
		acc[idx++] = 'w';
	if (access & ACC_MKNOD)
		acc[idx++] = 'm';
}

static char type_to_char(short type)
{
	if (type == DEV_ALL)
		return 'a';
	if (type == DEV_CHAR)
		return 'c';
	if (type == DEV_BLOCK)
		return 'b';
	return 'X';
}

static void set_majmin(char *str, unsigned m)
{
	if (m == ~0)
		strcpy(str, "*");
	else
		sprintf(str, "%u", m);
}

static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
			      struct seq_file *m)
{
	struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
	struct dev_exception_item *ex;
	char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];

	rcu_read_lock();
	/*
	 * To preserve compatibility:
	 * - only show the "all devices" entry when the default policy is
	 *   to allow
	 * - list the exceptions when the default policy is to deny
	 * This way, the file remains a "whitelist of devices".
	 */
	if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) {
		set_access(acc, ACC_MASK);
		set_majmin(maj, ~0);
		set_majmin(min, ~0);
		seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL),
			   maj, min, acc);
	} else {
		list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) {
			set_access(acc, ex->access);
			set_majmin(maj, ex->major);
			set_majmin(min, ex->minor);
			seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type),
				   maj, min, acc);
		}
	}
	rcu_read_unlock();

	return 0;
}
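
/*
 * Illustrative output of the "devices.list" file produced above (the device
 * numbers are example values only): with DEVCG_DEFAULT_ALLOW a single
 * catch-all entry is shown,
 *
 *	a *:* rwm
 *
 * while with DEVCG_DEFAULT_DENY each exception is listed, e.g.
 *
 *	c 1:3 rwm
 *	b 8:* r
 */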

/**
 * may_access - verifies if a new exception is part of what is allowed
 *		by a dev cgroup based on the default policy +
 *		exceptions. This is used to make sure a child cgroup
 *		won't have more privileges than its parent or to
 *		verify if a certain access is allowed.
 * @dev_cgroup: dev cgroup to be tested against
 * @refex: new exception
 */
static int may_access(struct dev_cgroup *dev_cgroup,
		      struct dev_exception_item *refex)
{
	struct dev_exception_item *ex;
	bool match = false;

	list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) {
		if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
			continue;
		if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR))
			continue;
		if (ex->major != ~0 && ex->major != refex->major)
			continue;
		if (ex->minor != ~0 && ex->minor != refex->minor)
			continue;
		if (refex->access & (~ex->access))
			continue;
		match = true;
		break;
	}

	/*
	 * In two cases we'll consider this new exception valid:
	 * - the dev cgroup has its default policy to allow + exception list:
	 *   the new exception should *not* match any of the exceptions
	 *   (behavior == DEVCG_DEFAULT_ALLOW, !match)
	 * - the dev cgroup has its default policy to deny + exception list:
	 *   the new exception *should* match the exceptions
	 *   (behavior == DEVCG_DEFAULT_DENY, match)
	 */
	if ((dev_cgroup->behavior == DEVCG_DEFAULT_DENY) == match)
		return 1;
	return 0;
}

/*
 * parent_has_perm:
 * when adding a new allow rule to a device exception list, the rule
 * must be allowed in the parent device
 */
static int parent_has_perm(struct dev_cgroup *childcg,
			   struct dev_exception_item *ex)
{
	struct cgroup *pcg = childcg->css.cgroup->parent;
	struct dev_cgroup *parent;

	if (!pcg)
		return 1;
	parent = cgroup_to_devcgroup(pcg);
	return may_access(parent, ex);
}

/**
 * may_allow_all - checks if it's possible to change the behavior to
 *		   allow based on parent's rules.
 * @parent: device cgroup's parent
 * returns: != 0 in case it's allowed, 0 otherwise
 */
static inline int may_allow_all(struct dev_cgroup *parent)
{
	if (!parent)
		return 1;
	return parent->behavior == DEVCG_DEFAULT_ALLOW;
}
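
/*
 * Worked example for may_access() (hypothetical values): suppose behavior
 * is DEVCG_DEFAULT_DENY and the exception list holds one entry for char
 * device 1:3 with access "rw".  A refex asking for "c 1:3 r" matches that
 * exception, so may_access() returns 1 (allowed).  A refex asking for
 * "c 1:3 m" does not match (ACC_MKNOD is not covered by the exception),
 * so it returns 0.  With DEVCG_DEFAULT_ALLOW the logic inverts: a matching
 * exception means the access is denied.
 */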

/*
 * Modify the exception list using allow/deny rules.
 * CAP_SYS_ADMIN is needed for this.  It's at least separate from CAP_MKNOD
 * so we can give a container CAP_MKNOD to let it create devices but not
 * modify the exception list.
 * It seems likely we'll want to add a CAP_CONTAINER capability to allow
 * us to also grant CAP_SYS_ADMIN to containers without giving away the
 * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN.
 *
 * Taking rules away is always allowed (given CAP_SYS_ADMIN).  Granting
 * new access is only allowed if you're in the top-level cgroup, or your
 * parent cgroup has the access you're asking for.
 */
static int devcgroup_update_access(struct dev_cgroup *devcgroup,
				   int filetype, const char *buffer)
{
	const char *b;
	char temp[12];		/* 11 + 1 characters needed for a u32 */
	int count, rc;
	struct dev_exception_item ex;
	struct cgroup *p = devcgroup->css.cgroup;
	struct dev_cgroup *parent = NULL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (p->parent)
		parent = cgroup_to_devcgroup(p->parent);

	memset(&ex, 0, sizeof(ex));
	b = buffer;

	switch (*b) {
	case 'a':
		switch (filetype) {
		case DEVCG_ALLOW:
			if (!may_allow_all(parent))
				return -EPERM;
			dev_exception_clean(devcgroup);
			devcgroup->behavior = DEVCG_DEFAULT_ALLOW;
			if (!parent)
				break;

			rc = dev_exceptions_copy(&devcgroup->exceptions,
						 &parent->exceptions);
			if (rc)
				return rc;
			break;
		case DEVCG_DENY:
			dev_exception_clean(devcgroup);
			devcgroup->behavior = DEVCG_DEFAULT_DENY;
			break;
		default:
			return -EINVAL;
		}
		return 0;
	case 'b':
		ex.type = DEV_BLOCK;
		break;
	case 'c':
		ex.type = DEV_CHAR;
		break;
	default:
		return -EINVAL;
	}
	b++;
	if (!isspace(*b))
		return -EINVAL;
	b++;
	if (*b == '*') {
		ex.major = ~0;
		b++;
	} else if (isdigit(*b)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *b;
			b++;
			if (!isdigit(*b))
				break;
		}
		rc = kstrtou32(temp, 10, &ex.major);
		if (rc)
			return -EINVAL;
	} else {
		return -EINVAL;
	}
	if (*b != ':')
		return -EINVAL;
	b++;

	/* read minor */
	if (*b == '*') {
		ex.minor = ~0;
		b++;
	} else if (isdigit(*b)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *b;
			b++;
			if (!isdigit(*b))
				break;
		}
		rc = kstrtou32(temp, 10, &ex.minor);
		if (rc)
			return -EINVAL;
	} else {
		return -EINVAL;
	}
	if (!isspace(*b))
		return -EINVAL;
	for (b++, count = 0; count < 3; count++, b++) {
		switch (*b) {
		case 'r':
			ex.access |= ACC_READ;
			break;
		case 'w':
			ex.access |= ACC_WRITE;
			break;
		case 'm':
			ex.access |= ACC_MKNOD;
			break;
		case '\n':
		case '\0':
			count = 3;
			break;
		default:
			return -EINVAL;
		}
	}

	switch (filetype) {
	case DEVCG_ALLOW:
		if (!parent_has_perm(devcgroup, &ex))
			return -EPERM;
		/*
		 * If the default policy is to allow, try to remove a
		 * matching exception instead.  And be silent about it:
		 * we don't want to break compatibility.
		 */
		if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) {
			dev_exception_rm(devcgroup, &ex);
			return 0;
		}
		return dev_exception_add(devcgroup, &ex);
	case DEVCG_DENY:
		/*
		 * If the default policy is to deny, try to remove a
		 * matching exception instead.  And be silent about it:
		 * we don't want to break compatibility.
		 */
		if (devcgroup->behavior == DEVCG_DEFAULT_DENY) {
			dev_exception_rm(devcgroup, &ex);
			return 0;
		}
		return dev_exception_add(devcgroup, &ex);
	default:
		return -EINVAL;
	}
	return 0;
}
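
/*
 * Summary of the rule syntax parsed above (examples are illustrative):
 *
 *	a					switch the default behavior,
 *						covering all devices
 *	<b|c> <major|*>:<minor|*> [rwm]		block/char device exception,
 *						e.g. "c 1:3 mr" or "b *:* rwm"
 *
 * The access letters map to ACC_READ ('r'), ACC_WRITE ('w') and
 * ACC_MKNOD ('m'); at most three of them are read.
 */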

static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft,
				  const char *buffer)
{
	int retval;

	mutex_lock(&devcgroup_mutex);
	retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp),
					 cft->private, buffer);
	mutex_unlock(&devcgroup_mutex);
	return retval;
}

static struct cftype dev_cgroup_files[] = {
	{
		.name = "allow",
		.write_string  = devcgroup_access_write,
		.private = DEVCG_ALLOW,
	},
	{
		.name = "deny",
		.write_string = devcgroup_access_write,
		.private = DEVCG_DENY,
	},
	{
		.name = "list",
		.read_seq_string = devcgroup_seq_read,
		.private = DEVCG_LIST,
	},
	{ }	/* terminate */
};

struct cgroup_subsys devices_subsys = {
	.name = "devices",
	.can_attach = devcgroup_can_attach,
	.create = devcgroup_create,
	.destroy = devcgroup_destroy,
	.subsys_id = devices_subsys_id,
	.base_cftypes = dev_cgroup_files,

	/*
	 * While the devices cgroup has rudimentary hierarchy support that
	 * checks the parent's restrictions, it doesn't properly propagate
	 * config changes in ancestors to their descendants.  A child
	 * should only be allowed to add more restrictions to the parent's
	 * configuration.  Fix it and remove the following.
	 */
	.broken_hierarchy = true,
};

/**
 * __devcgroup_check_permission - checks if an inode operation is permitted
 * @type: device type
 * @major: device major number
 * @minor: device minor number
 * @access: combination of ACC_WRITE, ACC_READ and ACC_MKNOD
 *
 * returns 0 on success, -EPERM in case the operation is not permitted
 */
static int __devcgroup_check_permission(short type, u32 major, u32 minor,
					short access)
{
	struct dev_cgroup *dev_cgroup;
	struct dev_exception_item ex;
	int rc;

	memset(&ex, 0, sizeof(ex));
	ex.type = type;
	ex.major = major;
	ex.minor = minor;
	ex.access = access;

	rcu_read_lock();
	dev_cgroup = task_devcgroup(current);
	rc = may_access(dev_cgroup, &ex);
	rcu_read_unlock();

	if (!rc)
		return -EPERM;

	return 0;
}

int __devcgroup_inode_permission(struct inode *inode, int mask)
{
	short type, access = 0;

	if (S_ISBLK(inode->i_mode))
		type = DEV_BLOCK;
	if (S_ISCHR(inode->i_mode))
		type = DEV_CHAR;
	if (mask & MAY_WRITE)
		access |= ACC_WRITE;
	if (mask & MAY_READ)
		access |= ACC_READ;

	return __devcgroup_check_permission(type, imajor(inode), iminor(inode),
					    access);
}

int devcgroup_inode_mknod(int mode, dev_t dev)
{
	short type;

	if (!S_ISBLK(mode) && !S_ISCHR(mode))
		return 0;

	if (S_ISBLK(mode))
		type = DEV_BLOCK;
	else
		type = DEV_CHAR;

	return __devcgroup_check_permission(type, MAJOR(dev), MINOR(dev),
					    ACC_MKNOD);
}
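
/*
 * Illustrative userspace usage (assumes the "devices" cgroup-v1 hierarchy
 * is mounted, e.g. at /sys/fs/cgroup/devices, and that a child cgroup named
 * "sandbox" exists -- both the mount point and the cgroup name are example
 * values only):
 *
 *	# switch the child to deny-by-default, then allow /dev/null (char 1:3)
 *	echo a > /sys/fs/cgroup/devices/sandbox/devices.deny
 *	echo "c 1:3 rw" > /sys/fs/cgroup/devices/sandbox/devices.allow
 *	cat /sys/fs/cgroup/devices/sandbox/devices.list
 */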