/*
 * device_cgroup.c - device cgroup subsystem
 *
 * Copyright 2007 IBM Corp
 */

#include <linux/device_cgroup.h>
#include <linux/cgroup.h>
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>

#define ACC_MKNOD 1
#define ACC_READ 2
#define ACC_WRITE 4
#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE)

#define DEV_BLOCK 1
#define DEV_CHAR 2
#define DEV_ALL 4  /* this represents all devices */

static DEFINE_MUTEX(devcgroup_mutex);

/*
 * exception list locking rules:
 * hold devcgroup_mutex for update/read.
 * hold rcu_read_lock() for read.
 */

struct dev_exception_item {
	u32 major, minor;
	short type;
	short access;
	struct list_head list;
	struct rcu_head rcu;
};

struct dev_cgroup {
	struct cgroup_subsys_state css;
	struct list_head exceptions;
	enum {
		DEVCG_DEFAULT_ALLOW,
		DEVCG_DEFAULT_DENY,
	} behavior;
};

static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
{
	return container_of(s, struct dev_cgroup, css);
}

static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup)
{
	return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id));
}

static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
{
	return css_to_devcgroup(task_subsys_state(task, devices_subsys_id));
}

struct cgroup_subsys devices_subsys;

static int devcgroup_can_attach(struct cgroup *new_cgrp,
				struct cgroup_taskset *set)
{
	struct task_struct *task = cgroup_taskset_first(set);

	if (current != task && !capable(CAP_SYS_ADMIN))
		return -EPERM;
	return 0;
}

/*
 * called under devcgroup_mutex
 */
static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig)
{
	struct dev_exception_item *ex, *tmp, *new;

	list_for_each_entry(ex, orig, list) {
		new = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
		if (!new)
			goto free_and_exit;
		list_add_tail(&new->list, dest);
	}

	return 0;

free_and_exit:
	list_for_each_entry_safe(ex, tmp, dest, list) {
		list_del(&ex->list);
		kfree(ex);
	}
	return -ENOMEM;
}

/*
 * called under devcgroup_mutex
 */
static int dev_exception_add(struct dev_cgroup *dev_cgroup,
			     struct dev_exception_item *ex)
{
	struct dev_exception_item *excopy, *walk;

	excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
	if (!excopy)
		return -ENOMEM;

	list_for_each_entry(walk, &dev_cgroup->exceptions, list) {
		if (walk->type != ex->type)
			continue;
		if (walk->major != ex->major)
			continue;
		if (walk->minor != ex->minor)
			continue;

		walk->access |= ex->access;
		kfree(excopy);
		excopy = NULL;
	}

	if (excopy != NULL)
		list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions);
	return 0;
}
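
/*
 * For example: if the exception list already holds an entry for char
 * device 1:3 with ACC_READ, adding { DEV_CHAR, 1, 3, ACC_WRITE } via
 * dev_exception_add() simply ORs ACC_WRITE into the existing entry
 * rather than queueing a duplicate.  Conversely, dev_exception_rm()
 * below clears the requested access bits and frees the entry once no
 * bits remain.
 */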

/*
 * called under devcgroup_mutex
 */
static void dev_exception_rm(struct dev_cgroup *dev_cgroup,
			     struct dev_exception_item *ex)
{
	struct dev_exception_item *walk, *tmp;

	list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) {
		if (walk->type != ex->type)
			continue;
		if (walk->major != ex->major)
			continue;
		if (walk->minor != ex->minor)
			continue;

		walk->access &= ~ex->access;
		if (!walk->access) {
			list_del_rcu(&walk->list);
			kfree_rcu(walk, rcu);
		}
	}
}

/**
 * dev_exception_clean - frees all entries of the exception list
 * @dev_cgroup: dev_cgroup with the exception list to be cleaned
 *
 * called under devcgroup_mutex
 */
static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
{
	struct dev_exception_item *ex, *tmp;

	list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) {
		list_del(&ex->list);
		kfree(ex);
	}
}

/*
 * called from kernel/cgroup.c with cgroup_lock() held.
 */
static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup)
{
	struct dev_cgroup *dev_cgroup, *parent_dev_cgroup;
	struct cgroup *parent_cgroup;
	int ret;

	dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
	if (!dev_cgroup)
		return ERR_PTR(-ENOMEM);
	INIT_LIST_HEAD(&dev_cgroup->exceptions);
	parent_cgroup = cgroup->parent;

	if (parent_cgroup == NULL)
		dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW;
	else {
		parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup);
		mutex_lock(&devcgroup_mutex);
		ret = dev_exceptions_copy(&dev_cgroup->exceptions,
					  &parent_dev_cgroup->exceptions);
		dev_cgroup->behavior = parent_dev_cgroup->behavior;
		mutex_unlock(&devcgroup_mutex);
		if (ret) {
			kfree(dev_cgroup);
			return ERR_PTR(ret);
		}
	}

	return &dev_cgroup->css;
}

static void devcgroup_destroy(struct cgroup *cgroup)
{
	struct dev_cgroup *dev_cgroup;

	dev_cgroup = cgroup_to_devcgroup(cgroup);
	dev_exception_clean(dev_cgroup);
	kfree(dev_cgroup);
}

#define DEVCG_ALLOW 1
#define DEVCG_DENY 2
#define DEVCG_LIST 3

#define MAJMINLEN 13
#define ACCLEN 4

static void set_access(char *acc, short access)
{
	int idx = 0;
	memset(acc, 0, ACCLEN);
	if (access & ACC_READ)
		acc[idx++] = 'r';
	if (access & ACC_WRITE)
		acc[idx++] = 'w';
	if (access & ACC_MKNOD)
		acc[idx++] = 'm';
}

static char type_to_char(short type)
{
	if (type == DEV_ALL)
		return 'a';
	if (type == DEV_CHAR)
		return 'c';
	if (type == DEV_BLOCK)
		return 'b';
	return 'X';
}

static void set_majmin(char *str, unsigned m)
{
	if (m == ~0)
		strcpy(str, "*");
	else
		sprintf(str, "%u", m);
}

static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
			      struct seq_file *m)
{
	struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
	struct dev_exception_item *ex;
	char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];

	rcu_read_lock();
	/*
	 * To preserve the compatibility:
	 * - Only show the "all devices" when the default policy is to allow
	 * - List the exceptions in case the default policy is to deny
	 * This way, the file remains as a "whitelist of devices"
	 */
	if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) {
		set_access(acc, ACC_MASK);
		set_majmin(maj, ~0);
		set_majmin(min, ~0);
		seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL),
			   maj, min, acc);
	} else {
		list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) {
			set_access(acc, ex->access);
			set_majmin(maj, ex->major);
			set_majmin(min, ex->minor);
			seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type),
				   maj, min, acc);
		}
	}
	rcu_read_unlock();

	return 0;
}
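
/*
 * Example "devices.list" output produced above: a cgroup with the
 * default-allow behavior shows the single line "a *:* rwm", while a
 * default-deny cgroup lists its exceptions one per line, e.g.
 * "c 1:3 mr" or "b 8:* rwm".
 */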

/**
 * may_access - verifies if a new exception is part of what is allowed
 *		by a dev cgroup based on the default policy +
 *		exceptions. This is used to make sure a child cgroup
 *		won't have more privileges than its parent or to
 *		verify if a certain access is allowed.
 * @dev_cgroup: dev cgroup to be tested against
 * @refex: new exception
 */
static int may_access(struct dev_cgroup *dev_cgroup,
		      struct dev_exception_item *refex)
{
	struct dev_exception_item *ex;
	bool match = false;

	list_for_each_entry(ex, &dev_cgroup->exceptions, list) {
		if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
			continue;
		if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR))
			continue;
		if (ex->major != ~0 && ex->major != refex->major)
			continue;
		if (ex->minor != ~0 && ex->minor != refex->minor)
			continue;
		if (refex->access & (~ex->access))
			continue;
		match = true;
		break;
	}

	/*
	 * In two cases we'll consider this new exception valid:
	 * - the dev cgroup has its default policy to allow + exception list:
	 *   the new exception should *not* match any of the exceptions
	 *   (behavior == DEVCG_DEFAULT_ALLOW, !match)
	 * - the dev cgroup has its default policy to deny + exception list:
	 *   the new exception *should* match the exceptions
	 *   (behavior == DEVCG_DEFAULT_DENY, match)
	 */
	if ((dev_cgroup->behavior == DEVCG_DEFAULT_DENY) == match)
		return 1;
	return 0;
}

/*
 * parent_has_perm:
 * when adding a new allow rule to a device exception list, the rule
 * must be allowed in the parent device
 */
static int parent_has_perm(struct dev_cgroup *childcg,
			   struct dev_exception_item *ex)
{
	struct cgroup *pcg = childcg->css.cgroup->parent;
	struct dev_cgroup *parent;

	if (!pcg)
		return 1;
	parent = cgroup_to_devcgroup(pcg);
	return may_access(parent, ex);
}

/**
 * may_allow_all - checks if it's possible to change the behavior to
 *		   allow based on parent's rules.
 * @parent: device cgroup's parent
 * returns: != 0 in case it's allowed, 0 otherwise
 */
static inline int may_allow_all(struct dev_cgroup *parent)
{
	return parent->behavior == DEVCG_DEFAULT_ALLOW;
}

/*
 * Modify the exception list using allow/deny rules.
 * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD
 * so we can give a container CAP_MKNOD to let it create devices but not
 * modify the exception list.
 * It seems likely we'll want to add a CAP_CONTAINER capability to allow
 * us to also grant CAP_SYS_ADMIN to containers without giving away the
 * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN
 *
 * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting
 * new access is only allowed if you're in the top-level cgroup, or your
 * parent cgroup has the access you're asking for.
 */
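/*
 * Rule strings accepted below have the form "<type> <major>:<minor> <access>",
 * e.g. "c 1:3 mr" or "b 8:* rwm", with "a" standing for all devices.  For
 * instance (assuming the devices hierarchy is mounted in the usual place):
 *
 *	echo "c 1:3 mr" > /sys/fs/cgroup/devices/<cgroup>/devices.allow
 */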
static int devcgroup_update_access(struct dev_cgroup *devcgroup,
				   int filetype, const char *buffer)
{
	const char *b;
	char temp[12];		/* 11 + 1 characters needed for a u32 */
	int count, rc;
	struct dev_exception_item ex;
	struct cgroup *p = devcgroup->css.cgroup;
	struct dev_cgroup *parent = cgroup_to_devcgroup(p->parent);

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	memset(&ex, 0, sizeof(ex));
	b = buffer;

	switch (*b) {
	case 'a':
		switch (filetype) {
		case DEVCG_ALLOW:
			if (!may_allow_all(parent))
				return -EPERM;
			dev_exception_clean(devcgroup);
			rc = dev_exceptions_copy(&devcgroup->exceptions,
						 &parent->exceptions);
			if (rc)
				return rc;
			devcgroup->behavior = DEVCG_DEFAULT_ALLOW;
			break;
		case DEVCG_DENY:
			dev_exception_clean(devcgroup);
			devcgroup->behavior = DEVCG_DEFAULT_DENY;
			break;
		default:
			return -EINVAL;
		}
		return 0;
	case 'b':
		ex.type = DEV_BLOCK;
		break;
	case 'c':
		ex.type = DEV_CHAR;
		break;
	default:
		return -EINVAL;
	}
	b++;
	if (!isspace(*b))
		return -EINVAL;
	b++;
	if (*b == '*') {
		ex.major = ~0;
		b++;
	} else if (isdigit(*b)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *b;
			b++;
			if (!isdigit(*b))
				break;
		}
		rc = kstrtou32(temp, 10, &ex.major);
		if (rc)
			return -EINVAL;
	} else {
		return -EINVAL;
	}
	if (*b != ':')
		return -EINVAL;
	b++;

	/* read minor */
	if (*b == '*') {
		ex.minor = ~0;
		b++;
	} else if (isdigit(*b)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *b;
			b++;
			if (!isdigit(*b))
				break;
		}
		rc = kstrtou32(temp, 10, &ex.minor);
		if (rc)
			return -EINVAL;
	} else {
		return -EINVAL;
	}
	if (!isspace(*b))
		return -EINVAL;
	for (b++, count = 0; count < 3; count++, b++) {
		switch (*b) {
		case 'r':
			ex.access |= ACC_READ;
			break;
		case 'w':
			ex.access |= ACC_WRITE;
			break;
		case 'm':
			ex.access |= ACC_MKNOD;
			break;
		case '\n':
		case '\0':
			count = 3;
			break;
		default:
			return -EINVAL;
		}
	}

	switch (filetype) {
	case DEVCG_ALLOW:
		if (!parent_has_perm(devcgroup, &ex))
			return -EPERM;
		/*
		 * If the default policy is to allow by default, try to remove
		 * a matching exception instead. And be silent about it: we
		 * don't want to break compatibility
		 */
		if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) {
			dev_exception_rm(devcgroup, &ex);
			return 0;
		}
		return dev_exception_add(devcgroup, &ex);
	case DEVCG_DENY:
		/*
		 * If the default policy is to deny by default, try to remove
		 * a matching exception instead. And be silent about it: we
		 * don't want to break compatibility
		 */
		if (devcgroup->behavior == DEVCG_DEFAULT_DENY) {
			dev_exception_rm(devcgroup, &ex);
			return 0;
		}
		return dev_exception_add(devcgroup, &ex);
	default:
		return -EINVAL;
	}
	return 0;
}
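
/*
 * Note the compatibility quirk above: in a default-allow cgroup a write
 * to devices.allow silently removes a matching deny exception instead
 * of recording a new one, and in a default-deny cgroup a write to
 * devices.deny does the same with a matching allow exception.  Only in
 * the opposite combinations is an exception actually added.
 */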

static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft,
				  const char *buffer)
{
	int retval;

	mutex_lock(&devcgroup_mutex);
	retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp),
					 cft->private, buffer);
	mutex_unlock(&devcgroup_mutex);
	return retval;
}

static struct cftype dev_cgroup_files[] = {
	{
		.name = "allow",
		.write_string = devcgroup_access_write,
		.private = DEVCG_ALLOW,
	},
	{
		.name = "deny",
		.write_string = devcgroup_access_write,
		.private = DEVCG_DENY,
	},
	{
		.name = "list",
		.read_seq_string = devcgroup_seq_read,
		.private = DEVCG_LIST,
	},
	{ }	/* terminate */
};

struct cgroup_subsys devices_subsys = {
	.name = "devices",
	.can_attach = devcgroup_can_attach,
	.create = devcgroup_create,
	.destroy = devcgroup_destroy,
	.subsys_id = devices_subsys_id,
	.base_cftypes = dev_cgroup_files,

	/*
	 * While devices cgroup has the rudimentary hierarchy support which
	 * checks the parent's restriction, it doesn't properly propagate
	 * config changes in ancestors to their descendants. A child
	 * should only be allowed to add more restrictions to the parent's
	 * configuration. Fix it and remove the following.
	 */
	.broken_hierarchy = true,
};

/**
 * __devcgroup_check_permission - checks if an inode operation is permitted
 * @type: device type
 * @major: device major number
 * @minor: device minor number
 * @access: combination of ACC_WRITE, ACC_READ and ACC_MKNOD
 *
 * returns 0 on success, -EPERM in case the operation is not permitted
 */
static int __devcgroup_check_permission(short type, u32 major, u32 minor,
					short access)
{
	struct dev_cgroup *dev_cgroup;
	struct dev_exception_item ex;
	int rc;

	memset(&ex, 0, sizeof(ex));
	ex.type = type;
	ex.major = major;
	ex.minor = minor;
	ex.access = access;

	rcu_read_lock();
	dev_cgroup = task_devcgroup(current);
	rc = may_access(dev_cgroup, &ex);
	rcu_read_unlock();

	if (!rc)
		return -EPERM;

	return 0;
}

int __devcgroup_inode_permission(struct inode *inode, int mask)
{
	short type, access = 0;

	if (S_ISBLK(inode->i_mode))
		type = DEV_BLOCK;
	if (S_ISCHR(inode->i_mode))
		type = DEV_CHAR;
	if (mask & MAY_WRITE)
		access |= ACC_WRITE;
	if (mask & MAY_READ)
		access |= ACC_READ;

	return __devcgroup_check_permission(type, imajor(inode), iminor(inode),
					    access);
}

int devcgroup_inode_mknod(int mode, dev_t dev)
{
	short type;

	if (!S_ISBLK(mode) && !S_ISCHR(mode))
		return 0;

	if (S_ISBLK(mode))
		type = DEV_BLOCK;
	else
		type = DEV_CHAR;

	return __devcgroup_check_permission(type, MAJOR(dev), MINOR(dev),
					    ACC_MKNOD);
}
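
/*
 * Worked example (illustrative): with the default-deny behavior and a
 * single exception "c 1:3 mr", __devcgroup_check_permission(DEV_CHAR, 1,
 * 3, ACC_READ) succeeds, while the same call with ACC_WRITE, or any
 * ACC_MKNOD check for a block device, fails with -EPERM.  The two
 * exported entry points above are intended to be called from the VFS
 * when device nodes are opened or created.
 */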