/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk-cgroup.h"
#include <linux/genhd.h>

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct cgroup_taskset *);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
			   struct cgroup_taskset *);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

/* for encoding cft->private value on file */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
/* What policy owns the file, proportional or throttle */
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
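/*
 * Example (illustrative): the "weight_device" file of the proportional
 * weight policy stores BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
 * BLKIO_PROP_weight_device) in cft->private; BLKIOFILE_POLICY() later
 * recovers BLKIO_POLICY_PROP from the upper 16 bits and BLKIOFILE_ATTR()
 * recovers BLKIO_PROP_weight_device from the lower 16 bits.
 */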
struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.attach = blkiocg_attach,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
	.subsys_id = blkio_subsys_id,
#endif
	.use_id = 1,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
					    struct blkio_policy_node *pn)
{
	list_add(&pn->node, &blkcg->policy_list);
}

static inline bool cftype_blkg_same_policy(struct cftype *cft,
					   struct blkio_group *blkg)
{
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);

	if (blkg->plid == plid)
		return 1;

	return 0;
}

/* Determines if policy node matches cgroup file being accessed */
static inline bool pn_matches_cftype(struct cftype *cft,
				     struct blkio_policy_node *pn)
{
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);

	return (plid == pn->plid && fileid == pn->fileid);
}

/* Must be called with blkcg->lock held */
static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
{
	list_del(&pn->node);
}

/* Must be called with blkcg->lock held */
static struct blkio_policy_node *
blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
			 enum blkio_policy_id plid, int fileid)
{
	struct blkio_policy_node *pn;

	list_for_each_entry(pn, &blkcg->policy_list, node) {
		if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
			return pn;
	}

	return NULL;
}

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(task_blkio_cgroup);

static inline void
blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {
		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;
		if (blkiop->ops.blkio_update_group_weight_fn)
			blkiop->ops.blkio_update_group_weight_fn(blkg->key,
							blkg, weight);
	}
}

static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
					  int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;

		if (fileid == BLKIO_THROTL_read_bps_device
		    && blkiop->ops.blkio_update_group_read_bps_fn)
			blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
								blkg, bps);

		if (fileid == BLKIO_THROTL_write_bps_device
		    && blkiop->ops.blkio_update_group_write_bps_fn)
			blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
								blkg, bps);
	}
}

static inline void blkio_update_group_iops(struct blkio_group *blkg,
					   unsigned int iops, int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;

		if (fileid == BLKIO_THROTL_read_iops_device
		    && blkiop->ops.blkio_update_group_read_iops_fn)
			blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
								blkg, iops);

		if (fileid == BLKIO_THROTL_write_iops_device
		    && blkiop->ops.blkio_update_group_write_iops_fn)
			blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
								blkg, iops);
	}
}

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
			   bool sync)
{
	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
}

/*
 * Decrements the appropriate stat variable if non-zero depending on the
 * request type. Panics on value being zero.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
	if (direction) {
		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
		stat[BLKIO_STAT_WRITE]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_READ] == 0);
		stat[BLKIO_STAT_READ]--;
	}
	if (sync) {
		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
		stat[BLKIO_STAT_SYNC]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
		stat[BLKIO_STAT_ASYNC]--;
	}
}
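/*
 * Note: the helpers above account every request on two independent axes,
 * direction (BLKIO_STAT_READ vs BLKIO_STAT_WRITE) and synchronicity
 * (BLKIO_STAT_SYNC vs BLKIO_STAT_ASYNC).  The "Total" value exported to
 * user space is derived later as Read + Write (see blkio_get_stat()).
 */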
#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the blkg->stats_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					    struct blkio_group *curr_blkg)
{
	if (blkio_blkg_waiting(&blkg->stats))
		return;
	if (blkg == curr_blkg)
		return;
	blkg->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&blkg->stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		stats->empty_time += now - stats->start_empty_time;
	blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	BUG_ON(blkio_blkg_idling(&blkg->stats));
	blkg->stats.start_idle_time = sched_clock();
	blkio_mark_blkg_idling(&blkg->stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	unsigned long long now;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (blkio_blkg_idling(stats)) {
		now = sched_clock();
		if (time_after64(now, stats->start_idle_time))
			stats->idle_time += now - stats->start_idle_time;
		blkio_clear_blkg_idling(stats);
	}
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->avg_queue_size_sum +=
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
	stats->avg_queue_size_samples++;
	blkio_update_group_wait_time(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
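/*
 * The average queue size itself is not stored anywhere; it is computed at
 * read time as avg_queue_size_sum / avg_queue_size_samples (see the
 * BLKIO_STAT_AVG_QUEUE_SIZE handling in blkio_get_stat()).
 */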
void blkiocg_set_start_empty_time(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;

	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
	    stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	/*
	 * The group is already marked empty. This can happen if a cfqq
	 * received a new request in the parent group and moved to this group
	 * while being added to the service tree. Just ignore the event and
	 * move on.
	 */
	if (blkio_blkg_empty(stats)) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
				  unsigned long dequeue)
{
	blkg->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_group *curr_blkg) {}
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
#endif

void blkiocg_update_io_add_stats(struct blkio_group *blkg,
				 struct blkio_group *curr_blkg, bool direction,
				 bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
			sync);
	blkio_end_empty_time(&blkg->stats);
	blkio_set_start_group_wait_time(blkg, curr_blkg);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
				    bool direction, bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
					direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
				   unsigned long unaccounted_time)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkg->stats.time += time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg->stats.unaccounted_time += unaccounted_time;
#endif
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
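/*
 * The per-cpu stats updated below are protected by a u64_stats_sync
 * sequence counter instead of blkg->stats_lock: writers run with interrupts
 * disabled and bracket the update with u64_stats_update_begin()/end(),
 * while readers retry with u64_stats_fetch_begin()/retry() (see
 * blkio_read_stat_cpu()).  On 64-bit the sequence counter is a no-op.
 */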
/*
 * Should be called under the rcu read lock or the queue lock to make sure
 * the blkg pointer is valid.
 */
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				   uint64_t bytes, bool direction, bool sync)
{
	struct blkio_group_stats_cpu *stats_cpu;
	unsigned long flags;

	/*
	 * Disabling interrupts provides mutual exclusion between two writes
	 * on the same CPU. It is probably not needed on 64-bit; that case is
	 * not optimized yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(blkg->stats_cpu);

	u64_stats_update_begin(&stats_cpu->syncp);
	stats_cpu->sectors += bytes >> 9;
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
			1, direction, sync);
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
			bytes, direction, sync);
	u64_stats_update_end(&stats_cpu->syncp);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);

void blkiocg_update_completion_stats(struct blkio_group *blkg,
	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;
	unsigned long long now = sched_clock();

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
				now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
				io_start_time - start_time, direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

/* Merged stats are per cpu. */
void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
				    bool sync)
{
	struct blkio_group_stats_cpu *stats_cpu;
	unsigned long flags;

	/*
	 * Disabling interrupts provides mutual exclusion between two writes
	 * on the same CPU. It is probably not needed on 64-bit; that case is
	 * not optimized yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(blkg->stats_cpu);

	u64_stats_update_begin(&stats_cpu->syncp);
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
			direction, sync);
	u64_stats_update_end(&stats_cpu->syncp);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

/*
 * This function allocates the per-cpu stats for a blkio_group. Must be
 * called from sleepable context, as alloc_percpu() requires that.
 */
int blkio_alloc_blkg_stats(struct blkio_group *blkg)
{
	/* Allocate memory for per cpu stats */
	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	if (!blkg->stats_cpu)
		return -ENOMEM;
	return 0;
}
EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);

void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
			     struct blkio_group *blkg, void *key, dev_t dev,
			     enum blkio_policy_id plid)
{
	unsigned long flags;

	spin_lock_irqsave(&blkcg->lock, flags);
	spin_lock_init(&blkg->stats_lock);
	rcu_assign_pointer(blkg->key, key);
	blkg->blkcg_id = css_id(&blkcg->css);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	blkg->plid = plid;
	spin_unlock_irqrestore(&blkcg->lock, flags);
	/* Need to take a css reference? */
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
	blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
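/*
 * Illustrative sketch (identifiers with a "foo" prefix are placeholders,
 * not part of this file) of how a policy is expected to use the two exports
 * above when it sets up a blkio_group:
 *
 *	if (blkio_alloc_blkg_stats(&foog->blkg))	(may sleep)
 *		goto err;
 *	blkiocg_add_blkio_group(blkcg, &foog->blkg, foo_key, dev,
 *				BLKIO_POLICY_PROP);
 *
 * The opaque key passed here is what blkiocg_lookup_group() later compares
 * against, and what is handed back to blkio_unlink_group_fn() on removal.
 */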
static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	hlist_del_init_rcu(&blkg->blkcg_node);
	blkg->blkcg_id = 0;
}

/*
 * Returns 0 if the blkio_group was still on the cgroup list; otherwise
 * returns 1, indicating that the blkio_group had already been unhashed by
 * the time we got to it.
 */
int blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	struct blkio_cgroup *blkcg;
	unsigned long flags;
	struct cgroup_subsys_state *css;
	int ret = 1;

	rcu_read_lock();
	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
	if (css) {
		blkcg = container_of(css, struct blkio_cgroup, css);
		spin_lock_irqsave(&blkcg->lock, flags);
		if (!hlist_unhashed(&blkg->blkcg_node)) {
			__blkiocg_del_blkio_group(blkg);
			ret = 0;
		}
		spin_unlock_irqrestore(&blkcg->lock, flags);
	}

	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);

/* called under rcu_read_lock(). */
struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	void *__key;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
		__key = blkg->key;
		if (__key == key)
			return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkiocg_lookup_group);

static void blkio_reset_stats_cpu(struct blkio_group *blkg)
{
	struct blkio_group_stats_cpu *stats_cpu;
	int i, j, k;
	/*
	 * Note: on a 64-bit arch this is not an issue. On a 32-bit arch this
	 * may return an inconsistent value, since a 64-bit update on 32 bits
	 * is not atomic. Handling that corner case would complicate the code
	 * considerably (sending IPIs to CPUs, dealing with stats of offline
	 * CPUs, etc.).
	 *
	 * Resetting stats is more of a debug feature anyway and this is a
	 * corner case, so the code is not being complicated until this
	 * becomes a real issue.
	 */
	for_each_possible_cpu(i) {
		stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
		stats_cpu->sectors = 0;
		for (j = 0; j < BLKIO_STAT_CPU_NR; j++)
			for (k = 0; k < BLKIO_STAT_TOTAL; k++)
				stats_cpu->stat_arr_cpu[j][k] = 0;
	}
}

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct blkio_group_stats *stats;
	struct hlist_node *n;
	uint64_t queued[BLKIO_STAT_TOTAL];
	int i;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	bool idling, waiting, empty;
	unsigned long long now = sched_clock();
#endif

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		spin_lock(&blkg->stats_lock);
		stats = &blkg->stats;
#ifdef CONFIG_DEBUG_BLK_CGROUP
		idling = blkio_blkg_idling(stats);
		waiting = blkio_blkg_waiting(stats);
		empty = blkio_blkg_empty(stats);
#endif
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
		memset(stats, 0, sizeof(struct blkio_group_stats));
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
#ifdef CONFIG_DEBUG_BLK_CGROUP
		if (idling) {
			blkio_mark_blkg_idling(stats);
			stats->start_idle_time = now;
		}
		if (waiting) {
			blkio_mark_blkg_waiting(stats);
			stats->start_group_wait_time = now;
		}
		if (empty) {
			blkio_mark_blkg_empty(stats);
			stats->start_empty_time = now;
		}
#endif
		spin_unlock(&blkg->stats_lock);

		/* Reset Per cpu stats which don't take blkg->stats_lock */
		blkio_reset_stats_cpu(blkg);
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}
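/*
 * The helpers below build the textual keys used in the map-style stat
 * files. For a disk 8:16, for example, blkio_get_key_name() produces
 * "8:16" in diskname_only mode and "8:16 Read", "8:16 Write", "8:16 Sync",
 * "8:16 Async" or "8:16 Total" otherwise.
 */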
static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
			       int chars_left, bool diskname_only)
{
	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
	chars_left -= strlen(str);
	if (chars_left <= 0) {
		printk(KERN_WARNING
			"Possibly incorrect cgroup stat display format");
		return;
	}
	if (diskname_only)
		return;
	switch (type) {
	case BLKIO_STAT_READ:
		strlcat(str, " Read", chars_left);
		break;
	case BLKIO_STAT_WRITE:
		strlcat(str, " Write", chars_left);
		break;
	case BLKIO_STAT_SYNC:
		strlcat(str, " Sync", chars_left);
		break;
	case BLKIO_STAT_ASYNC:
		strlcat(str, " Async", chars_left);
		break;
	case BLKIO_STAT_TOTAL:
		strlcat(str, " Total", chars_left);
		break;
	default:
		strlcat(str, " Invalid", chars_left);
	}
}

static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
				struct cgroup_map_cb *cb, dev_t dev)
{
	blkio_get_key_name(0, dev, str, chars_left, true);
	cb->fill(cb, str, val);
	return val;
}


static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
			enum stat_type_cpu type, enum stat_sub_type sub_type)
{
	int cpu;
	struct blkio_group_stats_cpu *stats_cpu;
	u64 val = 0, tval;

	for_each_possible_cpu(cpu) {
		unsigned int start;
		stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);

		do {
			start = u64_stats_fetch_begin(&stats_cpu->syncp);
			if (type == BLKIO_STAT_CPU_SECTORS)
				tval = stats_cpu->sectors;
			else
				tval = stats_cpu->stat_arr_cpu[type][sub_type];
		} while (u64_stats_fetch_retry(&stats_cpu->syncp, start));

		val += tval;
	}

	return val;
}

static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
		struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
{
	uint64_t disk_total, val;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_CPU_SECTORS) {
		val = blkio_read_stat_cpu(blkg, type, 0);
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
	}

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
	     sub_type++) {
		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
		val = blkio_read_stat_cpu(blkg, type, sub_type);
		cb->fill(cb, key_str, val);
	}

	disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
			blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);

	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}
/* This should be called with blkg->stats_lock held */
static uint64_t blkio_get_stat(struct blkio_group *blkg,
		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
{
	uint64_t disk_total;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.time, cb, dev);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.unaccounted_time, cb, dev);
	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
		uint64_t sum = blkg->stats.avg_queue_size_sum;
		uint64_t samples = blkg->stats.avg_queue_size_samples;
		if (samples)
			do_div(sum, samples);
		else
			sum = 0;
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
	}
	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.group_wait_time, cb, dev);
	if (type == BLKIO_STAT_IDLE_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.idle_time, cb, dev);
	if (type == BLKIO_STAT_EMPTY_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.empty_time, cb, dev);
	if (type == BLKIO_STAT_DEQUEUE)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.dequeue, cb, dev);
#endif

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
	     sub_type++) {
		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
	}
	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}
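/*
 * Per-device rule files (weight_device and the throttle limits) accept a
 * single "<major>:<minor> <value>" pair per write, e.g. "8:16 1048576".
 * A value of 0 removes the rule again (see blkio_delete_rule_command()),
 * which is why device presence is only verified for non-zero values below.
 */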
static int blkio_policy_parse_and_set(char *buf,
	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
{
	struct gendisk *disk = NULL;
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	unsigned long major, minor;
	int i = 0, ret = -EINVAL;
	int part;
	dev_t dev;
	u64 temp;

	memset(s, 0, sizeof(s));

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Prevent input of too many fields */
		if (i == 3)
			break;
	}

	if (i != 2)
		goto out;

	p = strsep(&s[0], ":");
	if (p != NULL)
		major_s = p;
	else
		goto out;

	minor_s = s[0];
	if (!minor_s)
		goto out;

	if (strict_strtoul(major_s, 10, &major))
		goto out;

	if (strict_strtoul(minor_s, 10, &minor))
		goto out;

	dev = MKDEV(major, minor);

	if (strict_strtoull(s[1], 10, &temp))
		goto out;

	/* For rule removal, do not check for device presence. */
	if (temp) {
		disk = get_gendisk(dev, &part);
		if (!disk || part) {
			ret = -ENODEV;
			goto out;
		}
	}

	newpn->dev = dev;

	switch (plid) {
	case BLKIO_POLICY_PROP:
		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
		     temp > BLKIO_WEIGHT_MAX)
			goto out;

		newpn->plid = plid;
		newpn->fileid = fileid;
		newpn->val.weight = temp;
		break;
	case BLKIO_POLICY_THROTL:
		switch (fileid) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
			newpn->plid = plid;
			newpn->fileid = fileid;
			newpn->val.bps = temp;
			break;
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			if (temp > THROTL_IOPS_MAX)
				goto out;

			newpn->plid = plid;
			newpn->fileid = fileid;
			newpn->val.iops = (unsigned int)temp;
			break;
		}
		break;
	default:
		BUG();
	}
	ret = 0;
out:
	put_disk(disk);
	return ret;
}

unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
			      dev_t dev)
{
	struct blkio_policy_node *pn;
	unsigned long flags;
	unsigned int weight;

	spin_lock_irqsave(&blkcg->lock, flags);

	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
				BLKIO_PROP_weight_device);
	if (pn)
		weight = pn->val.weight;
	else
		weight = blkcg->weight;

	spin_unlock_irqrestore(&blkcg->lock, flags);

	return weight;
}
EXPORT_SYMBOL_GPL(blkcg_get_weight);

uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;
	unsigned long flags;
	uint64_t bps = -1;

	spin_lock_irqsave(&blkcg->lock, flags);
	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_bps_device);
	if (pn)
		bps = pn->val.bps;
	spin_unlock_irqrestore(&blkcg->lock, flags);

	return bps;
}

uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;
	unsigned long flags;
	uint64_t bps = -1;

	spin_lock_irqsave(&blkcg->lock, flags);
	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_bps_device);
	if (pn)
		bps = pn->val.bps;
	spin_unlock_irqrestore(&blkcg->lock, flags);

	return bps;
}

unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;
	unsigned long flags;
	unsigned int iops = -1;

	spin_lock_irqsave(&blkcg->lock, flags);
	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_iops_device);
	if (pn)
		iops = pn->val.iops;
	spin_unlock_irqrestore(&blkcg->lock, flags);

	return iops;
}

unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;
	unsigned long flags;
	unsigned int iops = -1;

	spin_lock_irqsave(&blkcg->lock, flags);
	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_iops_device);
	if (pn)
		iops = pn->val.iops;
	spin_unlock_irqrestore(&blkcg->lock, flags);

	return iops;
}
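/*
 * The lookup helpers above fall back to "no per-device rule": weight falls
 * back to the cgroup-wide blkcg->weight, while the bps/iops getters return
 * (unsigned)-1, the same "no limit" default used by
 * blkio_update_blkg_policy() below.
 */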
/* Checks whether user asked for deleting a policy rule */
static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
{
	switch (pn->plid) {
	case BLKIO_POLICY_PROP:
		if (pn->val.weight == 0)
			return 1;
		break;
	case BLKIO_POLICY_THROTL:
		switch (pn->fileid) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
			if (pn->val.bps == 0)
				return 1;
			break;
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			if (pn->val.iops == 0)
				return 1;
		}
		break;
	default:
		BUG();
	}

	return 0;
}

static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
				     struct blkio_policy_node *newpn)
{
	switch (oldpn->plid) {
	case BLKIO_POLICY_PROP:
		oldpn->val.weight = newpn->val.weight;
		break;
	case BLKIO_POLICY_THROTL:
		switch (newpn->fileid) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
			oldpn->val.bps = newpn->val.bps;
			break;
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			oldpn->val.iops = newpn->val.iops;
		}
		break;
	default:
		BUG();
	}
}

/*
 * Some rules/values in blkg have changed. Propagate those to respective
 * policies.
 */
static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
		struct blkio_group *blkg, struct blkio_policy_node *pn)
{
	unsigned int weight, iops;
	u64 bps;

	switch (pn->plid) {
	case BLKIO_POLICY_PROP:
		weight = pn->val.weight ? pn->val.weight :
				blkcg->weight;
		blkio_update_group_weight(blkg, weight);
		break;
	case BLKIO_POLICY_THROTL:
		switch (pn->fileid) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
			bps = pn->val.bps ? pn->val.bps : (-1);
			blkio_update_group_bps(blkg, bps, pn->fileid);
			break;
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			iops = pn->val.iops ? pn->val.iops : (-1);
			blkio_update_group_iops(blkg, iops, pn->fileid);
			break;
		}
		break;
	default:
		BUG();
	}
}
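/*
 * Lock ordering note: whenever a rule update has to be pushed down to the
 * registered policies, blkio_list_lock is taken first and the irq-safe
 * blkcg->lock is nested inside it, both in blkio_update_policy_node_blkg()
 * below and in blkio_weight_write().
 */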
/*
 * A policy node rule has been updated. Propagate the update to all the
 * block groups that might be affected by it.
 */
static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
					  struct blkio_policy_node *pn)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		if (pn->dev != blkg->dev || pn->plid != blkg->plid)
			continue;
		blkio_update_blkg_policy(blkcg, blkg, pn);
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
}

static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
			      const char *buffer)
{
	int ret = 0;
	char *buf;
	struct blkio_policy_node *newpn, *pn;
	struct blkio_cgroup *blkcg;
	int keep_newpn = 0;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);

	buf = kstrdup(buffer, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
	if (!newpn) {
		ret = -ENOMEM;
		goto free_buf;
	}

	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
	if (ret)
		goto free_newpn;

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	spin_lock_irq(&blkcg->lock);

	pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
	if (!pn) {
		if (!blkio_delete_rule_command(newpn)) {
			blkio_policy_insert_node(blkcg, newpn);
			keep_newpn = 1;
		}
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}

	if (blkio_delete_rule_command(newpn)) {
		blkio_policy_delete_node(pn);
		kfree(pn);
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}
	spin_unlock_irq(&blkcg->lock);

	blkio_update_policy_rule(pn, newpn);

update_io_group:
	blkio_update_policy_node_blkg(blkcg, newpn);

free_newpn:
	if (!keep_newpn)
		kfree(newpn);
free_buf:
	kfree(buf);
	return ret;
}

static void
blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
{
	switch (pn->plid) {
	case BLKIO_POLICY_PROP:
		if (pn->fileid == BLKIO_PROP_weight_device)
			seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
				MINOR(pn->dev), pn->val.weight);
		break;
	case BLKIO_POLICY_THROTL:
		switch (pn->fileid) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
			seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
				MINOR(pn->dev), pn->val.bps);
			break;
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
				MINOR(pn->dev), pn->val.iops);
			break;
		}
		break;
	default:
		BUG();
	}
}
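/*
 * Example output of blkio_print_policy_node() for a proportional-weight
 * rule on device 8:16 with weight 500:
 *
 *	8:16	500
 */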
/* cgroup files which read their data from policy nodes end up here */
static void blkio_read_policy_node_files(struct cftype *cft,
			struct blkio_cgroup *blkcg, struct seq_file *m)
{
	struct blkio_policy_node *pn;

	if (!list_empty(&blkcg->policy_list)) {
		spin_lock_irq(&blkcg->lock);
		list_for_each_entry(pn, &blkcg->policy_list, node) {
			if (!pn_matches_cftype(cft, pn))
				continue;
			blkio_print_policy_node(m, pn);
		}
		spin_unlock_irq(&blkcg->lock);
	}
}

static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
			     struct seq_file *m)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight_device:
			blkio_read_policy_node_files(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch (name) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			blkio_read_policy_node_files(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}

static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
		struct cftype *cft, struct cgroup_map_cb *cb,
		enum stat_type type, bool show_total, bool pcpu)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	uint64_t cgroup_total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
		if (blkg->dev) {
			if (!cftype_blkg_same_policy(cft, blkg))
				continue;
			if (pcpu)
				cgroup_total += blkio_get_stat_cpu(blkg, cb,
						blkg->dev, type);
			else {
				spin_lock_irq(&blkg->stats_lock);
				cgroup_total += blkio_get_stat(blkg, cb,
						blkg->dev, type);
				spin_unlock_irq(&blkg->stats_lock);
			}
		}
	}
	if (show_total)
		cb->fill(cb, "Total", cgroup_total);
	rcu_read_unlock();
	return 0;
}
/* All map-type cgroup files are serviced by this function */
static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_TIME, 0, 0);
		case BLKIO_PROP_sectors:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SECTORS, 0, 1);
		case BLKIO_PROP_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_PROP_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		case BLKIO_PROP_io_service_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_SERVICE_TIME, 1, 0);
		case BLKIO_PROP_io_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_WAIT_TIME, 1, 0);
		case BLKIO_PROP_io_merged:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_MERGED, 1, 1);
		case BLKIO_PROP_io_queued:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_QUEUED, 1, 0);
#ifdef CONFIG_DEBUG_BLK_CGROUP
		case BLKIO_PROP_unaccounted_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
		case BLKIO_PROP_dequeue:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_DEQUEUE, 0, 0);
		case BLKIO_PROP_avg_queue_size:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
		case BLKIO_PROP_group_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
		case BLKIO_PROP_idle_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_IDLE_TIME, 0, 0);
		case BLKIO_PROP_empty_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_EMPTY_TIME, 0, 0);
#endif
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch (name) {
		case BLKIO_THROTL_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_THROTL_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}

static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	struct blkio_policy_node *pn;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		pn = blkio_policy_search_node(blkcg, blkg->dev,
				BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
		if (pn)
			continue;

		blkio_update_group_weight(blkg, blkcg->weight);
	}
	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight:
			return (u64)blkcg->weight;
		}
		break;
	default:
		BUG();
	}
	return 0;
}

static int
blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight:
			return blkio_weight_write(blkcg, val);
		}
		break;
	default:
		BUG();
	}

	return 0;
}
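/*
 * The table below is registered by blkiocg_populate(); each entry is
 * expected to show up in the cgroup filesystem prefixed with the subsystem
 * name, e.g. "blkio.weight", "blkio.weight_device" and, with
 * CONFIG_BLK_DEV_THROTTLING, "blkio.throttle.read_bps_device".
 */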
struct cftype blkio_files[] = {
	{
		.name = "weight_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "weight",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight),
		.read_u64 = blkiocg_file_read_u64,
		.write_u64 = blkiocg_file_write_u64,
	},
	{
		.name = "time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "sectors",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_sectors),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_merged",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_merged),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_queued",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_queued),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_BLK_DEV_THROTTLING
	{
		.name = "throttle.read_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.write_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.read_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.write_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "throttle.io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
#endif /* CONFIG_BLK_DEV_THROTTLING */

#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "avg_queue_size",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_avg_queue_size),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "group_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_group_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "idle_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_idle_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "empty_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_empty_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "dequeue",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_dequeue),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "unaccounted_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_unaccounted_time),
		.read_map = blkiocg_file_read_map,
	},
#endif
};

static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	return cgroup_add_files(cgroup, subsys, blkio_files,
				ARRAY_SIZE(blkio_files));
}
static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	unsigned long flags;
	struct blkio_group *blkg;
	void *key;
	struct blkio_policy_type *blkiop;
	struct blkio_policy_node *pn, *pntmp;

	rcu_read_lock();
	do {
		spin_lock_irqsave(&blkcg->lock, flags);

		if (hlist_empty(&blkcg->blkg_list)) {
			spin_unlock_irqrestore(&blkcg->lock, flags);
			break;
		}

		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
					blkcg_node);
		key = rcu_dereference(blkg->key);
		__blkiocg_del_blkio_group(blkg);

		spin_unlock_irqrestore(&blkcg->lock, flags);

		/*
		 * This blkio_group is being unlinked as the associated cgroup
		 * is going away. Let all the IO controlling policies know
		 * about this event.
		 */
		spin_lock(&blkio_list_lock);
		list_for_each_entry(blkiop, &blkio_list, list) {
			if (blkiop->plid != blkg->plid)
				continue;
			blkiop->ops.blkio_unlink_group_fn(key, blkg);
		}
		spin_unlock(&blkio_list_lock);
	} while (1);

	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
		blkio_policy_delete_node(pn);
		kfree(pn);
	}

	free_css_id(&blkio_subsys, &blkcg->css);
	rcu_read_unlock();
	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	INIT_LIST_HEAD(&blkcg->policy_list);
	return &blkcg->css;
}
/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures. For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			      struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			   struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;

	cgroup_taskset_for_each(task, cgrp, tset) {
		/* we don't lose anything even if ioc allocation fails */
		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
		if (ioc) {
			ioc_cgroup_changed(ioc);
			put_io_context(ioc, NULL);
		}
	}
}

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_add_tail(&blkiop->list, &blkio_list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_del_init(&blkiop->list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);

static int __init init_cgroup_blkio(void)
{
	return cgroup_load_subsys(&blkio_subsys);
}

static void __exit exit_cgroup_blkio(void)
{
	cgroup_unload_subsys(&blkio_subsys);
}

module_init(init_cgroup_blkio);
module_exit(exit_cgroup_blkio);
MODULE_LICENSE("GPL");
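/*
 * Illustrative sketch (not part of this file): an IO control policy makes
 * itself known to this framework by filling in a struct blkio_policy_type
 * and registering it, e.g.
 *
 *	static struct blkio_policy_type blkio_policy_foo = {
 *		.ops = {
 *			.blkio_unlink_group_fn = foo_unlink_blkio_group,
 *			.blkio_update_group_weight_fn = foo_update_weight,
 *		},
 *		.plid = BLKIO_POLICY_PROP,
 *	};
 *
 *	blkio_policy_register(&blkio_policy_foo);
 *
 * The ops and field names are the ones dereferenced above; the "foo"
 * identifiers are placeholders.
 */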