xref: /linux/block/blk-cgroup.c (revision f850548ef88e5ff9e40bae9e1a7140bef0653e6b)
1 /*
2  * Common Block IO controller cgroup interface
3  *
4  * Based on ideas and code from CFQ, CFS and BFQ:
5  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6  *
7  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8  *		      Paolo Valente <paolo.valente@unimore.it>
9  *
10  * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11  * 	              Nauman Rafique <nauman@google.com>
12  */
13 #include <linux/ioprio.h>
14 #include <linux/seq_file.h>
15 #include <linux/kdev_t.h>
16 #include <linux/module.h>
17 #include <linux/err.h>
18 #include <linux/blkdev.h>
19 #include <linux/slab.h>
20 #include "blk-cgroup.h"
21 #include <linux/genhd.h>
22 
23 #define MAX_KEY_LEN 100
24 
25 static DEFINE_SPINLOCK(blkio_list_lock);
26 static LIST_HEAD(blkio_list);
27 
28 struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
29 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
30 
31 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
32 						  struct cgroup *);
33 static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
34 			      struct task_struct *, bool);
35 static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
36 			   struct cgroup *, struct task_struct *, bool);
37 static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
38 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
39 
/*
 * cft->private packs two 16-bit fields into one value: the id of the
 * policy that owns the file in the upper half and the per-policy file
 * attribute id in the lower half.
 */
#define BLKIOFILE_PRIVATE(x, val)	((val) | ((x) << 16))
/* Which policy owns the file: proportional or throttle */
#define BLKIOFILE_POLICY(val)		(0xffff & ((val) >> 16))
/* Which attribute (file) within that policy */
#define BLKIOFILE_ATTR(val)		(0xffff & (val))
45 
46 struct cgroup_subsys blkio_subsys = {
47 	.name = "blkio",
48 	.create = blkiocg_create,
49 	.can_attach = blkiocg_can_attach,
50 	.attach = blkiocg_attach,
51 	.destroy = blkiocg_destroy,
52 	.populate = blkiocg_populate,
53 #ifdef CONFIG_BLK_CGROUP
54 	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
55 	.subsys_id = blkio_subsys_id,
56 #endif
57 	.use_id = 1,
58 	.module = THIS_MODULE,
59 };
60 EXPORT_SYMBOL_GPL(blkio_subsys);
61 
62 static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
63 					    struct blkio_policy_node *pn)
64 {
65 	list_add(&pn->node, &blkcg->policy_list);
66 }
67 
68 static inline bool cftype_blkg_same_policy(struct cftype *cft,
69 			struct blkio_group *blkg)
70 {
71 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
72 
73 	if (blkg->plid == plid)
74 		return 1;
75 
76 	return 0;
77 }
78 
79 /* Determines if policy node matches cgroup file being accessed */
80 static inline bool pn_matches_cftype(struct cftype *cft,
81 			struct blkio_policy_node *pn)
82 {
83 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
84 	int fileid = BLKIOFILE_ATTR(cft->private);
85 
86 	return (plid == pn->plid && fileid == pn->fileid);
87 }
88 
89 /* Must be called with blkcg->lock held */
90 static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
91 {
92 	list_del(&pn->node);
93 }
94 
95 /* Must be called with blkcg->lock held */
96 static struct blkio_policy_node *
97 blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
98 		enum blkio_policy_id plid, int fileid)
99 {
100 	struct blkio_policy_node *pn;
101 
102 	list_for_each_entry(pn, &blkcg->policy_list, node) {
103 		if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
104 			return pn;
105 	}
106 
107 	return NULL;
108 }
109 
110 struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
111 {
112 	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
113 			    struct blkio_cgroup, css);
114 }
115 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
116 
117 static inline void
118 blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
119 {
120 	struct blkio_policy_type *blkiop;
121 
122 	list_for_each_entry(blkiop, &blkio_list, list) {
123 		/* If this policy does not own the blkg, do not send updates */
124 		if (blkiop->plid != blkg->plid)
125 			continue;
126 		if (blkiop->ops.blkio_update_group_weight_fn)
127 			blkiop->ops.blkio_update_group_weight_fn(blkg->key,
128 							blkg, weight);
129 	}
130 }
131 
132 static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
133 				int fileid)
134 {
135 	struct blkio_policy_type *blkiop;
136 
137 	list_for_each_entry(blkiop, &blkio_list, list) {
138 
139 		/* If this policy does not own the blkg, do not send updates */
140 		if (blkiop->plid != blkg->plid)
141 			continue;
142 
143 		if (fileid == BLKIO_THROTL_read_bps_device
144 		    && blkiop->ops.blkio_update_group_read_bps_fn)
145 			blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
146 								blkg, bps);
147 
148 		if (fileid == BLKIO_THROTL_write_bps_device
149 		    && blkiop->ops.blkio_update_group_write_bps_fn)
150 			blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
151 								blkg, bps);
152 	}
153 }
154 
155 static inline void blkio_update_group_iops(struct blkio_group *blkg,
156 			unsigned int iops, int fileid)
157 {
158 	struct blkio_policy_type *blkiop;
159 
160 	list_for_each_entry(blkiop, &blkio_list, list) {
161 
162 		/* If this policy does not own the blkg, do not send updates */
163 		if (blkiop->plid != blkg->plid)
164 			continue;
165 
166 		if (fileid == BLKIO_THROTL_read_iops_device
167 		    && blkiop->ops.blkio_update_group_read_iops_fn)
168 			blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
169 								blkg, iops);
170 
171 		if (fileid == BLKIO_THROTL_write_iops_device
172 		    && blkiop->ops.blkio_update_group_write_iops_fn)
173 			blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
174 								blkg,iops);
175 	}
176 }
177 
178 /*
179  * Add to the appropriate stat variable depending on the request type.
180  * This should be called with the blkg->stats_lock held.
181  */
182 static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
183 				bool sync)
184 {
185 	if (direction)
186 		stat[BLKIO_STAT_WRITE] += add;
187 	else
188 		stat[BLKIO_STAT_READ] += add;
189 	if (sync)
190 		stat[BLKIO_STAT_SYNC] += add;
191 	else
192 		stat[BLKIO_STAT_ASYNC] += add;
193 }
194 
195 /*
196  * Decrements the appropriate stat variable if non-zero depending on the
197  * request type. Panics on value being zero.
198  * This should be called with the blkg->stats_lock held.
199  */
200 static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
201 {
202 	if (direction) {
203 		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
204 		stat[BLKIO_STAT_WRITE]--;
205 	} else {
206 		BUG_ON(stat[BLKIO_STAT_READ] == 0);
207 		stat[BLKIO_STAT_READ]--;
208 	}
209 	if (sync) {
210 		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
211 		stat[BLKIO_STAT_SYNC]--;
212 	} else {
213 		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
214 		stat[BLKIO_STAT_ASYNC]--;
215 	}
216 }
217 
218 #ifdef CONFIG_DEBUG_BLK_CGROUP
219 /* This should be called with the blkg->stats_lock held. */
220 static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
221 						struct blkio_group *curr_blkg)
222 {
223 	if (blkio_blkg_waiting(&blkg->stats))
224 		return;
225 	if (blkg == curr_blkg)
226 		return;
227 	blkg->stats.start_group_wait_time = sched_clock();
228 	blkio_mark_blkg_waiting(&blkg->stats);
229 }
230 
231 /* This should be called with the blkg->stats_lock held. */
232 static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
233 {
234 	unsigned long long now;
235 
236 	if (!blkio_blkg_waiting(stats))
237 		return;
238 
239 	now = sched_clock();
240 	if (time_after64(now, stats->start_group_wait_time))
241 		stats->group_wait_time += now - stats->start_group_wait_time;
242 	blkio_clear_blkg_waiting(stats);
243 }
244 
245 /* This should be called with the blkg->stats_lock held. */
246 static void blkio_end_empty_time(struct blkio_group_stats *stats)
247 {
248 	unsigned long long now;
249 
250 	if (!blkio_blkg_empty(stats))
251 		return;
252 
253 	now = sched_clock();
254 	if (time_after64(now, stats->start_empty_time))
255 		stats->empty_time += now - stats->start_empty_time;
256 	blkio_clear_blkg_empty(stats);
257 }
258 
259 void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
260 {
261 	unsigned long flags;
262 
263 	spin_lock_irqsave(&blkg->stats_lock, flags);
264 	BUG_ON(blkio_blkg_idling(&blkg->stats));
265 	blkg->stats.start_idle_time = sched_clock();
266 	blkio_mark_blkg_idling(&blkg->stats);
267 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
268 }
269 EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
270 
271 void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
272 {
273 	unsigned long flags;
274 	unsigned long long now;
275 	struct blkio_group_stats *stats;
276 
277 	spin_lock_irqsave(&blkg->stats_lock, flags);
278 	stats = &blkg->stats;
279 	if (blkio_blkg_idling(stats)) {
280 		now = sched_clock();
281 		if (time_after64(now, stats->start_idle_time))
282 			stats->idle_time += now - stats->start_idle_time;
283 		blkio_clear_blkg_idling(stats);
284 	}
285 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
286 }
287 EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
288 
289 void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
290 {
291 	unsigned long flags;
292 	struct blkio_group_stats *stats;
293 
294 	spin_lock_irqsave(&blkg->stats_lock, flags);
295 	stats = &blkg->stats;
296 	stats->avg_queue_size_sum +=
297 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
298 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
299 	stats->avg_queue_size_samples++;
300 	blkio_update_group_wait_time(stats);
301 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
302 }
303 EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
304 
305 void blkiocg_set_start_empty_time(struct blkio_group *blkg)
306 {
307 	unsigned long flags;
308 	struct blkio_group_stats *stats;
309 
310 	spin_lock_irqsave(&blkg->stats_lock, flags);
311 	stats = &blkg->stats;
312 
313 	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
314 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
315 		spin_unlock_irqrestore(&blkg->stats_lock, flags);
316 		return;
317 	}
318 
319 	/*
320 	 * group is already marked empty. This can happen if cfqq got new
321 	 * request in parent group and moved to this group while being added
322 	 * to service tree. Just ignore the event and move on.
323 	 */
324 	if(blkio_blkg_empty(stats)) {
325 		spin_unlock_irqrestore(&blkg->stats_lock, flags);
326 		return;
327 	}
328 
329 	stats->start_empty_time = sched_clock();
330 	blkio_mark_blkg_empty(stats);
331 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
332 }
333 EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
334 
335 void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
336 			unsigned long dequeue)
337 {
338 	blkg->stats.dequeue += dequeue;
339 }
340 EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
341 #else
/*
 * Without CONFIG_DEBUG_BLK_CGROUP the group-wait and empty-time helpers
 * are empty stubs, so their callers need no #ifdef of their own.
 */
static inline void
blkio_set_start_group_wait_time(struct blkio_group *blkg,
				struct blkio_group *curr_blkg)
{
}

static inline void blkio_end_empty_time(struct blkio_group_stats *stats)
{
}
345 #endif
346 
347 void blkiocg_update_io_add_stats(struct blkio_group *blkg,
348 			struct blkio_group *curr_blkg, bool direction,
349 			bool sync)
350 {
351 	unsigned long flags;
352 
353 	spin_lock_irqsave(&blkg->stats_lock, flags);
354 	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
355 			sync);
356 	blkio_end_empty_time(&blkg->stats);
357 	blkio_set_start_group_wait_time(blkg, curr_blkg);
358 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
359 }
360 EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
361 
362 void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
363 						bool direction, bool sync)
364 {
365 	unsigned long flags;
366 
367 	spin_lock_irqsave(&blkg->stats_lock, flags);
368 	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
369 					direction, sync);
370 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
371 }
372 EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
373 
374 void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
375 				unsigned long unaccounted_time)
376 {
377 	unsigned long flags;
378 
379 	spin_lock_irqsave(&blkg->stats_lock, flags);
380 	blkg->stats.time += time;
381 	blkg->stats.unaccounted_time += unaccounted_time;
382 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
383 }
384 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
385 
386 void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
387 				uint64_t bytes, bool direction, bool sync)
388 {
389 	struct blkio_group_stats *stats;
390 	unsigned long flags;
391 
392 	spin_lock_irqsave(&blkg->stats_lock, flags);
393 	stats = &blkg->stats;
394 	stats->sectors += bytes >> 9;
395 	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
396 			sync);
397 	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
398 			direction, sync);
399 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
400 }
401 EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
402 
403 void blkiocg_update_completion_stats(struct blkio_group *blkg,
404 	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
405 {
406 	struct blkio_group_stats *stats;
407 	unsigned long flags;
408 	unsigned long long now = sched_clock();
409 
410 	spin_lock_irqsave(&blkg->stats_lock, flags);
411 	stats = &blkg->stats;
412 	if (time_after64(now, io_start_time))
413 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
414 				now - io_start_time, direction, sync);
415 	if (time_after64(io_start_time, start_time))
416 		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
417 				io_start_time - start_time, direction, sync);
418 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
419 }
420 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
421 
422 void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
423 					bool sync)
424 {
425 	unsigned long flags;
426 
427 	spin_lock_irqsave(&blkg->stats_lock, flags);
428 	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
429 			sync);
430 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
431 }
432 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
433 
434 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
435 		struct blkio_group *blkg, void *key, dev_t dev,
436 		enum blkio_policy_id plid)
437 {
438 	unsigned long flags;
439 
440 	spin_lock_irqsave(&blkcg->lock, flags);
441 	spin_lock_init(&blkg->stats_lock);
442 	rcu_assign_pointer(blkg->key, key);
443 	blkg->blkcg_id = css_id(&blkcg->css);
444 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
445 	blkg->plid = plid;
446 	spin_unlock_irqrestore(&blkcg->lock, flags);
447 	/* Need to take css reference ? */
448 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
449 	blkg->dev = dev;
450 }
451 EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
452 
453 static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
454 {
455 	hlist_del_init_rcu(&blkg->blkcg_node);
456 	blkg->blkcg_id = 0;
457 }
458 
459 /*
460  * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
461  * indicating that blk_group was unhashed by the time we got to it.
462  */
463 int blkiocg_del_blkio_group(struct blkio_group *blkg)
464 {
465 	struct blkio_cgroup *blkcg;
466 	unsigned long flags;
467 	struct cgroup_subsys_state *css;
468 	int ret = 1;
469 
470 	rcu_read_lock();
471 	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
472 	if (css) {
473 		blkcg = container_of(css, struct blkio_cgroup, css);
474 		spin_lock_irqsave(&blkcg->lock, flags);
475 		if (!hlist_unhashed(&blkg->blkcg_node)) {
476 			__blkiocg_del_blkio_group(blkg);
477 			ret = 0;
478 		}
479 		spin_unlock_irqrestore(&blkcg->lock, flags);
480 	}
481 
482 	rcu_read_unlock();
483 	return ret;
484 }
485 EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
486 
487 /* called under rcu_read_lock(). */
488 struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
489 {
490 	struct blkio_group *blkg;
491 	struct hlist_node *n;
492 	void *__key;
493 
494 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
495 		__key = blkg->key;
496 		if (__key == key)
497 			return blkg;
498 	}
499 
500 	return NULL;
501 }
502 EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
503 
504 static int
505 blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
506 {
507 	struct blkio_cgroup *blkcg;
508 	struct blkio_group *blkg;
509 	struct blkio_group_stats *stats;
510 	struct hlist_node *n;
511 	uint64_t queued[BLKIO_STAT_TOTAL];
512 	int i;
513 #ifdef CONFIG_DEBUG_BLK_CGROUP
514 	bool idling, waiting, empty;
515 	unsigned long long now = sched_clock();
516 #endif
517 
518 	blkcg = cgroup_to_blkio_cgroup(cgroup);
519 	spin_lock_irq(&blkcg->lock);
520 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
521 		spin_lock(&blkg->stats_lock);
522 		stats = &blkg->stats;
523 #ifdef CONFIG_DEBUG_BLK_CGROUP
524 		idling = blkio_blkg_idling(stats);
525 		waiting = blkio_blkg_waiting(stats);
526 		empty = blkio_blkg_empty(stats);
527 #endif
528 		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
529 			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
530 		memset(stats, 0, sizeof(struct blkio_group_stats));
531 		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
532 			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
533 #ifdef CONFIG_DEBUG_BLK_CGROUP
534 		if (idling) {
535 			blkio_mark_blkg_idling(stats);
536 			stats->start_idle_time = now;
537 		}
538 		if (waiting) {
539 			blkio_mark_blkg_waiting(stats);
540 			stats->start_group_wait_time = now;
541 		}
542 		if (empty) {
543 			blkio_mark_blkg_empty(stats);
544 			stats->start_empty_time = now;
545 		}
546 #endif
547 		spin_unlock(&blkg->stats_lock);
548 	}
549 	spin_unlock_irq(&blkcg->lock);
550 	return 0;
551 }
552 
553 static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
554 				int chars_left, bool diskname_only)
555 {
556 	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
557 	chars_left -= strlen(str);
558 	if (chars_left <= 0) {
559 		printk(KERN_WARNING
560 			"Possibly incorrect cgroup stat display format");
561 		return;
562 	}
563 	if (diskname_only)
564 		return;
565 	switch (type) {
566 	case BLKIO_STAT_READ:
567 		strlcat(str, " Read", chars_left);
568 		break;
569 	case BLKIO_STAT_WRITE:
570 		strlcat(str, " Write", chars_left);
571 		break;
572 	case BLKIO_STAT_SYNC:
573 		strlcat(str, " Sync", chars_left);
574 		break;
575 	case BLKIO_STAT_ASYNC:
576 		strlcat(str, " Async", chars_left);
577 		break;
578 	case BLKIO_STAT_TOTAL:
579 		strlcat(str, " Total", chars_left);
580 		break;
581 	default:
582 		strlcat(str, " Invalid", chars_left);
583 	}
584 }
585 
586 static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
587 				struct cgroup_map_cb *cb, dev_t dev)
588 {
589 	blkio_get_key_name(0, dev, str, chars_left, true);
590 	cb->fill(cb, str, val);
591 	return val;
592 }
593 
594 /* This should be called with blkg->stats_lock held */
595 static uint64_t blkio_get_stat(struct blkio_group *blkg,
596 		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
597 {
598 	uint64_t disk_total;
599 	char key_str[MAX_KEY_LEN];
600 	enum stat_sub_type sub_type;
601 
602 	if (type == BLKIO_STAT_TIME)
603 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
604 					blkg->stats.time, cb, dev);
605 	if (type == BLKIO_STAT_SECTORS)
606 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
607 					blkg->stats.sectors, cb, dev);
608 #ifdef CONFIG_DEBUG_BLK_CGROUP
609 	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
610 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
611 					blkg->stats.unaccounted_time, cb, dev);
612 	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
613 		uint64_t sum = blkg->stats.avg_queue_size_sum;
614 		uint64_t samples = blkg->stats.avg_queue_size_samples;
615 		if (samples)
616 			do_div(sum, samples);
617 		else
618 			sum = 0;
619 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
620 	}
621 	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
622 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
623 					blkg->stats.group_wait_time, cb, dev);
624 	if (type == BLKIO_STAT_IDLE_TIME)
625 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
626 					blkg->stats.idle_time, cb, dev);
627 	if (type == BLKIO_STAT_EMPTY_TIME)
628 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
629 					blkg->stats.empty_time, cb, dev);
630 	if (type == BLKIO_STAT_DEQUEUE)
631 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
632 					blkg->stats.dequeue, cb, dev);
633 #endif
634 
635 	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
636 			sub_type++) {
637 		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
638 		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
639 	}
640 	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
641 			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
642 	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
643 	cb->fill(cb, key_str, disk_total);
644 	return disk_total;
645 }
646 
647 static int blkio_check_dev_num(dev_t dev)
648 {
649 	int part = 0;
650 	struct gendisk *disk;
651 
652 	disk = get_gendisk(dev, &part);
653 	if (!disk || part)
654 		return -ENODEV;
655 
656 	return 0;
657 }
658 
659 static int blkio_policy_parse_and_set(char *buf,
660 	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
661 {
662 	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
663 	int ret;
664 	unsigned long major, minor, temp;
665 	int i = 0;
666 	dev_t dev;
667 	u64 bps, iops;
668 
669 	memset(s, 0, sizeof(s));
670 
671 	while ((p = strsep(&buf, " ")) != NULL) {
672 		if (!*p)
673 			continue;
674 
675 		s[i++] = p;
676 
677 		/* Prevent from inputing too many things */
678 		if (i == 3)
679 			break;
680 	}
681 
682 	if (i != 2)
683 		return -EINVAL;
684 
685 	p = strsep(&s[0], ":");
686 	if (p != NULL)
687 		major_s = p;
688 	else
689 		return -EINVAL;
690 
691 	minor_s = s[0];
692 	if (!minor_s)
693 		return -EINVAL;
694 
695 	ret = strict_strtoul(major_s, 10, &major);
696 	if (ret)
697 		return -EINVAL;
698 
699 	ret = strict_strtoul(minor_s, 10, &minor);
700 	if (ret)
701 		return -EINVAL;
702 
703 	dev = MKDEV(major, minor);
704 
705 	ret = blkio_check_dev_num(dev);
706 	if (ret)
707 		return ret;
708 
709 	newpn->dev = dev;
710 
711 	if (s[1] == NULL)
712 		return -EINVAL;
713 
714 	switch (plid) {
715 	case BLKIO_POLICY_PROP:
716 		ret = strict_strtoul(s[1], 10, &temp);
717 		if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
718 			temp > BLKIO_WEIGHT_MAX)
719 			return -EINVAL;
720 
721 		newpn->plid = plid;
722 		newpn->fileid = fileid;
723 		newpn->val.weight = temp;
724 		break;
725 	case BLKIO_POLICY_THROTL:
726 		switch(fileid) {
727 		case BLKIO_THROTL_read_bps_device:
728 		case BLKIO_THROTL_write_bps_device:
729 			ret = strict_strtoull(s[1], 10, &bps);
730 			if (ret)
731 				return -EINVAL;
732 
733 			newpn->plid = plid;
734 			newpn->fileid = fileid;
735 			newpn->val.bps = bps;
736 			break;
737 		case BLKIO_THROTL_read_iops_device:
738 		case BLKIO_THROTL_write_iops_device:
739 			ret = strict_strtoull(s[1], 10, &iops);
740 			if (ret)
741 				return -EINVAL;
742 
743 			if (iops > THROTL_IOPS_MAX)
744 				return -EINVAL;
745 
746 			newpn->plid = plid;
747 			newpn->fileid = fileid;
748 			newpn->val.iops = (unsigned int)iops;
749 			break;
750 		}
751 		break;
752 	default:
753 		BUG();
754 	}
755 
756 	return 0;
757 }
758 
759 unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
760 			      dev_t dev)
761 {
762 	struct blkio_policy_node *pn;
763 
764 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
765 				BLKIO_PROP_weight_device);
766 	if (pn)
767 		return pn->val.weight;
768 	else
769 		return blkcg->weight;
770 }
771 EXPORT_SYMBOL_GPL(blkcg_get_weight);
772 
773 uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
774 {
775 	struct blkio_policy_node *pn;
776 
777 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
778 				BLKIO_THROTL_read_bps_device);
779 	if (pn)
780 		return pn->val.bps;
781 	else
782 		return -1;
783 }
784 
785 uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
786 {
787 	struct blkio_policy_node *pn;
788 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
789 				BLKIO_THROTL_write_bps_device);
790 	if (pn)
791 		return pn->val.bps;
792 	else
793 		return -1;
794 }
795 
796 unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
797 {
798 	struct blkio_policy_node *pn;
799 
800 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
801 				BLKIO_THROTL_read_iops_device);
802 	if (pn)
803 		return pn->val.iops;
804 	else
805 		return -1;
806 }
807 
808 unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
809 {
810 	struct blkio_policy_node *pn;
811 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
812 				BLKIO_THROTL_write_iops_device);
813 	if (pn)
814 		return pn->val.iops;
815 	else
816 		return -1;
817 }
818 
819 /* Checks whether user asked for deleting a policy rule */
820 static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
821 {
822 	switch(pn->plid) {
823 	case BLKIO_POLICY_PROP:
824 		if (pn->val.weight == 0)
825 			return 1;
826 		break;
827 	case BLKIO_POLICY_THROTL:
828 		switch(pn->fileid) {
829 		case BLKIO_THROTL_read_bps_device:
830 		case BLKIO_THROTL_write_bps_device:
831 			if (pn->val.bps == 0)
832 				return 1;
833 			break;
834 		case BLKIO_THROTL_read_iops_device:
835 		case BLKIO_THROTL_write_iops_device:
836 			if (pn->val.iops == 0)
837 				return 1;
838 		}
839 		break;
840 	default:
841 		BUG();
842 	}
843 
844 	return 0;
845 }
846 
847 static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
848 					struct blkio_policy_node *newpn)
849 {
850 	switch(oldpn->plid) {
851 	case BLKIO_POLICY_PROP:
852 		oldpn->val.weight = newpn->val.weight;
853 		break;
854 	case BLKIO_POLICY_THROTL:
855 		switch(newpn->fileid) {
856 		case BLKIO_THROTL_read_bps_device:
857 		case BLKIO_THROTL_write_bps_device:
858 			oldpn->val.bps = newpn->val.bps;
859 			break;
860 		case BLKIO_THROTL_read_iops_device:
861 		case BLKIO_THROTL_write_iops_device:
862 			oldpn->val.iops = newpn->val.iops;
863 		}
864 		break;
865 	default:
866 		BUG();
867 	}
868 }
869 
870 /*
871  * Some rules/values in blkg have changed. Propagate those to respective
872  * policies.
873  */
874 static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
875 		struct blkio_group *blkg, struct blkio_policy_node *pn)
876 {
877 	unsigned int weight, iops;
878 	u64 bps;
879 
880 	switch(pn->plid) {
881 	case BLKIO_POLICY_PROP:
882 		weight = pn->val.weight ? pn->val.weight :
883 				blkcg->weight;
884 		blkio_update_group_weight(blkg, weight);
885 		break;
886 	case BLKIO_POLICY_THROTL:
887 		switch(pn->fileid) {
888 		case BLKIO_THROTL_read_bps_device:
889 		case BLKIO_THROTL_write_bps_device:
890 			bps = pn->val.bps ? pn->val.bps : (-1);
891 			blkio_update_group_bps(blkg, bps, pn->fileid);
892 			break;
893 		case BLKIO_THROTL_read_iops_device:
894 		case BLKIO_THROTL_write_iops_device:
895 			iops = pn->val.iops ? pn->val.iops : (-1);
896 			blkio_update_group_iops(blkg, iops, pn->fileid);
897 			break;
898 		}
899 		break;
900 	default:
901 		BUG();
902 	}
903 }
904 
905 /*
906  * A policy node rule has been updated. Propagate this update to all the
907  * block groups which might be affected by this update.
908  */
909 static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
910 				struct blkio_policy_node *pn)
911 {
912 	struct blkio_group *blkg;
913 	struct hlist_node *n;
914 
915 	spin_lock(&blkio_list_lock);
916 	spin_lock_irq(&blkcg->lock);
917 
918 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
919 		if (pn->dev != blkg->dev || pn->plid != blkg->plid)
920 			continue;
921 		blkio_update_blkg_policy(blkcg, blkg, pn);
922 	}
923 
924 	spin_unlock_irq(&blkcg->lock);
925 	spin_unlock(&blkio_list_lock);
926 }
927 
928 static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
929  				       const char *buffer)
930 {
931 	int ret = 0;
932 	char *buf;
933 	struct blkio_policy_node *newpn, *pn;
934 	struct blkio_cgroup *blkcg;
935 	int keep_newpn = 0;
936 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
937 	int fileid = BLKIOFILE_ATTR(cft->private);
938 
939 	buf = kstrdup(buffer, GFP_KERNEL);
940 	if (!buf)
941 		return -ENOMEM;
942 
943 	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
944 	if (!newpn) {
945 		ret = -ENOMEM;
946 		goto free_buf;
947 	}
948 
949 	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
950 	if (ret)
951 		goto free_newpn;
952 
953 	blkcg = cgroup_to_blkio_cgroup(cgrp);
954 
955 	spin_lock_irq(&blkcg->lock);
956 
957 	pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
958 	if (!pn) {
959 		if (!blkio_delete_rule_command(newpn)) {
960 			blkio_policy_insert_node(blkcg, newpn);
961 			keep_newpn = 1;
962 		}
963 		spin_unlock_irq(&blkcg->lock);
964 		goto update_io_group;
965 	}
966 
967 	if (blkio_delete_rule_command(newpn)) {
968 		blkio_policy_delete_node(pn);
969 		spin_unlock_irq(&blkcg->lock);
970 		goto update_io_group;
971 	}
972 	spin_unlock_irq(&blkcg->lock);
973 
974 	blkio_update_policy_rule(pn, newpn);
975 
976 update_io_group:
977 	blkio_update_policy_node_blkg(blkcg, newpn);
978 
979 free_newpn:
980 	if (!keep_newpn)
981 		kfree(newpn);
982 free_buf:
983 	kfree(buf);
984 	return ret;
985 }
986 
987 static void
988 blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
989 {
990 	switch(pn->plid) {
991 		case BLKIO_POLICY_PROP:
992 			if (pn->fileid == BLKIO_PROP_weight_device)
993 				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
994 					MINOR(pn->dev), pn->val.weight);
995 			break;
996 		case BLKIO_POLICY_THROTL:
997 			switch(pn->fileid) {
998 			case BLKIO_THROTL_read_bps_device:
999 			case BLKIO_THROTL_write_bps_device:
1000 				seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
1001 					MINOR(pn->dev), pn->val.bps);
1002 				break;
1003 			case BLKIO_THROTL_read_iops_device:
1004 			case BLKIO_THROTL_write_iops_device:
1005 				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1006 					MINOR(pn->dev), pn->val.iops);
1007 				break;
1008 			}
1009 			break;
1010 		default:
1011 			BUG();
1012 	}
1013 }
1014 
1015 /* cgroup files which read their data from policy nodes end up here */
1016 static void blkio_read_policy_node_files(struct cftype *cft,
1017 			struct blkio_cgroup *blkcg, struct seq_file *m)
1018 {
1019 	struct blkio_policy_node *pn;
1020 
1021 	if (!list_empty(&blkcg->policy_list)) {
1022 		spin_lock_irq(&blkcg->lock);
1023 		list_for_each_entry(pn, &blkcg->policy_list, node) {
1024 			if (!pn_matches_cftype(cft, pn))
1025 				continue;
1026 			blkio_print_policy_node(m, pn);
1027 		}
1028 		spin_unlock_irq(&blkcg->lock);
1029 	}
1030 }
1031 
1032 static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1033 				struct seq_file *m)
1034 {
1035 	struct blkio_cgroup *blkcg;
1036 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1037 	int name = BLKIOFILE_ATTR(cft->private);
1038 
1039 	blkcg = cgroup_to_blkio_cgroup(cgrp);
1040 
1041 	switch(plid) {
1042 	case BLKIO_POLICY_PROP:
1043 		switch(name) {
1044 		case BLKIO_PROP_weight_device:
1045 			blkio_read_policy_node_files(cft, blkcg, m);
1046 			return 0;
1047 		default:
1048 			BUG();
1049 		}
1050 		break;
1051 	case BLKIO_POLICY_THROTL:
1052 		switch(name){
1053 		case BLKIO_THROTL_read_bps_device:
1054 		case BLKIO_THROTL_write_bps_device:
1055 		case BLKIO_THROTL_read_iops_device:
1056 		case BLKIO_THROTL_write_iops_device:
1057 			blkio_read_policy_node_files(cft, blkcg, m);
1058 			return 0;
1059 		default:
1060 			BUG();
1061 		}
1062 		break;
1063 	default:
1064 		BUG();
1065 	}
1066 
1067 	return 0;
1068 }
1069 
1070 static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1071 		struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type,
1072 		bool show_total)
1073 {
1074 	struct blkio_group *blkg;
1075 	struct hlist_node *n;
1076 	uint64_t cgroup_total = 0;
1077 
1078 	rcu_read_lock();
1079 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1080 		if (blkg->dev) {
1081 			if (!cftype_blkg_same_policy(cft, blkg))
1082 				continue;
1083 			spin_lock_irq(&blkg->stats_lock);
1084 			cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
1085 						type);
1086 			spin_unlock_irq(&blkg->stats_lock);
1087 		}
1088 	}
1089 	if (show_total)
1090 		cb->fill(cb, "Total", cgroup_total);
1091 	rcu_read_unlock();
1092 	return 0;
1093 }
1094 
1095 /* All map kind of cgroup file get serviced by this function */
1096 static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1097 				struct cgroup_map_cb *cb)
1098 {
1099 	struct blkio_cgroup *blkcg;
1100 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1101 	int name = BLKIOFILE_ATTR(cft->private);
1102 
1103 	blkcg = cgroup_to_blkio_cgroup(cgrp);
1104 
1105 	switch(plid) {
1106 	case BLKIO_POLICY_PROP:
1107 		switch(name) {
1108 		case BLKIO_PROP_time:
1109 			return blkio_read_blkg_stats(blkcg, cft, cb,
1110 						BLKIO_STAT_TIME, 0);
1111 		case BLKIO_PROP_sectors:
1112 			return blkio_read_blkg_stats(blkcg, cft, cb,
1113 						BLKIO_STAT_SECTORS, 0);
1114 		case BLKIO_PROP_io_service_bytes:
1115 			return blkio_read_blkg_stats(blkcg, cft, cb,
1116 						BLKIO_STAT_SERVICE_BYTES, 1);
1117 		case BLKIO_PROP_io_serviced:
1118 			return blkio_read_blkg_stats(blkcg, cft, cb,
1119 						BLKIO_STAT_SERVICED, 1);
1120 		case BLKIO_PROP_io_service_time:
1121 			return blkio_read_blkg_stats(blkcg, cft, cb,
1122 						BLKIO_STAT_SERVICE_TIME, 1);
1123 		case BLKIO_PROP_io_wait_time:
1124 			return blkio_read_blkg_stats(blkcg, cft, cb,
1125 						BLKIO_STAT_WAIT_TIME, 1);
1126 		case BLKIO_PROP_io_merged:
1127 			return blkio_read_blkg_stats(blkcg, cft, cb,
1128 						BLKIO_STAT_MERGED, 1);
1129 		case BLKIO_PROP_io_queued:
1130 			return blkio_read_blkg_stats(blkcg, cft, cb,
1131 						BLKIO_STAT_QUEUED, 1);
1132 #ifdef CONFIG_DEBUG_BLK_CGROUP
1133 		case BLKIO_PROP_unaccounted_time:
1134 			return blkio_read_blkg_stats(blkcg, cft, cb,
1135 						BLKIO_STAT_UNACCOUNTED_TIME, 0);
1136 		case BLKIO_PROP_dequeue:
1137 			return blkio_read_blkg_stats(blkcg, cft, cb,
1138 						BLKIO_STAT_DEQUEUE, 0);
1139 		case BLKIO_PROP_avg_queue_size:
1140 			return blkio_read_blkg_stats(blkcg, cft, cb,
1141 						BLKIO_STAT_AVG_QUEUE_SIZE, 0);
1142 		case BLKIO_PROP_group_wait_time:
1143 			return blkio_read_blkg_stats(blkcg, cft, cb,
1144 						BLKIO_STAT_GROUP_WAIT_TIME, 0);
1145 		case BLKIO_PROP_idle_time:
1146 			return blkio_read_blkg_stats(blkcg, cft, cb,
1147 						BLKIO_STAT_IDLE_TIME, 0);
1148 		case BLKIO_PROP_empty_time:
1149 			return blkio_read_blkg_stats(blkcg, cft, cb,
1150 						BLKIO_STAT_EMPTY_TIME, 0);
1151 #endif
1152 		default:
1153 			BUG();
1154 		}
1155 		break;
1156 	case BLKIO_POLICY_THROTL:
1157 		switch(name){
1158 		case BLKIO_THROTL_io_service_bytes:
1159 			return blkio_read_blkg_stats(blkcg, cft, cb,
1160 						BLKIO_STAT_SERVICE_BYTES, 1);
1161 		case BLKIO_THROTL_io_serviced:
1162 			return blkio_read_blkg_stats(blkcg, cft, cb,
1163 						BLKIO_STAT_SERVICED, 1);
1164 		default:
1165 			BUG();
1166 		}
1167 		break;
1168 	default:
1169 		BUG();
1170 	}
1171 
1172 	return 0;
1173 }
1174 
1175 static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
1176 {
1177 	struct blkio_group *blkg;
1178 	struct hlist_node *n;
1179 	struct blkio_policy_node *pn;
1180 
1181 	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1182 		return -EINVAL;
1183 
1184 	spin_lock(&blkio_list_lock);
1185 	spin_lock_irq(&blkcg->lock);
1186 	blkcg->weight = (unsigned int)val;
1187 
1188 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1189 		pn = blkio_policy_search_node(blkcg, blkg->dev,
1190 				BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1191 		if (pn)
1192 			continue;
1193 
1194 		blkio_update_group_weight(blkg, blkcg->weight);
1195 	}
1196 	spin_unlock_irq(&blkcg->lock);
1197 	spin_unlock(&blkio_list_lock);
1198 	return 0;
1199 }
1200 
1201 static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
1202 	struct blkio_cgroup *blkcg;
1203 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1204 	int name = BLKIOFILE_ATTR(cft->private);
1205 
1206 	blkcg = cgroup_to_blkio_cgroup(cgrp);
1207 
1208 	switch(plid) {
1209 	case BLKIO_POLICY_PROP:
1210 		switch(name) {
1211 		case BLKIO_PROP_weight:
1212 			return (u64)blkcg->weight;
1213 		}
1214 		break;
1215 	default:
1216 		BUG();
1217 	}
1218 	return 0;
1219 }
1220 
1221 static int
1222 blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1223 {
1224 	struct blkio_cgroup *blkcg;
1225 	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1226 	int name = BLKIOFILE_ATTR(cft->private);
1227 
1228 	blkcg = cgroup_to_blkio_cgroup(cgrp);
1229 
1230 	switch(plid) {
1231 	case BLKIO_POLICY_PROP:
1232 		switch(name) {
1233 		case BLKIO_PROP_weight:
1234 			return blkio_weight_write(blkcg, val);
1235 		}
1236 		break;
1237 	default:
1238 		BUG();
1239 	}
1240 
1241 	return 0;
1242 }
1243 
1244 struct cftype blkio_files[] = {
1245 	{
1246 		.name = "weight_device",
1247 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1248 				BLKIO_PROP_weight_device),
1249 		.read_seq_string = blkiocg_file_read,
1250 		.write_string = blkiocg_file_write,
1251 		.max_write_len = 256,
1252 	},
1253 	{
1254 		.name = "weight",
1255 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1256 				BLKIO_PROP_weight),
1257 		.read_u64 = blkiocg_file_read_u64,
1258 		.write_u64 = blkiocg_file_write_u64,
1259 	},
1260 	{
1261 		.name = "time",
1262 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1263 				BLKIO_PROP_time),
1264 		.read_map = blkiocg_file_read_map,
1265 	},
1266 	{
1267 		.name = "sectors",
1268 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1269 				BLKIO_PROP_sectors),
1270 		.read_map = blkiocg_file_read_map,
1271 	},
1272 	{
1273 		.name = "io_service_bytes",
1274 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1275 				BLKIO_PROP_io_service_bytes),
1276 		.read_map = blkiocg_file_read_map,
1277 	},
1278 	{
1279 		.name = "io_serviced",
1280 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1281 				BLKIO_PROP_io_serviced),
1282 		.read_map = blkiocg_file_read_map,
1283 	},
1284 	{
1285 		.name = "io_service_time",
1286 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1287 				BLKIO_PROP_io_service_time),
1288 		.read_map = blkiocg_file_read_map,
1289 	},
1290 	{
1291 		.name = "io_wait_time",
1292 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1293 				BLKIO_PROP_io_wait_time),
1294 		.read_map = blkiocg_file_read_map,
1295 	},
1296 	{
1297 		.name = "io_merged",
1298 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1299 				BLKIO_PROP_io_merged),
1300 		.read_map = blkiocg_file_read_map,
1301 	},
1302 	{
1303 		.name = "io_queued",
1304 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1305 				BLKIO_PROP_io_queued),
1306 		.read_map = blkiocg_file_read_map,
1307 	},
1308 	{
1309 		.name = "reset_stats",
1310 		.write_u64 = blkiocg_reset_stats,
1311 	},
1312 #ifdef CONFIG_BLK_DEV_THROTTLING
1313 	{
1314 		.name = "throttle.read_bps_device",
1315 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1316 				BLKIO_THROTL_read_bps_device),
1317 		.read_seq_string = blkiocg_file_read,
1318 		.write_string = blkiocg_file_write,
1319 		.max_write_len = 256,
1320 	},
1321 
1322 	{
1323 		.name = "throttle.write_bps_device",
1324 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1325 				BLKIO_THROTL_write_bps_device),
1326 		.read_seq_string = blkiocg_file_read,
1327 		.write_string = blkiocg_file_write,
1328 		.max_write_len = 256,
1329 	},
1330 
1331 	{
1332 		.name = "throttle.read_iops_device",
1333 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1334 				BLKIO_THROTL_read_iops_device),
1335 		.read_seq_string = blkiocg_file_read,
1336 		.write_string = blkiocg_file_write,
1337 		.max_write_len = 256,
1338 	},
1339 
1340 	{
1341 		.name = "throttle.write_iops_device",
1342 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1343 				BLKIO_THROTL_write_iops_device),
1344 		.read_seq_string = blkiocg_file_read,
1345 		.write_string = blkiocg_file_write,
1346 		.max_write_len = 256,
1347 	},
1348 	{
1349 		.name = "throttle.io_service_bytes",
1350 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1351 				BLKIO_THROTL_io_service_bytes),
1352 		.read_map = blkiocg_file_read_map,
1353 	},
1354 	{
1355 		.name = "throttle.io_serviced",
1356 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1357 				BLKIO_THROTL_io_serviced),
1358 		.read_map = blkiocg_file_read_map,
1359 	},
1360 #endif /* CONFIG_BLK_DEV_THROTTLING */
1361 
1362 #ifdef CONFIG_DEBUG_BLK_CGROUP
1363 	{
1364 		.name = "avg_queue_size",
1365 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1366 				BLKIO_PROP_avg_queue_size),
1367 		.read_map = blkiocg_file_read_map,
1368 	},
1369 	{
1370 		.name = "group_wait_time",
1371 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1372 				BLKIO_PROP_group_wait_time),
1373 		.read_map = blkiocg_file_read_map,
1374 	},
1375 	{
1376 		.name = "idle_time",
1377 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1378 				BLKIO_PROP_idle_time),
1379 		.read_map = blkiocg_file_read_map,
1380 	},
1381 	{
1382 		.name = "empty_time",
1383 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1384 				BLKIO_PROP_empty_time),
1385 		.read_map = blkiocg_file_read_map,
1386 	},
1387 	{
1388 		.name = "dequeue",
1389 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1390 				BLKIO_PROP_dequeue),
1391 		.read_map = blkiocg_file_read_map,
1392 	},
1393 	{
1394 		.name = "unaccounted_time",
1395 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1396 				BLKIO_PROP_unaccounted_time),
1397 		.read_map = blkiocg_file_read_map,
1398 	},
1399 #endif
1400 };
1401 
1402 static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1403 {
1404 	return cgroup_add_files(cgroup, subsys, blkio_files,
1405 				ARRAY_SIZE(blkio_files));
1406 }
1407 
1408 static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1409 {
1410 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1411 	unsigned long flags;
1412 	struct blkio_group *blkg;
1413 	void *key;
1414 	struct blkio_policy_type *blkiop;
1415 	struct blkio_policy_node *pn, *pntmp;
1416 
1417 	rcu_read_lock();
1418 	do {
1419 		spin_lock_irqsave(&blkcg->lock, flags);
1420 
1421 		if (hlist_empty(&blkcg->blkg_list)) {
1422 			spin_unlock_irqrestore(&blkcg->lock, flags);
1423 			break;
1424 		}
1425 
1426 		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
1427 					blkcg_node);
1428 		key = rcu_dereference(blkg->key);
1429 		__blkiocg_del_blkio_group(blkg);
1430 
1431 		spin_unlock_irqrestore(&blkcg->lock, flags);
1432 
1433 		/*
1434 		 * This blkio_group is being unlinked as associated cgroup is
1435 		 * going away. Let all the IO controlling policies know about
1436 		 * this event.
1437 		 */
1438 		spin_lock(&blkio_list_lock);
1439 		list_for_each_entry(blkiop, &blkio_list, list) {
1440 			if (blkiop->plid != blkg->plid)
1441 				continue;
1442 			blkiop->ops.blkio_unlink_group_fn(key, blkg);
1443 		}
1444 		spin_unlock(&blkio_list_lock);
1445 	} while (1);
1446 
1447 	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
1448 		blkio_policy_delete_node(pn);
1449 		kfree(pn);
1450 	}
1451 
1452 	free_css_id(&blkio_subsys, &blkcg->css);
1453 	rcu_read_unlock();
1454 	if (blkcg != &blkio_root_cgroup)
1455 		kfree(blkcg);
1456 }
1457 
1458 static struct cgroup_subsys_state *
1459 blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1460 {
1461 	struct blkio_cgroup *blkcg;
1462 	struct cgroup *parent = cgroup->parent;
1463 
1464 	if (!parent) {
1465 		blkcg = &blkio_root_cgroup;
1466 		goto done;
1467 	}
1468 
1469 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1470 	if (!blkcg)
1471 		return ERR_PTR(-ENOMEM);
1472 
1473 	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1474 done:
1475 	spin_lock_init(&blkcg->lock);
1476 	INIT_HLIST_HEAD(&blkcg->blkg_list);
1477 
1478 	INIT_LIST_HEAD(&blkcg->policy_list);
1479 	return &blkcg->css;
1480 }
1481 
1482 /*
1483  * We cannot support shared io contexts, as we have no mean to support
1484  * two tasks with the same ioc in two different groups without major rework
1485  * of the main cic data structures.  For now we allow a task to change
1486  * its cgroup only if it's the only owner of its ioc.
1487  */
1488 static int blkiocg_can_attach(struct cgroup_subsys *subsys,
1489 				struct cgroup *cgroup, struct task_struct *tsk,
1490 				bool threadgroup)
1491 {
1492 	struct io_context *ioc;
1493 	int ret = 0;
1494 
1495 	/* task_lock() is needed to avoid races with exit_io_context() */
1496 	task_lock(tsk);
1497 	ioc = tsk->io_context;
1498 	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1499 		ret = -EINVAL;
1500 	task_unlock(tsk);
1501 
1502 	return ret;
1503 }
1504 
1505 static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
1506 				struct cgroup *prev, struct task_struct *tsk,
1507 				bool threadgroup)
1508 {
1509 	struct io_context *ioc;
1510 
1511 	task_lock(tsk);
1512 	ioc = tsk->io_context;
1513 	if (ioc)
1514 		ioc->cgroup_changed = 1;
1515 	task_unlock(tsk);
1516 }
1517 
1518 void blkio_policy_register(struct blkio_policy_type *blkiop)
1519 {
1520 	spin_lock(&blkio_list_lock);
1521 	list_add_tail(&blkiop->list, &blkio_list);
1522 	spin_unlock(&blkio_list_lock);
1523 }
1524 EXPORT_SYMBOL_GPL(blkio_policy_register);
1525 
1526 void blkio_policy_unregister(struct blkio_policy_type *blkiop)
1527 {
1528 	spin_lock(&blkio_list_lock);
1529 	list_del_init(&blkiop->list);
1530 	spin_unlock(&blkio_list_lock);
1531 }
1532 EXPORT_SYMBOL_GPL(blkio_policy_unregister);
1533 
1534 static int __init init_cgroup_blkio(void)
1535 {
1536 	return cgroup_load_subsys(&blkio_subsys);
1537 }
1538 
1539 static void __exit exit_cgroup_blkio(void)
1540 {
1541 	cgroup_unload_subsys(&blkio_subsys);
1542 }
1543 
1544 module_init(init_cgroup_blkio);
1545 module_exit(exit_cgroup_blkio);
1546 MODULE_LICENSE("GPL");
1547