xref: /linux/block/blk-cgroup.c (revision 3e4cd0737d2e9c3dd52153a23aef1753e3a99fc4)
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk-cgroup.h"
#include <linux/genhd.h>

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2 * BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct task_struct *, bool);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
			   struct cgroup *, struct task_struct *, bool);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

/* for encoding cft->private value on file */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
/* Which policy owns the file: proportional weight or throttling */
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
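/*
 * Example: BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, BLKIO_THROTL_read_bps_device)
 * packs the policy id into the upper 16 bits and the file attribute into the
 * lower 16 bits of cft->private; BLKIOFILE_POLICY()/BLKIOFILE_ATTR() unpack
 * them again in the read/write handlers below.
 */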

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.attach = blkiocg_attach,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
	.subsys_id = blkio_subsys_id,
#endif
	.use_id = 1,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
					    struct blkio_policy_node *pn)
{
	list_add(&pn->node, &blkcg->policy_list);
}

static inline bool cftype_blkg_same_policy(struct cftype *cft,
			struct blkio_group *blkg)
{
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);

	if (blkg->plid == plid)
		return 1;

	return 0;
}

/* Determines if policy node matches cgroup file being accessed */
static inline bool pn_matches_cftype(struct cftype *cft,
			struct blkio_policy_node *pn)
{
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);

	return (plid == pn->plid && fileid == pn->fileid);
}

/* Must be called with blkcg->lock held */
static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
{
	list_del(&pn->node);
}

/* Must be called with blkcg->lock held */
static struct blkio_policy_node *
blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
		enum blkio_policy_id plid, int fileid)
{
	struct blkio_policy_node *pn;

	list_for_each_entry(pn, &blkcg->policy_list, node) {
		if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
			return pn;
	}

	return NULL;
}

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(task_blkio_cgroup);

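/*
 * Notify the policy that owns this blkg (e.g. CFQ's proportional weight
 * policy) that the group's weight has changed.
 */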
static inline void
blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {
		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;
		if (blkiop->ops.blkio_update_group_weight_fn)
			blkiop->ops.blkio_update_group_weight_fn(blkg->key,
							blkg, weight);
	}
}

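/*
 * Push a new read or write bytes-per-second limit to the throttling policy
 * that owns this blkg; fileid selects which of the two limits is updated.
 */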
static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
				int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;

		if (fileid == BLKIO_THROTL_read_bps_device
		    && blkiop->ops.blkio_update_group_read_bps_fn)
			blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
								blkg, bps);

		if (fileid == BLKIO_THROTL_write_bps_device
		    && blkiop->ops.blkio_update_group_write_bps_fn)
			blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
								blkg, bps);
	}
}

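/*
 * Same as blkio_update_group_bps(), but for the read/write IO-operations-
 * per-second limits.
 */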
static inline void blkio_update_group_iops(struct blkio_group *blkg,
			unsigned int iops, int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;

		if (fileid == BLKIO_THROTL_read_iops_device
		    && blkiop->ops.blkio_update_group_read_iops_fn)
			blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
								blkg, iops);

		if (fileid == BLKIO_THROTL_write_iops_device
		    && blkiop->ops.blkio_update_group_write_iops_fn)
			blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
								blkg, iops);
	}
}

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
				bool sync)
{
	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
}

/*
 * Decrements the appropriate stat variable, depending on the request type,
 * and BUG()s if the counter is already zero.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
	if (direction) {
		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
		stat[BLKIO_STAT_WRITE]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_READ] == 0);
		stat[BLKIO_STAT_READ]--;
	}
	if (sync) {
		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
		stat[BLKIO_STAT_SYNC]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
		stat[BLKIO_STAT_ASYNC]--;
	}
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the blkg->stats_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
						struct blkio_group *curr_blkg)
{
	if (blkio_blkg_waiting(&blkg->stats))
		return;
	if (blkg == curr_blkg)
		return;
	blkg->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&blkg->stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		stats->empty_time += now - stats->start_empty_time;
	blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	BUG_ON(blkio_blkg_idling(&blkg->stats));
	blkg->stats.start_idle_time = sched_clock();
	blkio_mark_blkg_idling(&blkg->stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	unsigned long long now;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (blkio_blkg_idling(stats)) {
		now = sched_clock();
		if (time_after64(now, stats->start_idle_time))
			stats->idle_time += now - stats->start_idle_time;
		blkio_clear_blkg_idling(stats);
	}
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->avg_queue_size_sum +=
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
	stats->avg_queue_size_samples++;
	blkio_update_group_wait_time(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);

void blkiocg_set_start_empty_time(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;

	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	/*
	 * The group is already marked empty. This can happen if the cfqq got
	 * a new request in the parent group and moved to this group while it
	 * was being added to the service tree. Just ignore the event and
	 * move on.
	 */
	if (blkio_blkg_empty(stats)) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
			unsigned long dequeue)
{
	blkg->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_group *curr_blkg) {}
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
#endif

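/*
 * A request was added to the group's queue: bump the QUEUED counters and,
 * when CONFIG_DEBUG_BLK_CGROUP is set, close any "empty" period and start
 * the group-wait clock if another group currently holds the device.
 */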
void blkiocg_update_io_add_stats(struct blkio_group *blkg,
			struct blkio_group *curr_blkg, bool direction,
			bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
			sync);
	blkio_end_empty_time(&blkg->stats);
	blkio_set_start_group_wait_time(blkg, curr_blkg);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
						bool direction, bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
					direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
				unsigned long unaccounted_time)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkg->stats.time += time;
	blkg->stats.unaccounted_time += unaccounted_time;
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				uint64_t bytes, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->sectors += bytes >> 9;
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
			sync);
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
			direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);

void blkiocg_update_completion_stats(struct blkio_group *blkg,
	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;
	unsigned long long now = sched_clock();

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
				now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
				io_start_time - start_time, direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
					bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
			sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

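/*
 * Link a policy's blkio_group into the cgroup. @key identifies the policy's
 * per-queue data (e.g. the cfq_data or throtl_data the group hangs off) and
 * is what blkiocg_lookup_group() later matches on.
 */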
void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
		struct blkio_group *blkg, void *key, dev_t dev,
		enum blkio_policy_id plid)
{
	unsigned long flags;

	spin_lock_irqsave(&blkcg->lock, flags);
	spin_lock_init(&blkg->stats_lock);
	rcu_assign_pointer(blkg->key, key);
	blkg->blkcg_id = css_id(&blkcg->css);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	blkg->plid = plid;
	spin_unlock_irqrestore(&blkcg->lock, flags);
	/* Need to take css reference? */
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
	blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);

static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	hlist_del_init_rcu(&blkg->blkcg_node);
	blkg->blkcg_id = 0;
}

/*
 * Returns 0 if the blkio_group was still on the cgroup list. Otherwise
 * returns 1, indicating that the blkio_group was unhashed by the time we
 * got to it.
 */
int blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	struct blkio_cgroup *blkcg;
	unsigned long flags;
	struct cgroup_subsys_state *css;
	int ret = 1;

	rcu_read_lock();
	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
	if (css) {
		blkcg = container_of(css, struct blkio_cgroup, css);
		spin_lock_irqsave(&blkcg->lock, flags);
		if (!hlist_unhashed(&blkg->blkcg_node)) {
			__blkiocg_del_blkio_group(blkg);
			ret = 0;
		}
		spin_unlock_irqrestore(&blkcg->lock, flags);
	}

	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);

/* called under rcu_read_lock(). */
struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	void *__key;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
		__key = blkg->key;
		if (__key == key)
			return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkiocg_lookup_group);

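/*
 * Handler for writes to blkio.reset_stats: zero the stats of every group in
 * the cgroup while preserving the currently-queued request counts and, in
 * debug builds, the idling/waiting/empty state (their clocks restart at now).
 */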
static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct blkio_group_stats *stats;
	struct hlist_node *n;
	uint64_t queued[BLKIO_STAT_TOTAL];
	int i;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	bool idling, waiting, empty;
	unsigned long long now = sched_clock();
#endif

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		spin_lock(&blkg->stats_lock);
		stats = &blkg->stats;
#ifdef CONFIG_DEBUG_BLK_CGROUP
		idling = blkio_blkg_idling(stats);
		waiting = blkio_blkg_waiting(stats);
		empty = blkio_blkg_empty(stats);
#endif
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
		memset(stats, 0, sizeof(struct blkio_group_stats));
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
#ifdef CONFIG_DEBUG_BLK_CGROUP
		if (idling) {
			blkio_mark_blkg_idling(stats);
			stats->start_idle_time = now;
		}
		if (waiting) {
			blkio_mark_blkg_waiting(stats);
			stats->start_group_wait_time = now;
		}
		if (empty) {
			blkio_mark_blkg_empty(stats);
			stats->start_empty_time = now;
		}
#endif
		spin_unlock(&blkg->stats_lock);
	}
	spin_unlock_irq(&blkcg->lock);
	return 0;
}

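/*
 * Build the "major:minor [Read|Write|Sync|Async|Total]" key used when
 * emitting per-device stats; with @diskname_only only "major:minor" is kept.
 */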
static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
				int chars_left, bool diskname_only)
{
	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
	chars_left -= strlen(str);
	if (chars_left <= 0) {
		printk(KERN_WARNING
			"Possibly incorrect cgroup stat display format\n");
		return;
	}
	if (diskname_only)
		return;
	switch (type) {
	case BLKIO_STAT_READ:
		strlcat(str, " Read", chars_left);
		break;
	case BLKIO_STAT_WRITE:
		strlcat(str, " Write", chars_left);
		break;
	case BLKIO_STAT_SYNC:
		strlcat(str, " Sync", chars_left);
		break;
	case BLKIO_STAT_ASYNC:
		strlcat(str, " Async", chars_left);
		break;
	case BLKIO_STAT_TOTAL:
		strlcat(str, " Total", chars_left);
		break;
	default:
		strlcat(str, " Invalid", chars_left);
	}
}

static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
				struct cgroup_map_cb *cb, dev_t dev)
{
	blkio_get_key_name(0, dev, str, chars_left, true);
	cb->fill(cb, str, val);
	return val;
}

/* This should be called with blkg->stats_lock held */
static uint64_t blkio_get_stat(struct blkio_group *blkg,
		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
{
	uint64_t disk_total;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.time, cb, dev);
	if (type == BLKIO_STAT_SECTORS)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.sectors, cb, dev);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.unaccounted_time, cb, dev);
	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
		uint64_t sum = blkg->stats.avg_queue_size_sum;
		uint64_t samples = blkg->stats.avg_queue_size_samples;
		if (samples)
			do_div(sum, samples);
		else
			sum = 0;
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
	}
	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.group_wait_time, cb, dev);
	if (type == BLKIO_STAT_IDLE_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.idle_time, cb, dev);
	if (type == BLKIO_STAT_EMPTY_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.empty_time, cb, dev);
	if (type == BLKIO_STAT_DEQUEUE)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.dequeue, cb, dev);
#endif

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
			sub_type++) {
		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
	}
	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}

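/*
 * A rule may only be set on a whole disk that actually exists; reject
 * unknown device numbers and partitions.
 */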
static int blkio_check_dev_num(dev_t dev)
{
	int part = 0;
	struct gendisk *disk;

	disk = get_gendisk(dev, &part);
	if (!disk || part)
		return -ENODEV;

	return 0;
}

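/*
 * Parse a rule of the form "major:minor value" (e.g. "8:16 500") written to
 * one of the policy files and fill @newpn accordingly. The value is a weight
 * for the proportional policy and a bps/iops limit for the throttling files.
 */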
static int blkio_policy_parse_and_set(char *buf,
	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
{
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	int ret;
	unsigned long major, minor, temp;
	int i = 0;
	dev_t dev;
	u64 bps, iops;

	memset(s, 0, sizeof(s));

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Prevent inputting too many fields */
		if (i == 3)
			break;
	}

	if (i != 2)
		return -EINVAL;

	p = strsep(&s[0], ":");
	if (p != NULL)
		major_s = p;
	else
		return -EINVAL;

	minor_s = s[0];
	if (!minor_s)
		return -EINVAL;

	ret = strict_strtoul(major_s, 10, &major);
	if (ret)
		return -EINVAL;

	ret = strict_strtoul(minor_s, 10, &minor);
	if (ret)
		return -EINVAL;

	dev = MKDEV(major, minor);

	ret = blkio_check_dev_num(dev);
	if (ret)
		return ret;

	newpn->dev = dev;

	if (s[1] == NULL)
		return -EINVAL;

	switch (plid) {
	case BLKIO_POLICY_PROP:
		ret = strict_strtoul(s[1], 10, &temp);
		if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
			temp > BLKIO_WEIGHT_MAX)
			return -EINVAL;

		newpn->plid = plid;
		newpn->fileid = fileid;
		newpn->val.weight = temp;
		break;
	case BLKIO_POLICY_THROTL:
		switch (fileid) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
			ret = strict_strtoull(s[1], 10, &bps);
			if (ret)
				return -EINVAL;

			newpn->plid = plid;
			newpn->fileid = fileid;
			newpn->val.bps = bps;
			break;
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			ret = strict_strtoull(s[1], 10, &iops);
			if (ret)
				return -EINVAL;

			if (iops > THROTL_IOPS_MAX)
				return -EINVAL;

			newpn->plid = plid;
			newpn->fileid = fileid;
			newpn->val.iops = (unsigned int)iops;
			break;
		}
		break;
	default:
		BUG();
	}

	return 0;
}

unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
			      dev_t dev)
{
	struct blkio_policy_node *pn;

	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
				BLKIO_PROP_weight_device);
	if (pn)
		return pn->val.weight;
	else
		return blkcg->weight;
}
EXPORT_SYMBOL_GPL(blkcg_get_weight);

uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;

	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_bps_device);
	if (pn)
		return pn->val.bps;
	else
		return -1;
}

uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;

	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_bps_device);
	if (pn)
		return pn->val.bps;
	else
		return -1;
}

unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;

	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_iops_device);
	if (pn)
		return pn->val.iops;
	else
		return -1;
}

unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;

	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_iops_device);
	if (pn)
		return pn->val.iops;
	else
		return -1;
}

/* Check whether the user asked to delete a policy rule */
static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
{
	switch (pn->plid) {
	case BLKIO_POLICY_PROP:
		if (pn->val.weight == 0)
			return 1;
		break;
	case BLKIO_POLICY_THROTL:
		switch (pn->fileid) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
			if (pn->val.bps == 0)
				return 1;
			break;
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			if (pn->val.iops == 0)
				return 1;
		}
		break;
	default:
		BUG();
	}

	return 0;
}

static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
					struct blkio_policy_node *newpn)
{
	switch (oldpn->plid) {
	case BLKIO_POLICY_PROP:
		oldpn->val.weight = newpn->val.weight;
		break;
	case BLKIO_POLICY_THROTL:
		switch (newpn->fileid) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
			oldpn->val.bps = newpn->val.bps;
			break;
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			oldpn->val.iops = newpn->val.iops;
		}
		break;
	default:
		BUG();
	}
}

/*
 * Some rules/values in blkg have changed. Propagate those to respective
 * policies.
 */
static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
		struct blkio_group *blkg, struct blkio_policy_node *pn)
{
	unsigned int weight, iops;
	u64 bps;

	switch (pn->plid) {
	case BLKIO_POLICY_PROP:
		weight = pn->val.weight ? pn->val.weight :
				blkcg->weight;
		blkio_update_group_weight(blkg, weight);
		break;
	case BLKIO_POLICY_THROTL:
		switch (pn->fileid) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
			bps = pn->val.bps ? pn->val.bps : (-1);
			blkio_update_group_bps(blkg, bps, pn->fileid);
			break;
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			iops = pn->val.iops ? pn->val.iops : (-1);
			blkio_update_group_iops(blkg, iops, pn->fileid);
			break;
		}
		break;
	default:
		BUG();
	}
}

/*
 * A policy node rule has been updated. Propagate this update to all the
 * block groups which might be affected by this update.
 */
static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
				struct blkio_policy_node *pn)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		if (pn->dev != blkg->dev || pn->plid != blkg->plid)
			continue;
		blkio_update_blkg_policy(blkcg, blkg, pn);
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
}

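/*
 * Common write handler for all per-device rule files. The written rule is
 * parsed into a policy node which is then inserted, updated, or (for a zero
 * value) deleted, and the change is propagated to the matching blkio_groups.
 */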
static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
			      const char *buffer)
{
	int ret = 0;
	char *buf;
	struct blkio_policy_node *newpn, *pn;
	struct blkio_cgroup *blkcg;
	int keep_newpn = 0;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);

	buf = kstrdup(buffer, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
	if (!newpn) {
		ret = -ENOMEM;
		goto free_buf;
	}

	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
	if (ret)
		goto free_newpn;

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	spin_lock_irq(&blkcg->lock);

	pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
	if (!pn) {
		if (!blkio_delete_rule_command(newpn)) {
			blkio_policy_insert_node(blkcg, newpn);
			keep_newpn = 1;
		}
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}

	if (blkio_delete_rule_command(newpn)) {
		blkio_policy_delete_node(pn);
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}
	spin_unlock_irq(&blkcg->lock);

	blkio_update_policy_rule(pn, newpn);

update_io_group:
	blkio_update_policy_node_blkg(blkcg, newpn);

free_newpn:
	if (!keep_newpn)
		kfree(newpn);
free_buf:
	kfree(buf);
	return ret;
}

static void
blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
{
	switch (pn->plid) {
	case BLKIO_POLICY_PROP:
		if (pn->fileid == BLKIO_PROP_weight_device)
			seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
				MINOR(pn->dev), pn->val.weight);
		break;
	case BLKIO_POLICY_THROTL:
		switch (pn->fileid) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
			seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
				MINOR(pn->dev), pn->val.bps);
			break;
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
				MINOR(pn->dev), pn->val.iops);
			break;
		}
		break;
	default:
		BUG();
	}
}

/* cgroup files which read their data from policy nodes end up here */
static void blkio_read_policy_node_files(struct cftype *cft,
			struct blkio_cgroup *blkcg, struct seq_file *m)
{
	struct blkio_policy_node *pn;

	if (!list_empty(&blkcg->policy_list)) {
		spin_lock_irq(&blkcg->lock);
		list_for_each_entry(pn, &blkcg->policy_list, node) {
			if (!pn_matches_cftype(cft, pn))
				continue;
			blkio_print_policy_node(m, pn);
		}
		spin_unlock_irq(&blkcg->lock);
	}
}

static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
				struct seq_file *m)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight_device:
			blkio_read_policy_node_files(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch (name) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			blkio_read_policy_node_files(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}

static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
		struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type,
		bool show_total)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	uint64_t cgroup_total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
		if (blkg->dev) {
			if (!cftype_blkg_same_policy(cft, blkg))
				continue;
			spin_lock_irq(&blkg->stats_lock);
			cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
						type);
			spin_unlock_irq(&blkg->stats_lock);
		}
	}
	if (show_total)
		cb->fill(cb, "Total", cgroup_total);
	rcu_read_unlock();
	return 0;
}

/* All map-type cgroup files are serviced by this function */
static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
				struct cgroup_map_cb *cb)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_TIME, 0);
		case BLKIO_PROP_sectors:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_SECTORS, 0);
		case BLKIO_PROP_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_SERVICE_BYTES, 1);
		case BLKIO_PROP_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_SERVICED, 1);
		case BLKIO_PROP_io_service_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_SERVICE_TIME, 1);
		case BLKIO_PROP_io_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_WAIT_TIME, 1);
		case BLKIO_PROP_io_merged:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_MERGED, 1);
		case BLKIO_PROP_io_queued:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_QUEUED, 1);
#ifdef CONFIG_DEBUG_BLK_CGROUP
		case BLKIO_PROP_unaccounted_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_UNACCOUNTED_TIME, 0);
		case BLKIO_PROP_dequeue:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_DEQUEUE, 0);
		case BLKIO_PROP_avg_queue_size:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_AVG_QUEUE_SIZE, 0);
		case BLKIO_PROP_group_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_GROUP_WAIT_TIME, 0);
		case BLKIO_PROP_idle_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_IDLE_TIME, 0);
		case BLKIO_PROP_empty_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_EMPTY_TIME, 0);
#endif
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch (name) {
		case BLKIO_THROTL_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_SERVICE_BYTES, 1);
		case BLKIO_THROTL_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_SERVICED, 1);
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}

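/*
 * Handler for writes to blkio.weight: update the cgroup's default weight and
 * push it to every group that has no per-device blkio.weight_device override.
 */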
static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	struct blkio_policy_node *pn;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		pn = blkio_policy_search_node(blkcg, blkg->dev,
				BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
		if (pn)
			continue;

		blkio_update_group_weight(blkg, blkcg->weight);
	}
	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight:
			return (u64)blkcg->weight;
		}
		break;
	default:
		BUG();
	}
	return 0;
}

static int
blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight:
			return blkio_weight_write(blkcg, val);
		}
		break;
	default:
		BUG();
	}

	return 0;
}

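/*
 * Control and stat files exposed by this controller. Example usage from a
 * shell (the /cgroup/blkio mount point is illustrative):
 *
 *   echo 1000 > /cgroup/blkio/grp1/blkio.weight
 *   echo "8:16 300" > /cgroup/blkio/grp1/blkio.weight_device
 *   echo "8:16 1048576" > /cgroup/blkio/grp1/blkio.throttle.read_bps_device
 *
 * Writing a value of 0 to a per-device file deletes that rule.
 */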
struct cftype blkio_files[] = {
	{
		.name = "weight_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "weight",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight),
		.read_u64 = blkiocg_file_read_u64,
		.write_u64 = blkiocg_file_write_u64,
	},
	{
		.name = "time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "sectors",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_sectors),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_merged",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_merged),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_queued",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_queued),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_BLK_DEV_THROTTLING
	{
		.name = "throttle.read_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.read_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "throttle.io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
#endif /* CONFIG_BLK_DEV_THROTTLING */

#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "avg_queue_size",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_avg_queue_size),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "group_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_group_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "idle_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_idle_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "empty_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_empty_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "dequeue",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_dequeue),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "unaccounted_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_unaccounted_time),
		.read_map = blkiocg_file_read_map,
	},
#endif
};

static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	return cgroup_add_files(cgroup, subsys, blkio_files,
				ARRAY_SIZE(blkio_files));
}

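/*
 * Cgroup removal: unlink each blkio_group under blkcg->lock, then, with that
 * lock dropped, let the owning policy release the group; finally free all
 * per-device policy rules and the blkcg itself (unless it is the root).
 */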
static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	unsigned long flags;
	struct blkio_group *blkg;
	void *key;
	struct blkio_policy_type *blkiop;
	struct blkio_policy_node *pn, *pntmp;

	rcu_read_lock();
	do {
		spin_lock_irqsave(&blkcg->lock, flags);

		if (hlist_empty(&blkcg->blkg_list)) {
			spin_unlock_irqrestore(&blkcg->lock, flags);
			break;
		}

		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
					blkcg_node);
		key = rcu_dereference(blkg->key);
		__blkiocg_del_blkio_group(blkg);

		spin_unlock_irqrestore(&blkcg->lock, flags);

		/*
		 * This blkio_group is being unlinked as the associated cgroup
		 * is going away. Let all the IO controlling policies know
		 * about this event.
		 */
		spin_lock(&blkio_list_lock);
		list_for_each_entry(blkiop, &blkio_list, list) {
			if (blkiop->plid != blkg->plid)
				continue;
			blkiop->ops.blkio_unlink_group_fn(key, blkg);
		}
		spin_unlock(&blkio_list_lock);
	} while (1);

	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
		blkio_policy_delete_node(pn);
		kfree(pn);
	}

	free_css_id(&blkio_subsys, &blkcg->css);
	rcu_read_unlock();
	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

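/*
 * Allocate and initialise a blkio_cgroup; the statically allocated
 * blkio_root_cgroup is reused for the root cgroup.
 */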
static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	INIT_LIST_HEAD(&blkcg->policy_list);
	return &blkcg->css;
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures. For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *subsys,
				struct cgroup *cgroup, struct task_struct *tsk,
				bool threadgroup)
{
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	task_lock(tsk);
	ioc = tsk->io_context;
	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
		ret = -EINVAL;
	task_unlock(tsk);

	return ret;
}

static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
				struct cgroup *prev, struct task_struct *tsk,
				bool threadgroup)
{
	struct io_context *ioc;

	task_lock(tsk);
	ioc = tsk->io_context;
	if (ioc)
		ioc->cgroup_changed = 1;
	task_unlock(tsk);
}

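/*
 * IO policies (e.g. CFQ's proportional weight scheduling and blk-throttle)
 * register themselves here so the cgroup code can forward weight/limit
 * updates and group teardown events to them.
 */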
void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_add_tail(&blkiop->list, &blkio_list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_del_init(&blkiop->list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);

static int __init init_cgroup_blkio(void)
{
	return cgroup_load_subsys(&blkio_subsys);
}

static void __exit exit_cgroup_blkio(void)
{
	cgroup_unload_subsys(&blkio_subsys);
}

module_init(init_cgroup_blkio);
module_exit(exit_cgroup_blkio);
MODULE_LICENSE("GPL");