xref: /linux/net/sched/sch_htb.c (revision 1795cf48b322b4d19230a40dbe7181acedd34a94)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  */
28 #include <linux/module.h>
29 #include <linux/moduleparam.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/string.h>
33 #include <linux/errno.h>
34 #include <linux/skbuff.h>
35 #include <linux/list.h>
36 #include <linux/compiler.h>
37 #include <linux/rbtree.h>
38 #include <net/netlink.h>
39 #include <net/pkt_sched.h>
40 
41 /* HTB algorithm.
42     Author: devik@cdi.cz
43     ========================================================================
44     HTB is like TBF with multiple classes. It is also similar to CBQ because
45     it allows assigning a priority to each class in the hierarchy.
46     In fact it is another implementation of Floyd's formal sharing.
47 
48     Levels:
49     Each class is assigned a level. Leaves ALWAYS have level 0 and root
50     classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
51     one less than their parent.
52 */
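/* Illustrative example (hypothetical classids): with TC_HTB_MAXDEPTH == 8,
   a hierarchy 1:1 -> 1:10 -> 1:100 gives the root class 1:1 level 7, the
   interior class 1:10 level 6, and the leaf 1:100 level 0; leaves sit at
   level 0 regardless of how deep in the tree they are.
*/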
53 
54 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
55 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
56 
57 #if HTB_VER >> 16 != TC_HTB_PROTOVER
58 #error "Mismatched sch_htb.c and pkt_sch.h"
59 #endif
60 
61 /* Module parameter and sysfs export */
62 module_param    (htb_hysteresis, int, 0640);
63 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
64 
65 /* used internally to keep the status of a single class */
66 enum htb_cmode {
67 	HTB_CANT_SEND,		/* class can't send and can't borrow */
68 	HTB_MAY_BORROW,		/* class can't send but may borrow */
69 	HTB_CAN_SEND		/* class can send */
70 };
71 
72 /* interior & leaf nodes; props specific to leaves are marked L: */
73 struct htb_class {
74 	struct Qdisc_class_common common;
75 	/* general class parameters */
76 	struct gnet_stats_basic bstats;
77 	struct gnet_stats_queue qstats;
78 	struct gnet_stats_rate_est rate_est;
79 	struct tc_htb_xstats xstats;	/* our special stats */
80 	int refcnt;		/* usage count of this class */
81 
82 	/* topology */
83 	int level;		/* our level (see above) */
84 	unsigned int children;
85 	struct htb_class *parent;	/* parent class */
86 
87 	union {
88 		struct htb_class_leaf {
89 			struct Qdisc *q;
90 			int prio;
91 			int aprio;
92 			int quantum;
93 			int deficit[TC_HTB_MAXDEPTH];
94 			struct list_head drop_list;
95 		} leaf;
96 		struct htb_class_inner {
97 			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
98 			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
99 			/* When the class changes from state 1->2 and disconnects from
100 			   its parent's feed we lose the ptr value and start from the
101 			   first child again. Here we store the classid of the
102 			   last valid ptr (used when ptr is NULL). */
103 			u32 last_ptr_id[TC_HTB_NUMPRIO];
104 		} inner;
105 	} un;
106 	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
107 	struct rb_node pq_node;	/* node for event queue */
108 	psched_time_t pq_key;
109 
110 	int prio_activity;	/* for which prios are we active */
111 	enum htb_cmode cmode;	/* current mode of the class */
112 
113 	/* class attached filters */
114 	struct tcf_proto *filter_list;
115 	int filter_cnt;
116 
117 	int warned;		/* only one warning about a non-work-conserving class .. */
118 
119 	/* token bucket parameters */
120 	struct qdisc_rate_table *rate;	/* rate table of the class itself */
121 	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
122 	long buffer, cbuffer;	/* token bucket depth/rate */
123 	psched_tdiff_t mbuffer;	/* max wait time */
124 	long tokens, ctokens;	/* current number of tokens */
125 	psched_time_t t_c;	/* checkpoint time */
126 
127 	int prio;		/* backup of un.leaf.prio for htb_parent_to_leaf() */
128 	int quantum;		/* backup of un.leaf.quantum; eventually a full  */
129 				/* replacement of the un.leaf originals should be done. */
130 };
131 
132 static inline long L2T(struct htb_class *cl, struct qdisc_rate_table *rate,
133 			   int size)
134 {
135 	long result = qdisc_l2t(rate, size);
136 	return result;
137 }
138 
139 struct htb_sched {
140 	struct Qdisc_class_hash clhash;
141 	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
142 
143 	/* self list - roots of self generating tree */
144 	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
145 	int row_mask[TC_HTB_MAXDEPTH];
146 	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
147 	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
148 
149 	/* self wait list - roots of wait PQs per row */
150 	struct rb_root wait_pq[TC_HTB_MAXDEPTH];
151 
152 	/* time of nearest event per level (row) */
153 	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];
154 
155 	/* whether we hit a non-work-conserving class during this dequeue; we use */
156 	int nwc_hit;		/* this to suppress the mindelay complaint in dequeue */
157 
158 	int defcls;		/* class where unclassified flows go to */
159 
160 	/* filters for qdisc itself */
161 	struct tcf_proto *filter_list;
162 
163 	int rate2quantum;	/* quant = rate / rate2quantum */
164 	psched_time_t now;	/* cached dequeue time */
165 	struct qdisc_watchdog watchdog;
166 
167 	/* non shaped skbs; let them go directly thru */
168 	struct sk_buff_head direct_queue;
169 	int direct_qlen;	/* max qlen of above */
170 
171 	long direct_pkts;
172 };
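/* Example of how the self feed state above is read: row[level][prio] is the
 * rb-tree of active classes at that level and priority, and bit "prio" of
 * row_mask[level] mirrors whether that tree is non-empty; e.g. a value of
 * row_mask[0] == 0x3 means level 0 currently has active leaves at prios 0
 * and 1 only.
 */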
173 
174 /* find class in global hash table using given handle */
175 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
176 {
177 	struct htb_sched *q = qdisc_priv(sch);
178 	struct Qdisc_class_common *clc;
179 
180 	clc = qdisc_class_find(&q->clhash, handle);
181 	if (clc == NULL)
182 		return NULL;
183 	return container_of(clc, struct htb_class, common);
184 }
185 
186 /**
187  * htb_classify - classify a packet into class
188  *
189  * It returns NULL if the packet should be dropped or -1 (HTB_DIRECT) if it
190  * should be passed directly thru. In all other cases a leaf class is returned.
191  * We allow direct class selection by classid in skb->priority. Then we examine
192  * filters in the qdisc and in inner nodes (if a higher filter points to an
193  * inner node). If we end up with classid MAJOR:0 we enqueue the skb into the
194  * special internal fifo (direct). These packets then go directly thru. If we
195  * still have no valid leaf we try to use the MAJOR:default leaf. If that is
196  * still unsuccessful we finish and return the direct queue.
197  */
198 #define HTB_DIRECT (struct htb_class*)-1
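/* Worked example of the contract above (hypothetical handles): if this qdisc
 * is 1: (handle 0x10000), a packet with skb->priority == 0x10000 yields
 * HTB_DIRECT, skb->priority == 0x10001 yields class 1:1 provided it exists
 * and is a leaf, and a TC_ACT_SHOT verdict from a filter yields NULL (drop).
 */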
199 
200 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
201 				      int *qerr)
202 {
203 	struct htb_sched *q = qdisc_priv(sch);
204 	struct htb_class *cl;
205 	struct tcf_result res;
206 	struct tcf_proto *tcf;
207 	int result;
208 
209 	/* allow selecting a class by setting skb->priority to a valid classid;
210 	   note that the nfmark can be used too by attaching an fw filter with no
211 	   rules in it */
212 	if (skb->priority == sch->handle)
213 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
214 	if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
215 		return cl;
216 
217 	*qerr = NET_XMIT_BYPASS;
218 	tcf = q->filter_list;
219 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
220 #ifdef CONFIG_NET_CLS_ACT
221 		switch (result) {
222 		case TC_ACT_QUEUED:
223 		case TC_ACT_STOLEN:
224 			*qerr = NET_XMIT_SUCCESS;
225 		case TC_ACT_SHOT:
226 			return NULL;
227 		}
228 #endif
229 		if ((cl = (void *)res.class) == NULL) {
230 			if (res.classid == sch->handle)
231 				return HTB_DIRECT;	/* X:0 (direct flow) */
232 			if ((cl = htb_find(res.classid, sch)) == NULL)
233 				break;	/* filter selected invalid classid */
234 		}
235 		if (!cl->level)
236 			return cl;	/* we hit leaf; return it */
237 
238 		/* we have got inner class; apply inner filter chain */
239 		tcf = cl->filter_list;
240 	}
241 	/* classification failed; try to use default class */
242 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
243 	if (!cl || cl->level)
244 		return HTB_DIRECT;	/* bad default .. this is safe bet */
245 	return cl;
246 }
247 
248 /**
249  * htb_add_to_id_tree - adds class to the round robin list
250  *
251  * Routine adds the class to the list (actually a tree) sorted by classid.
252  * Make sure that the class is not already on such a list for the given prio.
253  */
254 static void htb_add_to_id_tree(struct rb_root *root,
255 			       struct htb_class *cl, int prio)
256 {
257 	struct rb_node **p = &root->rb_node, *parent = NULL;
258 
259 	while (*p) {
260 		struct htb_class *c;
261 		parent = *p;
262 		c = rb_entry(parent, struct htb_class, node[prio]);
263 
264 		if (cl->common.classid > c->common.classid)
265 			p = &parent->rb_right;
266 		else
267 			p = &parent->rb_left;
268 	}
269 	rb_link_node(&cl->node[prio], parent, p);
270 	rb_insert_color(&cl->node[prio], root);
271 }
272 
273 /**
274  * htb_add_to_wait_tree - adds class to the event queue with delay
275  *
276  * The class is added to the priority event queue to indicate that the class
277  * will change its mode at time cl->pq_key (now + delay). Make sure that the
278  * class is not already in the queue.
279  */
280 static void htb_add_to_wait_tree(struct htb_sched *q,
281 				 struct htb_class *cl, long delay)
282 {
283 	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
284 
285 	cl->pq_key = q->now + delay;
286 	if (cl->pq_key == q->now)
287 		cl->pq_key++;
288 
289 	/* update the nearest event cache */
290 	if (q->near_ev_cache[cl->level] > cl->pq_key)
291 		q->near_ev_cache[cl->level] = cl->pq_key;
292 
293 	while (*p) {
294 		struct htb_class *c;
295 		parent = *p;
296 		c = rb_entry(parent, struct htb_class, pq_node);
297 		if (cl->pq_key >= c->pq_key)
298 			p = &parent->rb_right;
299 		else
300 			p = &parent->rb_left;
301 	}
302 	rb_link_node(&cl->pq_node, parent, p);
303 	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
304 }
305 
306 /**
307  * htb_next_rb_node - advances *n to the next node in the binary tree
308  *
309  * *n becomes NULL when we are past the last key.
310  * Average complexity is 2 steps per call.
311  */
312 static inline void htb_next_rb_node(struct rb_node **n)
313 {
314 	*n = rb_next(*n);
315 }
316 
317 /**
318  * htb_add_class_to_row - add class to its row
319  *
320  * The class is added to row at priorities marked in mask.
321  * It does nothing if mask == 0.
322  */
323 static inline void htb_add_class_to_row(struct htb_sched *q,
324 					struct htb_class *cl, int mask)
325 {
326 	q->row_mask[cl->level] |= mask;
327 	while (mask) {
328 		int prio = ffz(~mask);
329 		mask &= ~(1 << prio);
330 		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
331 	}
332 }
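/* Worked example of the mask walk above: mask == 0x5 activates prios 0 and 2;
 * ffz(~0x5) yields 0 first, that bit is cleared, then ffz(~0x4) yields 2, so
 * htb_add_to_id_tree() runs only for row[level][0] and row[level][2].
 */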
333 
334 /* If this triggers, it is a bug in this code, but it need not be fatal */
335 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
336 {
337 	if (RB_EMPTY_NODE(rb)) {
338 		WARN_ON(1);
339 	} else {
340 		rb_erase(rb, root);
341 		RB_CLEAR_NODE(rb);
342 	}
343 }
344 
345 
346 /**
347  * htb_remove_class_from_row - removes class from its row
348  *
349  * The class is removed from row at priorities marked in mask.
350  * It does nothing if mask == 0.
351  */
352 static inline void htb_remove_class_from_row(struct htb_sched *q,
353 						 struct htb_class *cl, int mask)
354 {
355 	int m = 0;
356 
357 	while (mask) {
358 		int prio = ffz(~mask);
359 
360 		mask &= ~(1 << prio);
361 		if (q->ptr[cl->level][prio] == cl->node + prio)
362 			htb_next_rb_node(q->ptr[cl->level] + prio);
363 
364 		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
365 		if (!q->row[cl->level][prio].rb_node)
366 			m |= 1 << prio;
367 	}
368 	q->row_mask[cl->level] &= ~m;
369 }
370 
371 /**
372  * htb_activate_prios - creates the active class's feed chain
373  *
374  * The class is connected to its ancestors and/or the appropriate rows
375  * for the priorities it is participating in. cl->cmode must be the new
376  * (activated) mode. It does nothing if cl->prio_activity == 0.
377  */
378 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
379 {
380 	struct htb_class *p = cl->parent;
381 	long m, mask = cl->prio_activity;
382 
383 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
384 		m = mask;
385 		while (m) {
386 			int prio = ffz(~m);
387 			m &= ~(1 << prio);
388 
389 			if (p->un.inner.feed[prio].rb_node)
390 				/* parent already has its feed in use, so
391 				   reset the bit in mask as the parent is already ok */
392 				mask &= ~(1 << prio);
393 
394 			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
395 		}
396 		p->prio_activity |= mask;
397 		cl = p;
398 		p = cl->parent;
399 
400 	}
401 	if (cl->cmode == HTB_CAN_SEND && mask)
402 		htb_add_class_to_row(q, cl, mask);
403 }
404 
405 /**
406  * htb_deactivate_prios - remove class from feed chain
407  *
408  * cl->cmode must represent old mode (before deactivation). It does
409  * nothing if cl->prio_activity == 0. Class is removed from all feed
410  * chains and rows.
411  */
412 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
413 {
414 	struct htb_class *p = cl->parent;
415 	long m, mask = cl->prio_activity;
416 
417 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
418 		m = mask;
419 		mask = 0;
420 		while (m) {
421 			int prio = ffz(~m);
422 			m &= ~(1 << prio);
423 
424 			if (p->un.inner.ptr[prio] == cl->node + prio) {
425 				/* we are removing a child that the parent's feed
426 				   pointer points to - forget the pointer but remember
427 				   the classid */
428 				p->un.inner.last_ptr_id[prio] = cl->common.classid;
429 				p->un.inner.ptr[prio] = NULL;
430 			}
431 
432 			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
433 
434 			if (!p->un.inner.feed[prio].rb_node)
435 				mask |= 1 << prio;
436 		}
437 
438 		p->prio_activity &= ~mask;
439 		cl = p;
440 		p = cl->parent;
441 
442 	}
443 	if (cl->cmode == HTB_CAN_SEND && mask)
444 		htb_remove_class_from_row(q, cl, mask);
445 }
446 
447 static inline long htb_lowater(const struct htb_class *cl)
448 {
449 	if (htb_hysteresis)
450 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
451 	else
452 		return 0;
453 }
454 static inline long htb_hiwater(const struct htb_class *cl)
455 {
456 	if (htb_hysteresis)
457 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
458 	else
459 		return 0;
460 }
461 
462 
463 /**
464  * htb_class_mode - computes and returns current class mode
465  *
466  * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
467  * is not HTB_CAN_SEND then *diff is updated to the time difference from
468  * now to the time when cl will change its state.
469  * It is also worth noting that the class mode does not change simply
470  * at cl->{c,}tokens == 0; rather there can be hysteresis over the
471  * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
472  * mode transitions per time unit. The speed gain is about 1/6.
473  */
474 static inline enum htb_cmode
475 htb_class_mode(struct htb_class *cl, long *diff)
476 {
477 	long toks;
478 
479 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
480 		*diff = -toks;
481 		return HTB_CANT_SEND;
482 	}
483 
484 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
485 		return HTB_CAN_SEND;
486 
487 	*diff = -toks;
488 	return HTB_MAY_BORROW;
489 }
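/* Worked example of the computation above (hypothetical token counts, with
 * hysteresis off so both watermarks are 0): if cl->ctokens == 20000,
 * cl->tokens == -5000 and *diff == 1000, then ctokens + diff == 21000 >= 0
 * rules out HTB_CANT_SEND, tokens + diff == -4000 < 0 rules out HTB_CAN_SEND,
 * so the class is HTB_MAY_BORROW and *diff is set to 4000, the time (in
 * psched ticks) until the rate bucket would reach zero again.
 */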
490 
491 /**
492  * htb_change_class_mode - changes the class's mode
493  *
494  * This should be the only way to change a class's mode under normal
495  * circumstances. The routine will update the feed list linkage, change the
496  * mode and add the class to the wait event queue if appropriate. The new
497  * mode should differ from the old one and *diff has to be valid if changing
498  * to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
499  */
500 static void
501 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
502 {
503 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
504 
505 	if (new_mode == cl->cmode)
506 		return;
507 
508 	if (cl->prio_activity) {	/* not necessary: speed optimization */
509 		if (cl->cmode != HTB_CANT_SEND)
510 			htb_deactivate_prios(q, cl);
511 		cl->cmode = new_mode;
512 		if (new_mode != HTB_CANT_SEND)
513 			htb_activate_prios(q, cl);
514 	} else
515 		cl->cmode = new_mode;
516 }
517 
518 /**
519  * htb_activate - inserts leaf cl into appropriate active feeds
520  *
521  * Routine learns the (new) priority of the leaf and activates the feed
522  * chain for that prio. It can safely be called on an already active leaf.
523  * It also adds the leaf to the drop list.
524  */
525 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
526 {
527 	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
528 
529 	if (!cl->prio_activity) {
530 		cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio);
531 		htb_activate_prios(q, cl);
532 		list_add_tail(&cl->un.leaf.drop_list,
533 			      q->drops + cl->un.leaf.aprio);
534 	}
535 }
536 
537 /**
538  * htb_deactivate - remove leaf cl from active feeds
539  *
540  * Make sure that the leaf is active. In other words, it can't be called
541  * with a non-active leaf. It also removes the class from the drop list.
542  */
543 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
544 {
545 	WARN_ON(!cl->prio_activity);
546 
547 	htb_deactivate_prios(q, cl);
548 	cl->prio_activity = 0;
549 	list_del_init(&cl->un.leaf.drop_list);
550 }
551 
552 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
553 {
554 	int ret;
555 	struct htb_sched *q = qdisc_priv(sch);
556 	struct htb_class *cl = htb_classify(skb, sch, &ret);
557 
558 	if (cl == HTB_DIRECT) {
559 		/* enqueue to helper queue */
560 		if (q->direct_queue.qlen < q->direct_qlen) {
561 			__skb_queue_tail(&q->direct_queue, skb);
562 			q->direct_pkts++;
563 		} else {
564 			kfree_skb(skb);
565 			sch->qstats.drops++;
566 			return NET_XMIT_DROP;
567 		}
568 #ifdef CONFIG_NET_CLS_ACT
569 	} else if (!cl) {
570 		if (ret == NET_XMIT_BYPASS)
571 			sch->qstats.drops++;
572 		kfree_skb(skb);
573 		return ret;
574 #endif
575 	} else if (qdisc_enqueue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) {
576 		sch->qstats.drops++;
577 		cl->qstats.drops++;
578 		return NET_XMIT_DROP;
579 	} else {
580 		cl->bstats.packets +=
581 			skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
582 		cl->bstats.bytes += qdisc_pkt_len(skb);
583 		htb_activate(q, cl);
584 	}
585 
586 	sch->q.qlen++;
587 	sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
588 	sch->bstats.bytes += qdisc_pkt_len(skb);
589 	return NET_XMIT_SUCCESS;
590 }
591 
592 /* TODO: requeuing packet charges it to policers again !! */
593 static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch)
594 {
595 	int ret;
596 	struct htb_sched *q = qdisc_priv(sch);
597 	struct htb_class *cl = htb_classify(skb, sch, &ret);
598 	struct sk_buff *tskb;
599 
600 	if (cl == HTB_DIRECT) {
601 		/* enqueue to helper queue */
602 		if (q->direct_queue.qlen < q->direct_qlen) {
603 			__skb_queue_head(&q->direct_queue, skb);
604 		} else {
605 			__skb_queue_head(&q->direct_queue, skb);
606 			tskb = __skb_dequeue_tail(&q->direct_queue);
607 			kfree_skb(tskb);
608 			sch->qstats.drops++;
609 			return NET_XMIT_CN;
610 		}
611 #ifdef CONFIG_NET_CLS_ACT
612 	} else if (!cl) {
613 		if (ret == NET_XMIT_BYPASS)
614 			sch->qstats.drops++;
615 		kfree_skb(skb);
616 		return ret;
617 #endif
618 	} else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) !=
619 		   NET_XMIT_SUCCESS) {
620 		sch->qstats.drops++;
621 		cl->qstats.drops++;
622 		return NET_XMIT_DROP;
623 	} else
624 		htb_activate(q, cl);
625 
626 	sch->q.qlen++;
627 	sch->qstats.requeues++;
628 	return NET_XMIT_SUCCESS;
629 }
630 
631 /**
632  * htb_charge_class - charges amount "bytes" to leaf and ancestors
633  *
634  * Routine assumes that a packet "bytes" long was dequeued from leaf cl
635  * borrowing from "level". It accounts bytes to the ceil leaky bucket for
636  * the leaf and all ancestors and to the rate bucket for ancestors at levels
637  * "level" and higher. It also handles a possible change of mode resulting
638  * from the update. Note that the mode can also increase here (MAY_BORROW to
639  * CAN_SEND) because we can use a more precise clock here than the event
640  * queue. In such a case we remove the class from the event queue first.
641  */
642 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
643 			     int level, struct sk_buff *skb)
644 {
645 	int bytes = qdisc_pkt_len(skb);
646 	long toks, diff;
647 	enum htb_cmode old_mode;
648 
649 #define HTB_ACCNT(T,B,R) toks = diff + cl->T; \
650 	if (toks > cl->B) toks = cl->B; \
651 	toks -= L2T(cl, cl->R, bytes); \
652 	if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \
653 	cl->T = toks
654 
655 	while (cl) {
656 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
657 		if (cl->level >= level) {
658 			if (cl->level == level)
659 				cl->xstats.lends++;
660 			HTB_ACCNT(tokens, buffer, rate);
661 		} else {
662 			cl->xstats.borrows++;
663 			cl->tokens += diff;	/* we moved t_c; update tokens */
664 		}
665 		HTB_ACCNT(ctokens, cbuffer, ceil);
666 		cl->t_c = q->now;
667 
668 		old_mode = cl->cmode;
669 		diff = 0;
670 		htb_change_class_mode(q, cl, &diff);
671 		if (old_mode != cl->cmode) {
672 			if (old_mode != HTB_CAN_SEND)
673 				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
674 			if (cl->cmode != HTB_CAN_SEND)
675 				htb_add_to_wait_tree(q, cl, diff);
676 		}
677 
678 		/* update byte stats except for leaves which are already updated */
679 		if (cl->level) {
680 			cl->bstats.bytes += bytes;
681 			cl->bstats.packets += skb_is_gso(skb)?
682 					skb_shinfo(skb)->gso_segs:1;
683 		}
684 		cl = cl->parent;
685 	}
686 }
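/* Worked example of HTB_ACCNT() above (hypothetical numbers): with
 * diff == 1000, cl->tokens == 500, cl->buffer == 2000 and
 * L2T(cl, cl->rate, bytes) == 800, toks becomes 1000 + 500 = 1500 (below the
 * 2000 burst cap, so not clamped), minus 800 leaves cl->tokens == 700; had
 * toks fallen to -cl->mbuffer or lower it would be clamped to 1 - cl->mbuffer.
 */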
687 
688 /**
689  * htb_do_events - make mode changes to classes at the level
690  *
691  * Scans the event queue for pending events and applies them. Returns the
692  * time of the next pending event (0 if the pq holds no event).
693  * Note: only events with cl->pq_key <= q->now are applied.
694  */
695 static psched_time_t htb_do_events(struct htb_sched *q, int level)
696 {
697 	/* don't run for longer than 2 jiffies; 2 is used instead of
698 	   1 to simplify things when the jiffies counter is going to be
699 	   incremented too soon */
700 	unsigned long stop_at = jiffies + 2;
701 	while (time_before(jiffies, stop_at)) {
702 		struct htb_class *cl;
703 		long diff;
704 		struct rb_node *p = rb_first(&q->wait_pq[level]);
705 
706 		if (!p)
707 			return 0;
708 
709 		cl = rb_entry(p, struct htb_class, pq_node);
710 		if (cl->pq_key > q->now)
711 			return cl->pq_key;
712 
713 		htb_safe_rb_erase(p, q->wait_pq + level);
714 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
715 		htb_change_class_mode(q, cl, &diff);
716 		if (cl->cmode != HTB_CAN_SEND)
717 			htb_add_to_wait_tree(q, cl, diff);
718 	}
719 	/* too much load - let's continue on the next jiffy */
720 	return q->now + PSCHED_TICKS_PER_SEC / HZ;
721 }
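/* Example of the return value contract above: if the earliest entry in
 * wait_pq[level] has pq_key 12345 and q->now < 12345, then 12345 is returned
 * and cached by the caller in near_ev_cache[level]; an empty queue yields 0,
 * which htb_dequeue() replaces with q->now + PSCHED_TICKS_PER_SEC.
 */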
722 
723 /* Returns the class->node+prio from the id-tree whose classid is >= id. NULL
724    if no such one exists. */
725 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
726 					      u32 id)
727 {
728 	struct rb_node *r = NULL;
729 	while (n) {
730 		struct htb_class *cl =
731 		    rb_entry(n, struct htb_class, node[prio]);
732 		if (id == cl->common.classid)
733 			return n;
734 
735 		if (id > cl->common.classid) {
736 			n = n->rb_right;
737 		} else {
738 			r = n;
739 			n = n->rb_left;
740 		}
741 	}
742 	return r;
743 }
744 
745 /**
746  * htb_lookup_leaf - returns next leaf class in DRR order
747  *
748  * Find the leaf that the current feed pointers point to.
749  */
750 static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
751 					 struct rb_node **pptr, u32 * pid)
752 {
753 	int i;
754 	struct {
755 		struct rb_node *root;
756 		struct rb_node **pptr;
757 		u32 *pid;
758 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
759 
760 	WARN_ON(!tree->rb_node);
761 	sp->root = tree->rb_node;
762 	sp->pptr = pptr;
763 	sp->pid = pid;
764 
765 	for (i = 0; i < 65535; i++) {
766 		if (!*sp->pptr && *sp->pid) {
767 			/* ptr was invalidated but id is valid - try to recover
768 			   the original or next ptr */
769 			*sp->pptr =
770 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
771 		}
772 		*sp->pid = 0;	/* ptr is valid now, so remove this hint as it
773 				   can become out of date quickly */
774 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
775 			*sp->pptr = sp->root;
776 			while ((*sp->pptr)->rb_left)
777 				*sp->pptr = (*sp->pptr)->rb_left;
778 			if (sp > stk) {
779 				sp--;
780 				WARN_ON(!*sp->pptr);
781 				if (!*sp->pptr)
782 					return NULL;
783 				htb_next_rb_node(sp->pptr);
784 			}
785 		} else {
786 			struct htb_class *cl;
787 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
788 			if (!cl->level)
789 				return cl;
790 			(++sp)->root = cl->un.inner.feed[prio].rb_node;
791 			sp->pptr = cl->un.inner.ptr + prio;
792 			sp->pid = cl->un.inner.last_ptr_id + prio;
793 		}
794 	}
795 	WARN_ON(1);
796 	return NULL;
797 }
798 
799 /* dequeues a packet at the given priority and level; call only if
800    you are sure that there is an active class at prio/level */
801 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
802 					int level)
803 {
804 	struct sk_buff *skb = NULL;
805 	struct htb_class *cl, *start;
806 	/* look initial class up in the row */
807 	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
808 				     q->ptr[level] + prio,
809 				     q->last_ptr_id[level] + prio);
810 
811 	do {
812 next:
813 		WARN_ON(!cl);
814 		if (!cl)
815 			return NULL;
816 
817 		/* the class can be empty - it is unlikely but can happen if the
818 		   leaf qdisc drops packets in its enqueue routine or if someone
819 		   used a graft operation on the leaf since the last dequeue;
820 		   simply deactivate and skip such a class */
821 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
822 			struct htb_class *next;
823 			htb_deactivate(q, cl);
824 
825 			/* row/level might become empty */
826 			if ((q->row_mask[level] & (1 << prio)) == 0)
827 				return NULL;
828 
829 			next = htb_lookup_leaf(q->row[level] + prio,
830 					       prio, q->ptr[level] + prio,
831 					       q->last_ptr_id[level] + prio);
832 
833 			if (cl == start)	/* fix start if we just deleted it */
834 				start = next;
835 			cl = next;
836 			goto next;
837 		}
838 
839 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
840 		if (likely(skb != NULL))
841 			break;
842 		if (!cl->warned) {
843 			printk(KERN_WARNING
844 			       "htb: class %X isn't work conserving ?!\n",
845 			       cl->common.classid);
846 			cl->warned = 1;
847 		}
848 		q->nwc_hit++;
849 		htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
850 				  ptr[0]) + prio);
851 		cl = htb_lookup_leaf(q->row[level] + prio, prio,
852 				     q->ptr[level] + prio,
853 				     q->last_ptr_id[level] + prio);
854 
855 	} while (cl != start);
856 
857 	if (likely(skb != NULL)) {
858 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
859 		if (cl->un.leaf.deficit[level] < 0) {
860 			cl->un.leaf.deficit[level] += cl->un.leaf.quantum;
861 			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
862 					  ptr[0]) + prio);
863 		}
864 		/* this used to be after charge_class but this constellation
865 		   gives us slightly better performance */
866 		if (!cl->un.leaf.q->q.qlen)
867 			htb_deactivate(q, cl);
868 		htb_charge_class(q, cl, level, skb);
869 	}
870 	return skb;
871 }
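/* Worked example of the deficit round robin step above (hypothetical numbers):
 * with cl->un.leaf.quantum == 1500 and deficit[level] == 200, dequeuing a
 * 1400 byte packet leaves the deficit at -1200; since that is negative the
 * quantum is added back (deficit becomes 300) and the feed pointer advances,
 * so the next class at this prio gets its turn.
 */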
872 
873 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
874 {
875 	struct sk_buff *skb = NULL;
876 	struct htb_sched *q = qdisc_priv(sch);
877 	int level;
878 	psched_time_t next_event;
879 
880 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
881 	skb = __skb_dequeue(&q->direct_queue);
882 	if (skb != NULL) {
883 		sch->flags &= ~TCQ_F_THROTTLED;
884 		sch->q.qlen--;
885 		return skb;
886 	}
887 
888 	if (!sch->q.qlen)
889 		goto fin;
890 	q->now = psched_get_time();
891 
892 	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
893 	q->nwc_hit = 0;
894 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
895 		/* common case optimization - skip event handler quickly */
896 		int m;
897 		psched_time_t event;
898 
899 		if (q->now >= q->near_ev_cache[level]) {
900 			event = htb_do_events(q, level);
901 			if (!event)
902 				event = q->now + PSCHED_TICKS_PER_SEC;
903 			q->near_ev_cache[level] = event;
904 		} else
905 			event = q->near_ev_cache[level];
906 
907 		if (event && next_event > event)
908 			next_event = event;
909 
910 		m = ~q->row_mask[level];
911 		while (m != (int)(-1)) {
912 			int prio = ffz(m);
913 			m |= 1 << prio;
914 			skb = htb_dequeue_tree(q, prio, level);
915 			if (likely(skb != NULL)) {
916 				sch->q.qlen--;
917 				sch->flags &= ~TCQ_F_THROTTLED;
918 				goto fin;
919 			}
920 		}
921 	}
922 	sch->qstats.overlimits++;
923 	qdisc_watchdog_schedule(&q->watchdog, next_event);
924 fin:
925 	return skb;
926 }
927 
928 /* try to drop from each class (by prio) until one succeeds */
929 static unsigned int htb_drop(struct Qdisc *sch)
930 {
931 	struct htb_sched *q = qdisc_priv(sch);
932 	int prio;
933 
934 	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
935 		struct list_head *p;
936 		list_for_each(p, q->drops + prio) {
937 			struct htb_class *cl = list_entry(p, struct htb_class,
938 							  un.leaf.drop_list);
939 			unsigned int len;
940 			if (cl->un.leaf.q->ops->drop &&
941 			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
942 				sch->q.qlen--;
943 				if (!cl->un.leaf.q->q.qlen)
944 					htb_deactivate(q, cl);
945 				return len;
946 			}
947 		}
948 	}
949 	return 0;
950 }
951 
952 /* reset all classes */
953 /* always called under BH & queue lock */
954 static void htb_reset(struct Qdisc *sch)
955 {
956 	struct htb_sched *q = qdisc_priv(sch);
957 	struct htb_class *cl;
958 	struct hlist_node *n;
959 	unsigned int i;
960 
961 	for (i = 0; i < q->clhash.hashsize; i++) {
962 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
963 			if (cl->level)
964 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
965 			else {
966 				if (cl->un.leaf.q)
967 					qdisc_reset(cl->un.leaf.q);
968 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
969 			}
970 			cl->prio_activity = 0;
971 			cl->cmode = HTB_CAN_SEND;
972 
973 		}
974 	}
975 	qdisc_watchdog_cancel(&q->watchdog);
976 	__skb_queue_purge(&q->direct_queue);
977 	sch->q.qlen = 0;
978 	memset(q->row, 0, sizeof(q->row));
979 	memset(q->row_mask, 0, sizeof(q->row_mask));
980 	memset(q->wait_pq, 0, sizeof(q->wait_pq));
981 	memset(q->ptr, 0, sizeof(q->ptr));
982 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
983 		INIT_LIST_HEAD(q->drops + i);
984 }
985 
986 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
987 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
988 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
989 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
990 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
991 };
992 
993 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
994 {
995 	struct htb_sched *q = qdisc_priv(sch);
996 	struct nlattr *tb[TCA_HTB_INIT + 1];
997 	struct tc_htb_glob *gopt;
998 	int err;
999 	int i;
1000 
1001 	if (!opt)
1002 		return -EINVAL;
1003 
1004 	err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);
1005 	if (err < 0)
1006 		return err;
1007 
1008 	if (tb[TCA_HTB_INIT] == NULL) {
1009 		printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
1010 		return -EINVAL;
1011 	}
1012 	gopt = nla_data(tb[TCA_HTB_INIT]);
1013 	if (gopt->version != HTB_VER >> 16) {
1014 		printk(KERN_ERR
1015 		       "HTB: need tc/htb version %d (minor is %d), you have %d\n",
1016 		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
1017 		return -EINVAL;
1018 	}
1019 
1020 	err = qdisc_class_hash_init(&q->clhash);
1021 	if (err < 0)
1022 		return err;
1023 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1024 		INIT_LIST_HEAD(q->drops + i);
1025 
1026 	qdisc_watchdog_init(&q->watchdog, sch);
1027 	skb_queue_head_init(&q->direct_queue);
1028 
1029 	q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1030 	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
1031 		q->direct_qlen = 2;
1032 
1033 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1034 		q->rate2quantum = 1;
1035 	q->defcls = gopt->defcls;
1036 
1037 	return 0;
1038 }
1039 
1040 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1041 {
1042 	spinlock_t *root_lock = qdisc_root_lock(sch);
1043 	struct htb_sched *q = qdisc_priv(sch);
1044 	struct nlattr *nest;
1045 	struct tc_htb_glob gopt;
1046 
1047 	spin_lock_bh(root_lock);
1048 
1049 	gopt.direct_pkts = q->direct_pkts;
1050 	gopt.version = HTB_VER;
1051 	gopt.rate2quantum = q->rate2quantum;
1052 	gopt.defcls = q->defcls;
1053 	gopt.debug = 0;
1054 
1055 	nest = nla_nest_start(skb, TCA_OPTIONS);
1056 	if (nest == NULL)
1057 		goto nla_put_failure;
1058 	NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
1059 	nla_nest_end(skb, nest);
1060 
1061 	spin_unlock_bh(root_lock);
1062 	return skb->len;
1063 
1064 nla_put_failure:
1065 	spin_unlock_bh(root_lock);
1066 	nla_nest_cancel(skb, nest);
1067 	return -1;
1068 }
1069 
1070 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1071 			  struct sk_buff *skb, struct tcmsg *tcm)
1072 {
1073 	struct htb_class *cl = (struct htb_class *)arg;
1074 	spinlock_t *root_lock = qdisc_root_lock(sch);
1075 	struct nlattr *nest;
1076 	struct tc_htb_opt opt;
1077 
1078 	spin_lock_bh(root_lock);
1079 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1080 	tcm->tcm_handle = cl->common.classid;
1081 	if (!cl->level && cl->un.leaf.q)
1082 		tcm->tcm_info = cl->un.leaf.q->handle;
1083 
1084 	nest = nla_nest_start(skb, TCA_OPTIONS);
1085 	if (nest == NULL)
1086 		goto nla_put_failure;
1087 
1088 	memset(&opt, 0, sizeof(opt));
1089 
1090 	opt.rate = cl->rate->rate;
1091 	opt.buffer = cl->buffer;
1092 	opt.ceil = cl->ceil->rate;
1093 	opt.cbuffer = cl->cbuffer;
1094 	opt.quantum = cl->un.leaf.quantum;
1095 	opt.prio = cl->un.leaf.prio;
1096 	opt.level = cl->level;
1097 	NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
1098 
1099 	nla_nest_end(skb, nest);
1100 	spin_unlock_bh(root_lock);
1101 	return skb->len;
1102 
1103 nla_put_failure:
1104 	spin_unlock_bh(root_lock);
1105 	nla_nest_cancel(skb, nest);
1106 	return -1;
1107 }
1108 
1109 static int
1110 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1111 {
1112 	struct htb_class *cl = (struct htb_class *)arg;
1113 
1114 	if (!cl->level && cl->un.leaf.q)
1115 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1116 	cl->xstats.tokens = cl->tokens;
1117 	cl->xstats.ctokens = cl->ctokens;
1118 
1119 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1120 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1121 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
1122 		return -1;
1123 
1124 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1125 }
1126 
1127 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1128 		     struct Qdisc **old)
1129 {
1130 	struct htb_class *cl = (struct htb_class *)arg;
1131 
1132 	if (cl && !cl->level) {
1133 		if (new == NULL &&
1134 		    (new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1135 					     &pfifo_qdisc_ops,
1136 					     cl->common.classid))
1137 		    == NULL)
1138 			return -ENOBUFS;
1139 		sch_tree_lock(sch);
1140 		if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) {
1141 			qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1142 			qdisc_reset(*old);
1143 		}
1144 		sch_tree_unlock(sch);
1145 		return 0;
1146 	}
1147 	return -ENOENT;
1148 }
1149 
1150 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1151 {
1152 	struct htb_class *cl = (struct htb_class *)arg;
1153 	return (cl && !cl->level) ? cl->un.leaf.q : NULL;
1154 }
1155 
1156 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1157 {
1158 	struct htb_class *cl = (struct htb_class *)arg;
1159 
1160 	if (cl->un.leaf.q->q.qlen == 0)
1161 		htb_deactivate(qdisc_priv(sch), cl);
1162 }
1163 
1164 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1165 {
1166 	struct htb_class *cl = htb_find(classid, sch);
1167 	if (cl)
1168 		cl->refcnt++;
1169 	return (unsigned long)cl;
1170 }
1171 
1172 static inline int htb_parent_last_child(struct htb_class *cl)
1173 {
1174 	if (!cl->parent)
1175 		/* the root class */
1176 		return 0;
1177 	if (cl->parent->children > 1)
1178 		/* not the last child */
1179 		return 0;
1180 	return 1;
1181 }
1182 
1183 static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1184 			       struct Qdisc *new_q)
1185 {
1186 	struct htb_class *parent = cl->parent;
1187 
1188 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1189 
1190 	if (parent->cmode != HTB_CAN_SEND)
1191 		htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);
1192 
1193 	parent->level = 0;
1194 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1195 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1196 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1197 	parent->un.leaf.quantum = parent->quantum;
1198 	parent->un.leaf.prio = parent->prio;
1199 	parent->tokens = parent->buffer;
1200 	parent->ctokens = parent->cbuffer;
1201 	parent->t_c = psched_get_time();
1202 	parent->cmode = HTB_CAN_SEND;
1203 }
1204 
1205 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1206 {
1207 	if (!cl->level) {
1208 		WARN_ON(!cl->un.leaf.q);
1209 		qdisc_destroy(cl->un.leaf.q);
1210 	}
1211 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1212 	qdisc_put_rtab(cl->rate);
1213 	qdisc_put_rtab(cl->ceil);
1214 
1215 	tcf_destroy_chain(&cl->filter_list);
1216 	kfree(cl);
1217 }
1218 
1219 /* always called under BH & queue lock */
1220 static void htb_destroy(struct Qdisc *sch)
1221 {
1222 	struct htb_sched *q = qdisc_priv(sch);
1223 	struct hlist_node *n, *next;
1224 	struct htb_class *cl;
1225 	unsigned int i;
1226 
1227 	qdisc_watchdog_cancel(&q->watchdog);
1228 	/* This line used to be after htb_destroy_class call below
1229 	   and surprisingly it worked in 2.4. But it must precede it
1230 	   because a filter needs its target class alive to be able to call
1231 	   unbind_filter on it (without an Oops). */
1232 	tcf_destroy_chain(&q->filter_list);
1233 
1234 	for (i = 0; i < q->clhash.hashsize; i++) {
1235 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
1236 			tcf_destroy_chain(&cl->filter_list);
1237 	}
1238 	for (i = 0; i < q->clhash.hashsize; i++) {
1239 		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
1240 					  common.hnode)
1241 			htb_destroy_class(sch, cl);
1242 	}
1243 	qdisc_class_hash_destroy(&q->clhash);
1244 	__skb_queue_purge(&q->direct_queue);
1245 }
1246 
1247 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1248 {
1249 	struct htb_sched *q = qdisc_priv(sch);
1250 	struct htb_class *cl = (struct htb_class *)arg;
1251 	unsigned int qlen;
1252 	struct Qdisc *new_q = NULL;
1253 	int last_child = 0;
1254 
1255 	// TODO: why don't we allow deleting a subtree? references? does the
1256 	// tc subsys guarantee us that in htb_destroy it holds no class
1257 	// refs so that we can remove children safely there?
1258 	if (cl->children || cl->filter_cnt)
1259 		return -EBUSY;
1260 
1261 	if (!cl->level && htb_parent_last_child(cl)) {
1262 		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1263 					  &pfifo_qdisc_ops,
1264 					  cl->parent->common.classid);
1265 		last_child = 1;
1266 	}
1267 
1268 	sch_tree_lock(sch);
1269 
1270 	if (!cl->level) {
1271 		qlen = cl->un.leaf.q->q.qlen;
1272 		qdisc_reset(cl->un.leaf.q);
1273 		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
1274 	}
1275 
1276 	/* delete from hash and active; remainder in destroy_class */
1277 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1278 	cl->parent->children--;
1279 
1280 	if (cl->prio_activity)
1281 		htb_deactivate(q, cl);
1282 
1283 	if (cl->cmode != HTB_CAN_SEND)
1284 		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
1285 
1286 	if (last_child)
1287 		htb_parent_to_leaf(q, cl, new_q);
1288 
1289 	if (--cl->refcnt == 0)
1290 		htb_destroy_class(sch, cl);
1291 
1292 	sch_tree_unlock(sch);
1293 	return 0;
1294 }
1295 
1296 static void htb_put(struct Qdisc *sch, unsigned long arg)
1297 {
1298 	struct htb_class *cl = (struct htb_class *)arg;
1299 
1300 	if (--cl->refcnt == 0)
1301 		htb_destroy_class(sch, cl);
1302 }
1303 
1304 static int htb_change_class(struct Qdisc *sch, u32 classid,
1305 			    u32 parentid, struct nlattr **tca,
1306 			    unsigned long *arg)
1307 {
1308 	int err = -EINVAL;
1309 	struct htb_sched *q = qdisc_priv(sch);
1310 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1311 	struct nlattr *opt = tca[TCA_OPTIONS];
1312 	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
1313 	struct nlattr *tb[TCA_HTB_RTAB + 1];
1314 	struct tc_htb_opt *hopt;
1315 
1316 	/* extract all subattrs from opt attr */
1317 	if (!opt)
1318 		goto failure;
1319 
1320 	err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy);
1321 	if (err < 0)
1322 		goto failure;
1323 
1324 	err = -EINVAL;
1325 	if (tb[TCA_HTB_PARMS] == NULL)
1326 		goto failure;
1327 
1328 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1329 
1330 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1331 
1332 	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
1333 	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
1334 	if (!rtab || !ctab)
1335 		goto failure;
1336 
1337 	if (!cl) {		/* new class */
1338 		struct Qdisc *new_q;
1339 		int prio;
1340 		struct {
1341 			struct nlattr		nla;
1342 			struct gnet_estimator	opt;
1343 		} est = {
1344 			.nla = {
1345 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1346 				.nla_type	= TCA_RATE,
1347 			},
1348 			.opt = {
1349 				/* 4s interval, 16s averaging constant */
1350 				.interval	= 2,
1351 				.ewma_log	= 2,
1352 			},
1353 		};
1354 
1355 		/* check for valid classid */
1356 		if (!classid || TC_H_MAJ(classid ^ sch->handle)
1357 		    || htb_find(classid, sch))
1358 			goto failure;
1359 
1360 		/* check maximal depth */
1361 		if (parent && parent->parent && parent->parent->level < 2) {
1362 			printk(KERN_ERR "htb: tree is too deep\n");
1363 			goto failure;
1364 		}
1365 		err = -ENOBUFS;
1366 		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
1367 			goto failure;
1368 
1369 		gen_new_estimator(&cl->bstats, &cl->rate_est,
1370 				  qdisc_root_lock(sch),
1371 				  tca[TCA_RATE] ? : &est.nla);
1372 		cl->refcnt = 1;
1373 		cl->children = 0;
1374 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1375 		RB_CLEAR_NODE(&cl->pq_node);
1376 
1377 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1378 			RB_CLEAR_NODE(&cl->node[prio]);
1379 
1380 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1381 		   so it can't be used inside of sch_tree_lock
1382 		   -- thanks to Karlis Peisenieks */
1383 		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1384 					  &pfifo_qdisc_ops, classid);
1385 		sch_tree_lock(sch);
1386 		if (parent && !parent->level) {
1387 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1388 
1389 			/* turn parent into inner node */
1390 			qdisc_reset(parent->un.leaf.q);
1391 			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
1392 			qdisc_destroy(parent->un.leaf.q);
1393 			if (parent->prio_activity)
1394 				htb_deactivate(q, parent);
1395 
1396 			/* remove from evt list because of level change */
1397 			if (parent->cmode != HTB_CAN_SEND) {
1398 				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
1399 				parent->cmode = HTB_CAN_SEND;
1400 			}
1401 			parent->level = (parent->parent ? parent->parent->level
1402 					 : TC_HTB_MAXDEPTH) - 1;
1403 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1404 		}
1405 		/* leaf (we) needs elementary qdisc */
1406 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1407 
1408 		cl->common.classid = classid;
1409 		cl->parent = parent;
1410 
1411 		/* set class to be in HTB_CAN_SEND state */
1412 		cl->tokens = hopt->buffer;
1413 		cl->ctokens = hopt->cbuffer;
1414 		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
1415 		cl->t_c = psched_get_time();
1416 		cl->cmode = HTB_CAN_SEND;
1417 
1418 		/* attach to the hash list and parent's family */
1419 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1420 		if (parent)
1421 			parent->children++;
1422 	} else {
1423 		if (tca[TCA_RATE])
1424 			gen_replace_estimator(&cl->bstats, &cl->rate_est,
1425 					      qdisc_root_lock(sch),
1426 					      tca[TCA_RATE]);
1427 		sch_tree_lock(sch);
1428 	}
1429 
1430 	/* there used to be a nasty bug here: we have to check that the node
1431 	   is really a leaf before changing cl->un.leaf ! */
1432 	if (!cl->level) {
1433 		cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum;
1434 		if (!hopt->quantum && cl->un.leaf.quantum < 1000) {
1435 			printk(KERN_WARNING
1436 			       "HTB: quantum of class %X is small. Consider r2q change.\n",
1437 			       cl->common.classid);
1438 			cl->un.leaf.quantum = 1000;
1439 		}
1440 		if (!hopt->quantum && cl->un.leaf.quantum > 200000) {
1441 			printk(KERN_WARNING
1442 			       "HTB: quantum of class %X is big. Consider r2q change.\n",
1443 			       cl->common.classid);
1444 			cl->un.leaf.quantum = 200000;
1445 		}
1446 		if (hopt->quantum)
1447 			cl->un.leaf.quantum = hopt->quantum;
1448 		if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO)
1449 			cl->un.leaf.prio = TC_HTB_NUMPRIO - 1;
1450 
1451 		/* backup for htb_parent_to_leaf */
1452 		cl->quantum = cl->un.leaf.quantum;
1453 		cl->prio = cl->un.leaf.prio;
1454 	}
1455 
1456 	cl->buffer = hopt->buffer;
1457 	cl->cbuffer = hopt->cbuffer;
1458 	if (cl->rate)
1459 		qdisc_put_rtab(cl->rate);
1460 	cl->rate = rtab;
1461 	if (cl->ceil)
1462 		qdisc_put_rtab(cl->ceil);
1463 	cl->ceil = ctab;
1464 	sch_tree_unlock(sch);
1465 
1466 	qdisc_class_hash_grow(sch, &q->clhash);
1467 
1468 	*arg = (unsigned long)cl;
1469 	return 0;
1470 
1471 failure:
1472 	if (rtab)
1473 		qdisc_put_rtab(rtab);
1474 	if (ctab)
1475 		qdisc_put_rtab(ctab);
1476 	return err;
1477 }
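/* Worked example of the quantum checks above (hypothetical rates, assuming
 * rate2quantum == 10, tc's usual default r2q): a class with rate 120000 B/s
 * gets quantum 120000 / 10 = 12000 bytes, while a 5000 B/s class would get
 * 500, which is below 1000 and therefore clamped to 1000 with the "quantum
 * of class %X is small" warning, unless an explicit quantum was supplied.
 */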
1478 
1479 static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1480 {
1481 	struct htb_sched *q = qdisc_priv(sch);
1482 	struct htb_class *cl = (struct htb_class *)arg;
1483 	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1484 
1485 	return fl;
1486 }
1487 
1488 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1489 				     u32 classid)
1490 {
1491 	struct htb_class *cl = htb_find(classid, sch);
1492 
1493 	/*if (cl && !cl->level) return 0;
1494 	   The line above used to be there to prevent attaching filters to
1495 	   leaves. But at least the tc_index filter uses this just to get the
1496 	   class for other reasons, so we have to allow it.
1497 	   ----
1498 	   19.6.2002 As Werner explained it is ok - bind filter is just
1499 	   another way to "lock" the class - unlike "get" this lock can
1500 	   be broken by class during destroy IIUC.
1501 	 */
1502 	if (cl)
1503 		cl->filter_cnt++;
1504 	return (unsigned long)cl;
1505 }
1506 
1507 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1508 {
1509 	struct htb_class *cl = (struct htb_class *)arg;
1510 
1511 	if (cl)
1512 		cl->filter_cnt--;
1513 }
1514 
1515 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1516 {
1517 	struct htb_sched *q = qdisc_priv(sch);
1518 	struct htb_class *cl;
1519 	struct hlist_node *n;
1520 	unsigned int i;
1521 
1522 	if (arg->stop)
1523 		return;
1524 
1525 	for (i = 0; i < q->clhash.hashsize; i++) {
1526 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
1527 			if (arg->count < arg->skip) {
1528 				arg->count++;
1529 				continue;
1530 			}
1531 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1532 				arg->stop = 1;
1533 				return;
1534 			}
1535 			arg->count++;
1536 		}
1537 	}
1538 }
1539 
1540 static const struct Qdisc_class_ops htb_class_ops = {
1541 	.graft		=	htb_graft,
1542 	.leaf		=	htb_leaf,
1543 	.qlen_notify	=	htb_qlen_notify,
1544 	.get		=	htb_get,
1545 	.put		=	htb_put,
1546 	.change		=	htb_change_class,
1547 	.delete		=	htb_delete,
1548 	.walk		=	htb_walk,
1549 	.tcf_chain	=	htb_find_tcf,
1550 	.bind_tcf	=	htb_bind_filter,
1551 	.unbind_tcf	=	htb_unbind_filter,
1552 	.dump		=	htb_dump_class,
1553 	.dump_stats	=	htb_dump_class_stats,
1554 };
1555 
1556 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1557 	.next		=	NULL,
1558 	.cl_ops		=	&htb_class_ops,
1559 	.id		=	"htb",
1560 	.priv_size	=	sizeof(struct htb_sched),
1561 	.enqueue	=	htb_enqueue,
1562 	.dequeue	=	htb_dequeue,
1563 	.requeue	=	htb_requeue,
1564 	.drop		=	htb_drop,
1565 	.init		=	htb_init,
1566 	.reset		=	htb_reset,
1567 	.destroy	=	htb_destroy,
1568 	.change		=	NULL /* htb_change */,
1569 	.dump		=	htb_dump,
1570 	.owner		=	THIS_MODULE,
1571 };
1572 
1573 static int __init htb_module_init(void)
1574 {
1575 	return register_qdisc(&htb_qdisc_ops);
1576 }
1577 static void __exit htb_module_exit(void)
1578 {
1579 	unregister_qdisc(&htb_qdisc_ops);
1580 }
1581 
1582 module_init(htb_module_init)
1583 module_exit(htb_module_exit)
1584 MODULE_LICENSE("GPL");
1585