xref: /linux/net/sched/sch_htb.c (revision 367b8112fe2ea5c39a7bb4d263dcdd9b612fae18)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  */
28 #include <linux/module.h>
29 #include <linux/moduleparam.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/string.h>
33 #include <linux/errno.h>
34 #include <linux/skbuff.h>
35 #include <linux/list.h>
36 #include <linux/compiler.h>
37 #include <linux/rbtree.h>
38 #include <net/netlink.h>
39 #include <net/pkt_sched.h>
40 
41 /* HTB algorithm.
42     Author: devik@cdi.cz
43     ========================================================================
44     HTB is like TBF with multiple classes. It is also similar to CBQ because
45     it allows assigning a priority to each class in the hierarchy.
46     In fact it is another implementation of Floyd's formal sharing.
47 
48     Levels:
49     Each class is assigned a level. A leaf ALWAYS has level 0 and root
50     classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
51     one less than their parent.
52 */
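/*
 * Illustrative example (not taken from this file): a minimal tc setup that
 * builds such a hierarchy, assuming a device eth0 and arbitrary classids:
 *
 *   tc qdisc add dev eth0 root handle 1: htb default 20
 *   tc class add dev eth0 parent 1:  classid 1:1  htb rate 1mbit ceil 1mbit
 *   tc class add dev eth0 parent 1:1 classid 1:10 htb rate 512kbit ceil 1mbit
 *   tc class add dev eth0 parent 1:1 classid 1:20 htb rate 256kbit ceil 1mbit
 *
 * Here 1:10 and 1:20 are leaves (level 0) that may borrow unused rate from
 * their parent 1:1 up to their ceil; unclassified traffic falls back to 1:20
 * via "default 20" (see q->defcls below).
 */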
53 
54 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
55 #define HTB_VER 0x30011		/* major must match the version number supplied by TC */
56 
57 #if HTB_VER >> 16 != TC_HTB_PROTOVER
58 #error "Mismatched sch_htb.c and pkt_sch.h"
59 #endif
60 
61 /* Module parameter and sysfs export */
62 module_param    (htb_hysteresis, int, 0640);
63 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
64 
65 /* used internally to keep the status of a single class */
66 enum htb_cmode {
67 	HTB_CANT_SEND,		/* class can't send and can't borrow */
68 	HTB_MAY_BORROW,		/* class can't send but may borrow */
69 	HTB_CAN_SEND		/* class can send */
70 };
71 
72 /* interior & leaf nodes; props specific to leaves are marked L: */
73 struct htb_class {
74 	struct Qdisc_class_common common;
75 	/* general class parameters */
76 	struct gnet_stats_basic bstats;
77 	struct gnet_stats_queue qstats;
78 	struct gnet_stats_rate_est rate_est;
79 	struct tc_htb_xstats xstats;	/* our special stats */
80 	int refcnt;		/* usage count of this class */
81 
82 	/* topology */
83 	int level;		/* our level (see above) */
84 	unsigned int children;
85 	struct htb_class *parent;	/* parent class */
86 
87 	union {
88 		struct htb_class_leaf {
89 			struct Qdisc *q;
90 			int prio;
91 			int aprio;
92 			int quantum;
93 			int deficit[TC_HTB_MAXDEPTH];
94 			struct list_head drop_list;
95 		} leaf;
96 		struct htb_class_inner {
97 			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
98 			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
99 			/* When a class changes from state 1->2 and disconnects from
100 			   its parent's feed, we lose the ptr value and start from the
101 			   first child again. Here we store the classid of the
102 			   last valid ptr (used when ptr is NULL). */
103 			u32 last_ptr_id[TC_HTB_NUMPRIO];
104 		} inner;
105 	} un;
106 	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
107 	struct rb_node pq_node;	/* node for event queue */
108 	psched_time_t pq_key;
109 
110 	int prio_activity;	/* for which prios are we active */
111 	enum htb_cmode cmode;	/* current mode of the class */
112 
113 	/* class attached filters */
114 	struct tcf_proto *filter_list;
115 	int filter_cnt;
116 
117 	int warned;		/* only one warning about a non-work-conserving class .. */
118 
119 	/* token bucket parameters */
120 	struct qdisc_rate_table *rate;	/* rate table of the class itself */
121 	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
122 	long buffer, cbuffer;	/* token bucket depth/rate */
123 	psched_tdiff_t mbuffer;	/* max wait time */
124 	long tokens, ctokens;	/* current number of tokens */
125 	psched_time_t t_c;	/* checkpoint time */
126 
127 	int prio;		/* backup of un.leaf.prio for htb_parent_to_leaf */
128 	int quantum;		/* backup of un.leaf.quantum; eventually the */
129 				/* un.leaf originals should be fully replaced. */
130 };
131 
132 static inline long L2T(struct htb_class *cl, struct qdisc_rate_table *rate,
133 			   int size)
134 {
135 	long result = qdisc_l2t(rate, size);
136 	return result;
137 }
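/* Note (illustrative): qdisc_l2t() looks up, in the precomputed rate table
 * supplied by userspace, the time (in psched ticks) needed to transmit "size"
 * bytes at the table's configured rate, so L2T() is effectively a
 * length-to-time conversion for the token buckets below.
 */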
138 
139 struct htb_sched {
140 	struct Qdisc_class_hash clhash;
141 	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
142 
143 	/* self list - roots of self generating tree */
144 	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
145 	int row_mask[TC_HTB_MAXDEPTH];
146 	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
147 	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
148 
149 	/* self wait list - roots of wait PQs per row */
150 	struct rb_root wait_pq[TC_HTB_MAXDEPTH];
151 
152 	/* time of nearest event per level (row) */
153 	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];
154 
155 	/* whether we hit a non-work-conserving class during this dequeue; we use */
156 	int nwc_hit;		/* this to disable the mindelay complaint in dequeue */
157 
158 	int defcls;		/* class where unclassified flows go to */
159 
160 	/* filters for qdisc itself */
161 	struct tcf_proto *filter_list;
162 
163 	int rate2quantum;	/* quant = rate / rate2quantum */
164 	psched_time_t now;	/* cached dequeue time */
165 	struct qdisc_watchdog watchdog;
166 
167 	/* non shaped skbs; let them go directly thru */
168 	struct sk_buff_head direct_queue;
169 	int direct_qlen;	/* max qlen of above */
170 
171 	long direct_pkts;
172 };
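/* Rough picture of the feed structure (illustrative, not from the original
 * comments): row[level][prio] holds active classes of that level which are in
 * HTB_CAN_SEND mode ("self feed"), while each inner class's un.inner.feed[prio]
 * holds its active children that are in HTB_MAY_BORROW mode and thus feed off
 * the parent's tokens. htb_dequeue() walks the rows; htb_lookup_leaf() then
 * descends through the inner feeds down to a leaf.
 */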
173 
174 /* find class in global hash table using given handle */
175 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
176 {
177 	struct htb_sched *q = qdisc_priv(sch);
178 	struct Qdisc_class_common *clc;
179 
180 	clc = qdisc_class_find(&q->clhash, handle);
181 	if (clc == NULL)
182 		return NULL;
183 	return container_of(clc, struct htb_class, common);
184 }
185 
186 /**
187  * htb_classify - classify a packet into class
188  *
189  * It returns NULL if the packet should be dropped or -1 if the packet
190  * should be passed directly thru. In all other cases a leaf class is returned.
191  * We allow direct class selection by classid in skb->priority. Then we examine
192  * filters in the qdisc and in inner nodes (if a higher filter points to an
193  * inner node). If we end up with classid MAJOR:0 we enqueue the skb into the
194  * special internal fifo (direct). These packets then go directly thru. If we
195  * still have no valid leaf we try to use the MAJOR:default leaf. If that is
196  * still unsuccessful we finish and return the direct queue.
197  */
198 #define HTB_DIRECT (struct htb_class*)-1
199 
200 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
201 				      int *qerr)
202 {
203 	struct htb_sched *q = qdisc_priv(sch);
204 	struct htb_class *cl;
205 	struct tcf_result res;
206 	struct tcf_proto *tcf;
207 	int result;
208 
209 	/* allow selecting a class by setting skb->priority to a valid classid;
210 	   note that nfmark can be used too, by attaching an fw filter with no
211 	   rules in it */
212 	if (skb->priority == sch->handle)
213 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
214 	if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
215 		return cl;
216 
217 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
218 	tcf = q->filter_list;
219 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
220 #ifdef CONFIG_NET_CLS_ACT
221 		switch (result) {
222 		case TC_ACT_QUEUED:
223 		case TC_ACT_STOLEN:
224 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
225 		case TC_ACT_SHOT:
226 			return NULL;
227 		}
228 #endif
229 		if ((cl = (void *)res.class) == NULL) {
230 			if (res.classid == sch->handle)
231 				return HTB_DIRECT;	/* X:0 (direct flow) */
232 			if ((cl = htb_find(res.classid, sch)) == NULL)
233 				break;	/* filter selected invalid classid */
234 		}
235 		if (!cl->level)
236 			return cl;	/* we hit leaf; return it */
237 
238 		/* we have got inner class; apply inner filter chain */
239 		tcf = cl->filter_list;
240 	}
241 	/* classification failed; try to use default class */
242 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
243 	if (!cl || cl->level)
244 		return HTB_DIRECT;	/* bad default .. this is safe bet */
245 	return cl;
246 }
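/* Illustrative example (not from the original comments): with a filter such as
 *
 *   tc filter add dev eth0 parent 1: protocol ip u32 \
 *      match ip dport 80 0xffff flowid 1:10
 *
 * htb_classify() returns the leaf 1:10 for matching packets, while a packet
 * whose skb->priority is set to 1:0 (the qdisc handle) bypasses the classes
 * and is queued on the direct queue.
 */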
247 
248 /**
249  * htb_add_to_id_tree - adds class to the round robin list
250  *
251  * The routine adds the class to the list (actually a tree) sorted by classid.
252  * Make sure that the class is not already on such a list for the given prio.
253  */
254 static void htb_add_to_id_tree(struct rb_root *root,
255 			       struct htb_class *cl, int prio)
256 {
257 	struct rb_node **p = &root->rb_node, *parent = NULL;
258 
259 	while (*p) {
260 		struct htb_class *c;
261 		parent = *p;
262 		c = rb_entry(parent, struct htb_class, node[prio]);
263 
264 		if (cl->common.classid > c->common.classid)
265 			p = &parent->rb_right;
266 		else
267 			p = &parent->rb_left;
268 	}
269 	rb_link_node(&cl->node[prio], parent, p);
270 	rb_insert_color(&cl->node[prio], root);
271 }
272 
273 /**
274  * htb_add_to_wait_tree - adds class to the event queue with delay
275  *
276  * The class is added to the priority event queue to indicate that the class
277  * will change its mode at time cl->pq_key (q->now + delay). Make sure that
278  * the class is not already in the queue.
279  */
280 static void htb_add_to_wait_tree(struct htb_sched *q,
281 				 struct htb_class *cl, long delay)
282 {
283 	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
284 
285 	cl->pq_key = q->now + delay;
286 	if (cl->pq_key == q->now)
287 		cl->pq_key++;
288 
289 	/* update the nearest event cache */
290 	if (q->near_ev_cache[cl->level] > cl->pq_key)
291 		q->near_ev_cache[cl->level] = cl->pq_key;
292 
293 	while (*p) {
294 		struct htb_class *c;
295 		parent = *p;
296 		c = rb_entry(parent, struct htb_class, pq_node);
297 		if (cl->pq_key >= c->pq_key)
298 			p = &parent->rb_right;
299 		else
300 			p = &parent->rb_left;
301 	}
302 	rb_link_node(&cl->pq_node, parent, p);
303 	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
304 }
305 
306 /**
307  * htb_next_rb_node - advances *n to the next node in the tree
308  *
309  * When we are past the last key, *n is set to NULL.
310  * Average complexity is 2 steps per call.
311  */
312 static inline void htb_next_rb_node(struct rb_node **n)
313 {
314 	*n = rb_next(*n);
315 }
316 
317 /**
318  * htb_add_class_to_row - add class to its row
319  *
320  * The class is added to row at priorities marked in mask.
321  * It does nothing if mask == 0.
322  */
323 static inline void htb_add_class_to_row(struct htb_sched *q,
324 					struct htb_class *cl, int mask)
325 {
326 	q->row_mask[cl->level] |= mask;
327 	while (mask) {
328 		int prio = ffz(~mask);
329 		mask &= ~(1 << prio);
330 		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
331 	}
332 }
333 
334 /* If this triggers, it is a bug in this code, but it need not be fatal */
335 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
336 {
337 	if (RB_EMPTY_NODE(rb)) {
338 		WARN_ON(1);
339 	} else {
340 		rb_erase(rb, root);
341 		RB_CLEAR_NODE(rb);
342 	}
343 }
344 
345 
346 /**
347  * htb_remove_class_from_row - removes class from its row
348  *
349  * The class is removed from row at priorities marked in mask.
350  * It does nothing if mask == 0.
351  */
352 static inline void htb_remove_class_from_row(struct htb_sched *q,
353 						 struct htb_class *cl, int mask)
354 {
355 	int m = 0;
356 
357 	while (mask) {
358 		int prio = ffz(~mask);
359 
360 		mask &= ~(1 << prio);
361 		if (q->ptr[cl->level][prio] == cl->node + prio)
362 			htb_next_rb_node(q->ptr[cl->level] + prio);
363 
364 		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
365 		if (!q->row[cl->level][prio].rb_node)
366 			m |= 1 << prio;
367 	}
368 	q->row_mask[cl->level] &= ~m;
369 }
370 
371 /**
372  * htb_activate_prios - creates the active class's feed chain
373  *
374  * The class is connected to its ancestors and/or the appropriate rows
375  * for the priorities it participates in. cl->cmode must be the new
376  * (activated) mode. It does nothing if cl->prio_activity == 0.
377  */
378 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
379 {
380 	struct htb_class *p = cl->parent;
381 	long m, mask = cl->prio_activity;
382 
383 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
384 		m = mask;
385 		while (m) {
386 			int prio = ffz(~m);
387 			m &= ~(1 << prio);
388 
389 			if (p->un.inner.feed[prio].rb_node)
390 				/* parent already has its feed in use, so
391 				   reset the bit in mask as the parent is already OK */
392 				mask &= ~(1 << prio);
393 
394 			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
395 		}
396 		p->prio_activity |= mask;
397 		cl = p;
398 		p = cl->parent;
399 
400 	}
401 	if (cl->cmode == HTB_CAN_SEND && mask)
402 		htb_add_class_to_row(q, cl, mask);
403 }
404 
405 /**
406  * htb_deactivate_prios - removes the class from its feed chains
407  *
408  * cl->cmode must represent the old mode (before deactivation). It does
409  * nothing if cl->prio_activity == 0. The class is removed from all feed
410  * chains and rows.
411  */
412 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
413 {
414 	struct htb_class *p = cl->parent;
415 	long m, mask = cl->prio_activity;
416 
417 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
418 		m = mask;
419 		mask = 0;
420 		while (m) {
421 			int prio = ffz(~m);
422 			m &= ~(1 << prio);
423 
424 			if (p->un.inner.ptr[prio] == cl->node + prio) {
425 				/* we are removing a child which is pointed to from the
426 				   parent feed - forget the pointer but remember the
427 				   classid */
428 				p->un.inner.last_ptr_id[prio] = cl->common.classid;
429 				p->un.inner.ptr[prio] = NULL;
430 			}
431 
432 			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
433 
434 			if (!p->un.inner.feed[prio].rb_node)
435 				mask |= 1 << prio;
436 		}
437 
438 		p->prio_activity &= ~mask;
439 		cl = p;
440 		p = cl->parent;
441 
442 	}
443 	if (cl->cmode == HTB_CAN_SEND && mask)
444 		htb_remove_class_from_row(q, cl, mask);
445 }
446 
447 static inline long htb_lowater(const struct htb_class *cl)
448 {
449 	if (htb_hysteresis)
450 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
451 	else
452 		return 0;
453 }
454 static inline long htb_hiwater(const struct htb_class *cl)
455 {
456 	if (htb_hysteresis)
457 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
458 	else
459 		return 0;
460 }
461 
462 
463 /**
464  * htb_class_mode - computes and returns current class mode
465  *
466  * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
467  * is not HTB_CAN_SEND then *diff is updated to the time difference
468  * from now to the time when cl will change its state.
469  * It is also worth noting that the class mode doesn't change simply
470  * at cl->{c,}tokens == 0; rather, there can be hysteresis in the
471  * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
472  * mode transitions per time unit. The speed gain is about 1/6.
473  */
474 static inline enum htb_cmode
475 htb_class_mode(struct htb_class *cl, long *diff)
476 {
477 	long toks;
478 
479 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
480 		*diff = -toks;
481 		return HTB_CANT_SEND;
482 	}
483 
484 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
485 		return HTB_CAN_SEND;
486 
487 	*diff = -toks;
488 	return HTB_MAY_BORROW;
489 }
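/* Worked example (illustrative numbers): assume htb_hysteresis is enabled,
 * cl->buffer = cl->cbuffer = 10000 ticks and cl->cmode == HTB_CAN_SEND.
 * Then htb_lowater() is -10000, so the class enters HTB_CANT_SEND only once
 * cl->ctokens + *diff falls below -10000; and htb_hiwater() is -10000, so it
 * is still reported as HTB_CAN_SEND while cl->tokens + *diff >= -10000
 * (unless the ceil check above already forced HTB_CANT_SEND). With
 * hysteresis disabled both thresholds are simply 0, i.e. the mode changes
 * exactly when a bucket runs empty.
 */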
490 
491 /**
492  * htb_change_class_mode - changes the class's mode
493  *
494  * This should be the only way to change a class's mode under normal
495  * circumstances. The routine will update the feed list linkage, change the
496  * mode and add the class to the wait event queue if appropriate. The new mode
497  * should be different from the old one and cl->pq_key has to be valid if
498  * changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
499  */
500 static void
501 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
502 {
503 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
504 
505 	if (new_mode == cl->cmode)
506 		return;
507 
508 	if (cl->prio_activity) {	/* not necessary: speed optimization */
509 		if (cl->cmode != HTB_CANT_SEND)
510 			htb_deactivate_prios(q, cl);
511 		cl->cmode = new_mode;
512 		if (new_mode != HTB_CANT_SEND)
513 			htb_activate_prios(q, cl);
514 	} else
515 		cl->cmode = new_mode;
516 }
517 
518 /**
519  * htb_activate - inserts leaf cl into appropriate active feeds
520  *
521  * The routine learns the (new) priority of the leaf and activates the feed
522  * chain for that prio. It can safely be called on an already active leaf.
523  * It also adds the leaf to the drop list.
524  */
525 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
526 {
527 	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
528 
529 	if (!cl->prio_activity) {
530 		cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio);
531 		htb_activate_prios(q, cl);
532 		list_add_tail(&cl->un.leaf.drop_list,
533 			      q->drops + cl->un.leaf.aprio);
534 	}
535 }
536 
537 /**
538  * htb_deactivate - remove leaf cl from active feeds
539  *
540  * Make sure that the leaf is active. In other words, it can't be called
541  * with a non-active leaf. It also removes the class from the drop list.
542  */
543 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
544 {
545 	WARN_ON(!cl->prio_activity);
546 
547 	htb_deactivate_prios(q, cl);
548 	cl->prio_activity = 0;
549 	list_del_init(&cl->un.leaf.drop_list);
550 }
551 
552 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
553 {
554 	int ret;
555 	struct htb_sched *q = qdisc_priv(sch);
556 	struct htb_class *cl = htb_classify(skb, sch, &ret);
557 
558 	if (cl == HTB_DIRECT) {
559 		/* enqueue to helper queue */
560 		if (q->direct_queue.qlen < q->direct_qlen) {
561 			__skb_queue_tail(&q->direct_queue, skb);
562 			q->direct_pkts++;
563 		} else {
564 			kfree_skb(skb);
565 			sch->qstats.drops++;
566 			return NET_XMIT_DROP;
567 		}
568 #ifdef CONFIG_NET_CLS_ACT
569 	} else if (!cl) {
570 		if (ret & __NET_XMIT_BYPASS)
571 			sch->qstats.drops++;
572 		kfree_skb(skb);
573 		return ret;
574 #endif
575 	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
576 		if (net_xmit_drop_count(ret)) {
577 			sch->qstats.drops++;
578 			cl->qstats.drops++;
579 		}
580 		return ret;
581 	} else {
582 		cl->bstats.packets +=
583 			skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
584 		cl->bstats.bytes += qdisc_pkt_len(skb);
585 		htb_activate(q, cl);
586 	}
587 
588 	sch->q.qlen++;
589 	sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
590 	sch->bstats.bytes += qdisc_pkt_len(skb);
591 	return NET_XMIT_SUCCESS;
592 }
593 
594 /* TODO: requeuing packet charges it to policers again !! */
595 static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch)
596 {
597 	int ret;
598 	struct htb_sched *q = qdisc_priv(sch);
599 	struct htb_class *cl = htb_classify(skb, sch, &ret);
600 	struct sk_buff *tskb;
601 
602 	if (cl == HTB_DIRECT) {
603 		/* enqueue to helper queue */
604 		if (q->direct_queue.qlen < q->direct_qlen) {
605 			__skb_queue_head(&q->direct_queue, skb);
606 		} else {
607 			__skb_queue_head(&q->direct_queue, skb);
608 			tskb = __skb_dequeue_tail(&q->direct_queue);
609 			kfree_skb(tskb);
610 			sch->qstats.drops++;
611 			return NET_XMIT_CN;
612 		}
613 #ifdef CONFIG_NET_CLS_ACT
614 	} else if (!cl) {
615 		if (ret & __NET_XMIT_BYPASS)
616 			sch->qstats.drops++;
617 		kfree_skb(skb);
618 		return ret;
619 #endif
620 	} else if ((ret = cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q)) !=
621 		   NET_XMIT_SUCCESS) {
622 		if (net_xmit_drop_count(ret)) {
623 			sch->qstats.drops++;
624 			cl->qstats.drops++;
625 		}
626 		return ret;
627 	} else
628 		htb_activate(q, cl);
629 
630 	sch->q.qlen++;
631 	sch->qstats.requeues++;
632 	return NET_XMIT_SUCCESS;
633 }
634 
635 /**
636  * htb_charge_class - charges amount "bytes" to leaf and ancestors
637  *
638  * The routine assumes that a packet "bytes" long was dequeued from leaf cl
639  * borrowing from "level". It accounts the bytes to the ceil leaky bucket for
640  * the leaf and all ancestors and to the rate bucket for ancestors at levels
641  * "level" and higher. It also handles a possible change of mode resulting
642  * from the update. Note that the mode can also increase here (MAY_BORROW to
643  * CAN_SEND) because we can use a more precise clock than the event queue here.
644  * In such a case we remove the class from the event queue first.
645  */
646 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
647 			     int level, struct sk_buff *skb)
648 {
649 	int bytes = qdisc_pkt_len(skb);
650 	long toks, diff;
651 	enum htb_cmode old_mode;
652 
653 #define HTB_ACCNT(T,B,R) toks = diff + cl->T; \
654 	if (toks > cl->B) toks = cl->B; \
655 	toks -= L2T(cl, cl->R, bytes); \
656 	if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \
657 	cl->T = toks
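	/* For illustration, HTB_ACCNT(tokens, buffer, rate) expands roughly to:
	 *
	 *	toks = diff + cl->tokens;
	 *	if (toks > cl->buffer)
	 *		toks = cl->buffer;		   // bucket can't overflow
	 *	toks -= L2T(cl, cl->rate, bytes);	   // pay for this packet
	 *	if (toks <= -cl->mbuffer)
	 *		toks = 1 - cl->mbuffer;		   // clamp the debt
	 *	cl->tokens = toks;
	 *
	 * i.e. a classic token bucket refilled by the elapsed time "diff".
	 */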
658 
659 	while (cl) {
660 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
661 		if (cl->level >= level) {
662 			if (cl->level == level)
663 				cl->xstats.lends++;
664 			HTB_ACCNT(tokens, buffer, rate);
665 		} else {
666 			cl->xstats.borrows++;
667 			cl->tokens += diff;	/* we moved t_c; update tokens */
668 		}
669 		HTB_ACCNT(ctokens, cbuffer, ceil);
670 		cl->t_c = q->now;
671 
672 		old_mode = cl->cmode;
673 		diff = 0;
674 		htb_change_class_mode(q, cl, &diff);
675 		if (old_mode != cl->cmode) {
676 			if (old_mode != HTB_CAN_SEND)
677 				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
678 			if (cl->cmode != HTB_CAN_SEND)
679 				htb_add_to_wait_tree(q, cl, diff);
680 		}
681 
682 		/* update byte stats except for leaves which are already updated */
683 		if (cl->level) {
684 			cl->bstats.bytes += bytes;
685 			cl->bstats.packets += skb_is_gso(skb)?
686 					skb_shinfo(skb)->gso_segs:1;
687 		}
688 		cl = cl->parent;
689 	}
690 }
691 
692 /**
693  * htb_do_events - make mode changes to classes at the level
694  *
695  * Scans event queue for pending events and applies them. Returns time of
696  * next pending event (0 for no event in pq).
697  * Note: Applied are events with cl->pq_key <= q->now.
698  */
699 static psched_time_t htb_do_events(struct htb_sched *q, int level)
700 {
701 	/* don't run for longer than 2 jiffies; 2 is used instead of
702 	   1 to simplify things when the jiffies counter is about to be
703 	   incremented too soon */
704 	unsigned long stop_at = jiffies + 2;
705 	while (time_before(jiffies, stop_at)) {
706 		struct htb_class *cl;
707 		long diff;
708 		struct rb_node *p = rb_first(&q->wait_pq[level]);
709 
710 		if (!p)
711 			return 0;
712 
713 		cl = rb_entry(p, struct htb_class, pq_node);
714 		if (cl->pq_key > q->now)
715 			return cl->pq_key;
716 
717 		htb_safe_rb_erase(p, q->wait_pq + level);
718 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
719 		htb_change_class_mode(q, cl, &diff);
720 		if (cl->cmode != HTB_CAN_SEND)
721 			htb_add_to_wait_tree(q, cl, diff);
722 	}
723 	/* too much load - let's continue on the next jiffy */
724 	return q->now + PSCHED_TICKS_PER_SEC / HZ;
725 }
726 
727 /* Returns class->node+prio from the id-tree where the class's id is >= id,
728    or NULL if no such node exists. */
729 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
730 					      u32 id)
731 {
732 	struct rb_node *r = NULL;
733 	while (n) {
734 		struct htb_class *cl =
735 		    rb_entry(n, struct htb_class, node[prio]);
736 		if (id == cl->common.classid)
737 			return n;
738 
739 		if (id > cl->common.classid) {
740 			n = n->rb_right;
741 		} else {
742 			r = n;
743 			n = n->rb_left;
744 		}
745 	}
746 	return r;
747 }
748 
749 /**
750  * htb_lookup_leaf - returns next leaf class in DRR order
751  *
752  * Find the leaf the current feed pointer points to.
753  */
754 static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
755 					 struct rb_node **pptr, u32 * pid)
756 {
757 	int i;
758 	struct {
759 		struct rb_node *root;
760 		struct rb_node **pptr;
761 		u32 *pid;
762 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
763 
764 	WARN_ON(!tree->rb_node);
765 	sp->root = tree->rb_node;
766 	sp->pptr = pptr;
767 	sp->pid = pid;
768 
769 	for (i = 0; i < 65535; i++) {
770 		if (!*sp->pptr && *sp->pid) {
771 			/* ptr was invalidated but id is valid - try to recover
772 			   the original or next ptr */
773 			*sp->pptr =
774 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
775 		}
776 		*sp->pid = 0;	/* the ptr is valid now, so remove this hint as it
777 				   can become out of date quickly */
778 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
779 			*sp->pptr = sp->root;
780 			while ((*sp->pptr)->rb_left)
781 				*sp->pptr = (*sp->pptr)->rb_left;
782 			if (sp > stk) {
783 				sp--;
784 				WARN_ON(!*sp->pptr);
785 				if (!*sp->pptr)
786 					return NULL;
787 				htb_next_rb_node(sp->pptr);
788 			}
789 		} else {
790 			struct htb_class *cl;
791 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
792 			if (!cl->level)
793 				return cl;
794 			(++sp)->root = cl->un.inner.feed[prio].rb_node;
795 			sp->pptr = cl->un.inner.ptr + prio;
796 			sp->pid = cl->un.inner.last_ptr_id + prio;
797 		}
798 	}
799 	WARN_ON(1);
800 	return NULL;
801 }
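/* Sketch of the traversal above (illustrative): the function keeps an explicit
 * stack of (subtree root, per-prio ptr, per-prio last classid hint), one entry
 * per tree level. At each step it either repairs an invalidated ptr from the
 * classid hint, rewinds to the leftmost node when the ptr ran off the right
 * end, or descends into an inner class's feed tree until a leaf (level 0)
 * class is found.
 */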
802 
803 /* dequeues a packet at the given priority and level; call only if
804    you are sure that there is an active class at prio/level */
805 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
806 					int level)
807 {
808 	struct sk_buff *skb = NULL;
809 	struct htb_class *cl, *start;
810 	/* look initial class up in the row */
811 	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
812 				     q->ptr[level] + prio,
813 				     q->last_ptr_id[level] + prio);
814 
815 	do {
816 next:
817 		WARN_ON(!cl);
818 		if (!cl)
819 			return NULL;
820 
821 		/* the class can be empty - it is unlikely but can be true if the
822 		   leaf qdisc drops packets in its enqueue routine or if someone
823 		   used the graft operation on the leaf since the last dequeue;
824 		   simply deactivate and skip such a class */
825 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
826 			struct htb_class *next;
827 			htb_deactivate(q, cl);
828 
829 			/* row/level might become empty */
830 			if ((q->row_mask[level] & (1 << prio)) == 0)
831 				return NULL;
832 
833 			next = htb_lookup_leaf(q->row[level] + prio,
834 					       prio, q->ptr[level] + prio,
835 					       q->last_ptr_id[level] + prio);
836 
837 			if (cl == start)	/* fix start if we just deleted it */
838 				start = next;
839 			cl = next;
840 			goto next;
841 		}
842 
843 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
844 		if (likely(skb != NULL))
845 			break;
846 		if (!cl->warned) {
847 			printk(KERN_WARNING
848 			       "htb: class %X isn't work conserving ?!\n",
849 			       cl->common.classid);
850 			cl->warned = 1;
851 		}
852 		q->nwc_hit++;
853 		htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
854 				  ptr[0]) + prio);
855 		cl = htb_lookup_leaf(q->row[level] + prio, prio,
856 				     q->ptr[level] + prio,
857 				     q->last_ptr_id[level] + prio);
858 
859 	} while (cl != start);
860 
861 	if (likely(skb != NULL)) {
862 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
863 		if (cl->un.leaf.deficit[level] < 0) {
864 			cl->un.leaf.deficit[level] += cl->un.leaf.quantum;
865 			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
866 					  ptr[0]) + prio);
867 		}
868 		/* this used to be after charge_class but this arrangement
869 		   gives us slightly better performance */
870 		if (!cl->un.leaf.q->q.qlen)
871 			htb_deactivate(q, cl);
872 		htb_charge_class(q, cl, level, skb);
873 	}
874 	return skb;
875 }
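/* DRR bookkeeping above, with illustrative numbers: if cl->un.leaf.quantum is
 * 1500 and the class dequeues a 1000-byte packet, its deficit[level] drops by
 * 1000; once the deficit goes negative the quantum is added back and the
 * round-robin pointer advances, so a class gets roughly "quantum" bytes per
 * round relative to its siblings.
 */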
876 
877 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
878 {
879 	struct sk_buff *skb = NULL;
880 	struct htb_sched *q = qdisc_priv(sch);
881 	int level;
882 	psched_time_t next_event;
883 
884 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
885 	skb = __skb_dequeue(&q->direct_queue);
886 	if (skb != NULL) {
887 		sch->flags &= ~TCQ_F_THROTTLED;
888 		sch->q.qlen--;
889 		return skb;
890 	}
891 
892 	if (!sch->q.qlen)
893 		goto fin;
894 	q->now = psched_get_time();
895 
896 	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
897 	q->nwc_hit = 0;
898 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
899 		/* common case optimization - skip event handler quickly */
900 		int m;
901 		psched_time_t event;
902 
903 		if (q->now >= q->near_ev_cache[level]) {
904 			event = htb_do_events(q, level);
905 			if (!event)
906 				event = q->now + PSCHED_TICKS_PER_SEC;
907 			q->near_ev_cache[level] = event;
908 		} else
909 			event = q->near_ev_cache[level];
910 
911 		if (event && next_event > event)
912 			next_event = event;
913 
914 		m = ~q->row_mask[level];
915 		while (m != (int)(-1)) {
916 			int prio = ffz(m);
917 			m |= 1 << prio;
918 			skb = htb_dequeue_tree(q, prio, level);
919 			if (likely(skb != NULL)) {
920 				sch->q.qlen--;
921 				sch->flags &= ~TCQ_F_THROTTLED;
922 				goto fin;
923 			}
924 		}
925 	}
926 	sch->qstats.overlimits++;
927 	qdisc_watchdog_schedule(&q->watchdog, next_event);
928 fin:
929 	return skb;
930 }
931 
932 /* try to drop from each class (by prio) until one succeeds */
933 static unsigned int htb_drop(struct Qdisc *sch)
934 {
935 	struct htb_sched *q = qdisc_priv(sch);
936 	int prio;
937 
938 	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
939 		struct list_head *p;
940 		list_for_each(p, q->drops + prio) {
941 			struct htb_class *cl = list_entry(p, struct htb_class,
942 							  un.leaf.drop_list);
943 			unsigned int len;
944 			if (cl->un.leaf.q->ops->drop &&
945 			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
946 				sch->q.qlen--;
947 				if (!cl->un.leaf.q->q.qlen)
948 					htb_deactivate(q, cl);
949 				return len;
950 			}
951 		}
952 	}
953 	return 0;
954 }
955 
956 /* reset all classes */
957 /* always called under BH & queue lock */
958 static void htb_reset(struct Qdisc *sch)
959 {
960 	struct htb_sched *q = qdisc_priv(sch);
961 	struct htb_class *cl;
962 	struct hlist_node *n;
963 	unsigned int i;
964 
965 	for (i = 0; i < q->clhash.hashsize; i++) {
966 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
967 			if (cl->level)
968 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
969 			else {
970 				if (cl->un.leaf.q)
971 					qdisc_reset(cl->un.leaf.q);
972 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
973 			}
974 			cl->prio_activity = 0;
975 			cl->cmode = HTB_CAN_SEND;
976 
977 		}
978 	}
979 	qdisc_watchdog_cancel(&q->watchdog);
980 	__skb_queue_purge(&q->direct_queue);
981 	sch->q.qlen = 0;
982 	memset(q->row, 0, sizeof(q->row));
983 	memset(q->row_mask, 0, sizeof(q->row_mask));
984 	memset(q->wait_pq, 0, sizeof(q->wait_pq));
985 	memset(q->ptr, 0, sizeof(q->ptr));
986 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
987 		INIT_LIST_HEAD(q->drops + i);
988 }
989 
990 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
991 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
992 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
993 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
994 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
995 };
996 
997 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
998 {
999 	struct htb_sched *q = qdisc_priv(sch);
1000 	struct nlattr *tb[TCA_HTB_INIT + 1];
1001 	struct tc_htb_glob *gopt;
1002 	int err;
1003 	int i;
1004 
1005 	if (!opt)
1006 		return -EINVAL;
1007 
1008 	err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);
1009 	if (err < 0)
1010 		return err;
1011 
1012 	if (tb[TCA_HTB_INIT] == NULL) {
1013 		printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
1014 		return -EINVAL;
1015 	}
1016 	gopt = nla_data(tb[TCA_HTB_INIT]);
1017 	if (gopt->version != HTB_VER >> 16) {
1018 		printk(KERN_ERR
1019 		       "HTB: need tc/htb version %d (minor is %d), you have %d\n",
1020 		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
1021 		return -EINVAL;
1022 	}
1023 
1024 	err = qdisc_class_hash_init(&q->clhash);
1025 	if (err < 0)
1026 		return err;
1027 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1028 		INIT_LIST_HEAD(q->drops + i);
1029 
1030 	qdisc_watchdog_init(&q->watchdog, sch);
1031 	skb_queue_head_init(&q->direct_queue);
1032 
1033 	q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1034 	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
1035 		q->direct_qlen = 2;
1036 
1037 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1038 		q->rate2quantum = 1;
1039 	q->defcls = gopt->defcls;
1040 
1041 	return 0;
1042 }
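/* For reference (illustrative, not from the original comments): gopt->defcls
 * and gopt->rate2quantum correspond to the "default" and "r2q" arguments of
 *
 *   tc qdisc add dev eth0 root handle 1: htb default 20 r2q 10
 *
 * default selects the minor id of the fallback class used by htb_classify(),
 * and r2q is the divisor used in htb_change_class() below to derive a leaf's
 * quantum from its rate when no explicit quantum is given.
 */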
1043 
1044 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1045 {
1046 	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1047 	struct htb_sched *q = qdisc_priv(sch);
1048 	struct nlattr *nest;
1049 	struct tc_htb_glob gopt;
1050 
1051 	spin_lock_bh(root_lock);
1052 
1053 	gopt.direct_pkts = q->direct_pkts;
1054 	gopt.version = HTB_VER;
1055 	gopt.rate2quantum = q->rate2quantum;
1056 	gopt.defcls = q->defcls;
1057 	gopt.debug = 0;
1058 
1059 	nest = nla_nest_start(skb, TCA_OPTIONS);
1060 	if (nest == NULL)
1061 		goto nla_put_failure;
1062 	NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
1063 	nla_nest_end(skb, nest);
1064 
1065 	spin_unlock_bh(root_lock);
1066 	return skb->len;
1067 
1068 nla_put_failure:
1069 	spin_unlock_bh(root_lock);
1070 	nla_nest_cancel(skb, nest);
1071 	return -1;
1072 }
1073 
1074 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1075 			  struct sk_buff *skb, struct tcmsg *tcm)
1076 {
1077 	struct htb_class *cl = (struct htb_class *)arg;
1078 	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1079 	struct nlattr *nest;
1080 	struct tc_htb_opt opt;
1081 
1082 	spin_lock_bh(root_lock);
1083 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1084 	tcm->tcm_handle = cl->common.classid;
1085 	if (!cl->level && cl->un.leaf.q)
1086 		tcm->tcm_info = cl->un.leaf.q->handle;
1087 
1088 	nest = nla_nest_start(skb, TCA_OPTIONS);
1089 	if (nest == NULL)
1090 		goto nla_put_failure;
1091 
1092 	memset(&opt, 0, sizeof(opt));
1093 
1094 	opt.rate = cl->rate->rate;
1095 	opt.buffer = cl->buffer;
1096 	opt.ceil = cl->ceil->rate;
1097 	opt.cbuffer = cl->cbuffer;
1098 	opt.quantum = cl->un.leaf.quantum;
1099 	opt.prio = cl->un.leaf.prio;
1100 	opt.level = cl->level;
1101 	NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
1102 
1103 	nla_nest_end(skb, nest);
1104 	spin_unlock_bh(root_lock);
1105 	return skb->len;
1106 
1107 nla_put_failure:
1108 	spin_unlock_bh(root_lock);
1109 	nla_nest_cancel(skb, nest);
1110 	return -1;
1111 }
1112 
1113 static int
1114 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1115 {
1116 	struct htb_class *cl = (struct htb_class *)arg;
1117 
1118 	if (!cl->level && cl->un.leaf.q)
1119 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1120 	cl->xstats.tokens = cl->tokens;
1121 	cl->xstats.ctokens = cl->ctokens;
1122 
1123 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1124 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1125 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
1126 		return -1;
1127 
1128 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1129 }
1130 
1131 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1132 		     struct Qdisc **old)
1133 {
1134 	struct htb_class *cl = (struct htb_class *)arg;
1135 
1136 	if (cl && !cl->level) {
1137 		if (new == NULL &&
1138 		    (new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1139 					     &pfifo_qdisc_ops,
1140 					     cl->common.classid))
1141 		    == NULL)
1142 			return -ENOBUFS;
1143 		sch_tree_lock(sch);
1144 		if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) {
1145 			qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1146 			qdisc_reset(*old);
1147 		}
1148 		sch_tree_unlock(sch);
1149 		return 0;
1150 	}
1151 	return -ENOENT;
1152 }
1153 
1154 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1155 {
1156 	struct htb_class *cl = (struct htb_class *)arg;
1157 	return (cl && !cl->level) ? cl->un.leaf.q : NULL;
1158 }
1159 
1160 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1161 {
1162 	struct htb_class *cl = (struct htb_class *)arg;
1163 
1164 	if (cl->un.leaf.q->q.qlen == 0)
1165 		htb_deactivate(qdisc_priv(sch), cl);
1166 }
1167 
1168 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1169 {
1170 	struct htb_class *cl = htb_find(classid, sch);
1171 	if (cl)
1172 		cl->refcnt++;
1173 	return (unsigned long)cl;
1174 }
1175 
1176 static inline int htb_parent_last_child(struct htb_class *cl)
1177 {
1178 	if (!cl->parent)
1179 		/* the root class */
1180 		return 0;
1181 	if (cl->parent->children > 1)
1182 		/* not the last child */
1183 		return 0;
1184 	return 1;
1185 }
1186 
1187 static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1188 			       struct Qdisc *new_q)
1189 {
1190 	struct htb_class *parent = cl->parent;
1191 
1192 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1193 
1194 	if (parent->cmode != HTB_CAN_SEND)
1195 		htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);
1196 
1197 	parent->level = 0;
1198 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1199 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1200 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1201 	parent->un.leaf.quantum = parent->quantum;
1202 	parent->un.leaf.prio = parent->prio;
1203 	parent->tokens = parent->buffer;
1204 	parent->ctokens = parent->cbuffer;
1205 	parent->t_c = psched_get_time();
1206 	parent->cmode = HTB_CAN_SEND;
1207 }
1208 
1209 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1210 {
1211 	if (!cl->level) {
1212 		WARN_ON(!cl->un.leaf.q);
1213 		qdisc_destroy(cl->un.leaf.q);
1214 	}
1215 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1216 	qdisc_put_rtab(cl->rate);
1217 	qdisc_put_rtab(cl->ceil);
1218 
1219 	tcf_destroy_chain(&cl->filter_list);
1220 	kfree(cl);
1221 }
1222 
1223 /* always called under BH & queue lock */
1224 static void htb_destroy(struct Qdisc *sch)
1225 {
1226 	struct htb_sched *q = qdisc_priv(sch);
1227 	struct hlist_node *n, *next;
1228 	struct htb_class *cl;
1229 	unsigned int i;
1230 
1231 	qdisc_watchdog_cancel(&q->watchdog);
1232 	/* This line used to be after the htb_destroy_class call below
1233 	   and surprisingly it worked in 2.4. But it must precede it
1234 	   because filters need their target class alive to be able to call
1235 	   unbind_filter on it (without an Oops). */
1236 	tcf_destroy_chain(&q->filter_list);
1237 
1238 	for (i = 0; i < q->clhash.hashsize; i++) {
1239 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
1240 			tcf_destroy_chain(&cl->filter_list);
1241 	}
1242 	for (i = 0; i < q->clhash.hashsize; i++) {
1243 		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
1244 					  common.hnode)
1245 			htb_destroy_class(sch, cl);
1246 	}
1247 	qdisc_class_hash_destroy(&q->clhash);
1248 	__skb_queue_purge(&q->direct_queue);
1249 }
1250 
1251 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1252 {
1253 	struct htb_sched *q = qdisc_priv(sch);
1254 	struct htb_class *cl = (struct htb_class *)arg;
1255 	unsigned int qlen;
1256 	struct Qdisc *new_q = NULL;
1257 	int last_child = 0;
1258 
1259 	// TODO: why don't we allow deleting a subtree? references? does the
1260 	// tc subsystem guarantee us that in htb_destroy it holds no class
1261 	// refs, so that we can remove children safely there?
1262 	if (cl->children || cl->filter_cnt)
1263 		return -EBUSY;
1264 
1265 	if (!cl->level && htb_parent_last_child(cl)) {
1266 		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1267 					  &pfifo_qdisc_ops,
1268 					  cl->parent->common.classid);
1269 		last_child = 1;
1270 	}
1271 
1272 	sch_tree_lock(sch);
1273 
1274 	if (!cl->level) {
1275 		qlen = cl->un.leaf.q->q.qlen;
1276 		qdisc_reset(cl->un.leaf.q);
1277 		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
1278 	}
1279 
1280 	/* delete from hash and active; remainder in destroy_class */
1281 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1282 	if (cl->parent)
1283 		cl->parent->children--;
1284 
1285 	if (cl->prio_activity)
1286 		htb_deactivate(q, cl);
1287 
1288 	if (cl->cmode != HTB_CAN_SEND)
1289 		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
1290 
1291 	if (last_child)
1292 		htb_parent_to_leaf(q, cl, new_q);
1293 
1294 	if (--cl->refcnt == 0)
1295 		htb_destroy_class(sch, cl);
1296 
1297 	sch_tree_unlock(sch);
1298 	return 0;
1299 }
1300 
1301 static void htb_put(struct Qdisc *sch, unsigned long arg)
1302 {
1303 	struct htb_class *cl = (struct htb_class *)arg;
1304 
1305 	if (--cl->refcnt == 0)
1306 		htb_destroy_class(sch, cl);
1307 }
1308 
1309 static int htb_change_class(struct Qdisc *sch, u32 classid,
1310 			    u32 parentid, struct nlattr **tca,
1311 			    unsigned long *arg)
1312 {
1313 	int err = -EINVAL;
1314 	struct htb_sched *q = qdisc_priv(sch);
1315 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1316 	struct nlattr *opt = tca[TCA_OPTIONS];
1317 	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
1318 	struct nlattr *tb[TCA_HTB_RTAB + 1];
1319 	struct tc_htb_opt *hopt;
1320 
1321 	/* extract all subattrs from opt attr */
1322 	if (!opt)
1323 		goto failure;
1324 
1325 	err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy);
1326 	if (err < 0)
1327 		goto failure;
1328 
1329 	err = -EINVAL;
1330 	if (tb[TCA_HTB_PARMS] == NULL)
1331 		goto failure;
1332 
1333 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1334 
1335 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1336 
1337 	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
1338 	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
1339 	if (!rtab || !ctab)
1340 		goto failure;
1341 
1342 	if (!cl) {		/* new class */
1343 		struct Qdisc *new_q;
1344 		int prio;
1345 		struct {
1346 			struct nlattr		nla;
1347 			struct gnet_estimator	opt;
1348 		} est = {
1349 			.nla = {
1350 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1351 				.nla_type	= TCA_RATE,
1352 			},
1353 			.opt = {
1354 				/* 4s interval, 16s averaging constant */
1355 				.interval	= 2,
1356 				.ewma_log	= 2,
1357 			},
1358 		};
1359 
1360 		/* check for valid classid */
1361 		if (!classid || TC_H_MAJ(classid ^ sch->handle)
1362 		    || htb_find(classid, sch))
1363 			goto failure;
1364 
1365 		/* check maximal depth */
1366 		if (parent && parent->parent && parent->parent->level < 2) {
1367 			printk(KERN_ERR "htb: tree is too deep\n");
1368 			goto failure;
1369 		}
1370 		err = -ENOBUFS;
1371 		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
1372 			goto failure;
1373 
1374 		gen_new_estimator(&cl->bstats, &cl->rate_est,
1375 				  qdisc_root_sleeping_lock(sch),
1376 				  tca[TCA_RATE] ? : &est.nla);
1377 		cl->refcnt = 1;
1378 		cl->children = 0;
1379 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1380 		RB_CLEAR_NODE(&cl->pq_node);
1381 
1382 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1383 			RB_CLEAR_NODE(&cl->node[prio]);
1384 
1385 		/* create the leaf qdisc early because it uses kmalloc(GFP_KERNEL),
1386 		   which can't be used inside of sch_tree_lock
1387 		   -- thanks to Karlis Peisenieks */
1388 		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1389 					  &pfifo_qdisc_ops, classid);
1390 		sch_tree_lock(sch);
1391 		if (parent && !parent->level) {
1392 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1393 
1394 			/* turn parent into inner node */
1395 			qdisc_reset(parent->un.leaf.q);
1396 			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
1397 			qdisc_destroy(parent->un.leaf.q);
1398 			if (parent->prio_activity)
1399 				htb_deactivate(q, parent);
1400 
1401 			/* remove from evt list because of level change */
1402 			if (parent->cmode != HTB_CAN_SEND) {
1403 				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
1404 				parent->cmode = HTB_CAN_SEND;
1405 			}
1406 			parent->level = (parent->parent ? parent->parent->level
1407 					 : TC_HTB_MAXDEPTH) - 1;
1408 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1409 		}
1410 		/* leaf (we) needs elementary qdisc */
1411 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1412 
1413 		cl->common.classid = classid;
1414 		cl->parent = parent;
1415 
1416 		/* set class to be in HTB_CAN_SEND state */
1417 		cl->tokens = hopt->buffer;
1418 		cl->ctokens = hopt->cbuffer;
1419 		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
1420 		cl->t_c = psched_get_time();
1421 		cl->cmode = HTB_CAN_SEND;
1422 
1423 		/* attach to the hash list and parent's family */
1424 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1425 		if (parent)
1426 			parent->children++;
1427 	} else {
1428 		if (tca[TCA_RATE])
1429 			gen_replace_estimator(&cl->bstats, &cl->rate_est,
1430 					      qdisc_root_sleeping_lock(sch),
1431 					      tca[TCA_RATE]);
1432 		sch_tree_lock(sch);
1433 	}
1434 
1435 	/* there used to be a nasty bug here: we have to check that the node
1436 	   is really a leaf before changing cl->un.leaf ! */
1437 	if (!cl->level) {
1438 		cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum;
1439 		if (!hopt->quantum && cl->un.leaf.quantum < 1000) {
1440 			printk(KERN_WARNING
1441 			       "HTB: quantum of class %X is small. Consider r2q change.\n",
1442 			       cl->common.classid);
1443 			cl->un.leaf.quantum = 1000;
1444 		}
1445 		if (!hopt->quantum && cl->un.leaf.quantum > 200000) {
1446 			printk(KERN_WARNING
1447 			       "HTB: quantum of class %X is big. Consider r2q change.\n",
1448 			       cl->common.classid);
1449 			cl->un.leaf.quantum = 200000;
1450 		}
1451 		if (hopt->quantum)
1452 			cl->un.leaf.quantum = hopt->quantum;
1453 		if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO)
1454 			cl->un.leaf.prio = TC_HTB_NUMPRIO - 1;
1455 
1456 		/* backup for htb_parent_to_leaf */
1457 		cl->quantum = cl->un.leaf.quantum;
1458 		cl->prio = cl->un.leaf.prio;
1459 	}
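	/* Illustrative numbers for the computation above: with the default
	 * r2q of 10, a leaf configured with rate 1mbit (125000 bytes/s) gets
	 * quantum = 125000 / 10 = 12500 bytes, which is within the
	 * 1000..200000 bounds enforced above.
	 */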
1460 
1461 	cl->buffer = hopt->buffer;
1462 	cl->cbuffer = hopt->cbuffer;
1463 	if (cl->rate)
1464 		qdisc_put_rtab(cl->rate);
1465 	cl->rate = rtab;
1466 	if (cl->ceil)
1467 		qdisc_put_rtab(cl->ceil);
1468 	cl->ceil = ctab;
1469 	sch_tree_unlock(sch);
1470 
1471 	qdisc_class_hash_grow(sch, &q->clhash);
1472 
1473 	*arg = (unsigned long)cl;
1474 	return 0;
1475 
1476 failure:
1477 	if (rtab)
1478 		qdisc_put_rtab(rtab);
1479 	if (ctab)
1480 		qdisc_put_rtab(ctab);
1481 	return err;
1482 }
1483 
1484 static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1485 {
1486 	struct htb_sched *q = qdisc_priv(sch);
1487 	struct htb_class *cl = (struct htb_class *)arg;
1488 	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1489 
1490 	return fl;
1491 }
1492 
1493 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1494 				     u32 classid)
1495 {
1496 	struct htb_class *cl = htb_find(classid, sch);
1497 
1498 	/*if (cl && !cl->level) return 0;
1499 	   The line above used to be there to prevent attaching filters to
1500 	   leaves. But at least the tc_index filter uses this just to get the
1501 	   class for other reasons, so we have to allow it.
1502 	   ----
1503 	   19.6.2002 As Werner explained it is ok - bind filter is just
1504 	   another way to "lock" the class - unlike "get" this lock can
1505 	   be broken by class during destroy IIUC.
1506 	 */
1507 	if (cl)
1508 		cl->filter_cnt++;
1509 	return (unsigned long)cl;
1510 }
1511 
1512 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1513 {
1514 	struct htb_class *cl = (struct htb_class *)arg;
1515 
1516 	if (cl)
1517 		cl->filter_cnt--;
1518 }
1519 
1520 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1521 {
1522 	struct htb_sched *q = qdisc_priv(sch);
1523 	struct htb_class *cl;
1524 	struct hlist_node *n;
1525 	unsigned int i;
1526 
1527 	if (arg->stop)
1528 		return;
1529 
1530 	for (i = 0; i < q->clhash.hashsize; i++) {
1531 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
1532 			if (arg->count < arg->skip) {
1533 				arg->count++;
1534 				continue;
1535 			}
1536 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1537 				arg->stop = 1;
1538 				return;
1539 			}
1540 			arg->count++;
1541 		}
1542 	}
1543 }
1544 
1545 static const struct Qdisc_class_ops htb_class_ops = {
1546 	.graft		=	htb_graft,
1547 	.leaf		=	htb_leaf,
1548 	.qlen_notify	=	htb_qlen_notify,
1549 	.get		=	htb_get,
1550 	.put		=	htb_put,
1551 	.change		=	htb_change_class,
1552 	.delete		=	htb_delete,
1553 	.walk		=	htb_walk,
1554 	.tcf_chain	=	htb_find_tcf,
1555 	.bind_tcf	=	htb_bind_filter,
1556 	.unbind_tcf	=	htb_unbind_filter,
1557 	.dump		=	htb_dump_class,
1558 	.dump_stats	=	htb_dump_class_stats,
1559 };
1560 
1561 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1562 	.next		=	NULL,
1563 	.cl_ops		=	&htb_class_ops,
1564 	.id		=	"htb",
1565 	.priv_size	=	sizeof(struct htb_sched),
1566 	.enqueue	=	htb_enqueue,
1567 	.dequeue	=	htb_dequeue,
1568 	.requeue	=	htb_requeue,
1569 	.drop		=	htb_drop,
1570 	.init		=	htb_init,
1571 	.reset		=	htb_reset,
1572 	.destroy	=	htb_destroy,
1573 	.change		=	NULL /* htb_change */,
1574 	.dump		=	htb_dump,
1575 	.owner		=	THIS_MODULE,
1576 };
1577 
1578 static int __init htb_module_init(void)
1579 {
1580 	return register_qdisc(&htb_qdisc_ops);
1581 }
1582 static void __exit htb_module_exit(void)
1583 {
1584 	unregister_qdisc(&htb_qdisc_ops);
1585 }
1586 
1587 module_init(htb_module_init)
1588 module_exit(htb_module_exit)
1589 MODULE_LICENSE("GPL");
1590