1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  *
28  * $Id: sch_htb.c,v 1.25 2003/12/07 11:08:25 devik Exp devik $
29  */
30 #include <linux/module.h>
31 #include <linux/types.h>
32 #include <linux/kernel.h>
33 #include <linux/string.h>
34 #include <linux/errno.h>
35 #include <linux/skbuff.h>
36 #include <linux/list.h>
37 #include <linux/compiler.h>
38 #include <linux/rbtree.h>
39 #include <net/netlink.h>
40 #include <net/pkt_sched.h>
41 
42 /* HTB algorithm.
43     Author: devik@cdi.cz
44     ========================================================================
45     HTB is like TBF with multiple classes. It is also similar to CBQ because
46     it allows a priority to be assigned to each class in the hierarchy.
47     In fact it is another implementation of Floyd's formal sharing.
48 
49     Levels:
50     Each class is assigned a level. A leaf ALWAYS has level 0 and root
51     classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
52     one less than their parent's.
53 */
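
/* A minimal sketch of how this file assigns those levels when classes are
 * created (it mirrors the logic in htb_change_class() below): a new class
 * starts life as a leaf at level 0; when it later gains a child it is
 * promoted to one level below its own parent, or to TC_HTB_MAXDEPTH - 1
 * when it is a root class.  The helper name below is hypothetical, for
 * illustration only:
 *
 *	static int htb_promoted_level(const struct htb_class *parent_of_promoted)
 *	{
 *		return (parent_of_promoted ? parent_of_promoted->level
 *					   : TC_HTB_MAXDEPTH) - 1;
 *	}
 */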
54 
55 #define HTB_HSIZE 16		/* classid hash size */
56 #define HTB_HYSTERESIS 1	/* whether to use mode hysteresis for speedup */
57 #define HTB_VER 0x30011		/* major must be matched with the version number supplied by TC */
58 
59 #if HTB_VER >> 16 != TC_HTB_PROTOVER
60 #error "Mismatched sch_htb.c and pkt_sch.h"
61 #endif
62 
63 /* used internally to keep the status of a single class */
64 enum htb_cmode {
65 	HTB_CANT_SEND,		/* class can't send and can't borrow */
66 	HTB_MAY_BORROW,		/* class can't send but may borrow */
67 	HTB_CAN_SEND		/* class can send */
68 };
69 
70 /* interior & leaf nodes; props specific to leaves are marked L: */
71 struct htb_class {
72 	/* general class parameters */
73 	u32 classid;
74 	struct gnet_stats_basic bstats;
75 	struct gnet_stats_queue qstats;
76 	struct gnet_stats_rate_est rate_est;
77 	struct tc_htb_xstats xstats;	/* our special stats */
78 	int refcnt;		/* usage count of this class */
79 
80 	/* topology */
81 	int level;		/* our level (see above) */
82 	struct htb_class *parent;	/* parent class */
83 	struct hlist_node hlist;	/* classid hash list item */
84 	struct list_head sibling;	/* sibling list item */
85 	struct list_head children;	/* children list */
86 
87 	union {
88 		struct htb_class_leaf {
89 			struct Qdisc *q;
90 			int prio;
91 			int aprio;
92 			int quantum;
93 			int deficit[TC_HTB_MAXDEPTH];
94 			struct list_head drop_list;
95 		} leaf;
96 		struct htb_class_inner {
97 			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
98 			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
99 			/* When a class changes from state 1->2 and disconnects from
100 			   its parent's feed, we lose the ptr value and would start from
101 			   the first child again. Here we store the classid of the
102 			   last valid ptr (used when ptr is NULL). */
103 			u32 last_ptr_id[TC_HTB_NUMPRIO];
104 		} inner;
105 	} un;
106 	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
107 	struct rb_node pq_node;	/* node for event queue */
108 	psched_time_t pq_key;
109 
110 	int prio_activity;	/* for which prios are we active */
111 	enum htb_cmode cmode;	/* current mode of the class */
112 
113 	/* class attached filters */
114 	struct tcf_proto *filter_list;
115 	int filter_cnt;
116 
117 	int warned;		/* only one warning about a non-work-conserving class */
118 
119 	/* token bucket parameters */
120 	struct qdisc_rate_table *rate;	/* rate table of the class itself */
121 	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
122 	long buffer, cbuffer;	/* token bucket depth/rate */
123 	psched_tdiff_t mbuffer;	/* max wait time */
124 	long tokens, ctokens;	/* current number of tokens */
125 	psched_time_t t_c;	/* checkpoint time */
126 
127 	int prio;		/* backup of un.leaf.prio, for a parent that becomes a leaf again */
128 	int quantum;		/* backup of un.leaf.quantum; eventually a full */
129 				/* replacement of the un.leaf originals should be done. */
130 };
131 
132 static inline long L2T(struct htb_class *cl, struct qdisc_rate_table *rate,
133 			   int size)
134 {
135 	long result = qdisc_l2t(rate, size);
136 	return result;
137 }
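
/* L2T ("length to time"): qdisc_l2t() looks up, in the rate table built by
 * userspace tc, how many psched ticks it takes to send "size" bytes at the
 * table's configured rate - roughly size / rate scaled to ticks, quantized
 * by the table's cell size.  A rough worked example (assumed numbers): at
 * 1 Mbit/s a 1500-byte packet costs about 1500 * 8 / 1e6 = 12 ms worth of
 * tokens.
 */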
138 
139 struct htb_sched {
140 	struct list_head root;	/* root classes list */
141 	struct hlist_head hash[HTB_HSIZE];	/* hashed by classid */
142 	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
143 
144 	/* self list - roots of self generating tree */
145 	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
146 	int row_mask[TC_HTB_MAXDEPTH];
147 	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
148 	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
149 
150 	/* self wait list - roots of wait PQs per row */
151 	struct rb_root wait_pq[TC_HTB_MAXDEPTH];
152 
153 	/* time of nearest event per level (row) */
154 	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];
155 
156 	/* whether we hit a non-work-conserving class during this dequeue; */
157 	int nwc_hit;		/* used to suppress the mindelay complaint in dequeue */
158 
159 	int defcls;		/* class where unclassified flows go to */
160 
161 	/* filters for qdisc itself */
162 	struct tcf_proto *filter_list;
163 	int filter_cnt;
164 
165 	int rate2quantum;	/* quant = rate / rate2quantum */
166 	psched_time_t now;	/* cached dequeue time */
167 	struct qdisc_watchdog watchdog;
168 
169 	/* non-shaped skbs; let them go directly through */
170 	struct sk_buff_head direct_queue;
171 	int direct_qlen;	/* max qlen of above */
172 
173 	long direct_pkts;
174 };
175 
176 /* compute hash of size HTB_HSIZE for given handle */
177 static inline int htb_hash(u32 h)
178 {
179 #if HTB_HSIZE != 16
180 #error "Declare new hash for your HTB_HSIZE"
181 #endif
182 	h ^= h >> 8;		/* stolen from cbq_hash */
183 	h ^= h >> 4;
184 	return h & 0xf;
185 }
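
/* Worked example of the hash above (assuming the usual qdisc major 1:):
 *
 *	h = 0x00010001			// class 1:1
 *	h ^= h >> 8;			// 0x00010101
 *	h ^= h >> 4;			// 0x00011111
 *	h & 0xf == 1			// bucket 1
 *
 * For minors below 16 under major 1: the bucket is simply the minor number,
 * so neighbouring classids spread evenly over the 16 buckets.
 */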
186 
187 /* find class in global hash table using given handle */
188 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
189 {
190 	struct htb_sched *q = qdisc_priv(sch);
191 	struct hlist_node *p;
192 	struct htb_class *cl;
193 
194 	if (TC_H_MAJ(handle) != sch->handle)
195 		return NULL;
196 
197 	hlist_for_each_entry(cl, p, q->hash + htb_hash(handle), hlist) {
198 		if (cl->classid == handle)
199 			return cl;
200 	}
201 	return NULL;
202 }
203 
204 /**
205  * htb_classify - classify a packet into class
206  *
207  * It returns NULL if the packet should be dropped or -1 if the packet
208  * should be passed directly through. In all other cases a leaf class is
209  * returned. We allow direct class selection by putting a classid into
210  * skb->priority. Then we examine filters in the qdisc and in inner nodes
211  * (if a higher filter points to an inner node). If we end up with classid
212  * MAJOR:0 we enqueue the skb into the special internal fifo (direct) and
213  * such packets then go directly through. If we still have no valid leaf we
214  * try to use the MAJOR:default leaf; if that too fails, we fall back to the direct queue.
215  */
216 #define HTB_DIRECT (struct htb_class*)-1
217 
218 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
219 				      int *qerr)
220 {
221 	struct htb_sched *q = qdisc_priv(sch);
222 	struct htb_class *cl;
223 	struct tcf_result res;
224 	struct tcf_proto *tcf;
225 	int result;
226 
227 	/* allow the class to be selected by setting skb->priority to a valid
228 	   classid; note that nfmark can be used too, by attaching an fw filter
229 	   with no rules in it */
230 	if (skb->priority == sch->handle)
231 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
232 	if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
233 		return cl;
234 
235 	*qerr = NET_XMIT_BYPASS;
236 	tcf = q->filter_list;
237 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
238 #ifdef CONFIG_NET_CLS_ACT
239 		switch (result) {
240 		case TC_ACT_QUEUED:
241 		case TC_ACT_STOLEN:
242 			*qerr = NET_XMIT_SUCCESS;
243 		case TC_ACT_SHOT:
244 			return NULL;
245 		}
246 #endif
247 		if ((cl = (void *)res.class) == NULL) {
248 			if (res.classid == sch->handle)
249 				return HTB_DIRECT;	/* X:0 (direct flow) */
250 			if ((cl = htb_find(res.classid, sch)) == NULL)
251 				break;	/* filter selected invalid classid */
252 		}
253 		if (!cl->level)
254 			return cl;	/* we hit leaf; return it */
255 
256 		/* we have got inner class; apply inner filter chain */
257 		tcf = cl->filter_list;
258 	}
259 	/* classification failed; try to use default class */
260 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
261 	if (!cl || cl->level)
262 		return HTB_DIRECT;	/* bad default .. this is safe bet */
263 	return cl;
264 }
265 
266 /**
267  * htb_add_to_id_tree - adds a class to the round-robin list
268  *
269  * Routine adds the class to the list (actually a tree) sorted by classid.
270  * Make sure the class is not already on such a list for the given prio.
271  */
272 static void htb_add_to_id_tree(struct rb_root *root,
273 			       struct htb_class *cl, int prio)
274 {
275 	struct rb_node **p = &root->rb_node, *parent = NULL;
276 
277 	while (*p) {
278 		struct htb_class *c;
279 		parent = *p;
280 		c = rb_entry(parent, struct htb_class, node[prio]);
281 
282 		if (cl->classid > c->classid)
283 			p = &parent->rb_right;
284 		else
285 			p = &parent->rb_left;
286 	}
287 	rb_link_node(&cl->node[prio], parent, p);
288 	rb_insert_color(&cl->node[prio], root);
289 }
290 
291 /**
292  * htb_add_to_wait_tree - adds class to the event queue with delay
293  *
294  * The class is added to the priority event queue to indicate that the class
295  * will change its mode at time cl->pq_key. Make sure that the class is not
296  * already in the queue.
297  */
298 static void htb_add_to_wait_tree(struct htb_sched *q,
299 				 struct htb_class *cl, long delay)
300 {
301 	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
302 
303 	cl->pq_key = q->now + delay;
304 	if (cl->pq_key == q->now)
305 		cl->pq_key++;
306 
307 	/* update the nearest event cache */
308 	if (q->near_ev_cache[cl->level] > cl->pq_key)
309 		q->near_ev_cache[cl->level] = cl->pq_key;
310 
311 	while (*p) {
312 		struct htb_class *c;
313 		parent = *p;
314 		c = rb_entry(parent, struct htb_class, pq_node);
315 		if (cl->pq_key >= c->pq_key)
316 			p = &parent->rb_right;
317 		else
318 			p = &parent->rb_left;
319 	}
320 	rb_link_node(&cl->pq_node, parent, p);
321 	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
322 }
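
/* The wait tree therefore acts as a per-level priority queue keyed by
 * pq_key: rb_first() yields the class with the earliest pending mode
 * change, which is exactly how htb_do_events() below consumes it:
 *
 *	struct rb_node *p = rb_first(&q->wait_pq[level]);
 *	struct htb_class *soonest = rb_entry(p, struct htb_class, pq_node);
 *	// soonest->pq_key is the earliest pending event time at this level
 */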
323 
324 /**
325  * htb_next_rb_node - finds next node in binary tree
326  *
327  * When we are past the last key, *n is set to NULL.
328  * Average complexity is 2 steps per call.
329  */
330 static inline void htb_next_rb_node(struct rb_node **n)
331 {
332 	*n = rb_next(*n);
333 }
334 
335 /**
336  * htb_add_class_to_row - add class to its row
337  *
338  * The class is added to row at priorities marked in mask.
339  * It does nothing if mask == 0.
340  */
341 static inline void htb_add_class_to_row(struct htb_sched *q,
342 					struct htb_class *cl, int mask)
343 {
344 	q->row_mask[cl->level] |= mask;
345 	while (mask) {
346 		int prio = ffz(~mask);
347 		mask &= ~(1 << prio);
348 		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
349 	}
350 }
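
/* The mask-walking loop above (also used by the activate/deactivate helpers
 * below) visits each set priority bit exactly once, lowest bit first.  A
 * minimal sketch of the same idiom in isolation:
 *
 *	unsigned long mask = 0x5;		// prios 0 and 2 are set
 *	while (mask) {
 *		int prio = ffz(~mask);		// lowest set bit: 0, then 2
 *		mask &= ~(1 << prio);		// clear it and continue
 *		// ... work on row/feed for this prio ...
 *	}
 */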
351 
352 /* If this triggers, it is a bug in this code, but it need not be fatal */
353 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
354 {
355 	if (RB_EMPTY_NODE(rb)) {
356 		WARN_ON(1);
357 	} else {
358 		rb_erase(rb, root);
359 		RB_CLEAR_NODE(rb);
360 	}
361 }
362 
363 
364 /**
365  * htb_remove_class_from_row - removes class from its row
366  *
367  * The class is removed from row at priorities marked in mask.
368  * It does nothing if mask == 0.
369  */
370 static inline void htb_remove_class_from_row(struct htb_sched *q,
371 						 struct htb_class *cl, int mask)
372 {
373 	int m = 0;
374 
375 	while (mask) {
376 		int prio = ffz(~mask);
377 
378 		mask &= ~(1 << prio);
379 		if (q->ptr[cl->level][prio] == cl->node + prio)
380 			htb_next_rb_node(q->ptr[cl->level] + prio);
381 
382 		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
383 		if (!q->row[cl->level][prio].rb_node)
384 			m |= 1 << prio;
385 	}
386 	q->row_mask[cl->level] &= ~m;
387 }
388 
389 /**
390  * htb_activate_prios - creates the active class's feed chain
391  *
392  * The class is connected to its ancestors and/or the appropriate rows
393  * for the priorities it participates in. cl->cmode must be the new
394  * (activated) mode. It does nothing if cl->prio_activity == 0.
395  */
396 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
397 {
398 	struct htb_class *p = cl->parent;
399 	long m, mask = cl->prio_activity;
400 
401 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
402 		m = mask;
403 		while (m) {
404 			int prio = ffz(~m);
405 			m &= ~(1 << prio);
406 
407 			if (p->un.inner.feed[prio].rb_node)
408 				/* parent already has its feed in use, so clear
409 				   the bit in mask as the parent is already ok */
410 				mask &= ~(1 << prio);
411 
412 			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
413 		}
414 		p->prio_activity |= mask;
415 		cl = p;
416 		p = cl->parent;
417 
418 	}
419 	if (cl->cmode == HTB_CAN_SEND && mask)
420 		htb_add_class_to_row(q, cl, mask);
421 }
422 
423 /**
424  * htb_deactivate_prios - removes the class from its feed chain
425  *
426  * cl->cmode must represent the old mode (before deactivation). It does
427  * nothing if cl->prio_activity == 0. The class is removed from all feed
428  * chains and rows.
429  */
430 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
431 {
432 	struct htb_class *p = cl->parent;
433 	long m, mask = cl->prio_activity;
434 
435 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
436 		m = mask;
437 		mask = 0;
438 		while (m) {
439 			int prio = ffz(~m);
440 			m &= ~(1 << prio);
441 
442 			if (p->un.inner.ptr[prio] == cl->node + prio) {
443 				/* we are removing a child that the parent's feed
444 				   pointer points to - forget the pointer but
445 				   remember the classid */
446 				p->un.inner.last_ptr_id[prio] = cl->classid;
447 				p->un.inner.ptr[prio] = NULL;
448 			}
449 
450 			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
451 
452 			if (!p->un.inner.feed[prio].rb_node)
453 				mask |= 1 << prio;
454 		}
455 
456 		p->prio_activity &= ~mask;
457 		cl = p;
458 		p = cl->parent;
459 
460 	}
461 	if (cl->cmode == HTB_CAN_SEND && mask)
462 		htb_remove_class_from_row(q, cl, mask);
463 }
464 
465 #if HTB_HYSTERESIS
466 static inline long htb_lowater(const struct htb_class *cl)
467 {
468 	return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
469 }
470 static inline long htb_hiwater(const struct htb_class *cl)
471 {
472 	return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
473 }
474 #else
475 #define htb_lowater(cl)	(0)
476 #define htb_hiwater(cl)	(0)
477 #endif
478 
479 /**
480  * htb_class_mode - computes and returns current class mode
481  *
482  * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
483  * is not HTB_CAN_SEND then *diff is updated to the time difference
484  * from now to the time when cl will change its state.
485  * Also it is worth noting that the class mode doesn't change simply
486  * at cl->{c,}tokens == 0 but rather with a hysteresis in the
487  * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
488  * mode transitions per time unit. The speed gain is about 1/6.
489  */
490 static inline enum htb_cmode
491 htb_class_mode(struct htb_class *cl, long *diff)
492 {
493 	long toks;
494 
495 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
496 		*diff = -toks;
497 		return HTB_CANT_SEND;
498 	}
499 
500 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
501 		return HTB_CAN_SEND;
502 
503 	*diff = -toks;
504 	return HTB_MAY_BORROW;
505 }
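
/* A small worked example, ignoring hysteresis (i.e. treating both
 * watermarks as 0): with cl->ctokens = 20, cl->tokens = -5 and *diff = 2
 * the ceil bucket is fine (20 + 2 >= 0) but the rate bucket is not
 * (-5 + 2 = -3 < 0), so the function returns HTB_MAY_BORROW and sets
 * *diff = 3, i.e. the number of tokens the class is still short of.
 */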
506 
507 /**
508  * htb_change_class_mode - changes the class's mode
509  *
510  * This should be the only way to change a class's mode under normal
511  * circumstances. The routine will update the feed list linkage, change the
512  * mode and add the class to the wait event queue if appropriate. The new
513  * mode should be different from the old one, and cl->pq_key has to be valid
514  * if changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
515  */
516 static void
517 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
518 {
519 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
520 
521 	if (new_mode == cl->cmode)
522 		return;
523 
524 	if (cl->prio_activity) {	/* not necessary: speed optimization */
525 		if (cl->cmode != HTB_CANT_SEND)
526 			htb_deactivate_prios(q, cl);
527 		cl->cmode = new_mode;
528 		if (new_mode != HTB_CANT_SEND)
529 			htb_activate_prios(q, cl);
530 	} else
531 		cl->cmode = new_mode;
532 }
533 
534 /**
535  * htb_activate - inserts leaf cl into appropriate active feeds
536  *
537  * Routine learns the (new) priority of the leaf and activates the feed
538  * chain for that prio. It can safely be called on an already active leaf.
539  * It also adds the leaf to the drop list.
540  */
541 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
542 {
543 	BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen);
544 
545 	if (!cl->prio_activity) {
546 		cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio);
547 		htb_activate_prios(q, cl);
548 		list_add_tail(&cl->un.leaf.drop_list,
549 			      q->drops + cl->un.leaf.aprio);
550 	}
551 }
552 
553 /**
554  * htb_deactivate - remove leaf cl from active feeds
555  *
556  * Make sure that the leaf is active. In other words, it can't be called
557  * on a non-active leaf. It also removes the class from the drop list.
558  */
559 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
560 {
561 	BUG_TRAP(cl->prio_activity);
562 
563 	htb_deactivate_prios(q, cl);
564 	cl->prio_activity = 0;
565 	list_del_init(&cl->un.leaf.drop_list);
566 }
567 
568 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
569 {
570 	int ret;
571 	struct htb_sched *q = qdisc_priv(sch);
572 	struct htb_class *cl = htb_classify(skb, sch, &ret);
573 
574 	if (cl == HTB_DIRECT) {
575 		/* enqueue to helper queue */
576 		if (q->direct_queue.qlen < q->direct_qlen) {
577 			__skb_queue_tail(&q->direct_queue, skb);
578 			q->direct_pkts++;
579 		} else {
580 			kfree_skb(skb);
581 			sch->qstats.drops++;
582 			return NET_XMIT_DROP;
583 		}
584 #ifdef CONFIG_NET_CLS_ACT
585 	} else if (!cl) {
586 		if (ret == NET_XMIT_BYPASS)
587 			sch->qstats.drops++;
588 		kfree_skb(skb);
589 		return ret;
590 #endif
591 	} else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) !=
592 		   NET_XMIT_SUCCESS) {
593 		sch->qstats.drops++;
594 		cl->qstats.drops++;
595 		return NET_XMIT_DROP;
596 	} else {
597 		cl->bstats.packets +=
598 			skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
599 		cl->bstats.bytes += skb->len;
600 		htb_activate(q, cl);
601 	}
602 
603 	sch->q.qlen++;
604 	sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
605 	sch->bstats.bytes += skb->len;
606 	return NET_XMIT_SUCCESS;
607 }
608 
609 /* TODO: requeuing packet charges it to policers again !! */
610 static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch)
611 {
612 	int ret;
613 	struct htb_sched *q = qdisc_priv(sch);
614 	struct htb_class *cl = htb_classify(skb, sch, &ret);
615 	struct sk_buff *tskb;
616 
617 	if (cl == HTB_DIRECT) {
618 		/* enqueue to helper queue */
619 		if (q->direct_queue.qlen < q->direct_qlen) {
620 			__skb_queue_head(&q->direct_queue, skb);
621 		} else {
622 			__skb_queue_head(&q->direct_queue, skb);
623 			tskb = __skb_dequeue_tail(&q->direct_queue);
624 			kfree_skb(tskb);
625 			sch->qstats.drops++;
626 			return NET_XMIT_CN;
627 		}
628 #ifdef CONFIG_NET_CLS_ACT
629 	} else if (!cl) {
630 		if (ret == NET_XMIT_BYPASS)
631 			sch->qstats.drops++;
632 		kfree_skb(skb);
633 		return ret;
634 #endif
635 	} else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) !=
636 		   NET_XMIT_SUCCESS) {
637 		sch->qstats.drops++;
638 		cl->qstats.drops++;
639 		return NET_XMIT_DROP;
640 	} else
641 		htb_activate(q, cl);
642 
643 	sch->q.qlen++;
644 	sch->qstats.requeues++;
645 	return NET_XMIT_SUCCESS;
646 }
647 
648 /**
649  * htb_charge_class - charges amount "bytes" to leaf and ancestors
650  *
651  * Routine assumes that a packet "bytes" long was dequeued from leaf cl,
652  * borrowing from "level". It accounts the bytes to the ceil leaky bucket
653  * for the leaf and all ancestors, and to the rate bucket for ancestors at
654  * levels "level" and higher. It also handles a possible change of mode
655  * resulting from the update. Note that the mode can also increase here
656  * (MAY_BORROW to CAN_SEND) because we can use a more precise clock than the
657  * event queue here. In such a case we remove the class from the event queue first.
658  */
659 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
660 			     int level, struct sk_buff *skb)
661 {
662 	int bytes = skb->len;
663 	long toks, diff;
664 	enum htb_cmode old_mode;
665 
666 #define HTB_ACCNT(T,B,R) toks = diff + cl->T; \
667 	if (toks > cl->B) toks = cl->B; \
668 	toks -= L2T(cl, cl->R, bytes); \
669 	if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \
670 	cl->T = toks
671 
672 	while (cl) {
673 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
674 		if (cl->level >= level) {
675 			if (cl->level == level)
676 				cl->xstats.lends++;
677 			HTB_ACCNT(tokens, buffer, rate);
678 		} else {
679 			cl->xstats.borrows++;
680 			cl->tokens += diff;	/* we moved t_c; update tokens */
681 		}
682 		HTB_ACCNT(ctokens, cbuffer, ceil);
683 		cl->t_c = q->now;
684 
685 		old_mode = cl->cmode;
686 		diff = 0;
687 		htb_change_class_mode(q, cl, &diff);
688 		if (old_mode != cl->cmode) {
689 			if (old_mode != HTB_CAN_SEND)
690 				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
691 			if (cl->cmode != HTB_CAN_SEND)
692 				htb_add_to_wait_tree(q, cl, diff);
693 		}
694 
695 		/* update byte stats except for leaves which are already updated */
696 		if (cl->level) {
697 			cl->bstats.bytes += bytes;
698 			cl->bstats.packets += skb_is_gso(skb)?
699 					skb_shinfo(skb)->gso_segs:1;
700 		}
701 		cl = cl->parent;
702 	}
703 }
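
/* The HTB_ACCNT() macro above is the usual token bucket update; written out
 * for the rate bucket it is, roughly (a sketch that leaves out the clamping
 * to -cl->mbuffer):
 *
 *	cl->tokens = min(cl->tokens + diff, cl->buffer)
 *			- L2T(cl, cl->rate, bytes);
 *
 * i.e. refill by the psched time elapsed since the last checkpoint, cap at
 * the configured burst (bucket depth), then pay the transmission time of
 * "bytes" at the class's rate.
 */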
704 
705 /**
706  * htb_do_events - make mode changes to classes at the level
707  *
708  * Scans the event queue for pending events and applies them. Returns the
709  * time of the next pending event (0 if the pq is empty).
710  * Note: only events with cl->pq_key <= q->now are applied.
711  */
712 static psched_time_t htb_do_events(struct htb_sched *q, int level)
713 {
714 	int i;
715 
716 	for (i = 0; i < 500; i++) {
717 		struct htb_class *cl;
718 		long diff;
719 		struct rb_node *p = rb_first(&q->wait_pq[level]);
720 
721 		if (!p)
722 			return 0;
723 
724 		cl = rb_entry(p, struct htb_class, pq_node);
725 		if (cl->pq_key > q->now)
726 			return cl->pq_key;
727 
728 		htb_safe_rb_erase(p, q->wait_pq + level);
729 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
730 		htb_change_class_mode(q, cl, &diff);
731 		if (cl->cmode != HTB_CAN_SEND)
732 			htb_add_to_wait_tree(q, cl, diff);
733 	}
734 	if (net_ratelimit())
735 		printk(KERN_WARNING "htb: too many events !\n");
736 	return q->now + PSCHED_TICKS_PER_SEC / 10;
737 }
738 
739 /* Returns class->node+prio from the id-tree where the class's id is >= id.
740    NULL if no such one exists. */
741 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
742 					      u32 id)
743 {
744 	struct rb_node *r = NULL;
745 	while (n) {
746 		struct htb_class *cl =
747 		    rb_entry(n, struct htb_class, node[prio]);
748 		if (id == cl->classid)
749 			return n;
750 
751 		if (id > cl->classid) {
752 			n = n->rb_right;
753 		} else {
754 			r = n;
755 			n = n->rb_left;
756 		}
757 	}
758 	return r;
759 }
760 
761 /**
762  * htb_lookup_leaf - returns next leaf class in DRR order
763  *
764  * Find the leaf the current feed pointer points to.
765  */
766 static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
767 					 struct rb_node **pptr, u32 * pid)
768 {
769 	int i;
770 	struct {
771 		struct rb_node *root;
772 		struct rb_node **pptr;
773 		u32 *pid;
774 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
775 
776 	BUG_TRAP(tree->rb_node);
777 	sp->root = tree->rb_node;
778 	sp->pptr = pptr;
779 	sp->pid = pid;
780 
781 	for (i = 0; i < 65535; i++) {
782 		if (!*sp->pptr && *sp->pid) {
783 			/* ptr was invalidated but id is valid - try to recover
784 			   the original or next ptr */
785 			*sp->pptr =
786 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
787 		}
788 		*sp->pid = 0;	/* ptr is valid now, so drop this hint as it
789 				   can become out of date quickly */
790 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
791 			*sp->pptr = sp->root;
792 			while ((*sp->pptr)->rb_left)
793 				*sp->pptr = (*sp->pptr)->rb_left;
794 			if (sp > stk) {
795 				sp--;
796 				BUG_TRAP(*sp->pptr);
797 				if (!*sp->pptr)
798 					return NULL;
799 				htb_next_rb_node(sp->pptr);
800 			}
801 		} else {
802 			struct htb_class *cl;
803 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
804 			if (!cl->level)
805 				return cl;
806 			(++sp)->root = cl->un.inner.feed[prio].rb_node;
807 			sp->pptr = cl->un.inner.ptr + prio;
808 			sp->pid = cl->un.inner.last_ptr_id + prio;
809 		}
810 	}
811 	BUG_TRAP(0);
812 	return NULL;
813 }
814 
815 /* dequeues packet at given priority and level; call only if
816    you are sure that there is active class at prio/level */
817 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
818 					int level)
819 {
820 	struct sk_buff *skb = NULL;
821 	struct htb_class *cl, *start;
822 	/* look initial class up in the row */
823 	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
824 				     q->ptr[level] + prio,
825 				     q->last_ptr_id[level] + prio);
826 
827 	do {
828 next:
829 		BUG_TRAP(cl);
830 		if (!cl)
831 			return NULL;
832 
833 		/* class can be empty - it is unlikely but can be true if leaf
834 		   qdisc drops packets in enqueue routine or if someone used
835 		   graft operation on the leaf since last dequeue;
836 		   simply deactivate and skip such class */
837 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
838 			struct htb_class *next;
839 			htb_deactivate(q, cl);
840 
841 			/* row/level might become empty */
842 			if ((q->row_mask[level] & (1 << prio)) == 0)
843 				return NULL;
844 
845 			next = htb_lookup_leaf(q->row[level] + prio,
846 					       prio, q->ptr[level] + prio,
847 					       q->last_ptr_id[level] + prio);
848 
849 			if (cl == start)	/* fix start if we just deleted it */
850 				start = next;
851 			cl = next;
852 			goto next;
853 		}
854 
855 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
856 		if (likely(skb != NULL))
857 			break;
858 		if (!cl->warned) {
859 			printk(KERN_WARNING
860 			       "htb: class %X isn't work conserving ?!\n",
861 			       cl->classid);
862 			cl->warned = 1;
863 		}
864 		q->nwc_hit++;
865 		htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
866 				  ptr[0]) + prio);
867 		cl = htb_lookup_leaf(q->row[level] + prio, prio,
868 				     q->ptr[level] + prio,
869 				     q->last_ptr_id[level] + prio);
870 
871 	} while (cl != start);
872 
873 	if (likely(skb != NULL)) {
874 		if ((cl->un.leaf.deficit[level] -= skb->len) < 0) {
875 			cl->un.leaf.deficit[level] += cl->un.leaf.quantum;
876 			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
877 					  ptr[0]) + prio);
878 		}
879 		/* this used to be after charge_class but this ordering
880 		   gives us slightly better performance */
881 		if (!cl->un.leaf.q->q.qlen)
882 			htb_deactivate(q, cl);
883 		htb_charge_class(q, cl, level, skb);
884 	}
885 	return skb;
886 }
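
/* The deficit handling above is plain deficit round robin: a leaf keeps the
 * feed pointer (and thus keeps being picked) until its per-level deficit
 * goes negative, at which point the deficit is topped up by the quantum and
 * the pointer moves on.  For example, with quantum 1000 and 600-byte
 * packets the deficit goes 0 -> -600 (refill to 400, advance), then on the
 * next visit 400 -> -200 (refill to 800, advance), so over many rounds each
 * active leaf gets bandwidth proportional to its quantum.
 */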
887 
888 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
889 {
890 	struct sk_buff *skb = NULL;
891 	struct htb_sched *q = qdisc_priv(sch);
892 	int level;
893 	psched_time_t next_event;
894 
895 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
896 	skb = __skb_dequeue(&q->direct_queue);
897 	if (skb != NULL) {
898 		sch->flags &= ~TCQ_F_THROTTLED;
899 		sch->q.qlen--;
900 		return skb;
901 	}
902 
903 	if (!sch->q.qlen)
904 		goto fin;
905 	q->now = psched_get_time();
906 
907 	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
908 	q->nwc_hit = 0;
909 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
910 		/* common case optimization - skip event handler quickly */
911 		int m;
912 		psched_time_t event;
913 
914 		if (q->now >= q->near_ev_cache[level]) {
915 			event = htb_do_events(q, level);
916 			if (!event)
917 				event = q->now + PSCHED_TICKS_PER_SEC;
918 			q->near_ev_cache[level] = event;
919 		} else
920 			event = q->near_ev_cache[level];
921 
922 		if (event && next_event > event)
923 			next_event = event;
924 
925 		m = ~q->row_mask[level];
926 		while (m != (int)(-1)) {
927 			int prio = ffz(m);
928 			m |= 1 << prio;
929 			skb = htb_dequeue_tree(q, prio, level);
930 			if (likely(skb != NULL)) {
931 				sch->q.qlen--;
932 				sch->flags &= ~TCQ_F_THROTTLED;
933 				goto fin;
934 			}
935 		}
936 	}
937 	sch->qstats.overlimits++;
938 	qdisc_watchdog_schedule(&q->watchdog, next_event);
939 fin:
940 	return skb;
941 }
942 
943 /* try to drop from each class (by prio) until one succeeds */
944 static unsigned int htb_drop(struct Qdisc *sch)
945 {
946 	struct htb_sched *q = qdisc_priv(sch);
947 	int prio;
948 
949 	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
950 		struct list_head *p;
951 		list_for_each(p, q->drops + prio) {
952 			struct htb_class *cl = list_entry(p, struct htb_class,
953 							  un.leaf.drop_list);
954 			unsigned int len;
955 			if (cl->un.leaf.q->ops->drop &&
956 			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
957 				sch->q.qlen--;
958 				if (!cl->un.leaf.q->q.qlen)
959 					htb_deactivate(q, cl);
960 				return len;
961 			}
962 		}
963 	}
964 	return 0;
965 }
966 
967 /* reset all classes */
968 /* always called under BH & queue lock */
969 static void htb_reset(struct Qdisc *sch)
970 {
971 	struct htb_sched *q = qdisc_priv(sch);
972 	int i;
973 
974 	for (i = 0; i < HTB_HSIZE; i++) {
975 		struct hlist_node *p;
976 		struct htb_class *cl;
977 
978 		hlist_for_each_entry(cl, p, q->hash + i, hlist) {
979 			if (cl->level)
980 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
981 			else {
982 				if (cl->un.leaf.q)
983 					qdisc_reset(cl->un.leaf.q);
984 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
985 			}
986 			cl->prio_activity = 0;
987 			cl->cmode = HTB_CAN_SEND;
988 
989 		}
990 	}
991 	qdisc_watchdog_cancel(&q->watchdog);
992 	__skb_queue_purge(&q->direct_queue);
993 	sch->q.qlen = 0;
994 	memset(q->row, 0, sizeof(q->row));
995 	memset(q->row_mask, 0, sizeof(q->row_mask));
996 	memset(q->wait_pq, 0, sizeof(q->wait_pq));
997 	memset(q->ptr, 0, sizeof(q->ptr));
998 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
999 		INIT_LIST_HEAD(q->drops + i);
1000 }
1001 
1002 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1003 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1004 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1005 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1006 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1007 };
1008 
1009 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
1010 {
1011 	struct htb_sched *q = qdisc_priv(sch);
1012 	struct nlattr *tb[TCA_HTB_INIT + 1];
1013 	struct tc_htb_glob *gopt;
1014 	int err;
1015 	int i;
1016 
1017 	if (!opt)
1018 		return -EINVAL;
1019 
1020 	err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);
1021 	if (err < 0)
1022 		return err;
1023 
1024 	if (tb[TCA_HTB_INIT] == NULL) {
1025 		printk(KERN_ERR "HTB: hey, you probably have a bad tc tool?\n");
1026 		return -EINVAL;
1027 	}
1028 	gopt = nla_data(tb[TCA_HTB_INIT]);
1029 	if (gopt->version != HTB_VER >> 16) {
1030 		printk(KERN_ERR
1031 		       "HTB: need tc/htb version %d (minor is %d), you have %d\n",
1032 		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
1033 		return -EINVAL;
1034 	}
1035 
1036 	INIT_LIST_HEAD(&q->root);
1037 	for (i = 0; i < HTB_HSIZE; i++)
1038 		INIT_HLIST_HEAD(q->hash + i);
1039 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1040 		INIT_LIST_HEAD(q->drops + i);
1041 
1042 	qdisc_watchdog_init(&q->watchdog, sch);
1043 	skb_queue_head_init(&q->direct_queue);
1044 
1045 	q->direct_qlen = sch->dev->tx_queue_len;
1046 	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
1047 		q->direct_qlen = 2;
1048 
1049 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1050 		q->rate2quantum = 1;
1051 	q->defcls = gopt->defcls;
1052 
1053 	return 0;
1054 }
1055 
1056 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1057 {
1058 	struct htb_sched *q = qdisc_priv(sch);
1059 	struct nlattr *nest;
1060 	struct tc_htb_glob gopt;
1061 
1062 	spin_lock_bh(&sch->dev->queue_lock);
1063 
1064 	gopt.direct_pkts = q->direct_pkts;
1065 	gopt.version = HTB_VER;
1066 	gopt.rate2quantum = q->rate2quantum;
1067 	gopt.defcls = q->defcls;
1068 	gopt.debug = 0;
1069 
1070 	nest = nla_nest_start(skb, TCA_OPTIONS);
1071 	if (nest == NULL)
1072 		goto nla_put_failure;
1073 	NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
1074 	nla_nest_end(skb, nest);
1075 
1076 	spin_unlock_bh(&sch->dev->queue_lock);
1077 	return skb->len;
1078 
1079 nla_put_failure:
1080 	spin_unlock_bh(&sch->dev->queue_lock);
1081 	nla_nest_cancel(skb, nest);
1082 	return -1;
1083 }
1084 
1085 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1086 			  struct sk_buff *skb, struct tcmsg *tcm)
1087 {
1088 	struct htb_class *cl = (struct htb_class *)arg;
1089 	struct nlattr *nest;
1090 	struct tc_htb_opt opt;
1091 
1092 	spin_lock_bh(&sch->dev->queue_lock);
1093 	tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT;
1094 	tcm->tcm_handle = cl->classid;
1095 	if (!cl->level && cl->un.leaf.q)
1096 		tcm->tcm_info = cl->un.leaf.q->handle;
1097 
1098 	nest = nla_nest_start(skb, TCA_OPTIONS);
1099 	if (nest == NULL)
1100 		goto nla_put_failure;
1101 
1102 	memset(&opt, 0, sizeof(opt));
1103 
1104 	opt.rate = cl->rate->rate;
1105 	opt.buffer = cl->buffer;
1106 	opt.ceil = cl->ceil->rate;
1107 	opt.cbuffer = cl->cbuffer;
1108 	opt.quantum = cl->un.leaf.quantum;
1109 	opt.prio = cl->un.leaf.prio;
1110 	opt.level = cl->level;
1111 	NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
1112 
1113 	nla_nest_end(skb, nest);
1114 	spin_unlock_bh(&sch->dev->queue_lock);
1115 	return skb->len;
1116 
1117 nla_put_failure:
1118 	spin_unlock_bh(&sch->dev->queue_lock);
1119 	nla_nest_cancel(skb, nest);
1120 	return -1;
1121 }
1122 
1123 static int
1124 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1125 {
1126 	struct htb_class *cl = (struct htb_class *)arg;
1127 
1128 	if (!cl->level && cl->un.leaf.q)
1129 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1130 	cl->xstats.tokens = cl->tokens;
1131 	cl->xstats.ctokens = cl->ctokens;
1132 
1133 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1134 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1135 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
1136 		return -1;
1137 
1138 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1139 }
1140 
1141 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1142 		     struct Qdisc **old)
1143 {
1144 	struct htb_class *cl = (struct htb_class *)arg;
1145 
1146 	if (cl && !cl->level) {
1147 		if (new == NULL &&
1148 		    (new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1149 					     cl->classid))
1150 		    == NULL)
1151 			return -ENOBUFS;
1152 		sch_tree_lock(sch);
1153 		if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) {
1154 			qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1155 			qdisc_reset(*old);
1156 		}
1157 		sch_tree_unlock(sch);
1158 		return 0;
1159 	}
1160 	return -ENOENT;
1161 }
1162 
1163 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1164 {
1165 	struct htb_class *cl = (struct htb_class *)arg;
1166 	return (cl && !cl->level) ? cl->un.leaf.q : NULL;
1167 }
1168 
1169 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1170 {
1171 	struct htb_class *cl = (struct htb_class *)arg;
1172 
1173 	if (cl->un.leaf.q->q.qlen == 0)
1174 		htb_deactivate(qdisc_priv(sch), cl);
1175 }
1176 
1177 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1178 {
1179 	struct htb_class *cl = htb_find(classid, sch);
1180 	if (cl)
1181 		cl->refcnt++;
1182 	return (unsigned long)cl;
1183 }
1184 
1185 static inline int htb_parent_last_child(struct htb_class *cl)
1186 {
1187 	if (!cl->parent)
1188 		/* the root class */
1189 		return 0;
1190 
1191 	if (!(cl->parent->children.next == &cl->sibling &&
1192 		cl->parent->children.prev == &cl->sibling))
1193 		/* not the last child */
1194 		return 0;
1195 
1196 	return 1;
1197 }
1198 
1199 static void htb_parent_to_leaf(struct htb_class *cl, struct Qdisc *new_q)
1200 {
1201 	struct htb_class *parent = cl->parent;
1202 
1203 	BUG_TRAP(!cl->level && cl->un.leaf.q && !cl->prio_activity);
1204 
1205 	parent->level = 0;
1206 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1207 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1208 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1209 	parent->un.leaf.quantum = parent->quantum;
1210 	parent->un.leaf.prio = parent->prio;
1211 	parent->tokens = parent->buffer;
1212 	parent->ctokens = parent->cbuffer;
1213 	parent->t_c = psched_get_time();
1214 	parent->cmode = HTB_CAN_SEND;
1215 }
1216 
1217 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1218 {
1219 	struct htb_sched *q = qdisc_priv(sch);
1220 
1221 	if (!cl->level) {
1222 		BUG_TRAP(cl->un.leaf.q);
1223 		qdisc_destroy(cl->un.leaf.q);
1224 	}
1225 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1226 	qdisc_put_rtab(cl->rate);
1227 	qdisc_put_rtab(cl->ceil);
1228 
1229 	tcf_destroy_chain(cl->filter_list);
1230 
1231 	while (!list_empty(&cl->children))
1232 		htb_destroy_class(sch, list_entry(cl->children.next,
1233 						  struct htb_class, sibling));
1234 
1235 	/* note: this delete may happen twice (see htb_delete) */
1236 	hlist_del_init(&cl->hlist);
1237 	list_del(&cl->sibling);
1238 
1239 	if (cl->prio_activity)
1240 		htb_deactivate(q, cl);
1241 
1242 	if (cl->cmode != HTB_CAN_SEND)
1243 		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
1244 
1245 	kfree(cl);
1246 }
1247 
1248 /* always called under BH & queue lock */
1249 static void htb_destroy(struct Qdisc *sch)
1250 {
1251 	struct htb_sched *q = qdisc_priv(sch);
1252 
1253 	qdisc_watchdog_cancel(&q->watchdog);
1254 	/* This line used to be after htb_destroy_class call below
1255 	   and surprisingly it worked in 2.4. But it must precede it
1256 	   because filters need their target class alive to be able to call
1257 	   unbind_filter on it (without an Oops). */
1258 	tcf_destroy_chain(q->filter_list);
1259 
1260 	while (!list_empty(&q->root))
1261 		htb_destroy_class(sch, list_entry(q->root.next,
1262 						  struct htb_class, sibling));
1263 
1264 	__skb_queue_purge(&q->direct_queue);
1265 }
1266 
1267 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1268 {
1269 	struct htb_sched *q = qdisc_priv(sch);
1270 	struct htb_class *cl = (struct htb_class *)arg;
1271 	unsigned int qlen;
1272 	struct Qdisc *new_q = NULL;
1273 	int last_child = 0;
1274 
1275 	// TODO: why don't we allow deleting a subtree? references? does the
1276 	// tc subsystem guarantee us that in htb_destroy it holds no class
1277 	// refs, so that we can remove children safely there?
1278 	if (!list_empty(&cl->children) || cl->filter_cnt)
1279 		return -EBUSY;
1280 
1281 	if (!cl->level && htb_parent_last_child(cl)) {
1282 		new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1283 						cl->parent->classid);
1284 		last_child = 1;
1285 	}
1286 
1287 	sch_tree_lock(sch);
1288 
1289 	if (!cl->level) {
1290 		qlen = cl->un.leaf.q->q.qlen;
1291 		qdisc_reset(cl->un.leaf.q);
1292 		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
1293 	}
1294 
1295 	/* delete from hash and active; remainder in destroy_class */
1296 	hlist_del_init(&cl->hlist);
1297 
1298 	if (cl->prio_activity)
1299 		htb_deactivate(q, cl);
1300 
1301 	if (last_child)
1302 		htb_parent_to_leaf(cl, new_q);
1303 
1304 	if (--cl->refcnt == 0)
1305 		htb_destroy_class(sch, cl);
1306 
1307 	sch_tree_unlock(sch);
1308 	return 0;
1309 }
1310 
1311 static void htb_put(struct Qdisc *sch, unsigned long arg)
1312 {
1313 	struct htb_class *cl = (struct htb_class *)arg;
1314 
1315 	if (--cl->refcnt == 0)
1316 		htb_destroy_class(sch, cl);
1317 }
1318 
1319 static int htb_change_class(struct Qdisc *sch, u32 classid,
1320 			    u32 parentid, struct nlattr **tca,
1321 			    unsigned long *arg)
1322 {
1323 	int err = -EINVAL;
1324 	struct htb_sched *q = qdisc_priv(sch);
1325 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1326 	struct nlattr *opt = tca[TCA_OPTIONS];
1327 	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
1328 	struct nlattr *tb[TCA_HTB_RTAB + 1];
1329 	struct tc_htb_opt *hopt;
1330 
1331 	/* extract all subattrs from opt attr */
1332 	if (!opt)
1333 		goto failure;
1334 
1335 	err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy);
1336 	if (err < 0)
1337 		goto failure;
1338 
1339 	err = -EINVAL;
1340 	if (tb[TCA_HTB_PARMS] == NULL)
1341 		goto failure;
1342 
1343 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1344 
1345 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1346 
1347 	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
1348 	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
1349 	if (!rtab || !ctab)
1350 		goto failure;
1351 
1352 	if (!cl) {		/* new class */
1353 		struct Qdisc *new_q;
1354 		int prio;
1355 		struct {
1356 			struct nlattr		nla;
1357 			struct gnet_estimator	opt;
1358 		} est = {
1359 			.nla = {
1360 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1361 				.nla_type	= TCA_RATE,
1362 			},
1363 			.opt = {
1364 				/* 4s interval, 16s averaging constant */
1365 				.interval	= 2,
1366 				.ewma_log	= 2,
1367 			},
1368 		};
1369 
1370 		/* check for valid classid */
1371 		if (!classid || TC_H_MAJ(classid ^ sch->handle)
1372 		    || htb_find(classid, sch))
1373 			goto failure;
1374 
1375 		/* check maximal depth */
1376 		if (parent && parent->parent && parent->parent->level < 2) {
1377 			printk(KERN_ERR "htb: tree is too deep\n");
1378 			goto failure;
1379 		}
1380 		err = -ENOBUFS;
1381 		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
1382 			goto failure;
1383 
1384 		gen_new_estimator(&cl->bstats, &cl->rate_est,
1385 				  &sch->dev->queue_lock,
1386 				  tca[TCA_RATE] ? : &est.nla);
1387 		cl->refcnt = 1;
1388 		INIT_LIST_HEAD(&cl->sibling);
1389 		INIT_HLIST_NODE(&cl->hlist);
1390 		INIT_LIST_HEAD(&cl->children);
1391 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1392 		RB_CLEAR_NODE(&cl->pq_node);
1393 
1394 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1395 			RB_CLEAR_NODE(&cl->node[prio]);
1396 
1397 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1398 		   so it can't be created inside of sch_tree_lock
1399 		   -- thanks to Karlis Peisenieks */
1400 		new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid);
1401 		sch_tree_lock(sch);
1402 		if (parent && !parent->level) {
1403 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1404 
1405 			/* turn parent into inner node */
1406 			qdisc_reset(parent->un.leaf.q);
1407 			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
1408 			qdisc_destroy(parent->un.leaf.q);
1409 			if (parent->prio_activity)
1410 				htb_deactivate(q, parent);
1411 
1412 			/* remove from evt list because of level change */
1413 			if (parent->cmode != HTB_CAN_SEND) {
1414 				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
1415 				parent->cmode = HTB_CAN_SEND;
1416 			}
1417 			parent->level = (parent->parent ? parent->parent->level
1418 					 : TC_HTB_MAXDEPTH) - 1;
1419 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1420 		}
1421 		/* leaf (we) needs elementary qdisc */
1422 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1423 
1424 		cl->classid = classid;
1425 		cl->parent = parent;
1426 
1427 		/* set class to be in HTB_CAN_SEND state */
1428 		cl->tokens = hopt->buffer;
1429 		cl->ctokens = hopt->cbuffer;
1430 		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
1431 		cl->t_c = psched_get_time();
1432 		cl->cmode = HTB_CAN_SEND;
1433 
1434 		/* attach to the hash list and parent's family */
1435 		hlist_add_head(&cl->hlist, q->hash + htb_hash(classid));
1436 		list_add_tail(&cl->sibling,
1437 			      parent ? &parent->children : &q->root);
1438 	} else {
1439 		if (tca[TCA_RATE])
1440 			gen_replace_estimator(&cl->bstats, &cl->rate_est,
1441 					      &sch->dev->queue_lock,
1442 					      tca[TCA_RATE]);
1443 		sch_tree_lock(sch);
1444 	}
1445 
1446 	/* there used to be a nasty bug here: we have to check that the node
1447 	   is really a leaf before changing cl->un.leaf ! */
1448 	if (!cl->level) {
1449 		cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum;
1450 		if (!hopt->quantum && cl->un.leaf.quantum < 1000) {
1451 			printk(KERN_WARNING
1452 			       "HTB: quantum of class %X is small. Consider r2q change.\n",
1453 			       cl->classid);
1454 			cl->un.leaf.quantum = 1000;
1455 		}
1456 		if (!hopt->quantum && cl->un.leaf.quantum > 200000) {
1457 			printk(KERN_WARNING
1458 			       "HTB: quantum of class %X is big. Consider r2q change.\n",
1459 			       cl->classid);
1460 			cl->un.leaf.quantum = 200000;
1461 		}
1462 		if (hopt->quantum)
1463 			cl->un.leaf.quantum = hopt->quantum;
1464 		if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO)
1465 			cl->un.leaf.prio = TC_HTB_NUMPRIO - 1;
1466 
1467 		/* backup for htb_parent_to_leaf */
1468 		cl->quantum = cl->un.leaf.quantum;
1469 		cl->prio = cl->un.leaf.prio;
1470 	}
1471 
1472 	cl->buffer = hopt->buffer;
1473 	cl->cbuffer = hopt->cbuffer;
1474 	if (cl->rate)
1475 		qdisc_put_rtab(cl->rate);
1476 	cl->rate = rtab;
1477 	if (cl->ceil)
1478 		qdisc_put_rtab(cl->ceil);
1479 	cl->ceil = ctab;
1480 	sch_tree_unlock(sch);
1481 
1482 	*arg = (unsigned long)cl;
1483 	return 0;
1484 
1485 failure:
1486 	if (rtab)
1487 		qdisc_put_rtab(rtab);
1488 	if (ctab)
1489 		qdisc_put_rtab(ctab);
1490 	return err;
1491 }
1492 
1493 static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1494 {
1495 	struct htb_sched *q = qdisc_priv(sch);
1496 	struct htb_class *cl = (struct htb_class *)arg;
1497 	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1498 
1499 	return fl;
1500 }
1501 
1502 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1503 				     u32 classid)
1504 {
1505 	struct htb_sched *q = qdisc_priv(sch);
1506 	struct htb_class *cl = htb_find(classid, sch);
1507 
1508 	/*if (cl && !cl->level) return 0;
1509 	   The line above used to be there to prevent attaching filters to
1510 	   leaves. But at least the tc_index filter uses this just to get the
1511 	   class for other reasons, so we have to allow it.
1512 	   ----
1513 	   19.6.2002 As Werner explained it is ok - bind filter is just
1514 	   another way to "lock" the class - unlike "get" this lock can
1515 	   be broken by class during destroy IIUC.
1516 	 */
1517 	if (cl)
1518 		cl->filter_cnt++;
1519 	else
1520 		q->filter_cnt++;
1521 	return (unsigned long)cl;
1522 }
1523 
1524 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1525 {
1526 	struct htb_sched *q = qdisc_priv(sch);
1527 	struct htb_class *cl = (struct htb_class *)arg;
1528 
1529 	if (cl)
1530 		cl->filter_cnt--;
1531 	else
1532 		q->filter_cnt--;
1533 }
1534 
1535 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1536 {
1537 	struct htb_sched *q = qdisc_priv(sch);
1538 	int i;
1539 
1540 	if (arg->stop)
1541 		return;
1542 
1543 	for (i = 0; i < HTB_HSIZE; i++) {
1544 		struct hlist_node *p;
1545 		struct htb_class *cl;
1546 
1547 		hlist_for_each_entry(cl, p, q->hash + i, hlist) {
1548 			if (arg->count < arg->skip) {
1549 				arg->count++;
1550 				continue;
1551 			}
1552 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1553 				arg->stop = 1;
1554 				return;
1555 			}
1556 			arg->count++;
1557 		}
1558 	}
1559 }
1560 
1561 static const struct Qdisc_class_ops htb_class_ops = {
1562 	.graft		=	htb_graft,
1563 	.leaf		=	htb_leaf,
1564 	.qlen_notify	=	htb_qlen_notify,
1565 	.get		=	htb_get,
1566 	.put		=	htb_put,
1567 	.change		=	htb_change_class,
1568 	.delete		=	htb_delete,
1569 	.walk		=	htb_walk,
1570 	.tcf_chain	=	htb_find_tcf,
1571 	.bind_tcf	=	htb_bind_filter,
1572 	.unbind_tcf	=	htb_unbind_filter,
1573 	.dump		=	htb_dump_class,
1574 	.dump_stats	=	htb_dump_class_stats,
1575 };
1576 
1577 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1578 	.next		=	NULL,
1579 	.cl_ops		=	&htb_class_ops,
1580 	.id		=	"htb",
1581 	.priv_size	=	sizeof(struct htb_sched),
1582 	.enqueue	=	htb_enqueue,
1583 	.dequeue	=	htb_dequeue,
1584 	.requeue	=	htb_requeue,
1585 	.drop		=	htb_drop,
1586 	.init		=	htb_init,
1587 	.reset		=	htb_reset,
1588 	.destroy	=	htb_destroy,
1589 	.change		=	NULL /* htb_change */,
1590 	.dump		=	htb_dump,
1591 	.owner		=	THIS_MODULE,
1592 };
1593 
1594 static int __init htb_module_init(void)
1595 {
1596 	return register_qdisc(&htb_qdisc_ops);
1597 }
1598 static void __exit htb_module_exit(void)
1599 {
1600 	unregister_qdisc(&htb_qdisc_ops);
1601 }
1602 
1603 module_init(htb_module_init)
1604 module_exit(htb_module_exit)
1605 MODULE_LICENSE("GPL");
1606