xref: /linux/net/sched/sch_htb.c (revision eb2bce7f5e7ac1ca6da434461217fadf3c688d2c)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  *
28  * $Id: sch_htb.c,v 1.25 2003/12/07 11:08:25 devik Exp devik $
29  */
30 #include <linux/module.h>
31 #include <asm/uaccess.h>
32 #include <asm/system.h>
33 #include <linux/bitops.h>
34 #include <linux/types.h>
35 #include <linux/kernel.h>
36 #include <linux/string.h>
37 #include <linux/mm.h>
38 #include <linux/socket.h>
39 #include <linux/sockios.h>
40 #include <linux/in.h>
41 #include <linux/errno.h>
42 #include <linux/interrupt.h>
43 #include <linux/if_ether.h>
44 #include <linux/inet.h>
45 #include <linux/netdevice.h>
46 #include <linux/etherdevice.h>
47 #include <linux/notifier.h>
48 #include <net/ip.h>
49 #include <net/route.h>
50 #include <linux/skbuff.h>
51 #include <linux/list.h>
52 #include <linux/compiler.h>
53 #include <net/netlink.h>
54 #include <net/sock.h>
55 #include <net/pkt_sched.h>
56 #include <linux/rbtree.h>
57 
58 /* HTB algorithm.
59     Author: devik@cdi.cz
60     ========================================================================
61     HTB is like TBF with multiple classes. It is also similar to CBQ because
62     it allows a priority to be assigned to each class in the hierarchy.
63     In fact it is another implementation of Floyd's formal sharing.
64 
65     Levels:
66     Each class is assigned a level. Leaves ALWAYS have level 0 and root
67     classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
68     one less than their parent.
69 */
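
/*
 * Editor's sketch, for illustration only (assumes the standard tc-htb
 * userspace syntax): a minimal hierarchy where two leaves borrow from a
 * common parent.
 *
 *                  1:    root htb qdisc, "default 20"
 *                  |
 *                 1:1    rate 1mbit ceil 1mbit   (interior; as a root class
 *                /   \                            it gets level
 *             1:10   1:20                         TC_HTB_MAXDEPTH-1)
 *      rate 600kbit  rate 400kbit                (leaves, level 0)
 *      ceil 1mbit    ceil 1mbit
 *
 *	tc qdisc add dev eth0 root handle 1: htb default 20
 *	tc class add dev eth0 parent 1:  classid 1:1  htb rate 1mbit ceil 1mbit
 *	tc class add dev eth0 parent 1:1 classid 1:10 htb rate 600kbit ceil 1mbit
 *	tc class add dev eth0 parent 1:1 classid 1:20 htb rate 400kbit ceil 1mbit
 *
 * Each leaf is guaranteed its own rate and may borrow up to ceil from 1:1
 * while its sibling is idle (the HTB_MAY_BORROW mode below).
 */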
70 
71 #define HTB_HSIZE 16		/* classid hash size */
72 #define HTB_EWMAC 2		/* rate average over HTB_EWMAC*HTB_HSIZE sec */
73 #define HTB_RATECM 1		/* whether to use rate computer */
74 #define HTB_HYSTERESIS 1	/* whether to use mode hysteresis for speedup */
75 #define HTB_VER 0x30011		/* major must be matched with the number supplied by TC as version */
76 
77 #if HTB_VER >> 16 != TC_HTB_PROTOVER
78 #error "Mismatched sch_htb.c and pkt_sch.h"
79 #endif
80 
81 /* used internally to keep the status of a single class */
82 enum htb_cmode {
83 	HTB_CANT_SEND,		/* class can't send and can't borrow */
84 	HTB_MAY_BORROW,		/* class can't send but may borrow */
85 	HTB_CAN_SEND		/* class can send */
86 };
87 
88 /* interior & leaf nodes; props specific to leaves are marked L: */
89 struct htb_class {
90 	/* general class parameters */
91 	u32 classid;
92 	struct gnet_stats_basic bstats;
93 	struct gnet_stats_queue qstats;
94 	struct gnet_stats_rate_est rate_est;
95 	struct tc_htb_xstats xstats;	/* our special stats */
96 	int refcnt;		/* usage count of this class */
97 
98 #ifdef HTB_RATECM
99 	/* rate measurement counters */
100 	unsigned long rate_bytes, sum_bytes;
101 	unsigned long rate_packets, sum_packets;
102 #endif
103 
104 	/* topology */
105 	int level;		/* our level (see above) */
106 	struct htb_class *parent;	/* parent class */
107 	struct hlist_node hlist;	/* classid hash list item */
108 	struct list_head sibling;	/* sibling list item */
109 	struct list_head children;	/* children list */
110 
111 	union {
112 		struct htb_class_leaf {
113 			struct Qdisc *q;
114 			int prio;
115 			int aprio;
116 			int quantum;
117 			int deficit[TC_HTB_MAXDEPTH];
118 			struct list_head drop_list;
119 		} leaf;
120 		struct htb_class_inner {
121 			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
122 			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
123 			/* When a class changes from state 1->2 and disconnects from
124 			   its parent's feed, we lose the ptr value and start from the
125 			   first child again. Here we store the classid of the
126 			   last valid ptr (used when ptr is NULL). */
127 			u32 last_ptr_id[TC_HTB_NUMPRIO];
128 		} inner;
129 	} un;
130 	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
131 	struct rb_node pq_node;	/* node for event queue */
132 	psched_time_t pq_key;
133 
134 	int prio_activity;	/* for which prios are we active */
135 	enum htb_cmode cmode;	/* current mode of the class */
136 
137 	/* class attached filters */
138 	struct tcf_proto *filter_list;
139 	int filter_cnt;
140 
141 	int warned;		/* only one warning about a non-work-conserving class .. */
142 
143 	/* token bucket parameters */
144 	struct qdisc_rate_table *rate;	/* rate table of the class itself */
145 	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
146 	long buffer, cbuffer;	/* token bucket depth/rate */
147 	psched_tdiff_t mbuffer;	/* max wait time */
148 	long tokens, ctokens;	/* current number of tokens */
149 	psched_time_t t_c;	/* checkpoint time */
150 
151 	int prio;		/* backup of un.leaf.prio; lets a parent return to leaf */
152 	int quantum;		/* backup of un.leaf.quantum. Eventually a full replacement */
153 				/* of the un.leaf originals should be done. */
154 };
155 
156 /* TODO: maybe compute rate when size is too large .. or drop ? */
157 static inline long L2T(struct htb_class *cl, struct qdisc_rate_table *rate,
158 			   int size)
159 {
160 	int slot = size >> rate->rate.cell_log;
161 	if (slot > 255) {
162 		cl->xstats.giants++;
163 		slot = 255;
164 	}
165 	return rate->data[slot];
166 }
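
/*
 * Editor's note, for illustration: L2T ("length to time") converts a packet
 * size into transmission time using the 256-slot rate table supplied by
 * userspace tc; slot i roughly holds the time (in psched units) needed to
 * send a packet whose size falls into cell i, a cell being 1 << cell_log
 * bytes.  For a hypothetical table built with cell_log = 3, a 1000-byte
 * packet maps to slot 1000 >> 3 = 125.  Packets larger than the table
 * covers are clamped to slot 255 and counted as "giants" in xstats.
 */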
167 
168 struct htb_sched {
169 	struct list_head root;	/* root classes list */
170 	struct hlist_head hash[HTB_HSIZE];	/* hashed by classid */
171 	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
172 
173 	/* self list - roots of self generating tree */
174 	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
175 	int row_mask[TC_HTB_MAXDEPTH];
176 	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
177 	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
178 
179 	/* self wait list - roots of wait PQs per row */
180 	struct rb_root wait_pq[TC_HTB_MAXDEPTH];
181 
182 	/* time of nearest event per level (row) */
183 	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];
184 
185 	/* whether we hit a non-work-conserving class during this dequeue; we use */
186 	int nwc_hit;		/* this to disable the mindelay complaint in dequeue */
187 
188 	int defcls;		/* class where unclassified flows go to */
189 
190 	/* filters for qdisc itself */
191 	struct tcf_proto *filter_list;
192 	int filter_cnt;
193 
194 	int rate2quantum;	/* quant = rate / rate2quantum */
195 	psched_time_t now;	/* cached dequeue time */
196 	struct qdisc_watchdog watchdog;
197 #ifdef HTB_RATECM
198 	struct timer_list rttim;	/* rate computer timer */
199 	int recmp_bucket;	/* which hash bucket to recompute next */
200 #endif
201 
202 	/* non-shaped skbs; let them go directly through */
203 	struct sk_buff_head direct_queue;
204 	int direct_qlen;	/* max qlen of above */
205 
206 	long direct_pkts;
207 };
208 
209 /* compute hash of size HTB_HSIZE for given handle */
210 static inline int htb_hash(u32 h)
211 {
212 #if HTB_HSIZE != 16
213 #error "Declare new hash for your HTB_HSIZE"
214 #endif
215 	h ^= h >> 8;		/* stolen from cbq_hash */
216 	h ^= h >> 4;
217 	return h & 0xf;
218 }
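
/*
 * Editor's note, worked example: classid 1:10 has handle 0x00010010
 * (major 1, minor 0x10).  Then
 *	h ^= h >> 8:	0x00010010 ^ 0x00000100 = 0x00010110
 *	h ^= h >> 4:	0x00010110 ^ 0x00001011 = 0x00011101
 *	h & 0xf:	0x1
 * so this class lands in hash bucket 1.  Folding the higher bits in first
 * spreads classids that differ only in their upper nibbles across buckets.
 */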
219 
220 /* find class in global hash table using given handle */
221 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
222 {
223 	struct htb_sched *q = qdisc_priv(sch);
224 	struct hlist_node *p;
225 	struct htb_class *cl;
226 
227 	if (TC_H_MAJ(handle) != sch->handle)
228 		return NULL;
229 
230 	hlist_for_each_entry(cl, p, q->hash + htb_hash(handle), hlist) {
231 		if (cl->classid == handle)
232 			return cl;
233 	}
234 	return NULL;
235 }
236 
237 /**
238  * htb_classify - classify a packet into class
239  *
240  * It returns NULL if the packet should be dropped or -1 if the packet
241  * should be passed directly through. In all other cases a leaf class is returned.
242  * We allow direct class selection by classid in skb->priority. Then we examine
243  * filters in the qdisc and in inner nodes (if a higher filter points to an inner
244  * node). If we end up with classid MAJOR:0 we enqueue the skb into the special
245  * internal fifo (direct). These packets then go directly through. If we still
246  * have no valid leaf we try to use the MAJOR:default leaf. If that is still
247  * unsuccessful we finish and return the direct queue.
248  */
249 #define HTB_DIRECT (struct htb_class*)-1
250 static inline u32 htb_classid(struct htb_class *cl)
251 {
252 	return (cl && cl != HTB_DIRECT) ? cl->classid : TC_H_UNSPEC;
253 }
254 
255 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
256 				      int *qerr)
257 {
258 	struct htb_sched *q = qdisc_priv(sch);
259 	struct htb_class *cl;
260 	struct tcf_result res;
261 	struct tcf_proto *tcf;
262 	int result;
263 
264 	/* allow selecting the class by setting skb->priority to a valid classid;
265 	   note that nfmark can be used too, by attaching an fw filter with no
266 	   rules in it */
267 	if (skb->priority == sch->handle)
268 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
269 	if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
270 		return cl;
271 
272 	*qerr = NET_XMIT_BYPASS;
273 	tcf = q->filter_list;
274 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
275 #ifdef CONFIG_NET_CLS_ACT
276 		switch (result) {
277 		case TC_ACT_QUEUED:
278 		case TC_ACT_STOLEN:
279 			*qerr = NET_XMIT_SUCCESS;
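			/* fall through */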
280 		case TC_ACT_SHOT:
281 			return NULL;
282 		}
283 #elif defined(CONFIG_NET_CLS_POLICE)
284 		if (result == TC_POLICE_SHOT)
285 			return HTB_DIRECT;
286 #endif
287 		if ((cl = (void *)res.class) == NULL) {
288 			if (res.classid == sch->handle)
289 				return HTB_DIRECT;	/* X:0 (direct flow) */
290 			if ((cl = htb_find(res.classid, sch)) == NULL)
291 				break;	/* filter selected invalid classid */
292 		}
293 		if (!cl->level)
294 			return cl;	/* we hit leaf; return it */
295 
296 		/* we have got inner class; apply inner filter chain */
297 		tcf = cl->filter_list;
298 	}
299 	/* classification failed; try to use default class */
300 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
301 	if (!cl || cl->level)
302 		return HTB_DIRECT;	/* bad default .. this is safe bet */
303 	return cl;
304 }
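
/*
 * Editor's note, illustrative only: the skb->priority shortcut above lets a
 * local application pick its HTB leaf without any filters by setting
 * SO_PRIORITY on its socket to the classid.  A userspace sketch (assuming a
 * qdisc with handle 1: and an existing leaf 1:10; priority values above 6
 * need CAP_NET_ADMIN):
 *
 *	int prio = 0x00010010;	// classid "1:10"
 *	setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));
 *
 * Setting it to the qdisc handle itself (0x00010000, "1:0") selects the
 * direct queue instead.
 */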
305 
306 /**
307  * htb_add_to_id_tree - adds class to the round robin list
308  *
309  * The routine adds the class to the list (actually a tree) sorted by classid.
310  * Make sure that the class is not already on such a list for the given prio.
311  */
312 static void htb_add_to_id_tree(struct rb_root *root,
313 			       struct htb_class *cl, int prio)
314 {
315 	struct rb_node **p = &root->rb_node, *parent = NULL;
316 
317 	while (*p) {
318 		struct htb_class *c;
319 		parent = *p;
320 		c = rb_entry(parent, struct htb_class, node[prio]);
321 
322 		if (cl->classid > c->classid)
323 			p = &parent->rb_right;
324 		else
325 			p = &parent->rb_left;
326 	}
327 	rb_link_node(&cl->node[prio], parent, p);
328 	rb_insert_color(&cl->node[prio], root);
329 }
330 
331 /**
332  * htb_add_to_wait_tree - adds class to the event queue with delay
333  *
334  * The class is added to the priority event queue to indicate that the class
335  * will change its mode at time cl->pq_key (q->now + delay). Make sure that
336  * the class is not already in the queue.
337  */
338 static void htb_add_to_wait_tree(struct htb_sched *q,
339 				 struct htb_class *cl, long delay)
340 {
341 	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
342 
343 	cl->pq_key = q->now + delay;
344 	if (cl->pq_key == q->now)
345 		cl->pq_key++;
346 
347 	/* update the nearest event cache */
348 	if (q->near_ev_cache[cl->level] > cl->pq_key)
349 		q->near_ev_cache[cl->level] = cl->pq_key;
350 
351 	while (*p) {
352 		struct htb_class *c;
353 		parent = *p;
354 		c = rb_entry(parent, struct htb_class, pq_node);
355 		if (cl->pq_key >= c->pq_key)
356 			p = &parent->rb_right;
357 		else
358 			p = &parent->rb_left;
359 	}
360 	rb_link_node(&cl->pq_node, parent, p);
361 	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
362 }
363 
364 /**
365  * htb_next_rb_node - finds next node in binary tree
366  *
367  * When we are past the last key, *n is set to NULL.
368  * Average complexity is 2 steps per call.
369  */
370 static inline void htb_next_rb_node(struct rb_node **n)
371 {
372 	*n = rb_next(*n);
373 }
374 
375 /**
376  * htb_add_class_to_row - add class to its row
377  *
378  * The class is added to row at priorities marked in mask.
379  * It does nothing if mask == 0.
380  */
381 static inline void htb_add_class_to_row(struct htb_sched *q,
382 					struct htb_class *cl, int mask)
383 {
384 	q->row_mask[cl->level] |= mask;
385 	while (mask) {
386 		int prio = ffz(~mask);
387 		mask &= ~(1 << prio);
388 		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
389 	}
390 }
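
/*
 * Editor's note: the ffz(~mask) idiom used here (and in the other mask
 * walkers below) returns the index of the lowest set bit of mask.  For
 * example, with mask = 0x05 (prios 0 and 2) the loop runs twice: first
 * prio = 0 (mask becomes 0x04), then prio = 2 (mask becomes 0).
 */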
391 
392 /* If this triggers, it is a bug in this code, but it need not be fatal */
393 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
394 {
395 	if (RB_EMPTY_NODE(rb)) {
396 		WARN_ON(1);
397 	} else {
398 		rb_erase(rb, root);
399 		RB_CLEAR_NODE(rb);
400 	}
401 }
402 
403 
404 /**
405  * htb_remove_class_from_row - removes class from its row
406  *
407  * The class is removed from row at priorities marked in mask.
408  * It does nothing if mask == 0.
409  */
410 static inline void htb_remove_class_from_row(struct htb_sched *q,
411 						 struct htb_class *cl, int mask)
412 {
413 	int m = 0;
414 
415 	while (mask) {
416 		int prio = ffz(~mask);
417 
418 		mask &= ~(1 << prio);
419 		if (q->ptr[cl->level][prio] == cl->node + prio)
420 			htb_next_rb_node(q->ptr[cl->level] + prio);
421 
422 		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
423 		if (!q->row[cl->level][prio].rb_node)
424 			m |= 1 << prio;
425 	}
426 	q->row_mask[cl->level] &= ~m;
427 }
428 
429 /**
430  * htb_activate_prios - creates the active class's feed chain
431  *
432  * The class is connected to its ancestors and/or the appropriate rows
433  * for the priorities it participates in. cl->cmode must be the new
434  * (activated) mode. It does nothing if cl->prio_activity == 0.
435  */
436 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
437 {
438 	struct htb_class *p = cl->parent;
439 	long m, mask = cl->prio_activity;
440 
441 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
442 		m = mask;
443 		while (m) {
444 			int prio = ffz(~m);
445 			m &= ~(1 << prio);
446 
447 			if (p->un.inner.feed[prio].rb_node)
448 				/* parent already has this feed in use, so clear
449 				   the bit in mask as the parent is already ok */
450 				mask &= ~(1 << prio);
451 
452 			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
453 		}
454 		p->prio_activity |= mask;
455 		cl = p;
456 		p = cl->parent;
457 
458 	}
459 	if (cl->cmode == HTB_CAN_SEND && mask)
460 		htb_add_class_to_row(q, cl, mask);
461 }
462 
463 /**
464  * htb_deactivate_prios - remove class from feed chain
465  *
466  * cl->cmode must represent the old mode (before deactivation). It does
467  * nothing if cl->prio_activity == 0. The class is removed from all feed
468  * chains and rows.
469  */
470 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
471 {
472 	struct htb_class *p = cl->parent;
473 	long m, mask = cl->prio_activity;
474 
475 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
476 		m = mask;
477 		mask = 0;
478 		while (m) {
479 			int prio = ffz(~m);
480 			m &= ~(1 << prio);
481 
482 			if (p->un.inner.ptr[prio] == cl->node + prio) {
483 				/* we are removing a child that the parent's feed
484 				   pointer points to - forget the pointer but remember
485 				   the classid */
486 				p->un.inner.last_ptr_id[prio] = cl->classid;
487 				p->un.inner.ptr[prio] = NULL;
488 			}
489 
490 			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
491 
492 			if (!p->un.inner.feed[prio].rb_node)
493 				mask |= 1 << prio;
494 		}
495 
496 		p->prio_activity &= ~mask;
497 		cl = p;
498 		p = cl->parent;
499 
500 	}
501 	if (cl->cmode == HTB_CAN_SEND && mask)
502 		htb_remove_class_from_row(q, cl, mask);
503 }
504 
505 #if HTB_HYSTERESIS
506 static inline long htb_lowater(const struct htb_class *cl)
507 {
508 	return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
509 }
510 static inline long htb_hiwater(const struct htb_class *cl)
511 {
512 	return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
513 }
514 #else
515 #define htb_lowater(cl)	(0)
516 #define htb_hiwater(cl)	(0)
517 #endif
518 
519 /**
520  * htb_class_mode - computes and returns current class mode
521  *
522  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
523  * is not HTB_CAN_SEND then *diff is updated to the time difference from
524  * now to the time when cl will change its state (later used for cl->pq_key).
525  * It is also worth noting that the class mode doesn't change simply
526  * at cl->{c,}tokens == 0; rather, there can be hysteresis in the
527  * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
528  * mode transitions per time unit. The speed gain is about 1/6.
529  */
530 static inline enum htb_cmode
531 htb_class_mode(struct htb_class *cl, long *diff)
532 {
533 	long toks;
534 
535 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
536 		*diff = -toks;
537 		return HTB_CANT_SEND;
538 	}
539 
540 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
541 		return HTB_CAN_SEND;
542 
543 	*diff = -toks;
544 	return HTB_MAY_BORROW;
545 }
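
/*
 * Editor's note, a worked example of the hysteresis above (with
 * HTB_HYSTERESIS enabled): a class currently in HTB_CAN_SEND has
 * hiwater = -cl->buffer, so it keeps that mode until cl->tokens has fallen
 * a whole buffer below zero; once it has dropped out, hiwater becomes 0 and
 * the tokens must climb back to >= 0 before it is HTB_CAN_SEND again.  The
 * ctokens/cbuffer pair bounds the HTB_CANT_SEND transition the same way.
 * Without HTB_HYSTERESIS both water marks are 0 and the mode flips exactly
 * when a bucket empties or refills past zero.
 */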
546 
547 /**
548  * htb_change_class_mode - changes the class's mode
549  *
550  * This should be the only way to change a class's mode under normal
551  * circumstances. The routine will update the feed list linkage, change the
552  * mode and add the class to the wait event queue if appropriate. The new mode
553  * should be different from the old one and cl->pq_key has to be valid if
554  * changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
555  */
556 static void
557 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
558 {
559 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
560 
561 	if (new_mode == cl->cmode)
562 		return;
563 
564 	if (cl->prio_activity) {	/* not necessary: speed optimization */
565 		if (cl->cmode != HTB_CANT_SEND)
566 			htb_deactivate_prios(q, cl);
567 		cl->cmode = new_mode;
568 		if (new_mode != HTB_CANT_SEND)
569 			htb_activate_prios(q, cl);
570 	} else
571 		cl->cmode = new_mode;
572 }
573 
574 /**
575  * htb_activate - inserts leaf cl into appropriate active feeds
576  *
577  * The routine learns the (new) priority of the leaf and activates the feed
578  * chain for that prio. It can safely be called on an already active leaf.
579  * It also adds the leaf to the drop list.
580  */
581 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
582 {
583 	BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen);
584 
585 	if (!cl->prio_activity) {
586 		cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio);
587 		htb_activate_prios(q, cl);
588 		list_add_tail(&cl->un.leaf.drop_list,
589 			      q->drops + cl->un.leaf.aprio);
590 	}
591 }
592 
593 /**
594  * htb_deactivate - remove leaf cl from active feeds
595  *
596  * Make sure that the leaf is active. In other words, it can't be called
597  * with a non-active leaf. It also removes the class from the drop list.
598  */
599 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
600 {
601 	BUG_TRAP(cl->prio_activity);
602 
603 	htb_deactivate_prios(q, cl);
604 	cl->prio_activity = 0;
605 	list_del_init(&cl->un.leaf.drop_list);
606 }
607 
608 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
609 {
610 	int ret;
611 	struct htb_sched *q = qdisc_priv(sch);
612 	struct htb_class *cl = htb_classify(skb, sch, &ret);
613 
614 	if (cl == HTB_DIRECT) {
615 		/* enqueue to helper queue */
616 		if (q->direct_queue.qlen < q->direct_qlen) {
617 			__skb_queue_tail(&q->direct_queue, skb);
618 			q->direct_pkts++;
619 		} else {
620 			kfree_skb(skb);
621 			sch->qstats.drops++;
622 			return NET_XMIT_DROP;
623 		}
624 #ifdef CONFIG_NET_CLS_ACT
625 	} else if (!cl) {
626 		if (ret == NET_XMIT_BYPASS)
627 			sch->qstats.drops++;
628 		kfree_skb(skb);
629 		return ret;
630 #endif
631 	} else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) !=
632 		   NET_XMIT_SUCCESS) {
633 		sch->qstats.drops++;
634 		cl->qstats.drops++;
635 		return NET_XMIT_DROP;
636 	} else {
637 		cl->bstats.packets++;
638 		cl->bstats.bytes += skb->len;
639 		htb_activate(q, cl);
640 	}
641 
642 	sch->q.qlen++;
643 	sch->bstats.packets++;
644 	sch->bstats.bytes += skb->len;
645 	return NET_XMIT_SUCCESS;
646 }
647 
648 /* TODO: requeuing packet charges it to policers again !! */
649 static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch)
650 {
651 	struct htb_sched *q = qdisc_priv(sch);
652 	int ret = NET_XMIT_SUCCESS;
653 	struct htb_class *cl = htb_classify(skb, sch, &ret);
654 	struct sk_buff *tskb;
655 
656 	if (cl == HTB_DIRECT || !cl) {
657 		/* enqueue to helper queue */
658 		if (q->direct_queue.qlen < q->direct_qlen && cl) {
659 			__skb_queue_head(&q->direct_queue, skb);
660 		} else {
661 			__skb_queue_head(&q->direct_queue, skb);
662 			tskb = __skb_dequeue_tail(&q->direct_queue);
663 			kfree_skb(tskb);
664 			sch->qstats.drops++;
665 			return NET_XMIT_CN;
666 		}
667 	} else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) !=
668 		   NET_XMIT_SUCCESS) {
669 		sch->qstats.drops++;
670 		cl->qstats.drops++;
671 		return NET_XMIT_DROP;
672 	} else
673 		htb_activate(q, cl);
674 
675 	sch->q.qlen++;
676 	sch->qstats.requeues++;
677 	return NET_XMIT_SUCCESS;
678 }
679 
680 #ifdef HTB_RATECM
681 #define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0
682 static void htb_rate_timer(unsigned long arg)
683 {
684 	struct Qdisc *sch = (struct Qdisc *)arg;
685 	struct htb_sched *q = qdisc_priv(sch);
686 	struct hlist_node *p;
687 	struct htb_class *cl;
688 
689 
690 	/* lock queue so that we can muck with it */
691 	spin_lock_bh(&sch->dev->queue_lock);
692 
693 	q->rttim.expires = jiffies + HZ;
694 	add_timer(&q->rttim);
695 
696 	/* scan and recompute one bucket at a time */
697 	if (++q->recmp_bucket >= HTB_HSIZE)
698 		q->recmp_bucket = 0;
699 
700 	hlist_for_each_entry(cl,p, q->hash + q->recmp_bucket, hlist) {
701 		RT_GEN(cl->sum_bytes, cl->rate_bytes);
702 		RT_GEN(cl->sum_packets, cl->rate_packets);
703 	}
704 	spin_unlock_bh(&sch->dev->queue_lock);
705 }
706 #endif
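
/*
 * Editor's note on the estimator above: the timer fires every second but
 * recomputes only one hash bucket per run, so each class is visited once
 * every HTB_HSIZE (16) seconds.  RT_GEN with HTB_EWMAC == 2 computes
 * R = R/2 + D, an exponentially weighted sum that decays by half per visit.
 * In steady state at B bytes/sec, D accumulates 16*B between visits and R
 * converges to 32*B; htb_dump_class_stats() divides by
 * HTB_EWMAC * HTB_HSIZE = 32 to report B again.
 */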
707 
708 /**
709  * htb_charge_class - charges amount "bytes" to leaf and ancestors
710  *
711  * The routine assumes that a packet "bytes" long was dequeued from leaf cl,
712  * borrowing from "level". It accounts the bytes to the ceil leaky bucket for
713  * the leaf and all ancestors, and to the rate bucket for ancestors at levels
714  * "level" and higher. It also handles a possible change of mode resulting
715  * from the update. Note that the mode can also increase here (MAY_BORROW to
716  * CAN_SEND) because we can use a more precise clock than the event queue here.
717  * In such a case we remove the class from the event queue first.
718  */
719 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
720 			     int level, int bytes)
721 {
722 	long toks, diff;
723 	enum htb_cmode old_mode;
724 
725 #define HTB_ACCNT(T,B,R) toks = diff + cl->T; \
726 	if (toks > cl->B) toks = cl->B; \
727 	toks -= L2T(cl, cl->R, bytes); \
728 	if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \
729 	cl->T = toks
730 
731 	while (cl) {
732 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
733 		if (cl->level >= level) {
734 			if (cl->level == level)
735 				cl->xstats.lends++;
736 			HTB_ACCNT(tokens, buffer, rate);
737 		} else {
738 			cl->xstats.borrows++;
739 			cl->tokens += diff;	/* we moved t_c; update tokens */
740 		}
741 		HTB_ACCNT(ctokens, cbuffer, ceil);
742 		cl->t_c = q->now;
743 
744 		old_mode = cl->cmode;
745 		diff = 0;
746 		htb_change_class_mode(q, cl, &diff);
747 		if (old_mode != cl->cmode) {
748 			if (old_mode != HTB_CAN_SEND)
749 				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
750 			if (cl->cmode != HTB_CAN_SEND)
751 				htb_add_to_wait_tree(q, cl, diff);
752 		}
753 #ifdef HTB_RATECM
754 		/* update rate counters */
755 		cl->sum_bytes += bytes;
756 		cl->sum_packets++;
757 #endif
758 
759 		/* update byte stats except for leaves which are already updated */
760 		if (cl->level) {
761 			cl->bstats.bytes += bytes;
762 			cl->bstats.packets++;
763 		}
764 		cl = cl->parent;
765 	}
766 }
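
/*
 * Editor's note, a worked example of HTB_ACCNT above (illustrative numbers):
 * suppose diff = 10000 psched units have passed since cl->t_c, the class has
 * tokens = -2000 and buffer = 5000, and L2T() prices the dequeued packet at
 * 3000 units for the class rate.  Then
 *	toks = diff + tokens = 8000	(refill since the last checkpoint)
 *	toks = min(toks, buffer) = 5000	(the bucket never holds more than buffer)
 *	toks -= 3000			-> cl->tokens = 2000
 * The same computation runs on ctokens/cbuffer/ceil for every class up to
 * the root; the lower bound of 1 - cl->mbuffer keeps a long overload from
 * driving a bucket arbitrarily far negative.
 */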
767 
768 /**
769  * htb_do_events - make mode changes to classes at the level
770  *
771  * Scans the event queue for pending events and applies them. Returns the
772  * time of the next pending event (0 if there is no event in the pq).
773  * Note: only events with cl->pq_key <= q->now are applied.
774  */
775 static psched_time_t htb_do_events(struct htb_sched *q, int level)
776 {
777 	int i;
778 
779 	for (i = 0; i < 500; i++) {
780 		struct htb_class *cl;
781 		long diff;
782 		struct rb_node *p = rb_first(&q->wait_pq[level]);
783 
784 		if (!p)
785 			return 0;
786 
787 		cl = rb_entry(p, struct htb_class, pq_node);
788 		if (cl->pq_key > q->now)
789 			return cl->pq_key;
790 
791 		htb_safe_rb_erase(p, q->wait_pq + level);
792 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
793 		htb_change_class_mode(q, cl, &diff);
794 		if (cl->cmode != HTB_CAN_SEND)
795 			htb_add_to_wait_tree(q, cl, diff);
796 	}
797 	if (net_ratelimit())
798 		printk(KERN_WARNING "htb: too many events !\n");
799 	return q->now + PSCHED_TICKS_PER_SEC / 10;
800 }
801 
802 /* Returns class->node+prio from the id-tree where the class's id is >= id,
803    or NULL if no such class exists. */
804 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
805 					      u32 id)
806 {
807 	struct rb_node *r = NULL;
808 	while (n) {
809 		struct htb_class *cl =
810 		    rb_entry(n, struct htb_class, node[prio]);
811 		if (id == cl->classid)
812 			return n;
813 
814 		if (id > cl->classid) {
815 			n = n->rb_right;
816 		} else {
817 			r = n;
818 			n = n->rb_left;
819 		}
820 	}
821 	return r;
822 }
823 
824 /**
825  * htb_lookup_leaf - returns next leaf class in DRR order
826  *
827  * Find the leaf that the current feed pointer points to.
828  */
829 static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
830 					 struct rb_node **pptr, u32 * pid)
831 {
832 	int i;
833 	struct {
834 		struct rb_node *root;
835 		struct rb_node **pptr;
836 		u32 *pid;
837 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
838 
839 	BUG_TRAP(tree->rb_node);
840 	sp->root = tree->rb_node;
841 	sp->pptr = pptr;
842 	sp->pid = pid;
843 
844 	for (i = 0; i < 65535; i++) {
845 		if (!*sp->pptr && *sp->pid) {
846 			/* ptr was invalidated but id is valid - try to recover
847 			   the original or next ptr */
848 			*sp->pptr =
849 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
850 		}
851 		*sp->pid = 0;	/* ptr is valid now, so remove this hint as it
852 				   can become out of date quickly */
853 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
854 			*sp->pptr = sp->root;
855 			while ((*sp->pptr)->rb_left)
856 				*sp->pptr = (*sp->pptr)->rb_left;
857 			if (sp > stk) {
858 				sp--;
859 				BUG_TRAP(*sp->pptr);
860 				if (!*sp->pptr)
861 					return NULL;
862 				htb_next_rb_node(sp->pptr);
863 			}
864 		} else {
865 			struct htb_class *cl;
866 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
867 			if (!cl->level)
868 				return cl;
869 			(++sp)->root = cl->un.inner.feed[prio].rb_node;
870 			sp->pptr = cl->un.inner.ptr + prio;
871 			sp->pid = cl->un.inner.last_ptr_id + prio;
872 		}
873 	}
874 	BUG_TRAP(0);
875 	return NULL;
876 }
877 
878 /* dequeues packet at given priority and level; call only if
879    you are sure that there is active class at prio/level */
880 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
881 					int level)
882 {
883 	struct sk_buff *skb = NULL;
884 	struct htb_class *cl, *start;
885 	/* look initial class up in the row */
886 	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
887 				     q->ptr[level] + prio,
888 				     q->last_ptr_id[level] + prio);
889 
890 	do {
891 next:
892 		BUG_TRAP(cl);
893 		if (!cl)
894 			return NULL;
895 
896 		/* the class can be empty - it is unlikely but can be true if the
897 		   leaf qdisc drops packets in its enqueue routine or if someone
898 		   used a graft operation on the leaf since the last dequeue;
899 		   simply deactivate and skip such a class */
900 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
901 			struct htb_class *next;
902 			htb_deactivate(q, cl);
903 
904 			/* row/level might become empty */
905 			if ((q->row_mask[level] & (1 << prio)) == 0)
906 				return NULL;
907 
908 			next = htb_lookup_leaf(q->row[level] + prio,
909 					       prio, q->ptr[level] + prio,
910 					       q->last_ptr_id[level] + prio);
911 
912 			if (cl == start)	/* fix start if we just deleted it */
913 				start = next;
914 			cl = next;
915 			goto next;
916 		}
917 
918 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
919 		if (likely(skb != NULL))
920 			break;
921 		if (!cl->warned) {
922 			printk(KERN_WARNING
923 			       "htb: class %X isn't work conserving ?!\n",
924 			       cl->classid);
925 			cl->warned = 1;
926 		}
927 		q->nwc_hit++;
928 		htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
929 				  ptr[0]) + prio);
930 		cl = htb_lookup_leaf(q->row[level] + prio, prio,
931 				     q->ptr[level] + prio,
932 				     q->last_ptr_id[level] + prio);
933 
934 	} while (cl != start);
935 
936 	if (likely(skb != NULL)) {
937 		if ((cl->un.leaf.deficit[level] -= skb->len) < 0) {
938 			cl->un.leaf.deficit[level] += cl->un.leaf.quantum;
939 			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
940 					  ptr[0]) + prio);
941 		}
942 		/* this used to be after charge_class, but this arrangement
943 		   gives us slightly better performance */
944 		if (!cl->un.leaf.q->q.qlen)
945 			htb_deactivate(q, cl);
946 		htb_charge_class(q, cl, level, skb->len);
947 	}
948 	return skb;
949 }
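
/*
 * Editor's note, a worked example of the deficit round robin step above
 * (illustrative numbers): a leaf with quantum = 1500 and deficit[level] = 200
 * dequeues a 1000-byte packet; 200 - 1000 = -800 < 0, so the deficit becomes
 * -800 + 1500 = 700 and the round robin pointer advances to the next class
 * at this prio/level.  Had the packet been only 100 bytes the deficit would
 * stay at 100 and the same leaf would be chosen again next time, so over
 * many rounds each class gets bandwidth proportional to its quantum.
 */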
950 
951 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
952 {
953 	struct sk_buff *skb = NULL;
954 	struct htb_sched *q = qdisc_priv(sch);
955 	int level;
956 	psched_time_t next_event;
957 
958 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
959 	skb = __skb_dequeue(&q->direct_queue);
960 	if (skb != NULL) {
961 		sch->flags &= ~TCQ_F_THROTTLED;
962 		sch->q.qlen--;
963 		return skb;
964 	}
965 
966 	if (!sch->q.qlen)
967 		goto fin;
968 	q->now = psched_get_time();
969 
970 	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
971 	q->nwc_hit = 0;
972 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
973 		/* common case optimization - skip event handler quickly */
974 		int m;
975 		psched_time_t event;
976 
977 		if (q->now >= q->near_ev_cache[level]) {
978 			event = htb_do_events(q, level);
979 			q->near_ev_cache[level] = event ? event :
980 							  PSCHED_TICKS_PER_SEC;
981 		} else
982 			event = q->near_ev_cache[level];
983 
984 		if (event && next_event > event)
985 			next_event = event;
986 
987 		m = ~q->row_mask[level];
988 		while (m != (int)(-1)) {
989 			int prio = ffz(m);
990 			m |= 1 << prio;
991 			skb = htb_dequeue_tree(q, prio, level);
992 			if (likely(skb != NULL)) {
993 				sch->q.qlen--;
994 				sch->flags &= ~TCQ_F_THROTTLED;
995 				goto fin;
996 			}
997 		}
998 	}
999 	sch->qstats.overlimits++;
1000 	qdisc_watchdog_schedule(&q->watchdog, next_event);
1001 fin:
1002 	return skb;
1003 }
1004 
1005 /* try to drop from each class (by prio) until one succeeds */
1006 static unsigned int htb_drop(struct Qdisc *sch)
1007 {
1008 	struct htb_sched *q = qdisc_priv(sch);
1009 	int prio;
1010 
1011 	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
1012 		struct list_head *p;
1013 		list_for_each(p, q->drops + prio) {
1014 			struct htb_class *cl = list_entry(p, struct htb_class,
1015 							  un.leaf.drop_list);
1016 			unsigned int len;
1017 			if (cl->un.leaf.q->ops->drop &&
1018 			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
1019 				sch->q.qlen--;
1020 				if (!cl->un.leaf.q->q.qlen)
1021 					htb_deactivate(q, cl);
1022 				return len;
1023 			}
1024 		}
1025 	}
1026 	return 0;
1027 }
1028 
1029 /* reset all classes */
1030 /* always called under BH & queue lock */
1031 static void htb_reset(struct Qdisc *sch)
1032 {
1033 	struct htb_sched *q = qdisc_priv(sch);
1034 	int i;
1035 
1036 	for (i = 0; i < HTB_HSIZE; i++) {
1037 		struct hlist_node *p;
1038 		struct htb_class *cl;
1039 
1040 		hlist_for_each_entry(cl, p, q->hash + i, hlist) {
1041 			if (cl->level)
1042 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
1043 			else {
1044 				if (cl->un.leaf.q)
1045 					qdisc_reset(cl->un.leaf.q);
1046 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1047 			}
1048 			cl->prio_activity = 0;
1049 			cl->cmode = HTB_CAN_SEND;
1050 
1051 		}
1052 	}
1053 	qdisc_watchdog_cancel(&q->watchdog);
1054 	__skb_queue_purge(&q->direct_queue);
1055 	sch->q.qlen = 0;
1056 	memset(q->row, 0, sizeof(q->row));
1057 	memset(q->row_mask, 0, sizeof(q->row_mask));
1058 	memset(q->wait_pq, 0, sizeof(q->wait_pq));
1059 	memset(q->ptr, 0, sizeof(q->ptr));
1060 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1061 		INIT_LIST_HEAD(q->drops + i);
1062 }
1063 
1064 static int htb_init(struct Qdisc *sch, struct rtattr *opt)
1065 {
1066 	struct htb_sched *q = qdisc_priv(sch);
1067 	struct rtattr *tb[TCA_HTB_INIT];
1068 	struct tc_htb_glob *gopt;
1069 	int i;
1070 	if (!opt || rtattr_parse_nested(tb, TCA_HTB_INIT, opt) ||
1071 	    tb[TCA_HTB_INIT - 1] == NULL ||
1072 	    RTA_PAYLOAD(tb[TCA_HTB_INIT - 1]) < sizeof(*gopt)) {
1073 		printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
1074 		return -EINVAL;
1075 	}
1076 	gopt = RTA_DATA(tb[TCA_HTB_INIT - 1]);
1077 	if (gopt->version != HTB_VER >> 16) {
1078 		printk(KERN_ERR
1079 		       "HTB: need tc/htb version %d (minor is %d), you have %d\n",
1080 		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
1081 		return -EINVAL;
1082 	}
1083 
1084 	INIT_LIST_HEAD(&q->root);
1085 	for (i = 0; i < HTB_HSIZE; i++)
1086 		INIT_HLIST_HEAD(q->hash + i);
1087 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1088 		INIT_LIST_HEAD(q->drops + i);
1089 
1090 	qdisc_watchdog_init(&q->watchdog, sch);
1091 	skb_queue_head_init(&q->direct_queue);
1092 
1093 	q->direct_qlen = sch->dev->tx_queue_len;
1094 	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
1095 		q->direct_qlen = 2;
1096 
1097 #ifdef HTB_RATECM
1098 	init_timer(&q->rttim);
1099 	q->rttim.function = htb_rate_timer;
1100 	q->rttim.data = (unsigned long)sch;
1101 	q->rttim.expires = jiffies + HZ;
1102 	add_timer(&q->rttim);
1103 #endif
1104 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1105 		q->rate2quantum = 1;
1106 	q->defcls = gopt->defcls;
1107 
1108 	return 0;
1109 }
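
/*
 * Editor's note, illustrative mapping to userspace (assuming the standard
 * tc-htb syntax): "tc qdisc add dev eth0 root handle 1: htb r2q 10 default 20"
 * reaches htb_init() as a TCA_HTB_INIT attribute carrying struct tc_htb_glob
 * with a version matching HTB_VER >> 16, rate2quantum = 10 and
 * defcls = 0x20, so unclassified traffic goes to class 1:20 and leaf quanta
 * default to rate / 10 (see htb_change_class()).
 */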
1110 
1111 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1112 {
1113 	struct htb_sched *q = qdisc_priv(sch);
1114 	unsigned char *b = skb_tail_pointer(skb);
1115 	struct rtattr *rta;
1116 	struct tc_htb_glob gopt;
1117 	spin_lock_bh(&sch->dev->queue_lock);
1118 	gopt.direct_pkts = q->direct_pkts;
1119 
1120 	gopt.version = HTB_VER;
1121 	gopt.rate2quantum = q->rate2quantum;
1122 	gopt.defcls = q->defcls;
1123 	gopt.debug = 0;
1124 	rta = (struct rtattr *)b;
1125 	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1126 	RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
1127 	rta->rta_len = skb_tail_pointer(skb) - b;
1128 	spin_unlock_bh(&sch->dev->queue_lock);
1129 	return skb->len;
1130 rtattr_failure:
1131 	spin_unlock_bh(&sch->dev->queue_lock);
1132 	nlmsg_trim(skb, skb_tail_pointer(skb));
1133 	return -1;
1134 }
1135 
1136 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1137 			  struct sk_buff *skb, struct tcmsg *tcm)
1138 {
1139 	struct htb_class *cl = (struct htb_class *)arg;
1140 	unsigned char *b = skb_tail_pointer(skb);
1141 	struct rtattr *rta;
1142 	struct tc_htb_opt opt;
1143 
1144 	spin_lock_bh(&sch->dev->queue_lock);
1145 	tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT;
1146 	tcm->tcm_handle = cl->classid;
1147 	if (!cl->level && cl->un.leaf.q)
1148 		tcm->tcm_info = cl->un.leaf.q->handle;
1149 
1150 	rta = (struct rtattr *)b;
1151 	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1152 
1153 	memset(&opt, 0, sizeof(opt));
1154 
1155 	opt.rate = cl->rate->rate;
1156 	opt.buffer = cl->buffer;
1157 	opt.ceil = cl->ceil->rate;
1158 	opt.cbuffer = cl->cbuffer;
1159 	opt.quantum = cl->un.leaf.quantum;
1160 	opt.prio = cl->un.leaf.prio;
1161 	opt.level = cl->level;
1162 	RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
1163 	rta->rta_len = skb_tail_pointer(skb) - b;
1164 	spin_unlock_bh(&sch->dev->queue_lock);
1165 	return skb->len;
1166 rtattr_failure:
1167 	spin_unlock_bh(&sch->dev->queue_lock);
1168 	nlmsg_trim(skb, b);
1169 	return -1;
1170 }
1171 
1172 static int
1173 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1174 {
1175 	struct htb_class *cl = (struct htb_class *)arg;
1176 
1177 #ifdef HTB_RATECM
1178 	cl->rate_est.bps = cl->rate_bytes / (HTB_EWMAC * HTB_HSIZE);
1179 	cl->rate_est.pps = cl->rate_packets / (HTB_EWMAC * HTB_HSIZE);
1180 #endif
1181 
1182 	if (!cl->level && cl->un.leaf.q)
1183 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1184 	cl->xstats.tokens = cl->tokens;
1185 	cl->xstats.ctokens = cl->ctokens;
1186 
1187 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1188 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1189 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
1190 		return -1;
1191 
1192 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1193 }
1194 
1195 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1196 		     struct Qdisc **old)
1197 {
1198 	struct htb_class *cl = (struct htb_class *)arg;
1199 
1200 	if (cl && !cl->level) {
1201 		if (new == NULL &&
1202 		    (new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1203 					     cl->classid))
1204 		    == NULL)
1205 			return -ENOBUFS;
1206 		sch_tree_lock(sch);
1207 		if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) {
1208 			qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1209 			qdisc_reset(*old);
1210 		}
1211 		sch_tree_unlock(sch);
1212 		return 0;
1213 	}
1214 	return -ENOENT;
1215 }
1216 
1217 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1218 {
1219 	struct htb_class *cl = (struct htb_class *)arg;
1220 	return (cl && !cl->level) ? cl->un.leaf.q : NULL;
1221 }
1222 
1223 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1224 {
1225 	struct htb_class *cl = (struct htb_class *)arg;
1226 
1227 	if (cl->un.leaf.q->q.qlen == 0)
1228 		htb_deactivate(qdisc_priv(sch), cl);
1229 }
1230 
1231 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1232 {
1233 	struct htb_class *cl = htb_find(classid, sch);
1234 	if (cl)
1235 		cl->refcnt++;
1236 	return (unsigned long)cl;
1237 }
1238 
1239 static inline int htb_parent_last_child(struct htb_class *cl)
1240 {
1241 	if (!cl->parent)
1242 		/* the root class */
1243 		return 0;
1244 
1245 	if (!(cl->parent->children.next == &cl->sibling &&
1246 		cl->parent->children.prev == &cl->sibling))
1247 		/* not the last child */
1248 		return 0;
1249 
1250 	return 1;
1251 }
1252 
1253 static void htb_parent_to_leaf(struct htb_class *cl, struct Qdisc *new_q)
1254 {
1255 	struct htb_class *parent = cl->parent;
1256 
1257 	BUG_TRAP(!cl->level && cl->un.leaf.q && !cl->prio_activity);
1258 
1259 	parent->level = 0;
1260 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1261 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1262 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1263 	parent->un.leaf.quantum = parent->quantum;
1264 	parent->un.leaf.prio = parent->prio;
1265 	parent->tokens = parent->buffer;
1266 	parent->ctokens = parent->cbuffer;
1267 	parent->t_c = psched_get_time();
1268 	parent->cmode = HTB_CAN_SEND;
1269 }
1270 
1271 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1272 {
1273 	struct htb_sched *q = qdisc_priv(sch);
1274 
1275 	if (!cl->level) {
1276 		BUG_TRAP(cl->un.leaf.q);
1277 		qdisc_destroy(cl->un.leaf.q);
1278 	}
1279 	qdisc_put_rtab(cl->rate);
1280 	qdisc_put_rtab(cl->ceil);
1281 
1282 	tcf_destroy_chain(cl->filter_list);
1283 
1284 	while (!list_empty(&cl->children))
1285 		htb_destroy_class(sch, list_entry(cl->children.next,
1286 						  struct htb_class, sibling));
1287 
1288 	/* note: this delete may happen twice (see htb_delete) */
1289 	hlist_del_init(&cl->hlist);
1290 	list_del(&cl->sibling);
1291 
1292 	if (cl->prio_activity)
1293 		htb_deactivate(q, cl);
1294 
1295 	if (cl->cmode != HTB_CAN_SEND)
1296 		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
1297 
1298 	kfree(cl);
1299 }
1300 
1301 /* always called under BH & queue lock */
1302 static void htb_destroy(struct Qdisc *sch)
1303 {
1304 	struct htb_sched *q = qdisc_priv(sch);
1305 
1306 	qdisc_watchdog_cancel(&q->watchdog);
1307 #ifdef HTB_RATECM
1308 	del_timer_sync(&q->rttim);
1309 #endif
1310 	/* This line used to be after htb_destroy_class call below
1311 	   and surprisingly it worked in 2.4. But it must precede it
1312 	   because a filter needs its target class alive to be able to call
1313 	   unbind_filter on it (without Oops). */
1314 	tcf_destroy_chain(q->filter_list);
1315 
1316 	while (!list_empty(&q->root))
1317 		htb_destroy_class(sch, list_entry(q->root.next,
1318 						  struct htb_class, sibling));
1319 
1320 	__skb_queue_purge(&q->direct_queue);
1321 }
1322 
1323 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1324 {
1325 	struct htb_sched *q = qdisc_priv(sch);
1326 	struct htb_class *cl = (struct htb_class *)arg;
1327 	unsigned int qlen;
1328 	struct Qdisc *new_q = NULL;
1329 	int last_child = 0;
1330 
1331 	// TODO: why don't we allow deleting a subtree ? references ? does the
1332 	// tc subsys guarantee us that in htb_destroy it holds no class
1333 	// refs, so that we can remove children safely there ?
1334 	if (!list_empty(&cl->children) || cl->filter_cnt)
1335 		return -EBUSY;
1336 
1337 	if (!cl->level && htb_parent_last_child(cl)) {
1338 		new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1339 						cl->parent->classid);
1340 		last_child = 1;
1341 	}
1342 
1343 	sch_tree_lock(sch);
1344 
1345 	if (!cl->level) {
1346 		qlen = cl->un.leaf.q->q.qlen;
1347 		qdisc_reset(cl->un.leaf.q);
1348 		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
1349 	}
1350 
1351 	/* delete from hash and active; remainder in destroy_class */
1352 	hlist_del_init(&cl->hlist);
1353 
1354 	if (cl->prio_activity)
1355 		htb_deactivate(q, cl);
1356 
1357 	if (last_child)
1358 		htb_parent_to_leaf(cl, new_q);
1359 
1360 	if (--cl->refcnt == 0)
1361 		htb_destroy_class(sch, cl);
1362 
1363 	sch_tree_unlock(sch);
1364 	return 0;
1365 }
1366 
1367 static void htb_put(struct Qdisc *sch, unsigned long arg)
1368 {
1369 	struct htb_class *cl = (struct htb_class *)arg;
1370 
1371 	if (--cl->refcnt == 0)
1372 		htb_destroy_class(sch, cl);
1373 }
1374 
1375 static int htb_change_class(struct Qdisc *sch, u32 classid,
1376 			    u32 parentid, struct rtattr **tca,
1377 			    unsigned long *arg)
1378 {
1379 	int err = -EINVAL;
1380 	struct htb_sched *q = qdisc_priv(sch);
1381 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1382 	struct rtattr *opt = tca[TCA_OPTIONS - 1];
1383 	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
1384 	struct rtattr *tb[TCA_HTB_RTAB];
1385 	struct tc_htb_opt *hopt;
1386 
1387 	/* extract all subattrs from opt attr */
1388 	if (!opt || rtattr_parse_nested(tb, TCA_HTB_RTAB, opt) ||
1389 	    tb[TCA_HTB_PARMS - 1] == NULL ||
1390 	    RTA_PAYLOAD(tb[TCA_HTB_PARMS - 1]) < sizeof(*hopt))
1391 		goto failure;
1392 
1393 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1394 
1395 	hopt = RTA_DATA(tb[TCA_HTB_PARMS - 1]);
1396 
1397 	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB - 1]);
1398 	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB - 1]);
1399 	if (!rtab || !ctab)
1400 		goto failure;
1401 
1402 	if (!cl) {		/* new class */
1403 		struct Qdisc *new_q;
1404 		int prio;
1405 
1406 		/* check for valid classid */
1407 		if (!classid || TC_H_MAJ(classid ^ sch->handle)
1408 		    || htb_find(classid, sch))
1409 			goto failure;
1410 
1411 		/* check maximal depth */
1412 		if (parent && parent->parent && parent->parent->level < 2) {
1413 			printk(KERN_ERR "htb: tree is too deep\n");
1414 			goto failure;
1415 		}
1416 		err = -ENOBUFS;
1417 		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
1418 			goto failure;
1419 
1420 		cl->refcnt = 1;
1421 		INIT_LIST_HEAD(&cl->sibling);
1422 		INIT_HLIST_NODE(&cl->hlist);
1423 		INIT_LIST_HEAD(&cl->children);
1424 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1425 		RB_CLEAR_NODE(&cl->pq_node);
1426 
1427 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1428 			RB_CLEAR_NODE(&cl->node[prio]);
1429 
1430 		/* create the leaf qdisc early because it uses kmalloc(GFP_KERNEL),
1431 		   which can't be used inside of sch_tree_lock()
1432 		   -- thanks to Karlis Peisenieks */
1433 		new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid);
1434 		sch_tree_lock(sch);
1435 		if (parent && !parent->level) {
1436 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1437 
1438 			/* turn parent into inner node */
1439 			qdisc_reset(parent->un.leaf.q);
1440 			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
1441 			qdisc_destroy(parent->un.leaf.q);
1442 			if (parent->prio_activity)
1443 				htb_deactivate(q, parent);
1444 
1445 			/* remove from evt list because of level change */
1446 			if (parent->cmode != HTB_CAN_SEND) {
1447 				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
1448 				parent->cmode = HTB_CAN_SEND;
1449 			}
1450 			parent->level = (parent->parent ? parent->parent->level
1451 					 : TC_HTB_MAXDEPTH) - 1;
1452 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1453 		}
1454 		/* leaf (we) needs elementary qdisc */
1455 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1456 
1457 		cl->classid = classid;
1458 		cl->parent = parent;
1459 
1460 		/* set class to be in HTB_CAN_SEND state */
1461 		cl->tokens = hopt->buffer;
1462 		cl->ctokens = hopt->cbuffer;
1463 		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
1464 		cl->t_c = psched_get_time();
1465 		cl->cmode = HTB_CAN_SEND;
1466 
1467 		/* attach to the hash list and parent's family */
1468 		hlist_add_head(&cl->hlist, q->hash + htb_hash(classid));
1469 		list_add_tail(&cl->sibling,
1470 			      parent ? &parent->children : &q->root);
1471 	} else
1472 		sch_tree_lock(sch);
1473 
1474 	/* there used to be a nasty bug here: we have to check that the node
1475 	   is really a leaf before changing cl->un.leaf ! */
1476 	if (!cl->level) {
1477 		cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum;
1478 		if (!hopt->quantum && cl->un.leaf.quantum < 1000) {
1479 			printk(KERN_WARNING
1480 			       "HTB: quantum of class %X is small. Consider r2q change.\n",
1481 			       cl->classid);
1482 			cl->un.leaf.quantum = 1000;
1483 		}
1484 		if (!hopt->quantum && cl->un.leaf.quantum > 200000) {
1485 			printk(KERN_WARNING
1486 			       "HTB: quantum of class %X is big. Consider r2q change.\n",
1487 			       cl->classid);
1488 			cl->un.leaf.quantum = 200000;
1489 		}
1490 		if (hopt->quantum)
1491 			cl->un.leaf.quantum = hopt->quantum;
1492 		if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO)
1493 			cl->un.leaf.prio = TC_HTB_NUMPRIO - 1;
1494 
1495 		/* backup for htb_parent_to_leaf */
1496 		cl->quantum = cl->un.leaf.quantum;
1497 		cl->prio = cl->un.leaf.prio;
1498 	}
1499 
1500 	cl->buffer = hopt->buffer;
1501 	cl->cbuffer = hopt->cbuffer;
1502 	if (cl->rate)
1503 		qdisc_put_rtab(cl->rate);
1504 	cl->rate = rtab;
1505 	if (cl->ceil)
1506 		qdisc_put_rtab(cl->ceil);
1507 	cl->ceil = ctab;
1508 	sch_tree_unlock(sch);
1509 
1510 	*arg = (unsigned long)cl;
1511 	return 0;
1512 
1513 failure:
1514 	if (rtab)
1515 		qdisc_put_rtab(rtab);
1516 	if (ctab)
1517 		qdisc_put_rtab(ctab);
1518 	return err;
1519 }
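
/*
 * Editor's note, a worked example of the quantum sizing in htb_change_class()
 * (illustrative numbers, r2q = 10 as tc usually defaults to): a class with
 * rate 100kbit is 12500 bytes/sec, giving quantum = 12500 / 10 = 1250 bytes,
 * roughly one MTU-sized packet per DRR round.  A 1gbit class would compute
 * 12500000 bytes and trigger the "quantum ... is big" warning above, which
 * is the cue to raise r2q or set the quantum explicitly.
 */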
1520 
1521 static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1522 {
1523 	struct htb_sched *q = qdisc_priv(sch);
1524 	struct htb_class *cl = (struct htb_class *)arg;
1525 	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1526 
1527 	return fl;
1528 }
1529 
1530 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1531 				     u32 classid)
1532 {
1533 	struct htb_sched *q = qdisc_priv(sch);
1534 	struct htb_class *cl = htb_find(classid, sch);
1535 
1536 	/*if (cl && !cl->level) return 0;
1537 	   The line above used to be there to prevent attaching filters to
1538 	   leaves. But at least the tc_index filter uses this just to get the class
1539 	   for other reasons, so we have to allow for it.
1540 	   ----
1541 	   19.6.2002 As Werner explained it is ok - bind_filter is just
1542 	   another way to "lock" the class - unlike "get", this lock can
1543 	   be broken by the class during destroy, IIUC.
1544 	 */
1545 	if (cl)
1546 		cl->filter_cnt++;
1547 	else
1548 		q->filter_cnt++;
1549 	return (unsigned long)cl;
1550 }
1551 
1552 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1553 {
1554 	struct htb_sched *q = qdisc_priv(sch);
1555 	struct htb_class *cl = (struct htb_class *)arg;
1556 
1557 	if (cl)
1558 		cl->filter_cnt--;
1559 	else
1560 		q->filter_cnt--;
1561 }
1562 
1563 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1564 {
1565 	struct htb_sched *q = qdisc_priv(sch);
1566 	int i;
1567 
1568 	if (arg->stop)
1569 		return;
1570 
1571 	for (i = 0; i < HTB_HSIZE; i++) {
1572 		struct hlist_node *p;
1573 		struct htb_class *cl;
1574 
1575 		hlist_for_each_entry(cl, p, q->hash + i, hlist) {
1576 			if (arg->count < arg->skip) {
1577 				arg->count++;
1578 				continue;
1579 			}
1580 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1581 				arg->stop = 1;
1582 				return;
1583 			}
1584 			arg->count++;
1585 		}
1586 	}
1587 }
1588 
1589 static struct Qdisc_class_ops htb_class_ops = {
1590 	.graft		=	htb_graft,
1591 	.leaf		=	htb_leaf,
1592 	.qlen_notify	=	htb_qlen_notify,
1593 	.get		=	htb_get,
1594 	.put		=	htb_put,
1595 	.change		=	htb_change_class,
1596 	.delete		=	htb_delete,
1597 	.walk		=	htb_walk,
1598 	.tcf_chain	=	htb_find_tcf,
1599 	.bind_tcf	=	htb_bind_filter,
1600 	.unbind_tcf	=	htb_unbind_filter,
1601 	.dump		=	htb_dump_class,
1602 	.dump_stats	=	htb_dump_class_stats,
1603 };
1604 
1605 static struct Qdisc_ops htb_qdisc_ops = {
1606 	.next		=	NULL,
1607 	.cl_ops		=	&htb_class_ops,
1608 	.id		=	"htb",
1609 	.priv_size	=	sizeof(struct htb_sched),
1610 	.enqueue	=	htb_enqueue,
1611 	.dequeue	=	htb_dequeue,
1612 	.requeue	=	htb_requeue,
1613 	.drop		=	htb_drop,
1614 	.init		=	htb_init,
1615 	.reset		=	htb_reset,
1616 	.destroy	=	htb_destroy,
1617 	.change		=	NULL /* htb_change */,
1618 	.dump		=	htb_dump,
1619 	.owner		=	THIS_MODULE,
1620 };
1621 
1622 static int __init htb_module_init(void)
1623 {
1624 	return register_qdisc(&htb_qdisc_ops);
1625 }
1626 static void __exit htb_module_exit(void)
1627 {
1628 	unregister_qdisc(&htb_qdisc_ops);
1629 }
1630 
1631 module_init(htb_module_init)
1632 module_exit(htb_module_exit)
1633 MODULE_LICENSE("GPL");
1634