1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  *
28  * $Id: sch_htb.c,v 1.25 2003/12/07 11:08:25 devik Exp devik $
29  */
30 #include <linux/module.h>
31 #include <asm/uaccess.h>
32 #include <asm/system.h>
33 #include <linux/bitops.h>
34 #include <linux/types.h>
35 #include <linux/kernel.h>
36 #include <linux/sched.h>
37 #include <linux/string.h>
38 #include <linux/mm.h>
39 #include <linux/socket.h>
40 #include <linux/sockios.h>
41 #include <linux/in.h>
42 #include <linux/errno.h>
43 #include <linux/interrupt.h>
44 #include <linux/if_ether.h>
45 #include <linux/inet.h>
46 #include <linux/netdevice.h>
47 #include <linux/etherdevice.h>
48 #include <linux/notifier.h>
49 #include <net/ip.h>
50 #include <net/route.h>
51 #include <linux/skbuff.h>
52 #include <linux/list.h>
53 #include <linux/compiler.h>
54 #include <net/sock.h>
55 #include <net/pkt_sched.h>
56 #include <linux/rbtree.h>
57 
58 /* HTB algorithm.
59     Author: devik@cdi.cz
60     ========================================================================
61     HTB is like TBF with multiple classes. It is also similar to CBQ because
62     it allows assigning a priority to each class in the hierarchy.
63     In fact it is another implementation of Floyd's formal sharing.
64 
65     Levels:
66     Each class is assigned a level. A leaf is ALWAYS at level 0 and root
67     classes are at level TC_HTB_MAXDEPTH-1. Interior nodes have a level
68     one less than their parent.
69 */
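/* Illustrative example (not part of the code): assuming, purely for
   illustration, TC_HTB_MAXDEPTH == 3, a hierarchy

		root class	(level 2)
		     |
		inner class	(level 1)
		  /       \
	     leaf A      leaf B	(level 0)

   keeps all leaves at level 0 while each interior class sits one level
   below its parent; the rows, feed trees and wait queues below are all
   indexed by this level. */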
70 
71 #define HTB_HSIZE 16		/* classid hash size */
72 #define HTB_EWMAC 2		/* rate average over HTB_EWMAC*HTB_HSIZE sec */
73 #define HTB_RATECM 1		/* whether to use rate computer */
74 #define HTB_HYSTERESIS 1	/* whether to use mode hysteresis for speedup */
75 #define HTB_VER 0x30011		/* major must be matched with the number supplied by TC as version */
76 
77 #if HTB_VER >> 16 != TC_HTB_PROTOVER
78 #error "Mismatched sch_htb.c and pkt_sch.h"
79 #endif
80 
81 /* used internally to keep the status of a single class */
82 enum htb_cmode {
83 	HTB_CANT_SEND,		/* class can't send and can't borrow */
84 	HTB_MAY_BORROW,		/* class can't send but may borrow */
85 	HTB_CAN_SEND		/* class can send */
86 };
87 
88 /* interior & leaf nodes; props specific to leaves are marked L: */
89 struct htb_class {
90 	/* general class parameters */
91 	u32 classid;
92 	struct gnet_stats_basic bstats;
93 	struct gnet_stats_queue qstats;
94 	struct gnet_stats_rate_est rate_est;
95 	struct tc_htb_xstats xstats;	/* our special stats */
96 	int refcnt;		/* usage count of this class */
97 
98 #ifdef HTB_RATECM
99 	/* rate measurement counters */
100 	unsigned long rate_bytes, sum_bytes;
101 	unsigned long rate_packets, sum_packets;
102 #endif
103 
104 	/* topology */
105 	int level;		/* our level (see above) */
106 	struct htb_class *parent;	/* parent class */
107 	struct hlist_node hlist;	/* classid hash list item */
108 	struct list_head sibling;	/* sibling list item */
109 	struct list_head children;	/* children list */
110 
111 	union {
112 		struct htb_class_leaf {
113 			struct Qdisc *q;
114 			int prio;
115 			int aprio;
116 			int quantum;
117 			int deficit[TC_HTB_MAXDEPTH];
118 			struct list_head drop_list;
119 		} leaf;
120 		struct htb_class_inner {
121 			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
122 			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
123 			/* When a class changes from state 1->2 and disconnects from
124 			   its parent's feed, the ptr value is lost and we would start
125 			   from the first child again. Here we store the classid of
126 			   the last valid ptr (used when ptr is NULL). */
127 			u32 last_ptr_id[TC_HTB_NUMPRIO];
128 		} inner;
129 	} un;
130 	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
131 	struct rb_node pq_node;	/* node for event queue */
132 	unsigned long pq_key;	/* the same type as jiffies global */
133 
134 	int prio_activity;	/* for which prios are we active */
135 	enum htb_cmode cmode;	/* current mode of the class */
136 
137 	/* class attached filters */
138 	struct tcf_proto *filter_list;
139 	int filter_cnt;
140 
141 	int warned;		/* print only one warning about a non-work-conserving class */
142 
143 	/* token bucket parameters */
144 	struct qdisc_rate_table *rate;	/* rate table of the class itself */
145 	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
146 	long buffer, cbuffer;	/* token bucket depth/rate */
147 	psched_tdiff_t mbuffer;	/* max wait time */
148 	long tokens, ctokens;	/* current number of tokens */
149 	psched_time_t t_c;	/* checkpoint time */
150 
151 	int prio;		/* backup of un.leaf.prio, so a parent can be turned back into a leaf */
152 	int quantum;		/* backup of un.leaf.quantum. Eventually a full replacement */
153 				/* of the un.leaf originals should be done. */
154 };
155 
156 /* TODO: maybe compute rate when size is too large .. or drop ? */
157 static inline long L2T(struct htb_class *cl, struct qdisc_rate_table *rate,
158 			   int size)
159 {
160 	int slot = size >> rate->rate.cell_log;
161 	if (slot > 255) {
162 		cl->xstats.giants++;
163 		slot = 255;
164 	}
165 	return rate->data[slot];
166 }
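/* Worked example for L2T() above (hypothetical numbers): with
 * rate->rate.cell_log == 3 and size == 1500, slot = 1500 >> 3 = 187,
 * which fits in 0..255, so the function simply returns rate->data[187]
 * (the transmission time of such a packet at the configured rate).
 * A 3000-byte packet would give slot = 375; it is clamped to 255 and
 * counted in cl->xstats.giants.
 */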
167 
168 struct htb_sched {
169 	struct list_head root;	/* root classes list */
170 	struct hlist_head hash[HTB_HSIZE];	/* hashed by classid */
171 	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
172 
173 	/* self list - roots of self generating tree */
174 	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
175 	int row_mask[TC_HTB_MAXDEPTH];
176 	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
177 	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
178 
179 	/* self wait list - roots of wait PQs per row */
180 	struct rb_root wait_pq[TC_HTB_MAXDEPTH];
181 
182 	/* time of nearest event per level (row) */
183 	unsigned long near_ev_cache[TC_HTB_MAXDEPTH];
184 
185 	/* cached value of jiffies in dequeue */
186 	unsigned long jiffies;
187 
188 	/* whether we hit non-work conserving class during this dequeue; we use */
189 	int nwc_hit;		/* this to disable mindelay complaint in dequeue */
190 
191 	int defcls;		/* class where unclassified flows go to */
192 
193 	/* filters for qdisc itself */
194 	struct tcf_proto *filter_list;
195 	int filter_cnt;
196 
197 	int rate2quantum;	/* quant = rate / rate2quantum */
198 	psched_time_t now;	/* cached dequeue time */
199 	struct timer_list timer;	/* send delay timer */
200 #ifdef HTB_RATECM
201 	struct timer_list rttim;	/* rate computer timer */
202 	int recmp_bucket;	/* which hash bucket to recompute next */
203 #endif
204 
205 	/* non shaped skbs; let them go directly thru */
206 	struct sk_buff_head direct_queue;
207 	int direct_qlen;	/* max qlen of above */
208 
209 	long direct_pkts;
210 };
211 
212 /* compute hash of size HTB_HSIZE for given handle */
213 static inline int htb_hash(u32 h)
214 {
215 #if HTB_HSIZE != 16
216 #error "Declare new hash for your HTB_HSIZE"
217 #endif
218 	h ^= h >> 8;		/* stolen from cbq_hash */
219 	h ^= h >> 4;
220 	return h & 0xf;
221 }
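/* Worked example for htb_hash() (hypothetical handle): for classid
 * 0x00010012 (1:12 in tc's hex notation), h ^= h >> 8 gives 0x00010112,
 * h ^= h >> 4 gives 0x00011103, and h & 0xf selects hash bucket 3.
 */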
222 
223 /* find class in global hash table using given handle */
224 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
225 {
226 	struct htb_sched *q = qdisc_priv(sch);
227 	struct hlist_node *p;
228 	struct htb_class *cl;
229 
230 	if (TC_H_MAJ(handle) != sch->handle)
231 		return NULL;
232 
233 	hlist_for_each_entry(cl, p, q->hash + htb_hash(handle), hlist) {
234 		if (cl->classid == handle)
235 			return cl;
236 	}
237 	return NULL;
238 }
239 
240 /**
241  * htb_classify - classify a packet into class
242  *
243  * It returns NULL if the packet should be dropped or -1 if the packet
244  * should be passed directly thru. In all other cases a leaf class is returned.
245  * We allow direct class selection by classid in skb->priority. Then we examine
246  * filters in the qdisc and in inner nodes (if a higher filter points to the
247  * inner node). If we end up with classid MAJOR:0 we enqueue the skb into the
248  * special internal fifo (direct). These packets then go directly thru. If we
249  * still have no valid leaf we try to use the MAJOR:default leaf. If that is
250  * still unsuccessful we finish and return the direct queue.
251  */
252 #define HTB_DIRECT (struct htb_class*)-1
253 static inline u32 htb_classid(struct htb_class *cl)
254 {
255 	return (cl && cl != HTB_DIRECT) ? cl->classid : TC_H_UNSPEC;
256 }
257 
258 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
259 				      int *qerr)
260 {
261 	struct htb_sched *q = qdisc_priv(sch);
262 	struct htb_class *cl;
263 	struct tcf_result res;
264 	struct tcf_proto *tcf;
265 	int result;
266 
267 	/* allow the user to select a class by setting skb->priority to a valid
268 	   classid; note that nfmark can be used too, by attaching an fw filter
269 	   with no rules in it */
270 	if (skb->priority == sch->handle)
271 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
272 	if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
273 		return cl;
274 
275 	*qerr = NET_XMIT_BYPASS;
276 	tcf = q->filter_list;
277 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
278 #ifdef CONFIG_NET_CLS_ACT
279 		switch (result) {
280 		case TC_ACT_QUEUED:
281 		case TC_ACT_STOLEN:
282 			*qerr = NET_XMIT_SUCCESS;
283 		case TC_ACT_SHOT:
284 			return NULL;
285 		}
286 #elif defined(CONFIG_NET_CLS_POLICE)
287 		if (result == TC_POLICE_SHOT)
288 			return HTB_DIRECT;
289 #endif
290 		if ((cl = (void *)res.class) == NULL) {
291 			if (res.classid == sch->handle)
292 				return HTB_DIRECT;	/* X:0 (direct flow) */
293 			if ((cl = htb_find(res.classid, sch)) == NULL)
294 				break;	/* filter selected invalid classid */
295 		}
296 		if (!cl->level)
297 			return cl;	/* we hit leaf; return it */
298 
299 		/* we have got inner class; apply inner filter chain */
300 		tcf = cl->filter_list;
301 	}
302 	/* classification failed; try to use default class */
303 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
304 	if (!cl || cl->level)
305 		return HTB_DIRECT;	/* bad default .. this is safe bet */
306 	return cl;
307 }
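/*
 * Illustrative sketch of the direct-selection path above (userspace code,
 * assuming an HTB qdisc with handle 1: and an existing leaf class 1:10;
 * note that setting a priority above 6 may require CAP_NET_ADMIN):
 *
 *	int prio = 0x00010010;	// classid 1:10 in TC_H_MAKE() form
 *	setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));
 *
 * skb->priority then matches a valid leaf classid and htb_classify()
 * returns that class without consulting any filters; a value equal to
 * the qdisc handle itself (1:0) would select the direct queue instead.
 */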
308 
309 /**
310  * htb_add_to_id_tree - adds class to the round robin list
311  *
312  * The routine adds the class to the list (actually a tree) sorted by classid.
313  * Make sure that the class is not already on such a list for the given prio.
314  */
315 static void htb_add_to_id_tree(struct rb_root *root,
316 			       struct htb_class *cl, int prio)
317 {
318 	struct rb_node **p = &root->rb_node, *parent = NULL;
319 
320 	while (*p) {
321 		struct htb_class *c;
322 		parent = *p;
323 		c = rb_entry(parent, struct htb_class, node[prio]);
324 
325 		if (cl->classid > c->classid)
326 			p = &parent->rb_right;
327 		else
328 			p = &parent->rb_left;
329 	}
330 	rb_link_node(&cl->node[prio], parent, p);
331 	rb_insert_color(&cl->node[prio], root);
332 }
333 
334 /**
335  * htb_add_to_wait_tree - adds class to the event queue with delay
336  *
337  * The class is added to the priority event queue to indicate that the class
338  * will change its mode at time cl->pq_key (in jiffies). Make sure that the
339  * class is not already in the queue.
340  */
341 static void htb_add_to_wait_tree(struct htb_sched *q,
342 				 struct htb_class *cl, long delay)
343 {
344 	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
345 
346 	cl->pq_key = q->jiffies + PSCHED_US2JIFFIE(delay);
347 	if (cl->pq_key == q->jiffies)
348 		cl->pq_key++;
349 
350 	/* update the nearest event cache */
351 	if (time_after(q->near_ev_cache[cl->level], cl->pq_key))
352 		q->near_ev_cache[cl->level] = cl->pq_key;
353 
354 	while (*p) {
355 		struct htb_class *c;
356 		parent = *p;
357 		c = rb_entry(parent, struct htb_class, pq_node);
358 		if (time_after_eq(cl->pq_key, c->pq_key))
359 			p = &parent->rb_right;
360 		else
361 			p = &parent->rb_left;
362 	}
363 	rb_link_node(&cl->pq_node, parent, p);
364 	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
365 }
366 
367 /**
368  * htb_next_rb_node - finds next node in binary tree
369  *
370  * When we are past last key we return NULL.
371  * Average complexity is 2 steps per call.
372  */
373 static inline void htb_next_rb_node(struct rb_node **n)
374 {
375 	*n = rb_next(*n);
376 }
377 
378 /**
379  * htb_add_class_to_row - add class to its row
380  *
381  * The class is added to row at priorities marked in mask.
382  * It does nothing if mask == 0.
383  */
384 static inline void htb_add_class_to_row(struct htb_sched *q,
385 					struct htb_class *cl, int mask)
386 {
387 	q->row_mask[cl->level] |= mask;
388 	while (mask) {
389 		int prio = ffz(~mask);
390 		mask &= ~(1 << prio);
391 		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
392 	}
393 }
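/* The ffz(~mask) idiom above extracts the index of the lowest set bit.
 * Worked example (hypothetical mask): with mask == 0x06 (prios 1 and 2
 * active), the first iteration gets prio = ffz(~0x06) = 1 and clears it
 * leaving mask == 0x04, the second gets prio = 2, and the loop ends with
 * mask == 0. The same pattern is used throughout this file.
 */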
394 
395 /* If this triggers, it is a bug in this code, but it need not be fatal */
396 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
397 {
398 	if (RB_EMPTY_NODE(rb)) {
399 		WARN_ON(1);
400 	} else {
401 		rb_erase(rb, root);
402 		RB_CLEAR_NODE(rb);
403 	}
404 }
405 
406 
407 /**
408  * htb_remove_class_from_row - removes class from its row
409  *
410  * The class is removed from row at priorities marked in mask.
411  * It does nothing if mask == 0.
412  */
413 static inline void htb_remove_class_from_row(struct htb_sched *q,
414 						 struct htb_class *cl, int mask)
415 {
416 	int m = 0;
417 
418 	while (mask) {
419 		int prio = ffz(~mask);
420 
421 		mask &= ~(1 << prio);
422 		if (q->ptr[cl->level][prio] == cl->node + prio)
423 			htb_next_rb_node(q->ptr[cl->level] + prio);
424 
425 		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
426 		if (!q->row[cl->level][prio].rb_node)
427 			m |= 1 << prio;
428 	}
429 	q->row_mask[cl->level] &= ~m;
430 }
431 
432 /**
433  * htb_activate_prios - creates the active class's feed chain
434  *
435  * The class is connected to its ancestors and/or the appropriate rows
436  * for the priorities it participates in. cl->cmode must be the new
437  * (activated) mode. It does nothing if cl->prio_activity == 0.
438  */
439 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
440 {
441 	struct htb_class *p = cl->parent;
442 	long m, mask = cl->prio_activity;
443 
444 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
445 		m = mask;
446 		while (m) {
447 			int prio = ffz(~m);
448 			m &= ~(1 << prio);
449 
450 			if (p->un.inner.feed[prio].rb_node)
451 				/* parent already has its feed in use, so reset
452 				   the bit in mask as the parent is already OK */
453 				mask &= ~(1 << prio);
454 
455 			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
456 		}
457 		p->prio_activity |= mask;
458 		cl = p;
459 		p = cl->parent;
460 
461 	}
462 	if (cl->cmode == HTB_CAN_SEND && mask)
463 		htb_add_class_to_row(q, cl, mask);
464 }
465 
466 /**
467  * htb_deactivate_prios - remove class from feed chain
468  *
469  * cl->cmode must represent old mode (before deactivation). It does
470  * nothing if cl->prio_activity == 0. Class is removed from all feed
471  * chains and rows.
472  */
473 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
474 {
475 	struct htb_class *p = cl->parent;
476 	long m, mask = cl->prio_activity;
477 
478 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
479 		m = mask;
480 		mask = 0;
481 		while (m) {
482 			int prio = ffz(~m);
483 			m &= ~(1 << prio);
484 
485 			if (p->un.inner.ptr[prio] == cl->node + prio) {
486 				/* we are removing a child which the parent feed
487 				   points to - forget the pointer but remember the
488 				   classid */
489 				p->un.inner.last_ptr_id[prio] = cl->classid;
490 				p->un.inner.ptr[prio] = NULL;
491 			}
492 
493 			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
494 
495 			if (!p->un.inner.feed[prio].rb_node)
496 				mask |= 1 << prio;
497 		}
498 
499 		p->prio_activity &= ~mask;
500 		cl = p;
501 		p = cl->parent;
502 
503 	}
504 	if (cl->cmode == HTB_CAN_SEND && mask)
505 		htb_remove_class_from_row(q, cl, mask);
506 }
507 
508 #if HTB_HYSTERESIS
509 static inline long htb_lowater(const struct htb_class *cl)
510 {
511 	return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
512 }
513 static inline long htb_hiwater(const struct htb_class *cl)
514 {
515 	return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
516 }
517 #else
518 #define htb_lowater(cl)	(0)
519 #define htb_hiwater(cl)	(0)
520 #endif
521 
522 /**
523  * htb_class_mode - computes and returns current class mode
524  *
525  * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
526  * is not HTB_CAN_SEND then *diff is set to the amount by which the tokens
527  * are short, which is later used to compute when cl will change its state.
528  * It is also worth noting that the class mode doesn't change simply
529  * at cl->{c,}tokens == 0; rather, there is a hysteresis in the
530  * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
531  * mode transitions per time unit. The speed gain is about 1/6.
532  */
533 static inline enum htb_cmode
534 htb_class_mode(struct htb_class *cl, long *diff)
535 {
536 	long toks;
537 
538 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
539 		*diff = -toks;
540 		return HTB_CANT_SEND;
541 	}
542 
543 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
544 		return HTB_CAN_SEND;
545 
546 	*diff = -toks;
547 	return HTB_MAY_BORROW;
548 }
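/* Worked example for htb_class_mode() (hypothetical token values, taking
 * htb_lowater() == htb_hiwater() == 0, i.e. ignoring hysteresis):
 *   - cl->ctokens + *diff == -5000: over ceil, so *diff becomes 5000 and
 *     HTB_CANT_SEND is returned;
 *   - otherwise, if cl->tokens + *diff == -2000: over rate but under ceil,
 *     so *diff becomes 2000 and HTB_MAY_BORROW is returned;
 *   - if both sums are >= 0, HTB_CAN_SEND is returned and *diff is left
 *     untouched.
 */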
549 
550 /**
551  * htb_change_class_mode - changes a class's mode
552  *
553  * This should be the only way to change a class's mode under normal
554  * circumstances. The routine will update the feed list linkage, change the
555  * mode and add the class to the wait event queue if appropriate. The new mode
556  * should be different from the old one and cl->pq_key has to be valid if
557  * changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
558  */
559 static void
560 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
561 {
562 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
563 
564 	if (new_mode == cl->cmode)
565 		return;
566 
567 	if (cl->prio_activity) {	/* not necessary: speed optimization */
568 		if (cl->cmode != HTB_CANT_SEND)
569 			htb_deactivate_prios(q, cl);
570 		cl->cmode = new_mode;
571 		if (new_mode != HTB_CANT_SEND)
572 			htb_activate_prios(q, cl);
573 	} else
574 		cl->cmode = new_mode;
575 }
576 
577 /**
578  * htb_activate - inserts leaf cl into appropriate active feeds
579  *
580  * Routine learns (new) priority of leaf and activates feed chain
581  * for the prio. It can be called on already active leaf safely.
582  * It also adds leaf into droplist.
583  */
584 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
585 {
586 	BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen);
587 
588 	if (!cl->prio_activity) {
589 		cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio);
590 		htb_activate_prios(q, cl);
591 		list_add_tail(&cl->un.leaf.drop_list,
592 			      q->drops + cl->un.leaf.aprio);
593 	}
594 }
595 
596 /**
597  * htb_deactivate - remove leaf cl from active feeds
598  *
599  * Make sure that the leaf is active. In other words, it can't be called
600  * with a non-active leaf. It also removes the class from the drop list.
601  */
602 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
603 {
604 	BUG_TRAP(cl->prio_activity);
605 
606 	htb_deactivate_prios(q, cl);
607 	cl->prio_activity = 0;
608 	list_del_init(&cl->un.leaf.drop_list);
609 }
610 
611 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
612 {
613 	int ret;
614 	struct htb_sched *q = qdisc_priv(sch);
615 	struct htb_class *cl = htb_classify(skb, sch, &ret);
616 
617 	if (cl == HTB_DIRECT) {
618 		/* enqueue to helper queue */
619 		if (q->direct_queue.qlen < q->direct_qlen) {
620 			__skb_queue_tail(&q->direct_queue, skb);
621 			q->direct_pkts++;
622 		} else {
623 			kfree_skb(skb);
624 			sch->qstats.drops++;
625 			return NET_XMIT_DROP;
626 		}
627 #ifdef CONFIG_NET_CLS_ACT
628 	} else if (!cl) {
629 		if (ret == NET_XMIT_BYPASS)
630 			sch->qstats.drops++;
631 		kfree_skb(skb);
632 		return ret;
633 #endif
634 	} else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) !=
635 		   NET_XMIT_SUCCESS) {
636 		sch->qstats.drops++;
637 		cl->qstats.drops++;
638 		return NET_XMIT_DROP;
639 	} else {
640 		cl->bstats.packets++;
641 		cl->bstats.bytes += skb->len;
642 		htb_activate(q, cl);
643 	}
644 
645 	sch->q.qlen++;
646 	sch->bstats.packets++;
647 	sch->bstats.bytes += skb->len;
648 	return NET_XMIT_SUCCESS;
649 }
650 
651 /* TODO: requeuing packet charges it to policers again !! */
652 static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch)
653 {
654 	struct htb_sched *q = qdisc_priv(sch);
655 	int ret = NET_XMIT_SUCCESS;
656 	struct htb_class *cl = htb_classify(skb, sch, &ret);
657 	struct sk_buff *tskb;
658 
659 	if (cl == HTB_DIRECT || !cl) {
660 		/* enqueue to helper queue */
661 		if (q->direct_queue.qlen < q->direct_qlen && cl) {
662 			__skb_queue_head(&q->direct_queue, skb);
663 		} else {
664 			__skb_queue_head(&q->direct_queue, skb);
665 			tskb = __skb_dequeue_tail(&q->direct_queue);
666 			kfree_skb(tskb);
667 			sch->qstats.drops++;
668 			return NET_XMIT_CN;
669 		}
670 	} else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) !=
671 		   NET_XMIT_SUCCESS) {
672 		sch->qstats.drops++;
673 		cl->qstats.drops++;
674 		return NET_XMIT_DROP;
675 	} else
676 		htb_activate(q, cl);
677 
678 	sch->q.qlen++;
679 	sch->qstats.requeues++;
680 	return NET_XMIT_SUCCESS;
681 }
682 
683 static void htb_timer(unsigned long arg)
684 {
685 	struct Qdisc *sch = (struct Qdisc *)arg;
686 	sch->flags &= ~TCQ_F_THROTTLED;
687 	wmb();
688 	netif_schedule(sch->dev);
689 }
690 
691 #ifdef HTB_RATECM
692 #define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0
693 static void htb_rate_timer(unsigned long arg)
694 {
695 	struct Qdisc *sch = (struct Qdisc *)arg;
696 	struct htb_sched *q = qdisc_priv(sch);
697 	struct hlist_node *p;
698 	struct htb_class *cl;
699 
700 
701 	/* lock queue so that we can muck with it */
702 	spin_lock_bh(&sch->dev->queue_lock);
703 
704 	q->rttim.expires = jiffies + HZ;
705 	add_timer(&q->rttim);
706 
707 	/* scan and recompute one bucket at time */
708 	if (++q->recmp_bucket >= HTB_HSIZE)
709 		q->recmp_bucket = 0;
710 
711 	hlist_for_each_entry(cl, p, q->hash + q->recmp_bucket, hlist) {
712 		RT_GEN(cl->sum_bytes, cl->rate_bytes);
713 		RT_GEN(cl->sum_packets, cl->rate_packets);
714 	}
715 	spin_unlock_bh(&sch->dev->queue_lock);
716 }
717 #endif
718 
719 /**
720  * htb_charge_class - charges amount "bytes" to leaf and ancestors
721  *
722  * The routine assumes that a packet "bytes" long was dequeued from leaf cl,
723  * borrowing from "level". It accounts the bytes to the ceil leaky bucket for
724  * the leaf and all ancestors, and to the rate bucket for classes at levels
725  * "level" and higher. It also handles a possible change of mode resulting
726  * from the update. Note that the mode can also increase here (MAY_BORROW to
727  * CAN_SEND) because we can use a more precise clock than the event queue here.
728  * In such a case we remove the class from the event queue first.
729  */
730 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
731 			     int level, int bytes)
732 {
733 	long toks, diff;
734 	enum htb_cmode old_mode;
735 
736 #define HTB_ACCNT(T,B,R) toks = diff + cl->T; \
737 	if (toks > cl->B) toks = cl->B; \
738 	toks -= L2T(cl, cl->R, bytes); \
739 	if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \
740 	cl->T = toks
741 
742 	while (cl) {
743 		diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32) cl->mbuffer);
744 		if (cl->level >= level) {
745 			if (cl->level == level)
746 				cl->xstats.lends++;
747 			HTB_ACCNT(tokens, buffer, rate);
748 		} else {
749 			cl->xstats.borrows++;
750 			cl->tokens += diff;	/* we moved t_c; update tokens */
751 		}
752 		HTB_ACCNT(ctokens, cbuffer, ceil);
753 		cl->t_c = q->now;
754 
755 		old_mode = cl->cmode;
756 		diff = 0;
757 		htb_change_class_mode(q, cl, &diff);
758 		if (old_mode != cl->cmode) {
759 			if (old_mode != HTB_CAN_SEND)
760 				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
761 			if (cl->cmode != HTB_CAN_SEND)
762 				htb_add_to_wait_tree(q, cl, diff);
763 		}
764 #ifdef HTB_RATECM
765 		/* update rate counters */
766 		cl->sum_bytes += bytes;
767 		cl->sum_packets++;
768 #endif
769 
770 		/* update byte stats except for leaves which are already updated */
771 		if (cl->level) {
772 			cl->bstats.bytes += bytes;
773 			cl->bstats.packets++;
774 		}
775 		cl = cl->parent;
776 	}
777 }
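/* Worked example for the HTB_ACCNT() macro above (hypothetical values):
 * with diff == 10000, cl->ctokens == -2000, cl->cbuffer == 5000 and
 * L2T(cl, cl->ceil, bytes) == 3000, we get toks = 10000 - 2000 = 8000,
 * clipped to the bucket depth 5000; subtracting 3000 leaves cl->ctokens == 2000.
 * Had the result dropped to -cl->mbuffer or below, it would have been
 * clamped to 1 - cl->mbuffer.
 */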
778 
779 /**
780  * htb_do_events - make mode changes to classes at the level
781  *
782  * Scans the event queue for pending events and applies them. Returns the number
783  * of jiffies to the next pending event (0 if there is no event in the pq).
784  * Note: applied are events which have cl->pq_key <= q->jiffies.
785  */
786 static long htb_do_events(struct htb_sched *q, int level)
787 {
788 	int i;
789 
790 	for (i = 0; i < 500; i++) {
791 		struct htb_class *cl;
792 		long diff;
793 		struct rb_node *p = rb_first(&q->wait_pq[level]);
794 
795 		if (!p)
796 			return 0;
797 
798 		cl = rb_entry(p, struct htb_class, pq_node);
799 		if (time_after(cl->pq_key, q->jiffies)) {
800 			return cl->pq_key - q->jiffies;
801 		}
802 		htb_safe_rb_erase(p, q->wait_pq + level);
803 		diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32) cl->mbuffer);
804 		htb_change_class_mode(q, cl, &diff);
805 		if (cl->cmode != HTB_CAN_SEND)
806 			htb_add_to_wait_tree(q, cl, diff);
807 	}
808 	if (net_ratelimit())
809 		printk(KERN_WARNING "htb: too many events !\n");
810 	return HZ / 10;
811 }
812 
813 /* Returns the class->node+prio from the id-tree with the smallest classid
814    that is >= id. Returns NULL if no such node exists. */
815 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
816 					      u32 id)
817 {
818 	struct rb_node *r = NULL;
819 	while (n) {
820 		struct htb_class *cl =
821 		    rb_entry(n, struct htb_class, node[prio]);
822 		if (id == cl->classid)
823 			return n;
824 
825 		if (id > cl->classid) {
826 			n = n->rb_right;
827 		} else {
828 			r = n;
829 			n = n->rb_left;
830 		}
831 	}
832 	return r;
833 }
834 
835 /**
836  * htb_lookup_leaf - returns next leaf class in DRR order
837  *
838  * Find the leaf that the current feed pointer points to.
839  */
840 static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
841 					 struct rb_node **pptr, u32 * pid)
842 {
843 	int i;
844 	struct {
845 		struct rb_node *root;
846 		struct rb_node **pptr;
847 		u32 *pid;
848 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
849 
850 	BUG_TRAP(tree->rb_node);
851 	sp->root = tree->rb_node;
852 	sp->pptr = pptr;
853 	sp->pid = pid;
854 
855 	for (i = 0; i < 65535; i++) {
856 		if (!*sp->pptr && *sp->pid) {
857 			/* ptr was invalidated but id is valid - try to recover
858 			   the original or next ptr */
859 			*sp->pptr =
860 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
861 		}
862 		*sp->pid = 0;	/* ptr is valid now, so remove this hint as it
863 				   can become out of date quickly */
864 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
865 			*sp->pptr = sp->root;
866 			while ((*sp->pptr)->rb_left)
867 				*sp->pptr = (*sp->pptr)->rb_left;
868 			if (sp > stk) {
869 				sp--;
870 				BUG_TRAP(*sp->pptr);
871 				if (!*sp->pptr)
872 					return NULL;
873 				htb_next_rb_node(sp->pptr);
874 			}
875 		} else {
876 			struct htb_class *cl;
877 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
878 			if (!cl->level)
879 				return cl;
880 			(++sp)->root = cl->un.inner.feed[prio].rb_node;
881 			sp->pptr = cl->un.inner.ptr + prio;
882 			sp->pid = cl->un.inner.last_ptr_id + prio;
883 		}
884 	}
885 	BUG_TRAP(0);
886 	return NULL;
887 }
888 
889 /* dequeues a packet at the given priority and level; call only if
890    you are sure that there is an active class at prio/level */
891 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
892 					int level)
893 {
894 	struct sk_buff *skb = NULL;
895 	struct htb_class *cl, *start;
896 	/* look initial class up in the row */
897 	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
898 				     q->ptr[level] + prio,
899 				     q->last_ptr_id[level] + prio);
900 
901 	do {
902 next:
903 		BUG_TRAP(cl);
904 		if (!cl)
905 			return NULL;
906 
907 		/* class can be empty - it is unlikely but can be true if the leaf
908 		   qdisc drops packets in its enqueue routine or if someone used
909 		   a graft operation on the leaf since the last dequeue;
910 		   simply deactivate and skip such a class */
911 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
912 			struct htb_class *next;
913 			htb_deactivate(q, cl);
914 
915 			/* row/level might become empty */
916 			if ((q->row_mask[level] & (1 << prio)) == 0)
917 				return NULL;
918 
919 			next = htb_lookup_leaf(q->row[level] + prio,
920 					       prio, q->ptr[level] + prio,
921 					       q->last_ptr_id[level] + prio);
922 
923 			if (cl == start)	/* fix start if we just deleted it */
924 				start = next;
925 			cl = next;
926 			goto next;
927 		}
928 
929 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
930 		if (likely(skb != NULL))
931 			break;
932 		if (!cl->warned) {
933 			printk(KERN_WARNING
934 			       "htb: class %X isn't work conserving ?!\n",
935 			       cl->classid);
936 			cl->warned = 1;
937 		}
938 		q->nwc_hit++;
939 		htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
940 				  ptr[0]) + prio);
941 		cl = htb_lookup_leaf(q->row[level] + prio, prio,
942 				     q->ptr[level] + prio,
943 				     q->last_ptr_id[level] + prio);
944 
945 	} while (cl != start);
946 
947 	if (likely(skb != NULL)) {
948 		if ((cl->un.leaf.deficit[level] -= skb->len) < 0) {
949 			cl->un.leaf.deficit[level] += cl->un.leaf.quantum;
950 			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
951 					  ptr[0]) + prio);
952 		}
953 		/* this used to be after charge_class but this ordering
954 		   gives us slightly better performance */
955 		if (!cl->un.leaf.q->q.qlen)
956 			htb_deactivate(q, cl);
957 		htb_charge_class(q, cl, level, skb->len);
958 	}
959 	return skb;
960 }
961 
962 static void htb_delay_by(struct Qdisc *sch, long delay)
963 {
964 	struct htb_sched *q = qdisc_priv(sch);
965 	if (delay <= 0)
966 		delay = 1;
967 	if (unlikely(delay > 5 * HZ)) {
968 		if (net_ratelimit())
969 			printk(KERN_INFO "HTB delay %ld > 5sec\n", delay);
970 		delay = 5 * HZ;
971 	}
972 	/* why not use jiffies here? because expires can be in the past */
973 	mod_timer(&q->timer, q->jiffies + delay);
974 	sch->flags |= TCQ_F_THROTTLED;
975 	sch->qstats.overlimits++;
976 }
977 
978 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
979 {
980 	struct sk_buff *skb = NULL;
981 	struct htb_sched *q = qdisc_priv(sch);
982 	int level;
983 	long min_delay;
984 
985 	q->jiffies = jiffies;
986 
987 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
988 	skb = __skb_dequeue(&q->direct_queue);
989 	if (skb != NULL) {
990 		sch->flags &= ~TCQ_F_THROTTLED;
991 		sch->q.qlen--;
992 		return skb;
993 	}
994 
995 	if (!sch->q.qlen)
996 		goto fin;
997 	PSCHED_GET_TIME(q->now);
998 
999 	min_delay = LONG_MAX;
1000 	q->nwc_hit = 0;
1001 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
1002 		/* common case optimization - skip event handler quickly */
1003 		int m;
1004 		long delay;
1005 		if (time_after_eq(q->jiffies, q->near_ev_cache[level])) {
1006 			delay = htb_do_events(q, level);
1007 			q->near_ev_cache[level] =
1008 			    q->jiffies + (delay ? delay : HZ);
1009 		} else
1010 			delay = q->near_ev_cache[level] - q->jiffies;
1011 
1012 		if (delay && min_delay > delay)
1013 			min_delay = delay;
1014 		m = ~q->row_mask[level];
1015 		while (m != (int)(-1)) {
1016 			int prio = ffz(m);
1017 			m |= 1 << prio;
1018 			skb = htb_dequeue_tree(q, prio, level);
1019 			if (likely(skb != NULL)) {
1020 				sch->q.qlen--;
1021 				sch->flags &= ~TCQ_F_THROTTLED;
1022 				goto fin;
1023 			}
1024 		}
1025 	}
1026 	htb_delay_by(sch, min_delay > 5 * HZ ? 5 * HZ : min_delay);
1027 fin:
1028 	return skb;
1029 }
1030 
1031 /* try to drop from each class (by prio) until one succeed */
1032 static unsigned int htb_drop(struct Qdisc *sch)
1033 {
1034 	struct htb_sched *q = qdisc_priv(sch);
1035 	int prio;
1036 
1037 	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
1038 		struct list_head *p;
1039 		list_for_each(p, q->drops + prio) {
1040 			struct htb_class *cl = list_entry(p, struct htb_class,
1041 							  un.leaf.drop_list);
1042 			unsigned int len;
1043 			if (cl->un.leaf.q->ops->drop &&
1044 			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
1045 				sch->q.qlen--;
1046 				if (!cl->un.leaf.q->q.qlen)
1047 					htb_deactivate(q, cl);
1048 				return len;
1049 			}
1050 		}
1051 	}
1052 	return 0;
1053 }
1054 
1055 /* reset all classes */
1056 /* always called under BH & queue lock */
1057 static void htb_reset(struct Qdisc *sch)
1058 {
1059 	struct htb_sched *q = qdisc_priv(sch);
1060 	int i;
1061 
1062 	for (i = 0; i < HTB_HSIZE; i++) {
1063 		struct hlist_node *p;
1064 		struct htb_class *cl;
1065 
1066 		hlist_for_each_entry(cl, p, q->hash + i, hlist) {
1067 			if (cl->level)
1068 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
1069 			else {
1070 				if (cl->un.leaf.q)
1071 					qdisc_reset(cl->un.leaf.q);
1072 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1073 			}
1074 			cl->prio_activity = 0;
1075 			cl->cmode = HTB_CAN_SEND;
1076 
1077 		}
1078 	}
1079 	sch->flags &= ~TCQ_F_THROTTLED;
1080 	del_timer(&q->timer);
1081 	__skb_queue_purge(&q->direct_queue);
1082 	sch->q.qlen = 0;
1083 	memset(q->row, 0, sizeof(q->row));
1084 	memset(q->row_mask, 0, sizeof(q->row_mask));
1085 	memset(q->wait_pq, 0, sizeof(q->wait_pq));
1086 	memset(q->ptr, 0, sizeof(q->ptr));
1087 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1088 		INIT_LIST_HEAD(q->drops + i);
1089 }
1090 
1091 static int htb_init(struct Qdisc *sch, struct rtattr *opt)
1092 {
1093 	struct htb_sched *q = qdisc_priv(sch);
1094 	struct rtattr *tb[TCA_HTB_INIT];
1095 	struct tc_htb_glob *gopt;
1096 	int i;
1097 	if (!opt || rtattr_parse_nested(tb, TCA_HTB_INIT, opt) ||
1098 	    tb[TCA_HTB_INIT - 1] == NULL ||
1099 	    RTA_PAYLOAD(tb[TCA_HTB_INIT - 1]) < sizeof(*gopt)) {
1100 		printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
1101 		return -EINVAL;
1102 	}
1103 	gopt = RTA_DATA(tb[TCA_HTB_INIT - 1]);
1104 	if (gopt->version != HTB_VER >> 16) {
1105 		printk(KERN_ERR
1106 		       "HTB: need tc/htb version %d (minor is %d), you have %d\n",
1107 		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
1108 		return -EINVAL;
1109 	}
1110 
1111 	INIT_LIST_HEAD(&q->root);
1112 	for (i = 0; i < HTB_HSIZE; i++)
1113 		INIT_HLIST_HEAD(q->hash + i);
1114 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1115 		INIT_LIST_HEAD(q->drops + i);
1116 
1117 	init_timer(&q->timer);
1118 	skb_queue_head_init(&q->direct_queue);
1119 
1120 	q->direct_qlen = sch->dev->tx_queue_len;
1121 	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
1122 		q->direct_qlen = 2;
1123 	q->timer.function = htb_timer;
1124 	q->timer.data = (unsigned long)sch;
1125 
1126 #ifdef HTB_RATECM
1127 	init_timer(&q->rttim);
1128 	q->rttim.function = htb_rate_timer;
1129 	q->rttim.data = (unsigned long)sch;
1130 	q->rttim.expires = jiffies + HZ;
1131 	add_timer(&q->rttim);
1132 #endif
1133 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1134 		q->rate2quantum = 1;
1135 	q->defcls = gopt->defcls;
1136 
1137 	return 0;
1138 }
1139 
1140 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1141 {
1142 	struct htb_sched *q = qdisc_priv(sch);
1143 	unsigned char *b = skb->tail;
1144 	struct rtattr *rta;
1145 	struct tc_htb_glob gopt;
1146 	spin_lock_bh(&sch->dev->queue_lock);
1147 	gopt.direct_pkts = q->direct_pkts;
1148 
1149 	gopt.version = HTB_VER;
1150 	gopt.rate2quantum = q->rate2quantum;
1151 	gopt.defcls = q->defcls;
1152 	gopt.debug = 0;
1153 	rta = (struct rtattr *)b;
1154 	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1155 	RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
1156 	rta->rta_len = skb->tail - b;
1157 	spin_unlock_bh(&sch->dev->queue_lock);
1158 	return skb->len;
1159 rtattr_failure:
1160 	spin_unlock_bh(&sch->dev->queue_lock);
1161 	skb_trim(skb, skb->tail - skb->data);
1162 	return -1;
1163 }
1164 
1165 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1166 			  struct sk_buff *skb, struct tcmsg *tcm)
1167 {
1168 	struct htb_class *cl = (struct htb_class *)arg;
1169 	unsigned char *b = skb->tail;
1170 	struct rtattr *rta;
1171 	struct tc_htb_opt opt;
1172 
1173 	spin_lock_bh(&sch->dev->queue_lock);
1174 	tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT;
1175 	tcm->tcm_handle = cl->classid;
1176 	if (!cl->level && cl->un.leaf.q)
1177 		tcm->tcm_info = cl->un.leaf.q->handle;
1178 
1179 	rta = (struct rtattr *)b;
1180 	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1181 
1182 	memset(&opt, 0, sizeof(opt));
1183 
1184 	opt.rate = cl->rate->rate;
1185 	opt.buffer = cl->buffer;
1186 	opt.ceil = cl->ceil->rate;
1187 	opt.cbuffer = cl->cbuffer;
1188 	opt.quantum = cl->un.leaf.quantum;
1189 	opt.prio = cl->un.leaf.prio;
1190 	opt.level = cl->level;
1191 	RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
1192 	rta->rta_len = skb->tail - b;
1193 	spin_unlock_bh(&sch->dev->queue_lock);
1194 	return skb->len;
1195 rtattr_failure:
1196 	spin_unlock_bh(&sch->dev->queue_lock);
1197 	skb_trim(skb, b - skb->data);
1198 	return -1;
1199 }
1200 
1201 static int
1202 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1203 {
1204 	struct htb_class *cl = (struct htb_class *)arg;
1205 
1206 #ifdef HTB_RATECM
1207 	cl->rate_est.bps = cl->rate_bytes / (HTB_EWMAC * HTB_HSIZE);
1208 	cl->rate_est.pps = cl->rate_packets / (HTB_EWMAC * HTB_HSIZE);
1209 #endif
1210 
1211 	if (!cl->level && cl->un.leaf.q)
1212 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1213 	cl->xstats.tokens = cl->tokens;
1214 	cl->xstats.ctokens = cl->ctokens;
1215 
1216 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1217 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1218 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
1219 		return -1;
1220 
1221 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1222 }
1223 
1224 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1225 		     struct Qdisc **old)
1226 {
1227 	struct htb_class *cl = (struct htb_class *)arg;
1228 
1229 	if (cl && !cl->level) {
1230 		if (new == NULL &&
1231 		    (new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1232 		    			     cl->classid))
1233 		    == NULL)
1234 			return -ENOBUFS;
1235 		sch_tree_lock(sch);
1236 		if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) {
1237 			qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1238 			qdisc_reset(*old);
1239 		}
1240 		sch_tree_unlock(sch);
1241 		return 0;
1242 	}
1243 	return -ENOENT;
1244 }
1245 
1246 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1247 {
1248 	struct htb_class *cl = (struct htb_class *)arg;
1249 	return (cl && !cl->level) ? cl->un.leaf.q : NULL;
1250 }
1251 
1252 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1253 {
1254 	struct htb_class *cl = (struct htb_class *)arg;
1255 
1256 	if (cl->un.leaf.q->q.qlen == 0)
1257 		htb_deactivate(qdisc_priv(sch), cl);
1258 }
1259 
1260 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1261 {
1262 	struct htb_class *cl = htb_find(classid, sch);
1263 	if (cl)
1264 		cl->refcnt++;
1265 	return (unsigned long)cl;
1266 }
1267 
1268 static void htb_destroy_filters(struct tcf_proto **fl)
1269 {
1270 	struct tcf_proto *tp;
1271 
1272 	while ((tp = *fl) != NULL) {
1273 		*fl = tp->next;
1274 		tcf_destroy(tp);
1275 	}
1276 }
1277 
1278 static inline int htb_parent_last_child(struct htb_class *cl)
1279 {
1280 	if (!cl->parent)
1281 		/* the root class */
1282 		return 0;
1283 
1284 	if (!(cl->parent->children.next == &cl->sibling &&
1285 		cl->parent->children.prev == &cl->sibling))
1286 		/* not the last child */
1287 		return 0;
1288 
1289 	return 1;
1290 }
1291 
1292 static void htb_parent_to_leaf(struct htb_class *cl, struct Qdisc *new_q)
1293 {
1294 	struct htb_class *parent = cl->parent;
1295 
1296 	BUG_TRAP(!cl->level && cl->un.leaf.q && !cl->prio_activity);
1297 
1298 	parent->level = 0;
1299 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1300 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1301 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1302 	parent->un.leaf.quantum = parent->quantum;
1303 	parent->un.leaf.prio = parent->prio;
1304 	parent->tokens = parent->buffer;
1305 	parent->ctokens = parent->cbuffer;
1306 	PSCHED_GET_TIME(parent->t_c);
1307 	parent->cmode = HTB_CAN_SEND;
1308 }
1309 
1310 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1311 {
1312 	struct htb_sched *q = qdisc_priv(sch);
1313 
1314 	if (!cl->level) {
1315 		BUG_TRAP(cl->un.leaf.q);
1316 		qdisc_destroy(cl->un.leaf.q);
1317 	}
1318 	qdisc_put_rtab(cl->rate);
1319 	qdisc_put_rtab(cl->ceil);
1320 
1321 	htb_destroy_filters(&cl->filter_list);
1322 
1323 	while (!list_empty(&cl->children))
1324 		htb_destroy_class(sch, list_entry(cl->children.next,
1325 						  struct htb_class, sibling));
1326 
1327 	/* note: this delete may happen twice (see htb_delete) */
1328 	hlist_del_init(&cl->hlist);
1329 	list_del(&cl->sibling);
1330 
1331 	if (cl->prio_activity)
1332 		htb_deactivate(q, cl);
1333 
1334 	if (cl->cmode != HTB_CAN_SEND)
1335 		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
1336 
1337 	kfree(cl);
1338 }
1339 
1340 /* always called under BH & queue lock */
1341 static void htb_destroy(struct Qdisc *sch)
1342 {
1343 	struct htb_sched *q = qdisc_priv(sch);
1344 
1345 	del_timer_sync(&q->timer);
1346 #ifdef HTB_RATECM
1347 	del_timer_sync(&q->rttim);
1348 #endif
1349 	/* This line used to be after the htb_destroy_class call below,
1350 	   and surprisingly it worked in 2.4. But it must precede it
1351 	   because the filter needs its target class alive to be able to call
1352 	   unbind_filter on it (without an Oops). */
1353 	htb_destroy_filters(&q->filter_list);
1354 
1355 	while (!list_empty(&q->root))
1356 		htb_destroy_class(sch, list_entry(q->root.next,
1357 						  struct htb_class, sibling));
1358 
1359 	__skb_queue_purge(&q->direct_queue);
1360 }
1361 
1362 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1363 {
1364 	struct htb_sched *q = qdisc_priv(sch);
1365 	struct htb_class *cl = (struct htb_class *)arg;
1366 	unsigned int qlen;
1367 	struct Qdisc *new_q = NULL;
1368 	int last_child = 0;
1369 
1370 	// TODO: why not allow deleting a subtree? References? Does the
1371 	// tc subsystem guarantee us that in htb_destroy it holds no class
1372 	// refs, so that we can remove children safely there?
1373 	if (!list_empty(&cl->children) || cl->filter_cnt)
1374 		return -EBUSY;
1375 
1376 	if (!cl->level && htb_parent_last_child(cl)) {
1377 		new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1378 						cl->parent->classid);
1379 		last_child = 1;
1380 	}
1381 
1382 	sch_tree_lock(sch);
1383 
1384 	/* delete from hash and active; remainder in destroy_class */
1385 	hlist_del_init(&cl->hlist);
1386 
1387 	if (!cl->level) {
1388 		qlen = cl->un.leaf.q->q.qlen;
1389 		qdisc_reset(cl->un.leaf.q);
1390 		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
1391 	}
1392 
1393 	if (cl->prio_activity)
1394 		htb_deactivate(q, cl);
1395 
1396 	if (last_child)
1397 		htb_parent_to_leaf(cl, new_q);
1398 
1399 	if (--cl->refcnt == 0)
1400 		htb_destroy_class(sch, cl);
1401 
1402 	sch_tree_unlock(sch);
1403 	return 0;
1404 }
1405 
1406 static void htb_put(struct Qdisc *sch, unsigned long arg)
1407 {
1408 	struct htb_class *cl = (struct htb_class *)arg;
1409 
1410 	if (--cl->refcnt == 0)
1411 		htb_destroy_class(sch, cl);
1412 }
1413 
1414 static int htb_change_class(struct Qdisc *sch, u32 classid,
1415 			    u32 parentid, struct rtattr **tca,
1416 			    unsigned long *arg)
1417 {
1418 	int err = -EINVAL;
1419 	struct htb_sched *q = qdisc_priv(sch);
1420 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1421 	struct rtattr *opt = tca[TCA_OPTIONS - 1];
1422 	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
1423 	struct rtattr *tb[TCA_HTB_RTAB];
1424 	struct tc_htb_opt *hopt;
1425 
1426 	/* extract all subattrs from opt attr */
1427 	if (!opt || rtattr_parse_nested(tb, TCA_HTB_RTAB, opt) ||
1428 	    tb[TCA_HTB_PARMS - 1] == NULL ||
1429 	    RTA_PAYLOAD(tb[TCA_HTB_PARMS - 1]) < sizeof(*hopt))
1430 		goto failure;
1431 
1432 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1433 
1434 	hopt = RTA_DATA(tb[TCA_HTB_PARMS - 1]);
1435 
1436 	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB - 1]);
1437 	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB - 1]);
1438 	if (!rtab || !ctab)
1439 		goto failure;
1440 
1441 	if (!cl) {		/* new class */
1442 		struct Qdisc *new_q;
1443 		int prio;
1444 
1445 		/* check for valid classid */
1446 		if (!classid || TC_H_MAJ(classid ^ sch->handle)
1447 		    || htb_find(classid, sch))
1448 			goto failure;
1449 
1450 		/* check maximal depth */
1451 		if (parent && parent->parent && parent->parent->level < 2) {
1452 			printk(KERN_ERR "htb: tree is too deep\n");
1453 			goto failure;
1454 		}
1455 		err = -ENOBUFS;
1456 		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
1457 			goto failure;
1458 
1459 		cl->refcnt = 1;
1460 		INIT_LIST_HEAD(&cl->sibling);
1461 		INIT_HLIST_NODE(&cl->hlist);
1462 		INIT_LIST_HEAD(&cl->children);
1463 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1464 		RB_CLEAR_NODE(&cl->pq_node);
1465 
1466 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1467 			RB_CLEAR_NODE(&cl->node[prio]);
1468 
1469 		/* create the leaf qdisc early because it uses kmalloc(GFP_KERNEL),
1470 		   so it can't be used inside of sch_tree_lock
1471 		   -- thanks to Karlis Peisenieks */
1472 		new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid);
1473 		sch_tree_lock(sch);
1474 		if (parent && !parent->level) {
1475 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1476 
1477 			/* turn parent into inner node */
1478 			qdisc_reset(parent->un.leaf.q);
1479 			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
1480 			qdisc_destroy(parent->un.leaf.q);
1481 			if (parent->prio_activity)
1482 				htb_deactivate(q, parent);
1483 
1484 			/* remove from evt list because of level change */
1485 			if (parent->cmode != HTB_CAN_SEND) {
1486 				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
1487 				parent->cmode = HTB_CAN_SEND;
1488 			}
1489 			parent->level = (parent->parent ? parent->parent->level
1490 					 : TC_HTB_MAXDEPTH) - 1;
1491 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1492 		}
1493 		/* leaf (we) needs elementary qdisc */
1494 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1495 
1496 		cl->classid = classid;
1497 		cl->parent = parent;
1498 
1499 		/* set class to be in HTB_CAN_SEND state */
1500 		cl->tokens = hopt->buffer;
1501 		cl->ctokens = hopt->cbuffer;
1502 		cl->mbuffer = PSCHED_JIFFIE2US(HZ * 60);	/* 1min */
1503 		PSCHED_GET_TIME(cl->t_c);
1504 		cl->cmode = HTB_CAN_SEND;
1505 
1506 		/* attach to the hash list and parent's family */
1507 		hlist_add_head(&cl->hlist, q->hash + htb_hash(classid));
1508 		list_add_tail(&cl->sibling,
1509 			      parent ? &parent->children : &q->root);
1510 	} else
1511 		sch_tree_lock(sch);
1512 
1513 	/* there used to be a nasty bug here; we have to check that the node
1514 	   is really a leaf before changing cl->un.leaf ! */
1515 	if (!cl->level) {
1516 		cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum;
1517 		if (!hopt->quantum && cl->un.leaf.quantum < 1000) {
1518 			printk(KERN_WARNING
1519 			       "HTB: quantum of class %X is small. Consider r2q change.\n",
1520 			       cl->classid);
1521 			cl->un.leaf.quantum = 1000;
1522 		}
1523 		if (!hopt->quantum && cl->un.leaf.quantum > 200000) {
1524 			printk(KERN_WARNING
1525 			       "HTB: quantum of class %X is big. Consider r2q change.\n",
1526 			       cl->classid);
1527 			cl->un.leaf.quantum = 200000;
1528 		}
1529 		if (hopt->quantum)
1530 			cl->un.leaf.quantum = hopt->quantum;
1531 		if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO)
1532 			cl->un.leaf.prio = TC_HTB_NUMPRIO - 1;
1533 
1534 		/* backup for htb_parent_to_leaf */
1535 		cl->quantum = cl->un.leaf.quantum;
1536 		cl->prio = cl->un.leaf.prio;
1537 	}
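	/* Worked example of the r2q relation above (hypothetical numbers):
	 * a class configured with rate 100kbit has rtab->rate.rate == 12500
	 * bytes/s; with rate2quantum == 10 (tc's usual default) this yields
	 * a quantum of 1250 bytes, which passes the 1000..200000 sanity
	 * checks, so no explicit quantum needs to be given.
	 */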
1538 
1539 	cl->buffer = hopt->buffer;
1540 	cl->cbuffer = hopt->cbuffer;
1541 	if (cl->rate)
1542 		qdisc_put_rtab(cl->rate);
1543 	cl->rate = rtab;
1544 	if (cl->ceil)
1545 		qdisc_put_rtab(cl->ceil);
1546 	cl->ceil = ctab;
1547 	sch_tree_unlock(sch);
1548 
1549 	*arg = (unsigned long)cl;
1550 	return 0;
1551 
1552 failure:
1553 	if (rtab)
1554 		qdisc_put_rtab(rtab);
1555 	if (ctab)
1556 		qdisc_put_rtab(ctab);
1557 	return err;
1558 }
1559 
1560 static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1561 {
1562 	struct htb_sched *q = qdisc_priv(sch);
1563 	struct htb_class *cl = (struct htb_class *)arg;
1564 	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1565 
1566 	return fl;
1567 }
1568 
1569 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1570 				     u32 classid)
1571 {
1572 	struct htb_sched *q = qdisc_priv(sch);
1573 	struct htb_class *cl = htb_find(classid, sch);
1574 
1575 	/*if (cl && !cl->level) return 0;
1576 	   The line above used to be there to prevent attaching filters to
1577 	   leaves. But at least the tc_index filter uses this just to get the
1578 	   class for other reasons, so we have to allow for it.
1579 	   ----
1580 	   19.6.2002 As Werner explained, it is OK - bind_filter is just
1581 	   another way to "lock" the class - unlike "get", this lock can
1582 	   be broken by the class during destroy, IIUC.
1583 	 */
1584 	if (cl)
1585 		cl->filter_cnt++;
1586 	else
1587 		q->filter_cnt++;
1588 	return (unsigned long)cl;
1589 }
1590 
1591 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1592 {
1593 	struct htb_sched *q = qdisc_priv(sch);
1594 	struct htb_class *cl = (struct htb_class *)arg;
1595 
1596 	if (cl)
1597 		cl->filter_cnt--;
1598 	else
1599 		q->filter_cnt--;
1600 }
1601 
1602 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1603 {
1604 	struct htb_sched *q = qdisc_priv(sch);
1605 	int i;
1606 
1607 	if (arg->stop)
1608 		return;
1609 
1610 	for (i = 0; i < HTB_HSIZE; i++) {
1611 		struct hlist_node *p;
1612 		struct htb_class *cl;
1613 
1614 		hlist_for_each_entry(cl, p, q->hash + i, hlist) {
1615 			if (arg->count < arg->skip) {
1616 				arg->count++;
1617 				continue;
1618 			}
1619 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1620 				arg->stop = 1;
1621 				return;
1622 			}
1623 			arg->count++;
1624 		}
1625 	}
1626 }
1627 
1628 static struct Qdisc_class_ops htb_class_ops = {
1629 	.graft		=	htb_graft,
1630 	.leaf		=	htb_leaf,
1631 	.qlen_notify	=	htb_qlen_notify,
1632 	.get		=	htb_get,
1633 	.put		=	htb_put,
1634 	.change		=	htb_change_class,
1635 	.delete		=	htb_delete,
1636 	.walk		=	htb_walk,
1637 	.tcf_chain	=	htb_find_tcf,
1638 	.bind_tcf	=	htb_bind_filter,
1639 	.unbind_tcf	=	htb_unbind_filter,
1640 	.dump		=	htb_dump_class,
1641 	.dump_stats	=	htb_dump_class_stats,
1642 };
1643 
1644 static struct Qdisc_ops htb_qdisc_ops = {
1645 	.next		=	NULL,
1646 	.cl_ops		=	&htb_class_ops,
1647 	.id		=	"htb",
1648 	.priv_size	=	sizeof(struct htb_sched),
1649 	.enqueue	=	htb_enqueue,
1650 	.dequeue	=	htb_dequeue,
1651 	.requeue	=	htb_requeue,
1652 	.drop		=	htb_drop,
1653 	.init		=	htb_init,
1654 	.reset		=	htb_reset,
1655 	.destroy	=	htb_destroy,
1656 	.change		=	NULL /* htb_change */,
1657 	.dump		=	htb_dump,
1658 	.owner		=	THIS_MODULE,
1659 };
1660 
1661 static int __init htb_module_init(void)
1662 {
1663 	return register_qdisc(&htb_qdisc_ops);
1664 }
1665 static void __exit htb_module_exit(void)
1666 {
1667 	unregister_qdisc(&htb_qdisc_ops);
1668 }
1669 
1670 module_init(htb_module_init)
1671 module_exit(htb_module_exit)
1672 MODULE_LICENSE("GPL");
1673