xref: /linux/net/sched/sch_htb.c (revision 7265706c8fd57722f622f336ec110cb35f83e739)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  */
28 #include <linux/module.h>
29 #include <linux/moduleparam.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/string.h>
33 #include <linux/errno.h>
34 #include <linux/skbuff.h>
35 #include <linux/list.h>
36 #include <linux/compiler.h>
37 #include <linux/rbtree.h>
38 #include <net/netlink.h>
39 #include <net/pkt_sched.h>
40 
41 /* HTB algorithm.
42     Author: devik@cdi.cz
43     ========================================================================
44     HTB is like TBF with multiple classes. It is also similar to CBQ because
45     it allows assigning a priority to each class in the hierarchy.
46     In fact it is another implementation of Floyd's formal sharing.
47 
48     Levels:
49     Each class is assigned a level. A leaf ALWAYS has level 0 and root
50     classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
51     one less than their parent.
52 */
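/* Illustrative example (not part of the original source): with a root
   class 1:1, an inner class 1:10 beneath it and a leaf 1:100 beneath
   that, the leaf sits at level 0, the root at TC_HTB_MAXDEPTH-1 and the
   inner class at TC_HTB_MAXDEPTH-2; see htb_change_class() below, where
   parent->level is recomputed when a former leaf becomes an inner node. */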
53 
54 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
55 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
56 
57 #if HTB_VER >> 16 != TC_HTB_PROTOVER
58 #error "Mismatched sch_htb.c and pkt_sch.h"
59 #endif
60 
61 /* Module parameter and sysfs export */
62 module_param    (htb_hysteresis, int, 0640);
63 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
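/* Assumed usage example (not part of the original source): the parameter
 * can be set at module load time, e.g. "modprobe sch_htb htb_hysteresis=1",
 * or changed later through /sys/module/sch_htb/parameters/htb_hysteresis,
 * which the 0640 mode above makes writable by root. */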
64 
65 /* used internally to keep the status of a single class */
66 enum htb_cmode {
67 	HTB_CANT_SEND,		/* class can't send and can't borrow */
68 	HTB_MAY_BORROW,		/* class can't send but may borrow */
69 	HTB_CAN_SEND		/* class can send */
70 };
71 
72 /* interior & leaf nodes; props specific to leaves are marked L: */
73 struct htb_class {
74 	struct Qdisc_class_common common;
75 	/* general class parameters */
76 	struct gnet_stats_basic bstats;
77 	struct gnet_stats_queue qstats;
78 	struct gnet_stats_rate_est rate_est;
79 	struct tc_htb_xstats xstats;	/* our special stats */
80 	int refcnt;		/* usage count of this class */
81 
82 	/* topology */
83 	int level;		/* our level (see above) */
84 	unsigned int children;
85 	struct htb_class *parent;	/* parent class */
86 
87 	union {
88 		struct htb_class_leaf {
89 			struct Qdisc *q;
90 			int prio;
91 			int aprio;
92 			int quantum;
93 			int deficit[TC_HTB_MAXDEPTH];
94 			struct list_head drop_list;
95 		} leaf;
96 		struct htb_class_inner {
97 			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
98 			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
99 			/* When a class changes from state 1->2 and disconnects from
100 			   its parent's feed, we lose the ptr value and would start
101 			   from the first child again. Here we store the classid of
102 			   the last valid ptr (used when ptr is NULL). */
103 			u32 last_ptr_id[TC_HTB_NUMPRIO];
104 		} inner;
105 	} un;
106 	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
107 	struct rb_node pq_node;	/* node for event queue */
108 	psched_time_t pq_key;
109 
110 	int prio_activity;	/* for which prios are we active */
111 	enum htb_cmode cmode;	/* current mode of the class */
112 
113 	/* class attached filters */
114 	struct tcf_proto *filter_list;
115 	int filter_cnt;
116 
117 	int warned;		/* only one warning about non work conserving .. */
118 
119 	/* token bucket parameters */
120 	struct qdisc_rate_table *rate;	/* rate table of the class itself */
121 	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
122 	long buffer, cbuffer;	/* token bucket depth/rate */
123 	psched_tdiff_t mbuffer;	/* max wait time */
124 	long tokens, ctokens;	/* current number of tokens */
125 	psched_time_t t_c;	/* checkpoint time */
126 
127 	int prio;		/* backup of the un.leaf values, restored when a  */
128 	int quantum;		/* parent becomes a leaf again; eventually a full */
129 				/* replacement of the un.leaf originals should be done */
130 };
131 
132 static inline long L2T(struct htb_class *cl, struct qdisc_rate_table *rate,
133 			   int size)
134 {
135 	long result = qdisc_l2t(rate, size);
136 	return result;
137 }
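/* L2T ("length to time") looks the packet size up in the qdisc rate
 * table and returns the time needed to transmit "size" bytes at that
 * rate, in PSCHED ticks; all token counts below are kept in these
 * time units. */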
138 
139 struct htb_sched {
140 	struct Qdisc_class_hash clhash;
141 	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
142 
143 	/* self list - roots of self generating tree */
144 	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
145 	int row_mask[TC_HTB_MAXDEPTH];
146 	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
147 	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
148 
149 	/* self wait list - roots of wait PQs per row */
150 	struct rb_root wait_pq[TC_HTB_MAXDEPTH];
151 
152 	/* time of nearest event per level (row) */
153 	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];
154 
155 	/* whether we hit a non-work-conserving class during this dequeue; */
156 	int nwc_hit;		/* used to disable the mindelay complaint in dequeue */
157 
158 	int defcls;		/* class where unclassified flows go to */
159 
160 	/* filters for qdisc itself */
161 	struct tcf_proto *filter_list;
162 
163 	int rate2quantum;	/* quant = rate / rate2quantum */
164 	psched_time_t now;	/* cached dequeue time */
165 	struct qdisc_watchdog watchdog;
166 
167 	/* non shaped skbs; let them go directly thru */
168 	struct sk_buff_head direct_queue;
169 	int direct_qlen;	/* max qlen of above */
170 
171 	long direct_pkts;
172 };
173 
174 /* find class in global hash table using given handle */
175 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
176 {
177 	struct htb_sched *q = qdisc_priv(sch);
178 	struct Qdisc_class_common *clc;
179 
180 	clc = qdisc_class_find(&q->clhash, handle);
181 	if (clc == NULL)
182 		return NULL;
183 	return container_of(clc, struct htb_class, common);
184 }
185 
186 /**
187  * htb_classify - classify a packet into class
188  *
189  * It returns NULL if the packet should be dropped or -1 if the packet
190  * should be passed directly thru. In all other cases a leaf class is returned.
191  * We allow direct class selection by classid in skb->priority. Then we examine
192  * filters in the qdisc and in inner nodes (if a higher filter points to an
193  * inner node). If we end up with classid MAJOR:0 we enqueue the skb into the
194  * special internal fifo (direct). These packets then go directly thru. If we
195  * still have no valid leaf we try to use the MAJOR:default leaf. If that is
196  * still unsuccessful we finish and return the direct queue.
197  */
198 #define HTB_DIRECT (struct htb_class*)-1
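/* HTB_DIRECT is a sentinel pointer, distinct from both NULL and any real
 * class pointer; callers only compare against it and never dereference it. */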
199 
200 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
201 				      int *qerr)
202 {
203 	struct htb_sched *q = qdisc_priv(sch);
204 	struct htb_class *cl;
205 	struct tcf_result res;
206 	struct tcf_proto *tcf;
207 	int result;
208 
209 	/* allow class selection by setting skb->priority to a valid classid;
210 	   note that the nfmark can be used too, by attaching an fw filter
211 	   with no rules in it */
212 	if (skb->priority == sch->handle)
213 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
214 	if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
215 		return cl;
216 
217 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
218 	tcf = q->filter_list;
219 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
220 #ifdef CONFIG_NET_CLS_ACT
221 		switch (result) {
222 		case TC_ACT_QUEUED:
223 		case TC_ACT_STOLEN:
224 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
225 		case TC_ACT_SHOT:
226 			return NULL;
227 		}
228 #endif
229 		if ((cl = (void *)res.class) == NULL) {
230 			if (res.classid == sch->handle)
231 				return HTB_DIRECT;	/* X:0 (direct flow) */
232 			if ((cl = htb_find(res.classid, sch)) == NULL)
233 				break;	/* filter selected invalid classid */
234 		}
235 		if (!cl->level)
236 			return cl;	/* we hit leaf; return it */
237 
238 		/* we have got inner class; apply inner filter chain */
239 		tcf = cl->filter_list;
240 	}
241 	/* classification failed; try to use default class */
242 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
243 	if (!cl || cl->level)
244 		return HTB_DIRECT;	/* bad default .. this is safe bet */
245 	return cl;
246 }
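/* Illustrative example (assumed standard tc syntax, not from this file):
 * with "tc qdisc add dev eth0 root handle 1: htb default 20", a packet
 * whose skb->priority is set to 1:0 bypasses classification and uses the
 * direct queue, while packets that match no filter fall back to the leaf
 * class 1:20 chosen by q->defcls. */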
247 
248 /**
249  * htb_add_to_id_tree - adds class to the round robin list
250  *
251  * Routine adds class to the list (actually tree) sorted by classid.
252  * Make sure that class is not already on such list for given prio.
253  */
254 static void htb_add_to_id_tree(struct rb_root *root,
255 			       struct htb_class *cl, int prio)
256 {
257 	struct rb_node **p = &root->rb_node, *parent = NULL;
258 
259 	while (*p) {
260 		struct htb_class *c;
261 		parent = *p;
262 		c = rb_entry(parent, struct htb_class, node[prio]);
263 
264 		if (cl->common.classid > c->common.classid)
265 			p = &parent->rb_right;
266 		else
267 			p = &parent->rb_left;
268 	}
269 	rb_link_node(&cl->node[prio], parent, p);
270 	rb_insert_color(&cl->node[prio], root);
271 }
272 
273 /**
274  * htb_add_to_wait_tree - adds class to the event queue with delay
275  *
276  * The class is added to the priority event queue to indicate that the class
277  * will change its mode at time cl->pq_key (q->now + delay). Make sure that
278  * the class is not already in the queue.
279  */
280 static void htb_add_to_wait_tree(struct htb_sched *q,
281 				 struct htb_class *cl, long delay)
282 {
283 	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
284 
285 	cl->pq_key = q->now + delay;
286 	if (cl->pq_key == q->now)
287 		cl->pq_key++;
288 
289 	/* update the nearest event cache */
290 	if (q->near_ev_cache[cl->level] > cl->pq_key)
291 		q->near_ev_cache[cl->level] = cl->pq_key;
292 
293 	while (*p) {
294 		struct htb_class *c;
295 		parent = *p;
296 		c = rb_entry(parent, struct htb_class, pq_node);
297 		if (cl->pq_key >= c->pq_key)
298 			p = &parent->rb_right;
299 		else
300 			p = &parent->rb_left;
301 	}
302 	rb_link_node(&cl->pq_node, parent, p);
303 	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
304 }
305 
306 /**
307  * htb_next_rb_node - finds next node in binary tree
308  *
309  * When we are past the last key we return NULL.
310  * Average complexity is 2 steps per call.
311  */
312 static inline void htb_next_rb_node(struct rb_node **n)
313 {
314 	*n = rb_next(*n);
315 }
316 
317 /**
318  * htb_add_class_to_row - add class to its row
319  *
320  * The class is added to row at priorities marked in mask.
321  * It does nothing if mask == 0.
322  */
323 static inline void htb_add_class_to_row(struct htb_sched *q,
324 					struct htb_class *cl, int mask)
325 {
326 	q->row_mask[cl->level] |= mask;
327 	while (mask) {
328 		int prio = ffz(~mask);
329 		mask &= ~(1 << prio);
330 		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
331 	}
332 }
333 
334 /* If this triggers, it is a bug in this code, but it need not be fatal */
335 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
336 {
337 	if (RB_EMPTY_NODE(rb)) {
338 		WARN_ON(1);
339 	} else {
340 		rb_erase(rb, root);
341 		RB_CLEAR_NODE(rb);
342 	}
343 }
344 
345 
346 /**
347  * htb_remove_class_from_row - removes class from its row
348  *
349  * The class is removed from row at priorities marked in mask.
350  * It does nothing if mask == 0.
351  */
352 static inline void htb_remove_class_from_row(struct htb_sched *q,
353 						 struct htb_class *cl, int mask)
354 {
355 	int m = 0;
356 
357 	while (mask) {
358 		int prio = ffz(~mask);
359 
360 		mask &= ~(1 << prio);
361 		if (q->ptr[cl->level][prio] == cl->node + prio)
362 			htb_next_rb_node(q->ptr[cl->level] + prio);
363 
364 		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
365 		if (!q->row[cl->level][prio].rb_node)
366 			m |= 1 << prio;
367 	}
368 	q->row_mask[cl->level] &= ~m;
369 }
370 
371 /**
372  * htb_activate_prios - creates the active class's feed chain
373  *
374  * The class is connected to its ancestors and/or the appropriate rows
375  * for the priorities it participates in. cl->cmode must be the new
376  * (activated) mode. It does nothing if cl->prio_activity == 0.
377  */
378 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
379 {
380 	struct htb_class *p = cl->parent;
381 	long m, mask = cl->prio_activity;
382 
383 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
384 		m = mask;
385 		while (m) {
386 			int prio = ffz(~m);
387 			m &= ~(1 << prio);
388 
389 			if (p->un.inner.feed[prio].rb_node)
390 				/* parent already has its feed in use so
391 				   reset the bit in mask as parent is already ok */
392 				mask &= ~(1 << prio);
393 
394 			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
395 		}
396 		p->prio_activity |= mask;
397 		cl = p;
398 		p = cl->parent;
399 
400 	}
401 	if (cl->cmode == HTB_CAN_SEND && mask)
402 		htb_add_class_to_row(q, cl, mask);
403 }
404 
405 /**
406  * htb_deactivate_prios - remove class from feed chain
407  *
408  * cl->cmode must represent old mode (before deactivation). It does
409  * nothing if cl->prio_activity == 0. Class is removed from all feed
410  * chains and rows.
411  */
412 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
413 {
414 	struct htb_class *p = cl->parent;
415 	long m, mask = cl->prio_activity;
416 
417 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
418 		m = mask;
419 		mask = 0;
420 		while (m) {
421 			int prio = ffz(~m);
422 			m &= ~(1 << prio);
423 
424 			if (p->un.inner.ptr[prio] == cl->node + prio) {
425 				/* we are removing child which is pointed to from
426 				   parent feed - forget the pointer but remember
427 				   classid */
428 				p->un.inner.last_ptr_id[prio] = cl->common.classid;
429 				p->un.inner.ptr[prio] = NULL;
430 			}
431 
432 			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
433 
434 			if (!p->un.inner.feed[prio].rb_node)
435 				mask |= 1 << prio;
436 		}
437 
438 		p->prio_activity &= ~mask;
439 		cl = p;
440 		p = cl->parent;
441 
442 	}
443 	if (cl->cmode == HTB_CAN_SEND && mask)
444 		htb_remove_class_from_row(q, cl, mask);
445 }
446 
447 static inline long htb_lowater(const struct htb_class *cl)
448 {
449 	if (htb_hysteresis)
450 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
451 	else
452 		return 0;
453 }
454 static inline long htb_hiwater(const struct htb_class *cl)
455 {
456 	if (htb_hysteresis)
457 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
458 	else
459 		return 0;
460 }
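/* With hysteresis enabled the thresholds used by htb_class_mode() become
 * "sticky": a class already in HTB_CAN_SEND keeps that mode until its rate
 * tokens drop below -buffer (instead of 0), and a class that is not yet
 * throttled enters HTB_CANT_SEND only once its ceil tokens drop below
 * -cbuffer (instead of 0). The exit thresholds stay at 0, so fewer mode
 * transitions happen per time unit at the cost of some shaping accuracy. */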
461 
462 
463 /**
464  * htb_class_mode - computes and returns current class mode
465  *
466  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
467  * is not HTB_CAN_SEND then *diff is updated to the time difference
468  * from now to the time when cl will change its state.
469  * It is also worth noting that the class mode doesn't change simply
470  * at cl->{c,}tokens == 0; rather, there can be hysteresis over the
471  * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
472  * mode transitions per time unit. The speed gain is about 1/6.
473  */
474 static inline enum htb_cmode
475 htb_class_mode(struct htb_class *cl, long *diff)
476 {
477 	long toks;
478 
479 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
480 		*diff = -toks;
481 		return HTB_CANT_SEND;
482 	}
483 
484 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
485 		return HTB_CAN_SEND;
486 
487 	*diff = -toks;
488 	return HTB_MAY_BORROW;
489 }
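/* On the HTB_CANT_SEND and HTB_MAY_BORROW paths *diff is overwritten with
 * -toks, i.e. roughly the time the limiting bucket still needs to refill
 * back to zero; htb_charge_class() and htb_do_events() pass this straight
 * to htb_add_to_wait_tree() as the event delay. */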
490 
491 /**
492  * htb_change_class_mode - changes class's mode
493  *
494  * This should be the only way to change a class's mode under normal
495  * circumstances. The routine will update feed list linkage, change the mode
496  * and add the class to the wait event queue if appropriate. The new mode
497  * should be different from the old one and cl->pq_key has to be valid if
498  * changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
499  */
500 static void
501 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
502 {
503 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
504 
505 	if (new_mode == cl->cmode)
506 		return;
507 
508 	if (cl->prio_activity) {	/* not necessary: speed optimization */
509 		if (cl->cmode != HTB_CANT_SEND)
510 			htb_deactivate_prios(q, cl);
511 		cl->cmode = new_mode;
512 		if (new_mode != HTB_CANT_SEND)
513 			htb_activate_prios(q, cl);
514 	} else
515 		cl->cmode = new_mode;
516 }
517 
518 /**
519  * htb_activate - inserts leaf cl into appropriate active feeds
520  *
521  * Routine learns (new) priority of leaf and activates feed chain
522  * for the prio. It can be called on already active leaf safely.
523  * It also adds leaf into droplist.
524  */
525 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
526 {
527 	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
528 
529 	if (!cl->prio_activity) {
530 		cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio);
531 		htb_activate_prios(q, cl);
532 		list_add_tail(&cl->un.leaf.drop_list,
533 			      q->drops + cl->un.leaf.aprio);
534 	}
535 }
536 
537 /**
538  * htb_deactivate - remove leaf cl from active feeds
539  *
540  * Make sure that the leaf is active. In other words, it can't be called
541  * with a non-active leaf. It also removes the class from the drop list.
542  */
543 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
544 {
545 	WARN_ON(!cl->prio_activity);
546 
547 	htb_deactivate_prios(q, cl);
548 	cl->prio_activity = 0;
549 	list_del_init(&cl->un.leaf.drop_list);
550 }
551 
552 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
553 {
554 	int ret;
555 	struct htb_sched *q = qdisc_priv(sch);
556 	struct htb_class *cl = htb_classify(skb, sch, &ret);
557 
558 	if (cl == HTB_DIRECT) {
559 		/* enqueue to helper queue */
560 		if (q->direct_queue.qlen < q->direct_qlen) {
561 			__skb_queue_tail(&q->direct_queue, skb);
562 			q->direct_pkts++;
563 		} else {
564 			kfree_skb(skb);
565 			sch->qstats.drops++;
566 			return NET_XMIT_DROP;
567 		}
568 #ifdef CONFIG_NET_CLS_ACT
569 	} else if (!cl) {
570 		if (ret & __NET_XMIT_BYPASS)
571 			sch->qstats.drops++;
572 		kfree_skb(skb);
573 		return ret;
574 #endif
575 	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
576 		if (net_xmit_drop_count(ret)) {
577 			sch->qstats.drops++;
578 			cl->qstats.drops++;
579 		}
580 		return NET_XMIT_DROP;
581 	} else {
582 		cl->bstats.packets +=
583 			skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
584 		cl->bstats.bytes += qdisc_pkt_len(skb);
585 		htb_activate(q, cl);
586 	}
587 
588 	sch->q.qlen++;
589 	sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
590 	sch->bstats.bytes += qdisc_pkt_len(skb);
591 	return NET_XMIT_SUCCESS;
592 }
593 
594 /* TODO: requeuing packet charges it to policers again !! */
595 static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch)
596 {
597 	int ret;
598 	struct htb_sched *q = qdisc_priv(sch);
599 	struct htb_class *cl = htb_classify(skb, sch, &ret);
600 	struct sk_buff *tskb;
601 
602 	if (cl == HTB_DIRECT) {
603 		/* enqueue to helper queue */
604 		if (q->direct_queue.qlen < q->direct_qlen) {
605 			__skb_queue_head(&q->direct_queue, skb);
606 		} else {
607 			__skb_queue_head(&q->direct_queue, skb);
608 			tskb = __skb_dequeue_tail(&q->direct_queue);
609 			kfree_skb(tskb);
610 			sch->qstats.drops++;
611 			return NET_XMIT_CN;
612 		}
613 #ifdef CONFIG_NET_CLS_ACT
614 	} else if (!cl) {
615 		if (ret & __NET_XMIT_BYPASS)
616 			sch->qstats.drops++;
617 		kfree_skb(skb);
618 		return ret;
619 #endif
620 	} else if ((ret = cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q)) !=
621 		   NET_XMIT_SUCCESS) {
622 		if (net_xmit_drop_count(ret)) {
623 			sch->qstats.drops++;
624 			cl->qstats.drops++;
625 		}
626 		return NET_XMIT_DROP;
627 	} else
628 		htb_activate(q, cl);
629 
630 	sch->q.qlen++;
631 	sch->qstats.requeues++;
632 	return NET_XMIT_SUCCESS;
633 }
634 
635 /**
636  * htb_charge_class - charges amount "bytes" to leaf and ancestors
637  *
638  * Routine assumes that a packet "bytes" long was dequeued from leaf cl
639  * borrowing from "level". It accounts bytes to the ceil leaky bucket for
640  * the leaf and all ancestors and to the rate bucket for ancestors at levels
641  * "level" and higher. It also handles a possible change of mode resulting
642  * from the update. Note that the mode can also increase here (MAY_BORROW to
643  * CAN_SEND) because we can use a more precise clock than the event queue here.
644  * In such a case we remove the class from the event queue first.
645  */
646 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
647 			     int level, struct sk_buff *skb)
648 {
649 	int bytes = qdisc_pkt_len(skb);
650 	long toks, diff;
651 	enum htb_cmode old_mode;
652 
653 #define HTB_ACCNT(T,B,R) toks = diff + cl->T; \
654 	if (toks > cl->B) toks = cl->B; \
655 	toks -= L2T(cl, cl->R, bytes); \
656 	if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \
657 	cl->T = toks
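/* HTB_ACCNT(T,B,R) updates one token bucket in place: it grows cl->T by the
 * elapsed time "diff", clamps it to the bucket depth cl->B, charges the
 * transmission time of "bytes" according to rate table cl->R (via L2T) and
 * finally floors the result at 1 - cl->mbuffer so a long-backlogged class
 * cannot build up an unbounded token debt. */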
658 
659 	while (cl) {
660 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
661 		if (cl->level >= level) {
662 			if (cl->level == level)
663 				cl->xstats.lends++;
664 			HTB_ACCNT(tokens, buffer, rate);
665 		} else {
666 			cl->xstats.borrows++;
667 			cl->tokens += diff;	/* we moved t_c; update tokens */
668 		}
669 		HTB_ACCNT(ctokens, cbuffer, ceil);
670 		cl->t_c = q->now;
671 
672 		old_mode = cl->cmode;
673 		diff = 0;
674 		htb_change_class_mode(q, cl, &diff);
675 		if (old_mode != cl->cmode) {
676 			if (old_mode != HTB_CAN_SEND)
677 				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
678 			if (cl->cmode != HTB_CAN_SEND)
679 				htb_add_to_wait_tree(q, cl, diff);
680 		}
681 
682 		/* update byte stats except for leaves which are already updated */
683 		if (cl->level) {
684 			cl->bstats.bytes += bytes;
685 			cl->bstats.packets += skb_is_gso(skb)?
686 					skb_shinfo(skb)->gso_segs:1;
687 		}
688 		cl = cl->parent;
689 	}
690 }
691 
692 /**
693  * htb_do_events - make mode changes to classes at the level
694  *
695  * Scans the event queue for pending events and applies them. Returns the time
696  * of the next pending event (0 if there is no event in the pq).
697  * Note: Only events with cl->pq_key <= q->now are applied.
698  */
699 static psched_time_t htb_do_events(struct htb_sched *q, int level)
700 {
701 	/* don't run for longer than 2 jiffies; 2 is used instead of
702 	   1 to simplify things when jiffy is going to be incremented
703 	   too soon */
704 	unsigned long stop_at = jiffies + 2;
705 	while (time_before(jiffies, stop_at)) {
706 		struct htb_class *cl;
707 		long diff;
708 		struct rb_node *p = rb_first(&q->wait_pq[level]);
709 
710 		if (!p)
711 			return 0;
712 
713 		cl = rb_entry(p, struct htb_class, pq_node);
714 		if (cl->pq_key > q->now)
715 			return cl->pq_key;
716 
717 		htb_safe_rb_erase(p, q->wait_pq + level);
718 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
719 		htb_change_class_mode(q, cl, &diff);
720 		if (cl->cmode != HTB_CAN_SEND)
721 			htb_add_to_wait_tree(q, cl, diff);
722 	}
723 	/* too much load - let's continue on the next jiffy */
724 	return q->now + PSCHED_TICKS_PER_SEC / HZ;
725 }
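/* PSCHED_TICKS_PER_SEC / HZ is one jiffy expressed in PSCHED ticks, so an
 * overloaded event queue is simply revisited on the next jiffy. */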
726 
727 /* Returns class->node+prio from the id-tree where the class's id is >= id.
728    Returns NULL if no such class exists. */
729 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
730 					      u32 id)
731 {
732 	struct rb_node *r = NULL;
733 	while (n) {
734 		struct htb_class *cl =
735 		    rb_entry(n, struct htb_class, node[prio]);
736 		if (id == cl->common.classid)
737 			return n;
738 
739 		if (id > cl->common.classid) {
740 			n = n->rb_right;
741 		} else {
742 			r = n;
743 			n = n->rb_left;
744 		}
745 	}
746 	return r;
747 }
748 
749 /**
750  * htb_lookup_leaf - returns next leaf class in DRR order
751  *
752  * Find the leaf that the current feed pointer points to.
753  */
754 static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
755 					 struct rb_node **pptr, u32 * pid)
756 {
757 	int i;
758 	struct {
759 		struct rb_node *root;
760 		struct rb_node **pptr;
761 		u32 *pid;
762 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
763 
764 	WARN_ON(!tree->rb_node);
765 	sp->root = tree->rb_node;
766 	sp->pptr = pptr;
767 	sp->pid = pid;
768 
769 	for (i = 0; i < 65535; i++) {
770 		if (!*sp->pptr && *sp->pid) {
771 			/* ptr was invalidated but id is valid - try to recover
772 			   the original or next ptr */
773 			*sp->pptr =
774 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
775 		}
776 		*sp->pid = 0;	/* ptr is valid now, so remove this hint as it
777 				   can become out of date quickly */
778 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
779 			*sp->pptr = sp->root;
780 			while ((*sp->pptr)->rb_left)
781 				*sp->pptr = (*sp->pptr)->rb_left;
782 			if (sp > stk) {
783 				sp--;
784 				WARN_ON(!*sp->pptr);
785 				if (!*sp->pptr)
786 					return NULL;
787 				htb_next_rb_node(sp->pptr);
788 			}
789 		} else {
790 			struct htb_class *cl;
791 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
792 			if (!cl->level)
793 				return cl;
794 			(++sp)->root = cl->un.inner.feed[prio].rb_node;
795 			sp->pptr = cl->un.inner.ptr + prio;
796 			sp->pid = cl->un.inner.last_ptr_id + prio;
797 		}
798 	}
799 	WARN_ON(1);
800 	return NULL;
801 }
802 
803 /* dequeues packet at given priority and level; call only if
804    you are sure that there is active class at prio/level */
805 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
806 					int level)
807 {
808 	struct sk_buff *skb = NULL;
809 	struct htb_class *cl, *start;
810 	/* look initial class up in the row */
811 	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
812 				     q->ptr[level] + prio,
813 				     q->last_ptr_id[level] + prio);
814 
815 	do {
816 next:
817 		WARN_ON(!cl);
818 		if (!cl)
819 			return NULL;
820 
821 		/* class can be empty - it is unlikely but can be true if leaf
822 		   qdisc drops packets in enqueue routine or if someone used
823 		   graft operation on the leaf since last dequeue;
824 		   simply deactivate and skip such class */
825 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
826 			struct htb_class *next;
827 			htb_deactivate(q, cl);
828 
829 			/* row/level might become empty */
830 			if ((q->row_mask[level] & (1 << prio)) == 0)
831 				return NULL;
832 
833 			next = htb_lookup_leaf(q->row[level] + prio,
834 					       prio, q->ptr[level] + prio,
835 					       q->last_ptr_id[level] + prio);
836 
837 			if (cl == start)	/* fix start if we just deleted it */
838 				start = next;
839 			cl = next;
840 			goto next;
841 		}
842 
843 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
844 		if (likely(skb != NULL))
845 			break;
846 		if (!cl->warned) {
847 			printk(KERN_WARNING
848 			       "htb: class %X isn't work conserving ?!\n",
849 			       cl->common.classid);
850 			cl->warned = 1;
851 		}
852 		q->nwc_hit++;
853 		htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
854 				  ptr[0]) + prio);
855 		cl = htb_lookup_leaf(q->row[level] + prio, prio,
856 				     q->ptr[level] + prio,
857 				     q->last_ptr_id[level] + prio);
858 
859 	} while (cl != start);
860 
861 	if (likely(skb != NULL)) {
862 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
863 		if (cl->un.leaf.deficit[level] < 0) {
864 			cl->un.leaf.deficit[level] += cl->un.leaf.quantum;
865 			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
866 					  ptr[0]) + prio);
867 		}
868 		/* this used to be after charge_class but this ordering
869 		   gives us slightly better performance */
870 		if (!cl->un.leaf.q->q.qlen)
871 			htb_deactivate(q, cl);
872 		htb_charge_class(q, cl, level, skb);
873 	}
874 	return skb;
875 }
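/* The deficit handling in htb_dequeue_tree() is classic DRR: every dequeued
 * packet is charged against the leaf's per-level deficit, and only once the
 * deficit goes negative is it topped up by one quantum and the round-robin
 * pointer advanced to the next class at this prio/level. */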
876 
877 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
878 {
879 	struct sk_buff *skb = NULL;
880 	struct htb_sched *q = qdisc_priv(sch);
881 	int level;
882 	psched_time_t next_event;
883 
884 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
885 	skb = __skb_dequeue(&q->direct_queue);
886 	if (skb != NULL) {
887 		sch->flags &= ~TCQ_F_THROTTLED;
888 		sch->q.qlen--;
889 		return skb;
890 	}
891 
892 	if (!sch->q.qlen)
893 		goto fin;
894 	q->now = psched_get_time();
895 
896 	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
897 	q->nwc_hit = 0;
898 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
899 		/* common case optimization - skip event handler quickly */
900 		int m;
901 		psched_time_t event;
902 
903 		if (q->now >= q->near_ev_cache[level]) {
904 			event = htb_do_events(q, level);
905 			if (!event)
906 				event = q->now + PSCHED_TICKS_PER_SEC;
907 			q->near_ev_cache[level] = event;
908 		} else
909 			event = q->near_ev_cache[level];
910 
911 		if (event && next_event > event)
912 			next_event = event;
913 
914 		m = ~q->row_mask[level];
915 		while (m != (int)(-1)) {
916 			int prio = ffz(m);
917 			m |= 1 << prio;
918 			skb = htb_dequeue_tree(q, prio, level);
919 			if (likely(skb != NULL)) {
920 				sch->q.qlen--;
921 				sch->flags &= ~TCQ_F_THROTTLED;
922 				goto fin;
923 			}
924 		}
925 	}
926 	sch->qstats.overlimits++;
927 	qdisc_watchdog_schedule(&q->watchdog, next_event);
928 fin:
929 	return skb;
930 }
931 
932 /* try to drop from each class (by prio) until one succeeds */
933 static unsigned int htb_drop(struct Qdisc *sch)
934 {
935 	struct htb_sched *q = qdisc_priv(sch);
936 	int prio;
937 
938 	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
939 		struct list_head *p;
940 		list_for_each(p, q->drops + prio) {
941 			struct htb_class *cl = list_entry(p, struct htb_class,
942 							  un.leaf.drop_list);
943 			unsigned int len;
944 			if (cl->un.leaf.q->ops->drop &&
945 			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
946 				sch->q.qlen--;
947 				if (!cl->un.leaf.q->q.qlen)
948 					htb_deactivate(q, cl);
949 				return len;
950 			}
951 		}
952 	}
953 	return 0;
954 }
955 
956 /* reset all classes */
957 /* always called under BH & queue lock */
958 static void htb_reset(struct Qdisc *sch)
959 {
960 	struct htb_sched *q = qdisc_priv(sch);
961 	struct htb_class *cl;
962 	struct hlist_node *n;
963 	unsigned int i;
964 
965 	for (i = 0; i < q->clhash.hashsize; i++) {
966 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
967 			if (cl->level)
968 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
969 			else {
970 				if (cl->un.leaf.q)
971 					qdisc_reset(cl->un.leaf.q);
972 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
973 			}
974 			cl->prio_activity = 0;
975 			cl->cmode = HTB_CAN_SEND;
976 
977 		}
978 	}
979 	qdisc_watchdog_cancel(&q->watchdog);
980 	__skb_queue_purge(&q->direct_queue);
981 	sch->q.qlen = 0;
982 	memset(q->row, 0, sizeof(q->row));
983 	memset(q->row_mask, 0, sizeof(q->row_mask));
984 	memset(q->wait_pq, 0, sizeof(q->wait_pq));
985 	memset(q->ptr, 0, sizeof(q->ptr));
986 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
987 		INIT_LIST_HEAD(q->drops + i);
988 }
989 
990 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
991 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
992 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
993 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
994 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
995 };
996 
997 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
998 {
999 	struct htb_sched *q = qdisc_priv(sch);
1000 	struct nlattr *tb[TCA_HTB_INIT + 1];
1001 	struct tc_htb_glob *gopt;
1002 	int err;
1003 	int i;
1004 
1005 	if (!opt)
1006 		return -EINVAL;
1007 
1008 	err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);
1009 	if (err < 0)
1010 		return err;
1011 
1012 	if (tb[TCA_HTB_INIT] == NULL) {
1013 		printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
1014 		return -EINVAL;
1015 	}
1016 	gopt = nla_data(tb[TCA_HTB_INIT]);
1017 	if (gopt->version != HTB_VER >> 16) {
1018 		printk(KERN_ERR
1019 		       "HTB: need tc/htb version %d (minor is %d), you have %d\n",
1020 		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
1021 		return -EINVAL;
1022 	}
1023 
1024 	err = qdisc_class_hash_init(&q->clhash);
1025 	if (err < 0)
1026 		return err;
1027 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1028 		INIT_LIST_HEAD(q->drops + i);
1029 
1030 	qdisc_watchdog_init(&q->watchdog, sch);
1031 	skb_queue_head_init(&q->direct_queue);
1032 
1033 	q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1034 	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
1035 		q->direct_qlen = 2;
1036 
1037 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1038 		q->rate2quantum = 1;
1039 	q->defcls = gopt->defcls;
1040 
1041 	return 0;
1042 }
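/* Assumed configuration example (standard tc syntax, not part of this file):
 * "tc qdisc add dev eth0 root handle 1: htb default 10 r2q 10" creates the
 * qdisc initialized here; "default 10" ends up in q->defcls and "r2q 10"
 * in q->rate2quantum. */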
1043 
1044 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1045 {
1046 	spinlock_t *root_lock = qdisc_root_lock(sch);
1047 	struct htb_sched *q = qdisc_priv(sch);
1048 	struct nlattr *nest;
1049 	struct tc_htb_glob gopt;
1050 
1051 	spin_lock_bh(root_lock);
1052 
1053 	gopt.direct_pkts = q->direct_pkts;
1054 	gopt.version = HTB_VER;
1055 	gopt.rate2quantum = q->rate2quantum;
1056 	gopt.defcls = q->defcls;
1057 	gopt.debug = 0;
1058 
1059 	nest = nla_nest_start(skb, TCA_OPTIONS);
1060 	if (nest == NULL)
1061 		goto nla_put_failure;
1062 	NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
1063 	nla_nest_end(skb, nest);
1064 
1065 	spin_unlock_bh(root_lock);
1066 	return skb->len;
1067 
1068 nla_put_failure:
1069 	spin_unlock_bh(root_lock);
1070 	nla_nest_cancel(skb, nest);
1071 	return -1;
1072 }
1073 
1074 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1075 			  struct sk_buff *skb, struct tcmsg *tcm)
1076 {
1077 	struct htb_class *cl = (struct htb_class *)arg;
1078 	spinlock_t *root_lock = qdisc_root_lock(sch);
1079 	struct nlattr *nest;
1080 	struct tc_htb_opt opt;
1081 
1082 	spin_lock_bh(root_lock);
1083 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1084 	tcm->tcm_handle = cl->common.classid;
1085 	if (!cl->level && cl->un.leaf.q)
1086 		tcm->tcm_info = cl->un.leaf.q->handle;
1087 
1088 	nest = nla_nest_start(skb, TCA_OPTIONS);
1089 	if (nest == NULL)
1090 		goto nla_put_failure;
1091 
1092 	memset(&opt, 0, sizeof(opt));
1093 
1094 	opt.rate = cl->rate->rate;
1095 	opt.buffer = cl->buffer;
1096 	opt.ceil = cl->ceil->rate;
1097 	opt.cbuffer = cl->cbuffer;
1098 	opt.quantum = cl->un.leaf.quantum;
1099 	opt.prio = cl->un.leaf.prio;
1100 	opt.level = cl->level;
1101 	NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
1102 
1103 	nla_nest_end(skb, nest);
1104 	spin_unlock_bh(root_lock);
1105 	return skb->len;
1106 
1107 nla_put_failure:
1108 	spin_unlock_bh(root_lock);
1109 	nla_nest_cancel(skb, nest);
1110 	return -1;
1111 }
1112 
1113 static int
1114 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1115 {
1116 	struct htb_class *cl = (struct htb_class *)arg;
1117 
1118 	if (!cl->level && cl->un.leaf.q)
1119 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1120 	cl->xstats.tokens = cl->tokens;
1121 	cl->xstats.ctokens = cl->ctokens;
1122 
1123 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1124 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1125 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
1126 		return -1;
1127 
1128 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1129 }
1130 
1131 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1132 		     struct Qdisc **old)
1133 {
1134 	struct htb_class *cl = (struct htb_class *)arg;
1135 
1136 	if (cl && !cl->level) {
1137 		if (new == NULL &&
1138 		    (new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1139 					     &pfifo_qdisc_ops,
1140 					     cl->common.classid))
1141 		    == NULL)
1142 			return -ENOBUFS;
1143 		sch_tree_lock(sch);
1144 		if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) {
1145 			qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1146 			qdisc_reset(*old);
1147 		}
1148 		sch_tree_unlock(sch);
1149 		return 0;
1150 	}
1151 	return -ENOENT;
1152 }
1153 
1154 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1155 {
1156 	struct htb_class *cl = (struct htb_class *)arg;
1157 	return (cl && !cl->level) ? cl->un.leaf.q : NULL;
1158 }
1159 
1160 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1161 {
1162 	struct htb_class *cl = (struct htb_class *)arg;
1163 
1164 	if (cl->un.leaf.q->q.qlen == 0)
1165 		htb_deactivate(qdisc_priv(sch), cl);
1166 }
1167 
1168 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1169 {
1170 	struct htb_class *cl = htb_find(classid, sch);
1171 	if (cl)
1172 		cl->refcnt++;
1173 	return (unsigned long)cl;
1174 }
1175 
1176 static inline int htb_parent_last_child(struct htb_class *cl)
1177 {
1178 	if (!cl->parent)
1179 		/* the root class */
1180 		return 0;
1181 	if (cl->parent->children > 1)
1182 		/* not the last child */
1183 		return 0;
1184 	return 1;
1185 }
1186 
1187 static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1188 			       struct Qdisc *new_q)
1189 {
1190 	struct htb_class *parent = cl->parent;
1191 
1192 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1193 
1194 	if (parent->cmode != HTB_CAN_SEND)
1195 		htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);
1196 
1197 	parent->level = 0;
1198 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1199 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1200 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1201 	parent->un.leaf.quantum = parent->quantum;
1202 	parent->un.leaf.prio = parent->prio;
1203 	parent->tokens = parent->buffer;
1204 	parent->ctokens = parent->cbuffer;
1205 	parent->t_c = psched_get_time();
1206 	parent->cmode = HTB_CAN_SEND;
1207 }
1208 
1209 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1210 {
1211 	if (!cl->level) {
1212 		WARN_ON(!cl->un.leaf.q);
1213 		qdisc_destroy(cl->un.leaf.q);
1214 	}
1215 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1216 	qdisc_put_rtab(cl->rate);
1217 	qdisc_put_rtab(cl->ceil);
1218 
1219 	tcf_destroy_chain(&cl->filter_list);
1220 	kfree(cl);
1221 }
1222 
1223 /* always called under BH & queue lock */
1224 static void htb_destroy(struct Qdisc *sch)
1225 {
1226 	struct htb_sched *q = qdisc_priv(sch);
1227 	struct hlist_node *n, *next;
1228 	struct htb_class *cl;
1229 	unsigned int i;
1230 
1231 	qdisc_watchdog_cancel(&q->watchdog);
1232 	/* This line used to be after the htb_destroy_class call below
1233 	   and surprisingly it worked in 2.4. But it must precede it
1234 	   because filters need their target class alive to be able to call
1235 	   unbind_filter on it (without an Oops). */
1236 	tcf_destroy_chain(&q->filter_list);
1237 
1238 	for (i = 0; i < q->clhash.hashsize; i++) {
1239 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
1240 			tcf_destroy_chain(&cl->filter_list);
1241 	}
1242 	for (i = 0; i < q->clhash.hashsize; i++) {
1243 		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
1244 					  common.hnode)
1245 			htb_destroy_class(sch, cl);
1246 	}
1247 	qdisc_class_hash_destroy(&q->clhash);
1248 	__skb_queue_purge(&q->direct_queue);
1249 }
1250 
1251 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1252 {
1253 	struct htb_sched *q = qdisc_priv(sch);
1254 	struct htb_class *cl = (struct htb_class *)arg;
1255 	unsigned int qlen;
1256 	struct Qdisc *new_q = NULL;
1257 	int last_child = 0;
1258 
1259 	// TODO: why don't we allow deleting a subtree? references? does the
1260 	// tc subsystem guarantee us that in htb_destroy it holds no class
1261 	// refs so that we can remove children safely there?
1262 	if (cl->children || cl->filter_cnt)
1263 		return -EBUSY;
1264 
1265 	if (!cl->level && htb_parent_last_child(cl)) {
1266 		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1267 					  &pfifo_qdisc_ops,
1268 					  cl->parent->common.classid);
1269 		last_child = 1;
1270 	}
1271 
1272 	sch_tree_lock(sch);
1273 
1274 	if (!cl->level) {
1275 		qlen = cl->un.leaf.q->q.qlen;
1276 		qdisc_reset(cl->un.leaf.q);
1277 		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
1278 	}
1279 
1280 	/* delete from hash and active; remainder in destroy_class */
1281 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1282 	cl->parent->children--;
1283 
1284 	if (cl->prio_activity)
1285 		htb_deactivate(q, cl);
1286 
1287 	if (cl->cmode != HTB_CAN_SEND)
1288 		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
1289 
1290 	if (last_child)
1291 		htb_parent_to_leaf(q, cl, new_q);
1292 
1293 	if (--cl->refcnt == 0)
1294 		htb_destroy_class(sch, cl);
1295 
1296 	sch_tree_unlock(sch);
1297 	return 0;
1298 }
1299 
1300 static void htb_put(struct Qdisc *sch, unsigned long arg)
1301 {
1302 	struct htb_class *cl = (struct htb_class *)arg;
1303 
1304 	if (--cl->refcnt == 0)
1305 		htb_destroy_class(sch, cl);
1306 }
1307 
1308 static int htb_change_class(struct Qdisc *sch, u32 classid,
1309 			    u32 parentid, struct nlattr **tca,
1310 			    unsigned long *arg)
1311 {
1312 	int err = -EINVAL;
1313 	struct htb_sched *q = qdisc_priv(sch);
1314 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1315 	struct nlattr *opt = tca[TCA_OPTIONS];
1316 	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
1317 	struct nlattr *tb[TCA_HTB_RTAB + 1];
1318 	struct tc_htb_opt *hopt;
1319 
1320 	/* extract all subattrs from opt attr */
1321 	if (!opt)
1322 		goto failure;
1323 
1324 	err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy);
1325 	if (err < 0)
1326 		goto failure;
1327 
1328 	err = -EINVAL;
1329 	if (tb[TCA_HTB_PARMS] == NULL)
1330 		goto failure;
1331 
1332 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1333 
1334 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1335 
1336 	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
1337 	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
1338 	if (!rtab || !ctab)
1339 		goto failure;
1340 
1341 	if (!cl) {		/* new class */
1342 		struct Qdisc *new_q;
1343 		int prio;
1344 		struct {
1345 			struct nlattr		nla;
1346 			struct gnet_estimator	opt;
1347 		} est = {
1348 			.nla = {
1349 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1350 				.nla_type	= TCA_RATE,
1351 			},
1352 			.opt = {
1353 				/* 4s interval, 16s averaging constant */
1354 				.interval	= 2,
1355 				.ewma_log	= 2,
1356 			},
1357 		};
1358 
1359 		/* check for valid classid */
1360 		if (!classid || TC_H_MAJ(classid ^ sch->handle)
1361 		    || htb_find(classid, sch))
1362 			goto failure;
1363 
1364 		/* check maximal depth */
1365 		if (parent && parent->parent && parent->parent->level < 2) {
1366 			printk(KERN_ERR "htb: tree is too deep\n");
1367 			goto failure;
1368 		}
1369 		err = -ENOBUFS;
1370 		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
1371 			goto failure;
1372 
1373 		gen_new_estimator(&cl->bstats, &cl->rate_est,
1374 				  qdisc_root_lock(sch),
1375 				  tca[TCA_RATE] ? : &est.nla);
1376 		cl->refcnt = 1;
1377 		cl->children = 0;
1378 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1379 		RB_CLEAR_NODE(&cl->pq_node);
1380 
1381 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1382 			RB_CLEAR_NODE(&cl->node[prio]);
1383 
1384 		/* create the leaf qdisc early because it uses kmalloc(GFP_KERNEL),
1385 		   so it can't be created inside of sch_tree_lock
1386 		   -- thanks to Karlis Peisenieks */
1387 		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1388 					  &pfifo_qdisc_ops, classid);
1389 		sch_tree_lock(sch);
1390 		if (parent && !parent->level) {
1391 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1392 
1393 			/* turn parent into inner node */
1394 			qdisc_reset(parent->un.leaf.q);
1395 			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
1396 			qdisc_destroy(parent->un.leaf.q);
1397 			if (parent->prio_activity)
1398 				htb_deactivate(q, parent);
1399 
1400 			/* remove from evt list because of level change */
1401 			if (parent->cmode != HTB_CAN_SEND) {
1402 				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
1403 				parent->cmode = HTB_CAN_SEND;
1404 			}
1405 			parent->level = (parent->parent ? parent->parent->level
1406 					 : TC_HTB_MAXDEPTH) - 1;
1407 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1408 		}
1409 		/* leaf (we) needs elementary qdisc */
1410 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1411 
1412 		cl->common.classid = classid;
1413 		cl->parent = parent;
1414 
1415 		/* set class to be in HTB_CAN_SEND state */
1416 		cl->tokens = hopt->buffer;
1417 		cl->ctokens = hopt->cbuffer;
1418 		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
1419 		cl->t_c = psched_get_time();
1420 		cl->cmode = HTB_CAN_SEND;
1421 
1422 		/* attach to the hash list and parent's family */
1423 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1424 		if (parent)
1425 			parent->children++;
1426 	} else {
1427 		if (tca[TCA_RATE])
1428 			gen_replace_estimator(&cl->bstats, &cl->rate_est,
1429 					      qdisc_root_lock(sch),
1430 					      tca[TCA_RATE]);
1431 		sch_tree_lock(sch);
1432 	}
1433 
1434 	/* there used to be a nasty bug here: we have to check that the node
1435 	   is really a leaf before changing cl->un.leaf! */
1436 	if (!cl->level) {
1437 		cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum;
1438 		if (!hopt->quantum && cl->un.leaf.quantum < 1000) {
1439 			printk(KERN_WARNING
1440 			       "HTB: quantum of class %X is small. Consider r2q change.\n",
1441 			       cl->common.classid);
1442 			cl->un.leaf.quantum = 1000;
1443 		}
1444 		if (!hopt->quantum && cl->un.leaf.quantum > 200000) {
1445 			printk(KERN_WARNING
1446 			       "HTB: quantum of class %X is big. Consider r2q change.\n",
1447 			       cl->common.classid);
1448 			cl->un.leaf.quantum = 200000;
1449 		}
1450 		if (hopt->quantum)
1451 			cl->un.leaf.quantum = hopt->quantum;
1452 		if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO)
1453 			cl->un.leaf.prio = TC_HTB_NUMPRIO - 1;
1454 
1455 		/* backup for htb_parent_to_leaf */
1456 		cl->quantum = cl->un.leaf.quantum;
1457 		cl->prio = cl->un.leaf.prio;
1458 	}
1459 
1460 	cl->buffer = hopt->buffer;
1461 	cl->cbuffer = hopt->cbuffer;
1462 	if (cl->rate)
1463 		qdisc_put_rtab(cl->rate);
1464 	cl->rate = rtab;
1465 	if (cl->ceil)
1466 		qdisc_put_rtab(cl->ceil);
1467 	cl->ceil = ctab;
1468 	sch_tree_unlock(sch);
1469 
1470 	qdisc_class_hash_grow(sch, &q->clhash);
1471 
1472 	*arg = (unsigned long)cl;
1473 	return 0;
1474 
1475 failure:
1476 	if (rtab)
1477 		qdisc_put_rtab(rtab);
1478 	if (ctab)
1479 		qdisc_put_rtab(ctab);
1480 	return err;
1481 }
1482 
1483 static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1484 {
1485 	struct htb_sched *q = qdisc_priv(sch);
1486 	struct htb_class *cl = (struct htb_class *)arg;
1487 	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1488 
1489 	return fl;
1490 }
1491 
1492 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1493 				     u32 classid)
1494 {
1495 	struct htb_class *cl = htb_find(classid, sch);
1496 
1497 	/*if (cl && !cl->level) return 0;
1498 	   The line above used to be there to prevent attaching filters to
1499 	   leaves. But at least the tc_index filter uses this just to get the
1500 	   class for other reasons, so we have to allow it.
1501 	   ----
1502 	   19.6.2002 As Werner explained it is ok - bind filter is just
1503 	   another way to "lock" the class - unlike "get" this lock can
1504 	   be broken by class during destroy IIUC.
1505 	 */
1506 	if (cl)
1507 		cl->filter_cnt++;
1508 	return (unsigned long)cl;
1509 }
1510 
1511 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1512 {
1513 	struct htb_class *cl = (struct htb_class *)arg;
1514 
1515 	if (cl)
1516 		cl->filter_cnt--;
1517 }
1518 
1519 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1520 {
1521 	struct htb_sched *q = qdisc_priv(sch);
1522 	struct htb_class *cl;
1523 	struct hlist_node *n;
1524 	unsigned int i;
1525 
1526 	if (arg->stop)
1527 		return;
1528 
1529 	for (i = 0; i < q->clhash.hashsize; i++) {
1530 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
1531 			if (arg->count < arg->skip) {
1532 				arg->count++;
1533 				continue;
1534 			}
1535 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1536 				arg->stop = 1;
1537 				return;
1538 			}
1539 			arg->count++;
1540 		}
1541 	}
1542 }
1543 
1544 static const struct Qdisc_class_ops htb_class_ops = {
1545 	.graft		=	htb_graft,
1546 	.leaf		=	htb_leaf,
1547 	.qlen_notify	=	htb_qlen_notify,
1548 	.get		=	htb_get,
1549 	.put		=	htb_put,
1550 	.change		=	htb_change_class,
1551 	.delete		=	htb_delete,
1552 	.walk		=	htb_walk,
1553 	.tcf_chain	=	htb_find_tcf,
1554 	.bind_tcf	=	htb_bind_filter,
1555 	.unbind_tcf	=	htb_unbind_filter,
1556 	.dump		=	htb_dump_class,
1557 	.dump_stats	=	htb_dump_class_stats,
1558 };
1559 
1560 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1561 	.next		=	NULL,
1562 	.cl_ops		=	&htb_class_ops,
1563 	.id		=	"htb",
1564 	.priv_size	=	sizeof(struct htb_sched),
1565 	.enqueue	=	htb_enqueue,
1566 	.dequeue	=	htb_dequeue,
1567 	.requeue	=	htb_requeue,
1568 	.drop		=	htb_drop,
1569 	.init		=	htb_init,
1570 	.reset		=	htb_reset,
1571 	.destroy	=	htb_destroy,
1572 	.change		=	NULL /* htb_change */,
1573 	.dump		=	htb_dump,
1574 	.owner		=	THIS_MODULE,
1575 };
1576 
1577 static int __init htb_module_init(void)
1578 {
1579 	return register_qdisc(&htb_qdisc_ops);
1580 }
1581 static void __exit htb_module_exit(void)
1582 {
1583 	unregister_qdisc(&htb_qdisc_ops);
1584 }
1585 
1586 module_init(htb_module_init)
1587 module_exit(htb_module_exit)
1588 MODULE_LICENSE("GPL");
1589