xref: /linux/net/sched/sch_htb.c (revision 429508c84d95811dd1300181dfe84743caff9a38)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
4  *
5  * Authors:	Martin Devera, <devik@cdi.cz>
6  *
7  * Credits (in time order) for older HTB versions:
8  *              Stef Coene <stef.coene@docum.org>
9  *			HTB support at LARTC mailing list
10  *		Ondrej Kraus, <krauso@barr.cz>
11  *			found missing INIT_QDISC(htb)
12  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
13  *			helped a lot to locate nasty class stall bug
14  *		Andi Kleen, Jamal Hadi, Bert Hubert
15  *			code review and helpful comments on shaping
16  *		Tomasz Wrona, <tw@eter.tym.pl>
17  *			created test case so that I was able to fix nasty bug
18  *		Wilfried Weissmann
19  *			spotted bug in dequeue code and helped with fix
20  *		Jiri Fojtasek
21  *			fixed requeue routine
22  *		and many others. thanks.
23  */
24 #include <linux/module.h>
25 #include <linux/moduleparam.h>
26 #include <linux/types.h>
27 #include <linux/kernel.h>
28 #include <linux/string.h>
29 #include <linux/errno.h>
30 #include <linux/skbuff.h>
31 #include <linux/list.h>
32 #include <linux/compiler.h>
33 #include <linux/rbtree.h>
34 #include <linux/workqueue.h>
35 #include <linux/slab.h>
36 #include <net/netlink.h>
37 #include <net/sch_generic.h>
38 #include <net/pkt_sched.h>
39 #include <net/pkt_cls.h>
40 
41 /* HTB algorithm.
42     Author: devik@cdi.cz
43     ========================================================================
44     HTB is like TBF with multiple classes. It is also similar to CBQ because
45     it allows to assign priority to each class in hierarchy.
46     In fact it is another implementation of Floyd's formal sharing.
47 
48     Levels:
49     Each class is assigned level. Leaf has ALWAYS level 0 and root
50     classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
51     one less than their parent.
52 */
53 
54 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
55 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
56 
57 #if HTB_VER >> 16 != TC_HTB_PROTOVER
58 #error "Mismatched sch_htb.c and pkt_sch.h"
59 #endif
60 
61 /* Module parameter and sysfs export */
62 module_param    (htb_hysteresis, int, 0640);
63 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
64 
65 static int htb_rate_est = 0; /* htb classes have a default rate estimator */
66 module_param(htb_rate_est, int, 0640);
67 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
68 
69 /* used internaly to keep status of single class */
70 enum htb_cmode {
71 	HTB_CANT_SEND,		/* class can't send and can't borrow */
72 	HTB_MAY_BORROW,		/* class can't send but may borrow */
73 	HTB_CAN_SEND		/* class can send */
74 };
75 
76 struct htb_prio {
77 	union {
78 		struct rb_root	row;
79 		struct rb_root	feed;
80 	};
81 	struct rb_node	*ptr;
82 	/* When class changes from state 1->2 and disconnects from
83 	 * parent's feed then we lost ptr value and start from the
84 	 * first child again. Here we store classid of the
85 	 * last valid ptr (used when ptr is NULL).
86 	 */
87 	u32		last_ptr_id;
88 };
89 
90 /* interior & leaf nodes; props specific to leaves are marked L:
91  * To reduce false sharing, place mostly read fields at beginning,
92  * and mostly written ones at the end.
93  */
94 struct htb_class {
95 	struct Qdisc_class_common common;
96 	struct psched_ratecfg	rate;
97 	struct psched_ratecfg	ceil;
98 	s64			buffer, cbuffer;/* token bucket depth/rate */
99 	s64			mbuffer;	/* max wait time */
100 	u32			prio;		/* these two are used only by leaves... */
101 	int			quantum;	/* but stored for parent-to-leaf return */
102 
103 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
104 	struct tcf_block	*block;
105 
106 	int			level;		/* our level (see above) */
107 	unsigned int		children;
108 	struct htb_class	*parent;	/* parent class */
109 
110 	struct net_rate_estimator __rcu *rate_est;
111 
112 	/*
113 	 * Written often fields
114 	 */
115 	struct gnet_stats_basic_sync bstats;
116 	struct gnet_stats_basic_sync bstats_bias;
117 	struct tc_htb_xstats	xstats;	/* our special stats */
118 
119 	/* token bucket parameters */
120 	s64			tokens, ctokens;/* current number of tokens */
121 	s64			t_c;		/* checkpoint time */
122 
123 	union {
124 		struct htb_class_leaf {
125 			int		deficit[TC_HTB_MAXDEPTH];
126 			struct Qdisc	*q;
127 			struct netdev_queue *offload_queue;
128 		} leaf;
129 		struct htb_class_inner {
130 			struct htb_prio clprio[TC_HTB_NUMPRIO];
131 		} inner;
132 	};
133 	s64			pq_key;
134 
135 	int			prio_activity;	/* for which prios are we active */
136 	enum htb_cmode		cmode;		/* current mode of the class */
137 	struct rb_node		pq_node;	/* node for event queue */
138 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
139 
140 	unsigned int drops ____cacheline_aligned_in_smp;
141 	unsigned int		overlimits;
142 };
143 
144 struct htb_level {
145 	struct rb_root	wait_pq;
146 	struct htb_prio hprio[TC_HTB_NUMPRIO];
147 };
148 
149 struct htb_sched {
150 	struct Qdisc_class_hash clhash;
151 	int			defcls;		/* class where unclassified flows go to */
152 	int			rate2quantum;	/* quant = rate / rate2quantum */
153 
154 	/* filters for qdisc itself */
155 	struct tcf_proto __rcu	*filter_list;
156 	struct tcf_block	*block;
157 
158 #define HTB_WARN_TOOMANYEVENTS	0x1
159 	unsigned int		warned;	/* only one warning */
160 	int			direct_qlen;
161 	struct work_struct	work;
162 
163 	/* non shaped skbs; let them go directly thru */
164 	struct qdisc_skb_head	direct_queue;
165 	u32			direct_pkts;
166 	u32			overlimits;
167 
168 	struct qdisc_watchdog	watchdog;
169 
170 	s64			now;	/* cached dequeue time */
171 
172 	/* time of nearest event per level (row) */
173 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
174 
175 	int			row_mask[TC_HTB_MAXDEPTH];
176 
177 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
178 
179 	struct Qdisc		**direct_qdiscs;
180 	unsigned int            num_direct_qdiscs;
181 
182 	bool			offload;
183 };
184 
185 /* find class in global hash table using given handle */
186 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
187 {
188 	struct htb_sched *q = qdisc_priv(sch);
189 	struct Qdisc_class_common *clc;
190 
191 	clc = qdisc_class_find(&q->clhash, handle);
192 	if (clc == NULL)
193 		return NULL;
194 	return container_of(clc, struct htb_class, common);
195 }
196 
197 static unsigned long htb_search(struct Qdisc *sch, u32 handle)
198 {
199 	return (unsigned long)htb_find(handle, sch);
200 }
201 
202 #define HTB_DIRECT ((struct htb_class *)-1L)
203 
204 /**
205  * htb_classify - classify a packet into class
206  * @skb: the socket buffer
207  * @sch: the active queue discipline
208  * @qerr: pointer for returned status code
209  *
210  * It returns NULL if the packet should be dropped or -1 if the packet
211  * should be passed directly thru. In all other cases leaf class is returned.
212  * We allow direct class selection by classid in priority. The we examine
213  * filters in qdisc and in inner nodes (if higher filter points to the inner
214  * node). If we end up with classid MAJOR:0 we enqueue the skb into special
215  * internal fifo (direct). These packets then go directly thru. If we still
216  * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
217  * then finish and return direct queue.
218  */
219 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
220 				      int *qerr)
221 {
222 	struct htb_sched *q = qdisc_priv(sch);
223 	struct htb_class *cl;
224 	struct tcf_result res;
225 	struct tcf_proto *tcf;
226 	int result;
227 
228 	/* allow to select class by setting skb->priority to valid classid;
229 	 * note that nfmark can be used too by attaching filter fw with no
230 	 * rules in it
231 	 */
232 	if (skb->priority == sch->handle)
233 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
234 	cl = htb_find(skb->priority, sch);
235 	if (cl) {
236 		if (cl->level == 0)
237 			return cl;
238 		/* Start with inner filter chain if a non-leaf class is selected */
239 		tcf = rcu_dereference_bh(cl->filter_list);
240 	} else {
241 		tcf = rcu_dereference_bh(q->filter_list);
242 	}
243 
244 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
245 	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
246 #ifdef CONFIG_NET_CLS_ACT
247 		switch (result) {
248 		case TC_ACT_QUEUED:
249 		case TC_ACT_STOLEN:
250 		case TC_ACT_TRAP:
251 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
252 			fallthrough;
253 		case TC_ACT_SHOT:
254 			return NULL;
255 		}
256 #endif
257 		cl = (void *)res.class;
258 		if (!cl) {
259 			if (res.classid == sch->handle)
260 				return HTB_DIRECT;	/* X:0 (direct flow) */
261 			cl = htb_find(res.classid, sch);
262 			if (!cl)
263 				break;	/* filter selected invalid classid */
264 		}
265 		if (!cl->level)
266 			return cl;	/* we hit leaf; return it */
267 
268 		/* we have got inner class; apply inner filter chain */
269 		tcf = rcu_dereference_bh(cl->filter_list);
270 	}
271 	/* classification failed; try to use default class */
272 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
273 	if (!cl || cl->level)
274 		return HTB_DIRECT;	/* bad default .. this is safe bet */
275 	return cl;
276 }
277 
278 /**
279  * htb_add_to_id_tree - adds class to the round robin list
280  * @root: the root of the tree
281  * @cl: the class to add
282  * @prio: the give prio in class
283  *
284  * Routine adds class to the list (actually tree) sorted by classid.
285  * Make sure that class is not already on such list for given prio.
286  */
287 static void htb_add_to_id_tree(struct rb_root *root,
288 			       struct htb_class *cl, int prio)
289 {
290 	struct rb_node **p = &root->rb_node, *parent = NULL;
291 
292 	while (*p) {
293 		struct htb_class *c;
294 		parent = *p;
295 		c = rb_entry(parent, struct htb_class, node[prio]);
296 
297 		if (cl->common.classid > c->common.classid)
298 			p = &parent->rb_right;
299 		else
300 			p = &parent->rb_left;
301 	}
302 	rb_link_node(&cl->node[prio], parent, p);
303 	rb_insert_color(&cl->node[prio], root);
304 }
305 
306 /**
307  * htb_add_to_wait_tree - adds class to the event queue with delay
308  * @q: the priority event queue
309  * @cl: the class to add
310  * @delay: delay in microseconds
311  *
312  * The class is added to priority event queue to indicate that class will
313  * change its mode in cl->pq_key microseconds. Make sure that class is not
314  * already in the queue.
315  */
316 static void htb_add_to_wait_tree(struct htb_sched *q,
317 				 struct htb_class *cl, s64 delay)
318 {
319 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
320 
321 	cl->pq_key = q->now + delay;
322 	if (cl->pq_key == q->now)
323 		cl->pq_key++;
324 
325 	/* update the nearest event cache */
326 	if (q->near_ev_cache[cl->level] > cl->pq_key)
327 		q->near_ev_cache[cl->level] = cl->pq_key;
328 
329 	while (*p) {
330 		struct htb_class *c;
331 		parent = *p;
332 		c = rb_entry(parent, struct htb_class, pq_node);
333 		if (cl->pq_key >= c->pq_key)
334 			p = &parent->rb_right;
335 		else
336 			p = &parent->rb_left;
337 	}
338 	rb_link_node(&cl->pq_node, parent, p);
339 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
340 }
341 
342 /**
343  * htb_next_rb_node - finds next node in binary tree
344  * @n: the current node in binary tree
345  *
346  * When we are past last key we return NULL.
347  * Average complexity is 2 steps per call.
348  */
349 static inline void htb_next_rb_node(struct rb_node **n)
350 {
351 	*n = rb_next(*n);
352 }
353 
354 /**
355  * htb_add_class_to_row - add class to its row
356  * @q: the priority event queue
357  * @cl: the class to add
358  * @mask: the given priorities in class in bitmap
359  *
360  * The class is added to row at priorities marked in mask.
361  * It does nothing if mask == 0.
362  */
363 static inline void htb_add_class_to_row(struct htb_sched *q,
364 					struct htb_class *cl, int mask)
365 {
366 	q->row_mask[cl->level] |= mask;
367 	while (mask) {
368 		int prio = ffz(~mask);
369 		mask &= ~(1 << prio);
370 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
371 	}
372 }
373 
374 /* If this triggers, it is a bug in this code, but it need not be fatal */
375 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
376 {
377 	if (RB_EMPTY_NODE(rb)) {
378 		WARN_ON(1);
379 	} else {
380 		rb_erase(rb, root);
381 		RB_CLEAR_NODE(rb);
382 	}
383 }
384 
385 
386 /**
387  * htb_remove_class_from_row - removes class from its row
388  * @q: the priority event queue
389  * @cl: the class to add
390  * @mask: the given priorities in class in bitmap
391  *
392  * The class is removed from row at priorities marked in mask.
393  * It does nothing if mask == 0.
394  */
395 static inline void htb_remove_class_from_row(struct htb_sched *q,
396 						 struct htb_class *cl, int mask)
397 {
398 	int m = 0;
399 	struct htb_level *hlevel = &q->hlevel[cl->level];
400 
401 	while (mask) {
402 		int prio = ffz(~mask);
403 		struct htb_prio *hprio = &hlevel->hprio[prio];
404 
405 		mask &= ~(1 << prio);
406 		if (hprio->ptr == cl->node + prio)
407 			htb_next_rb_node(&hprio->ptr);
408 
409 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
410 		if (!hprio->row.rb_node)
411 			m |= 1 << prio;
412 	}
413 	q->row_mask[cl->level] &= ~m;
414 }
415 
416 /**
417  * htb_activate_prios - creates active classe's feed chain
418  * @q: the priority event queue
419  * @cl: the class to activate
420  *
421  * The class is connected to ancestors and/or appropriate rows
422  * for priorities it is participating on. cl->cmode must be new
423  * (activated) mode. It does nothing if cl->prio_activity == 0.
424  */
425 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
426 {
427 	struct htb_class *p = cl->parent;
428 	long m, mask = cl->prio_activity;
429 
430 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
431 		m = mask;
432 		while (m) {
433 			unsigned int prio = ffz(~m);
434 
435 			if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio)))
436 				break;
437 			m &= ~(1 << prio);
438 
439 			if (p->inner.clprio[prio].feed.rb_node)
440 				/* parent already has its feed in use so that
441 				 * reset bit in mask as parent is already ok
442 				 */
443 				mask &= ~(1 << prio);
444 
445 			htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio);
446 		}
447 		p->prio_activity |= mask;
448 		cl = p;
449 		p = cl->parent;
450 
451 	}
452 	if (cl->cmode == HTB_CAN_SEND && mask)
453 		htb_add_class_to_row(q, cl, mask);
454 }
455 
456 /**
457  * htb_deactivate_prios - remove class from feed chain
458  * @q: the priority event queue
459  * @cl: the class to deactivate
460  *
461  * cl->cmode must represent old mode (before deactivation). It does
462  * nothing if cl->prio_activity == 0. Class is removed from all feed
463  * chains and rows.
464  */
465 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
466 {
467 	struct htb_class *p = cl->parent;
468 	long m, mask = cl->prio_activity;
469 
470 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
471 		m = mask;
472 		mask = 0;
473 		while (m) {
474 			int prio = ffz(~m);
475 			m &= ~(1 << prio);
476 
477 			if (p->inner.clprio[prio].ptr == cl->node + prio) {
478 				/* we are removing child which is pointed to from
479 				 * parent feed - forget the pointer but remember
480 				 * classid
481 				 */
482 				p->inner.clprio[prio].last_ptr_id = cl->common.classid;
483 				p->inner.clprio[prio].ptr = NULL;
484 			}
485 
486 			htb_safe_rb_erase(cl->node + prio,
487 					  &p->inner.clprio[prio].feed);
488 
489 			if (!p->inner.clprio[prio].feed.rb_node)
490 				mask |= 1 << prio;
491 		}
492 
493 		p->prio_activity &= ~mask;
494 		cl = p;
495 		p = cl->parent;
496 
497 	}
498 	if (cl->cmode == HTB_CAN_SEND && mask)
499 		htb_remove_class_from_row(q, cl, mask);
500 }
501 
502 static inline s64 htb_lowater(const struct htb_class *cl)
503 {
504 	if (htb_hysteresis)
505 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
506 	else
507 		return 0;
508 }
509 static inline s64 htb_hiwater(const struct htb_class *cl)
510 {
511 	if (htb_hysteresis)
512 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
513 	else
514 		return 0;
515 }
516 
517 
518 /**
519  * htb_class_mode - computes and returns current class mode
520  * @cl: the target class
521  * @diff: diff time in microseconds
522  *
523  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
524  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
525  * from now to time when cl will change its state.
526  * Also it is worth to note that class mode doesn't change simply
527  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
528  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
529  * mode transitions per time unit. The speed gain is about 1/6.
530  */
531 static inline enum htb_cmode
532 htb_class_mode(struct htb_class *cl, s64 *diff)
533 {
534 	s64 toks;
535 
536 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
537 		*diff = -toks;
538 		return HTB_CANT_SEND;
539 	}
540 
541 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
542 		return HTB_CAN_SEND;
543 
544 	*diff = -toks;
545 	return HTB_MAY_BORROW;
546 }
547 
548 /**
549  * htb_change_class_mode - changes classe's mode
550  * @q: the priority event queue
551  * @cl: the target class
552  * @diff: diff time in microseconds
553  *
554  * This should be the only way how to change classe's mode under normal
555  * circumstances. Routine will update feed lists linkage, change mode
556  * and add class to the wait event queue if appropriate. New mode should
557  * be different from old one and cl->pq_key has to be valid if changing
558  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
559  */
560 static void
561 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
562 {
563 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
564 
565 	if (new_mode == cl->cmode)
566 		return;
567 
568 	if (new_mode == HTB_CANT_SEND) {
569 		cl->overlimits++;
570 		q->overlimits++;
571 	}
572 
573 	if (cl->prio_activity) {	/* not necessary: speed optimization */
574 		if (cl->cmode != HTB_CANT_SEND)
575 			htb_deactivate_prios(q, cl);
576 		cl->cmode = new_mode;
577 		if (new_mode != HTB_CANT_SEND)
578 			htb_activate_prios(q, cl);
579 	} else
580 		cl->cmode = new_mode;
581 }
582 
583 /**
584  * htb_activate - inserts leaf cl into appropriate active feeds
585  * @q: the priority event queue
586  * @cl: the target class
587  *
588  * Routine learns (new) priority of leaf and activates feed chain
589  * for the prio. It can be called on already active leaf safely.
590  * It also adds leaf into droplist.
591  */
592 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
593 {
594 	WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen);
595 
596 	if (!cl->prio_activity) {
597 		cl->prio_activity = 1 << cl->prio;
598 		htb_activate_prios(q, cl);
599 	}
600 }
601 
602 /**
603  * htb_deactivate - remove leaf cl from active feeds
604  * @q: the priority event queue
605  * @cl: the target class
606  *
607  * Make sure that leaf is active. In the other words it can't be called
608  * with non-active leaf. It also removes class from the drop list.
609  */
610 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
611 {
612 	WARN_ON(!cl->prio_activity);
613 
614 	htb_deactivate_prios(q, cl);
615 	cl->prio_activity = 0;
616 }
617 
618 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
619 		       struct sk_buff **to_free)
620 {
621 	int ret;
622 	unsigned int len = qdisc_pkt_len(skb);
623 	struct htb_sched *q = qdisc_priv(sch);
624 	struct htb_class *cl = htb_classify(skb, sch, &ret);
625 
626 	if (cl == HTB_DIRECT) {
627 		/* enqueue to helper queue */
628 		if (q->direct_queue.qlen < q->direct_qlen) {
629 			__qdisc_enqueue_tail(skb, &q->direct_queue);
630 			q->direct_pkts++;
631 		} else {
632 			return qdisc_drop(skb, sch, to_free);
633 		}
634 #ifdef CONFIG_NET_CLS_ACT
635 	} else if (!cl) {
636 		if (ret & __NET_XMIT_BYPASS)
637 			qdisc_qstats_drop(sch);
638 		__qdisc_drop(skb, to_free);
639 		return ret;
640 #endif
641 	} else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
642 					to_free)) != NET_XMIT_SUCCESS) {
643 		if (net_xmit_drop_count(ret)) {
644 			qdisc_qstats_drop(sch);
645 			cl->drops++;
646 		}
647 		return ret;
648 	} else {
649 		htb_activate(q, cl);
650 	}
651 
652 	sch->qstats.backlog += len;
653 	sch->q.qlen++;
654 	return NET_XMIT_SUCCESS;
655 }
656 
657 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
658 {
659 	s64 toks = diff + cl->tokens;
660 
661 	if (toks > cl->buffer)
662 		toks = cl->buffer;
663 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
664 	if (toks <= -cl->mbuffer)
665 		toks = 1 - cl->mbuffer;
666 
667 	cl->tokens = toks;
668 }
669 
670 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
671 {
672 	s64 toks = diff + cl->ctokens;
673 
674 	if (toks > cl->cbuffer)
675 		toks = cl->cbuffer;
676 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
677 	if (toks <= -cl->mbuffer)
678 		toks = 1 - cl->mbuffer;
679 
680 	cl->ctokens = toks;
681 }
682 
683 /**
684  * htb_charge_class - charges amount "bytes" to leaf and ancestors
685  * @q: the priority event queue
686  * @cl: the class to start iterate
687  * @level: the minimum level to account
688  * @skb: the socket buffer
689  *
690  * Routine assumes that packet "bytes" long was dequeued from leaf cl
691  * borrowing from "level". It accounts bytes to ceil leaky bucket for
692  * leaf and all ancestors and to rate bucket for ancestors at levels
693  * "level" and higher. It also handles possible change of mode resulting
694  * from the update. Note that mode can also increase here (MAY_BORROW to
695  * CAN_SEND) because we can use more precise clock that event queue here.
696  * In such case we remove class from event queue first.
697  */
698 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
699 			     int level, struct sk_buff *skb)
700 {
701 	int bytes = qdisc_pkt_len(skb);
702 	enum htb_cmode old_mode;
703 	s64 diff;
704 
705 	while (cl) {
706 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
707 		if (cl->level >= level) {
708 			if (cl->level == level)
709 				cl->xstats.lends++;
710 			htb_accnt_tokens(cl, bytes, diff);
711 		} else {
712 			cl->xstats.borrows++;
713 			cl->tokens += diff;	/* we moved t_c; update tokens */
714 		}
715 		htb_accnt_ctokens(cl, bytes, diff);
716 		cl->t_c = q->now;
717 
718 		old_mode = cl->cmode;
719 		diff = 0;
720 		htb_change_class_mode(q, cl, &diff);
721 		if (old_mode != cl->cmode) {
722 			if (old_mode != HTB_CAN_SEND)
723 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
724 			if (cl->cmode != HTB_CAN_SEND)
725 				htb_add_to_wait_tree(q, cl, diff);
726 		}
727 
728 		/* update basic stats except for leaves which are already updated */
729 		if (cl->level)
730 			bstats_update(&cl->bstats, skb);
731 
732 		cl = cl->parent;
733 	}
734 }
735 
736 /**
737  * htb_do_events - make mode changes to classes at the level
738  * @q: the priority event queue
739  * @level: which wait_pq in 'q->hlevel'
740  * @start: start jiffies
741  *
742  * Scans event queue for pending events and applies them. Returns time of
743  * next pending event (0 for no event in pq, q->now for too many events).
744  * Note: Applied are events whose have cl->pq_key <= q->now.
745  */
746 static s64 htb_do_events(struct htb_sched *q, const int level,
747 			 unsigned long start)
748 {
749 	/* don't run for longer than 2 jiffies; 2 is used instead of
750 	 * 1 to simplify things when jiffy is going to be incremented
751 	 * too soon
752 	 */
753 	unsigned long stop_at = start + 2;
754 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
755 
756 	while (time_before(jiffies, stop_at)) {
757 		struct htb_class *cl;
758 		s64 diff;
759 		struct rb_node *p = rb_first(wait_pq);
760 
761 		if (!p)
762 			return 0;
763 
764 		cl = rb_entry(p, struct htb_class, pq_node);
765 		if (cl->pq_key > q->now)
766 			return cl->pq_key;
767 
768 		htb_safe_rb_erase(p, wait_pq);
769 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
770 		htb_change_class_mode(q, cl, &diff);
771 		if (cl->cmode != HTB_CAN_SEND)
772 			htb_add_to_wait_tree(q, cl, diff);
773 	}
774 
775 	/* too much load - let's continue after a break for scheduling */
776 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
777 		pr_warn("htb: too many events!\n");
778 		q->warned |= HTB_WARN_TOOMANYEVENTS;
779 	}
780 
781 	return q->now;
782 }
783 
784 /* Returns class->node+prio from id-tree where classe's id is >= id. NULL
785  * is no such one exists.
786  */
787 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
788 					      u32 id)
789 {
790 	struct rb_node *r = NULL;
791 	while (n) {
792 		struct htb_class *cl =
793 		    rb_entry(n, struct htb_class, node[prio]);
794 
795 		if (id > cl->common.classid) {
796 			n = n->rb_right;
797 		} else if (id < cl->common.classid) {
798 			r = n;
799 			n = n->rb_left;
800 		} else {
801 			return n;
802 		}
803 	}
804 	return r;
805 }
806 
807 /**
808  * htb_lookup_leaf - returns next leaf class in DRR order
809  * @hprio: the current one
810  * @prio: which prio in class
811  *
812  * Find leaf where current feed pointers points to.
813  */
814 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
815 {
816 	int i;
817 	struct {
818 		struct rb_node *root;
819 		struct rb_node **pptr;
820 		u32 *pid;
821 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
822 
823 	BUG_ON(!hprio->row.rb_node);
824 	sp->root = hprio->row.rb_node;
825 	sp->pptr = &hprio->ptr;
826 	sp->pid = &hprio->last_ptr_id;
827 
828 	for (i = 0; i < 65535; i++) {
829 		if (!*sp->pptr && *sp->pid) {
830 			/* ptr was invalidated but id is valid - try to recover
831 			 * the original or next ptr
832 			 */
833 			*sp->pptr =
834 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
835 		}
836 		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
837 				 * can become out of date quickly
838 				 */
839 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
840 			*sp->pptr = sp->root;
841 			while ((*sp->pptr)->rb_left)
842 				*sp->pptr = (*sp->pptr)->rb_left;
843 			if (sp > stk) {
844 				sp--;
845 				if (!*sp->pptr) {
846 					WARN_ON(1);
847 					return NULL;
848 				}
849 				htb_next_rb_node(sp->pptr);
850 			}
851 		} else {
852 			struct htb_class *cl;
853 			struct htb_prio *clp;
854 
855 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
856 			if (!cl->level)
857 				return cl;
858 			clp = &cl->inner.clprio[prio];
859 			(++sp)->root = clp->feed.rb_node;
860 			sp->pptr = &clp->ptr;
861 			sp->pid = &clp->last_ptr_id;
862 		}
863 	}
864 	WARN_ON(1);
865 	return NULL;
866 }
867 
868 /* dequeues packet at given priority and level; call only if
869  * you are sure that there is active class at prio/level
870  */
871 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
872 					const int level)
873 {
874 	struct sk_buff *skb = NULL;
875 	struct htb_class *cl, *start;
876 	struct htb_level *hlevel = &q->hlevel[level];
877 	struct htb_prio *hprio = &hlevel->hprio[prio];
878 
879 	/* look initial class up in the row */
880 	start = cl = htb_lookup_leaf(hprio, prio);
881 
882 	do {
883 next:
884 		if (unlikely(!cl))
885 			return NULL;
886 
887 		/* class can be empty - it is unlikely but can be true if leaf
888 		 * qdisc drops packets in enqueue routine or if someone used
889 		 * graft operation on the leaf since last dequeue;
890 		 * simply deactivate and skip such class
891 		 */
892 		if (unlikely(cl->leaf.q->q.qlen == 0)) {
893 			struct htb_class *next;
894 			htb_deactivate(q, cl);
895 
896 			/* row/level might become empty */
897 			if ((q->row_mask[level] & (1 << prio)) == 0)
898 				return NULL;
899 
900 			next = htb_lookup_leaf(hprio, prio);
901 
902 			if (cl == start)	/* fix start if we just deleted it */
903 				start = next;
904 			cl = next;
905 			goto next;
906 		}
907 
908 		skb = cl->leaf.q->dequeue(cl->leaf.q);
909 		if (likely(skb != NULL))
910 			break;
911 
912 		qdisc_warn_nonwc("htb", cl->leaf.q);
913 		htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr:
914 					 &q->hlevel[0].hprio[prio].ptr);
915 		cl = htb_lookup_leaf(hprio, prio);
916 
917 	} while (cl != start);
918 
919 	if (likely(skb != NULL)) {
920 		bstats_update(&cl->bstats, skb);
921 		cl->leaf.deficit[level] -= qdisc_pkt_len(skb);
922 		if (cl->leaf.deficit[level] < 0) {
923 			cl->leaf.deficit[level] += cl->quantum;
924 			htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr :
925 						 &q->hlevel[0].hprio[prio].ptr);
926 		}
927 		/* this used to be after charge_class but this constelation
928 		 * gives us slightly better performance
929 		 */
930 		if (!cl->leaf.q->q.qlen)
931 			htb_deactivate(q, cl);
932 		htb_charge_class(q, cl, level, skb);
933 	}
934 	return skb;
935 }
936 
937 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
938 {
939 	struct sk_buff *skb;
940 	struct htb_sched *q = qdisc_priv(sch);
941 	int level;
942 	s64 next_event;
943 	unsigned long start_at;
944 
945 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
946 	skb = __qdisc_dequeue_head(&q->direct_queue);
947 	if (skb != NULL) {
948 ok:
949 		qdisc_bstats_update(sch, skb);
950 		qdisc_qstats_backlog_dec(sch, skb);
951 		sch->q.qlen--;
952 		return skb;
953 	}
954 
955 	if (!sch->q.qlen)
956 		goto fin;
957 	q->now = ktime_get_ns();
958 	start_at = jiffies;
959 
960 	next_event = q->now + 5LLU * NSEC_PER_SEC;
961 
962 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
963 		/* common case optimization - skip event handler quickly */
964 		int m;
965 		s64 event = q->near_ev_cache[level];
966 
967 		if (q->now >= event) {
968 			event = htb_do_events(q, level, start_at);
969 			if (!event)
970 				event = q->now + NSEC_PER_SEC;
971 			q->near_ev_cache[level] = event;
972 		}
973 
974 		if (next_event > event)
975 			next_event = event;
976 
977 		m = ~q->row_mask[level];
978 		while (m != (int)(-1)) {
979 			int prio = ffz(m);
980 
981 			m |= 1 << prio;
982 			skb = htb_dequeue_tree(q, prio, level);
983 			if (likely(skb != NULL))
984 				goto ok;
985 		}
986 	}
987 	if (likely(next_event > q->now))
988 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
989 	else
990 		schedule_work(&q->work);
991 fin:
992 	return skb;
993 }
994 
995 /* reset all classes */
996 /* always caled under BH & queue lock */
997 static void htb_reset(struct Qdisc *sch)
998 {
999 	struct htb_sched *q = qdisc_priv(sch);
1000 	struct htb_class *cl;
1001 	unsigned int i;
1002 
1003 	for (i = 0; i < q->clhash.hashsize; i++) {
1004 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1005 			if (cl->level)
1006 				memset(&cl->inner, 0, sizeof(cl->inner));
1007 			else {
1008 				if (cl->leaf.q && !q->offload)
1009 					qdisc_reset(cl->leaf.q);
1010 			}
1011 			cl->prio_activity = 0;
1012 			cl->cmode = HTB_CAN_SEND;
1013 		}
1014 	}
1015 	qdisc_watchdog_cancel(&q->watchdog);
1016 	__qdisc_reset_queue(&q->direct_queue);
1017 	memset(q->hlevel, 0, sizeof(q->hlevel));
1018 	memset(q->row_mask, 0, sizeof(q->row_mask));
1019 }
1020 
1021 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1022 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1023 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1024 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1025 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1026 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1027 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
1028 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
1029 	[TCA_HTB_OFFLOAD] = { .type = NLA_FLAG },
1030 };
1031 
1032 static void htb_work_func(struct work_struct *work)
1033 {
1034 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1035 	struct Qdisc *sch = q->watchdog.qdisc;
1036 
1037 	rcu_read_lock();
1038 	__netif_schedule(qdisc_root(sch));
1039 	rcu_read_unlock();
1040 }
1041 
1042 static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt)
1043 {
1044 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt);
1045 }
1046 
1047 static int htb_init(struct Qdisc *sch, struct nlattr *opt,
1048 		    struct netlink_ext_ack *extack)
1049 {
1050 	struct net_device *dev = qdisc_dev(sch);
1051 	struct tc_htb_qopt_offload offload_opt;
1052 	struct htb_sched *q = qdisc_priv(sch);
1053 	struct nlattr *tb[TCA_HTB_MAX + 1];
1054 	struct tc_htb_glob *gopt;
1055 	unsigned int ntx;
1056 	bool offload;
1057 	int err;
1058 
1059 	qdisc_watchdog_init(&q->watchdog, sch);
1060 	INIT_WORK(&q->work, htb_work_func);
1061 
1062 	if (!opt)
1063 		return -EINVAL;
1064 
1065 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
1066 	if (err)
1067 		return err;
1068 
1069 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1070 					  NULL);
1071 	if (err < 0)
1072 		return err;
1073 
1074 	if (!tb[TCA_HTB_INIT])
1075 		return -EINVAL;
1076 
1077 	gopt = nla_data(tb[TCA_HTB_INIT]);
1078 	if (gopt->version != HTB_VER >> 16)
1079 		return -EINVAL;
1080 
1081 	offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
1082 
1083 	if (offload) {
1084 		if (sch->parent != TC_H_ROOT) {
1085 			NL_SET_ERR_MSG(extack, "HTB must be the root qdisc to use offload");
1086 			return -EOPNOTSUPP;
1087 		}
1088 
1089 		if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) {
1090 			NL_SET_ERR_MSG(extack, "hw-tc-offload ethtool feature flag must be on");
1091 			return -EOPNOTSUPP;
1092 		}
1093 
1094 		q->num_direct_qdiscs = dev->real_num_tx_queues;
1095 		q->direct_qdiscs = kcalloc(q->num_direct_qdiscs,
1096 					   sizeof(*q->direct_qdiscs),
1097 					   GFP_KERNEL);
1098 		if (!q->direct_qdiscs)
1099 			return -ENOMEM;
1100 	}
1101 
1102 	err = qdisc_class_hash_init(&q->clhash);
1103 	if (err < 0)
1104 		return err;
1105 
1106 	if (tb[TCA_HTB_DIRECT_QLEN])
1107 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1108 	else
1109 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1110 
1111 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1112 		q->rate2quantum = 1;
1113 	q->defcls = gopt->defcls;
1114 
1115 	if (!offload)
1116 		return 0;
1117 
1118 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1119 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1120 		struct Qdisc *qdisc;
1121 
1122 		qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1123 					  TC_H_MAKE(sch->handle, 0), extack);
1124 		if (!qdisc) {
1125 			return -ENOMEM;
1126 		}
1127 
1128 		q->direct_qdiscs[ntx] = qdisc;
1129 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1130 	}
1131 
1132 	sch->flags |= TCQ_F_MQROOT;
1133 
1134 	offload_opt = (struct tc_htb_qopt_offload) {
1135 		.command = TC_HTB_CREATE,
1136 		.parent_classid = TC_H_MAJ(sch->handle) >> 16,
1137 		.classid = TC_H_MIN(q->defcls),
1138 		.extack = extack,
1139 	};
1140 	err = htb_offload(dev, &offload_opt);
1141 	if (err)
1142 		return err;
1143 
1144 	/* Defer this assignment, so that htb_destroy skips offload-related
1145 	 * parts (especially calling ndo_setup_tc) on errors.
1146 	 */
1147 	q->offload = true;
1148 
1149 	return 0;
1150 }
1151 
1152 static void htb_attach_offload(struct Qdisc *sch)
1153 {
1154 	struct net_device *dev = qdisc_dev(sch);
1155 	struct htb_sched *q = qdisc_priv(sch);
1156 	unsigned int ntx;
1157 
1158 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1159 		struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx];
1160 
1161 		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
1162 		qdisc_put(old);
1163 		qdisc_hash_add(qdisc, false);
1164 	}
1165 	for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) {
1166 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1167 		struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL);
1168 
1169 		qdisc_put(old);
1170 	}
1171 
1172 	kfree(q->direct_qdiscs);
1173 	q->direct_qdiscs = NULL;
1174 }
1175 
1176 static void htb_attach_software(struct Qdisc *sch)
1177 {
1178 	struct net_device *dev = qdisc_dev(sch);
1179 	unsigned int ntx;
1180 
1181 	/* Resemble qdisc_graft behavior. */
1182 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
1183 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1184 		struct Qdisc *old = dev_graft_qdisc(dev_queue, sch);
1185 
1186 		qdisc_refcount_inc(sch);
1187 
1188 		qdisc_put(old);
1189 	}
1190 }
1191 
1192 static void htb_attach(struct Qdisc *sch)
1193 {
1194 	struct htb_sched *q = qdisc_priv(sch);
1195 
1196 	if (q->offload)
1197 		htb_attach_offload(sch);
1198 	else
1199 		htb_attach_software(sch);
1200 }
1201 
1202 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1203 {
1204 	struct htb_sched *q = qdisc_priv(sch);
1205 	struct nlattr *nest;
1206 	struct tc_htb_glob gopt;
1207 
1208 	if (q->offload)
1209 		sch->flags |= TCQ_F_OFFLOADED;
1210 	else
1211 		sch->flags &= ~TCQ_F_OFFLOADED;
1212 
1213 	sch->qstats.overlimits = q->overlimits;
1214 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1215 	 * no change can happen on the qdisc parameters.
1216 	 */
1217 
1218 	gopt.direct_pkts = q->direct_pkts;
1219 	gopt.version = HTB_VER;
1220 	gopt.rate2quantum = q->rate2quantum;
1221 	gopt.defcls = q->defcls;
1222 	gopt.debug = 0;
1223 
1224 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1225 	if (nest == NULL)
1226 		goto nla_put_failure;
1227 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1228 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1229 		goto nla_put_failure;
1230 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1231 		goto nla_put_failure;
1232 
1233 	return nla_nest_end(skb, nest);
1234 
1235 nla_put_failure:
1236 	nla_nest_cancel(skb, nest);
1237 	return -1;
1238 }
1239 
1240 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1241 			  struct sk_buff *skb, struct tcmsg *tcm)
1242 {
1243 	struct htb_class *cl = (struct htb_class *)arg;
1244 	struct htb_sched *q = qdisc_priv(sch);
1245 	struct nlattr *nest;
1246 	struct tc_htb_opt opt;
1247 
1248 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1249 	 * no change can happen on the class parameters.
1250 	 */
1251 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1252 	tcm->tcm_handle = cl->common.classid;
1253 	if (!cl->level && cl->leaf.q)
1254 		tcm->tcm_info = cl->leaf.q->handle;
1255 
1256 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1257 	if (nest == NULL)
1258 		goto nla_put_failure;
1259 
1260 	memset(&opt, 0, sizeof(opt));
1261 
1262 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1263 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1264 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1265 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1266 	opt.quantum = cl->quantum;
1267 	opt.prio = cl->prio;
1268 	opt.level = cl->level;
1269 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1270 		goto nla_put_failure;
1271 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1272 		goto nla_put_failure;
1273 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1274 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1275 			      TCA_HTB_PAD))
1276 		goto nla_put_failure;
1277 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1278 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1279 			      TCA_HTB_PAD))
1280 		goto nla_put_failure;
1281 
1282 	return nla_nest_end(skb, nest);
1283 
1284 nla_put_failure:
1285 	nla_nest_cancel(skb, nest);
1286 	return -1;
1287 }
1288 
1289 static void htb_offload_aggregate_stats(struct htb_sched *q,
1290 					struct htb_class *cl)
1291 {
1292 	u64 bytes = 0, packets = 0;
1293 	struct htb_class *c;
1294 	unsigned int i;
1295 
1296 	gnet_stats_basic_sync_init(&cl->bstats);
1297 
1298 	for (i = 0; i < q->clhash.hashsize; i++) {
1299 		hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) {
1300 			struct htb_class *p = c;
1301 
1302 			while (p && p->level < cl->level)
1303 				p = p->parent;
1304 
1305 			if (p != cl)
1306 				continue;
1307 
1308 			bytes += u64_stats_read(&c->bstats_bias.bytes);
1309 			packets += u64_stats_read(&c->bstats_bias.packets);
1310 			if (c->level == 0) {
1311 				bytes += u64_stats_read(&c->leaf.q->bstats.bytes);
1312 				packets += u64_stats_read(&c->leaf.q->bstats.packets);
1313 			}
1314 		}
1315 	}
1316 	_bstats_update(&cl->bstats, bytes, packets);
1317 }
1318 
1319 static int
1320 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1321 {
1322 	struct htb_class *cl = (struct htb_class *)arg;
1323 	struct htb_sched *q = qdisc_priv(sch);
1324 	struct gnet_stats_queue qs = {
1325 		.drops = cl->drops,
1326 		.overlimits = cl->overlimits,
1327 	};
1328 	__u32 qlen = 0;
1329 
1330 	if (!cl->level && cl->leaf.q)
1331 		qdisc_qstats_qlen_backlog(cl->leaf.q, &qlen, &qs.backlog);
1332 
1333 	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1334 				    INT_MIN, INT_MAX);
1335 	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1336 				     INT_MIN, INT_MAX);
1337 
1338 	if (q->offload) {
1339 		if (!cl->level) {
1340 			if (cl->leaf.q)
1341 				cl->bstats = cl->leaf.q->bstats;
1342 			else
1343 				gnet_stats_basic_sync_init(&cl->bstats);
1344 			_bstats_update(&cl->bstats,
1345 				       u64_stats_read(&cl->bstats_bias.bytes),
1346 				       u64_stats_read(&cl->bstats_bias.packets));
1347 		} else {
1348 			htb_offload_aggregate_stats(q, cl);
1349 		}
1350 	}
1351 
1352 	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1353 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1354 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1355 		return -1;
1356 
1357 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1358 }
1359 
1360 static struct netdev_queue *
1361 htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
1362 {
1363 	struct net_device *dev = qdisc_dev(sch);
1364 	struct tc_htb_qopt_offload offload_opt;
1365 	struct htb_sched *q = qdisc_priv(sch);
1366 	int err;
1367 
1368 	if (!q->offload)
1369 		return sch->dev_queue;
1370 
1371 	offload_opt = (struct tc_htb_qopt_offload) {
1372 		.command = TC_HTB_LEAF_QUERY_QUEUE,
1373 		.classid = TC_H_MIN(tcm->tcm_parent),
1374 	};
1375 	err = htb_offload(dev, &offload_opt);
1376 	if (err || offload_opt.qid >= dev->num_tx_queues)
1377 		return NULL;
1378 	return netdev_get_tx_queue(dev, offload_opt.qid);
1379 }
1380 
1381 static struct Qdisc *
1382 htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q)
1383 {
1384 	struct net_device *dev = dev_queue->dev;
1385 	struct Qdisc *old_q;
1386 
1387 	if (dev->flags & IFF_UP)
1388 		dev_deactivate(dev);
1389 	old_q = dev_graft_qdisc(dev_queue, new_q);
1390 	if (new_q)
1391 		new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1392 	if (dev->flags & IFF_UP)
1393 		dev_activate(dev);
1394 
1395 	return old_q;
1396 }
1397 
1398 static struct netdev_queue *htb_offload_get_queue(struct htb_class *cl)
1399 {
1400 	struct netdev_queue *queue;
1401 
1402 	queue = cl->leaf.offload_queue;
1403 	if (!(cl->leaf.q->flags & TCQ_F_BUILTIN))
1404 		WARN_ON(cl->leaf.q->dev_queue != queue);
1405 
1406 	return queue;
1407 }
1408 
1409 static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old,
1410 				   struct htb_class *cl_new, bool destroying)
1411 {
1412 	struct netdev_queue *queue_old, *queue_new;
1413 	struct net_device *dev = qdisc_dev(sch);
1414 
1415 	queue_old = htb_offload_get_queue(cl_old);
1416 	queue_new = htb_offload_get_queue(cl_new);
1417 
1418 	if (!destroying) {
1419 		struct Qdisc *qdisc;
1420 
1421 		if (dev->flags & IFF_UP)
1422 			dev_deactivate(dev);
1423 		qdisc = dev_graft_qdisc(queue_old, NULL);
1424 		WARN_ON(qdisc != cl_old->leaf.q);
1425 	}
1426 
1427 	if (!(cl_old->leaf.q->flags & TCQ_F_BUILTIN))
1428 		cl_old->leaf.q->dev_queue = queue_new;
1429 	cl_old->leaf.offload_queue = queue_new;
1430 
1431 	if (!destroying) {
1432 		struct Qdisc *qdisc;
1433 
1434 		qdisc = dev_graft_qdisc(queue_new, cl_old->leaf.q);
1435 		if (dev->flags & IFF_UP)
1436 			dev_activate(dev);
1437 		WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
1438 	}
1439 }
1440 
1441 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1442 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1443 {
1444 	struct netdev_queue *dev_queue = sch->dev_queue;
1445 	struct htb_class *cl = (struct htb_class *)arg;
1446 	struct htb_sched *q = qdisc_priv(sch);
1447 	struct Qdisc *old_q;
1448 
1449 	if (cl->level)
1450 		return -EINVAL;
1451 
1452 	if (q->offload)
1453 		dev_queue = htb_offload_get_queue(cl);
1454 
1455 	if (!new) {
1456 		new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1457 					cl->common.classid, extack);
1458 		if (!new)
1459 			return -ENOBUFS;
1460 	}
1461 
1462 	if (q->offload) {
1463 		/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1464 		qdisc_refcount_inc(new);
1465 		old_q = htb_graft_helper(dev_queue, new);
1466 	}
1467 
1468 	*old = qdisc_replace(sch, new, &cl->leaf.q);
1469 
1470 	if (q->offload) {
1471 		WARN_ON(old_q != *old);
1472 		qdisc_put(old_q);
1473 	}
1474 
1475 	return 0;
1476 }
1477 
1478 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1479 {
1480 	struct htb_class *cl = (struct htb_class *)arg;
1481 	return !cl->level ? cl->leaf.q : NULL;
1482 }
1483 
1484 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1485 {
1486 	struct htb_class *cl = (struct htb_class *)arg;
1487 
1488 	htb_deactivate(qdisc_priv(sch), cl);
1489 }
1490 
1491 static inline int htb_parent_last_child(struct htb_class *cl)
1492 {
1493 	if (!cl->parent)
1494 		/* the root class */
1495 		return 0;
1496 	if (cl->parent->children > 1)
1497 		/* not the last child */
1498 		return 0;
1499 	return 1;
1500 }
1501 
1502 static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
1503 			       struct Qdisc *new_q)
1504 {
1505 	struct htb_sched *q = qdisc_priv(sch);
1506 	struct htb_class *parent = cl->parent;
1507 
1508 	WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
1509 
1510 	if (parent->cmode != HTB_CAN_SEND)
1511 		htb_safe_rb_erase(&parent->pq_node,
1512 				  &q->hlevel[parent->level].wait_pq);
1513 
1514 	parent->level = 0;
1515 	memset(&parent->inner, 0, sizeof(parent->inner));
1516 	parent->leaf.q = new_q ? new_q : &noop_qdisc;
1517 	parent->tokens = parent->buffer;
1518 	parent->ctokens = parent->cbuffer;
1519 	parent->t_c = ktime_get_ns();
1520 	parent->cmode = HTB_CAN_SEND;
1521 	if (q->offload)
1522 		parent->leaf.offload_queue = cl->leaf.offload_queue;
1523 }
1524 
1525 static void htb_parent_to_leaf_offload(struct Qdisc *sch,
1526 				       struct netdev_queue *dev_queue,
1527 				       struct Qdisc *new_q)
1528 {
1529 	struct Qdisc *old_q;
1530 
1531 	/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1532 	if (new_q)
1533 		qdisc_refcount_inc(new_q);
1534 	old_q = htb_graft_helper(dev_queue, new_q);
1535 	WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1536 }
1537 
1538 static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
1539 				     bool last_child, bool destroying,
1540 				     struct netlink_ext_ack *extack)
1541 {
1542 	struct tc_htb_qopt_offload offload_opt;
1543 	struct netdev_queue *dev_queue;
1544 	struct Qdisc *q = cl->leaf.q;
1545 	struct Qdisc *old;
1546 	int err;
1547 
1548 	if (cl->level)
1549 		return -EINVAL;
1550 
1551 	WARN_ON(!q);
1552 	dev_queue = htb_offload_get_queue(cl);
1553 	/* When destroying, caller qdisc_graft grafts the new qdisc and invokes
1554 	 * qdisc_put for the qdisc being destroyed. htb_destroy_class_offload
1555 	 * does not need to graft or qdisc_put the qdisc being destroyed.
1556 	 */
1557 	if (!destroying) {
1558 		old = htb_graft_helper(dev_queue, NULL);
1559 		/* Last qdisc grafted should be the same as cl->leaf.q when
1560 		 * calling htb_delete.
1561 		 */
1562 		WARN_ON(old != q);
1563 	}
1564 
1565 	if (cl->parent) {
1566 		_bstats_update(&cl->parent->bstats_bias,
1567 			       u64_stats_read(&q->bstats.bytes),
1568 			       u64_stats_read(&q->bstats.packets));
1569 	}
1570 
1571 	offload_opt = (struct tc_htb_qopt_offload) {
1572 		.command = !last_child ? TC_HTB_LEAF_DEL :
1573 			   destroying ? TC_HTB_LEAF_DEL_LAST_FORCE :
1574 			   TC_HTB_LEAF_DEL_LAST,
1575 		.classid = cl->common.classid,
1576 		.extack = extack,
1577 	};
1578 	err = htb_offload(qdisc_dev(sch), &offload_opt);
1579 
1580 	if (!destroying) {
1581 		if (!err)
1582 			qdisc_put(old);
1583 		else
1584 			htb_graft_helper(dev_queue, old);
1585 	}
1586 
1587 	if (last_child)
1588 		return err;
1589 
1590 	if (!err && offload_opt.classid != TC_H_MIN(cl->common.classid)) {
1591 		u32 classid = TC_H_MAJ(sch->handle) |
1592 			      TC_H_MIN(offload_opt.classid);
1593 		struct htb_class *moved_cl = htb_find(classid, sch);
1594 
1595 		htb_offload_move_qdisc(sch, moved_cl, cl, destroying);
1596 	}
1597 
1598 	return err;
1599 }
1600 
1601 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1602 {
1603 	if (!cl->level) {
1604 		WARN_ON(!cl->leaf.q);
1605 		qdisc_put(cl->leaf.q);
1606 	}
1607 	gen_kill_estimator(&cl->rate_est);
1608 	tcf_block_put(cl->block);
1609 	kfree(cl);
1610 }
1611 
1612 static void htb_destroy(struct Qdisc *sch)
1613 {
1614 	struct net_device *dev = qdisc_dev(sch);
1615 	struct tc_htb_qopt_offload offload_opt;
1616 	struct htb_sched *q = qdisc_priv(sch);
1617 	struct hlist_node *next;
1618 	bool nonempty, changed;
1619 	struct htb_class *cl;
1620 	unsigned int i;
1621 
1622 	cancel_work_sync(&q->work);
1623 	qdisc_watchdog_cancel(&q->watchdog);
1624 	/* This line used to be after htb_destroy_class call below
1625 	 * and surprisingly it worked in 2.4. But it must precede it
1626 	 * because filter need its target class alive to be able to call
1627 	 * unbind_filter on it (without Oops).
1628 	 */
1629 	tcf_block_put(q->block);
1630 
1631 	for (i = 0; i < q->clhash.hashsize; i++) {
1632 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1633 			tcf_block_put(cl->block);
1634 			cl->block = NULL;
1635 		}
1636 	}
1637 
1638 	do {
1639 		nonempty = false;
1640 		changed = false;
1641 		for (i = 0; i < q->clhash.hashsize; i++) {
1642 			hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1643 						  common.hnode) {
1644 				bool last_child;
1645 
1646 				if (!q->offload) {
1647 					htb_destroy_class(sch, cl);
1648 					continue;
1649 				}
1650 
1651 				nonempty = true;
1652 
1653 				if (cl->level)
1654 					continue;
1655 
1656 				changed = true;
1657 
1658 				last_child = htb_parent_last_child(cl);
1659 				htb_destroy_class_offload(sch, cl, last_child,
1660 							  true, NULL);
1661 				qdisc_class_hash_remove(&q->clhash,
1662 							&cl->common);
1663 				if (cl->parent)
1664 					cl->parent->children--;
1665 				if (last_child)
1666 					htb_parent_to_leaf(sch, cl, NULL);
1667 				htb_destroy_class(sch, cl);
1668 			}
1669 		}
1670 	} while (changed);
1671 	WARN_ON(nonempty);
1672 
1673 	qdisc_class_hash_destroy(&q->clhash);
1674 	__qdisc_reset_queue(&q->direct_queue);
1675 
1676 	if (q->offload) {
1677 		offload_opt = (struct tc_htb_qopt_offload) {
1678 			.command = TC_HTB_DESTROY,
1679 		};
1680 		htb_offload(dev, &offload_opt);
1681 	}
1682 
1683 	if (!q->direct_qdiscs)
1684 		return;
1685 	for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++)
1686 		qdisc_put(q->direct_qdiscs[i]);
1687 	kfree(q->direct_qdiscs);
1688 }
1689 
1690 static int htb_delete(struct Qdisc *sch, unsigned long arg,
1691 		      struct netlink_ext_ack *extack)
1692 {
1693 	struct htb_sched *q = qdisc_priv(sch);
1694 	struct htb_class *cl = (struct htb_class *)arg;
1695 	struct Qdisc *new_q = NULL;
1696 	int last_child = 0;
1697 	int err;
1698 
1699 	/* TODO: why don't allow to delete subtree ? references ? does
1700 	 * tc subsys guarantee us that in htb_destroy it holds no class
1701 	 * refs so that we can remove children safely there ?
1702 	 */
1703 	if (cl->children || qdisc_class_in_use(&cl->common)) {
1704 		NL_SET_ERR_MSG(extack, "HTB class in use");
1705 		return -EBUSY;
1706 	}
1707 
1708 	if (!cl->level && htb_parent_last_child(cl))
1709 		last_child = 1;
1710 
1711 	if (q->offload) {
1712 		err = htb_destroy_class_offload(sch, cl, last_child, false,
1713 						extack);
1714 		if (err)
1715 			return err;
1716 	}
1717 
1718 	if (last_child) {
1719 		struct netdev_queue *dev_queue = sch->dev_queue;
1720 
1721 		if (q->offload)
1722 			dev_queue = htb_offload_get_queue(cl);
1723 
1724 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1725 					  cl->parent->common.classid,
1726 					  NULL);
1727 		if (q->offload)
1728 			htb_parent_to_leaf_offload(sch, dev_queue, new_q);
1729 	}
1730 
1731 	sch_tree_lock(sch);
1732 
1733 	if (!cl->level)
1734 		qdisc_purge_queue(cl->leaf.q);
1735 
1736 	/* delete from hash and active; remainder in destroy_class */
1737 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1738 	if (cl->parent)
1739 		cl->parent->children--;
1740 
1741 	if (cl->prio_activity)
1742 		htb_deactivate(q, cl);
1743 
1744 	if (cl->cmode != HTB_CAN_SEND)
1745 		htb_safe_rb_erase(&cl->pq_node,
1746 				  &q->hlevel[cl->level].wait_pq);
1747 
1748 	if (last_child)
1749 		htb_parent_to_leaf(sch, cl, new_q);
1750 
1751 	sch_tree_unlock(sch);
1752 
1753 	htb_destroy_class(sch, cl);
1754 	return 0;
1755 }
1756 
1757 static int htb_change_class(struct Qdisc *sch, u32 classid,
1758 			    u32 parentid, struct nlattr **tca,
1759 			    unsigned long *arg, struct netlink_ext_ack *extack)
1760 {
1761 	int err = -EINVAL;
1762 	struct htb_sched *q = qdisc_priv(sch);
1763 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1764 	struct tc_htb_qopt_offload offload_opt;
1765 	struct nlattr *opt = tca[TCA_OPTIONS];
1766 	struct nlattr *tb[TCA_HTB_MAX + 1];
1767 	struct Qdisc *parent_qdisc = NULL;
1768 	struct netdev_queue *dev_queue;
1769 	struct tc_htb_opt *hopt;
1770 	u64 rate64, ceil64;
1771 	int warn = 0;
1772 
1773 	/* extract all subattrs from opt attr */
1774 	if (!opt)
1775 		goto failure;
1776 
1777 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1778 					  extack);
1779 	if (err < 0)
1780 		goto failure;
1781 
1782 	err = -EINVAL;
1783 	if (tb[TCA_HTB_PARMS] == NULL)
1784 		goto failure;
1785 
1786 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1787 
1788 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1789 	if (!hopt->rate.rate || !hopt->ceil.rate)
1790 		goto failure;
1791 
1792 	if (q->offload) {
1793 		/* Options not supported by the offload. */
1794 		if (hopt->rate.overhead || hopt->ceil.overhead) {
1795 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the overhead parameter");
1796 			goto failure;
1797 		}
1798 		if (hopt->rate.mpu || hopt->ceil.mpu) {
1799 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter");
1800 			goto failure;
1801 		}
1802 	}
1803 
1804 	/* Keeping backward compatible with rate_table based iproute2 tc */
1805 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1806 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
1807 					      NULL));
1808 
1809 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1810 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
1811 					      NULL));
1812 
1813 	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1814 	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1815 
1816 	if (!cl) {		/* new class */
1817 		struct net_device *dev = qdisc_dev(sch);
1818 		struct Qdisc *new_q, *old_q;
1819 		int prio;
1820 		struct {
1821 			struct nlattr		nla;
1822 			struct gnet_estimator	opt;
1823 		} est = {
1824 			.nla = {
1825 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1826 				.nla_type	= TCA_RATE,
1827 			},
1828 			.opt = {
1829 				/* 4s interval, 16s averaging constant */
1830 				.interval	= 2,
1831 				.ewma_log	= 2,
1832 			},
1833 		};
1834 
1835 		/* check for valid classid */
1836 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1837 		    htb_find(classid, sch))
1838 			goto failure;
1839 
1840 		/* check maximal depth */
1841 		if (parent && parent->parent && parent->parent->level < 2) {
1842 			NL_SET_ERR_MSG_MOD(extack, "tree is too deep");
1843 			goto failure;
1844 		}
1845 		err = -ENOBUFS;
1846 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1847 		if (!cl)
1848 			goto failure;
1849 
1850 		gnet_stats_basic_sync_init(&cl->bstats);
1851 		gnet_stats_basic_sync_init(&cl->bstats_bias);
1852 
1853 		err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1854 		if (err) {
1855 			kfree(cl);
1856 			goto failure;
1857 		}
1858 		if (htb_rate_est || tca[TCA_RATE]) {
1859 			err = gen_new_estimator(&cl->bstats, NULL,
1860 						&cl->rate_est,
1861 						NULL,
1862 						true,
1863 						tca[TCA_RATE] ? : &est.nla);
1864 			if (err)
1865 				goto err_block_put;
1866 		}
1867 
1868 		cl->children = 0;
1869 		RB_CLEAR_NODE(&cl->pq_node);
1870 
1871 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1872 			RB_CLEAR_NODE(&cl->node[prio]);
1873 
1874 		cl->common.classid = classid;
1875 
1876 		/* Make sure nothing interrupts us in between of two
1877 		 * ndo_setup_tc calls.
1878 		 */
1879 		ASSERT_RTNL();
1880 
1881 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1882 		 * so that can't be used inside of sch_tree_lock
1883 		 * -- thanks to Karlis Peisenieks
1884 		 */
1885 		if (!q->offload) {
1886 			dev_queue = sch->dev_queue;
1887 		} else if (!(parent && !parent->level)) {
1888 			/* Assign a dev_queue to this classid. */
1889 			offload_opt = (struct tc_htb_qopt_offload) {
1890 				.command = TC_HTB_LEAF_ALLOC_QUEUE,
1891 				.classid = cl->common.classid,
1892 				.parent_classid = parent ?
1893 					TC_H_MIN(parent->common.classid) :
1894 					TC_HTB_CLASSID_ROOT,
1895 				.rate = max_t(u64, hopt->rate.rate, rate64),
1896 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1897 				.prio = hopt->prio,
1898 				.quantum = hopt->quantum,
1899 				.extack = extack,
1900 			};
1901 			err = htb_offload(dev, &offload_opt);
1902 			if (err) {
1903 				NL_SET_ERR_MSG_WEAK(extack,
1904 						    "Failed to offload TC_HTB_LEAF_ALLOC_QUEUE");
1905 				goto err_kill_estimator;
1906 			}
1907 			dev_queue = netdev_get_tx_queue(dev, offload_opt.qid);
1908 		} else { /* First child. */
1909 			dev_queue = htb_offload_get_queue(parent);
1910 			old_q = htb_graft_helper(dev_queue, NULL);
1911 			WARN_ON(old_q != parent->leaf.q);
1912 			offload_opt = (struct tc_htb_qopt_offload) {
1913 				.command = TC_HTB_LEAF_TO_INNER,
1914 				.classid = cl->common.classid,
1915 				.parent_classid =
1916 					TC_H_MIN(parent->common.classid),
1917 				.rate = max_t(u64, hopt->rate.rate, rate64),
1918 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1919 				.prio = hopt->prio,
1920 				.quantum = hopt->quantum,
1921 				.extack = extack,
1922 			};
1923 			err = htb_offload(dev, &offload_opt);
1924 			if (err) {
1925 				NL_SET_ERR_MSG_WEAK(extack,
1926 						    "Failed to offload TC_HTB_LEAF_TO_INNER");
1927 				htb_graft_helper(dev_queue, old_q);
1928 				goto err_kill_estimator;
1929 			}
1930 			_bstats_update(&parent->bstats_bias,
1931 				       u64_stats_read(&old_q->bstats.bytes),
1932 				       u64_stats_read(&old_q->bstats.packets));
1933 			qdisc_put(old_q);
1934 		}
1935 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1936 					  classid, NULL);
1937 		if (q->offload) {
1938 			/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1939 			if (new_q)
1940 				qdisc_refcount_inc(new_q);
1941 			old_q = htb_graft_helper(dev_queue, new_q);
1942 			/* No qdisc_put needed. */
1943 			WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1944 		}
1945 		sch_tree_lock(sch);
1946 		if (parent && !parent->level) {
1947 			/* turn parent into inner node */
1948 			qdisc_purge_queue(parent->leaf.q);
1949 			parent_qdisc = parent->leaf.q;
1950 			if (parent->prio_activity)
1951 				htb_deactivate(q, parent);
1952 
1953 			/* remove from evt list because of level change */
1954 			if (parent->cmode != HTB_CAN_SEND) {
1955 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1956 				parent->cmode = HTB_CAN_SEND;
1957 			}
1958 			parent->level = (parent->parent ? parent->parent->level
1959 					 : TC_HTB_MAXDEPTH) - 1;
1960 			memset(&parent->inner, 0, sizeof(parent->inner));
1961 		}
1962 
1963 		/* leaf (we) needs elementary qdisc */
1964 		cl->leaf.q = new_q ? new_q : &noop_qdisc;
1965 		if (q->offload)
1966 			cl->leaf.offload_queue = dev_queue;
1967 
1968 		cl->parent = parent;
1969 
1970 		/* set class to be in HTB_CAN_SEND state */
1971 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1972 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1973 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1974 		cl->t_c = ktime_get_ns();
1975 		cl->cmode = HTB_CAN_SEND;
1976 
1977 		/* attach to the hash list and parent's family */
1978 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1979 		if (parent)
1980 			parent->children++;
1981 		if (cl->leaf.q != &noop_qdisc)
1982 			qdisc_hash_add(cl->leaf.q, true);
1983 	} else {
1984 		if (tca[TCA_RATE]) {
1985 			err = gen_replace_estimator(&cl->bstats, NULL,
1986 						    &cl->rate_est,
1987 						    NULL,
1988 						    true,
1989 						    tca[TCA_RATE]);
1990 			if (err)
1991 				return err;
1992 		}
1993 
1994 		if (q->offload) {
1995 			struct net_device *dev = qdisc_dev(sch);
1996 
1997 			offload_opt = (struct tc_htb_qopt_offload) {
1998 				.command = TC_HTB_NODE_MODIFY,
1999 				.classid = cl->common.classid,
2000 				.rate = max_t(u64, hopt->rate.rate, rate64),
2001 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
2002 				.prio = hopt->prio,
2003 				.quantum = hopt->quantum,
2004 				.extack = extack,
2005 			};
2006 			err = htb_offload(dev, &offload_opt);
2007 			if (err)
2008 				/* Estimator was replaced, and rollback may fail
2009 				 * as well, so we don't try to recover it, and
2010 				 * the estimator won't work property with the
2011 				 * offload anyway, because bstats are updated
2012 				 * only when the stats are queried.
2013 				 */
2014 				return err;
2015 		}
2016 
2017 		sch_tree_lock(sch);
2018 	}
2019 
2020 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
2021 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
2022 
2023 	/* it used to be a nasty bug here, we have to check that node
2024 	 * is really leaf before changing cl->leaf !
2025 	 */
2026 	if (!cl->level) {
2027 		u64 quantum = cl->rate.rate_bytes_ps;
2028 
2029 		do_div(quantum, q->rate2quantum);
2030 		cl->quantum = min_t(u64, quantum, INT_MAX);
2031 
2032 		if (!hopt->quantum && cl->quantum < 1000) {
2033 			warn = -1;
2034 			cl->quantum = 1000;
2035 		}
2036 		if (!hopt->quantum && cl->quantum > 200000) {
2037 			warn = 1;
2038 			cl->quantum = 200000;
2039 		}
2040 		if (hopt->quantum)
2041 			cl->quantum = hopt->quantum;
2042 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
2043 			cl->prio = TC_HTB_NUMPRIO - 1;
2044 	}
2045 
2046 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
2047 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
2048 
2049 	sch_tree_unlock(sch);
2050 	qdisc_put(parent_qdisc);
2051 
2052 	if (warn)
2053 		NL_SET_ERR_MSG_FMT_MOD(extack,
2054 				       "quantum of class %X is %s. Consider r2q change.",
2055 				       cl->common.classid, (warn == -1 ? "small" : "big"));
2056 
2057 	qdisc_class_hash_grow(sch, &q->clhash);
2058 
2059 	*arg = (unsigned long)cl;
2060 	return 0;
2061 
2062 err_kill_estimator:
2063 	gen_kill_estimator(&cl->rate_est);
2064 err_block_put:
2065 	tcf_block_put(cl->block);
2066 	kfree(cl);
2067 failure:
2068 	return err;
2069 }
2070 
2071 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg,
2072 				       struct netlink_ext_ack *extack)
2073 {
2074 	struct htb_sched *q = qdisc_priv(sch);
2075 	struct htb_class *cl = (struct htb_class *)arg;
2076 
2077 	return cl ? cl->block : q->block;
2078 }
2079 
2080 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
2081 				     u32 classid)
2082 {
2083 	struct htb_class *cl = htb_find(classid, sch);
2084 
2085 	/*if (cl && !cl->level) return 0;
2086 	 * The line above used to be there to prevent attaching filters to
2087 	 * leaves. But at least tc_index filter uses this just to get class
2088 	 * for other reasons so that we have to allow for it.
2089 	 * ----
2090 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
2091 	 * another way to "lock" the class - unlike "get" this lock can
2092 	 * be broken by class during destroy IIUC.
2093 	 */
2094 	if (cl)
2095 		qdisc_class_get(&cl->common);
2096 	return (unsigned long)cl;
2097 }
2098 
2099 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
2100 {
2101 	struct htb_class *cl = (struct htb_class *)arg;
2102 
2103 	qdisc_class_put(&cl->common);
2104 }
2105 
2106 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2107 {
2108 	struct htb_sched *q = qdisc_priv(sch);
2109 	struct htb_class *cl;
2110 	unsigned int i;
2111 
2112 	if (arg->stop)
2113 		return;
2114 
2115 	for (i = 0; i < q->clhash.hashsize; i++) {
2116 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
2117 			if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
2118 				return;
2119 		}
2120 	}
2121 }
2122 
2123 static const struct Qdisc_class_ops htb_class_ops = {
2124 	.select_queue	=	htb_select_queue,
2125 	.graft		=	htb_graft,
2126 	.leaf		=	htb_leaf,
2127 	.qlen_notify	=	htb_qlen_notify,
2128 	.find		=	htb_search,
2129 	.change		=	htb_change_class,
2130 	.delete		=	htb_delete,
2131 	.walk		=	htb_walk,
2132 	.tcf_block	=	htb_tcf_block,
2133 	.bind_tcf	=	htb_bind_filter,
2134 	.unbind_tcf	=	htb_unbind_filter,
2135 	.dump		=	htb_dump_class,
2136 	.dump_stats	=	htb_dump_class_stats,
2137 };
2138 
2139 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
2140 	.cl_ops		=	&htb_class_ops,
2141 	.id		=	"htb",
2142 	.priv_size	=	sizeof(struct htb_sched),
2143 	.enqueue	=	htb_enqueue,
2144 	.dequeue	=	htb_dequeue,
2145 	.peek		=	qdisc_peek_dequeued,
2146 	.init		=	htb_init,
2147 	.attach		=	htb_attach,
2148 	.reset		=	htb_reset,
2149 	.destroy	=	htb_destroy,
2150 	.dump		=	htb_dump,
2151 	.owner		=	THIS_MODULE,
2152 };
2153 MODULE_ALIAS_NET_SCH("htb");
2154 
2155 static int __init htb_module_init(void)
2156 {
2157 	return register_qdisc(&htb_qdisc_ops);
2158 }
2159 static void __exit htb_module_exit(void)
2160 {
2161 	unregister_qdisc(&htb_qdisc_ops);
2162 }
2163 
2164 module_init(htb_module_init)
2165 module_exit(htb_module_exit)
2166 MODULE_LICENSE("GPL");
2167 MODULE_DESCRIPTION("Hierarchical Token Bucket scheduler");
2168