xref: /linux/net/sched/sch_htb.c (revision c4bbe83d27c2446a033cc0381c3fb6be5e8c41c7)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
4  *
5  * Authors:	Martin Devera, <devik@cdi.cz>
6  *
7  * Credits (in time order) for older HTB versions:
8  *              Stef Coene <stef.coene@docum.org>
9  *			HTB support at LARTC mailing list
10  *		Ondrej Kraus, <krauso@barr.cz>
11  *			found missing INIT_QDISC(htb)
12  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
13  *			helped a lot to locate nasty class stall bug
14  *		Andi Kleen, Jamal Hadi, Bert Hubert
15  *			code review and helpful comments on shaping
16  *		Tomasz Wrona, <tw@eter.tym.pl>
17  *			created test case so that I was able to fix nasty bug
18  *		Wilfried Weissmann
19  *			spotted bug in dequeue code and helped with fix
20  *		Jiri Fojtasek
21  *			fixed requeue routine
22  *		and many others. thanks.
23  */
24 #include <linux/module.h>
25 #include <linux/moduleparam.h>
26 #include <linux/types.h>
27 #include <linux/kernel.h>
28 #include <linux/string.h>
29 #include <linux/errno.h>
30 #include <linux/skbuff.h>
31 #include <linux/list.h>
32 #include <linux/compiler.h>
33 #include <linux/rbtree.h>
34 #include <linux/workqueue.h>
35 #include <linux/slab.h>
36 #include <net/netlink.h>
37 #include <net/sch_generic.h>
38 #include <net/pkt_sched.h>
39 #include <net/pkt_cls.h>
40 
41 /* HTB algorithm.
42     Author: devik@cdi.cz
43     ========================================================================
44     HTB is like TBF with multiple classes. It is also similar to CBQ because
45     it allows to assign priority to each class in hierarchy.
46     In fact it is another implementation of Floyd's formal sharing.
47 
48     Levels:
49     Each class is assigned level. Leaf has ALWAYS level 0 and root
50     classes have level TC_HTB_MAXDEPTH-1. Interior nodes have level
51     one less than their parent.
52 */
53 
54 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
55 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
56 
57 #if HTB_VER >> 16 != TC_HTB_PROTOVER
58 #error "Mismatched sch_htb.c and pkt_sch.h"
59 #endif
60 
61 /* Module parameter and sysfs export */
62 module_param    (htb_hysteresis, int, 0640);
63 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
64 
65 static int htb_rate_est = 0; /* htb classes have a default rate estimator */
66 module_param(htb_rate_est, int, 0640);
67 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
68 
/* Used internally to keep the status of a single class. */
enum htb_cmode {
	HTB_CANT_SEND = 0,	/* class can't send and can't borrow */
	HTB_MAY_BORROW = 1,	/* class can't send but may borrow */
	HTB_CAN_SEND = 2	/* class can send */
};
75 
76 struct htb_prio {
77 	union {
78 		struct rb_root	row;
79 		struct rb_root	feed;
80 	};
81 	struct rb_node	*ptr;
82 	/* When class changes from state 1->2 and disconnects from
83 	 * parent's feed then we lost ptr value and start from the
84 	 * first child again. Here we store classid of the
85 	 * last valid ptr (used when ptr is NULL).
86 	 */
87 	u32		last_ptr_id;
88 };
89 
90 /* interior & leaf nodes; props specific to leaves are marked L:
91  * To reduce false sharing, place mostly read fields at beginning,
92  * and mostly written ones at the end.
93  */
94 struct htb_class {
95 	struct Qdisc_class_common common;
96 	struct psched_ratecfg	rate;
97 	struct psched_ratecfg	ceil;
98 	s64			buffer, cbuffer;/* token bucket depth/rate */
99 	s64			mbuffer;	/* max wait time */
100 	u32			prio;		/* these two are used only by leaves... */
101 	int			quantum;	/* but stored for parent-to-leaf return */
102 
103 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
104 	struct tcf_block	*block;
105 
106 	int			level;		/* our level (see above) */
107 	unsigned int		children;
108 	struct htb_class	*parent;	/* parent class */
109 
110 	struct net_rate_estimator __rcu *rate_est;
111 
112 	/*
113 	 * Written often fields
114 	 */
115 	struct gnet_stats_basic_sync bstats;
116 	struct gnet_stats_basic_sync bstats_bias;
117 	struct tc_htb_xstats	xstats;	/* our special stats */
118 
119 	/* token bucket parameters */
120 	s64			tokens, ctokens;/* current number of tokens */
121 	s64			t_c;		/* checkpoint time */
122 
123 	union {
124 		struct htb_class_leaf {
125 			int		deficit[TC_HTB_MAXDEPTH];
126 			struct Qdisc	*q;
127 			struct netdev_queue *offload_queue;
128 		} leaf;
129 		struct htb_class_inner {
130 			struct htb_prio clprio[TC_HTB_NUMPRIO];
131 		} inner;
132 	};
133 	s64			pq_key;
134 
135 	int			prio_activity;	/* for which prios are we active */
136 	enum htb_cmode		cmode;		/* current mode of the class */
137 	struct rb_node		pq_node;	/* node for event queue */
138 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
139 
140 	unsigned int drops ____cacheline_aligned_in_smp;
141 	unsigned int		overlimits;
142 };
143 
144 struct htb_level {
145 	struct rb_root	wait_pq;
146 	struct htb_prio hprio[TC_HTB_NUMPRIO];
147 };
148 
149 struct htb_sched {
150 	struct Qdisc_class_hash clhash;
151 	int			defcls;		/* class where unclassified flows go to */
152 	int			rate2quantum;	/* quant = rate / rate2quantum */
153 
154 	/* filters for qdisc itself */
155 	struct tcf_proto __rcu	*filter_list;
156 	struct tcf_block	*block;
157 
158 #define HTB_WARN_TOOMANYEVENTS	0x1
159 	unsigned int		warned;	/* only one warning */
160 	int			direct_qlen;
161 	struct work_struct	work;
162 
163 	/* non shaped skbs; let them go directly thru */
164 	struct qdisc_skb_head	direct_queue;
165 	u32			direct_pkts;
166 	u32			overlimits;
167 
168 	struct qdisc_watchdog	watchdog;
169 
170 	s64			now;	/* cached dequeue time */
171 
172 	/* time of nearest event per level (row) */
173 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
174 
175 	int			row_mask[TC_HTB_MAXDEPTH];
176 
177 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
178 
179 	struct Qdisc		**direct_qdiscs;
180 	unsigned int            num_direct_qdiscs;
181 
182 	bool			offload;
183 };
184 
185 /* find class in global hash table using given handle */
186 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
187 {
188 	struct htb_sched *q = qdisc_priv(sch);
189 	struct Qdisc_class_common *clc;
190 
191 	clc = qdisc_class_find(&q->clhash, handle);
192 	if (clc == NULL)
193 		return NULL;
194 	return container_of(clc, struct htb_class, common);
195 }
196 
197 static unsigned long htb_search(struct Qdisc *sch, u32 handle)
198 {
199 	return (unsigned long)htb_find(handle, sch);
200 }
201 
202 #define HTB_DIRECT ((struct htb_class *)-1L)
203 
204 /**
205  * htb_classify - classify a packet into class
206  * @skb: the socket buffer
207  * @sch: the active queue discipline
208  * @qerr: pointer for returned status code
209  *
210  * It returns NULL if the packet should be dropped or -1 if the packet
211  * should be passed directly thru. In all other cases leaf class is returned.
212  * We allow direct class selection by classid in priority. The we examine
213  * filters in qdisc and in inner nodes (if higher filter points to the inner
214  * node). If we end up with classid MAJOR:0 we enqueue the skb into special
215  * internal fifo (direct). These packets then go directly thru. If we still
216  * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
217  * then finish and return direct queue.
218  */
219 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
220 				      int *qerr)
221 {
222 	struct htb_sched *q = qdisc_priv(sch);
223 	struct htb_class *cl;
224 	struct tcf_result res;
225 	struct tcf_proto *tcf;
226 	int result;
227 
228 	/* allow to select class by setting skb->priority to valid classid;
229 	 * note that nfmark can be used too by attaching filter fw with no
230 	 * rules in it
231 	 */
232 	if (skb->priority == sch->handle)
233 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
234 	cl = htb_find(skb->priority, sch);
235 	if (cl) {
236 		if (cl->level == 0)
237 			return cl;
238 		/* Start with inner filter chain if a non-leaf class is selected */
239 		tcf = rcu_dereference_bh(cl->filter_list);
240 	} else {
241 		tcf = rcu_dereference_bh(q->filter_list);
242 	}
243 
244 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
245 	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
246 #ifdef CONFIG_NET_CLS_ACT
247 		switch (result) {
248 		case TC_ACT_QUEUED:
249 		case TC_ACT_STOLEN:
250 		case TC_ACT_TRAP:
251 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
252 			fallthrough;
253 		case TC_ACT_SHOT:
254 			return NULL;
255 		}
256 #endif
257 		cl = (void *)res.class;
258 		if (!cl) {
259 			if (res.classid == sch->handle)
260 				return HTB_DIRECT;	/* X:0 (direct flow) */
261 			cl = htb_find(res.classid, sch);
262 			if (!cl)
263 				break;	/* filter selected invalid classid */
264 		}
265 		if (!cl->level)
266 			return cl;	/* we hit leaf; return it */
267 
268 		/* we have got inner class; apply inner filter chain */
269 		tcf = rcu_dereference_bh(cl->filter_list);
270 	}
271 	/* classification failed; try to use default class */
272 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
273 	if (!cl || cl->level)
274 		return HTB_DIRECT;	/* bad default .. this is safe bet */
275 	return cl;
276 }
277 
278 /**
279  * htb_add_to_id_tree - adds class to the round robin list
280  * @root: the root of the tree
281  * @cl: the class to add
282  * @prio: the give prio in class
283  *
284  * Routine adds class to the list (actually tree) sorted by classid.
285  * Make sure that class is not already on such list for given prio.
286  */
287 static void htb_add_to_id_tree(struct rb_root *root,
288 			       struct htb_class *cl, int prio)
289 {
290 	struct rb_node **p = &root->rb_node, *parent = NULL;
291 
292 	while (*p) {
293 		struct htb_class *c;
294 		parent = *p;
295 		c = rb_entry(parent, struct htb_class, node[prio]);
296 
297 		if (cl->common.classid > c->common.classid)
298 			p = &parent->rb_right;
299 		else
300 			p = &parent->rb_left;
301 	}
302 	rb_link_node(&cl->node[prio], parent, p);
303 	rb_insert_color(&cl->node[prio], root);
304 }
305 
306 /**
307  * htb_add_to_wait_tree - adds class to the event queue with delay
308  * @q: the priority event queue
309  * @cl: the class to add
310  * @delay: delay in microseconds
311  *
312  * The class is added to priority event queue to indicate that class will
313  * change its mode in cl->pq_key microseconds. Make sure that class is not
314  * already in the queue.
315  */
316 static void htb_add_to_wait_tree(struct htb_sched *q,
317 				 struct htb_class *cl, s64 delay)
318 {
319 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
320 
321 	cl->pq_key = q->now + delay;
322 	if (cl->pq_key == q->now)
323 		cl->pq_key++;
324 
325 	/* update the nearest event cache */
326 	if (q->near_ev_cache[cl->level] > cl->pq_key)
327 		q->near_ev_cache[cl->level] = cl->pq_key;
328 
329 	while (*p) {
330 		struct htb_class *c;
331 		parent = *p;
332 		c = rb_entry(parent, struct htb_class, pq_node);
333 		if (cl->pq_key >= c->pq_key)
334 			p = &parent->rb_right;
335 		else
336 			p = &parent->rb_left;
337 	}
338 	rb_link_node(&cl->pq_node, parent, p);
339 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
340 }
341 
/**
 * htb_next_rb_node - finds next node in binary tree
 * @n: the current node in binary tree
 *
 * Advances *@n to its in-order successor; when we are past the last key
 * *@n becomes NULL. Average complexity is 2 steps per call.
 *
 * A NULL *@n is left untouched instead of being handed to rb_next(), which
 * must not be called on a NULL node. This matters because a feed pointer is
 * reset to NULL by htb_deactivate_prios() when the class it designates is
 * unlinked, and htb_dequeue_tree() advances such pointers after it has
 * called into the leaf qdisc. htb_lookup_leaf() recovers from a NULL
 * position on its own.
 */
static inline void htb_next_rb_node(struct rb_node **n)
{
	if (*n)
		*n = rb_next(*n);
}
353 
354 /**
355  * htb_add_class_to_row - add class to its row
356  * @q: the priority event queue
357  * @cl: the class to add
358  * @mask: the given priorities in class in bitmap
359  *
360  * The class is added to row at priorities marked in mask.
361  * It does nothing if mask == 0.
362  */
363 static inline void htb_add_class_to_row(struct htb_sched *q,
364 					struct htb_class *cl, int mask)
365 {
366 	q->row_mask[cl->level] |= mask;
367 	while (mask) {
368 		int prio = ffz(~mask);
369 		mask &= ~(1 << prio);
370 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
371 	}
372 }
373 
/* Erase @rb from @root and mark the node as unlinked. A node that is
 * already marked unlinked only triggers a warning: that would be a bug in
 * this code, but it need not be fatal.
 */
static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
{
	if (RB_EMPTY_NODE(rb)) {
		WARN_ON(1);
		return;
	}

	rb_erase(rb, root);
	RB_CLEAR_NODE(rb);
}
384 
385 
386 /**
387  * htb_remove_class_from_row - removes class from its row
388  * @q: the priority event queue
389  * @cl: the class to add
390  * @mask: the given priorities in class in bitmap
391  *
392  * The class is removed from row at priorities marked in mask.
393  * It does nothing if mask == 0.
394  */
395 static inline void htb_remove_class_from_row(struct htb_sched *q,
396 						 struct htb_class *cl, int mask)
397 {
398 	int m = 0;
399 	struct htb_level *hlevel = &q->hlevel[cl->level];
400 
401 	while (mask) {
402 		int prio = ffz(~mask);
403 		struct htb_prio *hprio = &hlevel->hprio[prio];
404 
405 		mask &= ~(1 << prio);
406 		if (hprio->ptr == cl->node + prio)
407 			htb_next_rb_node(&hprio->ptr);
408 
409 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
410 		if (!hprio->row.rb_node)
411 			m |= 1 << prio;
412 	}
413 	q->row_mask[cl->level] &= ~m;
414 }
415 
416 /**
417  * htb_activate_prios - creates active classe's feed chain
418  * @q: the priority event queue
419  * @cl: the class to activate
420  *
421  * The class is connected to ancestors and/or appropriate rows
422  * for priorities it is participating on. cl->cmode must be new
423  * (activated) mode. It does nothing if cl->prio_activity == 0.
424  */
425 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
426 {
427 	struct htb_class *p = cl->parent;
428 	long m, mask = cl->prio_activity;
429 
430 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
431 		m = mask;
432 		while (m) {
433 			unsigned int prio = ffz(~m);
434 
435 			if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio)))
436 				break;
437 			m &= ~(1 << prio);
438 
439 			if (p->inner.clprio[prio].feed.rb_node)
440 				/* parent already has its feed in use so that
441 				 * reset bit in mask as parent is already ok
442 				 */
443 				mask &= ~(1 << prio);
444 
445 			htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio);
446 		}
447 		p->prio_activity |= mask;
448 		cl = p;
449 		p = cl->parent;
450 
451 	}
452 	if (cl->cmode == HTB_CAN_SEND && mask)
453 		htb_add_class_to_row(q, cl, mask);
454 }
455 
456 /**
457  * htb_deactivate_prios - remove class from feed chain
458  * @q: the priority event queue
459  * @cl: the class to deactivate
460  *
461  * cl->cmode must represent old mode (before deactivation). It does
462  * nothing if cl->prio_activity == 0. Class is removed from all feed
463  * chains and rows.
464  */
465 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
466 {
467 	struct htb_class *p = cl->parent;
468 	long m, mask = cl->prio_activity;
469 
470 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
471 		m = mask;
472 		mask = 0;
473 		while (m) {
474 			int prio = ffz(~m);
475 			m &= ~(1 << prio);
476 
477 			if (p->inner.clprio[prio].ptr == cl->node + prio) {
478 				/* we are removing child which is pointed to from
479 				 * parent feed - forget the pointer but remember
480 				 * classid
481 				 */
482 				p->inner.clprio[prio].last_ptr_id = cl->common.classid;
483 				p->inner.clprio[prio].ptr = NULL;
484 			}
485 
486 			htb_safe_rb_erase(cl->node + prio,
487 					  &p->inner.clprio[prio].feed);
488 
489 			if (!p->inner.clprio[prio].feed.rb_node)
490 				mask |= 1 << prio;
491 		}
492 
493 		p->prio_activity &= ~mask;
494 		cl = p;
495 		p = cl->parent;
496 
497 	}
498 	if (cl->cmode == HTB_CAN_SEND && mask)
499 		htb_remove_class_from_row(q, cl, mask);
500 }
501 
502 static inline s64 htb_lowater(const struct htb_class *cl)
503 {
504 	if (htb_hysteresis)
505 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
506 	else
507 		return 0;
508 }
509 static inline s64 htb_hiwater(const struct htb_class *cl)
510 {
511 	if (htb_hysteresis)
512 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
513 	else
514 		return 0;
515 }
516 
517 
518 /**
519  * htb_class_mode - computes and returns current class mode
520  * @cl: the target class
521  * @diff: diff time in microseconds
522  *
523  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
524  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
525  * from now to time when cl will change its state.
526  * Also it is worth to note that class mode doesn't change simply
527  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
528  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
529  * mode transitions per time unit. The speed gain is about 1/6.
530  */
531 static inline enum htb_cmode
532 htb_class_mode(struct htb_class *cl, s64 *diff)
533 {
534 	s64 toks;
535 
536 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
537 		*diff = -toks;
538 		return HTB_CANT_SEND;
539 	}
540 
541 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
542 		return HTB_CAN_SEND;
543 
544 	*diff = -toks;
545 	return HTB_MAY_BORROW;
546 }
547 
548 /**
549  * htb_change_class_mode - changes classe's mode
550  * @q: the priority event queue
551  * @cl: the target class
552  * @diff: diff time in microseconds
553  *
554  * This should be the only way how to change classe's mode under normal
555  * circumstances. Routine will update feed lists linkage, change mode
556  * and add class to the wait event queue if appropriate. New mode should
557  * be different from old one and cl->pq_key has to be valid if changing
558  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
559  */
560 static void
561 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
562 {
563 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
564 
565 	if (new_mode == cl->cmode)
566 		return;
567 
568 	if (new_mode == HTB_CANT_SEND) {
569 		cl->overlimits++;
570 		q->overlimits++;
571 	}
572 
573 	if (cl->prio_activity) {	/* not necessary: speed optimization */
574 		if (cl->cmode != HTB_CANT_SEND)
575 			htb_deactivate_prios(q, cl);
576 		cl->cmode = new_mode;
577 		if (new_mode != HTB_CANT_SEND)
578 			htb_activate_prios(q, cl);
579 	} else
580 		cl->cmode = new_mode;
581 }
582 
583 /**
584  * htb_activate - inserts leaf cl into appropriate active feeds
585  * @q: the priority event queue
586  * @cl: the target class
587  *
588  * Routine learns (new) priority of leaf and activates feed chain
589  * for the prio. It can be called on already active leaf safely.
590  * It also adds leaf into droplist.
591  */
592 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
593 {
594 	WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen);
595 
596 	if (!cl->prio_activity) {
597 		cl->prio_activity = 1 << cl->prio;
598 		htb_activate_prios(q, cl);
599 	}
600 }
601 
602 /**
603  * htb_deactivate - remove leaf cl from active feeds
604  * @q: the priority event queue
605  * @cl: the target class
606  *
607  * Make sure that leaf is active. In the other words it can't be called
608  * with non-active leaf. It also removes class from the drop list.
609  */
610 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
611 {
612 	WARN_ON(!cl->prio_activity);
613 
614 	htb_deactivate_prios(q, cl);
615 	cl->prio_activity = 0;
616 }
617 
618 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
619 		       struct sk_buff **to_free)
620 {
621 	int ret;
622 	unsigned int len = qdisc_pkt_len(skb);
623 	struct htb_sched *q = qdisc_priv(sch);
624 	struct htb_class *cl = htb_classify(skb, sch, &ret);
625 
626 	if (cl == HTB_DIRECT) {
627 		/* enqueue to helper queue */
628 		if (q->direct_queue.qlen < q->direct_qlen) {
629 			__qdisc_enqueue_tail(skb, &q->direct_queue);
630 			q->direct_pkts++;
631 		} else {
632 			return qdisc_drop(skb, sch, to_free);
633 		}
634 #ifdef CONFIG_NET_CLS_ACT
635 	} else if (!cl) {
636 		if (ret & __NET_XMIT_BYPASS)
637 			qdisc_qstats_drop(sch);
638 		__qdisc_drop(skb, to_free);
639 		return ret;
640 #endif
641 	} else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
642 					to_free)) != NET_XMIT_SUCCESS) {
643 		if (net_xmit_drop_count(ret)) {
644 			qdisc_qstats_drop(sch);
645 			cl->drops++;
646 		}
647 		return ret;
648 	} else {
649 		htb_activate(q, cl);
650 	}
651 
652 	sch->qstats.backlog += len;
653 	sch->q.qlen++;
654 	return NET_XMIT_SUCCESS;
655 }
656 
657 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
658 {
659 	s64 toks = diff + cl->tokens;
660 
661 	if (toks > cl->buffer)
662 		toks = cl->buffer;
663 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
664 	if (toks <= -cl->mbuffer)
665 		toks = 1 - cl->mbuffer;
666 
667 	cl->tokens = toks;
668 }
669 
670 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
671 {
672 	s64 toks = diff + cl->ctokens;
673 
674 	if (toks > cl->cbuffer)
675 		toks = cl->cbuffer;
676 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
677 	if (toks <= -cl->mbuffer)
678 		toks = 1 - cl->mbuffer;
679 
680 	cl->ctokens = toks;
681 }
682 
683 /**
684  * htb_charge_class - charges amount "bytes" to leaf and ancestors
685  * @q: the priority event queue
686  * @cl: the class to start iterate
687  * @level: the minimum level to account
688  * @skb: the socket buffer
689  *
690  * Routine assumes that packet "bytes" long was dequeued from leaf cl
691  * borrowing from "level". It accounts bytes to ceil leaky bucket for
692  * leaf and all ancestors and to rate bucket for ancestors at levels
693  * "level" and higher. It also handles possible change of mode resulting
694  * from the update. Note that mode can also increase here (MAY_BORROW to
695  * CAN_SEND) because we can use more precise clock that event queue here.
696  * In such case we remove class from event queue first.
697  */
698 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
699 			     int level, struct sk_buff *skb)
700 {
701 	int bytes = qdisc_pkt_len(skb);
702 	enum htb_cmode old_mode;
703 	s64 diff;
704 
705 	while (cl) {
706 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
707 		if (cl->level >= level) {
708 			if (cl->level == level)
709 				cl->xstats.lends++;
710 			htb_accnt_tokens(cl, bytes, diff);
711 		} else {
712 			cl->xstats.borrows++;
713 			cl->tokens += diff;	/* we moved t_c; update tokens */
714 		}
715 		htb_accnt_ctokens(cl, bytes, diff);
716 		cl->t_c = q->now;
717 
718 		old_mode = cl->cmode;
719 		diff = 0;
720 		htb_change_class_mode(q, cl, &diff);
721 		if (old_mode != cl->cmode) {
722 			if (old_mode != HTB_CAN_SEND)
723 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
724 			if (cl->cmode != HTB_CAN_SEND)
725 				htb_add_to_wait_tree(q, cl, diff);
726 		}
727 
728 		/* update basic stats except for leaves which are already updated */
729 		if (cl->level)
730 			bstats_update(&cl->bstats, skb);
731 
732 		cl = cl->parent;
733 	}
734 }
735 
736 /**
737  * htb_do_events - make mode changes to classes at the level
738  * @q: the priority event queue
739  * @level: which wait_pq in 'q->hlevel'
740  * @start: start jiffies
741  *
742  * Scans event queue for pending events and applies them. Returns time of
743  * next pending event (0 for no event in pq, q->now for too many events).
744  * Note: Applied are events whose have cl->pq_key <= q->now.
745  */
746 static s64 htb_do_events(struct htb_sched *q, const int level,
747 			 unsigned long start)
748 {
749 	/* don't run for longer than 2 jiffies; 2 is used instead of
750 	 * 1 to simplify things when jiffy is going to be incremented
751 	 * too soon
752 	 */
753 	unsigned long stop_at = start + 2;
754 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
755 
756 	while (time_before(jiffies, stop_at)) {
757 		struct htb_class *cl;
758 		s64 diff;
759 		struct rb_node *p = rb_first(wait_pq);
760 
761 		if (!p)
762 			return 0;
763 
764 		cl = rb_entry(p, struct htb_class, pq_node);
765 		if (cl->pq_key > q->now)
766 			return cl->pq_key;
767 
768 		htb_safe_rb_erase(p, wait_pq);
769 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
770 		htb_change_class_mode(q, cl, &diff);
771 		if (cl->cmode != HTB_CAN_SEND)
772 			htb_add_to_wait_tree(q, cl, diff);
773 	}
774 
775 	/* too much load - let's continue after a break for scheduling */
776 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
777 		pr_warn("htb: too many events!\n");
778 		q->warned |= HTB_WARN_TOOMANYEVENTS;
779 	}
780 
781 	return q->now;
782 }
783 
784 /* Returns class->node+prio from id-tree where classe's id is >= id. NULL
785  * is no such one exists.
786  */
787 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
788 					      u32 id)
789 {
790 	struct rb_node *r = NULL;
791 	while (n) {
792 		struct htb_class *cl =
793 		    rb_entry(n, struct htb_class, node[prio]);
794 
795 		if (id > cl->common.classid) {
796 			n = n->rb_right;
797 		} else if (id < cl->common.classid) {
798 			r = n;
799 			n = n->rb_left;
800 		} else {
801 			return n;
802 		}
803 	}
804 	return r;
805 }
806 
807 /**
808  * htb_lookup_leaf - returns next leaf class in DRR order
809  * @hprio: the current one
810  * @prio: which prio in class
811  *
812  * Find leaf where current feed pointers points to.
813  */
814 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
815 {
816 	int i;
817 	struct {
818 		struct rb_node *root;
819 		struct rb_node **pptr;
820 		u32 *pid;
821 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
822 
823 	BUG_ON(!hprio->row.rb_node);
824 	sp->root = hprio->row.rb_node;
825 	sp->pptr = &hprio->ptr;
826 	sp->pid = &hprio->last_ptr_id;
827 
828 	for (i = 0; i < 65535; i++) {
829 		if (!*sp->pptr && *sp->pid) {
830 			/* ptr was invalidated but id is valid - try to recover
831 			 * the original or next ptr
832 			 */
833 			*sp->pptr =
834 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
835 		}
836 		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
837 				 * can become out of date quickly
838 				 */
839 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
840 			*sp->pptr = sp->root;
841 			while ((*sp->pptr)->rb_left)
842 				*sp->pptr = (*sp->pptr)->rb_left;
843 			if (sp > stk) {
844 				sp--;
845 				if (!*sp->pptr) {
846 					WARN_ON(1);
847 					return NULL;
848 				}
849 				htb_next_rb_node(sp->pptr);
850 			}
851 		} else {
852 			struct htb_class *cl;
853 			struct htb_prio *clp;
854 
855 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
856 			if (!cl->level)
857 				return cl;
858 			clp = &cl->inner.clprio[prio];
859 			(++sp)->root = clp->feed.rb_node;
860 			sp->pptr = &clp->ptr;
861 			sp->pid = &clp->last_ptr_id;
862 		}
863 	}
864 	WARN_ON(1);
865 	return NULL;
866 }
867 
868 /* dequeues packet at given priority and level; call only if
869  * you are sure that there is active class at prio/level
870  */
871 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
872 					const int level)
873 {
874 	struct sk_buff *skb = NULL;
875 	struct htb_class *cl, *start;
876 	struct htb_level *hlevel = &q->hlevel[level];
877 	struct htb_prio *hprio = &hlevel->hprio[prio];
878 
879 	/* look initial class up in the row */
880 	start = cl = htb_lookup_leaf(hprio, prio);
881 
882 	do {
883 next:
884 		if (unlikely(!cl))
885 			return NULL;
886 
887 		/* class can be empty - it is unlikely but can be true if leaf
888 		 * qdisc drops packets in enqueue routine or if someone used
889 		 * graft operation on the leaf since last dequeue;
890 		 * simply deactivate and skip such class
891 		 */
892 		if (unlikely(cl->leaf.q->q.qlen == 0)) {
893 			struct htb_class *next;
894 			htb_deactivate(q, cl);
895 
896 			/* row/level might become empty */
897 			if ((q->row_mask[level] & (1 << prio)) == 0)
898 				return NULL;
899 
900 			next = htb_lookup_leaf(hprio, prio);
901 
902 			if (cl == start)	/* fix start if we just deleted it */
903 				start = next;
904 			cl = next;
905 			goto next;
906 		}
907 
908 		skb = cl->leaf.q->dequeue(cl->leaf.q);
909 		if (likely(skb != NULL))
910 			break;
911 
912 		qdisc_warn_nonwc("htb", cl->leaf.q);
913 		htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr:
914 					 &q->hlevel[0].hprio[prio].ptr);
915 		cl = htb_lookup_leaf(hprio, prio);
916 
917 	} while (cl != start);
918 
919 	if (likely(skb != NULL)) {
920 		bstats_update(&cl->bstats, skb);
921 		cl->leaf.deficit[level] -= qdisc_pkt_len(skb);
922 		if (cl->leaf.deficit[level] < 0) {
923 			cl->leaf.deficit[level] += cl->quantum;
924 			htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr :
925 						 &q->hlevel[0].hprio[prio].ptr);
926 		}
927 		/* this used to be after charge_class but this constelation
928 		 * gives us slightly better performance
929 		 */
930 		if (!cl->leaf.q->q.qlen)
931 			htb_deactivate(q, cl);
932 		htb_charge_class(q, cl, level, skb);
933 	}
934 	return skb;
935 }
936 
937 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
938 {
939 	struct sk_buff *skb;
940 	struct htb_sched *q = qdisc_priv(sch);
941 	int level;
942 	s64 next_event;
943 	unsigned long start_at;
944 
945 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
946 	skb = __qdisc_dequeue_head(&q->direct_queue);
947 	if (skb != NULL) {
948 ok:
949 		qdisc_bstats_update(sch, skb);
950 		qdisc_qstats_backlog_dec(sch, skb);
951 		sch->q.qlen--;
952 		return skb;
953 	}
954 
955 	if (!sch->q.qlen)
956 		goto fin;
957 	q->now = ktime_get_ns();
958 	start_at = jiffies;
959 
960 	next_event = q->now + 5LLU * NSEC_PER_SEC;
961 
962 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
963 		/* common case optimization - skip event handler quickly */
964 		int m;
965 		s64 event = q->near_ev_cache[level];
966 
967 		if (q->now >= event) {
968 			event = htb_do_events(q, level, start_at);
969 			if (!event)
970 				event = q->now + NSEC_PER_SEC;
971 			q->near_ev_cache[level] = event;
972 		}
973 
974 		if (next_event > event)
975 			next_event = event;
976 
977 		m = ~q->row_mask[level];
978 		while (m != (int)(-1)) {
979 			int prio = ffz(m);
980 
981 			m |= 1 << prio;
982 			skb = htb_dequeue_tree(q, prio, level);
983 			if (likely(skb != NULL))
984 				goto ok;
985 		}
986 	}
987 	if (likely(next_event > q->now))
988 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
989 	else
990 		schedule_work(&q->work);
991 fin:
992 	return skb;
993 }
994 
995 /* reset all classes */
996 /* always caled under BH & queue lock */
997 static void htb_reset(struct Qdisc *sch)
998 {
999 	struct htb_sched *q = qdisc_priv(sch);
1000 	struct htb_class *cl;
1001 	unsigned int i;
1002 
1003 	for (i = 0; i < q->clhash.hashsize; i++) {
1004 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1005 			if (cl->level)
1006 				memset(&cl->inner, 0, sizeof(cl->inner));
1007 			else {
1008 				if (cl->leaf.q && !q->offload)
1009 					qdisc_reset(cl->leaf.q);
1010 			}
1011 			cl->prio_activity = 0;
1012 			cl->cmode = HTB_CAN_SEND;
1013 		}
1014 	}
1015 	qdisc_watchdog_cancel(&q->watchdog);
1016 	__qdisc_reset_queue(&q->direct_queue);
1017 	memset(q->hlevel, 0, sizeof(q->hlevel));
1018 	memset(q->row_mask, 0, sizeof(q->row_mask));
1019 }
1020 
1021 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1022 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1023 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1024 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1025 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1026 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1027 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
1028 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
1029 	[TCA_HTB_OFFLOAD] = { .type = NLA_FLAG },
1030 };
1031 
1032 static void htb_work_func(struct work_struct *work)
1033 {
1034 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1035 	struct Qdisc *sch = q->watchdog.qdisc;
1036 
1037 	rcu_read_lock();
1038 	__netif_schedule(qdisc_root(sch));
1039 	rcu_read_unlock();
1040 }
1041 
/* Assign the lock returned by qdisc_lock(q) to a lockdep class keyed by the
 * function-static child_key, so every qdisc passed here shares one class.
 */
1042 static void htb_set_lockdep_class_child(struct Qdisc *q)
1043 {
	/* single key shared by all callers */
1044 	static struct lock_class_key child_key;
1045 
1046 	lockdep_set_class(qdisc_lock(q), &child_key);
1047 }
1048 
1049 static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt)
1050 {
1051 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt);
1052 }
1053 
1054 static int htb_init(struct Qdisc *sch, struct nlattr *opt,
1055 		    struct netlink_ext_ack *extack)
1056 {
1057 	struct net_device *dev = qdisc_dev(sch);
1058 	struct tc_htb_qopt_offload offload_opt;
1059 	struct htb_sched *q = qdisc_priv(sch);
1060 	struct nlattr *tb[TCA_HTB_MAX + 1];
1061 	struct tc_htb_glob *gopt;
1062 	unsigned int ntx;
1063 	bool offload;
1064 	int err;
1065 
1066 	qdisc_watchdog_init(&q->watchdog, sch);
1067 	INIT_WORK(&q->work, htb_work_func);
1068 
1069 	if (!opt)
1070 		return -EINVAL;
1071 
1072 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
1073 	if (err)
1074 		return err;
1075 
1076 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1077 					  NULL);
1078 	if (err < 0)
1079 		return err;
1080 
1081 	if (!tb[TCA_HTB_INIT])
1082 		return -EINVAL;
1083 
1084 	gopt = nla_data(tb[TCA_HTB_INIT]);
1085 	if (gopt->version != HTB_VER >> 16)
1086 		return -EINVAL;
1087 
1088 	offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
1089 
1090 	if (offload) {
1091 		if (sch->parent != TC_H_ROOT) {
1092 			NL_SET_ERR_MSG(extack, "HTB must be the root qdisc to use offload");
1093 			return -EOPNOTSUPP;
1094 		}
1095 
1096 		if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) {
1097 			NL_SET_ERR_MSG(extack, "hw-tc-offload ethtool feature flag must be on");
1098 			return -EOPNOTSUPP;
1099 		}
1100 
1101 		q->num_direct_qdiscs = dev->real_num_tx_queues;
1102 		q->direct_qdiscs = kcalloc(q->num_direct_qdiscs,
1103 					   sizeof(*q->direct_qdiscs),
1104 					   GFP_KERNEL);
1105 		if (!q->direct_qdiscs)
1106 			return -ENOMEM;
1107 	}
1108 
1109 	err = qdisc_class_hash_init(&q->clhash);
1110 	if (err < 0)
1111 		return err;
1112 
1113 	if (tb[TCA_HTB_DIRECT_QLEN])
1114 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1115 	else
1116 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1117 
1118 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1119 		q->rate2quantum = 1;
1120 	q->defcls = gopt->defcls;
1121 
1122 	if (!offload)
1123 		return 0;
1124 
1125 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1126 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1127 		struct Qdisc *qdisc;
1128 
1129 		qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1130 					  TC_H_MAKE(sch->handle, 0), extack);
1131 		if (!qdisc) {
1132 			return -ENOMEM;
1133 		}
1134 
1135 		htb_set_lockdep_class_child(qdisc);
1136 		q->direct_qdiscs[ntx] = qdisc;
1137 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1138 	}
1139 
1140 	sch->flags |= TCQ_F_MQROOT;
1141 
1142 	offload_opt = (struct tc_htb_qopt_offload) {
1143 		.command = TC_HTB_CREATE,
1144 		.parent_classid = TC_H_MAJ(sch->handle) >> 16,
1145 		.classid = TC_H_MIN(q->defcls),
1146 		.extack = extack,
1147 	};
1148 	err = htb_offload(dev, &offload_opt);
1149 	if (err)
1150 		return err;
1151 
1152 	/* Defer this assignment, so that htb_destroy skips offload-related
1153 	 * parts (especially calling ndo_setup_tc) on errors.
1154 	 */
1155 	q->offload = true;
1156 
1157 	return 0;
1158 }
1159 
1160 static void htb_attach_offload(struct Qdisc *sch)
1161 {
1162 	struct net_device *dev = qdisc_dev(sch);
1163 	struct htb_sched *q = qdisc_priv(sch);
1164 	unsigned int ntx;
1165 
1166 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1167 		struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx];
1168 
1169 		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
1170 		qdisc_put(old);
1171 		qdisc_hash_add(qdisc, false);
1172 	}
1173 	for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) {
1174 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1175 		struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL);
1176 
1177 		qdisc_put(old);
1178 	}
1179 
1180 	kfree(q->direct_qdiscs);
1181 	q->direct_qdiscs = NULL;
1182 }
1183 
1184 static void htb_attach_software(struct Qdisc *sch)
1185 {
1186 	struct net_device *dev = qdisc_dev(sch);
1187 	unsigned int ntx;
1188 
1189 	/* Resemble qdisc_graft behavior. */
1190 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
1191 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1192 		struct Qdisc *old = dev_graft_qdisc(dev_queue, sch);
1193 
1194 		qdisc_refcount_inc(sch);
1195 
1196 		qdisc_put(old);
1197 	}
1198 }
1199 
1200 static void htb_attach(struct Qdisc *sch)
1201 {
1202 	struct htb_sched *q = qdisc_priv(sch);
1203 
1204 	if (q->offload)
1205 		htb_attach_offload(sch);
1206 	else
1207 		htb_attach_software(sch);
1208 }
1209 
1210 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1211 {
1212 	struct htb_sched *q = qdisc_priv(sch);
1213 	struct nlattr *nest;
1214 	struct tc_htb_glob gopt;
1215 
1216 	if (q->offload)
1217 		sch->flags |= TCQ_F_OFFLOADED;
1218 	else
1219 		sch->flags &= ~TCQ_F_OFFLOADED;
1220 
1221 	sch->qstats.overlimits = q->overlimits;
1222 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1223 	 * no change can happen on the qdisc parameters.
1224 	 */
1225 
1226 	gopt.direct_pkts = q->direct_pkts;
1227 	gopt.version = HTB_VER;
1228 	gopt.rate2quantum = q->rate2quantum;
1229 	gopt.defcls = q->defcls;
1230 	gopt.debug = 0;
1231 
1232 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1233 	if (nest == NULL)
1234 		goto nla_put_failure;
1235 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1236 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1237 		goto nla_put_failure;
1238 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1239 		goto nla_put_failure;
1240 
1241 	return nla_nest_end(skb, nest);
1242 
1243 nla_put_failure:
1244 	nla_nest_cancel(skb, nest);
1245 	return -1;
1246 }
1247 
1248 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1249 			  struct sk_buff *skb, struct tcmsg *tcm)
1250 {
1251 	struct htb_class *cl = (struct htb_class *)arg;
1252 	struct htb_sched *q = qdisc_priv(sch);
1253 	struct nlattr *nest;
1254 	struct tc_htb_opt opt;
1255 
1256 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1257 	 * no change can happen on the class parameters.
1258 	 */
1259 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1260 	tcm->tcm_handle = cl->common.classid;
1261 	if (!cl->level && cl->leaf.q)
1262 		tcm->tcm_info = cl->leaf.q->handle;
1263 
1264 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1265 	if (nest == NULL)
1266 		goto nla_put_failure;
1267 
1268 	memset(&opt, 0, sizeof(opt));
1269 
1270 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1271 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1272 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1273 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1274 	opt.quantum = cl->quantum;
1275 	opt.prio = cl->prio;
1276 	opt.level = cl->level;
1277 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1278 		goto nla_put_failure;
1279 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1280 		goto nla_put_failure;
1281 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1282 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1283 			      TCA_HTB_PAD))
1284 		goto nla_put_failure;
1285 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1286 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1287 			      TCA_HTB_PAD))
1288 		goto nla_put_failure;
1289 
1290 	return nla_nest_end(skb, nest);
1291 
1292 nla_put_failure:
1293 	nla_nest_cancel(skb, nest);
1294 	return -1;
1295 }
1296 
1297 static void htb_offload_aggregate_stats(struct htb_sched *q,
1298 					struct htb_class *cl)
1299 {
1300 	u64 bytes = 0, packets = 0;
1301 	struct htb_class *c;
1302 	unsigned int i;
1303 
1304 	gnet_stats_basic_sync_init(&cl->bstats);
1305 
1306 	for (i = 0; i < q->clhash.hashsize; i++) {
1307 		hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) {
1308 			struct htb_class *p = c;
1309 
1310 			while (p && p->level < cl->level)
1311 				p = p->parent;
1312 
1313 			if (p != cl)
1314 				continue;
1315 
1316 			bytes += u64_stats_read(&c->bstats_bias.bytes);
1317 			packets += u64_stats_read(&c->bstats_bias.packets);
1318 			if (c->level == 0) {
1319 				bytes += u64_stats_read(&c->leaf.q->bstats.bytes);
1320 				packets += u64_stats_read(&c->leaf.q->bstats.packets);
1321 			}
1322 		}
1323 	}
1324 	_bstats_update(&cl->bstats, bytes, packets);
1325 }
1326 
1327 static int
1328 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1329 {
1330 	struct htb_class *cl = (struct htb_class *)arg;
1331 	struct htb_sched *q = qdisc_priv(sch);
1332 	struct gnet_stats_queue qs = {
1333 		.drops = cl->drops,
1334 		.overlimits = cl->overlimits,
1335 	};
1336 	__u32 qlen = 0;
1337 
1338 	if (!cl->level && cl->leaf.q)
1339 		qdisc_qstats_qlen_backlog(cl->leaf.q, &qlen, &qs.backlog);
1340 
1341 	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1342 				    INT_MIN, INT_MAX);
1343 	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1344 				     INT_MIN, INT_MAX);
1345 
1346 	if (q->offload) {
1347 		if (!cl->level) {
1348 			if (cl->leaf.q)
1349 				cl->bstats = cl->leaf.q->bstats;
1350 			else
1351 				gnet_stats_basic_sync_init(&cl->bstats);
1352 			_bstats_update(&cl->bstats,
1353 				       u64_stats_read(&cl->bstats_bias.bytes),
1354 				       u64_stats_read(&cl->bstats_bias.packets));
1355 		} else {
1356 			htb_offload_aggregate_stats(q, cl);
1357 		}
1358 	}
1359 
1360 	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1361 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1362 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1363 		return -1;
1364 
1365 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1366 }
1367 
1368 static struct netdev_queue *
1369 htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
1370 {
1371 	struct net_device *dev = qdisc_dev(sch);
1372 	struct tc_htb_qopt_offload offload_opt;
1373 	struct htb_sched *q = qdisc_priv(sch);
1374 	int err;
1375 
1376 	if (!q->offload)
1377 		return sch->dev_queue;
1378 
1379 	offload_opt = (struct tc_htb_qopt_offload) {
1380 		.command = TC_HTB_LEAF_QUERY_QUEUE,
1381 		.classid = TC_H_MIN(tcm->tcm_parent),
1382 	};
1383 	err = htb_offload(dev, &offload_opt);
1384 	if (err || offload_opt.qid >= dev->num_tx_queues)
1385 		return NULL;
1386 	return netdev_get_tx_queue(dev, offload_opt.qid);
1387 }
1388 
1389 static struct Qdisc *
1390 htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q)
1391 {
1392 	struct net_device *dev = dev_queue->dev;
1393 	struct Qdisc *old_q;
1394 
1395 	if (dev->flags & IFF_UP)
1396 		dev_deactivate(dev);
1397 	old_q = dev_graft_qdisc(dev_queue, new_q);
1398 	if (new_q)
1399 		new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1400 	if (dev->flags & IFF_UP)
1401 		dev_activate(dev);
1402 
1403 	return old_q;
1404 }
1405 
1406 static struct netdev_queue *htb_offload_get_queue(struct htb_class *cl)
1407 {
1408 	struct netdev_queue *queue;
1409 
1410 	queue = cl->leaf.offload_queue;
1411 	if (!(cl->leaf.q->flags & TCQ_F_BUILTIN))
1412 		WARN_ON(cl->leaf.q->dev_queue != queue);
1413 
1414 	return queue;
1415 }
1416 
1417 static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old,
1418 				   struct htb_class *cl_new, bool destroying)
1419 {
1420 	struct netdev_queue *queue_old, *queue_new;
1421 	struct net_device *dev = qdisc_dev(sch);
1422 
1423 	queue_old = htb_offload_get_queue(cl_old);
1424 	queue_new = htb_offload_get_queue(cl_new);
1425 
1426 	if (!destroying) {
1427 		struct Qdisc *qdisc;
1428 
1429 		if (dev->flags & IFF_UP)
1430 			dev_deactivate(dev);
1431 		qdisc = dev_graft_qdisc(queue_old, NULL);
1432 		WARN_ON(qdisc != cl_old->leaf.q);
1433 	}
1434 
1435 	if (!(cl_old->leaf.q->flags & TCQ_F_BUILTIN))
1436 		cl_old->leaf.q->dev_queue = queue_new;
1437 	cl_old->leaf.offload_queue = queue_new;
1438 
1439 	if (!destroying) {
1440 		struct Qdisc *qdisc;
1441 
1442 		qdisc = dev_graft_qdisc(queue_new, cl_old->leaf.q);
1443 		if (dev->flags & IFF_UP)
1444 			dev_activate(dev);
1445 		WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
1446 	}
1447 }
1448 
1449 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1450 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1451 {
1452 	struct netdev_queue *dev_queue = sch->dev_queue;
1453 	struct htb_class *cl = (struct htb_class *)arg;
1454 	struct htb_sched *q = qdisc_priv(sch);
1455 	struct Qdisc *old_q;
1456 
1457 	if (cl->level)
1458 		return -EINVAL;
1459 
1460 	if (q->offload)
1461 		dev_queue = htb_offload_get_queue(cl);
1462 
1463 	if (!new) {
1464 		new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1465 					cl->common.classid, extack);
1466 		if (!new)
1467 			return -ENOBUFS;
1468 	}
1469 
1470 	if (q->offload) {
1471 		htb_set_lockdep_class_child(new);
1472 		/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1473 		qdisc_refcount_inc(new);
1474 		old_q = htb_graft_helper(dev_queue, new);
1475 	}
1476 
1477 	*old = qdisc_replace(sch, new, &cl->leaf.q);
1478 
1479 	if (q->offload) {
1480 		WARN_ON(old_q != *old);
1481 		qdisc_put(old_q);
1482 	}
1483 
1484 	return 0;
1485 }
1486 
1487 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1488 {
1489 	struct htb_class *cl = (struct htb_class *)arg;
1490 	return !cl->level ? cl->leaf.q : NULL;
1491 }
1492 
1493 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1494 {
1495 	struct htb_class *cl = (struct htb_class *)arg;
1496 
1497 	htb_deactivate(qdisc_priv(sch), cl);
1498 }
1499 
1500 static inline int htb_parent_last_child(struct htb_class *cl)
1501 {
1502 	if (!cl->parent)
1503 		/* the root class */
1504 		return 0;
1505 	if (cl->parent->children > 1)
1506 		/* not the last child */
1507 		return 0;
1508 	return 1;
1509 }
1510 
1511 static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
1512 			       struct Qdisc *new_q)
1513 {
1514 	struct htb_sched *q = qdisc_priv(sch);
1515 	struct htb_class *parent = cl->parent;
1516 
1517 	WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
1518 
1519 	if (parent->cmode != HTB_CAN_SEND)
1520 		htb_safe_rb_erase(&parent->pq_node,
1521 				  &q->hlevel[parent->level].wait_pq);
1522 
1523 	parent->level = 0;
1524 	memset(&parent->inner, 0, sizeof(parent->inner));
1525 	parent->leaf.q = new_q ? new_q : &noop_qdisc;
1526 	parent->tokens = parent->buffer;
1527 	parent->ctokens = parent->cbuffer;
1528 	parent->t_c = ktime_get_ns();
1529 	parent->cmode = HTB_CAN_SEND;
1530 	if (q->offload)
1531 		parent->leaf.offload_queue = cl->leaf.offload_queue;
1532 }
1533 
1534 static void htb_parent_to_leaf_offload(struct Qdisc *sch,
1535 				       struct netdev_queue *dev_queue,
1536 				       struct Qdisc *new_q)
1537 {
1538 	struct Qdisc *old_q;
1539 
1540 	/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1541 	if (new_q)
1542 		qdisc_refcount_inc(new_q);
1543 	old_q = htb_graft_helper(dev_queue, new_q);
1544 	WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1545 }
1546 
1547 static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
1548 				     bool last_child, bool destroying,
1549 				     struct netlink_ext_ack *extack)
1550 {
1551 	struct tc_htb_qopt_offload offload_opt;
1552 	struct netdev_queue *dev_queue;
1553 	struct Qdisc *q = cl->leaf.q;
1554 	struct Qdisc *old;
1555 	int err;
1556 
1557 	if (cl->level)
1558 		return -EINVAL;
1559 
1560 	WARN_ON(!q);
1561 	dev_queue = htb_offload_get_queue(cl);
1562 	/* When destroying, caller qdisc_graft grafts the new qdisc and invokes
1563 	 * qdisc_put for the qdisc being destroyed. htb_destroy_class_offload
1564 	 * does not need to graft or qdisc_put the qdisc being destroyed.
1565 	 */
1566 	if (!destroying) {
1567 		old = htb_graft_helper(dev_queue, NULL);
1568 		/* Last qdisc grafted should be the same as cl->leaf.q when
1569 		 * calling htb_delete.
1570 		 */
1571 		WARN_ON(old != q);
1572 	}
1573 
1574 	if (cl->parent) {
1575 		_bstats_update(&cl->parent->bstats_bias,
1576 			       u64_stats_read(&q->bstats.bytes),
1577 			       u64_stats_read(&q->bstats.packets));
1578 	}
1579 
1580 	offload_opt = (struct tc_htb_qopt_offload) {
1581 		.command = !last_child ? TC_HTB_LEAF_DEL :
1582 			   destroying ? TC_HTB_LEAF_DEL_LAST_FORCE :
1583 			   TC_HTB_LEAF_DEL_LAST,
1584 		.classid = cl->common.classid,
1585 		.extack = extack,
1586 	};
1587 	err = htb_offload(qdisc_dev(sch), &offload_opt);
1588 
1589 	if (!destroying) {
1590 		if (!err)
1591 			qdisc_put(old);
1592 		else
1593 			htb_graft_helper(dev_queue, old);
1594 	}
1595 
1596 	if (last_child)
1597 		return err;
1598 
1599 	if (!err && offload_opt.classid != TC_H_MIN(cl->common.classid)) {
1600 		u32 classid = TC_H_MAJ(sch->handle) |
1601 			      TC_H_MIN(offload_opt.classid);
1602 		struct htb_class *moved_cl = htb_find(classid, sch);
1603 
1604 		htb_offload_move_qdisc(sch, moved_cl, cl, destroying);
1605 	}
1606 
1607 	return err;
1608 }
1609 
1610 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1611 {
1612 	if (!cl->level) {
1613 		WARN_ON(!cl->leaf.q);
1614 		qdisc_put(cl->leaf.q);
1615 	}
1616 	gen_kill_estimator(&cl->rate_est);
1617 	tcf_block_put(cl->block);
1618 	kfree(cl);
1619 }
1620 
/* Release everything owned by the qdisc.
 *
 * The deferred work and the watchdog are cancelled first, then the filter
 * blocks are put, then the classes are destroyed, and finally the class
 * hash, the direct queue and any remaining direct qdiscs are released.
 *
 * Without offload every class is destroyed in a single pass.  With offload
 * only level-0 classes are destroyed in a pass; when the last child of a
 * parent goes away the parent is turned into a leaf, so the passes repeat
 * for as long as the previous one destroyed something.
 */
1621 static void htb_destroy(struct Qdisc *sch)
1622 {
1623 	struct net_device *dev = qdisc_dev(sch);
1624 	struct tc_htb_qopt_offload offload_opt;
1625 	struct htb_sched *q = qdisc_priv(sch);
1626 	struct hlist_node *next;
1627 	bool nonempty, changed;
1628 	struct htb_class *cl;
1629 	unsigned int i;
1630 
1631 	cancel_work_sync(&q->work);
1632 	qdisc_watchdog_cancel(&q->watchdog);
1633 	/* This line used to be after htb_destroy_class call below
1634 	 * and surprisingly it worked in 2.4. But it must precede it
1635 	 * because filter need its target class alive to be able to call
1636 	 * unbind_filter on it (without Oops).
1637 	 */
1638 	tcf_block_put(q->block);
1639 
	/* Put the per-class blocks now and clear the pointers;
	 * htb_destroy_class later calls tcf_block_put() on the cleared value.
	 */
1640 	for (i = 0; i < q->clhash.hashsize; i++) {
1641 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1642 			tcf_block_put(cl->block);
1643 			cl->block = NULL;
1644 		}
1645 	}
1646 
1647 	do {
		/* nonempty: offload pass saw a class; changed: it destroyed one */
1648 		nonempty = false;
1649 		changed = false;
1650 		for (i = 0; i < q->clhash.hashsize; i++) {
1651 			hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1652 						  common.hnode) {
1653 				bool last_child;
1654 
1655 				if (!q->offload) {
1656 					htb_destroy_class(sch, cl);
1657 					continue;
1658 				}
1659 
1660 				nonempty = true;
1661 
				/* inner classes wait until they become leaves */
1662 				if (cl->level)
1663 					continue;
1664 
1665 				changed = true;
1666 
1667 				last_child = htb_parent_last_child(cl);
				/* destroying == true; the return value is ignored */
1668 				htb_destroy_class_offload(sch, cl, last_child,
1669 							  true, NULL);
1670 				qdisc_class_hash_remove(&q->clhash,
1671 							&cl->common);
1672 				if (cl->parent)
1673 					cl->parent->children--;
1674 				if (last_child)
1675 					htb_parent_to_leaf(sch, cl, NULL);
1676 				htb_destroy_class(sch, cl);
1677 			}
1678 		}
1679 	} while (changed);
	/* an offload pass that saw classes but destroyed none is unexpected */
1680 	WARN_ON(nonempty);
1681 
1682 	qdisc_class_hash_destroy(&q->clhash);
1683 	__qdisc_reset_queue(&q->direct_queue);
1684 
1685 	if (q->offload) {
1686 		offload_opt = (struct tc_htb_qopt_offload) {
1687 			.command = TC_HTB_DESTROY,
1688 		};
1689 		htb_offload(dev, &offload_opt);
1690 	}
1691 
	/* direct_qdiscs is NULL once htb_attach_offload has consumed it */
1692 	if (!q->direct_qdiscs)
1693 		return;
	/* stop at the first empty slot: the array may be partially filled */
1694 	for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++)
1695 		qdisc_put(q->direct_qdiscs[i]);
1696 	kfree(q->direct_qdiscs);
1697 }
1698 
1699 static int htb_delete(struct Qdisc *sch, unsigned long arg,
1700 		      struct netlink_ext_ack *extack)
1701 {
1702 	struct htb_sched *q = qdisc_priv(sch);
1703 	struct htb_class *cl = (struct htb_class *)arg;
1704 	struct Qdisc *new_q = NULL;
1705 	int last_child = 0;
1706 	int err;
1707 
1708 	/* TODO: why don't allow to delete subtree ? references ? does
1709 	 * tc subsys guarantee us that in htb_destroy it holds no class
1710 	 * refs so that we can remove children safely there ?
1711 	 */
1712 	if (cl->children || qdisc_class_in_use(&cl->common)) {
1713 		NL_SET_ERR_MSG(extack, "HTB class in use");
1714 		return -EBUSY;
1715 	}
1716 
1717 	if (!cl->level && htb_parent_last_child(cl))
1718 		last_child = 1;
1719 
1720 	if (q->offload) {
1721 		err = htb_destroy_class_offload(sch, cl, last_child, false,
1722 						extack);
1723 		if (err)
1724 			return err;
1725 	}
1726 
1727 	if (last_child) {
1728 		struct netdev_queue *dev_queue = sch->dev_queue;
1729 
1730 		if (q->offload)
1731 			dev_queue = htb_offload_get_queue(cl);
1732 
1733 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1734 					  cl->parent->common.classid,
1735 					  NULL);
1736 		if (q->offload) {
1737 			if (new_q)
1738 				htb_set_lockdep_class_child(new_q);
1739 			htb_parent_to_leaf_offload(sch, dev_queue, new_q);
1740 		}
1741 	}
1742 
1743 	sch_tree_lock(sch);
1744 
1745 	if (!cl->level)
1746 		qdisc_purge_queue(cl->leaf.q);
1747 
1748 	/* delete from hash and active; remainder in destroy_class */
1749 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1750 	if (cl->parent)
1751 		cl->parent->children--;
1752 
1753 	if (cl->prio_activity)
1754 		htb_deactivate(q, cl);
1755 
1756 	if (cl->cmode != HTB_CAN_SEND)
1757 		htb_safe_rb_erase(&cl->pq_node,
1758 				  &q->hlevel[cl->level].wait_pq);
1759 
1760 	if (last_child)
1761 		htb_parent_to_leaf(sch, cl, new_q);
1762 
1763 	sch_tree_unlock(sch);
1764 
1765 	htb_destroy_class(sch, cl);
1766 	return 0;
1767 }
1768 
/* Create a new class or change the parameters of an existing one.
 *
 * @sch:      the HTB qdisc
 * @classid:  id of the class (checked only when creating)
 * @parentid: id of the parent class, or TC_H_ROOT
 * @tca:      top-level attributes; TCA_OPTIONS must hold TCA_HTB_PARMS
 * @arg:      in: existing class pointer, or 0 to create a new class;
 *            out: the class that was created or modified
 * @extack:   extended ack for error and warning messages
 *
 * Returns 0 on success or a negative errno.  A new class that fails part
 * way through is unwound via the err_kill_estimator / err_block_put labels.
 */
1769 static int htb_change_class(struct Qdisc *sch, u32 classid,
1770 			    u32 parentid, struct nlattr **tca,
1771 			    unsigned long *arg, struct netlink_ext_ack *extack)
1772 {
1773 	int err = -EINVAL;
1774 	struct htb_sched *q = qdisc_priv(sch);
1775 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1776 	struct tc_htb_qopt_offload offload_opt;
1777 	struct nlattr *opt = tca[TCA_OPTIONS];
1778 	struct nlattr *tb[TCA_HTB_MAX + 1];
1779 	struct Qdisc *parent_qdisc = NULL;
1780 	struct netdev_queue *dev_queue;
1781 	struct tc_htb_opt *hopt;
1782 	u64 rate64, ceil64;
1783 	int warn = 0;
1784 
1785 	/* extract all subattrs from opt attr */
1786 	if (!opt)
1787 		goto failure;
1788 
1789 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1790 					  extack);
1791 	if (err < 0)
1792 		goto failure;
1793 
1794 	err = -EINVAL;
1795 	if (tb[TCA_HTB_PARMS] == NULL)
1796 		goto failure;
1797 
1798 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1799 
1800 	hopt = nla_data(tb[TCA_HTB_PARMS]);
	/* both a rate and a ceil are mandatory */
1801 	if (!hopt->rate.rate || !hopt->ceil.rate)
1802 		goto failure;
1803 
1804 	if (q->offload) {
1805 		/* Options not supported by the offload. */
1806 		if (hopt->rate.overhead || hopt->ceil.overhead) {
1807 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the overhead parameter");
1808 			goto failure;
1809 		}
1810 		if (hopt->rate.mpu || hopt->ceil.mpu) {
1811 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter");
1812 			goto failure;
1813 		}
1814 	}
1815 
1816 	/* Keeping backward compatible with rate_table based iproute2 tc */
1817 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1818 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
1819 					      NULL));
1820 
1821 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1822 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
1823 					      NULL));
1824 
	/* optional 64-bit rates; 0 when the attribute is absent */
1825 	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1826 	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1827 
1828 	if (!cl) {		/* new class */
1829 		struct net_device *dev = qdisc_dev(sch);
1830 		struct Qdisc *new_q, *old_q;
1831 		int prio;
		/* default estimator config, used when TCA_RATE is not given */
1832 		struct {
1833 			struct nlattr		nla;
1834 			struct gnet_estimator	opt;
1835 		} est = {
1836 			.nla = {
1837 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1838 				.nla_type	= TCA_RATE,
1839 			},
1840 			.opt = {
1841 				/* 4s interval, 16s averaging constant */
1842 				.interval	= 2,
1843 				.ewma_log	= 2,
1844 			},
1845 		};
1846 
1847 		/* check for valid classid */
1848 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1849 		    htb_find(classid, sch))
1850 			goto failure;
1851 
1852 		/* check maximal depth */
1853 		if (parent && parent->parent && parent->parent->level < 2) {
1854 			NL_SET_ERR_MSG_MOD(extack, "tree is too deep");
1855 			goto failure;
1856 		}
1857 		err = -ENOBUFS;
1858 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1859 		if (!cl)
1860 			goto failure;
1861 
1862 		gnet_stats_basic_sync_init(&cl->bstats);
1863 		gnet_stats_basic_sync_init(&cl->bstats_bias);
1864 
1865 		err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1866 		if (err) {
1867 			kfree(cl);
1868 			goto failure;
1869 		}
1870 		if (htb_rate_est || tca[TCA_RATE]) {
1871 			err = gen_new_estimator(&cl->bstats, NULL,
1872 						&cl->rate_est,
1873 						NULL,
1874 						true,
1875 						tca[TCA_RATE] ? : &est.nla);
1876 			if (err)
1877 				goto err_block_put;
1878 		}
1879 
1880 		cl->children = 0;
1881 		RB_CLEAR_NODE(&cl->pq_node);
1882 
1883 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1884 			RB_CLEAR_NODE(&cl->node[prio]);
1885 
1886 		cl->common.classid = classid;
1887 
1888 		/* Make sure nothing interrupts us in between of two
1889 		 * ndo_setup_tc calls.
1890 		 */
1891 		ASSERT_RTNL();
1892 
1893 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1894 		 * so that can't be used inside of sch_tree_lock
1895 		 * -- thanks to Karlis Peisenieks
1896 		 */
1897 		if (!q->offload) {
1898 			dev_queue = sch->dev_queue;
1899 		} else if (!(parent && !parent->level)) {
			/* offload, and the parent (if any) is not a leaf */
1900 			/* Assign a dev_queue to this classid. */
1901 			offload_opt = (struct tc_htb_qopt_offload) {
1902 				.command = TC_HTB_LEAF_ALLOC_QUEUE,
1903 				.classid = cl->common.classid,
1904 				.parent_classid = parent ?
1905 					TC_H_MIN(parent->common.classid) :
1906 					TC_HTB_CLASSID_ROOT,
1907 				.rate = max_t(u64, hopt->rate.rate, rate64),
1908 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1909 				.prio = hopt->prio,
1910 				.quantum = hopt->quantum,
1911 				.extack = extack,
1912 			};
1913 			err = htb_offload(dev, &offload_opt);
1914 			if (err) {
1915 				NL_SET_ERR_MSG_WEAK(extack,
1916 						    "Failed to offload TC_HTB_LEAF_ALLOC_QUEUE");
1917 				goto err_kill_estimator;
1918 			}
1919 			dev_queue = netdev_get_tx_queue(dev, offload_opt.qid);
1920 		} else { /* First child. */
			/* the parent is a leaf: take over its queue */
1921 			dev_queue = htb_offload_get_queue(parent);
1922 			old_q = htb_graft_helper(dev_queue, NULL);
1923 			WARN_ON(old_q != parent->leaf.q);
1924 			offload_opt = (struct tc_htb_qopt_offload) {
1925 				.command = TC_HTB_LEAF_TO_INNER,
1926 				.classid = cl->common.classid,
1927 				.parent_classid =
1928 					TC_H_MIN(parent->common.classid),
1929 				.rate = max_t(u64, hopt->rate.rate, rate64),
1930 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1931 				.prio = hopt->prio,
1932 				.quantum = hopt->quantum,
1933 				.extack = extack,
1934 			};
1935 			err = htb_offload(dev, &offload_opt);
1936 			if (err) {
1937 				NL_SET_ERR_MSG_WEAK(extack,
1938 						    "Failed to offload TC_HTB_LEAF_TO_INNER");
				/* put the parent's qdisc back on its queue */
1939 				htb_graft_helper(dev_queue, old_q);
1940 				goto err_kill_estimator;
1941 			}
			/* keep the old leaf's counters in the parent's bias */
1942 			_bstats_update(&parent->bstats_bias,
1943 				       u64_stats_read(&old_q->bstats.bytes),
1944 				       u64_stats_read(&old_q->bstats.packets));
1945 			qdisc_put(old_q);
1946 		}
		/* may return NULL; noop_qdisc is used as the leaf in that case */
1947 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1948 					  classid, NULL);
1949 		if (q->offload) {
1950 			if (new_q) {
1951 				htb_set_lockdep_class_child(new_q);
1952 				/* One ref for cl->leaf.q, the other for
1953 				 * dev_queue->qdisc.
1954 				 */
1955 				qdisc_refcount_inc(new_q);
1956 			}
1957 			old_q = htb_graft_helper(dev_queue, new_q);
1958 			/* No qdisc_put needed. */
1959 			WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1960 		}
1961 		sch_tree_lock(sch);
1962 		if (parent && !parent->level) {
1963 			/* turn parent into inner node */
1964 			qdisc_purge_queue(parent->leaf.q);
			/* released after the tree lock is dropped, see below */
1965 			parent_qdisc = parent->leaf.q;
1966 			if (parent->prio_activity)
1967 				htb_deactivate(q, parent);
1968 
1969 			/* remove from evt list because of level change */
1970 			if (parent->cmode != HTB_CAN_SEND) {
1971 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1972 				parent->cmode = HTB_CAN_SEND;
1973 			}
1974 			parent->level = (parent->parent ? parent->parent->level
1975 					 : TC_HTB_MAXDEPTH) - 1;
1976 			memset(&parent->inner, 0, sizeof(parent->inner));
1977 		}
1978 
1979 		/* leaf (we) needs elementary qdisc */
1980 		cl->leaf.q = new_q ? new_q : &noop_qdisc;
1981 		if (q->offload)
1982 			cl->leaf.offload_queue = dev_queue;
1983 
1984 		cl->parent = parent;
1985 
1986 		/* set class to be in HTB_CAN_SEND state */
1987 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1988 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1989 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1990 		cl->t_c = ktime_get_ns();
1991 		cl->cmode = HTB_CAN_SEND;
1992 
1993 		/* attach to the hash list and parent's family */
1994 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1995 		if (parent)
1996 			parent->children++;
1997 		if (cl->leaf.q != &noop_qdisc)
1998 			qdisc_hash_add(cl->leaf.q, true);
1999 	} else {
		/* existing class: only its parameters change */
2000 		if (tca[TCA_RATE]) {
2001 			err = gen_replace_estimator(&cl->bstats, NULL,
2002 						    &cl->rate_est,
2003 						    NULL,
2004 						    true,
2005 						    tca[TCA_RATE]);
2006 			if (err)
2007 				return err;
2008 		}
2009 
2010 		if (q->offload) {
2011 			struct net_device *dev = qdisc_dev(sch);
2012 
2013 			offload_opt = (struct tc_htb_qopt_offload) {
2014 				.command = TC_HTB_NODE_MODIFY,
2015 				.classid = cl->common.classid,
2016 				.rate = max_t(u64, hopt->rate.rate, rate64),
2017 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
2018 				.prio = hopt->prio,
2019 				.quantum = hopt->quantum,
2020 				.extack = extack,
2021 			};
2022 			err = htb_offload(dev, &offload_opt);
2023 			if (err)
2024 				/* Estimator was replaced, and rollback may fail
2025 				 * as well, so we don't try to recover it, and
2026 				 * the estimator won't work properly with the
2027 				 * offload anyway, because bstats are updated
2028 				 * only when the stats are queried.
2029 				 */
2030 				return err;
2031 		}
2032 
2033 		sch_tree_lock(sch);
2034 	}
2035 
	/* from here on both paths hold the tree lock */
2036 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
2037 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
2038 
2039 	/* it used to be a nasty bug here, we have to check that node
2040 	 * is really leaf before changing cl->leaf !
2041 	 */
2042 	if (!cl->level) {
2043 		u64 quantum = cl->rate.rate_bytes_ps;
2044 
		/* default quantum is rate / r2q, capped at INT_MAX */
2045 		do_div(quantum, q->rate2quantum);
2046 		cl->quantum = min_t(u64, quantum, INT_MAX);
2047 
		/* with no explicit quantum, clamp to 1000..200000 and warn */
2048 		if (!hopt->quantum && cl->quantum < 1000) {
2049 			warn = -1;
2050 			cl->quantum = 1000;
2051 		}
2052 		if (!hopt->quantum && cl->quantum > 200000) {
2053 			warn = 1;
2054 			cl->quantum = 200000;
2055 		}
		/* an explicit quantum is taken as given */
2056 		if (hopt->quantum)
2057 			cl->quantum = hopt->quantum;
		/* out-of-range priorities fall back to the last valid one */
2058 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
2059 			cl->prio = TC_HTB_NUMPRIO - 1;
2060 	}
2061 
2062 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
2063 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
2064 
2065 	sch_tree_unlock(sch);
	/* NULL unless a leaf parent was turned into an inner node above */
2066 	qdisc_put(parent_qdisc);
2067 
2068 	if (warn)
2069 		NL_SET_ERR_MSG_FMT_MOD(extack,
2070 				       "quantum of class %X is %s. Consider r2q change.",
2071 				       cl->common.classid, (warn == -1 ? "small" : "big"));
2072 
2073 	qdisc_class_hash_grow(sch, &q->clhash);
2074 
2075 	*arg = (unsigned long)cl;
2076 	return 0;
2077 
	/* unwind for a partially constructed new class */
2078 err_kill_estimator:
2079 	gen_kill_estimator(&cl->rate_est);
2080 err_block_put:
2081 	tcf_block_put(cl->block);
2082 	kfree(cl);
2083 failure:
2084 	return err;
2085 }
2086 
2087 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg,
2088 				       struct netlink_ext_ack *extack)
2089 {
2090 	struct htb_sched *q = qdisc_priv(sch);
2091 	struct htb_class *cl = (struct htb_class *)arg;
2092 
2093 	return cl ? cl->block : q->block;
2094 }
2095 
2096 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
2097 				     u32 classid)
2098 {
2099 	struct htb_class *cl = htb_find(classid, sch);
2100 
2101 	/*if (cl && !cl->level) return 0;
2102 	 * The line above used to be there to prevent attaching filters to
2103 	 * leaves. But at least tc_index filter uses this just to get class
2104 	 * for other reasons so that we have to allow for it.
2105 	 * ----
2106 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
2107 	 * another way to "lock" the class - unlike "get" this lock can
2108 	 * be broken by class during destroy IIUC.
2109 	 */
2110 	if (cl)
2111 		qdisc_class_get(&cl->common);
2112 	return (unsigned long)cl;
2113 }
2114 
2115 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
2116 {
2117 	struct htb_class *cl = (struct htb_class *)arg;
2118 
2119 	qdisc_class_put(&cl->common);
2120 }
2121 
2122 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2123 {
2124 	struct htb_sched *q = qdisc_priv(sch);
2125 	struct htb_class *cl;
2126 	unsigned int i;
2127 
2128 	if (arg->stop)
2129 		return;
2130 
2131 	for (i = 0; i < q->clhash.hashsize; i++) {
2132 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
2133 			if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
2134 				return;
2135 		}
2136 	}
2137 }
2138 
2139 static const struct Qdisc_class_ops htb_class_ops = {
2140 	.select_queue	=	htb_select_queue,
2141 	.graft		=	htb_graft,
2142 	.leaf		=	htb_leaf,
2143 	.qlen_notify	=	htb_qlen_notify,
2144 	.find		=	htb_search,
2145 	.change		=	htb_change_class,
2146 	.delete		=	htb_delete,
2147 	.walk		=	htb_walk,
2148 	.tcf_block	=	htb_tcf_block,
2149 	.bind_tcf	=	htb_bind_filter,
2150 	.unbind_tcf	=	htb_unbind_filter,
2151 	.dump		=	htb_dump_class,
2152 	.dump_stats	=	htb_dump_class_stats,
2153 };
2154 
2155 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
2156 	.cl_ops		=	&htb_class_ops,
2157 	.id		=	"htb",
2158 	.priv_size	=	sizeof(struct htb_sched),
2159 	.enqueue	=	htb_enqueue,
2160 	.dequeue	=	htb_dequeue,
2161 	.peek		=	qdisc_peek_dequeued,
2162 	.init		=	htb_init,
2163 	.attach		=	htb_attach,
2164 	.reset		=	htb_reset,
2165 	.destroy	=	htb_destroy,
2166 	.dump		=	htb_dump,
2167 	.owner		=	THIS_MODULE,
2168 };
2169 
2170 static int __init htb_module_init(void)
2171 {
2172 	return register_qdisc(&htb_qdisc_ops);
2173 }
2174 static void __exit htb_module_exit(void)
2175 {
2176 	unregister_qdisc(&htb_qdisc_ops);
2177 }
2178 
2179 module_init(htb_module_init)
2180 module_exit(htb_module_exit)
2181 MODULE_LICENSE("GPL");
2182 MODULE_DESCRIPTION("Hierarchical Token Bucket scheduler");
2183