xref: /linux/net/sched/sch_htb.c (revision 231c53e1a529f21c4b2d59df32389d5a57ce1289)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
4  *
5  * Authors:	Martin Devera, <devik@cdi.cz>
6  *
7  * Credits (in time order) for older HTB versions:
8  *              Stef Coene <stef.coene@docum.org>
9  *			HTB support at LARTC mailing list
10  *		Ondrej Kraus, <krauso@barr.cz>
11  *			found missing INIT_QDISC(htb)
12  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
13  *			helped a lot to locate nasty class stall bug
14  *		Andi Kleen, Jamal Hadi, Bert Hubert
15  *			code review and helpful comments on shaping
16  *		Tomasz Wrona, <tw@eter.tym.pl>
17  *			created test case so that I was able to fix nasty bug
18  *		Wilfried Weissmann
19  *			spotted bug in dequeue code and helped with fix
20  *		Jiri Fojtasek
21  *			fixed requeue routine
22  *		and many others. thanks.
23  */
24 #include <linux/module.h>
25 #include <linux/moduleparam.h>
26 #include <linux/types.h>
27 #include <linux/kernel.h>
28 #include <linux/string.h>
29 #include <linux/errno.h>
30 #include <linux/skbuff.h>
31 #include <linux/list.h>
32 #include <linux/compiler.h>
33 #include <linux/rbtree.h>
34 #include <linux/workqueue.h>
35 #include <linux/slab.h>
36 #include <net/netlink.h>
37 #include <net/sch_generic.h>
38 #include <net/pkt_sched.h>
39 #include <net/pkt_cls.h>
40 
41 /* HTB algorithm.
42     Author: devik@cdi.cz
43     ========================================================================
44     HTB is like TBF with multiple classes. It is also similar to CBQ because
45     it allows to assign priority to each class in hierarchy.
46     In fact it is another implementation of Floyd's formal sharing.
47 
48     Levels:
49     Each class is assigned level. Leaf has ALWAYS level 0 and root
50     classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
51     one less than their parent.
52 */
53 
54 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
55 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
56 
57 #if HTB_VER >> 16 != TC_HTB_PROTOVER
58 #error "Mismatched sch_htb.c and pkt_sch.h"
59 #endif
60 
61 /* Module parameter and sysfs export */
62 module_param    (htb_hysteresis, int, 0640);
63 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
64 
65 static int htb_rate_est = 0; /* htb classes have a default rate estimator */
66 module_param(htb_rate_est, int, 0640);
67 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
68 
69 /* used internaly to keep status of single class */
70 enum htb_cmode {
71 	HTB_CANT_SEND,		/* class can't send and can't borrow */
72 	HTB_MAY_BORROW,		/* class can't send but may borrow */
73 	HTB_CAN_SEND		/* class can send */
74 };
75 
76 struct htb_prio {
77 	union {
78 		struct rb_root	row;
79 		struct rb_root	feed;
80 	};
81 	struct rb_node	*ptr;
82 	/* When class changes from state 1->2 and disconnects from
83 	 * parent's feed then we lost ptr value and start from the
84 	 * first child again. Here we store classid of the
85 	 * last valid ptr (used when ptr is NULL).
86 	 */
87 	u32		last_ptr_id;
88 };
89 
90 /* interior & leaf nodes; props specific to leaves are marked L:
91  * To reduce false sharing, place mostly read fields at beginning,
92  * and mostly written ones at the end.
93  */
94 struct htb_class {
95 	struct Qdisc_class_common common;
96 	struct psched_ratecfg	rate;
97 	struct psched_ratecfg	ceil;
98 	s64			buffer, cbuffer;/* token bucket depth/rate */
99 	s64			mbuffer;	/* max wait time */
100 	u32			prio;		/* these two are used only by leaves... */
101 	int			quantum;	/* but stored for parent-to-leaf return */
102 
103 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
104 	struct tcf_block	*block;
105 
106 	int			level;		/* our level (see above) */
107 	unsigned int		children;
108 	struct htb_class	*parent;	/* parent class */
109 
110 	struct net_rate_estimator __rcu *rate_est;
111 
112 	/*
113 	 * Written often fields
114 	 */
115 	struct gnet_stats_basic_sync bstats;
116 	struct gnet_stats_basic_sync bstats_bias;
117 	u32			xstats_lends;
118 	u32			xstats_borrows;
119 
120 	/* token bucket parameters */
121 	s64			tokens, ctokens;/* current number of tokens */
122 	s64			t_c;		/* checkpoint time */
123 
124 	union {
125 		struct htb_class_leaf {
126 			int		deficit[TC_HTB_MAXDEPTH];
127 			struct Qdisc	*q;
128 			struct netdev_queue *offload_queue;
129 		} leaf;
130 		struct htb_class_inner {
131 			struct htb_prio clprio[TC_HTB_NUMPRIO];
132 		} inner;
133 	};
134 	s64			pq_key;
135 
136 	int			prio_activity;	/* for which prios are we active */
137 	enum htb_cmode		cmode;		/* current mode of the class */
138 	struct rb_node		pq_node;	/* node for event queue */
139 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
140 
141 	unsigned int drops ____cacheline_aligned_in_smp;
142 	unsigned int		overlimits;
143 };
144 
145 struct htb_level {
146 	struct rb_root	wait_pq;
147 	struct htb_prio hprio[TC_HTB_NUMPRIO];
148 };
149 
150 struct htb_sched {
151 	struct Qdisc_class_hash clhash;
152 	int			defcls;		/* class where unclassified flows go to */
153 	int			rate2quantum;	/* quant = rate / rate2quantum */
154 
155 	/* filters for qdisc itself */
156 	struct tcf_proto __rcu	*filter_list;
157 	struct tcf_block	*block;
158 
159 #define HTB_WARN_TOOMANYEVENTS	0x1
160 	unsigned int		warned;	/* only one warning */
161 	int			direct_qlen;
162 	struct work_struct	work;
163 
164 	/* non shaped skbs; let them go directly thru */
165 	struct qdisc_skb_head	direct_queue;
166 	u32			direct_pkts;
167 	u32			overlimits;
168 
169 	struct qdisc_watchdog	watchdog;
170 
171 	s64			now;	/* cached dequeue time */
172 
173 	/* time of nearest event per level (row) */
174 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
175 
176 	int			row_mask[TC_HTB_MAXDEPTH];
177 
178 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
179 
180 	struct Qdisc		**direct_qdiscs;
181 	unsigned int            num_direct_qdiscs;
182 
183 	bool			offload;
184 };
185 
186 /* find class in global hash table using given handle */
187 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
188 {
189 	struct htb_sched *q = qdisc_priv(sch);
190 	struct Qdisc_class_common *clc;
191 
192 	clc = qdisc_class_find(&q->clhash, handle);
193 	if (clc == NULL)
194 		return NULL;
195 	return container_of(clc, struct htb_class, common);
196 }
197 
198 static unsigned long htb_search(struct Qdisc *sch, u32 handle)
199 {
200 	return (unsigned long)htb_find(handle, sch);
201 }
202 
203 #define HTB_DIRECT ((struct htb_class *)-1L)
204 
205 /**
206  * htb_classify - classify a packet into class
207  * @skb: the socket buffer
208  * @sch: the active queue discipline
209  * @qerr: pointer for returned status code
210  *
211  * It returns NULL if the packet should be dropped or -1 if the packet
212  * should be passed directly thru. In all other cases leaf class is returned.
213  * We allow direct class selection by classid in priority. The we examine
214  * filters in qdisc and in inner nodes (if higher filter points to the inner
215  * node). If we end up with classid MAJOR:0 we enqueue the skb into special
216  * internal fifo (direct). These packets then go directly thru. If we still
217  * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
218  * then finish and return direct queue.
219  */
220 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
221 				      int *qerr)
222 {
223 	struct htb_sched *q = qdisc_priv(sch);
224 	struct htb_class *cl;
225 	struct tcf_result res;
226 	struct tcf_proto *tcf;
227 	int result;
228 
229 	/* allow to select class by setting skb->priority to valid classid;
230 	 * note that nfmark can be used too by attaching filter fw with no
231 	 * rules in it
232 	 */
233 	if (skb->priority == sch->handle)
234 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
235 	cl = htb_find(skb->priority, sch);
236 	if (cl) {
237 		if (cl->level == 0)
238 			return cl;
239 		/* Start with inner filter chain if a non-leaf class is selected */
240 		tcf = rcu_dereference_bh(cl->filter_list);
241 	} else {
242 		tcf = rcu_dereference_bh(q->filter_list);
243 	}
244 
245 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
246 	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
247 #ifdef CONFIG_NET_CLS_ACT
248 		switch (result) {
249 		case TC_ACT_QUEUED:
250 		case TC_ACT_STOLEN:
251 		case TC_ACT_TRAP:
252 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
253 			fallthrough;
254 		case TC_ACT_SHOT:
255 			return NULL;
256 		}
257 #endif
258 		cl = (void *)res.class;
259 		if (!cl) {
260 			if (res.classid == sch->handle)
261 				return HTB_DIRECT;	/* X:0 (direct flow) */
262 			cl = htb_find(res.classid, sch);
263 			if (!cl)
264 				break;	/* filter selected invalid classid */
265 		}
266 		if (!cl->level)
267 			return cl;	/* we hit leaf; return it */
268 
269 		/* we have got inner class; apply inner filter chain */
270 		tcf = rcu_dereference_bh(cl->filter_list);
271 	}
272 	/* classification failed; try to use default class */
273 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
274 	if (!cl || cl->level)
275 		return HTB_DIRECT;	/* bad default .. this is safe bet */
276 	return cl;
277 }
278 
279 /**
280  * htb_add_to_id_tree - adds class to the round robin list
281  * @root: the root of the tree
282  * @cl: the class to add
283  * @prio: the give prio in class
284  *
285  * Routine adds class to the list (actually tree) sorted by classid.
286  * Make sure that class is not already on such list for given prio.
287  */
288 static void htb_add_to_id_tree(struct rb_root *root,
289 			       struct htb_class *cl, int prio)
290 {
291 	struct rb_node **p = &root->rb_node, *parent = NULL;
292 
293 	while (*p) {
294 		struct htb_class *c;
295 		parent = *p;
296 		c = rb_entry(parent, struct htb_class, node[prio]);
297 
298 		if (cl->common.classid > c->common.classid)
299 			p = &parent->rb_right;
300 		else
301 			p = &parent->rb_left;
302 	}
303 	rb_link_node(&cl->node[prio], parent, p);
304 	rb_insert_color(&cl->node[prio], root);
305 }
306 
307 /**
308  * htb_add_to_wait_tree - adds class to the event queue with delay
309  * @q: the priority event queue
310  * @cl: the class to add
311  * @delay: delay in microseconds
312  *
313  * The class is added to priority event queue to indicate that class will
314  * change its mode in cl->pq_key microseconds. Make sure that class is not
315  * already in the queue.
316  */
317 static void htb_add_to_wait_tree(struct htb_sched *q,
318 				 struct htb_class *cl, s64 delay)
319 {
320 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
321 
322 	cl->pq_key = q->now + delay;
323 	if (cl->pq_key == q->now)
324 		cl->pq_key++;
325 
326 	/* update the nearest event cache */
327 	if (q->near_ev_cache[cl->level] > cl->pq_key)
328 		q->near_ev_cache[cl->level] = cl->pq_key;
329 
330 	while (*p) {
331 		struct htb_class *c;
332 		parent = *p;
333 		c = rb_entry(parent, struct htb_class, pq_node);
334 		if (cl->pq_key >= c->pq_key)
335 			p = &parent->rb_right;
336 		else
337 			p = &parent->rb_left;
338 	}
339 	rb_link_node(&cl->pq_node, parent, p);
340 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
341 }
342 
343 /**
344  * htb_next_rb_node - finds next node in binary tree
345  * @n: the current node in binary tree
346  *
347  * When we are past last key we return NULL.
348  * Average complexity is 2 steps per call.
349  */
350 static inline void htb_next_rb_node(struct rb_node **n)
351 {
352 	if (*n)
353 		*n = rb_next(*n);
354 }
355 
356 /**
357  * htb_add_class_to_row - add class to its row
358  * @q: the priority event queue
359  * @cl: the class to add
360  * @mask: the given priorities in class in bitmap
361  *
362  * The class is added to row at priorities marked in mask.
363  * It does nothing if mask == 0.
364  */
365 static inline void htb_add_class_to_row(struct htb_sched *q,
366 					struct htb_class *cl, int mask)
367 {
368 	q->row_mask[cl->level] |= mask;
369 	while (mask) {
370 		int prio = ffz(~mask);
371 		mask &= ~(1 << prio);
372 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
373 	}
374 }
375 
376 /* If this triggers, it is a bug in this code, but it need not be fatal */
377 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
378 {
379 	if (RB_EMPTY_NODE(rb)) {
380 		WARN_ON(1);
381 	} else {
382 		rb_erase(rb, root);
383 		RB_CLEAR_NODE(rb);
384 	}
385 }
386 
387 
388 /**
389  * htb_remove_class_from_row - removes class from its row
390  * @q: the priority event queue
391  * @cl: the class to add
392  * @mask: the given priorities in class in bitmap
393  *
394  * The class is removed from row at priorities marked in mask.
395  * It does nothing if mask == 0.
396  */
397 static inline void htb_remove_class_from_row(struct htb_sched *q,
398 						 struct htb_class *cl, int mask)
399 {
400 	int m = 0;
401 	struct htb_level *hlevel = &q->hlevel[cl->level];
402 
403 	while (mask) {
404 		int prio = ffz(~mask);
405 		struct htb_prio *hprio = &hlevel->hprio[prio];
406 
407 		mask &= ~(1 << prio);
408 		if (hprio->ptr == cl->node + prio)
409 			htb_next_rb_node(&hprio->ptr);
410 
411 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
412 		if (!hprio->row.rb_node)
413 			m |= 1 << prio;
414 	}
415 	q->row_mask[cl->level] &= ~m;
416 }
417 
418 /**
419  * htb_activate_prios - creates active classe's feed chain
420  * @q: the priority event queue
421  * @cl: the class to activate
422  *
423  * The class is connected to ancestors and/or appropriate rows
424  * for priorities it is participating on. cl->cmode must be new
425  * (activated) mode. It does nothing if cl->prio_activity == 0.
426  */
427 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
428 {
429 	struct htb_class *p = cl->parent;
430 	long m, mask = cl->prio_activity;
431 
432 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
433 		m = mask;
434 		while (m) {
435 			unsigned int prio = ffz(~m);
436 
437 			if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio)))
438 				break;
439 			m &= ~(1 << prio);
440 
441 			if (p->inner.clprio[prio].feed.rb_node)
442 				/* parent already has its feed in use so that
443 				 * reset bit in mask as parent is already ok
444 				 */
445 				mask &= ~(1 << prio);
446 
447 			htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio);
448 		}
449 		p->prio_activity |= mask;
450 		cl = p;
451 		p = cl->parent;
452 
453 	}
454 	if (cl->cmode == HTB_CAN_SEND && mask)
455 		htb_add_class_to_row(q, cl, mask);
456 }
457 
458 /**
459  * htb_deactivate_prios - remove class from feed chain
460  * @q: the priority event queue
461  * @cl: the class to deactivate
462  *
463  * cl->cmode must represent old mode (before deactivation). It does
464  * nothing if cl->prio_activity == 0. Class is removed from all feed
465  * chains and rows.
466  */
467 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
468 {
469 	struct htb_class *p = cl->parent;
470 	long m, mask = cl->prio_activity;
471 
472 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
473 		m = mask;
474 		mask = 0;
475 		while (m) {
476 			int prio = ffz(~m);
477 			m &= ~(1 << prio);
478 
479 			if (p->inner.clprio[prio].ptr == cl->node + prio) {
480 				/* we are removing child which is pointed to from
481 				 * parent feed - forget the pointer but remember
482 				 * classid
483 				 */
484 				p->inner.clprio[prio].last_ptr_id = cl->common.classid;
485 				p->inner.clprio[prio].ptr = NULL;
486 			}
487 
488 			htb_safe_rb_erase(cl->node + prio,
489 					  &p->inner.clprio[prio].feed);
490 
491 			if (!p->inner.clprio[prio].feed.rb_node)
492 				mask |= 1 << prio;
493 		}
494 
495 		p->prio_activity &= ~mask;
496 		cl = p;
497 		p = cl->parent;
498 
499 	}
500 	if (cl->cmode == HTB_CAN_SEND && mask)
501 		htb_remove_class_from_row(q, cl, mask);
502 }
503 
504 static inline s64 htb_lowater(const struct htb_class *cl)
505 {
506 	if (htb_hysteresis)
507 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
508 	else
509 		return 0;
510 }
511 static inline s64 htb_hiwater(const struct htb_class *cl)
512 {
513 	if (htb_hysteresis)
514 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
515 	else
516 		return 0;
517 }
518 
519 
520 /**
521  * htb_class_mode - computes and returns current class mode
522  * @cl: the target class
523  * @diff: diff time in microseconds
524  *
525  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
526  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
527  * from now to time when cl will change its state.
528  * Also it is worth to note that class mode doesn't change simply
529  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
530  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
531  * mode transitions per time unit. The speed gain is about 1/6.
532  */
533 static inline enum htb_cmode
534 htb_class_mode(struct htb_class *cl, s64 *diff)
535 {
536 	s64 toks;
537 
538 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
539 		*diff = -toks;
540 		return HTB_CANT_SEND;
541 	}
542 
543 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
544 		return HTB_CAN_SEND;
545 
546 	*diff = -toks;
547 	return HTB_MAY_BORROW;
548 }
549 
550 /**
551  * htb_change_class_mode - changes classe's mode
552  * @q: the priority event queue
553  * @cl: the target class
554  * @diff: diff time in microseconds
555  *
556  * This should be the only way how to change classe's mode under normal
557  * circumstances. Routine will update feed lists linkage, change mode
558  * and add class to the wait event queue if appropriate. New mode should
559  * be different from old one and cl->pq_key has to be valid if changing
560  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
561  */
562 static void
563 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
564 {
565 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
566 
567 	if (new_mode == cl->cmode)
568 		return;
569 
570 	if (new_mode == HTB_CANT_SEND) {
571 		WRITE_ONCE(cl->overlimits, cl->overlimits + 1);
572 		WRITE_ONCE(q->overlimits, q->overlimits + 1);
573 	}
574 
575 	if (cl->prio_activity) {	/* not necessary: speed optimization */
576 		if (cl->cmode != HTB_CANT_SEND)
577 			htb_deactivate_prios(q, cl);
578 		cl->cmode = new_mode;
579 		if (new_mode != HTB_CANT_SEND)
580 			htb_activate_prios(q, cl);
581 	} else
582 		cl->cmode = new_mode;
583 }
584 
585 /**
586  * htb_activate - inserts leaf cl into appropriate active feeds
587  * @q: the priority event queue
588  * @cl: the target class
589  *
590  * Routine learns (new) priority of leaf and activates feed chain
591  * for the prio. It can be called on already active leaf safely.
592  * It also adds leaf into droplist.
593  */
594 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
595 {
596 	WARN_ON(cl->level || !cl->leaf.q);
597 
598 	if (!cl->prio_activity) {
599 		cl->prio_activity = 1 << cl->prio;
600 		htb_activate_prios(q, cl);
601 	}
602 }
603 
604 /**
605  * htb_deactivate - remove leaf cl from active feeds
606  * @q: the priority event queue
607  * @cl: the target class
608  *
609  * Make sure that leaf is active. In the other words it can't be called
610  * with non-active leaf. It also removes class from the drop list.
611  */
612 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
613 {
614 	if (!cl->prio_activity)
615 		return;
616 	htb_deactivate_prios(q, cl);
617 	cl->prio_activity = 0;
618 }
619 
620 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
621 		       struct sk_buff **to_free)
622 {
623 	int ret;
624 	unsigned int len = qdisc_pkt_len(skb);
625 	struct htb_sched *q = qdisc_priv(sch);
626 	struct htb_class *cl = htb_classify(skb, sch, &ret);
627 
628 	if (cl == HTB_DIRECT) {
629 		/* enqueue to helper queue */
630 		if (q->direct_queue.qlen < q->direct_qlen) {
631 			__qdisc_enqueue_tail(skb, &q->direct_queue);
632 			WRITE_ONCE(q->direct_pkts, q->direct_pkts + 1);
633 		} else {
634 			return qdisc_drop(skb, sch, to_free);
635 		}
636 #ifdef CONFIG_NET_CLS_ACT
637 	} else if (!cl) {
638 		if (ret & __NET_XMIT_BYPASS)
639 			qdisc_qstats_drop(sch);
640 		__qdisc_drop(skb, to_free);
641 		return ret;
642 #endif
643 	} else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
644 					to_free)) != NET_XMIT_SUCCESS) {
645 		if (net_xmit_drop_count(ret)) {
646 			qdisc_qstats_drop(sch);
647 			WRITE_ONCE(cl->drops, cl->drops + 1);
648 		}
649 		return ret;
650 	} else {
651 		htb_activate(q, cl);
652 	}
653 
654 	qstats_backlog_add(sch, len);
655 	qdisc_qlen_inc(sch);
656 	return NET_XMIT_SUCCESS;
657 }
658 
659 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
660 {
661 	s64 toks = diff + cl->tokens;
662 
663 	if (toks > cl->buffer)
664 		toks = cl->buffer;
665 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
666 	if (toks <= -cl->mbuffer)
667 		toks = 1 - cl->mbuffer;
668 
669 	WRITE_ONCE(cl->tokens, toks);
670 }
671 
672 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
673 {
674 	s64 toks = diff + cl->ctokens;
675 
676 	if (toks > cl->cbuffer)
677 		toks = cl->cbuffer;
678 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
679 	if (toks <= -cl->mbuffer)
680 		toks = 1 - cl->mbuffer;
681 
682 	WRITE_ONCE(cl->ctokens, toks);
683 }
684 
685 /**
686  * htb_charge_class - charges amount "bytes" to leaf and ancestors
687  * @q: the priority event queue
688  * @cl: the class to start iterate
689  * @level: the minimum level to account
690  * @skb: the socket buffer
691  *
692  * Routine assumes that packet "bytes" long was dequeued from leaf cl
693  * borrowing from "level". It accounts bytes to ceil leaky bucket for
694  * leaf and all ancestors and to rate bucket for ancestors at levels
695  * "level" and higher. It also handles possible change of mode resulting
696  * from the update. Note that mode can also increase here (MAY_BORROW to
697  * CAN_SEND) because we can use more precise clock that event queue here.
698  * In such case we remove class from event queue first.
699  */
700 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
701 			     int level, struct sk_buff *skb)
702 {
703 	int bytes = qdisc_pkt_len(skb);
704 	enum htb_cmode old_mode;
705 	s64 diff;
706 
707 	while (cl) {
708 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
709 		if (cl->level >= level) {
710 			if (cl->level == level)
711 				WRITE_ONCE(cl->xstats_lends, cl->xstats_lends + 1);
712 			htb_accnt_tokens(cl, bytes, diff);
713 		} else {
714 			WRITE_ONCE(cl->xstats_borrows, cl->xstats_borrows + 1);
715 			/* we moved t_c; update tokens */
716 			WRITE_ONCE(cl->tokens, cl->tokens + diff);
717 		}
718 		htb_accnt_ctokens(cl, bytes, diff);
719 		cl->t_c = q->now;
720 
721 		old_mode = cl->cmode;
722 		diff = 0;
723 		htb_change_class_mode(q, cl, &diff);
724 		if (old_mode != cl->cmode) {
725 			if (old_mode != HTB_CAN_SEND)
726 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
727 			if (cl->cmode != HTB_CAN_SEND)
728 				htb_add_to_wait_tree(q, cl, diff);
729 		}
730 
731 		/* update basic stats except for leaves which are already updated */
732 		if (cl->level)
733 			bstats_update(&cl->bstats, skb);
734 
735 		cl = cl->parent;
736 	}
737 }
738 
739 /**
740  * htb_do_events - make mode changes to classes at the level
741  * @q: the priority event queue
742  * @level: which wait_pq in 'q->hlevel'
743  * @start: start jiffies
744  *
745  * Scans event queue for pending events and applies them. Returns time of
746  * next pending event (0 for no event in pq, q->now for too many events).
747  * Note: Applied are events whose have cl->pq_key <= q->now.
748  */
749 static s64 htb_do_events(struct htb_sched *q, const int level,
750 			 unsigned long start)
751 {
752 	/* don't run for longer than 2 jiffies; 2 is used instead of
753 	 * 1 to simplify things when jiffy is going to be incremented
754 	 * too soon
755 	 */
756 	unsigned long stop_at = start + 2;
757 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
758 
759 	while (time_before(jiffies, stop_at)) {
760 		struct htb_class *cl;
761 		s64 diff;
762 		struct rb_node *p = rb_first(wait_pq);
763 
764 		if (!p)
765 			return 0;
766 
767 		cl = rb_entry(p, struct htb_class, pq_node);
768 		if (cl->pq_key > q->now)
769 			return cl->pq_key;
770 
771 		htb_safe_rb_erase(p, wait_pq);
772 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
773 		htb_change_class_mode(q, cl, &diff);
774 		if (cl->cmode != HTB_CAN_SEND)
775 			htb_add_to_wait_tree(q, cl, diff);
776 	}
777 
778 	/* too much load - let's continue after a break for scheduling */
779 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
780 		pr_warn("htb: too many events!\n");
781 		q->warned |= HTB_WARN_TOOMANYEVENTS;
782 	}
783 
784 	return q->now;
785 }
786 
787 /* Returns class->node+prio from id-tree where classe's id is >= id. NULL
788  * is no such one exists.
789  */
790 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
791 					      u32 id)
792 {
793 	struct rb_node *r = NULL;
794 	while (n) {
795 		struct htb_class *cl =
796 		    rb_entry(n, struct htb_class, node[prio]);
797 
798 		if (id > cl->common.classid) {
799 			n = n->rb_right;
800 		} else if (id < cl->common.classid) {
801 			r = n;
802 			n = n->rb_left;
803 		} else {
804 			return n;
805 		}
806 	}
807 	return r;
808 }
809 
810 /**
811  * htb_lookup_leaf - returns next leaf class in DRR order
812  * @hprio: the current one
813  * @prio: which prio in class
814  *
815  * Find leaf where current feed pointers points to.
816  */
817 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
818 {
819 	int i;
820 	struct {
821 		struct rb_node *root;
822 		struct rb_node **pptr;
823 		u32 *pid;
824 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
825 
826 	if (unlikely(!hprio->row.rb_node))
827 		return NULL;
828 
829 	sp->root = hprio->row.rb_node;
830 	sp->pptr = &hprio->ptr;
831 	sp->pid = &hprio->last_ptr_id;
832 
833 	for (i = 0; i < 65535; i++) {
834 		if (!*sp->pptr && *sp->pid) {
835 			/* ptr was invalidated but id is valid - try to recover
836 			 * the original or next ptr
837 			 */
838 			*sp->pptr =
839 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
840 		}
841 		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
842 				 * can become out of date quickly
843 				 */
844 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
845 			*sp->pptr = sp->root;
846 			while ((*sp->pptr)->rb_left)
847 				*sp->pptr = (*sp->pptr)->rb_left;
848 			if (sp > stk) {
849 				sp--;
850 				if (!*sp->pptr) {
851 					WARN_ON(1);
852 					return NULL;
853 				}
854 				htb_next_rb_node(sp->pptr);
855 			}
856 		} else {
857 			struct htb_class *cl;
858 			struct htb_prio *clp;
859 
860 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
861 			if (!cl->level)
862 				return cl;
863 			clp = &cl->inner.clprio[prio];
864 			(++sp)->root = clp->feed.rb_node;
865 			sp->pptr = &clp->ptr;
866 			sp->pid = &clp->last_ptr_id;
867 		}
868 	}
869 	WARN_ON(1);
870 	return NULL;
871 }
872 
873 /* dequeues packet at given priority and level; call only if
874  * you are sure that there is active class at prio/level
875  */
876 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
877 					const int level)
878 {
879 	struct sk_buff *skb = NULL;
880 	struct htb_class *cl, *start;
881 	struct htb_level *hlevel = &q->hlevel[level];
882 	struct htb_prio *hprio = &hlevel->hprio[prio];
883 
884 	/* look initial class up in the row */
885 	start = cl = htb_lookup_leaf(hprio, prio);
886 
887 	do {
888 next:
889 		if (unlikely(!cl))
890 			return NULL;
891 
892 		/* class can be empty - it is unlikely but can be true if leaf
893 		 * qdisc drops packets in enqueue routine or if someone used
894 		 * graft operation on the leaf since last dequeue;
895 		 * simply deactivate and skip such class
896 		 */
897 		if (unlikely(cl->leaf.q->q.qlen == 0)) {
898 			struct htb_class *next;
899 			htb_deactivate(q, cl);
900 
901 			/* row/level might become empty */
902 			if ((q->row_mask[level] & (1 << prio)) == 0)
903 				return NULL;
904 
905 			next = htb_lookup_leaf(hprio, prio);
906 
907 			if (cl == start)	/* fix start if we just deleted it */
908 				start = next;
909 			cl = next;
910 			goto next;
911 		}
912 
913 		skb = cl->leaf.q->dequeue(cl->leaf.q);
914 		if (likely(skb != NULL))
915 			break;
916 
917 		qdisc_warn_nonwc("htb", cl->leaf.q);
918 		htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr:
919 					 &q->hlevel[0].hprio[prio].ptr);
920 		cl = htb_lookup_leaf(hprio, prio);
921 
922 	} while (cl != start);
923 
924 	if (likely(skb != NULL)) {
925 		bstats_update(&cl->bstats, skb);
926 		cl->leaf.deficit[level] -= qdisc_pkt_len(skb);
927 		if (cl->leaf.deficit[level] < 0) {
928 			cl->leaf.deficit[level] += cl->quantum;
929 			htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr :
930 						 &q->hlevel[0].hprio[prio].ptr);
931 		}
932 		/* this used to be after charge_class but this constelation
933 		 * gives us slightly better performance
934 		 */
935 		if (!cl->leaf.q->q.qlen)
936 			htb_deactivate(q, cl);
937 		htb_charge_class(q, cl, level, skb);
938 	}
939 	return skb;
940 }
941 
942 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
943 {
944 	struct sk_buff *skb;
945 	struct htb_sched *q = qdisc_priv(sch);
946 	int level;
947 	s64 next_event;
948 	unsigned long start_at;
949 
950 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
951 	skb = __qdisc_dequeue_head(&q->direct_queue);
952 	if (skb != NULL) {
953 ok:
954 		qdisc_bstats_update(sch, skb);
955 		qdisc_qstats_backlog_dec(sch, skb);
956 		qdisc_qlen_dec(sch);
957 		return skb;
958 	}
959 
960 	if (!sch->q.qlen)
961 		goto fin;
962 	q->now = ktime_get_ns();
963 	start_at = jiffies;
964 
965 	next_event = q->now + 5LLU * NSEC_PER_SEC;
966 
967 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
968 		/* common case optimization - skip event handler quickly */
969 		int m;
970 		s64 event = q->near_ev_cache[level];
971 
972 		if (q->now >= event) {
973 			event = htb_do_events(q, level, start_at);
974 			if (!event)
975 				event = q->now + NSEC_PER_SEC;
976 			q->near_ev_cache[level] = event;
977 		}
978 
979 		if (next_event > event)
980 			next_event = event;
981 
982 		m = ~q->row_mask[level];
983 		while (m != (int)(-1)) {
984 			int prio = ffz(m);
985 
986 			m |= 1 << prio;
987 			skb = htb_dequeue_tree(q, prio, level);
988 			if (likely(skb != NULL))
989 				goto ok;
990 		}
991 	}
992 	if (likely(next_event > q->now))
993 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
994 	else
995 		schedule_work(&q->work);
996 fin:
997 	return skb;
998 }
999 
1000 /* reset all classes */
1001 /* always caled under BH & queue lock */
1002 static void htb_reset(struct Qdisc *sch)
1003 {
1004 	struct htb_sched *q = qdisc_priv(sch);
1005 	struct htb_class *cl;
1006 	unsigned int i;
1007 
1008 	for (i = 0; i < q->clhash.hashsize; i++) {
1009 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1010 			if (cl->level)
1011 				memset(&cl->inner, 0, sizeof(cl->inner));
1012 			else {
1013 				if (cl->leaf.q && !q->offload)
1014 					qdisc_reset(cl->leaf.q);
1015 			}
1016 			cl->prio_activity = 0;
1017 			cl->cmode = HTB_CAN_SEND;
1018 		}
1019 	}
1020 	qdisc_watchdog_cancel(&q->watchdog);
1021 	__qdisc_reset_queue(&q->direct_queue);
1022 	memset(q->hlevel, 0, sizeof(q->hlevel));
1023 	memset(q->row_mask, 0, sizeof(q->row_mask));
1024 }
1025 
1026 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1027 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1028 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1029 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1030 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1031 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1032 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
1033 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
1034 	[TCA_HTB_OFFLOAD] = { .type = NLA_FLAG },
1035 };
1036 
1037 static void htb_work_func(struct work_struct *work)
1038 {
1039 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1040 	struct Qdisc *sch = q->watchdog.qdisc;
1041 
1042 	rcu_read_lock();
1043 	__netif_schedule(qdisc_root(sch));
1044 	rcu_read_unlock();
1045 }
1046 
1047 static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt)
1048 {
1049 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt);
1050 }
1051 
1052 static int htb_init(struct Qdisc *sch, struct nlattr *opt,
1053 		    struct netlink_ext_ack *extack)
1054 {
1055 	struct net_device *dev = qdisc_dev(sch);
1056 	struct tc_htb_qopt_offload offload_opt;
1057 	struct htb_sched *q = qdisc_priv(sch);
1058 	struct nlattr *tb[TCA_HTB_MAX + 1];
1059 	struct tc_htb_glob *gopt;
1060 	unsigned int ntx;
1061 	bool offload;
1062 	int err;
1063 
1064 	qdisc_watchdog_init(&q->watchdog, sch);
1065 	INIT_WORK(&q->work, htb_work_func);
1066 
1067 	if (!opt)
1068 		return -EINVAL;
1069 
1070 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
1071 	if (err)
1072 		return err;
1073 
1074 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1075 					  NULL);
1076 	if (err < 0)
1077 		return err;
1078 
1079 	if (!tb[TCA_HTB_INIT])
1080 		return -EINVAL;
1081 
1082 	gopt = nla_data(tb[TCA_HTB_INIT]);
1083 	if (gopt->version != HTB_VER >> 16)
1084 		return -EINVAL;
1085 
1086 	offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
1087 
1088 	if (offload) {
1089 		if (sch->parent != TC_H_ROOT) {
1090 			NL_SET_ERR_MSG(extack, "HTB must be the root qdisc to use offload");
1091 			return -EOPNOTSUPP;
1092 		}
1093 
1094 		if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) {
1095 			NL_SET_ERR_MSG(extack, "hw-tc-offload ethtool feature flag must be on");
1096 			return -EOPNOTSUPP;
1097 		}
1098 
1099 		q->num_direct_qdiscs = dev->real_num_tx_queues;
1100 		q->direct_qdiscs = kzalloc_objs(*q->direct_qdiscs,
1101 						q->num_direct_qdiscs);
1102 		if (!q->direct_qdiscs)
1103 			return -ENOMEM;
1104 	}
1105 
1106 	err = qdisc_class_hash_init(&q->clhash);
1107 	if (err < 0)
1108 		return err;
1109 
1110 	if (tb[TCA_HTB_DIRECT_QLEN])
1111 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1112 	else
1113 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1114 
1115 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1116 		q->rate2quantum = 1;
1117 	q->defcls = gopt->defcls;
1118 
1119 	if (!offload)
1120 		return 0;
1121 
1122 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1123 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1124 		struct Qdisc *qdisc;
1125 
1126 		qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1127 					  TC_H_MAKE(sch->handle, 0), extack);
1128 		if (!qdisc) {
1129 			return -ENOMEM;
1130 		}
1131 
1132 		q->direct_qdiscs[ntx] = qdisc;
1133 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1134 	}
1135 
1136 	sch->flags |= TCQ_F_MQROOT;
1137 
1138 	offload_opt = (struct tc_htb_qopt_offload) {
1139 		.command = TC_HTB_CREATE,
1140 		.parent_classid = TC_H_MAJ(sch->handle) >> 16,
1141 		.classid = TC_H_MIN(q->defcls),
1142 		.extack = extack,
1143 	};
1144 	err = htb_offload(dev, &offload_opt);
1145 	if (err)
1146 		return err;
1147 
1148 	/* Defer this assignment, so that htb_destroy skips offload-related
1149 	 * parts (especially calling ndo_setup_tc) on errors.
1150 	 */
1151 	q->offload = true;
1152 	sch->flags |= TCQ_F_OFFLOADED;
1153 
1154 	return 0;
1155 }
1156 
1157 static void htb_attach_offload(struct Qdisc *sch)
1158 {
1159 	struct net_device *dev = qdisc_dev(sch);
1160 	struct htb_sched *q = qdisc_priv(sch);
1161 	unsigned int ntx;
1162 
1163 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1164 		struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx];
1165 
1166 		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
1167 		qdisc_put(old);
1168 		qdisc_hash_add(qdisc, false);
1169 	}
1170 	for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) {
1171 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1172 		struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL);
1173 
1174 		qdisc_put(old);
1175 	}
1176 
1177 	kfree(q->direct_qdiscs);
1178 	q->direct_qdiscs = NULL;
1179 }
1180 
1181 static void htb_attach_software(struct Qdisc *sch)
1182 {
1183 	struct net_device *dev = qdisc_dev(sch);
1184 	unsigned int ntx;
1185 
1186 	/* Resemble qdisc_graft behavior. */
1187 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
1188 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1189 		struct Qdisc *old = dev_graft_qdisc(dev_queue, sch);
1190 
1191 		qdisc_refcount_inc(sch);
1192 
1193 		qdisc_put(old);
1194 	}
1195 }
1196 
1197 static void htb_attach(struct Qdisc *sch)
1198 {
1199 	struct htb_sched *q = qdisc_priv(sch);
1200 
1201 	if (q->offload)
1202 		htb_attach_offload(sch);
1203 	else
1204 		htb_attach_software(sch);
1205 }
1206 
1207 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1208 {
1209 	struct htb_sched *q = qdisc_priv(sch);
1210 	struct nlattr *nest;
1211 	struct tc_htb_glob gopt;
1212 
1213 	sch->qstats.overlimits = READ_ONCE(q->overlimits);
1214 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1215 	 * no change can happen on the qdisc parameters.
1216 	 */
1217 
1218 	gopt.direct_pkts = READ_ONCE(q->direct_pkts);
1219 	gopt.version = HTB_VER;
1220 	gopt.rate2quantum = q->rate2quantum;
1221 	gopt.defcls = q->defcls;
1222 	gopt.debug = 0;
1223 
1224 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1225 	if (nest == NULL)
1226 		goto nla_put_failure;
1227 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1228 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1229 		goto nla_put_failure;
1230 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1231 		goto nla_put_failure;
1232 
1233 	return nla_nest_end(skb, nest);
1234 
1235 nla_put_failure:
1236 	nla_nest_cancel(skb, nest);
1237 	return -1;
1238 }
1239 
1240 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1241 			  struct sk_buff *skb, struct tcmsg *tcm)
1242 {
1243 	struct htb_class *cl = (struct htb_class *)arg;
1244 	struct htb_sched *q = qdisc_priv(sch);
1245 	struct nlattr *nest;
1246 	struct tc_htb_opt opt;
1247 
1248 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1249 	 * no change can happen on the class parameters.
1250 	 */
1251 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1252 	tcm->tcm_handle = cl->common.classid;
1253 	if (!cl->level && cl->leaf.q)
1254 		tcm->tcm_info = cl->leaf.q->handle;
1255 
1256 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1257 	if (nest == NULL)
1258 		goto nla_put_failure;
1259 
1260 	memset(&opt, 0, sizeof(opt));
1261 
1262 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1263 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1264 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1265 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1266 	opt.quantum = cl->quantum;
1267 	opt.prio = cl->prio;
1268 	opt.level = cl->level;
1269 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1270 		goto nla_put_failure;
1271 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1272 		goto nla_put_failure;
1273 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1274 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1275 			      TCA_HTB_PAD))
1276 		goto nla_put_failure;
1277 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1278 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1279 			      TCA_HTB_PAD))
1280 		goto nla_put_failure;
1281 
1282 	return nla_nest_end(skb, nest);
1283 
1284 nla_put_failure:
1285 	nla_nest_cancel(skb, nest);
1286 	return -1;
1287 }
1288 
1289 static void htb_offload_aggregate_stats(struct htb_sched *q,
1290 					struct htb_class *cl)
1291 {
1292 	u64 bytes = 0, packets = 0;
1293 	struct htb_class *c;
1294 	unsigned int i;
1295 
1296 	for (i = 0; i < q->clhash.hashsize; i++) {
1297 		hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) {
1298 			struct htb_class *p = c;
1299 
1300 			while (p && p->level < cl->level)
1301 				p = p->parent;
1302 
1303 			if (p != cl)
1304 				continue;
1305 
1306 			bytes += u64_stats_read(&c->bstats_bias.bytes);
1307 			packets += u64_stats_read(&c->bstats_bias.packets);
1308 			if (c->level == 0) {
1309 				bytes += u64_stats_read(&c->leaf.q->bstats.bytes);
1310 				packets += u64_stats_read(&c->leaf.q->bstats.packets);
1311 			}
1312 		}
1313 	}
1314 	_bstats_set(&cl->bstats, bytes, packets);
1315 }
1316 
1317 static int
1318 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1319 {
1320 	struct htb_class *cl = (struct htb_class *)arg;
1321 	struct htb_sched *q = qdisc_priv(sch);
1322 	struct tc_htb_xstats xstats = {
1323 		.lends = READ_ONCE(cl->xstats_lends),
1324 		.borrows = READ_ONCE(cl->xstats_borrows),
1325 	};
1326 	struct gnet_stats_queue qs = {
1327 		.drops = READ_ONCE(cl->drops),
1328 		.overlimits = READ_ONCE(cl->overlimits),
1329 	};
1330 	__u32 qlen = 0;
1331 
1332 	if (!cl->level && cl->leaf.q)
1333 		qdisc_qstats_qlen_backlog(cl->leaf.q, &qlen, &qs.backlog);
1334 
1335 	xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(READ_ONCE(cl->tokens)),
1336 				INT_MIN, INT_MAX);
1337 	xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(READ_ONCE(cl->ctokens)),
1338 				 INT_MIN, INT_MAX);
1339 
1340 	if (q->offload) {
1341 		spin_lock_bh(qdisc_lock(sch));
1342 		if (!cl->level) {
1343 			u64 bytes = 0, packets = 0;
1344 
1345 			if (cl->leaf.q) {
1346 				bytes = u64_stats_read(&cl->leaf.q->bstats.bytes);
1347 				packets = u64_stats_read(&cl->leaf.q->bstats.packets);
1348 			}
1349 			bytes += u64_stats_read(&cl->bstats_bias.bytes);
1350 			packets += u64_stats_read(&cl->bstats_bias.packets);
1351 			_bstats_set(&cl->bstats, bytes, packets);
1352 		} else {
1353 			htb_offload_aggregate_stats(q, cl);
1354 		}
1355 		spin_unlock_bh(qdisc_lock(sch));
1356 	}
1357 
1358 	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1359 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1360 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1361 		return -1;
1362 
1363 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
1364 }
1365 
1366 static struct netdev_queue *
1367 htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
1368 {
1369 	struct net_device *dev = qdisc_dev(sch);
1370 	struct tc_htb_qopt_offload offload_opt;
1371 	struct htb_sched *q = qdisc_priv(sch);
1372 	int err;
1373 
1374 	if (!q->offload)
1375 		return sch->dev_queue;
1376 
1377 	offload_opt = (struct tc_htb_qopt_offload) {
1378 		.command = TC_HTB_LEAF_QUERY_QUEUE,
1379 		.classid = TC_H_MIN(tcm->tcm_parent),
1380 	};
1381 	err = htb_offload(dev, &offload_opt);
1382 	if (err || offload_opt.qid >= dev->num_tx_queues)
1383 		return NULL;
1384 	return netdev_get_tx_queue(dev, offload_opt.qid);
1385 }
1386 
1387 static struct Qdisc *
1388 htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q)
1389 {
1390 	struct net_device *dev = dev_queue->dev;
1391 	struct Qdisc *old_q;
1392 
1393 	if (dev->flags & IFF_UP)
1394 		dev_deactivate(dev, false);
1395 	old_q = dev_graft_qdisc(dev_queue, new_q);
1396 	if (new_q)
1397 		new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1398 	if (dev->flags & IFF_UP)
1399 		dev_activate(dev);
1400 
1401 	return old_q;
1402 }
1403 
1404 static struct netdev_queue *htb_offload_get_queue(struct htb_class *cl)
1405 {
1406 	struct netdev_queue *queue;
1407 
1408 	queue = cl->leaf.offload_queue;
1409 	if (!(cl->leaf.q->flags & TCQ_F_BUILTIN))
1410 		WARN_ON(cl->leaf.q->dev_queue != queue);
1411 
1412 	return queue;
1413 }
1414 
1415 static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old,
1416 				   struct htb_class *cl_new, bool destroying)
1417 {
1418 	struct netdev_queue *queue_old, *queue_new;
1419 	struct net_device *dev = qdisc_dev(sch);
1420 
1421 	queue_old = htb_offload_get_queue(cl_old);
1422 	queue_new = htb_offload_get_queue(cl_new);
1423 
1424 	if (!destroying) {
1425 		struct Qdisc *qdisc;
1426 
1427 		if (dev->flags & IFF_UP)
1428 			dev_deactivate(dev, false);
1429 		qdisc = dev_graft_qdisc(queue_old, NULL);
1430 		WARN_ON(qdisc != cl_old->leaf.q);
1431 	}
1432 
1433 	if (!(cl_old->leaf.q->flags & TCQ_F_BUILTIN))
1434 		cl_old->leaf.q->dev_queue = queue_new;
1435 	cl_old->leaf.offload_queue = queue_new;
1436 
1437 	if (!destroying) {
1438 		struct Qdisc *qdisc;
1439 
1440 		qdisc = dev_graft_qdisc(queue_new, cl_old->leaf.q);
1441 		if (dev->flags & IFF_UP)
1442 			dev_activate(dev);
1443 		WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
1444 	}
1445 }
1446 
1447 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1448 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1449 {
1450 	struct netdev_queue *dev_queue = sch->dev_queue;
1451 	struct htb_class *cl = (struct htb_class *)arg;
1452 	struct htb_sched *q = qdisc_priv(sch);
1453 	struct Qdisc *old_q;
1454 
1455 	if (cl->level)
1456 		return -EINVAL;
1457 
1458 	if (q->offload)
1459 		dev_queue = htb_offload_get_queue(cl);
1460 
1461 	if (!new) {
1462 		new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1463 					cl->common.classid, extack);
1464 		if (!new)
1465 			return -ENOBUFS;
1466 	}
1467 
1468 	if (q->offload) {
1469 		/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1470 		qdisc_refcount_inc(new);
1471 		old_q = htb_graft_helper(dev_queue, new);
1472 	}
1473 
1474 	*old = qdisc_replace(sch, new, &cl->leaf.q);
1475 
1476 	if (q->offload) {
1477 		WARN_ON(old_q != *old);
1478 		qdisc_put(old_q);
1479 	}
1480 
1481 	return 0;
1482 }
1483 
1484 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1485 {
1486 	struct htb_class *cl = (struct htb_class *)arg;
1487 	return !cl->level ? cl->leaf.q : NULL;
1488 }
1489 
1490 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1491 {
1492 	struct htb_class *cl = (struct htb_class *)arg;
1493 
1494 	htb_deactivate(qdisc_priv(sch), cl);
1495 }
1496 
1497 static inline int htb_parent_last_child(struct htb_class *cl)
1498 {
1499 	if (!cl->parent)
1500 		/* the root class */
1501 		return 0;
1502 	if (cl->parent->children > 1)
1503 		/* not the last child */
1504 		return 0;
1505 	return 1;
1506 }
1507 
1508 static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
1509 			       struct Qdisc *new_q)
1510 {
1511 	struct htb_sched *q = qdisc_priv(sch);
1512 	struct htb_class *parent = cl->parent;
1513 
1514 	WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
1515 
1516 	if (parent->cmode != HTB_CAN_SEND)
1517 		htb_safe_rb_erase(&parent->pq_node,
1518 				  &q->hlevel[parent->level].wait_pq);
1519 
1520 	parent->level = 0;
1521 	memset(&parent->inner, 0, sizeof(parent->inner));
1522 	parent->leaf.q = new_q ? new_q : &noop_qdisc;
1523 	WRITE_ONCE(parent->tokens, parent->buffer);
1524 	WRITE_ONCE(parent->ctokens, parent->cbuffer);
1525 	parent->t_c = ktime_get_ns();
1526 	parent->cmode = HTB_CAN_SEND;
1527 	if (q->offload)
1528 		parent->leaf.offload_queue = cl->leaf.offload_queue;
1529 }
1530 
1531 static void htb_parent_to_leaf_offload(struct Qdisc *sch,
1532 				       struct netdev_queue *dev_queue,
1533 				       struct Qdisc *new_q)
1534 {
1535 	struct Qdisc *old_q;
1536 
1537 	/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1538 	if (new_q)
1539 		qdisc_refcount_inc(new_q);
1540 	old_q = htb_graft_helper(dev_queue, new_q);
1541 	WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1542 }
1543 
1544 static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
1545 				     bool last_child, bool destroying,
1546 				     struct netlink_ext_ack *extack)
1547 {
1548 	struct tc_htb_qopt_offload offload_opt;
1549 	struct netdev_queue *dev_queue;
1550 	struct Qdisc *q = cl->leaf.q;
1551 	struct Qdisc *old;
1552 	int err;
1553 
1554 	if (cl->level)
1555 		return -EINVAL;
1556 
1557 	WARN_ON(!q);
1558 	dev_queue = htb_offload_get_queue(cl);
1559 	/* When destroying, caller qdisc_graft grafts the new qdisc and invokes
1560 	 * qdisc_put for the qdisc being destroyed. htb_destroy_class_offload
1561 	 * does not need to graft or qdisc_put the qdisc being destroyed.
1562 	 */
1563 	if (!destroying) {
1564 		old = htb_graft_helper(dev_queue, NULL);
1565 		/* Last qdisc grafted should be the same as cl->leaf.q when
1566 		 * calling htb_delete.
1567 		 */
1568 		WARN_ON(old != q);
1569 	}
1570 
1571 	if (cl->parent) {
1572 		_bstats_update(&cl->parent->bstats_bias,
1573 			       u64_stats_read(&q->bstats.bytes),
1574 			       u64_stats_read(&q->bstats.packets));
1575 	}
1576 
1577 	offload_opt = (struct tc_htb_qopt_offload) {
1578 		.command = !last_child ? TC_HTB_LEAF_DEL :
1579 			   destroying ? TC_HTB_LEAF_DEL_LAST_FORCE :
1580 			   TC_HTB_LEAF_DEL_LAST,
1581 		.classid = cl->common.classid,
1582 		.extack = extack,
1583 	};
1584 	err = htb_offload(qdisc_dev(sch), &offload_opt);
1585 
1586 	if (!destroying) {
1587 		if (!err)
1588 			qdisc_put(old);
1589 		else
1590 			htb_graft_helper(dev_queue, old);
1591 	}
1592 
1593 	if (last_child)
1594 		return err;
1595 
1596 	if (!err && offload_opt.classid != TC_H_MIN(cl->common.classid)) {
1597 		u32 classid = TC_H_MAJ(sch->handle) |
1598 			      TC_H_MIN(offload_opt.classid);
1599 		struct htb_class *moved_cl = htb_find(classid, sch);
1600 
1601 		htb_offload_move_qdisc(sch, moved_cl, cl, destroying);
1602 	}
1603 
1604 	return err;
1605 }
1606 
1607 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1608 {
1609 	if (!cl->level) {
1610 		WARN_ON(!cl->leaf.q);
1611 		qdisc_put(cl->leaf.q);
1612 	}
1613 	gen_kill_estimator(&cl->rate_est);
1614 	tcf_block_put(cl->block);
1615 	kfree(cl);
1616 }
1617 
1618 static void htb_destroy(struct Qdisc *sch)
1619 {
1620 	struct net_device *dev = qdisc_dev(sch);
1621 	struct tc_htb_qopt_offload offload_opt;
1622 	struct htb_sched *q = qdisc_priv(sch);
1623 	struct hlist_node *next;
1624 	bool nonempty, changed;
1625 	struct htb_class *cl;
1626 	unsigned int i;
1627 
1628 	cancel_work_sync(&q->work);
1629 	qdisc_watchdog_cancel(&q->watchdog);
1630 	/* This line used to be after htb_destroy_class call below
1631 	 * and surprisingly it worked in 2.4. But it must precede it
1632 	 * because filter need its target class alive to be able to call
1633 	 * unbind_filter on it (without Oops).
1634 	 */
1635 	tcf_block_put(q->block);
1636 
1637 	for (i = 0; i < q->clhash.hashsize; i++) {
1638 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1639 			tcf_block_put(cl->block);
1640 			cl->block = NULL;
1641 		}
1642 	}
1643 
1644 	do {
1645 		nonempty = false;
1646 		changed = false;
1647 		for (i = 0; i < q->clhash.hashsize; i++) {
1648 			hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1649 						  common.hnode) {
1650 				bool last_child;
1651 
1652 				if (!q->offload) {
1653 					htb_destroy_class(sch, cl);
1654 					continue;
1655 				}
1656 
1657 				nonempty = true;
1658 
1659 				if (cl->level)
1660 					continue;
1661 
1662 				changed = true;
1663 
1664 				last_child = htb_parent_last_child(cl);
1665 				htb_destroy_class_offload(sch, cl, last_child,
1666 							  true, NULL);
1667 				qdisc_class_hash_remove(&q->clhash,
1668 							&cl->common);
1669 				if (cl->parent)
1670 					cl->parent->children--;
1671 				if (last_child)
1672 					htb_parent_to_leaf(sch, cl, NULL);
1673 				htb_destroy_class(sch, cl);
1674 			}
1675 		}
1676 	} while (changed);
1677 	WARN_ON(nonempty);
1678 
1679 	qdisc_class_hash_destroy(&q->clhash);
1680 	__qdisc_reset_queue(&q->direct_queue);
1681 
1682 	if (q->offload) {
1683 		offload_opt = (struct tc_htb_qopt_offload) {
1684 			.command = TC_HTB_DESTROY,
1685 		};
1686 		htb_offload(dev, &offload_opt);
1687 	}
1688 
1689 	if (!q->direct_qdiscs)
1690 		return;
1691 	for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++)
1692 		qdisc_put(q->direct_qdiscs[i]);
1693 	kfree(q->direct_qdiscs);
1694 }
1695 
1696 static int htb_delete(struct Qdisc *sch, unsigned long arg,
1697 		      struct netlink_ext_ack *extack)
1698 {
1699 	struct htb_sched *q = qdisc_priv(sch);
1700 	struct htb_class *cl = (struct htb_class *)arg;
1701 	struct Qdisc *new_q = NULL;
1702 	int last_child = 0;
1703 	int err;
1704 
1705 	/* TODO: why don't allow to delete subtree ? references ? does
1706 	 * tc subsys guarantee us that in htb_destroy it holds no class
1707 	 * refs so that we can remove children safely there ?
1708 	 */
1709 	if (cl->children || qdisc_class_in_use(&cl->common)) {
1710 		NL_SET_ERR_MSG(extack, "HTB class in use");
1711 		return -EBUSY;
1712 	}
1713 
1714 	if (!cl->level && htb_parent_last_child(cl))
1715 		last_child = 1;
1716 
1717 	if (q->offload) {
1718 		err = htb_destroy_class_offload(sch, cl, last_child, false,
1719 						extack);
1720 		if (err)
1721 			return err;
1722 	}
1723 
1724 	if (last_child) {
1725 		struct netdev_queue *dev_queue = sch->dev_queue;
1726 
1727 		if (q->offload)
1728 			dev_queue = htb_offload_get_queue(cl);
1729 
1730 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1731 					  cl->parent->common.classid,
1732 					  NULL);
1733 		if (q->offload)
1734 			htb_parent_to_leaf_offload(sch, dev_queue, new_q);
1735 	}
1736 
1737 	sch_tree_lock(sch);
1738 
1739 	if (!cl->level)
1740 		qdisc_purge_queue(cl->leaf.q);
1741 
1742 	/* delete from hash and active; remainder in destroy_class */
1743 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1744 	if (cl->parent)
1745 		cl->parent->children--;
1746 
1747 	htb_deactivate(q, cl);
1748 
1749 	if (cl->cmode != HTB_CAN_SEND)
1750 		htb_safe_rb_erase(&cl->pq_node,
1751 				  &q->hlevel[cl->level].wait_pq);
1752 
1753 	if (last_child)
1754 		htb_parent_to_leaf(sch, cl, new_q);
1755 
1756 	sch_tree_unlock(sch);
1757 
1758 	htb_destroy_class(sch, cl);
1759 	return 0;
1760 }
1761 
1762 static int htb_change_class(struct Qdisc *sch, u32 classid,
1763 			    u32 parentid, struct nlattr **tca,
1764 			    unsigned long *arg, struct netlink_ext_ack *extack)
1765 {
1766 	int err = -EINVAL;
1767 	struct htb_sched *q = qdisc_priv(sch);
1768 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1769 	struct tc_htb_qopt_offload offload_opt;
1770 	struct nlattr *opt = tca[TCA_OPTIONS];
1771 	struct nlattr *tb[TCA_HTB_MAX + 1];
1772 	struct Qdisc *parent_qdisc = NULL;
1773 	struct netdev_queue *dev_queue;
1774 	struct tc_htb_opt *hopt;
1775 	u64 rate64, ceil64;
1776 	int warn = 0;
1777 
1778 	/* extract all subattrs from opt attr */
1779 	if (!opt)
1780 		goto failure;
1781 
1782 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1783 					  extack);
1784 	if (err < 0)
1785 		goto failure;
1786 
1787 	err = -EINVAL;
1788 	if (tb[TCA_HTB_PARMS] == NULL)
1789 		goto failure;
1790 
1791 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1792 
1793 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1794 	if (!hopt->rate.rate || !hopt->ceil.rate)
1795 		goto failure;
1796 
1797 	if (q->offload) {
1798 		/* Options not supported by the offload. */
1799 		if (hopt->rate.overhead || hopt->ceil.overhead) {
1800 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the overhead parameter");
1801 			goto failure;
1802 		}
1803 		if (hopt->rate.mpu || hopt->ceil.mpu) {
1804 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter");
1805 			goto failure;
1806 		}
1807 	}
1808 
1809 	/* Keeping backward compatible with rate_table based iproute2 tc */
1810 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1811 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
1812 					      NULL));
1813 
1814 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1815 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
1816 					      NULL));
1817 
1818 	rate64 = nla_get_u64_default(tb[TCA_HTB_RATE64], 0);
1819 	ceil64 = nla_get_u64_default(tb[TCA_HTB_CEIL64], 0);
1820 
1821 	if (!cl) {		/* new class */
1822 		struct net_device *dev = qdisc_dev(sch);
1823 		struct Qdisc *new_q, *old_q;
1824 		int prio;
1825 		struct {
1826 			struct nlattr		nla;
1827 			struct gnet_estimator	opt;
1828 		} est = {
1829 			.nla = {
1830 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1831 				.nla_type	= TCA_RATE,
1832 			},
1833 			.opt = {
1834 				/* 4s interval, 16s averaging constant */
1835 				.interval	= 2,
1836 				.ewma_log	= 2,
1837 			},
1838 		};
1839 
1840 		/* check for valid classid */
1841 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1842 		    htb_find(classid, sch))
1843 			goto failure;
1844 
1845 		/* check maximal depth */
1846 		if (parent && parent->parent && parent->parent->level < 2) {
1847 			NL_SET_ERR_MSG_MOD(extack, "tree is too deep");
1848 			goto failure;
1849 		}
1850 		err = -ENOBUFS;
1851 		cl = kzalloc_obj(*cl);
1852 		if (!cl)
1853 			goto failure;
1854 
1855 		gnet_stats_basic_sync_init(&cl->bstats);
1856 		gnet_stats_basic_sync_init(&cl->bstats_bias);
1857 
1858 		err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1859 		if (err) {
1860 			kfree(cl);
1861 			goto failure;
1862 		}
1863 		if (htb_rate_est || tca[TCA_RATE]) {
1864 			err = gen_new_estimator(&cl->bstats, NULL,
1865 						&cl->rate_est,
1866 						NULL,
1867 						true,
1868 						tca[TCA_RATE] ? : &est.nla);
1869 			if (err)
1870 				goto err_block_put;
1871 		}
1872 
1873 		cl->children = 0;
1874 		RB_CLEAR_NODE(&cl->pq_node);
1875 
1876 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1877 			RB_CLEAR_NODE(&cl->node[prio]);
1878 
1879 		cl->common.classid = classid;
1880 
1881 		/* Make sure nothing interrupts us in between of two
1882 		 * ndo_setup_tc calls.
1883 		 */
1884 		ASSERT_RTNL();
1885 
1886 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1887 		 * so that can't be used inside of sch_tree_lock
1888 		 * -- thanks to Karlis Peisenieks
1889 		 */
1890 		if (!q->offload) {
1891 			dev_queue = sch->dev_queue;
1892 		} else if (!(parent && !parent->level)) {
1893 			/* Assign a dev_queue to this classid. */
1894 			offload_opt = (struct tc_htb_qopt_offload) {
1895 				.command = TC_HTB_LEAF_ALLOC_QUEUE,
1896 				.classid = cl->common.classid,
1897 				.parent_classid = parent ?
1898 					TC_H_MIN(parent->common.classid) :
1899 					TC_HTB_CLASSID_ROOT,
1900 				.rate = max_t(u64, hopt->rate.rate, rate64),
1901 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1902 				.prio = hopt->prio,
1903 				.quantum = hopt->quantum,
1904 				.extack = extack,
1905 			};
1906 			err = htb_offload(dev, &offload_opt);
1907 			if (err) {
1908 				NL_SET_ERR_MSG_WEAK(extack,
1909 						    "Failed to offload TC_HTB_LEAF_ALLOC_QUEUE");
1910 				goto err_kill_estimator;
1911 			}
1912 			dev_queue = netdev_get_tx_queue(dev, offload_opt.qid);
1913 		} else { /* First child. */
1914 			dev_queue = htb_offload_get_queue(parent);
1915 			old_q = htb_graft_helper(dev_queue, NULL);
1916 			WARN_ON(old_q != parent->leaf.q);
1917 			offload_opt = (struct tc_htb_qopt_offload) {
1918 				.command = TC_HTB_LEAF_TO_INNER,
1919 				.classid = cl->common.classid,
1920 				.parent_classid =
1921 					TC_H_MIN(parent->common.classid),
1922 				.rate = max_t(u64, hopt->rate.rate, rate64),
1923 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1924 				.prio = hopt->prio,
1925 				.quantum = hopt->quantum,
1926 				.extack = extack,
1927 			};
1928 			err = htb_offload(dev, &offload_opt);
1929 			if (err) {
1930 				NL_SET_ERR_MSG_WEAK(extack,
1931 						    "Failed to offload TC_HTB_LEAF_TO_INNER");
1932 				htb_graft_helper(dev_queue, old_q);
1933 				goto err_kill_estimator;
1934 			}
1935 			_bstats_update(&parent->bstats_bias,
1936 				       u64_stats_read(&old_q->bstats.bytes),
1937 				       u64_stats_read(&old_q->bstats.packets));
1938 			qdisc_put(old_q);
1939 		}
1940 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1941 					  classid, NULL);
1942 		if (q->offload) {
1943 			/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1944 			if (new_q)
1945 				qdisc_refcount_inc(new_q);
1946 			old_q = htb_graft_helper(dev_queue, new_q);
1947 			/* No qdisc_put needed. */
1948 			WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1949 		}
1950 		sch_tree_lock(sch);
1951 		if (parent && !parent->level) {
1952 			/* turn parent into inner node */
1953 			qdisc_purge_queue(parent->leaf.q);
1954 			parent_qdisc = parent->leaf.q;
1955 			htb_deactivate(q, parent);
1956 
1957 			/* remove from evt list because of level change */
1958 			if (parent->cmode != HTB_CAN_SEND) {
1959 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1960 				parent->cmode = HTB_CAN_SEND;
1961 			}
1962 			parent->level = (parent->parent ? parent->parent->level
1963 					 : TC_HTB_MAXDEPTH) - 1;
1964 			memset(&parent->inner, 0, sizeof(parent->inner));
1965 		}
1966 
1967 		/* leaf (we) needs elementary qdisc */
1968 		cl->leaf.q = new_q ? new_q : &noop_qdisc;
1969 		if (q->offload)
1970 			cl->leaf.offload_queue = dev_queue;
1971 
1972 		cl->parent = parent;
1973 
1974 		/* set class to be in HTB_CAN_SEND state */
1975 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1976 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1977 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1978 		cl->t_c = ktime_get_ns();
1979 		cl->cmode = HTB_CAN_SEND;
1980 
1981 		/* attach to the hash list and parent's family */
1982 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1983 		if (parent)
1984 			parent->children++;
1985 		if (cl->leaf.q != &noop_qdisc)
1986 			qdisc_hash_add(cl->leaf.q, true);
1987 	} else {
1988 		if (tca[TCA_RATE]) {
1989 			err = gen_replace_estimator(&cl->bstats, NULL,
1990 						    &cl->rate_est,
1991 						    NULL,
1992 						    true,
1993 						    tca[TCA_RATE]);
1994 			if (err)
1995 				return err;
1996 		}
1997 
1998 		if (q->offload) {
1999 			struct net_device *dev = qdisc_dev(sch);
2000 
2001 			offload_opt = (struct tc_htb_qopt_offload) {
2002 				.command = TC_HTB_NODE_MODIFY,
2003 				.classid = cl->common.classid,
2004 				.rate = max_t(u64, hopt->rate.rate, rate64),
2005 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
2006 				.prio = hopt->prio,
2007 				.quantum = hopt->quantum,
2008 				.extack = extack,
2009 			};
2010 			err = htb_offload(dev, &offload_opt);
2011 			if (err)
2012 				/* Estimator was replaced, and rollback may fail
2013 				 * as well, so we don't try to recover it, and
2014 				 * the estimator won't work property with the
2015 				 * offload anyway, because bstats are updated
2016 				 * only when the stats are queried.
2017 				 */
2018 				return err;
2019 		}
2020 
2021 		sch_tree_lock(sch);
2022 	}
2023 
2024 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
2025 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
2026 
2027 	/* it used to be a nasty bug here, we have to check that node
2028 	 * is really leaf before changing cl->leaf !
2029 	 */
2030 	if (!cl->level) {
2031 		u64 quantum = cl->rate.rate_bytes_ps;
2032 
2033 		do_div(quantum, q->rate2quantum);
2034 		cl->quantum = min_t(u64, quantum, INT_MAX);
2035 
2036 		if (!hopt->quantum && cl->quantum < 1000) {
2037 			warn = -1;
2038 			cl->quantum = 1000;
2039 		}
2040 		if (!hopt->quantum && cl->quantum > 200000) {
2041 			warn = 1;
2042 			cl->quantum = 200000;
2043 		}
2044 		if (hopt->quantum)
2045 			cl->quantum = hopt->quantum;
2046 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
2047 			cl->prio = TC_HTB_NUMPRIO - 1;
2048 	}
2049 
2050 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
2051 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
2052 
2053 	sch_tree_unlock(sch);
2054 	qdisc_put(parent_qdisc);
2055 
2056 	if (warn)
2057 		NL_SET_ERR_MSG_FMT_MOD(extack,
2058 				       "quantum of class %X is %s. Consider r2q change.",
2059 				       cl->common.classid, (warn == -1 ? "small" : "big"));
2060 
2061 	qdisc_class_hash_grow(sch, &q->clhash);
2062 
2063 	*arg = (unsigned long)cl;
2064 	return 0;
2065 
2066 err_kill_estimator:
2067 	gen_kill_estimator(&cl->rate_est);
2068 err_block_put:
2069 	tcf_block_put(cl->block);
2070 	kfree(cl);
2071 failure:
2072 	return err;
2073 }
2074 
2075 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg,
2076 				       struct netlink_ext_ack *extack)
2077 {
2078 	struct htb_sched *q = qdisc_priv(sch);
2079 	struct htb_class *cl = (struct htb_class *)arg;
2080 
2081 	return cl ? cl->block : q->block;
2082 }
2083 
2084 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
2085 				     u32 classid)
2086 {
2087 	struct htb_class *cl = htb_find(classid, sch);
2088 
2089 	/*if (cl && !cl->level) return 0;
2090 	 * The line above used to be there to prevent attaching filters to
2091 	 * leaves. But at least tc_index filter uses this just to get class
2092 	 * for other reasons so that we have to allow for it.
2093 	 * ----
2094 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
2095 	 * another way to "lock" the class - unlike "get" this lock can
2096 	 * be broken by class during destroy IIUC.
2097 	 */
2098 	if (cl)
2099 		qdisc_class_get(&cl->common);
2100 	return (unsigned long)cl;
2101 }
2102 
2103 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
2104 {
2105 	struct htb_class *cl = (struct htb_class *)arg;
2106 
2107 	qdisc_class_put(&cl->common);
2108 }
2109 
2110 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2111 {
2112 	struct htb_sched *q = qdisc_priv(sch);
2113 	struct htb_class *cl;
2114 	unsigned int i;
2115 
2116 	if (arg->stop)
2117 		return;
2118 
2119 	for (i = 0; i < q->clhash.hashsize; i++) {
2120 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
2121 			if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
2122 				return;
2123 		}
2124 	}
2125 }
2126 
2127 static const struct Qdisc_class_ops htb_class_ops = {
2128 	.select_queue	=	htb_select_queue,
2129 	.graft		=	htb_graft,
2130 	.leaf		=	htb_leaf,
2131 	.qlen_notify	=	htb_qlen_notify,
2132 	.find		=	htb_search,
2133 	.change		=	htb_change_class,
2134 	.delete		=	htb_delete,
2135 	.walk		=	htb_walk,
2136 	.tcf_block	=	htb_tcf_block,
2137 	.bind_tcf	=	htb_bind_filter,
2138 	.unbind_tcf	=	htb_unbind_filter,
2139 	.dump		=	htb_dump_class,
2140 	.dump_stats	=	htb_dump_class_stats,
2141 };
2142 
2143 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
2144 	.cl_ops		=	&htb_class_ops,
2145 	.id		=	"htb",
2146 	.priv_size	=	sizeof(struct htb_sched),
2147 	.enqueue	=	htb_enqueue,
2148 	.dequeue	=	htb_dequeue,
2149 	.peek		=	qdisc_peek_dequeued,
2150 	.init		=	htb_init,
2151 	.attach		=	htb_attach,
2152 	.reset		=	htb_reset,
2153 	.destroy	=	htb_destroy,
2154 	.dump		=	htb_dump,
2155 	.owner		=	THIS_MODULE,
2156 };
2157 MODULE_ALIAS_NET_SCH("htb");
2158 
2159 static int __init htb_module_init(void)
2160 {
2161 	return register_qdisc(&htb_qdisc_ops);
2162 }
2163 static void __exit htb_module_exit(void)
2164 {
2165 	unregister_qdisc(&htb_qdisc_ops);
2166 }
2167 
2168 module_init(htb_module_init)
2169 module_exit(htb_module_exit)
2170 MODULE_LICENSE("GPL");
2171 MODULE_DESCRIPTION("Hierarchical Token Bucket scheduler");
2172