xref: /linux/net/sched/sch_htb.c (revision 2330437da0994321020777c605a2a8cb0ecb7001)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
4  *
5  * Authors:	Martin Devera, <devik@cdi.cz>
6  *
7  * Credits (in time order) for older HTB versions:
8  *              Stef Coene <stef.coene@docum.org>
9  *			HTB support at LARTC mailing list
10  *		Ondrej Kraus, <krauso@barr.cz>
11  *			found missing INIT_QDISC(htb)
12  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
13  *			helped a lot to locate nasty class stall bug
14  *		Andi Kleen, Jamal Hadi, Bert Hubert
15  *			code review and helpful comments on shaping
16  *		Tomasz Wrona, <tw@eter.tym.pl>
17  *			created test case so that I was able to fix nasty bug
18  *		Wilfried Weissmann
19  *			spotted bug in dequeue code and helped with fix
20  *		Jiri Fojtasek
21  *			fixed requeue routine
22  *		and many others. thanks.
23  */
24 #include <linux/module.h>
25 #include <linux/moduleparam.h>
26 #include <linux/types.h>
27 #include <linux/kernel.h>
28 #include <linux/string.h>
29 #include <linux/errno.h>
30 #include <linux/skbuff.h>
31 #include <linux/list.h>
32 #include <linux/compiler.h>
33 #include <linux/rbtree.h>
34 #include <linux/workqueue.h>
35 #include <linux/slab.h>
36 #include <net/netlink.h>
37 #include <net/sch_generic.h>
38 #include <net/pkt_sched.h>
39 #include <net/pkt_cls.h>
40 
41 /* HTB algorithm.
42     Author: devik@cdi.cz
43     ========================================================================
44     HTB is like TBF with multiple classes. It is also similar to CBQ because
45     it allows to assign priority to each class in hierarchy.
46     In fact it is another implementation of Floyd's formal sharing.
47 
48     Levels:
49     Each class is assigned level. Leaf has ALWAYS level 0 and root
50     classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
51     one less than their parent.
52 */
53 
54 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
55 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
56 
57 #if HTB_VER >> 16 != TC_HTB_PROTOVER
58 #error "Mismatched sch_htb.c and pkt_sch.h"
59 #endif
60 
61 /* Module parameter and sysfs export */
62 module_param    (htb_hysteresis, int, 0640);
63 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
64 
65 static int htb_rate_est = 0; /* htb classes have a default rate estimator */
66 module_param(htb_rate_est, int, 0640);
67 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
68 
69 /* used internaly to keep status of single class */
70 enum htb_cmode {
71 	HTB_CANT_SEND,		/* class can't send and can't borrow */
72 	HTB_MAY_BORROW,		/* class can't send but may borrow */
73 	HTB_CAN_SEND		/* class can send */
74 };
75 
76 struct htb_prio {
77 	union {
78 		struct rb_root	row;
79 		struct rb_root	feed;
80 	};
81 	struct rb_node	*ptr;
82 	/* When class changes from state 1->2 and disconnects from
83 	 * parent's feed then we lost ptr value and start from the
84 	 * first child again. Here we store classid of the
85 	 * last valid ptr (used when ptr is NULL).
86 	 */
87 	u32		last_ptr_id;
88 };
89 
90 /* interior & leaf nodes; props specific to leaves are marked L:
91  * To reduce false sharing, place mostly read fields at beginning,
92  * and mostly written ones at the end.
93  */
94 struct htb_class {
95 	struct Qdisc_class_common common;
96 	struct psched_ratecfg	rate;
97 	struct psched_ratecfg	ceil;
98 	s64			buffer, cbuffer;/* token bucket depth/rate */
99 	s64			mbuffer;	/* max wait time */
100 	u32			prio;		/* these two are used only by leaves... */
101 	int			quantum;	/* but stored for parent-to-leaf return */
102 
103 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
104 	struct tcf_block	*block;
105 
106 	int			level;		/* our level (see above) */
107 	unsigned int		children;
108 	struct htb_class	*parent;	/* parent class */
109 
110 	struct net_rate_estimator __rcu *rate_est;
111 
112 	/*
113 	 * Written often fields
114 	 */
115 	struct gnet_stats_basic_sync bstats;
116 	struct gnet_stats_basic_sync bstats_bias;
117 	struct tc_htb_xstats	xstats;	/* our special stats */
118 
119 	/* token bucket parameters */
120 	s64			tokens, ctokens;/* current number of tokens */
121 	s64			t_c;		/* checkpoint time */
122 
123 	union {
124 		struct htb_class_leaf {
125 			int		deficit[TC_HTB_MAXDEPTH];
126 			struct Qdisc	*q;
127 			struct netdev_queue *offload_queue;
128 		} leaf;
129 		struct htb_class_inner {
130 			struct htb_prio clprio[TC_HTB_NUMPRIO];
131 		} inner;
132 	};
133 	s64			pq_key;
134 
135 	int			prio_activity;	/* for which prios are we active */
136 	enum htb_cmode		cmode;		/* current mode of the class */
137 	struct rb_node		pq_node;	/* node for event queue */
138 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
139 
140 	unsigned int drops ____cacheline_aligned_in_smp;
141 	unsigned int		overlimits;
142 };
143 
144 struct htb_level {
145 	struct rb_root	wait_pq;
146 	struct htb_prio hprio[TC_HTB_NUMPRIO];
147 };
148 
149 struct htb_sched {
150 	struct Qdisc_class_hash clhash;
151 	int			defcls;		/* class where unclassified flows go to */
152 	int			rate2quantum;	/* quant = rate / rate2quantum */
153 
154 	/* filters for qdisc itself */
155 	struct tcf_proto __rcu	*filter_list;
156 	struct tcf_block	*block;
157 
158 #define HTB_WARN_TOOMANYEVENTS	0x1
159 	unsigned int		warned;	/* only one warning */
160 	int			direct_qlen;
161 	struct work_struct	work;
162 
163 	/* non shaped skbs; let them go directly thru */
164 	struct qdisc_skb_head	direct_queue;
165 	u32			direct_pkts;
166 	u32			overlimits;
167 
168 	struct qdisc_watchdog	watchdog;
169 
170 	s64			now;	/* cached dequeue time */
171 
172 	/* time of nearest event per level (row) */
173 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
174 
175 	int			row_mask[TC_HTB_MAXDEPTH];
176 
177 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
178 
179 	struct Qdisc		**direct_qdiscs;
180 	unsigned int            num_direct_qdiscs;
181 
182 	bool			offload;
183 };
184 
185 /* find class in global hash table using given handle */
186 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
187 {
188 	struct htb_sched *q = qdisc_priv(sch);
189 	struct Qdisc_class_common *clc;
190 
191 	clc = qdisc_class_find(&q->clhash, handle);
192 	if (clc == NULL)
193 		return NULL;
194 	return container_of(clc, struct htb_class, common);
195 }
196 
197 static unsigned long htb_search(struct Qdisc *sch, u32 handle)
198 {
199 	return (unsigned long)htb_find(handle, sch);
200 }
201 
202 #define HTB_DIRECT ((struct htb_class *)-1L)
203 
204 /**
205  * htb_classify - classify a packet into class
206  * @skb: the socket buffer
207  * @sch: the active queue discipline
208  * @qerr: pointer for returned status code
209  *
210  * It returns NULL if the packet should be dropped or -1 if the packet
211  * should be passed directly thru. In all other cases leaf class is returned.
212  * We allow direct class selection by classid in priority. The we examine
213  * filters in qdisc and in inner nodes (if higher filter points to the inner
214  * node). If we end up with classid MAJOR:0 we enqueue the skb into special
215  * internal fifo (direct). These packets then go directly thru. If we still
216  * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
217  * then finish and return direct queue.
218  */
219 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
220 				      int *qerr)
221 {
222 	struct htb_sched *q = qdisc_priv(sch);
223 	struct htb_class *cl;
224 	struct tcf_result res;
225 	struct tcf_proto *tcf;
226 	int result;
227 
228 	/* allow to select class by setting skb->priority to valid classid;
229 	 * note that nfmark can be used too by attaching filter fw with no
230 	 * rules in it
231 	 */
232 	if (skb->priority == sch->handle)
233 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
234 	cl = htb_find(skb->priority, sch);
235 	if (cl) {
236 		if (cl->level == 0)
237 			return cl;
238 		/* Start with inner filter chain if a non-leaf class is selected */
239 		tcf = rcu_dereference_bh(cl->filter_list);
240 	} else {
241 		tcf = rcu_dereference_bh(q->filter_list);
242 	}
243 
244 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
245 	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
246 #ifdef CONFIG_NET_CLS_ACT
247 		switch (result) {
248 		case TC_ACT_QUEUED:
249 		case TC_ACT_STOLEN:
250 		case TC_ACT_TRAP:
251 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
252 			fallthrough;
253 		case TC_ACT_SHOT:
254 			return NULL;
255 		}
256 #endif
257 		cl = (void *)res.class;
258 		if (!cl) {
259 			if (res.classid == sch->handle)
260 				return HTB_DIRECT;	/* X:0 (direct flow) */
261 			cl = htb_find(res.classid, sch);
262 			if (!cl)
263 				break;	/* filter selected invalid classid */
264 		}
265 		if (!cl->level)
266 			return cl;	/* we hit leaf; return it */
267 
268 		/* we have got inner class; apply inner filter chain */
269 		tcf = rcu_dereference_bh(cl->filter_list);
270 	}
271 	/* classification failed; try to use default class */
272 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
273 	if (!cl || cl->level)
274 		return HTB_DIRECT;	/* bad default .. this is safe bet */
275 	return cl;
276 }
277 
278 /**
279  * htb_add_to_id_tree - adds class to the round robin list
280  * @root: the root of the tree
281  * @cl: the class to add
282  * @prio: the give prio in class
283  *
284  * Routine adds class to the list (actually tree) sorted by classid.
285  * Make sure that class is not already on such list for given prio.
286  */
287 static void htb_add_to_id_tree(struct rb_root *root,
288 			       struct htb_class *cl, int prio)
289 {
290 	struct rb_node **p = &root->rb_node, *parent = NULL;
291 
292 	while (*p) {
293 		struct htb_class *c;
294 		parent = *p;
295 		c = rb_entry(parent, struct htb_class, node[prio]);
296 
297 		if (cl->common.classid > c->common.classid)
298 			p = &parent->rb_right;
299 		else
300 			p = &parent->rb_left;
301 	}
302 	rb_link_node(&cl->node[prio], parent, p);
303 	rb_insert_color(&cl->node[prio], root);
304 }
305 
306 /**
307  * htb_add_to_wait_tree - adds class to the event queue with delay
308  * @q: the priority event queue
309  * @cl: the class to add
310  * @delay: delay in microseconds
311  *
312  * The class is added to priority event queue to indicate that class will
313  * change its mode in cl->pq_key microseconds. Make sure that class is not
314  * already in the queue.
315  */
316 static void htb_add_to_wait_tree(struct htb_sched *q,
317 				 struct htb_class *cl, s64 delay)
318 {
319 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
320 
321 	cl->pq_key = q->now + delay;
322 	if (cl->pq_key == q->now)
323 		cl->pq_key++;
324 
325 	/* update the nearest event cache */
326 	if (q->near_ev_cache[cl->level] > cl->pq_key)
327 		q->near_ev_cache[cl->level] = cl->pq_key;
328 
329 	while (*p) {
330 		struct htb_class *c;
331 		parent = *p;
332 		c = rb_entry(parent, struct htb_class, pq_node);
333 		if (cl->pq_key >= c->pq_key)
334 			p = &parent->rb_right;
335 		else
336 			p = &parent->rb_left;
337 	}
338 	rb_link_node(&cl->pq_node, parent, p);
339 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
340 }
341 
342 /**
343  * htb_next_rb_node - finds next node in binary tree
344  * @n: the current node in binary tree
345  *
346  * When we are past last key we return NULL.
347  * Average complexity is 2 steps per call.
348  */
349 static inline void htb_next_rb_node(struct rb_node **n)
350 {
351 	if (*n)
352 		*n = rb_next(*n);
353 }
354 
355 /**
356  * htb_add_class_to_row - add class to its row
357  * @q: the priority event queue
358  * @cl: the class to add
359  * @mask: the given priorities in class in bitmap
360  *
361  * The class is added to row at priorities marked in mask.
362  * It does nothing if mask == 0.
363  */
364 static inline void htb_add_class_to_row(struct htb_sched *q,
365 					struct htb_class *cl, int mask)
366 {
367 	q->row_mask[cl->level] |= mask;
368 	while (mask) {
369 		int prio = ffz(~mask);
370 		mask &= ~(1 << prio);
371 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
372 	}
373 }
374 
375 /* If this triggers, it is a bug in this code, but it need not be fatal */
376 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
377 {
378 	if (RB_EMPTY_NODE(rb)) {
379 		WARN_ON(1);
380 	} else {
381 		rb_erase(rb, root);
382 		RB_CLEAR_NODE(rb);
383 	}
384 }
385 
386 
387 /**
388  * htb_remove_class_from_row - removes class from its row
389  * @q: the priority event queue
390  * @cl: the class to add
391  * @mask: the given priorities in class in bitmap
392  *
393  * The class is removed from row at priorities marked in mask.
394  * It does nothing if mask == 0.
395  */
396 static inline void htb_remove_class_from_row(struct htb_sched *q,
397 						 struct htb_class *cl, int mask)
398 {
399 	int m = 0;
400 	struct htb_level *hlevel = &q->hlevel[cl->level];
401 
402 	while (mask) {
403 		int prio = ffz(~mask);
404 		struct htb_prio *hprio = &hlevel->hprio[prio];
405 
406 		mask &= ~(1 << prio);
407 		if (hprio->ptr == cl->node + prio)
408 			htb_next_rb_node(&hprio->ptr);
409 
410 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
411 		if (!hprio->row.rb_node)
412 			m |= 1 << prio;
413 	}
414 	q->row_mask[cl->level] &= ~m;
415 }
416 
417 /**
418  * htb_activate_prios - creates active classe's feed chain
419  * @q: the priority event queue
420  * @cl: the class to activate
421  *
422  * The class is connected to ancestors and/or appropriate rows
423  * for priorities it is participating on. cl->cmode must be new
424  * (activated) mode. It does nothing if cl->prio_activity == 0.
425  */
426 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
427 {
428 	struct htb_class *p = cl->parent;
429 	long m, mask = cl->prio_activity;
430 
431 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
432 		m = mask;
433 		while (m) {
434 			unsigned int prio = ffz(~m);
435 
436 			if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio)))
437 				break;
438 			m &= ~(1 << prio);
439 
440 			if (p->inner.clprio[prio].feed.rb_node)
441 				/* parent already has its feed in use so that
442 				 * reset bit in mask as parent is already ok
443 				 */
444 				mask &= ~(1 << prio);
445 
446 			htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio);
447 		}
448 		p->prio_activity |= mask;
449 		cl = p;
450 		p = cl->parent;
451 
452 	}
453 	if (cl->cmode == HTB_CAN_SEND && mask)
454 		htb_add_class_to_row(q, cl, mask);
455 }
456 
457 /**
458  * htb_deactivate_prios - remove class from feed chain
459  * @q: the priority event queue
460  * @cl: the class to deactivate
461  *
462  * cl->cmode must represent old mode (before deactivation). It does
463  * nothing if cl->prio_activity == 0. Class is removed from all feed
464  * chains and rows.
465  */
466 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
467 {
468 	struct htb_class *p = cl->parent;
469 	long m, mask = cl->prio_activity;
470 
471 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
472 		m = mask;
473 		mask = 0;
474 		while (m) {
475 			int prio = ffz(~m);
476 			m &= ~(1 << prio);
477 
478 			if (p->inner.clprio[prio].ptr == cl->node + prio) {
479 				/* we are removing child which is pointed to from
480 				 * parent feed - forget the pointer but remember
481 				 * classid
482 				 */
483 				p->inner.clprio[prio].last_ptr_id = cl->common.classid;
484 				p->inner.clprio[prio].ptr = NULL;
485 			}
486 
487 			htb_safe_rb_erase(cl->node + prio,
488 					  &p->inner.clprio[prio].feed);
489 
490 			if (!p->inner.clprio[prio].feed.rb_node)
491 				mask |= 1 << prio;
492 		}
493 
494 		p->prio_activity &= ~mask;
495 		cl = p;
496 		p = cl->parent;
497 
498 	}
499 	if (cl->cmode == HTB_CAN_SEND && mask)
500 		htb_remove_class_from_row(q, cl, mask);
501 }
502 
503 static inline s64 htb_lowater(const struct htb_class *cl)
504 {
505 	if (htb_hysteresis)
506 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
507 	else
508 		return 0;
509 }
510 static inline s64 htb_hiwater(const struct htb_class *cl)
511 {
512 	if (htb_hysteresis)
513 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
514 	else
515 		return 0;
516 }
517 
518 
519 /**
520  * htb_class_mode - computes and returns current class mode
521  * @cl: the target class
522  * @diff: diff time in microseconds
523  *
524  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
525  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
526  * from now to time when cl will change its state.
527  * Also it is worth to note that class mode doesn't change simply
528  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
529  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
530  * mode transitions per time unit. The speed gain is about 1/6.
531  */
532 static inline enum htb_cmode
533 htb_class_mode(struct htb_class *cl, s64 *diff)
534 {
535 	s64 toks;
536 
537 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
538 		*diff = -toks;
539 		return HTB_CANT_SEND;
540 	}
541 
542 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
543 		return HTB_CAN_SEND;
544 
545 	*diff = -toks;
546 	return HTB_MAY_BORROW;
547 }
548 
549 /**
550  * htb_change_class_mode - changes classe's mode
551  * @q: the priority event queue
552  * @cl: the target class
553  * @diff: diff time in microseconds
554  *
555  * This should be the only way how to change classe's mode under normal
556  * circumstances. Routine will update feed lists linkage, change mode
557  * and add class to the wait event queue if appropriate. New mode should
558  * be different from old one and cl->pq_key has to be valid if changing
559  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
560  */
561 static void
562 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
563 {
564 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
565 
566 	if (new_mode == cl->cmode)
567 		return;
568 
569 	if (new_mode == HTB_CANT_SEND) {
570 		cl->overlimits++;
571 		q->overlimits++;
572 	}
573 
574 	if (cl->prio_activity) {	/* not necessary: speed optimization */
575 		if (cl->cmode != HTB_CANT_SEND)
576 			htb_deactivate_prios(q, cl);
577 		cl->cmode = new_mode;
578 		if (new_mode != HTB_CANT_SEND)
579 			htb_activate_prios(q, cl);
580 	} else
581 		cl->cmode = new_mode;
582 }
583 
584 /**
585  * htb_activate - inserts leaf cl into appropriate active feeds
586  * @q: the priority event queue
587  * @cl: the target class
588  *
589  * Routine learns (new) priority of leaf and activates feed chain
590  * for the prio. It can be called on already active leaf safely.
591  * It also adds leaf into droplist.
592  */
593 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
594 {
595 	WARN_ON(cl->level || !cl->leaf.q);
596 
597 	if (!cl->prio_activity) {
598 		cl->prio_activity = 1 << cl->prio;
599 		htb_activate_prios(q, cl);
600 	}
601 }
602 
603 /**
604  * htb_deactivate - remove leaf cl from active feeds
605  * @q: the priority event queue
606  * @cl: the target class
607  *
608  * Make sure that leaf is active. In the other words it can't be called
609  * with non-active leaf. It also removes class from the drop list.
610  */
611 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
612 {
613 	if (!cl->prio_activity)
614 		return;
615 	htb_deactivate_prios(q, cl);
616 	cl->prio_activity = 0;
617 }
618 
619 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
620 		       struct sk_buff **to_free)
621 {
622 	int ret;
623 	unsigned int len = qdisc_pkt_len(skb);
624 	struct htb_sched *q = qdisc_priv(sch);
625 	struct htb_class *cl = htb_classify(skb, sch, &ret);
626 
627 	if (cl == HTB_DIRECT) {
628 		/* enqueue to helper queue */
629 		if (q->direct_queue.qlen < q->direct_qlen) {
630 			__qdisc_enqueue_tail(skb, &q->direct_queue);
631 			q->direct_pkts++;
632 		} else {
633 			return qdisc_drop(skb, sch, to_free);
634 		}
635 #ifdef CONFIG_NET_CLS_ACT
636 	} else if (!cl) {
637 		if (ret & __NET_XMIT_BYPASS)
638 			qdisc_qstats_drop(sch);
639 		__qdisc_drop(skb, to_free);
640 		return ret;
641 #endif
642 	} else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
643 					to_free)) != NET_XMIT_SUCCESS) {
644 		if (net_xmit_drop_count(ret)) {
645 			qdisc_qstats_drop(sch);
646 			cl->drops++;
647 		}
648 		return ret;
649 	} else {
650 		htb_activate(q, cl);
651 	}
652 
653 	sch->qstats.backlog += len;
654 	sch->q.qlen++;
655 	return NET_XMIT_SUCCESS;
656 }
657 
658 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
659 {
660 	s64 toks = diff + cl->tokens;
661 
662 	if (toks > cl->buffer)
663 		toks = cl->buffer;
664 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
665 	if (toks <= -cl->mbuffer)
666 		toks = 1 - cl->mbuffer;
667 
668 	cl->tokens = toks;
669 }
670 
671 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
672 {
673 	s64 toks = diff + cl->ctokens;
674 
675 	if (toks > cl->cbuffer)
676 		toks = cl->cbuffer;
677 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
678 	if (toks <= -cl->mbuffer)
679 		toks = 1 - cl->mbuffer;
680 
681 	cl->ctokens = toks;
682 }
683 
684 /**
685  * htb_charge_class - charges amount "bytes" to leaf and ancestors
686  * @q: the priority event queue
687  * @cl: the class to start iterate
688  * @level: the minimum level to account
689  * @skb: the socket buffer
690  *
691  * Routine assumes that packet "bytes" long was dequeued from leaf cl
692  * borrowing from "level". It accounts bytes to ceil leaky bucket for
693  * leaf and all ancestors and to rate bucket for ancestors at levels
694  * "level" and higher. It also handles possible change of mode resulting
695  * from the update. Note that mode can also increase here (MAY_BORROW to
696  * CAN_SEND) because we can use more precise clock that event queue here.
697  * In such case we remove class from event queue first.
698  */
699 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
700 			     int level, struct sk_buff *skb)
701 {
702 	int bytes = qdisc_pkt_len(skb);
703 	enum htb_cmode old_mode;
704 	s64 diff;
705 
706 	while (cl) {
707 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
708 		if (cl->level >= level) {
709 			if (cl->level == level)
710 				cl->xstats.lends++;
711 			htb_accnt_tokens(cl, bytes, diff);
712 		} else {
713 			cl->xstats.borrows++;
714 			cl->tokens += diff;	/* we moved t_c; update tokens */
715 		}
716 		htb_accnt_ctokens(cl, bytes, diff);
717 		cl->t_c = q->now;
718 
719 		old_mode = cl->cmode;
720 		diff = 0;
721 		htb_change_class_mode(q, cl, &diff);
722 		if (old_mode != cl->cmode) {
723 			if (old_mode != HTB_CAN_SEND)
724 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
725 			if (cl->cmode != HTB_CAN_SEND)
726 				htb_add_to_wait_tree(q, cl, diff);
727 		}
728 
729 		/* update basic stats except for leaves which are already updated */
730 		if (cl->level)
731 			bstats_update(&cl->bstats, skb);
732 
733 		cl = cl->parent;
734 	}
735 }
736 
737 /**
738  * htb_do_events - make mode changes to classes at the level
739  * @q: the priority event queue
740  * @level: which wait_pq in 'q->hlevel'
741  * @start: start jiffies
742  *
743  * Scans event queue for pending events and applies them. Returns time of
744  * next pending event (0 for no event in pq, q->now for too many events).
745  * Note: Applied are events whose have cl->pq_key <= q->now.
746  */
747 static s64 htb_do_events(struct htb_sched *q, const int level,
748 			 unsigned long start)
749 {
750 	/* don't run for longer than 2 jiffies; 2 is used instead of
751 	 * 1 to simplify things when jiffy is going to be incremented
752 	 * too soon
753 	 */
754 	unsigned long stop_at = start + 2;
755 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
756 
757 	while (time_before(jiffies, stop_at)) {
758 		struct htb_class *cl;
759 		s64 diff;
760 		struct rb_node *p = rb_first(wait_pq);
761 
762 		if (!p)
763 			return 0;
764 
765 		cl = rb_entry(p, struct htb_class, pq_node);
766 		if (cl->pq_key > q->now)
767 			return cl->pq_key;
768 
769 		htb_safe_rb_erase(p, wait_pq);
770 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
771 		htb_change_class_mode(q, cl, &diff);
772 		if (cl->cmode != HTB_CAN_SEND)
773 			htb_add_to_wait_tree(q, cl, diff);
774 	}
775 
776 	/* too much load - let's continue after a break for scheduling */
777 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
778 		pr_warn("htb: too many events!\n");
779 		q->warned |= HTB_WARN_TOOMANYEVENTS;
780 	}
781 
782 	return q->now;
783 }
784 
785 /* Returns class->node+prio from id-tree where classe's id is >= id. NULL
786  * is no such one exists.
787  */
788 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
789 					      u32 id)
790 {
791 	struct rb_node *r = NULL;
792 	while (n) {
793 		struct htb_class *cl =
794 		    rb_entry(n, struct htb_class, node[prio]);
795 
796 		if (id > cl->common.classid) {
797 			n = n->rb_right;
798 		} else if (id < cl->common.classid) {
799 			r = n;
800 			n = n->rb_left;
801 		} else {
802 			return n;
803 		}
804 	}
805 	return r;
806 }
807 
808 /**
809  * htb_lookup_leaf - returns next leaf class in DRR order
810  * @hprio: the current one
811  * @prio: which prio in class
812  *
813  * Find leaf where current feed pointers points to.
814  */
815 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
816 {
817 	int i;
818 	struct {
819 		struct rb_node *root;
820 		struct rb_node **pptr;
821 		u32 *pid;
822 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
823 
824 	if (unlikely(!hprio->row.rb_node))
825 		return NULL;
826 
827 	sp->root = hprio->row.rb_node;
828 	sp->pptr = &hprio->ptr;
829 	sp->pid = &hprio->last_ptr_id;
830 
831 	for (i = 0; i < 65535; i++) {
832 		if (!*sp->pptr && *sp->pid) {
833 			/* ptr was invalidated but id is valid - try to recover
834 			 * the original or next ptr
835 			 */
836 			*sp->pptr =
837 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
838 		}
839 		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
840 				 * can become out of date quickly
841 				 */
842 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
843 			*sp->pptr = sp->root;
844 			while ((*sp->pptr)->rb_left)
845 				*sp->pptr = (*sp->pptr)->rb_left;
846 			if (sp > stk) {
847 				sp--;
848 				if (!*sp->pptr) {
849 					WARN_ON(1);
850 					return NULL;
851 				}
852 				htb_next_rb_node(sp->pptr);
853 			}
854 		} else {
855 			struct htb_class *cl;
856 			struct htb_prio *clp;
857 
858 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
859 			if (!cl->level)
860 				return cl;
861 			clp = &cl->inner.clprio[prio];
862 			(++sp)->root = clp->feed.rb_node;
863 			sp->pptr = &clp->ptr;
864 			sp->pid = &clp->last_ptr_id;
865 		}
866 	}
867 	WARN_ON(1);
868 	return NULL;
869 }
870 
871 /* dequeues packet at given priority and level; call only if
872  * you are sure that there is active class at prio/level
873  */
874 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
875 					const int level)
876 {
877 	struct sk_buff *skb = NULL;
878 	struct htb_class *cl, *start;
879 	struct htb_level *hlevel = &q->hlevel[level];
880 	struct htb_prio *hprio = &hlevel->hprio[prio];
881 
882 	/* look initial class up in the row */
883 	start = cl = htb_lookup_leaf(hprio, prio);
884 
885 	do {
886 next:
887 		if (unlikely(!cl))
888 			return NULL;
889 
890 		/* class can be empty - it is unlikely but can be true if leaf
891 		 * qdisc drops packets in enqueue routine or if someone used
892 		 * graft operation on the leaf since last dequeue;
893 		 * simply deactivate and skip such class
894 		 */
895 		if (unlikely(cl->leaf.q->q.qlen == 0)) {
896 			struct htb_class *next;
897 			htb_deactivate(q, cl);
898 
899 			/* row/level might become empty */
900 			if ((q->row_mask[level] & (1 << prio)) == 0)
901 				return NULL;
902 
903 			next = htb_lookup_leaf(hprio, prio);
904 
905 			if (cl == start)	/* fix start if we just deleted it */
906 				start = next;
907 			cl = next;
908 			goto next;
909 		}
910 
911 		skb = cl->leaf.q->dequeue(cl->leaf.q);
912 		if (likely(skb != NULL))
913 			break;
914 
915 		qdisc_warn_nonwc("htb", cl->leaf.q);
916 		htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr:
917 					 &q->hlevel[0].hprio[prio].ptr);
918 		cl = htb_lookup_leaf(hprio, prio);
919 
920 	} while (cl != start);
921 
922 	if (likely(skb != NULL)) {
923 		bstats_update(&cl->bstats, skb);
924 		cl->leaf.deficit[level] -= qdisc_pkt_len(skb);
925 		if (cl->leaf.deficit[level] < 0) {
926 			cl->leaf.deficit[level] += cl->quantum;
927 			htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr :
928 						 &q->hlevel[0].hprio[prio].ptr);
929 		}
930 		/* this used to be after charge_class but this constelation
931 		 * gives us slightly better performance
932 		 */
933 		if (!cl->leaf.q->q.qlen)
934 			htb_deactivate(q, cl);
935 		htb_charge_class(q, cl, level, skb);
936 	}
937 	return skb;
938 }
939 
940 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
941 {
942 	struct sk_buff *skb;
943 	struct htb_sched *q = qdisc_priv(sch);
944 	int level;
945 	s64 next_event;
946 	unsigned long start_at;
947 
948 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
949 	skb = __qdisc_dequeue_head(&q->direct_queue);
950 	if (skb != NULL) {
951 ok:
952 		qdisc_bstats_update(sch, skb);
953 		qdisc_qstats_backlog_dec(sch, skb);
954 		sch->q.qlen--;
955 		return skb;
956 	}
957 
958 	if (!sch->q.qlen)
959 		goto fin;
960 	q->now = ktime_get_ns();
961 	start_at = jiffies;
962 
963 	next_event = q->now + 5LLU * NSEC_PER_SEC;
964 
965 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
966 		/* common case optimization - skip event handler quickly */
967 		int m;
968 		s64 event = q->near_ev_cache[level];
969 
970 		if (q->now >= event) {
971 			event = htb_do_events(q, level, start_at);
972 			if (!event)
973 				event = q->now + NSEC_PER_SEC;
974 			q->near_ev_cache[level] = event;
975 		}
976 
977 		if (next_event > event)
978 			next_event = event;
979 
980 		m = ~q->row_mask[level];
981 		while (m != (int)(-1)) {
982 			int prio = ffz(m);
983 
984 			m |= 1 << prio;
985 			skb = htb_dequeue_tree(q, prio, level);
986 			if (likely(skb != NULL))
987 				goto ok;
988 		}
989 	}
990 	if (likely(next_event > q->now))
991 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
992 	else
993 		schedule_work(&q->work);
994 fin:
995 	return skb;
996 }
997 
998 /* reset all classes */
999 /* always caled under BH & queue lock */
1000 static void htb_reset(struct Qdisc *sch)
1001 {
1002 	struct htb_sched *q = qdisc_priv(sch);
1003 	struct htb_class *cl;
1004 	unsigned int i;
1005 
1006 	for (i = 0; i < q->clhash.hashsize; i++) {
1007 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1008 			if (cl->level)
1009 				memset(&cl->inner, 0, sizeof(cl->inner));
1010 			else {
1011 				if (cl->leaf.q && !q->offload)
1012 					qdisc_reset(cl->leaf.q);
1013 			}
1014 			cl->prio_activity = 0;
1015 			cl->cmode = HTB_CAN_SEND;
1016 		}
1017 	}
1018 	qdisc_watchdog_cancel(&q->watchdog);
1019 	__qdisc_reset_queue(&q->direct_queue);
1020 	memset(q->hlevel, 0, sizeof(q->hlevel));
1021 	memset(q->row_mask, 0, sizeof(q->row_mask));
1022 }
1023 
1024 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1025 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1026 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1027 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1028 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1029 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1030 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
1031 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
1032 	[TCA_HTB_OFFLOAD] = { .type = NLA_FLAG },
1033 };
1034 
1035 static void htb_work_func(struct work_struct *work)
1036 {
1037 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1038 	struct Qdisc *sch = q->watchdog.qdisc;
1039 
1040 	rcu_read_lock();
1041 	__netif_schedule(qdisc_root(sch));
1042 	rcu_read_unlock();
1043 }
1044 
1045 static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt)
1046 {
1047 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt);
1048 }
1049 
1050 static int htb_init(struct Qdisc *sch, struct nlattr *opt,
1051 		    struct netlink_ext_ack *extack)
1052 {
1053 	struct net_device *dev = qdisc_dev(sch);
1054 	struct tc_htb_qopt_offload offload_opt;
1055 	struct htb_sched *q = qdisc_priv(sch);
1056 	struct nlattr *tb[TCA_HTB_MAX + 1];
1057 	struct tc_htb_glob *gopt;
1058 	unsigned int ntx;
1059 	bool offload;
1060 	int err;
1061 
1062 	qdisc_watchdog_init(&q->watchdog, sch);
1063 	INIT_WORK(&q->work, htb_work_func);
1064 
1065 	if (!opt)
1066 		return -EINVAL;
1067 
1068 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
1069 	if (err)
1070 		return err;
1071 
1072 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1073 					  NULL);
1074 	if (err < 0)
1075 		return err;
1076 
1077 	if (!tb[TCA_HTB_INIT])
1078 		return -EINVAL;
1079 
1080 	gopt = nla_data(tb[TCA_HTB_INIT]);
1081 	if (gopt->version != HTB_VER >> 16)
1082 		return -EINVAL;
1083 
1084 	offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
1085 
1086 	if (offload) {
1087 		if (sch->parent != TC_H_ROOT) {
1088 			NL_SET_ERR_MSG(extack, "HTB must be the root qdisc to use offload");
1089 			return -EOPNOTSUPP;
1090 		}
1091 
1092 		if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) {
1093 			NL_SET_ERR_MSG(extack, "hw-tc-offload ethtool feature flag must be on");
1094 			return -EOPNOTSUPP;
1095 		}
1096 
1097 		q->num_direct_qdiscs = dev->real_num_tx_queues;
1098 		q->direct_qdiscs = kcalloc(q->num_direct_qdiscs,
1099 					   sizeof(*q->direct_qdiscs),
1100 					   GFP_KERNEL);
1101 		if (!q->direct_qdiscs)
1102 			return -ENOMEM;
1103 	}
1104 
1105 	err = qdisc_class_hash_init(&q->clhash);
1106 	if (err < 0)
1107 		return err;
1108 
1109 	if (tb[TCA_HTB_DIRECT_QLEN])
1110 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1111 	else
1112 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1113 
1114 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1115 		q->rate2quantum = 1;
1116 	q->defcls = gopt->defcls;
1117 
1118 	if (!offload)
1119 		return 0;
1120 
1121 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1122 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1123 		struct Qdisc *qdisc;
1124 
1125 		qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1126 					  TC_H_MAKE(sch->handle, 0), extack);
1127 		if (!qdisc) {
1128 			return -ENOMEM;
1129 		}
1130 
1131 		q->direct_qdiscs[ntx] = qdisc;
1132 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1133 	}
1134 
1135 	sch->flags |= TCQ_F_MQROOT;
1136 
1137 	offload_opt = (struct tc_htb_qopt_offload) {
1138 		.command = TC_HTB_CREATE,
1139 		.parent_classid = TC_H_MAJ(sch->handle) >> 16,
1140 		.classid = TC_H_MIN(q->defcls),
1141 		.extack = extack,
1142 	};
1143 	err = htb_offload(dev, &offload_opt);
1144 	if (err)
1145 		return err;
1146 
1147 	/* Defer this assignment, so that htb_destroy skips offload-related
1148 	 * parts (especially calling ndo_setup_tc) on errors.
1149 	 */
1150 	q->offload = true;
1151 
1152 	return 0;
1153 }
1154 
1155 static void htb_attach_offload(struct Qdisc *sch)
1156 {
1157 	struct net_device *dev = qdisc_dev(sch);
1158 	struct htb_sched *q = qdisc_priv(sch);
1159 	unsigned int ntx;
1160 
1161 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1162 		struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx];
1163 
1164 		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
1165 		qdisc_put(old);
1166 		qdisc_hash_add(qdisc, false);
1167 	}
1168 	for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) {
1169 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1170 		struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL);
1171 
1172 		qdisc_put(old);
1173 	}
1174 
1175 	kfree(q->direct_qdiscs);
1176 	q->direct_qdiscs = NULL;
1177 }
1178 
1179 static void htb_attach_software(struct Qdisc *sch)
1180 {
1181 	struct net_device *dev = qdisc_dev(sch);
1182 	unsigned int ntx;
1183 
1184 	/* Resemble qdisc_graft behavior. */
1185 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
1186 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1187 		struct Qdisc *old = dev_graft_qdisc(dev_queue, sch);
1188 
1189 		qdisc_refcount_inc(sch);
1190 
1191 		qdisc_put(old);
1192 	}
1193 }
1194 
1195 static void htb_attach(struct Qdisc *sch)
1196 {
1197 	struct htb_sched *q = qdisc_priv(sch);
1198 
1199 	if (q->offload)
1200 		htb_attach_offload(sch);
1201 	else
1202 		htb_attach_software(sch);
1203 }
1204 
1205 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1206 {
1207 	struct htb_sched *q = qdisc_priv(sch);
1208 	struct nlattr *nest;
1209 	struct tc_htb_glob gopt;
1210 
1211 	if (q->offload)
1212 		sch->flags |= TCQ_F_OFFLOADED;
1213 	else
1214 		sch->flags &= ~TCQ_F_OFFLOADED;
1215 
1216 	sch->qstats.overlimits = q->overlimits;
1217 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1218 	 * no change can happen on the qdisc parameters.
1219 	 */
1220 
1221 	gopt.direct_pkts = q->direct_pkts;
1222 	gopt.version = HTB_VER;
1223 	gopt.rate2quantum = q->rate2quantum;
1224 	gopt.defcls = q->defcls;
1225 	gopt.debug = 0;
1226 
1227 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1228 	if (nest == NULL)
1229 		goto nla_put_failure;
1230 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1231 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1232 		goto nla_put_failure;
1233 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1234 		goto nla_put_failure;
1235 
1236 	return nla_nest_end(skb, nest);
1237 
1238 nla_put_failure:
1239 	nla_nest_cancel(skb, nest);
1240 	return -1;
1241 }
1242 
1243 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1244 			  struct sk_buff *skb, struct tcmsg *tcm)
1245 {
1246 	struct htb_class *cl = (struct htb_class *)arg;
1247 	struct htb_sched *q = qdisc_priv(sch);
1248 	struct nlattr *nest;
1249 	struct tc_htb_opt opt;
1250 
1251 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1252 	 * no change can happen on the class parameters.
1253 	 */
1254 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1255 	tcm->tcm_handle = cl->common.classid;
1256 	if (!cl->level && cl->leaf.q)
1257 		tcm->tcm_info = cl->leaf.q->handle;
1258 
1259 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1260 	if (nest == NULL)
1261 		goto nla_put_failure;
1262 
1263 	memset(&opt, 0, sizeof(opt));
1264 
1265 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1266 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1267 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1268 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1269 	opt.quantum = cl->quantum;
1270 	opt.prio = cl->prio;
1271 	opt.level = cl->level;
1272 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1273 		goto nla_put_failure;
1274 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1275 		goto nla_put_failure;
1276 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1277 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1278 			      TCA_HTB_PAD))
1279 		goto nla_put_failure;
1280 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1281 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1282 			      TCA_HTB_PAD))
1283 		goto nla_put_failure;
1284 
1285 	return nla_nest_end(skb, nest);
1286 
1287 nla_put_failure:
1288 	nla_nest_cancel(skb, nest);
1289 	return -1;
1290 }
1291 
1292 static void htb_offload_aggregate_stats(struct htb_sched *q,
1293 					struct htb_class *cl)
1294 {
1295 	u64 bytes = 0, packets = 0;
1296 	struct htb_class *c;
1297 	unsigned int i;
1298 
1299 	gnet_stats_basic_sync_init(&cl->bstats);
1300 
1301 	for (i = 0; i < q->clhash.hashsize; i++) {
1302 		hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) {
1303 			struct htb_class *p = c;
1304 
1305 			while (p && p->level < cl->level)
1306 				p = p->parent;
1307 
1308 			if (p != cl)
1309 				continue;
1310 
1311 			bytes += u64_stats_read(&c->bstats_bias.bytes);
1312 			packets += u64_stats_read(&c->bstats_bias.packets);
1313 			if (c->level == 0) {
1314 				bytes += u64_stats_read(&c->leaf.q->bstats.bytes);
1315 				packets += u64_stats_read(&c->leaf.q->bstats.packets);
1316 			}
1317 		}
1318 	}
1319 	_bstats_update(&cl->bstats, bytes, packets);
1320 }
1321 
1322 static int
1323 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1324 {
1325 	struct htb_class *cl = (struct htb_class *)arg;
1326 	struct htb_sched *q = qdisc_priv(sch);
1327 	struct gnet_stats_queue qs = {
1328 		.drops = cl->drops,
1329 		.overlimits = cl->overlimits,
1330 	};
1331 	__u32 qlen = 0;
1332 
1333 	if (!cl->level && cl->leaf.q)
1334 		qdisc_qstats_qlen_backlog(cl->leaf.q, &qlen, &qs.backlog);
1335 
1336 	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1337 				    INT_MIN, INT_MAX);
1338 	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1339 				     INT_MIN, INT_MAX);
1340 
1341 	if (q->offload) {
1342 		if (!cl->level) {
1343 			if (cl->leaf.q)
1344 				cl->bstats = cl->leaf.q->bstats;
1345 			else
1346 				gnet_stats_basic_sync_init(&cl->bstats);
1347 			_bstats_update(&cl->bstats,
1348 				       u64_stats_read(&cl->bstats_bias.bytes),
1349 				       u64_stats_read(&cl->bstats_bias.packets));
1350 		} else {
1351 			htb_offload_aggregate_stats(q, cl);
1352 		}
1353 	}
1354 
1355 	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1356 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1357 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1358 		return -1;
1359 
1360 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1361 }
1362 
1363 static struct netdev_queue *
1364 htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
1365 {
1366 	struct net_device *dev = qdisc_dev(sch);
1367 	struct tc_htb_qopt_offload offload_opt;
1368 	struct htb_sched *q = qdisc_priv(sch);
1369 	int err;
1370 
1371 	if (!q->offload)
1372 		return sch->dev_queue;
1373 
1374 	offload_opt = (struct tc_htb_qopt_offload) {
1375 		.command = TC_HTB_LEAF_QUERY_QUEUE,
1376 		.classid = TC_H_MIN(tcm->tcm_parent),
1377 	};
1378 	err = htb_offload(dev, &offload_opt);
1379 	if (err || offload_opt.qid >= dev->num_tx_queues)
1380 		return NULL;
1381 	return netdev_get_tx_queue(dev, offload_opt.qid);
1382 }
1383 
1384 static struct Qdisc *
1385 htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q)
1386 {
1387 	struct net_device *dev = dev_queue->dev;
1388 	struct Qdisc *old_q;
1389 
1390 	if (dev->flags & IFF_UP)
1391 		dev_deactivate(dev);
1392 	old_q = dev_graft_qdisc(dev_queue, new_q);
1393 	if (new_q)
1394 		new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1395 	if (dev->flags & IFF_UP)
1396 		dev_activate(dev);
1397 
1398 	return old_q;
1399 }
1400 
1401 static struct netdev_queue *htb_offload_get_queue(struct htb_class *cl)
1402 {
1403 	struct netdev_queue *queue;
1404 
1405 	queue = cl->leaf.offload_queue;
1406 	if (!(cl->leaf.q->flags & TCQ_F_BUILTIN))
1407 		WARN_ON(cl->leaf.q->dev_queue != queue);
1408 
1409 	return queue;
1410 }
1411 
1412 static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old,
1413 				   struct htb_class *cl_new, bool destroying)
1414 {
1415 	struct netdev_queue *queue_old, *queue_new;
1416 	struct net_device *dev = qdisc_dev(sch);
1417 
1418 	queue_old = htb_offload_get_queue(cl_old);
1419 	queue_new = htb_offload_get_queue(cl_new);
1420 
1421 	if (!destroying) {
1422 		struct Qdisc *qdisc;
1423 
1424 		if (dev->flags & IFF_UP)
1425 			dev_deactivate(dev);
1426 		qdisc = dev_graft_qdisc(queue_old, NULL);
1427 		WARN_ON(qdisc != cl_old->leaf.q);
1428 	}
1429 
1430 	if (!(cl_old->leaf.q->flags & TCQ_F_BUILTIN))
1431 		cl_old->leaf.q->dev_queue = queue_new;
1432 	cl_old->leaf.offload_queue = queue_new;
1433 
1434 	if (!destroying) {
1435 		struct Qdisc *qdisc;
1436 
1437 		qdisc = dev_graft_qdisc(queue_new, cl_old->leaf.q);
1438 		if (dev->flags & IFF_UP)
1439 			dev_activate(dev);
1440 		WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
1441 	}
1442 }
1443 
1444 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1445 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1446 {
1447 	struct netdev_queue *dev_queue = sch->dev_queue;
1448 	struct htb_class *cl = (struct htb_class *)arg;
1449 	struct htb_sched *q = qdisc_priv(sch);
1450 	struct Qdisc *old_q;
1451 
1452 	if (cl->level)
1453 		return -EINVAL;
1454 
1455 	if (q->offload)
1456 		dev_queue = htb_offload_get_queue(cl);
1457 
1458 	if (!new) {
1459 		new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1460 					cl->common.classid, extack);
1461 		if (!new)
1462 			return -ENOBUFS;
1463 	}
1464 
1465 	if (q->offload) {
1466 		/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1467 		qdisc_refcount_inc(new);
1468 		old_q = htb_graft_helper(dev_queue, new);
1469 	}
1470 
1471 	*old = qdisc_replace(sch, new, &cl->leaf.q);
1472 
1473 	if (q->offload) {
1474 		WARN_ON(old_q != *old);
1475 		qdisc_put(old_q);
1476 	}
1477 
1478 	return 0;
1479 }
1480 
1481 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1482 {
1483 	struct htb_class *cl = (struct htb_class *)arg;
1484 	return !cl->level ? cl->leaf.q : NULL;
1485 }
1486 
1487 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1488 {
1489 	struct htb_class *cl = (struct htb_class *)arg;
1490 
1491 	htb_deactivate(qdisc_priv(sch), cl);
1492 }
1493 
1494 static inline int htb_parent_last_child(struct htb_class *cl)
1495 {
1496 	if (!cl->parent)
1497 		/* the root class */
1498 		return 0;
1499 	if (cl->parent->children > 1)
1500 		/* not the last child */
1501 		return 0;
1502 	return 1;
1503 }
1504 
1505 static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
1506 			       struct Qdisc *new_q)
1507 {
1508 	struct htb_sched *q = qdisc_priv(sch);
1509 	struct htb_class *parent = cl->parent;
1510 
1511 	WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
1512 
1513 	if (parent->cmode != HTB_CAN_SEND)
1514 		htb_safe_rb_erase(&parent->pq_node,
1515 				  &q->hlevel[parent->level].wait_pq);
1516 
1517 	parent->level = 0;
1518 	memset(&parent->inner, 0, sizeof(parent->inner));
1519 	parent->leaf.q = new_q ? new_q : &noop_qdisc;
1520 	parent->tokens = parent->buffer;
1521 	parent->ctokens = parent->cbuffer;
1522 	parent->t_c = ktime_get_ns();
1523 	parent->cmode = HTB_CAN_SEND;
1524 	if (q->offload)
1525 		parent->leaf.offload_queue = cl->leaf.offload_queue;
1526 }
1527 
1528 static void htb_parent_to_leaf_offload(struct Qdisc *sch,
1529 				       struct netdev_queue *dev_queue,
1530 				       struct Qdisc *new_q)
1531 {
1532 	struct Qdisc *old_q;
1533 
1534 	/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1535 	if (new_q)
1536 		qdisc_refcount_inc(new_q);
1537 	old_q = htb_graft_helper(dev_queue, new_q);
1538 	WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1539 }
1540 
1541 static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
1542 				     bool last_child, bool destroying,
1543 				     struct netlink_ext_ack *extack)
1544 {
1545 	struct tc_htb_qopt_offload offload_opt;
1546 	struct netdev_queue *dev_queue;
1547 	struct Qdisc *q = cl->leaf.q;
1548 	struct Qdisc *old;
1549 	int err;
1550 
1551 	if (cl->level)
1552 		return -EINVAL;
1553 
1554 	WARN_ON(!q);
1555 	dev_queue = htb_offload_get_queue(cl);
1556 	/* When destroying, caller qdisc_graft grafts the new qdisc and invokes
1557 	 * qdisc_put for the qdisc being destroyed. htb_destroy_class_offload
1558 	 * does not need to graft or qdisc_put the qdisc being destroyed.
1559 	 */
1560 	if (!destroying) {
1561 		old = htb_graft_helper(dev_queue, NULL);
1562 		/* Last qdisc grafted should be the same as cl->leaf.q when
1563 		 * calling htb_delete.
1564 		 */
1565 		WARN_ON(old != q);
1566 	}
1567 
1568 	if (cl->parent) {
1569 		_bstats_update(&cl->parent->bstats_bias,
1570 			       u64_stats_read(&q->bstats.bytes),
1571 			       u64_stats_read(&q->bstats.packets));
1572 	}
1573 
1574 	offload_opt = (struct tc_htb_qopt_offload) {
1575 		.command = !last_child ? TC_HTB_LEAF_DEL :
1576 			   destroying ? TC_HTB_LEAF_DEL_LAST_FORCE :
1577 			   TC_HTB_LEAF_DEL_LAST,
1578 		.classid = cl->common.classid,
1579 		.extack = extack,
1580 	};
1581 	err = htb_offload(qdisc_dev(sch), &offload_opt);
1582 
1583 	if (!destroying) {
1584 		if (!err)
1585 			qdisc_put(old);
1586 		else
1587 			htb_graft_helper(dev_queue, old);
1588 	}
1589 
1590 	if (last_child)
1591 		return err;
1592 
1593 	if (!err && offload_opt.classid != TC_H_MIN(cl->common.classid)) {
1594 		u32 classid = TC_H_MAJ(sch->handle) |
1595 			      TC_H_MIN(offload_opt.classid);
1596 		struct htb_class *moved_cl = htb_find(classid, sch);
1597 
1598 		htb_offload_move_qdisc(sch, moved_cl, cl, destroying);
1599 	}
1600 
1601 	return err;
1602 }
1603 
1604 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1605 {
1606 	if (!cl->level) {
1607 		WARN_ON(!cl->leaf.q);
1608 		qdisc_put(cl->leaf.q);
1609 	}
1610 	gen_kill_estimator(&cl->rate_est);
1611 	tcf_block_put(cl->block);
1612 	kfree(cl);
1613 }
1614 
1615 static void htb_destroy(struct Qdisc *sch)
1616 {
1617 	struct net_device *dev = qdisc_dev(sch);
1618 	struct tc_htb_qopt_offload offload_opt;
1619 	struct htb_sched *q = qdisc_priv(sch);
1620 	struct hlist_node *next;
1621 	bool nonempty, changed;
1622 	struct htb_class *cl;
1623 	unsigned int i;
1624 
1625 	cancel_work_sync(&q->work);
1626 	qdisc_watchdog_cancel(&q->watchdog);
1627 	/* This line used to be after htb_destroy_class call below
1628 	 * and surprisingly it worked in 2.4. But it must precede it
1629 	 * because filter need its target class alive to be able to call
1630 	 * unbind_filter on it (without Oops).
1631 	 */
1632 	tcf_block_put(q->block);
1633 
1634 	for (i = 0; i < q->clhash.hashsize; i++) {
1635 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1636 			tcf_block_put(cl->block);
1637 			cl->block = NULL;
1638 		}
1639 	}
1640 
1641 	do {
1642 		nonempty = false;
1643 		changed = false;
1644 		for (i = 0; i < q->clhash.hashsize; i++) {
1645 			hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1646 						  common.hnode) {
1647 				bool last_child;
1648 
1649 				if (!q->offload) {
1650 					htb_destroy_class(sch, cl);
1651 					continue;
1652 				}
1653 
1654 				nonempty = true;
1655 
1656 				if (cl->level)
1657 					continue;
1658 
1659 				changed = true;
1660 
1661 				last_child = htb_parent_last_child(cl);
1662 				htb_destroy_class_offload(sch, cl, last_child,
1663 							  true, NULL);
1664 				qdisc_class_hash_remove(&q->clhash,
1665 							&cl->common);
1666 				if (cl->parent)
1667 					cl->parent->children--;
1668 				if (last_child)
1669 					htb_parent_to_leaf(sch, cl, NULL);
1670 				htb_destroy_class(sch, cl);
1671 			}
1672 		}
1673 	} while (changed);
1674 	WARN_ON(nonempty);
1675 
1676 	qdisc_class_hash_destroy(&q->clhash);
1677 	__qdisc_reset_queue(&q->direct_queue);
1678 
1679 	if (q->offload) {
1680 		offload_opt = (struct tc_htb_qopt_offload) {
1681 			.command = TC_HTB_DESTROY,
1682 		};
1683 		htb_offload(dev, &offload_opt);
1684 	}
1685 
1686 	if (!q->direct_qdiscs)
1687 		return;
1688 	for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++)
1689 		qdisc_put(q->direct_qdiscs[i]);
1690 	kfree(q->direct_qdiscs);
1691 }
1692 
1693 static int htb_delete(struct Qdisc *sch, unsigned long arg,
1694 		      struct netlink_ext_ack *extack)
1695 {
1696 	struct htb_sched *q = qdisc_priv(sch);
1697 	struct htb_class *cl = (struct htb_class *)arg;
1698 	struct Qdisc *new_q = NULL;
1699 	int last_child = 0;
1700 	int err;
1701 
1702 	/* TODO: why don't allow to delete subtree ? references ? does
1703 	 * tc subsys guarantee us that in htb_destroy it holds no class
1704 	 * refs so that we can remove children safely there ?
1705 	 */
1706 	if (cl->children || qdisc_class_in_use(&cl->common)) {
1707 		NL_SET_ERR_MSG(extack, "HTB class in use");
1708 		return -EBUSY;
1709 	}
1710 
1711 	if (!cl->level && htb_parent_last_child(cl))
1712 		last_child = 1;
1713 
1714 	if (q->offload) {
1715 		err = htb_destroy_class_offload(sch, cl, last_child, false,
1716 						extack);
1717 		if (err)
1718 			return err;
1719 	}
1720 
1721 	if (last_child) {
1722 		struct netdev_queue *dev_queue = sch->dev_queue;
1723 
1724 		if (q->offload)
1725 			dev_queue = htb_offload_get_queue(cl);
1726 
1727 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1728 					  cl->parent->common.classid,
1729 					  NULL);
1730 		if (q->offload)
1731 			htb_parent_to_leaf_offload(sch, dev_queue, new_q);
1732 	}
1733 
1734 	sch_tree_lock(sch);
1735 
1736 	if (!cl->level)
1737 		qdisc_purge_queue(cl->leaf.q);
1738 
1739 	/* delete from hash and active; remainder in destroy_class */
1740 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1741 	if (cl->parent)
1742 		cl->parent->children--;
1743 
1744 	htb_deactivate(q, cl);
1745 
1746 	if (cl->cmode != HTB_CAN_SEND)
1747 		htb_safe_rb_erase(&cl->pq_node,
1748 				  &q->hlevel[cl->level].wait_pq);
1749 
1750 	if (last_child)
1751 		htb_parent_to_leaf(sch, cl, new_q);
1752 
1753 	sch_tree_unlock(sch);
1754 
1755 	htb_destroy_class(sch, cl);
1756 	return 0;
1757 }
1758 
1759 static int htb_change_class(struct Qdisc *sch, u32 classid,
1760 			    u32 parentid, struct nlattr **tca,
1761 			    unsigned long *arg, struct netlink_ext_ack *extack)
1762 {
1763 	int err = -EINVAL;
1764 	struct htb_sched *q = qdisc_priv(sch);
1765 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1766 	struct tc_htb_qopt_offload offload_opt;
1767 	struct nlattr *opt = tca[TCA_OPTIONS];
1768 	struct nlattr *tb[TCA_HTB_MAX + 1];
1769 	struct Qdisc *parent_qdisc = NULL;
1770 	struct netdev_queue *dev_queue;
1771 	struct tc_htb_opt *hopt;
1772 	u64 rate64, ceil64;
1773 	int warn = 0;
1774 
1775 	/* extract all subattrs from opt attr */
1776 	if (!opt)
1777 		goto failure;
1778 
1779 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1780 					  extack);
1781 	if (err < 0)
1782 		goto failure;
1783 
1784 	err = -EINVAL;
1785 	if (tb[TCA_HTB_PARMS] == NULL)
1786 		goto failure;
1787 
1788 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1789 
1790 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1791 	if (!hopt->rate.rate || !hopt->ceil.rate)
1792 		goto failure;
1793 
1794 	if (q->offload) {
1795 		/* Options not supported by the offload. */
1796 		if (hopt->rate.overhead || hopt->ceil.overhead) {
1797 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the overhead parameter");
1798 			goto failure;
1799 		}
1800 		if (hopt->rate.mpu || hopt->ceil.mpu) {
1801 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter");
1802 			goto failure;
1803 		}
1804 	}
1805 
1806 	/* Keeping backward compatible with rate_table based iproute2 tc */
1807 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1808 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
1809 					      NULL));
1810 
1811 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1812 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
1813 					      NULL));
1814 
1815 	rate64 = nla_get_u64_default(tb[TCA_HTB_RATE64], 0);
1816 	ceil64 = nla_get_u64_default(tb[TCA_HTB_CEIL64], 0);
1817 
1818 	if (!cl) {		/* new class */
1819 		struct net_device *dev = qdisc_dev(sch);
1820 		struct Qdisc *new_q, *old_q;
1821 		int prio;
1822 		struct {
1823 			struct nlattr		nla;
1824 			struct gnet_estimator	opt;
1825 		} est = {
1826 			.nla = {
1827 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1828 				.nla_type	= TCA_RATE,
1829 			},
1830 			.opt = {
1831 				/* 4s interval, 16s averaging constant */
1832 				.interval	= 2,
1833 				.ewma_log	= 2,
1834 			},
1835 		};
1836 
1837 		/* check for valid classid */
1838 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1839 		    htb_find(classid, sch))
1840 			goto failure;
1841 
1842 		/* check maximal depth */
1843 		if (parent && parent->parent && parent->parent->level < 2) {
1844 			NL_SET_ERR_MSG_MOD(extack, "tree is too deep");
1845 			goto failure;
1846 		}
1847 		err = -ENOBUFS;
1848 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1849 		if (!cl)
1850 			goto failure;
1851 
1852 		gnet_stats_basic_sync_init(&cl->bstats);
1853 		gnet_stats_basic_sync_init(&cl->bstats_bias);
1854 
1855 		err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1856 		if (err) {
1857 			kfree(cl);
1858 			goto failure;
1859 		}
1860 		if (htb_rate_est || tca[TCA_RATE]) {
1861 			err = gen_new_estimator(&cl->bstats, NULL,
1862 						&cl->rate_est,
1863 						NULL,
1864 						true,
1865 						tca[TCA_RATE] ? : &est.nla);
1866 			if (err)
1867 				goto err_block_put;
1868 		}
1869 
1870 		cl->children = 0;
1871 		RB_CLEAR_NODE(&cl->pq_node);
1872 
1873 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1874 			RB_CLEAR_NODE(&cl->node[prio]);
1875 
1876 		cl->common.classid = classid;
1877 
1878 		/* Make sure nothing interrupts us in between of two
1879 		 * ndo_setup_tc calls.
1880 		 */
1881 		ASSERT_RTNL();
1882 
1883 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1884 		 * so that can't be used inside of sch_tree_lock
1885 		 * -- thanks to Karlis Peisenieks
1886 		 */
1887 		if (!q->offload) {
1888 			dev_queue = sch->dev_queue;
1889 		} else if (!(parent && !parent->level)) {
1890 			/* Assign a dev_queue to this classid. */
1891 			offload_opt = (struct tc_htb_qopt_offload) {
1892 				.command = TC_HTB_LEAF_ALLOC_QUEUE,
1893 				.classid = cl->common.classid,
1894 				.parent_classid = parent ?
1895 					TC_H_MIN(parent->common.classid) :
1896 					TC_HTB_CLASSID_ROOT,
1897 				.rate = max_t(u64, hopt->rate.rate, rate64),
1898 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1899 				.prio = hopt->prio,
1900 				.quantum = hopt->quantum,
1901 				.extack = extack,
1902 			};
1903 			err = htb_offload(dev, &offload_opt);
1904 			if (err) {
1905 				NL_SET_ERR_MSG_WEAK(extack,
1906 						    "Failed to offload TC_HTB_LEAF_ALLOC_QUEUE");
1907 				goto err_kill_estimator;
1908 			}
1909 			dev_queue = netdev_get_tx_queue(dev, offload_opt.qid);
1910 		} else { /* First child. */
1911 			dev_queue = htb_offload_get_queue(parent);
1912 			old_q = htb_graft_helper(dev_queue, NULL);
1913 			WARN_ON(old_q != parent->leaf.q);
1914 			offload_opt = (struct tc_htb_qopt_offload) {
1915 				.command = TC_HTB_LEAF_TO_INNER,
1916 				.classid = cl->common.classid,
1917 				.parent_classid =
1918 					TC_H_MIN(parent->common.classid),
1919 				.rate = max_t(u64, hopt->rate.rate, rate64),
1920 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1921 				.prio = hopt->prio,
1922 				.quantum = hopt->quantum,
1923 				.extack = extack,
1924 			};
1925 			err = htb_offload(dev, &offload_opt);
1926 			if (err) {
1927 				NL_SET_ERR_MSG_WEAK(extack,
1928 						    "Failed to offload TC_HTB_LEAF_TO_INNER");
1929 				htb_graft_helper(dev_queue, old_q);
1930 				goto err_kill_estimator;
1931 			}
1932 			_bstats_update(&parent->bstats_bias,
1933 				       u64_stats_read(&old_q->bstats.bytes),
1934 				       u64_stats_read(&old_q->bstats.packets));
1935 			qdisc_put(old_q);
1936 		}
1937 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1938 					  classid, NULL);
1939 		if (q->offload) {
1940 			/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1941 			if (new_q)
1942 				qdisc_refcount_inc(new_q);
1943 			old_q = htb_graft_helper(dev_queue, new_q);
1944 			/* No qdisc_put needed. */
1945 			WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1946 		}
1947 		sch_tree_lock(sch);
1948 		if (parent && !parent->level) {
1949 			/* turn parent into inner node */
1950 			qdisc_purge_queue(parent->leaf.q);
1951 			parent_qdisc = parent->leaf.q;
1952 			htb_deactivate(q, parent);
1953 
1954 			/* remove from evt list because of level change */
1955 			if (parent->cmode != HTB_CAN_SEND) {
1956 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1957 				parent->cmode = HTB_CAN_SEND;
1958 			}
1959 			parent->level = (parent->parent ? parent->parent->level
1960 					 : TC_HTB_MAXDEPTH) - 1;
1961 			memset(&parent->inner, 0, sizeof(parent->inner));
1962 		}
1963 
1964 		/* leaf (we) needs elementary qdisc */
1965 		cl->leaf.q = new_q ? new_q : &noop_qdisc;
1966 		if (q->offload)
1967 			cl->leaf.offload_queue = dev_queue;
1968 
1969 		cl->parent = parent;
1970 
1971 		/* set class to be in HTB_CAN_SEND state */
1972 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1973 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1974 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1975 		cl->t_c = ktime_get_ns();
1976 		cl->cmode = HTB_CAN_SEND;
1977 
1978 		/* attach to the hash list and parent's family */
1979 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1980 		if (parent)
1981 			parent->children++;
1982 		if (cl->leaf.q != &noop_qdisc)
1983 			qdisc_hash_add(cl->leaf.q, true);
1984 	} else {
1985 		if (tca[TCA_RATE]) {
1986 			err = gen_replace_estimator(&cl->bstats, NULL,
1987 						    &cl->rate_est,
1988 						    NULL,
1989 						    true,
1990 						    tca[TCA_RATE]);
1991 			if (err)
1992 				return err;
1993 		}
1994 
1995 		if (q->offload) {
1996 			struct net_device *dev = qdisc_dev(sch);
1997 
1998 			offload_opt = (struct tc_htb_qopt_offload) {
1999 				.command = TC_HTB_NODE_MODIFY,
2000 				.classid = cl->common.classid,
2001 				.rate = max_t(u64, hopt->rate.rate, rate64),
2002 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
2003 				.prio = hopt->prio,
2004 				.quantum = hopt->quantum,
2005 				.extack = extack,
2006 			};
2007 			err = htb_offload(dev, &offload_opt);
2008 			if (err)
2009 				/* Estimator was replaced, and rollback may fail
2010 				 * as well, so we don't try to recover it, and
2011 				 * the estimator won't work property with the
2012 				 * offload anyway, because bstats are updated
2013 				 * only when the stats are queried.
2014 				 */
2015 				return err;
2016 		}
2017 
2018 		sch_tree_lock(sch);
2019 	}
2020 
2021 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
2022 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
2023 
2024 	/* it used to be a nasty bug here, we have to check that node
2025 	 * is really leaf before changing cl->leaf !
2026 	 */
2027 	if (!cl->level) {
2028 		u64 quantum = cl->rate.rate_bytes_ps;
2029 
2030 		do_div(quantum, q->rate2quantum);
2031 		cl->quantum = min_t(u64, quantum, INT_MAX);
2032 
2033 		if (!hopt->quantum && cl->quantum < 1000) {
2034 			warn = -1;
2035 			cl->quantum = 1000;
2036 		}
2037 		if (!hopt->quantum && cl->quantum > 200000) {
2038 			warn = 1;
2039 			cl->quantum = 200000;
2040 		}
2041 		if (hopt->quantum)
2042 			cl->quantum = hopt->quantum;
2043 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
2044 			cl->prio = TC_HTB_NUMPRIO - 1;
2045 	}
2046 
2047 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
2048 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
2049 
2050 	sch_tree_unlock(sch);
2051 	qdisc_put(parent_qdisc);
2052 
2053 	if (warn)
2054 		NL_SET_ERR_MSG_FMT_MOD(extack,
2055 				       "quantum of class %X is %s. Consider r2q change.",
2056 				       cl->common.classid, (warn == -1 ? "small" : "big"));
2057 
2058 	qdisc_class_hash_grow(sch, &q->clhash);
2059 
2060 	*arg = (unsigned long)cl;
2061 	return 0;
2062 
2063 err_kill_estimator:
2064 	gen_kill_estimator(&cl->rate_est);
2065 err_block_put:
2066 	tcf_block_put(cl->block);
2067 	kfree(cl);
2068 failure:
2069 	return err;
2070 }
2071 
2072 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg,
2073 				       struct netlink_ext_ack *extack)
2074 {
2075 	struct htb_sched *q = qdisc_priv(sch);
2076 	struct htb_class *cl = (struct htb_class *)arg;
2077 
2078 	return cl ? cl->block : q->block;
2079 }
2080 
2081 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
2082 				     u32 classid)
2083 {
2084 	struct htb_class *cl = htb_find(classid, sch);
2085 
2086 	/*if (cl && !cl->level) return 0;
2087 	 * The line above used to be there to prevent attaching filters to
2088 	 * leaves. But at least tc_index filter uses this just to get class
2089 	 * for other reasons so that we have to allow for it.
2090 	 * ----
2091 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
2092 	 * another way to "lock" the class - unlike "get" this lock can
2093 	 * be broken by class during destroy IIUC.
2094 	 */
2095 	if (cl)
2096 		qdisc_class_get(&cl->common);
2097 	return (unsigned long)cl;
2098 }
2099 
2100 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
2101 {
2102 	struct htb_class *cl = (struct htb_class *)arg;
2103 
2104 	qdisc_class_put(&cl->common);
2105 }
2106 
2107 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2108 {
2109 	struct htb_sched *q = qdisc_priv(sch);
2110 	struct htb_class *cl;
2111 	unsigned int i;
2112 
2113 	if (arg->stop)
2114 		return;
2115 
2116 	for (i = 0; i < q->clhash.hashsize; i++) {
2117 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
2118 			if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
2119 				return;
2120 		}
2121 	}
2122 }
2123 
2124 static const struct Qdisc_class_ops htb_class_ops = {
2125 	.select_queue	=	htb_select_queue,
2126 	.graft		=	htb_graft,
2127 	.leaf		=	htb_leaf,
2128 	.qlen_notify	=	htb_qlen_notify,
2129 	.find		=	htb_search,
2130 	.change		=	htb_change_class,
2131 	.delete		=	htb_delete,
2132 	.walk		=	htb_walk,
2133 	.tcf_block	=	htb_tcf_block,
2134 	.bind_tcf	=	htb_bind_filter,
2135 	.unbind_tcf	=	htb_unbind_filter,
2136 	.dump		=	htb_dump_class,
2137 	.dump_stats	=	htb_dump_class_stats,
2138 };
2139 
2140 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
2141 	.cl_ops		=	&htb_class_ops,
2142 	.id		=	"htb",
2143 	.priv_size	=	sizeof(struct htb_sched),
2144 	.enqueue	=	htb_enqueue,
2145 	.dequeue	=	htb_dequeue,
2146 	.peek		=	qdisc_peek_dequeued,
2147 	.init		=	htb_init,
2148 	.attach		=	htb_attach,
2149 	.reset		=	htb_reset,
2150 	.destroy	=	htb_destroy,
2151 	.dump		=	htb_dump,
2152 	.owner		=	THIS_MODULE,
2153 };
2154 MODULE_ALIAS_NET_SCH("htb");
2155 
2156 static int __init htb_module_init(void)
2157 {
2158 	return register_qdisc(&htb_qdisc_ops);
2159 }
2160 static void __exit htb_module_exit(void)
2161 {
2162 	unregister_qdisc(&htb_qdisc_ops);
2163 }
2164 
2165 module_init(htb_module_init)
2166 module_exit(htb_module_exit)
2167 MODULE_LICENSE("GPL");
2168 MODULE_DESCRIPTION("Hierarchical Token Bucket scheduler");
2169