xref: /linux/net/sched/sch_htb.c (revision e0c0ab04f6785abaa71b9b8dc252cb1a2072c225)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
4  *
5  * Authors:	Martin Devera, <devik@cdi.cz>
6  *
7  * Credits (in time order) for older HTB versions:
8  *              Stef Coene <stef.coene@docum.org>
9  *			HTB support at LARTC mailing list
10  *		Ondrej Kraus, <krauso@barr.cz>
11  *			found missing INIT_QDISC(htb)
12  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
13  *			helped a lot to locate nasty class stall bug
14  *		Andi Kleen, Jamal Hadi, Bert Hubert
15  *			code review and helpful comments on shaping
16  *		Tomasz Wrona, <tw@eter.tym.pl>
17  *			created test case so that I was able to fix nasty bug
18  *		Wilfried Weissmann
19  *			spotted bug in dequeue code and helped with fix
20  *		Jiri Fojtasek
21  *			fixed requeue routine
22  *		and many others. thanks.
23  */
24 #include <linux/module.h>
25 #include <linux/moduleparam.h>
26 #include <linux/types.h>
27 #include <linux/kernel.h>
28 #include <linux/string.h>
29 #include <linux/errno.h>
30 #include <linux/skbuff.h>
31 #include <linux/list.h>
32 #include <linux/compiler.h>
33 #include <linux/rbtree.h>
34 #include <linux/workqueue.h>
35 #include <linux/slab.h>
36 #include <net/netlink.h>
37 #include <net/sch_generic.h>
38 #include <net/pkt_sched.h>
39 #include <net/pkt_cls.h>
40 
41 /* HTB algorithm.
42     Author: devik@cdi.cz
43     ========================================================================
44     HTB is like TBF with multiple classes. It is also similar to CBQ because
45     it allows to assign priority to each class in hierarchy.
46     In fact it is another implementation of Floyd's formal sharing.
47 
48     Levels:
49     Each class is assigned level. Leaf has ALWAYS level 0 and root
50     classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
51     one less than their parent.
52 */
53 
54 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
55 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
56 
57 #if HTB_VER >> 16 != TC_HTB_PROTOVER
58 #error "Mismatched sch_htb.c and pkt_sch.h"
59 #endif
60 
61 /* Module parameter and sysfs export */
62 module_param    (htb_hysteresis, int, 0640);
63 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
64 
65 static int htb_rate_est = 0; /* htb classes have a default rate estimator */
66 module_param(htb_rate_est, int, 0640);
67 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
68 
69 /* used internaly to keep status of single class */
70 enum htb_cmode {
71 	HTB_CANT_SEND,		/* class can't send and can't borrow */
72 	HTB_MAY_BORROW,		/* class can't send but may borrow */
73 	HTB_CAN_SEND		/* class can send */
74 };
75 
76 struct htb_prio {
77 	union {
78 		struct rb_root	row;
79 		struct rb_root	feed;
80 	};
81 	struct rb_node	*ptr;
82 	/* When class changes from state 1->2 and disconnects from
83 	 * parent's feed then we lost ptr value and start from the
84 	 * first child again. Here we store classid of the
85 	 * last valid ptr (used when ptr is NULL).
86 	 */
87 	u32		last_ptr_id;
88 };
89 
90 /* interior & leaf nodes; props specific to leaves are marked L:
91  * To reduce false sharing, place mostly read fields at beginning,
92  * and mostly written ones at the end.
93  */
94 struct htb_class {
95 	struct Qdisc_class_common common;
96 	struct psched_ratecfg	rate;
97 	struct psched_ratecfg	ceil;
98 	s64			buffer, cbuffer;/* token bucket depth/rate */
99 	s64			mbuffer;	/* max wait time */
100 	u32			prio;		/* these two are used only by leaves... */
101 	int			quantum;	/* but stored for parent-to-leaf return */
102 
103 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
104 	struct tcf_block	*block;
105 
106 	int			level;		/* our level (see above) */
107 	unsigned int		children;
108 	struct htb_class	*parent;	/* parent class */
109 
110 	struct net_rate_estimator __rcu *rate_est;
111 
112 	/*
113 	 * Written often fields
114 	 */
115 	struct gnet_stats_basic_sync bstats;
116 	struct gnet_stats_basic_sync bstats_bias;
117 	struct tc_htb_xstats	xstats;	/* our special stats */
118 
119 	/* token bucket parameters */
120 	s64			tokens, ctokens;/* current number of tokens */
121 	s64			t_c;		/* checkpoint time */
122 
123 	union {
124 		struct htb_class_leaf {
125 			int		deficit[TC_HTB_MAXDEPTH];
126 			struct Qdisc	*q;
127 			struct netdev_queue *offload_queue;
128 		} leaf;
129 		struct htb_class_inner {
130 			struct htb_prio clprio[TC_HTB_NUMPRIO];
131 		} inner;
132 	};
133 	s64			pq_key;
134 
135 	int			prio_activity;	/* for which prios are we active */
136 	enum htb_cmode		cmode;		/* current mode of the class */
137 	struct rb_node		pq_node;	/* node for event queue */
138 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
139 
140 	unsigned int drops ____cacheline_aligned_in_smp;
141 	unsigned int		overlimits;
142 };
143 
144 struct htb_level {
145 	struct rb_root	wait_pq;
146 	struct htb_prio hprio[TC_HTB_NUMPRIO];
147 };
148 
149 struct htb_sched {
150 	struct Qdisc_class_hash clhash;
151 	int			defcls;		/* class where unclassified flows go to */
152 	int			rate2quantum;	/* quant = rate / rate2quantum */
153 
154 	/* filters for qdisc itself */
155 	struct tcf_proto __rcu	*filter_list;
156 	struct tcf_block	*block;
157 
158 #define HTB_WARN_TOOMANYEVENTS	0x1
159 	unsigned int		warned;	/* only one warning */
160 	int			direct_qlen;
161 	struct work_struct	work;
162 
163 	/* non shaped skbs; let them go directly thru */
164 	struct qdisc_skb_head	direct_queue;
165 	u32			direct_pkts;
166 	u32			overlimits;
167 
168 	struct qdisc_watchdog	watchdog;
169 
170 	s64			now;	/* cached dequeue time */
171 
172 	/* time of nearest event per level (row) */
173 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
174 
175 	int			row_mask[TC_HTB_MAXDEPTH];
176 
177 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
178 
179 	struct Qdisc		**direct_qdiscs;
180 	unsigned int            num_direct_qdiscs;
181 
182 	bool			offload;
183 };
184 
185 /* find class in global hash table using given handle */
186 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
187 {
188 	struct htb_sched *q = qdisc_priv(sch);
189 	struct Qdisc_class_common *clc;
190 
191 	clc = qdisc_class_find(&q->clhash, handle);
192 	if (clc == NULL)
193 		return NULL;
194 	return container_of(clc, struct htb_class, common);
195 }
196 
197 static unsigned long htb_search(struct Qdisc *sch, u32 handle)
198 {
199 	return (unsigned long)htb_find(handle, sch);
200 }
201 
202 #define HTB_DIRECT ((struct htb_class *)-1L)
203 
204 /**
205  * htb_classify - classify a packet into class
206  * @skb: the socket buffer
207  * @sch: the active queue discipline
208  * @qerr: pointer for returned status code
209  *
210  * It returns NULL if the packet should be dropped or -1 if the packet
211  * should be passed directly thru. In all other cases leaf class is returned.
212  * We allow direct class selection by classid in priority. The we examine
213  * filters in qdisc and in inner nodes (if higher filter points to the inner
214  * node). If we end up with classid MAJOR:0 we enqueue the skb into special
215  * internal fifo (direct). These packets then go directly thru. If we still
216  * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
217  * then finish and return direct queue.
218  */
219 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
220 				      int *qerr)
221 {
222 	struct htb_sched *q = qdisc_priv(sch);
223 	struct htb_class *cl;
224 	struct tcf_result res;
225 	struct tcf_proto *tcf;
226 	int result;
227 
228 	/* allow to select class by setting skb->priority to valid classid;
229 	 * note that nfmark can be used too by attaching filter fw with no
230 	 * rules in it
231 	 */
232 	if (skb->priority == sch->handle)
233 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
234 	cl = htb_find(skb->priority, sch);
235 	if (cl) {
236 		if (cl->level == 0)
237 			return cl;
238 		/* Start with inner filter chain if a non-leaf class is selected */
239 		tcf = rcu_dereference_bh(cl->filter_list);
240 	} else {
241 		tcf = rcu_dereference_bh(q->filter_list);
242 	}
243 
244 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
245 	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
246 #ifdef CONFIG_NET_CLS_ACT
247 		switch (result) {
248 		case TC_ACT_QUEUED:
249 		case TC_ACT_STOLEN:
250 		case TC_ACT_TRAP:
251 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
252 			fallthrough;
253 		case TC_ACT_SHOT:
254 			return NULL;
255 		}
256 #endif
257 		cl = (void *)res.class;
258 		if (!cl) {
259 			if (res.classid == sch->handle)
260 				return HTB_DIRECT;	/* X:0 (direct flow) */
261 			cl = htb_find(res.classid, sch);
262 			if (!cl)
263 				break;	/* filter selected invalid classid */
264 		}
265 		if (!cl->level)
266 			return cl;	/* we hit leaf; return it */
267 
268 		/* we have got inner class; apply inner filter chain */
269 		tcf = rcu_dereference_bh(cl->filter_list);
270 	}
271 	/* classification failed; try to use default class */
272 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
273 	if (!cl || cl->level)
274 		return HTB_DIRECT;	/* bad default .. this is safe bet */
275 	return cl;
276 }
277 
278 /**
279  * htb_add_to_id_tree - adds class to the round robin list
280  * @root: the root of the tree
281  * @cl: the class to add
282  * @prio: the give prio in class
283  *
284  * Routine adds class to the list (actually tree) sorted by classid.
285  * Make sure that class is not already on such list for given prio.
286  */
287 static void htb_add_to_id_tree(struct rb_root *root,
288 			       struct htb_class *cl, int prio)
289 {
290 	struct rb_node **p = &root->rb_node, *parent = NULL;
291 
292 	while (*p) {
293 		struct htb_class *c;
294 		parent = *p;
295 		c = rb_entry(parent, struct htb_class, node[prio]);
296 
297 		if (cl->common.classid > c->common.classid)
298 			p = &parent->rb_right;
299 		else
300 			p = &parent->rb_left;
301 	}
302 	rb_link_node(&cl->node[prio], parent, p);
303 	rb_insert_color(&cl->node[prio], root);
304 }
305 
306 /**
307  * htb_add_to_wait_tree - adds class to the event queue with delay
308  * @q: the priority event queue
309  * @cl: the class to add
310  * @delay: delay in microseconds
311  *
312  * The class is added to priority event queue to indicate that class will
313  * change its mode in cl->pq_key microseconds. Make sure that class is not
314  * already in the queue.
315  */
316 static void htb_add_to_wait_tree(struct htb_sched *q,
317 				 struct htb_class *cl, s64 delay)
318 {
319 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
320 
321 	cl->pq_key = q->now + delay;
322 	if (cl->pq_key == q->now)
323 		cl->pq_key++;
324 
325 	/* update the nearest event cache */
326 	if (q->near_ev_cache[cl->level] > cl->pq_key)
327 		q->near_ev_cache[cl->level] = cl->pq_key;
328 
329 	while (*p) {
330 		struct htb_class *c;
331 		parent = *p;
332 		c = rb_entry(parent, struct htb_class, pq_node);
333 		if (cl->pq_key >= c->pq_key)
334 			p = &parent->rb_right;
335 		else
336 			p = &parent->rb_left;
337 	}
338 	rb_link_node(&cl->pq_node, parent, p);
339 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
340 }
341 
342 /**
343  * htb_next_rb_node - finds next node in binary tree
344  * @n: the current node in binary tree
345  *
346  * When we are past last key we return NULL.
347  * Average complexity is 2 steps per call.
348  */
349 static inline void htb_next_rb_node(struct rb_node **n)
350 {
351 	if (*n)
352 		*n = rb_next(*n);
353 }
354 
355 /**
356  * htb_add_class_to_row - add class to its row
357  * @q: the priority event queue
358  * @cl: the class to add
359  * @mask: the given priorities in class in bitmap
360  *
361  * The class is added to row at priorities marked in mask.
362  * It does nothing if mask == 0.
363  */
364 static inline void htb_add_class_to_row(struct htb_sched *q,
365 					struct htb_class *cl, int mask)
366 {
367 	q->row_mask[cl->level] |= mask;
368 	while (mask) {
369 		int prio = ffz(~mask);
370 		mask &= ~(1 << prio);
371 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
372 	}
373 }
374 
375 /* If this triggers, it is a bug in this code, but it need not be fatal */
376 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
377 {
378 	if (RB_EMPTY_NODE(rb)) {
379 		WARN_ON(1);
380 	} else {
381 		rb_erase(rb, root);
382 		RB_CLEAR_NODE(rb);
383 	}
384 }
385 
386 
387 /**
388  * htb_remove_class_from_row - removes class from its row
389  * @q: the priority event queue
390  * @cl: the class to add
391  * @mask: the given priorities in class in bitmap
392  *
393  * The class is removed from row at priorities marked in mask.
394  * It does nothing if mask == 0.
395  */
396 static inline void htb_remove_class_from_row(struct htb_sched *q,
397 						 struct htb_class *cl, int mask)
398 {
399 	int m = 0;
400 	struct htb_level *hlevel = &q->hlevel[cl->level];
401 
402 	while (mask) {
403 		int prio = ffz(~mask);
404 		struct htb_prio *hprio = &hlevel->hprio[prio];
405 
406 		mask &= ~(1 << prio);
407 		if (hprio->ptr == cl->node + prio)
408 			htb_next_rb_node(&hprio->ptr);
409 
410 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
411 		if (!hprio->row.rb_node)
412 			m |= 1 << prio;
413 	}
414 	q->row_mask[cl->level] &= ~m;
415 }
416 
417 /**
418  * htb_activate_prios - creates active classe's feed chain
419  * @q: the priority event queue
420  * @cl: the class to activate
421  *
422  * The class is connected to ancestors and/or appropriate rows
423  * for priorities it is participating on. cl->cmode must be new
424  * (activated) mode. It does nothing if cl->prio_activity == 0.
425  */
426 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
427 {
428 	struct htb_class *p = cl->parent;
429 	long m, mask = cl->prio_activity;
430 
431 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
432 		m = mask;
433 		while (m) {
434 			unsigned int prio = ffz(~m);
435 
436 			if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio)))
437 				break;
438 			m &= ~(1 << prio);
439 
440 			if (p->inner.clprio[prio].feed.rb_node)
441 				/* parent already has its feed in use so that
442 				 * reset bit in mask as parent is already ok
443 				 */
444 				mask &= ~(1 << prio);
445 
446 			htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio);
447 		}
448 		p->prio_activity |= mask;
449 		cl = p;
450 		p = cl->parent;
451 
452 	}
453 	if (cl->cmode == HTB_CAN_SEND && mask)
454 		htb_add_class_to_row(q, cl, mask);
455 }
456 
457 /**
458  * htb_deactivate_prios - remove class from feed chain
459  * @q: the priority event queue
460  * @cl: the class to deactivate
461  *
462  * cl->cmode must represent old mode (before deactivation). It does
463  * nothing if cl->prio_activity == 0. Class is removed from all feed
464  * chains and rows.
465  */
466 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
467 {
468 	struct htb_class *p = cl->parent;
469 	long m, mask = cl->prio_activity;
470 
471 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
472 		m = mask;
473 		mask = 0;
474 		while (m) {
475 			int prio = ffz(~m);
476 			m &= ~(1 << prio);
477 
478 			if (p->inner.clprio[prio].ptr == cl->node + prio) {
479 				/* we are removing child which is pointed to from
480 				 * parent feed - forget the pointer but remember
481 				 * classid
482 				 */
483 				p->inner.clprio[prio].last_ptr_id = cl->common.classid;
484 				p->inner.clprio[prio].ptr = NULL;
485 			}
486 
487 			htb_safe_rb_erase(cl->node + prio,
488 					  &p->inner.clprio[prio].feed);
489 
490 			if (!p->inner.clprio[prio].feed.rb_node)
491 				mask |= 1 << prio;
492 		}
493 
494 		p->prio_activity &= ~mask;
495 		cl = p;
496 		p = cl->parent;
497 
498 	}
499 	if (cl->cmode == HTB_CAN_SEND && mask)
500 		htb_remove_class_from_row(q, cl, mask);
501 }
502 
503 static inline s64 htb_lowater(const struct htb_class *cl)
504 {
505 	if (htb_hysteresis)
506 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
507 	else
508 		return 0;
509 }
510 static inline s64 htb_hiwater(const struct htb_class *cl)
511 {
512 	if (htb_hysteresis)
513 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
514 	else
515 		return 0;
516 }
517 
518 
519 /**
520  * htb_class_mode - computes and returns current class mode
521  * @cl: the target class
522  * @diff: diff time in microseconds
523  *
524  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
525  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
526  * from now to time when cl will change its state.
527  * Also it is worth to note that class mode doesn't change simply
528  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
529  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
530  * mode transitions per time unit. The speed gain is about 1/6.
531  */
532 static inline enum htb_cmode
533 htb_class_mode(struct htb_class *cl, s64 *diff)
534 {
535 	s64 toks;
536 
537 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
538 		*diff = -toks;
539 		return HTB_CANT_SEND;
540 	}
541 
542 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
543 		return HTB_CAN_SEND;
544 
545 	*diff = -toks;
546 	return HTB_MAY_BORROW;
547 }
548 
549 /**
550  * htb_change_class_mode - changes classe's mode
551  * @q: the priority event queue
552  * @cl: the target class
553  * @diff: diff time in microseconds
554  *
555  * This should be the only way how to change classe's mode under normal
556  * circumstances. Routine will update feed lists linkage, change mode
557  * and add class to the wait event queue if appropriate. New mode should
558  * be different from old one and cl->pq_key has to be valid if changing
559  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
560  */
561 static void
562 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
563 {
564 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
565 
566 	if (new_mode == cl->cmode)
567 		return;
568 
569 	if (new_mode == HTB_CANT_SEND) {
570 		cl->overlimits++;
571 		q->overlimits++;
572 	}
573 
574 	if (cl->prio_activity) {	/* not necessary: speed optimization */
575 		if (cl->cmode != HTB_CANT_SEND)
576 			htb_deactivate_prios(q, cl);
577 		cl->cmode = new_mode;
578 		if (new_mode != HTB_CANT_SEND)
579 			htb_activate_prios(q, cl);
580 	} else
581 		cl->cmode = new_mode;
582 }
583 
584 /**
585  * htb_activate - inserts leaf cl into appropriate active feeds
586  * @q: the priority event queue
587  * @cl: the target class
588  *
589  * Routine learns (new) priority of leaf and activates feed chain
590  * for the prio. It can be called on already active leaf safely.
591  * It also adds leaf into droplist.
592  */
593 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
594 {
595 	WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen);
596 
597 	if (!cl->prio_activity) {
598 		cl->prio_activity = 1 << cl->prio;
599 		htb_activate_prios(q, cl);
600 	}
601 }
602 
603 /**
604  * htb_deactivate - remove leaf cl from active feeds
605  * @q: the priority event queue
606  * @cl: the target class
607  *
608  * Make sure that leaf is active. In the other words it can't be called
609  * with non-active leaf. It also removes class from the drop list.
610  */
611 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
612 {
613 	if (!cl->prio_activity)
614 		return;
615 	htb_deactivate_prios(q, cl);
616 	cl->prio_activity = 0;
617 }
618 
619 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
620 		       struct sk_buff **to_free)
621 {
622 	int ret;
623 	unsigned int len = qdisc_pkt_len(skb);
624 	struct htb_sched *q = qdisc_priv(sch);
625 	struct htb_class *cl = htb_classify(skb, sch, &ret);
626 
627 	if (cl == HTB_DIRECT) {
628 		/* enqueue to helper queue */
629 		if (q->direct_queue.qlen < q->direct_qlen) {
630 			__qdisc_enqueue_tail(skb, &q->direct_queue);
631 			q->direct_pkts++;
632 		} else {
633 			return qdisc_drop(skb, sch, to_free);
634 		}
635 #ifdef CONFIG_NET_CLS_ACT
636 	} else if (!cl) {
637 		if (ret & __NET_XMIT_BYPASS)
638 			qdisc_qstats_drop(sch);
639 		__qdisc_drop(skb, to_free);
640 		return ret;
641 #endif
642 	} else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
643 					to_free)) != NET_XMIT_SUCCESS) {
644 		if (net_xmit_drop_count(ret)) {
645 			qdisc_qstats_drop(sch);
646 			cl->drops++;
647 		}
648 		return ret;
649 	} else {
650 		htb_activate(q, cl);
651 	}
652 
653 	sch->qstats.backlog += len;
654 	sch->q.qlen++;
655 	return NET_XMIT_SUCCESS;
656 }
657 
658 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
659 {
660 	s64 toks = diff + cl->tokens;
661 
662 	if (toks > cl->buffer)
663 		toks = cl->buffer;
664 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
665 	if (toks <= -cl->mbuffer)
666 		toks = 1 - cl->mbuffer;
667 
668 	cl->tokens = toks;
669 }
670 
671 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
672 {
673 	s64 toks = diff + cl->ctokens;
674 
675 	if (toks > cl->cbuffer)
676 		toks = cl->cbuffer;
677 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
678 	if (toks <= -cl->mbuffer)
679 		toks = 1 - cl->mbuffer;
680 
681 	cl->ctokens = toks;
682 }
683 
684 /**
685  * htb_charge_class - charges amount "bytes" to leaf and ancestors
686  * @q: the priority event queue
687  * @cl: the class to start iterate
688  * @level: the minimum level to account
689  * @skb: the socket buffer
690  *
691  * Routine assumes that packet "bytes" long was dequeued from leaf cl
692  * borrowing from "level". It accounts bytes to ceil leaky bucket for
693  * leaf and all ancestors and to rate bucket for ancestors at levels
694  * "level" and higher. It also handles possible change of mode resulting
695  * from the update. Note that mode can also increase here (MAY_BORROW to
696  * CAN_SEND) because we can use more precise clock that event queue here.
697  * In such case we remove class from event queue first.
698  */
699 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
700 			     int level, struct sk_buff *skb)
701 {
702 	int bytes = qdisc_pkt_len(skb);
703 	enum htb_cmode old_mode;
704 	s64 diff;
705 
706 	while (cl) {
707 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
708 		if (cl->level >= level) {
709 			if (cl->level == level)
710 				cl->xstats.lends++;
711 			htb_accnt_tokens(cl, bytes, diff);
712 		} else {
713 			cl->xstats.borrows++;
714 			cl->tokens += diff;	/* we moved t_c; update tokens */
715 		}
716 		htb_accnt_ctokens(cl, bytes, diff);
717 		cl->t_c = q->now;
718 
719 		old_mode = cl->cmode;
720 		diff = 0;
721 		htb_change_class_mode(q, cl, &diff);
722 		if (old_mode != cl->cmode) {
723 			if (old_mode != HTB_CAN_SEND)
724 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
725 			if (cl->cmode != HTB_CAN_SEND)
726 				htb_add_to_wait_tree(q, cl, diff);
727 		}
728 
729 		/* update basic stats except for leaves which are already updated */
730 		if (cl->level)
731 			bstats_update(&cl->bstats, skb);
732 
733 		cl = cl->parent;
734 	}
735 }
736 
737 /**
738  * htb_do_events - make mode changes to classes at the level
739  * @q: the priority event queue
740  * @level: which wait_pq in 'q->hlevel'
741  * @start: start jiffies
742  *
743  * Scans event queue for pending events and applies them. Returns time of
744  * next pending event (0 for no event in pq, q->now for too many events).
745  * Note: Applied are events whose have cl->pq_key <= q->now.
746  */
747 static s64 htb_do_events(struct htb_sched *q, const int level,
748 			 unsigned long start)
749 {
750 	/* don't run for longer than 2 jiffies; 2 is used instead of
751 	 * 1 to simplify things when jiffy is going to be incremented
752 	 * too soon
753 	 */
754 	unsigned long stop_at = start + 2;
755 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
756 
757 	while (time_before(jiffies, stop_at)) {
758 		struct htb_class *cl;
759 		s64 diff;
760 		struct rb_node *p = rb_first(wait_pq);
761 
762 		if (!p)
763 			return 0;
764 
765 		cl = rb_entry(p, struct htb_class, pq_node);
766 		if (cl->pq_key > q->now)
767 			return cl->pq_key;
768 
769 		htb_safe_rb_erase(p, wait_pq);
770 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
771 		htb_change_class_mode(q, cl, &diff);
772 		if (cl->cmode != HTB_CAN_SEND)
773 			htb_add_to_wait_tree(q, cl, diff);
774 	}
775 
776 	/* too much load - let's continue after a break for scheduling */
777 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
778 		pr_warn("htb: too many events!\n");
779 		q->warned |= HTB_WARN_TOOMANYEVENTS;
780 	}
781 
782 	return q->now;
783 }
784 
785 /* Returns class->node+prio from id-tree where classe's id is >= id. NULL
786  * is no such one exists.
787  */
788 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
789 					      u32 id)
790 {
791 	struct rb_node *r = NULL;
792 	while (n) {
793 		struct htb_class *cl =
794 		    rb_entry(n, struct htb_class, node[prio]);
795 
796 		if (id > cl->common.classid) {
797 			n = n->rb_right;
798 		} else if (id < cl->common.classid) {
799 			r = n;
800 			n = n->rb_left;
801 		} else {
802 			return n;
803 		}
804 	}
805 	return r;
806 }
807 
808 /**
809  * htb_lookup_leaf - returns next leaf class in DRR order
810  * @hprio: the current one
811  * @prio: which prio in class
812  *
813  * Find leaf where current feed pointers points to.
814  */
815 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
816 {
817 	int i;
818 	struct {
819 		struct rb_node *root;
820 		struct rb_node **pptr;
821 		u32 *pid;
822 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
823 
824 	BUG_ON(!hprio->row.rb_node);
825 	sp->root = hprio->row.rb_node;
826 	sp->pptr = &hprio->ptr;
827 	sp->pid = &hprio->last_ptr_id;
828 
829 	for (i = 0; i < 65535; i++) {
830 		if (!*sp->pptr && *sp->pid) {
831 			/* ptr was invalidated but id is valid - try to recover
832 			 * the original or next ptr
833 			 */
834 			*sp->pptr =
835 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
836 		}
837 		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
838 				 * can become out of date quickly
839 				 */
840 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
841 			*sp->pptr = sp->root;
842 			while ((*sp->pptr)->rb_left)
843 				*sp->pptr = (*sp->pptr)->rb_left;
844 			if (sp > stk) {
845 				sp--;
846 				if (!*sp->pptr) {
847 					WARN_ON(1);
848 					return NULL;
849 				}
850 				htb_next_rb_node(sp->pptr);
851 			}
852 		} else {
853 			struct htb_class *cl;
854 			struct htb_prio *clp;
855 
856 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
857 			if (!cl->level)
858 				return cl;
859 			clp = &cl->inner.clprio[prio];
860 			(++sp)->root = clp->feed.rb_node;
861 			sp->pptr = &clp->ptr;
862 			sp->pid = &clp->last_ptr_id;
863 		}
864 	}
865 	WARN_ON(1);
866 	return NULL;
867 }
868 
869 /* dequeues packet at given priority and level; call only if
870  * you are sure that there is active class at prio/level
871  */
872 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
873 					const int level)
874 {
875 	struct sk_buff *skb = NULL;
876 	struct htb_class *cl, *start;
877 	struct htb_level *hlevel = &q->hlevel[level];
878 	struct htb_prio *hprio = &hlevel->hprio[prio];
879 
880 	/* look initial class up in the row */
881 	start = cl = htb_lookup_leaf(hprio, prio);
882 
883 	do {
884 next:
885 		if (unlikely(!cl))
886 			return NULL;
887 
888 		/* class can be empty - it is unlikely but can be true if leaf
889 		 * qdisc drops packets in enqueue routine or if someone used
890 		 * graft operation on the leaf since last dequeue;
891 		 * simply deactivate and skip such class
892 		 */
893 		if (unlikely(cl->leaf.q->q.qlen == 0)) {
894 			struct htb_class *next;
895 			htb_deactivate(q, cl);
896 
897 			/* row/level might become empty */
898 			if ((q->row_mask[level] & (1 << prio)) == 0)
899 				return NULL;
900 
901 			next = htb_lookup_leaf(hprio, prio);
902 
903 			if (cl == start)	/* fix start if we just deleted it */
904 				start = next;
905 			cl = next;
906 			goto next;
907 		}
908 
909 		skb = cl->leaf.q->dequeue(cl->leaf.q);
910 		if (likely(skb != NULL))
911 			break;
912 
913 		qdisc_warn_nonwc("htb", cl->leaf.q);
914 		htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr:
915 					 &q->hlevel[0].hprio[prio].ptr);
916 		cl = htb_lookup_leaf(hprio, prio);
917 
918 	} while (cl != start);
919 
920 	if (likely(skb != NULL)) {
921 		bstats_update(&cl->bstats, skb);
922 		cl->leaf.deficit[level] -= qdisc_pkt_len(skb);
923 		if (cl->leaf.deficit[level] < 0) {
924 			cl->leaf.deficit[level] += cl->quantum;
925 			htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr :
926 						 &q->hlevel[0].hprio[prio].ptr);
927 		}
928 		/* this used to be after charge_class but this constelation
929 		 * gives us slightly better performance
930 		 */
931 		if (!cl->leaf.q->q.qlen)
932 			htb_deactivate(q, cl);
933 		htb_charge_class(q, cl, level, skb);
934 	}
935 	return skb;
936 }
937 
938 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
939 {
940 	struct sk_buff *skb;
941 	struct htb_sched *q = qdisc_priv(sch);
942 	int level;
943 	s64 next_event;
944 	unsigned long start_at;
945 
946 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
947 	skb = __qdisc_dequeue_head(&q->direct_queue);
948 	if (skb != NULL) {
949 ok:
950 		qdisc_bstats_update(sch, skb);
951 		qdisc_qstats_backlog_dec(sch, skb);
952 		sch->q.qlen--;
953 		return skb;
954 	}
955 
956 	if (!sch->q.qlen)
957 		goto fin;
958 	q->now = ktime_get_ns();
959 	start_at = jiffies;
960 
961 	next_event = q->now + 5LLU * NSEC_PER_SEC;
962 
963 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
964 		/* common case optimization - skip event handler quickly */
965 		int m;
966 		s64 event = q->near_ev_cache[level];
967 
968 		if (q->now >= event) {
969 			event = htb_do_events(q, level, start_at);
970 			if (!event)
971 				event = q->now + NSEC_PER_SEC;
972 			q->near_ev_cache[level] = event;
973 		}
974 
975 		if (next_event > event)
976 			next_event = event;
977 
978 		m = ~q->row_mask[level];
979 		while (m != (int)(-1)) {
980 			int prio = ffz(m);
981 
982 			m |= 1 << prio;
983 			skb = htb_dequeue_tree(q, prio, level);
984 			if (likely(skb != NULL))
985 				goto ok;
986 		}
987 	}
988 	if (likely(next_event > q->now))
989 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
990 	else
991 		schedule_work(&q->work);
992 fin:
993 	return skb;
994 }
995 
996 /* reset all classes */
997 /* always caled under BH & queue lock */
998 static void htb_reset(struct Qdisc *sch)
999 {
1000 	struct htb_sched *q = qdisc_priv(sch);
1001 	struct htb_class *cl;
1002 	unsigned int i;
1003 
1004 	for (i = 0; i < q->clhash.hashsize; i++) {
1005 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1006 			if (cl->level)
1007 				memset(&cl->inner, 0, sizeof(cl->inner));
1008 			else {
1009 				if (cl->leaf.q && !q->offload)
1010 					qdisc_reset(cl->leaf.q);
1011 			}
1012 			cl->prio_activity = 0;
1013 			cl->cmode = HTB_CAN_SEND;
1014 		}
1015 	}
1016 	qdisc_watchdog_cancel(&q->watchdog);
1017 	__qdisc_reset_queue(&q->direct_queue);
1018 	memset(q->hlevel, 0, sizeof(q->hlevel));
1019 	memset(q->row_mask, 0, sizeof(q->row_mask));
1020 }
1021 
1022 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1023 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1024 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1025 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1026 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1027 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1028 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
1029 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
1030 	[TCA_HTB_OFFLOAD] = { .type = NLA_FLAG },
1031 };
1032 
1033 static void htb_work_func(struct work_struct *work)
1034 {
1035 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1036 	struct Qdisc *sch = q->watchdog.qdisc;
1037 
1038 	rcu_read_lock();
1039 	__netif_schedule(qdisc_root(sch));
1040 	rcu_read_unlock();
1041 }
1042 
1043 static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt)
1044 {
1045 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt);
1046 }
1047 
1048 static int htb_init(struct Qdisc *sch, struct nlattr *opt,
1049 		    struct netlink_ext_ack *extack)
1050 {
1051 	struct net_device *dev = qdisc_dev(sch);
1052 	struct tc_htb_qopt_offload offload_opt;
1053 	struct htb_sched *q = qdisc_priv(sch);
1054 	struct nlattr *tb[TCA_HTB_MAX + 1];
1055 	struct tc_htb_glob *gopt;
1056 	unsigned int ntx;
1057 	bool offload;
1058 	int err;
1059 
1060 	qdisc_watchdog_init(&q->watchdog, sch);
1061 	INIT_WORK(&q->work, htb_work_func);
1062 
1063 	if (!opt)
1064 		return -EINVAL;
1065 
1066 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
1067 	if (err)
1068 		return err;
1069 
1070 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1071 					  NULL);
1072 	if (err < 0)
1073 		return err;
1074 
1075 	if (!tb[TCA_HTB_INIT])
1076 		return -EINVAL;
1077 
1078 	gopt = nla_data(tb[TCA_HTB_INIT]);
1079 	if (gopt->version != HTB_VER >> 16)
1080 		return -EINVAL;
1081 
1082 	offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
1083 
1084 	if (offload) {
1085 		if (sch->parent != TC_H_ROOT) {
1086 			NL_SET_ERR_MSG(extack, "HTB must be the root qdisc to use offload");
1087 			return -EOPNOTSUPP;
1088 		}
1089 
1090 		if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) {
1091 			NL_SET_ERR_MSG(extack, "hw-tc-offload ethtool feature flag must be on");
1092 			return -EOPNOTSUPP;
1093 		}
1094 
1095 		q->num_direct_qdiscs = dev->real_num_tx_queues;
1096 		q->direct_qdiscs = kcalloc(q->num_direct_qdiscs,
1097 					   sizeof(*q->direct_qdiscs),
1098 					   GFP_KERNEL);
1099 		if (!q->direct_qdiscs)
1100 			return -ENOMEM;
1101 	}
1102 
1103 	err = qdisc_class_hash_init(&q->clhash);
1104 	if (err < 0)
1105 		return err;
1106 
1107 	if (tb[TCA_HTB_DIRECT_QLEN])
1108 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1109 	else
1110 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1111 
1112 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1113 		q->rate2quantum = 1;
1114 	q->defcls = gopt->defcls;
1115 
1116 	if (!offload)
1117 		return 0;
1118 
1119 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1120 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1121 		struct Qdisc *qdisc;
1122 
1123 		qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1124 					  TC_H_MAKE(sch->handle, 0), extack);
1125 		if (!qdisc) {
1126 			return -ENOMEM;
1127 		}
1128 
1129 		q->direct_qdiscs[ntx] = qdisc;
1130 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1131 	}
1132 
1133 	sch->flags |= TCQ_F_MQROOT;
1134 
1135 	offload_opt = (struct tc_htb_qopt_offload) {
1136 		.command = TC_HTB_CREATE,
1137 		.parent_classid = TC_H_MAJ(sch->handle) >> 16,
1138 		.classid = TC_H_MIN(q->defcls),
1139 		.extack = extack,
1140 	};
1141 	err = htb_offload(dev, &offload_opt);
1142 	if (err)
1143 		return err;
1144 
1145 	/* Defer this assignment, so that htb_destroy skips offload-related
1146 	 * parts (especially calling ndo_setup_tc) on errors.
1147 	 */
1148 	q->offload = true;
1149 
1150 	return 0;
1151 }
1152 
1153 static void htb_attach_offload(struct Qdisc *sch)
1154 {
1155 	struct net_device *dev = qdisc_dev(sch);
1156 	struct htb_sched *q = qdisc_priv(sch);
1157 	unsigned int ntx;
1158 
1159 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1160 		struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx];
1161 
1162 		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
1163 		qdisc_put(old);
1164 		qdisc_hash_add(qdisc, false);
1165 	}
1166 	for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) {
1167 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1168 		struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL);
1169 
1170 		qdisc_put(old);
1171 	}
1172 
1173 	kfree(q->direct_qdiscs);
1174 	q->direct_qdiscs = NULL;
1175 }
1176 
1177 static void htb_attach_software(struct Qdisc *sch)
1178 {
1179 	struct net_device *dev = qdisc_dev(sch);
1180 	unsigned int ntx;
1181 
1182 	/* Resemble qdisc_graft behavior. */
1183 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
1184 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1185 		struct Qdisc *old = dev_graft_qdisc(dev_queue, sch);
1186 
1187 		qdisc_refcount_inc(sch);
1188 
1189 		qdisc_put(old);
1190 	}
1191 }
1192 
1193 static void htb_attach(struct Qdisc *sch)
1194 {
1195 	struct htb_sched *q = qdisc_priv(sch);
1196 
1197 	if (q->offload)
1198 		htb_attach_offload(sch);
1199 	else
1200 		htb_attach_software(sch);
1201 }
1202 
1203 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1204 {
1205 	struct htb_sched *q = qdisc_priv(sch);
1206 	struct nlattr *nest;
1207 	struct tc_htb_glob gopt;
1208 
1209 	if (q->offload)
1210 		sch->flags |= TCQ_F_OFFLOADED;
1211 	else
1212 		sch->flags &= ~TCQ_F_OFFLOADED;
1213 
1214 	sch->qstats.overlimits = q->overlimits;
1215 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1216 	 * no change can happen on the qdisc parameters.
1217 	 */
1218 
1219 	gopt.direct_pkts = q->direct_pkts;
1220 	gopt.version = HTB_VER;
1221 	gopt.rate2quantum = q->rate2quantum;
1222 	gopt.defcls = q->defcls;
1223 	gopt.debug = 0;
1224 
1225 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1226 	if (nest == NULL)
1227 		goto nla_put_failure;
1228 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1229 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1230 		goto nla_put_failure;
1231 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1232 		goto nla_put_failure;
1233 
1234 	return nla_nest_end(skb, nest);
1235 
1236 nla_put_failure:
1237 	nla_nest_cancel(skb, nest);
1238 	return -1;
1239 }
1240 
1241 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1242 			  struct sk_buff *skb, struct tcmsg *tcm)
1243 {
1244 	struct htb_class *cl = (struct htb_class *)arg;
1245 	struct htb_sched *q = qdisc_priv(sch);
1246 	struct nlattr *nest;
1247 	struct tc_htb_opt opt;
1248 
1249 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1250 	 * no change can happen on the class parameters.
1251 	 */
1252 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1253 	tcm->tcm_handle = cl->common.classid;
1254 	if (!cl->level && cl->leaf.q)
1255 		tcm->tcm_info = cl->leaf.q->handle;
1256 
1257 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1258 	if (nest == NULL)
1259 		goto nla_put_failure;
1260 
1261 	memset(&opt, 0, sizeof(opt));
1262 
1263 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1264 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1265 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1266 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1267 	opt.quantum = cl->quantum;
1268 	opt.prio = cl->prio;
1269 	opt.level = cl->level;
1270 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1271 		goto nla_put_failure;
1272 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1273 		goto nla_put_failure;
1274 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1275 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1276 			      TCA_HTB_PAD))
1277 		goto nla_put_failure;
1278 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1279 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1280 			      TCA_HTB_PAD))
1281 		goto nla_put_failure;
1282 
1283 	return nla_nest_end(skb, nest);
1284 
1285 nla_put_failure:
1286 	nla_nest_cancel(skb, nest);
1287 	return -1;
1288 }
1289 
1290 static void htb_offload_aggregate_stats(struct htb_sched *q,
1291 					struct htb_class *cl)
1292 {
1293 	u64 bytes = 0, packets = 0;
1294 	struct htb_class *c;
1295 	unsigned int i;
1296 
1297 	gnet_stats_basic_sync_init(&cl->bstats);
1298 
1299 	for (i = 0; i < q->clhash.hashsize; i++) {
1300 		hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) {
1301 			struct htb_class *p = c;
1302 
1303 			while (p && p->level < cl->level)
1304 				p = p->parent;
1305 
1306 			if (p != cl)
1307 				continue;
1308 
1309 			bytes += u64_stats_read(&c->bstats_bias.bytes);
1310 			packets += u64_stats_read(&c->bstats_bias.packets);
1311 			if (c->level == 0) {
1312 				bytes += u64_stats_read(&c->leaf.q->bstats.bytes);
1313 				packets += u64_stats_read(&c->leaf.q->bstats.packets);
1314 			}
1315 		}
1316 	}
1317 	_bstats_update(&cl->bstats, bytes, packets);
1318 }
1319 
1320 static int
1321 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1322 {
1323 	struct htb_class *cl = (struct htb_class *)arg;
1324 	struct htb_sched *q = qdisc_priv(sch);
1325 	struct gnet_stats_queue qs = {
1326 		.drops = cl->drops,
1327 		.overlimits = cl->overlimits,
1328 	};
1329 	__u32 qlen = 0;
1330 
1331 	if (!cl->level && cl->leaf.q)
1332 		qdisc_qstats_qlen_backlog(cl->leaf.q, &qlen, &qs.backlog);
1333 
1334 	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1335 				    INT_MIN, INT_MAX);
1336 	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1337 				     INT_MIN, INT_MAX);
1338 
1339 	if (q->offload) {
1340 		if (!cl->level) {
1341 			if (cl->leaf.q)
1342 				cl->bstats = cl->leaf.q->bstats;
1343 			else
1344 				gnet_stats_basic_sync_init(&cl->bstats);
1345 			_bstats_update(&cl->bstats,
1346 				       u64_stats_read(&cl->bstats_bias.bytes),
1347 				       u64_stats_read(&cl->bstats_bias.packets));
1348 		} else {
1349 			htb_offload_aggregate_stats(q, cl);
1350 		}
1351 	}
1352 
1353 	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1354 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1355 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1356 		return -1;
1357 
1358 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1359 }
1360 
1361 static struct netdev_queue *
1362 htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
1363 {
1364 	struct net_device *dev = qdisc_dev(sch);
1365 	struct tc_htb_qopt_offload offload_opt;
1366 	struct htb_sched *q = qdisc_priv(sch);
1367 	int err;
1368 
1369 	if (!q->offload)
1370 		return sch->dev_queue;
1371 
1372 	offload_opt = (struct tc_htb_qopt_offload) {
1373 		.command = TC_HTB_LEAF_QUERY_QUEUE,
1374 		.classid = TC_H_MIN(tcm->tcm_parent),
1375 	};
1376 	err = htb_offload(dev, &offload_opt);
1377 	if (err || offload_opt.qid >= dev->num_tx_queues)
1378 		return NULL;
1379 	return netdev_get_tx_queue(dev, offload_opt.qid);
1380 }
1381 
1382 static struct Qdisc *
1383 htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q)
1384 {
1385 	struct net_device *dev = dev_queue->dev;
1386 	struct Qdisc *old_q;
1387 
1388 	if (dev->flags & IFF_UP)
1389 		dev_deactivate(dev);
1390 	old_q = dev_graft_qdisc(dev_queue, new_q);
1391 	if (new_q)
1392 		new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1393 	if (dev->flags & IFF_UP)
1394 		dev_activate(dev);
1395 
1396 	return old_q;
1397 }
1398 
1399 static struct netdev_queue *htb_offload_get_queue(struct htb_class *cl)
1400 {
1401 	struct netdev_queue *queue;
1402 
1403 	queue = cl->leaf.offload_queue;
1404 	if (!(cl->leaf.q->flags & TCQ_F_BUILTIN))
1405 		WARN_ON(cl->leaf.q->dev_queue != queue);
1406 
1407 	return queue;
1408 }
1409 
1410 static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old,
1411 				   struct htb_class *cl_new, bool destroying)
1412 {
1413 	struct netdev_queue *queue_old, *queue_new;
1414 	struct net_device *dev = qdisc_dev(sch);
1415 
1416 	queue_old = htb_offload_get_queue(cl_old);
1417 	queue_new = htb_offload_get_queue(cl_new);
1418 
1419 	if (!destroying) {
1420 		struct Qdisc *qdisc;
1421 
1422 		if (dev->flags & IFF_UP)
1423 			dev_deactivate(dev);
1424 		qdisc = dev_graft_qdisc(queue_old, NULL);
1425 		WARN_ON(qdisc != cl_old->leaf.q);
1426 	}
1427 
1428 	if (!(cl_old->leaf.q->flags & TCQ_F_BUILTIN))
1429 		cl_old->leaf.q->dev_queue = queue_new;
1430 	cl_old->leaf.offload_queue = queue_new;
1431 
1432 	if (!destroying) {
1433 		struct Qdisc *qdisc;
1434 
1435 		qdisc = dev_graft_qdisc(queue_new, cl_old->leaf.q);
1436 		if (dev->flags & IFF_UP)
1437 			dev_activate(dev);
1438 		WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
1439 	}
1440 }
1441 
1442 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1443 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1444 {
1445 	struct netdev_queue *dev_queue = sch->dev_queue;
1446 	struct htb_class *cl = (struct htb_class *)arg;
1447 	struct htb_sched *q = qdisc_priv(sch);
1448 	struct Qdisc *old_q;
1449 
1450 	if (cl->level)
1451 		return -EINVAL;
1452 
1453 	if (q->offload)
1454 		dev_queue = htb_offload_get_queue(cl);
1455 
1456 	if (!new) {
1457 		new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1458 					cl->common.classid, extack);
1459 		if (!new)
1460 			return -ENOBUFS;
1461 	}
1462 
1463 	if (q->offload) {
1464 		/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1465 		qdisc_refcount_inc(new);
1466 		old_q = htb_graft_helper(dev_queue, new);
1467 	}
1468 
1469 	*old = qdisc_replace(sch, new, &cl->leaf.q);
1470 
1471 	if (q->offload) {
1472 		WARN_ON(old_q != *old);
1473 		qdisc_put(old_q);
1474 	}
1475 
1476 	return 0;
1477 }
1478 
1479 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1480 {
1481 	struct htb_class *cl = (struct htb_class *)arg;
1482 	return !cl->level ? cl->leaf.q : NULL;
1483 }
1484 
1485 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1486 {
1487 	struct htb_class *cl = (struct htb_class *)arg;
1488 
1489 	htb_deactivate(qdisc_priv(sch), cl);
1490 }
1491 
1492 static inline int htb_parent_last_child(struct htb_class *cl)
1493 {
1494 	if (!cl->parent)
1495 		/* the root class */
1496 		return 0;
1497 	if (cl->parent->children > 1)
1498 		/* not the last child */
1499 		return 0;
1500 	return 1;
1501 }
1502 
1503 static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
1504 			       struct Qdisc *new_q)
1505 {
1506 	struct htb_sched *q = qdisc_priv(sch);
1507 	struct htb_class *parent = cl->parent;
1508 
1509 	WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
1510 
1511 	if (parent->cmode != HTB_CAN_SEND)
1512 		htb_safe_rb_erase(&parent->pq_node,
1513 				  &q->hlevel[parent->level].wait_pq);
1514 
1515 	parent->level = 0;
1516 	memset(&parent->inner, 0, sizeof(parent->inner));
1517 	parent->leaf.q = new_q ? new_q : &noop_qdisc;
1518 	parent->tokens = parent->buffer;
1519 	parent->ctokens = parent->cbuffer;
1520 	parent->t_c = ktime_get_ns();
1521 	parent->cmode = HTB_CAN_SEND;
1522 	if (q->offload)
1523 		parent->leaf.offload_queue = cl->leaf.offload_queue;
1524 }
1525 
1526 static void htb_parent_to_leaf_offload(struct Qdisc *sch,
1527 				       struct netdev_queue *dev_queue,
1528 				       struct Qdisc *new_q)
1529 {
1530 	struct Qdisc *old_q;
1531 
1532 	/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1533 	if (new_q)
1534 		qdisc_refcount_inc(new_q);
1535 	old_q = htb_graft_helper(dev_queue, new_q);
1536 	WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1537 }
1538 
1539 static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
1540 				     bool last_child, bool destroying,
1541 				     struct netlink_ext_ack *extack)
1542 {
1543 	struct tc_htb_qopt_offload offload_opt;
1544 	struct netdev_queue *dev_queue;
1545 	struct Qdisc *q = cl->leaf.q;
1546 	struct Qdisc *old;
1547 	int err;
1548 
1549 	if (cl->level)
1550 		return -EINVAL;
1551 
1552 	WARN_ON(!q);
1553 	dev_queue = htb_offload_get_queue(cl);
1554 	/* When destroying, caller qdisc_graft grafts the new qdisc and invokes
1555 	 * qdisc_put for the qdisc being destroyed. htb_destroy_class_offload
1556 	 * does not need to graft or qdisc_put the qdisc being destroyed.
1557 	 */
1558 	if (!destroying) {
1559 		old = htb_graft_helper(dev_queue, NULL);
1560 		/* Last qdisc grafted should be the same as cl->leaf.q when
1561 		 * calling htb_delete.
1562 		 */
1563 		WARN_ON(old != q);
1564 	}
1565 
1566 	if (cl->parent) {
1567 		_bstats_update(&cl->parent->bstats_bias,
1568 			       u64_stats_read(&q->bstats.bytes),
1569 			       u64_stats_read(&q->bstats.packets));
1570 	}
1571 
1572 	offload_opt = (struct tc_htb_qopt_offload) {
1573 		.command = !last_child ? TC_HTB_LEAF_DEL :
1574 			   destroying ? TC_HTB_LEAF_DEL_LAST_FORCE :
1575 			   TC_HTB_LEAF_DEL_LAST,
1576 		.classid = cl->common.classid,
1577 		.extack = extack,
1578 	};
1579 	err = htb_offload(qdisc_dev(sch), &offload_opt);
1580 
1581 	if (!destroying) {
1582 		if (!err)
1583 			qdisc_put(old);
1584 		else
1585 			htb_graft_helper(dev_queue, old);
1586 	}
1587 
1588 	if (last_child)
1589 		return err;
1590 
1591 	if (!err && offload_opt.classid != TC_H_MIN(cl->common.classid)) {
1592 		u32 classid = TC_H_MAJ(sch->handle) |
1593 			      TC_H_MIN(offload_opt.classid);
1594 		struct htb_class *moved_cl = htb_find(classid, sch);
1595 
1596 		htb_offload_move_qdisc(sch, moved_cl, cl, destroying);
1597 	}
1598 
1599 	return err;
1600 }
1601 
1602 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1603 {
1604 	if (!cl->level) {
1605 		WARN_ON(!cl->leaf.q);
1606 		qdisc_put(cl->leaf.q);
1607 	}
1608 	gen_kill_estimator(&cl->rate_est);
1609 	tcf_block_put(cl->block);
1610 	kfree(cl);
1611 }
1612 
1613 static void htb_destroy(struct Qdisc *sch)
1614 {
1615 	struct net_device *dev = qdisc_dev(sch);
1616 	struct tc_htb_qopt_offload offload_opt;
1617 	struct htb_sched *q = qdisc_priv(sch);
1618 	struct hlist_node *next;
1619 	bool nonempty, changed;
1620 	struct htb_class *cl;
1621 	unsigned int i;
1622 
1623 	cancel_work_sync(&q->work);
1624 	qdisc_watchdog_cancel(&q->watchdog);
1625 	/* This line used to be after htb_destroy_class call below
1626 	 * and surprisingly it worked in 2.4. But it must precede it
1627 	 * because filter need its target class alive to be able to call
1628 	 * unbind_filter on it (without Oops).
1629 	 */
1630 	tcf_block_put(q->block);
1631 
1632 	for (i = 0; i < q->clhash.hashsize; i++) {
1633 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1634 			tcf_block_put(cl->block);
1635 			cl->block = NULL;
1636 		}
1637 	}
1638 
1639 	do {
1640 		nonempty = false;
1641 		changed = false;
1642 		for (i = 0; i < q->clhash.hashsize; i++) {
1643 			hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1644 						  common.hnode) {
1645 				bool last_child;
1646 
1647 				if (!q->offload) {
1648 					htb_destroy_class(sch, cl);
1649 					continue;
1650 				}
1651 
1652 				nonempty = true;
1653 
1654 				if (cl->level)
1655 					continue;
1656 
1657 				changed = true;
1658 
1659 				last_child = htb_parent_last_child(cl);
1660 				htb_destroy_class_offload(sch, cl, last_child,
1661 							  true, NULL);
1662 				qdisc_class_hash_remove(&q->clhash,
1663 							&cl->common);
1664 				if (cl->parent)
1665 					cl->parent->children--;
1666 				if (last_child)
1667 					htb_parent_to_leaf(sch, cl, NULL);
1668 				htb_destroy_class(sch, cl);
1669 			}
1670 		}
1671 	} while (changed);
1672 	WARN_ON(nonempty);
1673 
1674 	qdisc_class_hash_destroy(&q->clhash);
1675 	__qdisc_reset_queue(&q->direct_queue);
1676 
1677 	if (q->offload) {
1678 		offload_opt = (struct tc_htb_qopt_offload) {
1679 			.command = TC_HTB_DESTROY,
1680 		};
1681 		htb_offload(dev, &offload_opt);
1682 	}
1683 
1684 	if (!q->direct_qdiscs)
1685 		return;
1686 	for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++)
1687 		qdisc_put(q->direct_qdiscs[i]);
1688 	kfree(q->direct_qdiscs);
1689 }
1690 
1691 static int htb_delete(struct Qdisc *sch, unsigned long arg,
1692 		      struct netlink_ext_ack *extack)
1693 {
1694 	struct htb_sched *q = qdisc_priv(sch);
1695 	struct htb_class *cl = (struct htb_class *)arg;
1696 	struct Qdisc *new_q = NULL;
1697 	int last_child = 0;
1698 	int err;
1699 
1700 	/* TODO: why don't allow to delete subtree ? references ? does
1701 	 * tc subsys guarantee us that in htb_destroy it holds no class
1702 	 * refs so that we can remove children safely there ?
1703 	 */
1704 	if (cl->children || qdisc_class_in_use(&cl->common)) {
1705 		NL_SET_ERR_MSG(extack, "HTB class in use");
1706 		return -EBUSY;
1707 	}
1708 
1709 	if (!cl->level && htb_parent_last_child(cl))
1710 		last_child = 1;
1711 
1712 	if (q->offload) {
1713 		err = htb_destroy_class_offload(sch, cl, last_child, false,
1714 						extack);
1715 		if (err)
1716 			return err;
1717 	}
1718 
1719 	if (last_child) {
1720 		struct netdev_queue *dev_queue = sch->dev_queue;
1721 
1722 		if (q->offload)
1723 			dev_queue = htb_offload_get_queue(cl);
1724 
1725 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1726 					  cl->parent->common.classid,
1727 					  NULL);
1728 		if (q->offload)
1729 			htb_parent_to_leaf_offload(sch, dev_queue, new_q);
1730 	}
1731 
1732 	sch_tree_lock(sch);
1733 
1734 	if (!cl->level)
1735 		qdisc_purge_queue(cl->leaf.q);
1736 
1737 	/* delete from hash and active; remainder in destroy_class */
1738 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1739 	if (cl->parent)
1740 		cl->parent->children--;
1741 
1742 	htb_deactivate(q, cl);
1743 
1744 	if (cl->cmode != HTB_CAN_SEND)
1745 		htb_safe_rb_erase(&cl->pq_node,
1746 				  &q->hlevel[cl->level].wait_pq);
1747 
1748 	if (last_child)
1749 		htb_parent_to_leaf(sch, cl, new_q);
1750 
1751 	sch_tree_unlock(sch);
1752 
1753 	htb_destroy_class(sch, cl);
1754 	return 0;
1755 }
1756 
1757 static int htb_change_class(struct Qdisc *sch, u32 classid,
1758 			    u32 parentid, struct nlattr **tca,
1759 			    unsigned long *arg, struct netlink_ext_ack *extack)
1760 {
1761 	int err = -EINVAL;
1762 	struct htb_sched *q = qdisc_priv(sch);
1763 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1764 	struct tc_htb_qopt_offload offload_opt;
1765 	struct nlattr *opt = tca[TCA_OPTIONS];
1766 	struct nlattr *tb[TCA_HTB_MAX + 1];
1767 	struct Qdisc *parent_qdisc = NULL;
1768 	struct netdev_queue *dev_queue;
1769 	struct tc_htb_opt *hopt;
1770 	u64 rate64, ceil64;
1771 	int warn = 0;
1772 
1773 	/* extract all subattrs from opt attr */
1774 	if (!opt)
1775 		goto failure;
1776 
1777 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1778 					  extack);
1779 	if (err < 0)
1780 		goto failure;
1781 
1782 	err = -EINVAL;
1783 	if (tb[TCA_HTB_PARMS] == NULL)
1784 		goto failure;
1785 
1786 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1787 
1788 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1789 	if (!hopt->rate.rate || !hopt->ceil.rate)
1790 		goto failure;
1791 
1792 	if (q->offload) {
1793 		/* Options not supported by the offload. */
1794 		if (hopt->rate.overhead || hopt->ceil.overhead) {
1795 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the overhead parameter");
1796 			goto failure;
1797 		}
1798 		if (hopt->rate.mpu || hopt->ceil.mpu) {
1799 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter");
1800 			goto failure;
1801 		}
1802 	}
1803 
1804 	/* Keeping backward compatible with rate_table based iproute2 tc */
1805 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1806 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
1807 					      NULL));
1808 
1809 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1810 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
1811 					      NULL));
1812 
1813 	rate64 = nla_get_u64_default(tb[TCA_HTB_RATE64], 0);
1814 	ceil64 = nla_get_u64_default(tb[TCA_HTB_CEIL64], 0);
1815 
1816 	if (!cl) {		/* new class */
1817 		struct net_device *dev = qdisc_dev(sch);
1818 		struct Qdisc *new_q, *old_q;
1819 		int prio;
1820 		struct {
1821 			struct nlattr		nla;
1822 			struct gnet_estimator	opt;
1823 		} est = {
1824 			.nla = {
1825 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1826 				.nla_type	= TCA_RATE,
1827 			},
1828 			.opt = {
1829 				/* 4s interval, 16s averaging constant */
1830 				.interval	= 2,
1831 				.ewma_log	= 2,
1832 			},
1833 		};
1834 
1835 		/* check for valid classid */
1836 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1837 		    htb_find(classid, sch))
1838 			goto failure;
1839 
1840 		/* check maximal depth */
1841 		if (parent && parent->parent && parent->parent->level < 2) {
1842 			NL_SET_ERR_MSG_MOD(extack, "tree is too deep");
1843 			goto failure;
1844 		}
1845 		err = -ENOBUFS;
1846 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1847 		if (!cl)
1848 			goto failure;
1849 
1850 		gnet_stats_basic_sync_init(&cl->bstats);
1851 		gnet_stats_basic_sync_init(&cl->bstats_bias);
1852 
1853 		err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1854 		if (err) {
1855 			kfree(cl);
1856 			goto failure;
1857 		}
1858 		if (htb_rate_est || tca[TCA_RATE]) {
1859 			err = gen_new_estimator(&cl->bstats, NULL,
1860 						&cl->rate_est,
1861 						NULL,
1862 						true,
1863 						tca[TCA_RATE] ? : &est.nla);
1864 			if (err)
1865 				goto err_block_put;
1866 		}
1867 
1868 		cl->children = 0;
1869 		RB_CLEAR_NODE(&cl->pq_node);
1870 
1871 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1872 			RB_CLEAR_NODE(&cl->node[prio]);
1873 
1874 		cl->common.classid = classid;
1875 
1876 		/* Make sure nothing interrupts us in between of two
1877 		 * ndo_setup_tc calls.
1878 		 */
1879 		ASSERT_RTNL();
1880 
1881 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1882 		 * so that can't be used inside of sch_tree_lock
1883 		 * -- thanks to Karlis Peisenieks
1884 		 */
1885 		if (!q->offload) {
1886 			dev_queue = sch->dev_queue;
1887 		} else if (!(parent && !parent->level)) {
1888 			/* Assign a dev_queue to this classid. */
1889 			offload_opt = (struct tc_htb_qopt_offload) {
1890 				.command = TC_HTB_LEAF_ALLOC_QUEUE,
1891 				.classid = cl->common.classid,
1892 				.parent_classid = parent ?
1893 					TC_H_MIN(parent->common.classid) :
1894 					TC_HTB_CLASSID_ROOT,
1895 				.rate = max_t(u64, hopt->rate.rate, rate64),
1896 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1897 				.prio = hopt->prio,
1898 				.quantum = hopt->quantum,
1899 				.extack = extack,
1900 			};
1901 			err = htb_offload(dev, &offload_opt);
1902 			if (err) {
1903 				NL_SET_ERR_MSG_WEAK(extack,
1904 						    "Failed to offload TC_HTB_LEAF_ALLOC_QUEUE");
1905 				goto err_kill_estimator;
1906 			}
1907 			dev_queue = netdev_get_tx_queue(dev, offload_opt.qid);
1908 		} else { /* First child. */
1909 			dev_queue = htb_offload_get_queue(parent);
1910 			old_q = htb_graft_helper(dev_queue, NULL);
1911 			WARN_ON(old_q != parent->leaf.q);
1912 			offload_opt = (struct tc_htb_qopt_offload) {
1913 				.command = TC_HTB_LEAF_TO_INNER,
1914 				.classid = cl->common.classid,
1915 				.parent_classid =
1916 					TC_H_MIN(parent->common.classid),
1917 				.rate = max_t(u64, hopt->rate.rate, rate64),
1918 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1919 				.prio = hopt->prio,
1920 				.quantum = hopt->quantum,
1921 				.extack = extack,
1922 			};
1923 			err = htb_offload(dev, &offload_opt);
1924 			if (err) {
1925 				NL_SET_ERR_MSG_WEAK(extack,
1926 						    "Failed to offload TC_HTB_LEAF_TO_INNER");
1927 				htb_graft_helper(dev_queue, old_q);
1928 				goto err_kill_estimator;
1929 			}
1930 			_bstats_update(&parent->bstats_bias,
1931 				       u64_stats_read(&old_q->bstats.bytes),
1932 				       u64_stats_read(&old_q->bstats.packets));
1933 			qdisc_put(old_q);
1934 		}
1935 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1936 					  classid, NULL);
1937 		if (q->offload) {
1938 			/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1939 			if (new_q)
1940 				qdisc_refcount_inc(new_q);
1941 			old_q = htb_graft_helper(dev_queue, new_q);
1942 			/* No qdisc_put needed. */
1943 			WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1944 		}
1945 		sch_tree_lock(sch);
1946 		if (parent && !parent->level) {
1947 			/* turn parent into inner node */
1948 			qdisc_purge_queue(parent->leaf.q);
1949 			parent_qdisc = parent->leaf.q;
1950 			htb_deactivate(q, parent);
1951 
1952 			/* remove from evt list because of level change */
1953 			if (parent->cmode != HTB_CAN_SEND) {
1954 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1955 				parent->cmode = HTB_CAN_SEND;
1956 			}
1957 			parent->level = (parent->parent ? parent->parent->level
1958 					 : TC_HTB_MAXDEPTH) - 1;
1959 			memset(&parent->inner, 0, sizeof(parent->inner));
1960 		}
1961 
1962 		/* leaf (we) needs elementary qdisc */
1963 		cl->leaf.q = new_q ? new_q : &noop_qdisc;
1964 		if (q->offload)
1965 			cl->leaf.offload_queue = dev_queue;
1966 
1967 		cl->parent = parent;
1968 
1969 		/* set class to be in HTB_CAN_SEND state */
1970 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1971 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1972 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1973 		cl->t_c = ktime_get_ns();
1974 		cl->cmode = HTB_CAN_SEND;
1975 
1976 		/* attach to the hash list and parent's family */
1977 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1978 		if (parent)
1979 			parent->children++;
1980 		if (cl->leaf.q != &noop_qdisc)
1981 			qdisc_hash_add(cl->leaf.q, true);
1982 	} else {
1983 		if (tca[TCA_RATE]) {
1984 			err = gen_replace_estimator(&cl->bstats, NULL,
1985 						    &cl->rate_est,
1986 						    NULL,
1987 						    true,
1988 						    tca[TCA_RATE]);
1989 			if (err)
1990 				return err;
1991 		}
1992 
1993 		if (q->offload) {
1994 			struct net_device *dev = qdisc_dev(sch);
1995 
1996 			offload_opt = (struct tc_htb_qopt_offload) {
1997 				.command = TC_HTB_NODE_MODIFY,
1998 				.classid = cl->common.classid,
1999 				.rate = max_t(u64, hopt->rate.rate, rate64),
2000 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
2001 				.prio = hopt->prio,
2002 				.quantum = hopt->quantum,
2003 				.extack = extack,
2004 			};
2005 			err = htb_offload(dev, &offload_opt);
2006 			if (err)
2007 				/* Estimator was replaced, and rollback may fail
2008 				 * as well, so we don't try to recover it, and
2009 				 * the estimator won't work property with the
2010 				 * offload anyway, because bstats are updated
2011 				 * only when the stats are queried.
2012 				 */
2013 				return err;
2014 		}
2015 
2016 		sch_tree_lock(sch);
2017 	}
2018 
2019 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
2020 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
2021 
2022 	/* it used to be a nasty bug here, we have to check that node
2023 	 * is really leaf before changing cl->leaf !
2024 	 */
2025 	if (!cl->level) {
2026 		u64 quantum = cl->rate.rate_bytes_ps;
2027 
2028 		do_div(quantum, q->rate2quantum);
2029 		cl->quantum = min_t(u64, quantum, INT_MAX);
2030 
2031 		if (!hopt->quantum && cl->quantum < 1000) {
2032 			warn = -1;
2033 			cl->quantum = 1000;
2034 		}
2035 		if (!hopt->quantum && cl->quantum > 200000) {
2036 			warn = 1;
2037 			cl->quantum = 200000;
2038 		}
2039 		if (hopt->quantum)
2040 			cl->quantum = hopt->quantum;
2041 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
2042 			cl->prio = TC_HTB_NUMPRIO - 1;
2043 	}
2044 
2045 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
2046 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
2047 
2048 	sch_tree_unlock(sch);
2049 	qdisc_put(parent_qdisc);
2050 
2051 	if (warn)
2052 		NL_SET_ERR_MSG_FMT_MOD(extack,
2053 				       "quantum of class %X is %s. Consider r2q change.",
2054 				       cl->common.classid, (warn == -1 ? "small" : "big"));
2055 
2056 	qdisc_class_hash_grow(sch, &q->clhash);
2057 
2058 	*arg = (unsigned long)cl;
2059 	return 0;
2060 
2061 err_kill_estimator:
2062 	gen_kill_estimator(&cl->rate_est);
2063 err_block_put:
2064 	tcf_block_put(cl->block);
2065 	kfree(cl);
2066 failure:
2067 	return err;
2068 }
2069 
2070 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg,
2071 				       struct netlink_ext_ack *extack)
2072 {
2073 	struct htb_sched *q = qdisc_priv(sch);
2074 	struct htb_class *cl = (struct htb_class *)arg;
2075 
2076 	return cl ? cl->block : q->block;
2077 }
2078 
2079 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
2080 				     u32 classid)
2081 {
2082 	struct htb_class *cl = htb_find(classid, sch);
2083 
2084 	/*if (cl && !cl->level) return 0;
2085 	 * The line above used to be there to prevent attaching filters to
2086 	 * leaves. But at least tc_index filter uses this just to get class
2087 	 * for other reasons so that we have to allow for it.
2088 	 * ----
2089 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
2090 	 * another way to "lock" the class - unlike "get" this lock can
2091 	 * be broken by class during destroy IIUC.
2092 	 */
2093 	if (cl)
2094 		qdisc_class_get(&cl->common);
2095 	return (unsigned long)cl;
2096 }
2097 
2098 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
2099 {
2100 	struct htb_class *cl = (struct htb_class *)arg;
2101 
2102 	qdisc_class_put(&cl->common);
2103 }
2104 
2105 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2106 {
2107 	struct htb_sched *q = qdisc_priv(sch);
2108 	struct htb_class *cl;
2109 	unsigned int i;
2110 
2111 	if (arg->stop)
2112 		return;
2113 
2114 	for (i = 0; i < q->clhash.hashsize; i++) {
2115 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
2116 			if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
2117 				return;
2118 		}
2119 	}
2120 }
2121 
2122 static const struct Qdisc_class_ops htb_class_ops = {
2123 	.select_queue	=	htb_select_queue,
2124 	.graft		=	htb_graft,
2125 	.leaf		=	htb_leaf,
2126 	.qlen_notify	=	htb_qlen_notify,
2127 	.find		=	htb_search,
2128 	.change		=	htb_change_class,
2129 	.delete		=	htb_delete,
2130 	.walk		=	htb_walk,
2131 	.tcf_block	=	htb_tcf_block,
2132 	.bind_tcf	=	htb_bind_filter,
2133 	.unbind_tcf	=	htb_unbind_filter,
2134 	.dump		=	htb_dump_class,
2135 	.dump_stats	=	htb_dump_class_stats,
2136 };
2137 
2138 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
2139 	.cl_ops		=	&htb_class_ops,
2140 	.id		=	"htb",
2141 	.priv_size	=	sizeof(struct htb_sched),
2142 	.enqueue	=	htb_enqueue,
2143 	.dequeue	=	htb_dequeue,
2144 	.peek		=	qdisc_peek_dequeued,
2145 	.init		=	htb_init,
2146 	.attach		=	htb_attach,
2147 	.reset		=	htb_reset,
2148 	.destroy	=	htb_destroy,
2149 	.dump		=	htb_dump,
2150 	.owner		=	THIS_MODULE,
2151 };
2152 MODULE_ALIAS_NET_SCH("htb");
2153 
2154 static int __init htb_module_init(void)
2155 {
2156 	return register_qdisc(&htb_qdisc_ops);
2157 }
2158 static void __exit htb_module_exit(void)
2159 {
2160 	unregister_qdisc(&htb_qdisc_ops);
2161 }
2162 
2163 module_init(htb_module_init)
2164 module_exit(htb_module_exit)
2165 MODULE_LICENSE("GPL");
2166 MODULE_DESCRIPTION("Hierarchical Token Bucket scheduler");
2167