xref: /linux/net/sched/sch_fq.c (revision 3d0fe49454652117522f60bfbefb978ba0e5300b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
4  *
5  *  Copyright (C) 2013-2023 Eric Dumazet <edumazet@google.com>
6  *
7  *  Meant to be mostly used for locally generated traffic :
8  *  Fast classification depends on skb->sk being set before reaching us.
9  *  If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
10  *  All packets belonging to a socket are considered as a 'flow'.
11  *
12  *  Flows are dynamically allocated and stored in a hash table of RB trees
13  *  They are also part of one Round Robin 'queues' (new or old flows)
14  *
15  *  Burst avoidance (aka pacing) capability :
16  *
17  *  Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
18  *  bunch of packets, and this packet scheduler adds delay between
19  *  packets to respect rate limitation.
20  *
21  *  enqueue() :
22  *   - lookup one RB tree (out of 1024 or more) to find the flow.
23  *     If non existent flow, create it, add it to the tree.
24  *     Add skb to the per flow list of skb (fifo).
25  *   - Use a special fifo for high prio packets
26  *
27  *  dequeue() : serves flows in Round Robin
28  *  Note : When a flow becomes empty, we do not immediately remove it from
29  *  rb trees, for performance reasons (its expected to send additional packets,
30  *  or SLAB cache will reuse socket for another flow)
31  */
32 
33 #include <linux/module.h>
34 #include <linux/types.h>
35 #include <linux/kernel.h>
36 #include <linux/jiffies.h>
37 #include <linux/string.h>
38 #include <linux/in.h>
39 #include <linux/errno.h>
40 #include <linux/init.h>
41 #include <linux/skbuff.h>
42 #include <linux/slab.h>
43 #include <linux/rbtree.h>
44 #include <linux/hash.h>
45 #include <linux/prefetch.h>
46 #include <linux/vmalloc.h>
47 #include <net/netlink.h>
48 #include <net/pkt_sched.h>
49 #include <net/sock.h>
50 #include <net/tcp_states.h>
51 #include <net/tcp.h>
52 
53 struct fq_skb_cb {
54 	u64	time_to_send;
55 	u8	band;
56 };
57 
58 static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
59 {
60 	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
61 	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
62 }
63 
64 /*
65  * Per flow structure, dynamically allocated.
66  * If packets have monotically increasing time_to_send, they are placed in O(1)
67  * in linear list (head,tail), otherwise are placed in a rbtree (t_root).
68  */
69 struct fq_flow {
70 /* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
71 	struct rb_root	t_root;
72 	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
73 	union {
74 		struct sk_buff *tail;	/* last skb in the list */
75 		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
76 	};
77 	union {
78 		struct rb_node	fq_node;	/* anchor in fq_root[] trees */
79 		/* Following field is only used for q->internal,
80 		 * because q->internal is not hashed in fq_root[]
81 		 */
82 		u64		stat_fastpath_packets;
83 	};
84 	struct sock	*sk;
85 	u32		socket_hash;	/* sk_hash */
86 	int		qlen;		/* number of packets in flow queue */
87 
88 /* Second cache line */
89 	int		credit;
90 	int		band;
91 	struct fq_flow *next;		/* next pointer in RR lists */
92 
93 	struct rb_node  rate_node;	/* anchor in q->delayed tree */
94 	u64		time_next_packet;
95 };
96 
97 struct fq_flow_head {
98 	struct fq_flow *first;
99 	struct fq_flow *last;
100 };
101 
102 struct fq_perband_flows {
103 	struct fq_flow_head new_flows;
104 	struct fq_flow_head old_flows;
105 	int		    credit;
106 	int		    quantum; /* based on band nr : 576KB, 192KB, 64KB */
107 };
108 
109 struct fq_sched_data {
110 /* Read mostly cache line */
111 
112 	u32		quantum;
113 	u32		initial_quantum;
114 	u32		flow_refill_delay;
115 	u32		flow_plimit;	/* max packets per flow */
116 	unsigned long	flow_max_rate;	/* optional max rate per flow */
117 	u64		ce_threshold;
118 	u64		horizon;	/* horizon in ns */
119 	u32		orphan_mask;	/* mask for orphaned skb */
120 	u32		low_rate_threshold;
121 	struct rb_root	*fq_root;
122 	u8		rate_enable;
123 	u8		fq_trees_log;
124 	u8		horizon_drop;
125 	u8		prio2band[(TC_PRIO_MAX + 1) >> 2];
126 	u32		timer_slack; /* hrtimer slack in ns */
127 
128 /* Read/Write fields. */
129 
130 	unsigned int band_nr; /* band being serviced in fq_dequeue() */
131 
132 	struct fq_perband_flows band_flows[FQ_BANDS];
133 
134 	struct fq_flow	internal;	/* fastpath queue. */
135 	struct rb_root	delayed;	/* for rate limited flows */
136 	u64		time_next_delayed_flow;
137 	unsigned long	unthrottle_latency_ns;
138 
139 	u32		band_pkt_count[FQ_BANDS];
140 	u32		flows;
141 	u32		inactive_flows; /* Flows with no packet to send. */
142 	u32		throttled_flows;
143 
144 	u64		stat_throttled;
145 	struct qdisc_watchdog watchdog;
146 	u64		stat_gc_flows;
147 
148 /* Seldom used fields. */
149 
150 	u64		stat_band_drops[FQ_BANDS];
151 	u64		stat_ce_mark;
152 	u64		stat_horizon_drops;
153 	u64		stat_horizon_caps;
154 	u64		stat_flows_plimit;
155 	u64		stat_pkts_too_long;
156 	u64		stat_allocation_errors;
157 };
158 
159 /* return the i-th 2-bit value ("crumb") */
160 static u8 fq_prio2band(const u8 *prio2band, unsigned int prio)
161 {
162 	return (prio2band[prio / 4] >> (2 * (prio & 0x3))) & 0x3;
163 }
164 
165 /*
166  * f->tail and f->age share the same location.
167  * We can use the low order bit to differentiate if this location points
168  * to a sk_buff or contains a jiffies value, if we force this value to be odd.
169  * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
170  */
171 static void fq_flow_set_detached(struct fq_flow *f)
172 {
173 	f->age = jiffies | 1UL;
174 }
175 
176 static bool fq_flow_is_detached(const struct fq_flow *f)
177 {
178 	return !!(f->age & 1UL);
179 }
180 
181 /* special value to mark a throttled flow (not on old/new list) */
182 static struct fq_flow throttled;
183 
184 static bool fq_flow_is_throttled(const struct fq_flow *f)
185 {
186 	return f->next == &throttled;
187 }
188 
189 enum new_flow {
190 	NEW_FLOW,
191 	OLD_FLOW
192 };
193 
194 static void fq_flow_add_tail(struct fq_sched_data *q, struct fq_flow *flow,
195 			     enum new_flow list_sel)
196 {
197 	struct fq_perband_flows *pband = &q->band_flows[flow->band];
198 	struct fq_flow_head *head = (list_sel == NEW_FLOW) ?
199 					&pband->new_flows :
200 					&pband->old_flows;
201 
202 	if (head->first)
203 		head->last->next = flow;
204 	else
205 		head->first = flow;
206 	head->last = flow;
207 	flow->next = NULL;
208 }
209 
210 static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
211 {
212 	rb_erase(&f->rate_node, &q->delayed);
213 	q->throttled_flows--;
214 	fq_flow_add_tail(q, f, OLD_FLOW);
215 }
216 
217 static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
218 {
219 	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
220 
221 	while (*p) {
222 		struct fq_flow *aux;
223 
224 		parent = *p;
225 		aux = rb_entry(parent, struct fq_flow, rate_node);
226 		if (f->time_next_packet >= aux->time_next_packet)
227 			p = &parent->rb_right;
228 		else
229 			p = &parent->rb_left;
230 	}
231 	rb_link_node(&f->rate_node, parent, p);
232 	rb_insert_color(&f->rate_node, &q->delayed);
233 	q->throttled_flows++;
234 	q->stat_throttled++;
235 
236 	f->next = &throttled;
237 	if (q->time_next_delayed_flow > f->time_next_packet)
238 		q->time_next_delayed_flow = f->time_next_packet;
239 }
240 
241 
242 static struct kmem_cache *fq_flow_cachep __read_mostly;
243 
244 
245 /* limit number of collected flows per round */
246 #define FQ_GC_MAX 8
247 #define FQ_GC_AGE (3*HZ)
248 
249 static bool fq_gc_candidate(const struct fq_flow *f)
250 {
251 	return fq_flow_is_detached(f) &&
252 	       time_after(jiffies, f->age + FQ_GC_AGE);
253 }
254 
255 static void fq_gc(struct fq_sched_data *q,
256 		  struct rb_root *root,
257 		  struct sock *sk)
258 {
259 	struct rb_node **p, *parent;
260 	void *tofree[FQ_GC_MAX];
261 	struct fq_flow *f;
262 	int i, fcnt = 0;
263 
264 	p = &root->rb_node;
265 	parent = NULL;
266 	while (*p) {
267 		parent = *p;
268 
269 		f = rb_entry(parent, struct fq_flow, fq_node);
270 		if (f->sk == sk)
271 			break;
272 
273 		if (fq_gc_candidate(f)) {
274 			tofree[fcnt++] = f;
275 			if (fcnt == FQ_GC_MAX)
276 				break;
277 		}
278 
279 		if (f->sk > sk)
280 			p = &parent->rb_right;
281 		else
282 			p = &parent->rb_left;
283 	}
284 
285 	if (!fcnt)
286 		return;
287 
288 	for (i = fcnt; i > 0; ) {
289 		f = tofree[--i];
290 		rb_erase(&f->fq_node, root);
291 	}
292 	q->flows -= fcnt;
293 	q->inactive_flows -= fcnt;
294 	q->stat_gc_flows += fcnt;
295 
296 	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
297 }
298 
299 /* Fast path can be used if :
300  * 1) Packet tstamp is in the past.
301  * 2) FQ qlen == 0   OR
302  *   (no flow is currently eligible for transmit,
303  *    AND fast path queue has less than 8 packets)
304  * 3) No SO_MAX_PACING_RATE on the socket (if any).
305  * 4) No @maxrate attribute on this qdisc,
306  *
307  * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure.
308  */
309 static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb,
310 			      u64 now)
311 {
312 	const struct fq_sched_data *q = qdisc_priv(sch);
313 	const struct sock *sk;
314 
315 	if (fq_skb_cb(skb)->time_to_send > now)
316 		return false;
317 
318 	if (sch->q.qlen != 0) {
319 		/* Even if some packets are stored in this qdisc,
320 		 * we can still enable fast path if all of them are
321 		 * scheduled in the future (ie no flows are eligible)
322 		 * or in the fast path queue.
323 		 */
324 		if (q->flows != q->inactive_flows + q->throttled_flows)
325 			return false;
326 
327 		/* Do not allow fast path queue to explode, we want Fair Queue mode
328 		 * under pressure.
329 		 */
330 		if (q->internal.qlen >= 8)
331 			return false;
332 	}
333 
334 	sk = skb->sk;
335 	if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
336 	    sk->sk_max_pacing_rate != ~0UL)
337 		return false;
338 
339 	if (q->flow_max_rate != ~0UL)
340 		return false;
341 
342 	return true;
343 }
344 
345 static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
346 				   u64 now)
347 {
348 	struct fq_sched_data *q = qdisc_priv(sch);
349 	struct rb_node **p, *parent;
350 	struct sock *sk = skb->sk;
351 	struct rb_root *root;
352 	struct fq_flow *f;
353 
354 	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
355 	 * or a listener (SYNCOOKIE mode)
356 	 * 1) request sockets are not full blown,
357 	 *    they do not contain sk_pacing_rate
358 	 * 2) They are not part of a 'flow' yet
359 	 * 3) We do not want to rate limit them (eg SYNFLOOD attack),
360 	 *    especially if the listener set SO_MAX_PACING_RATE
361 	 * 4) We pretend they are orphaned
362 	 */
363 	if (!sk || sk_listener(sk)) {
364 		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
365 
366 		/* By forcing low order bit to 1, we make sure to not
367 		 * collide with a local flow (socket pointers are word aligned)
368 		 */
369 		sk = (struct sock *)((hash << 1) | 1UL);
370 		skb_orphan(skb);
371 	} else if (sk->sk_state == TCP_CLOSE) {
372 		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
373 		/*
374 		 * Sockets in TCP_CLOSE are non connected.
375 		 * Typical use case is UDP sockets, they can send packets
376 		 * with sendto() to many different destinations.
377 		 * We probably could use a generic bit advertising
378 		 * non connected sockets, instead of sk_state == TCP_CLOSE,
379 		 * if we care enough.
380 		 */
381 		sk = (struct sock *)((hash << 1) | 1UL);
382 	}
383 
384 	if (fq_fastpath_check(sch, skb, now)) {
385 		q->internal.stat_fastpath_packets++;
386 		if (skb->sk == sk && q->rate_enable &&
387 		    READ_ONCE(sk->sk_pacing_status) != SK_PACING_FQ)
388 			smp_store_release(&sk->sk_pacing_status,
389 					  SK_PACING_FQ);
390 		return &q->internal;
391 	}
392 
393 	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
394 
395 	fq_gc(q, root, sk);
396 
397 	p = &root->rb_node;
398 	parent = NULL;
399 	while (*p) {
400 		parent = *p;
401 
402 		f = rb_entry(parent, struct fq_flow, fq_node);
403 		if (f->sk == sk) {
404 			/* socket might have been reallocated, so check
405 			 * if its sk_hash is the same.
406 			 * It not, we need to refill credit with
407 			 * initial quantum
408 			 */
409 			if (unlikely(skb->sk == sk &&
410 				     f->socket_hash != sk->sk_hash)) {
411 				f->credit = q->initial_quantum;
412 				f->socket_hash = sk->sk_hash;
413 				if (q->rate_enable)
414 					smp_store_release(&sk->sk_pacing_status,
415 							  SK_PACING_FQ);
416 				if (fq_flow_is_throttled(f))
417 					fq_flow_unset_throttled(q, f);
418 				f->time_next_packet = 0ULL;
419 			}
420 			return f;
421 		}
422 		if (f->sk > sk)
423 			p = &parent->rb_right;
424 		else
425 			p = &parent->rb_left;
426 	}
427 
428 	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
429 	if (unlikely(!f)) {
430 		q->stat_allocation_errors++;
431 		return &q->internal;
432 	}
433 	/* f->t_root is already zeroed after kmem_cache_zalloc() */
434 
435 	fq_flow_set_detached(f);
436 	f->sk = sk;
437 	if (skb->sk == sk) {
438 		f->socket_hash = sk->sk_hash;
439 		if (q->rate_enable)
440 			smp_store_release(&sk->sk_pacing_status,
441 					  SK_PACING_FQ);
442 	}
443 	f->credit = q->initial_quantum;
444 
445 	rb_link_node(&f->fq_node, parent, p);
446 	rb_insert_color(&f->fq_node, root);
447 
448 	q->flows++;
449 	q->inactive_flows++;
450 	return f;
451 }
452 
453 static struct sk_buff *fq_peek(struct fq_flow *flow)
454 {
455 	struct sk_buff *skb = skb_rb_first(&flow->t_root);
456 	struct sk_buff *head = flow->head;
457 
458 	if (!skb)
459 		return head;
460 
461 	if (!head)
462 		return skb;
463 
464 	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
465 		return skb;
466 	return head;
467 }
468 
469 static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
470 			  struct sk_buff *skb)
471 {
472 	if (skb == flow->head) {
473 		flow->head = skb->next;
474 	} else {
475 		rb_erase(&skb->rbnode, &flow->t_root);
476 		skb->dev = qdisc_dev(sch);
477 	}
478 }
479 
480 /* Remove one skb from flow queue.
481  * This skb must be the return value of prior fq_peek().
482  */
483 static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
484 			   struct sk_buff *skb)
485 {
486 	fq_erase_head(sch, flow, skb);
487 	skb_mark_not_on_list(skb);
488 	qdisc_qstats_backlog_dec(sch, skb);
489 	sch->q.qlen--;
490 }
491 
492 static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
493 {
494 	struct rb_node **p, *parent;
495 	struct sk_buff *head, *aux;
496 
497 	head = flow->head;
498 	if (!head ||
499 	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
500 		if (!head)
501 			flow->head = skb;
502 		else
503 			flow->tail->next = skb;
504 		flow->tail = skb;
505 		skb->next = NULL;
506 		return;
507 	}
508 
509 	p = &flow->t_root.rb_node;
510 	parent = NULL;
511 
512 	while (*p) {
513 		parent = *p;
514 		aux = rb_to_skb(parent);
515 		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
516 			p = &parent->rb_right;
517 		else
518 			p = &parent->rb_left;
519 	}
520 	rb_link_node(&skb->rbnode, parent, p);
521 	rb_insert_color(&skb->rbnode, &flow->t_root);
522 }
523 
524 static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
525 				     const struct fq_sched_data *q, u64 now)
526 {
527 	return unlikely((s64)skb->tstamp > (s64)(now + q->horizon));
528 }
529 
530 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
531 		      struct sk_buff **to_free)
532 {
533 	struct fq_sched_data *q = qdisc_priv(sch);
534 	struct fq_flow *f;
535 	u64 now;
536 	u8 band;
537 
538 	band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX);
539 	if (unlikely(q->band_pkt_count[band] >= sch->limit)) {
540 		q->stat_band_drops[band]++;
541 		return qdisc_drop(skb, sch, to_free);
542 	}
543 
544 	now = ktime_get_ns();
545 	if (!skb->tstamp) {
546 		fq_skb_cb(skb)->time_to_send = now;
547 	} else {
548 		/* Check if packet timestamp is too far in the future. */
549 		if (fq_packet_beyond_horizon(skb, q, now)) {
550 			if (q->horizon_drop) {
551 					q->stat_horizon_drops++;
552 					return qdisc_drop(skb, sch, to_free);
553 			}
554 			q->stat_horizon_caps++;
555 			skb->tstamp = now + q->horizon;
556 		}
557 		fq_skb_cb(skb)->time_to_send = skb->tstamp;
558 	}
559 
560 	f = fq_classify(sch, skb, now);
561 
562 	if (f != &q->internal) {
563 		if (unlikely(f->qlen >= q->flow_plimit)) {
564 			q->stat_flows_plimit++;
565 			return qdisc_drop(skb, sch, to_free);
566 		}
567 
568 		if (fq_flow_is_detached(f)) {
569 			fq_flow_add_tail(q, f, NEW_FLOW);
570 			if (time_after(jiffies, f->age + q->flow_refill_delay))
571 				f->credit = max_t(u32, f->credit, q->quantum);
572 		}
573 
574 		f->band = band;
575 		q->band_pkt_count[band]++;
576 		fq_skb_cb(skb)->band = band;
577 		if (f->qlen == 0)
578 			q->inactive_flows--;
579 	}
580 
581 	f->qlen++;
582 	/* Note: this overwrites f->age */
583 	flow_queue_add(f, skb);
584 
585 	qdisc_qstats_backlog_inc(sch, skb);
586 	sch->q.qlen++;
587 
588 	return NET_XMIT_SUCCESS;
589 }
590 
591 static void fq_check_throttled(struct fq_sched_data *q, u64 now)
592 {
593 	unsigned long sample;
594 	struct rb_node *p;
595 
596 	if (q->time_next_delayed_flow > now)
597 		return;
598 
599 	/* Update unthrottle latency EWMA.
600 	 * This is cheap and can help diagnosing timer/latency problems.
601 	 */
602 	sample = (unsigned long)(now - q->time_next_delayed_flow);
603 	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
604 	q->unthrottle_latency_ns += sample >> 3;
605 
606 	q->time_next_delayed_flow = ~0ULL;
607 	while ((p = rb_first(&q->delayed)) != NULL) {
608 		struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);
609 
610 		if (f->time_next_packet > now) {
611 			q->time_next_delayed_flow = f->time_next_packet;
612 			break;
613 		}
614 		fq_flow_unset_throttled(q, f);
615 	}
616 }
617 
618 static struct fq_flow_head *fq_pband_head_select(struct fq_perband_flows *pband)
619 {
620 	if (pband->credit <= 0)
621 		return NULL;
622 
623 	if (pband->new_flows.first)
624 		return &pband->new_flows;
625 
626 	return pband->old_flows.first ? &pband->old_flows : NULL;
627 }
628 
629 static struct sk_buff *fq_dequeue(struct Qdisc *sch)
630 {
631 	struct fq_sched_data *q = qdisc_priv(sch);
632 	struct fq_perband_flows *pband;
633 	struct fq_flow_head *head;
634 	struct sk_buff *skb;
635 	struct fq_flow *f;
636 	unsigned long rate;
637 	int retry;
638 	u32 plen;
639 	u64 now;
640 
641 	if (!sch->q.qlen)
642 		return NULL;
643 
644 	skb = fq_peek(&q->internal);
645 	if (unlikely(skb)) {
646 		q->internal.qlen--;
647 		fq_dequeue_skb(sch, &q->internal, skb);
648 		goto out;
649 	}
650 
651 	now = ktime_get_ns();
652 	fq_check_throttled(q, now);
653 	retry = 0;
654 	pband = &q->band_flows[q->band_nr];
655 begin:
656 	head = fq_pband_head_select(pband);
657 	if (!head) {
658 		while (++retry <= FQ_BANDS) {
659 			if (++q->band_nr == FQ_BANDS)
660 				q->band_nr = 0;
661 			pband = &q->band_flows[q->band_nr];
662 			pband->credit = min(pband->credit + pband->quantum,
663 					    pband->quantum);
664 			goto begin;
665 		}
666 		if (q->time_next_delayed_flow != ~0ULL)
667 			qdisc_watchdog_schedule_range_ns(&q->watchdog,
668 							q->time_next_delayed_flow,
669 							q->timer_slack);
670 		return NULL;
671 	}
672 	f = head->first;
673 	retry = 0;
674 	if (f->credit <= 0) {
675 		f->credit += q->quantum;
676 		head->first = f->next;
677 		fq_flow_add_tail(q, f, OLD_FLOW);
678 		goto begin;
679 	}
680 
681 	skb = fq_peek(f);
682 	if (skb) {
683 		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
684 					     f->time_next_packet);
685 
686 		if (now < time_next_packet) {
687 			head->first = f->next;
688 			f->time_next_packet = time_next_packet;
689 			fq_flow_set_throttled(q, f);
690 			goto begin;
691 		}
692 		prefetch(&skb->end);
693 		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
694 			INET_ECN_set_ce(skb);
695 			q->stat_ce_mark++;
696 		}
697 		if (--f->qlen == 0)
698 			q->inactive_flows++;
699 		q->band_pkt_count[fq_skb_cb(skb)->band]--;
700 		fq_dequeue_skb(sch, f, skb);
701 	} else {
702 		head->first = f->next;
703 		/* force a pass through old_flows to prevent starvation */
704 		if (head == &pband->new_flows) {
705 			fq_flow_add_tail(q, f, OLD_FLOW);
706 		} else {
707 			fq_flow_set_detached(f);
708 		}
709 		goto begin;
710 	}
711 	plen = qdisc_pkt_len(skb);
712 	f->credit -= plen;
713 	pband->credit -= plen;
714 
715 	if (!q->rate_enable)
716 		goto out;
717 
718 	rate = q->flow_max_rate;
719 
720 	/* If EDT time was provided for this skb, we need to
721 	 * update f->time_next_packet only if this qdisc enforces
722 	 * a flow max rate.
723 	 */
724 	if (!skb->tstamp) {
725 		if (skb->sk)
726 			rate = min(READ_ONCE(skb->sk->sk_pacing_rate), rate);
727 
728 		if (rate <= q->low_rate_threshold) {
729 			f->credit = 0;
730 		} else {
731 			plen = max(plen, q->quantum);
732 			if (f->credit > 0)
733 				goto out;
734 		}
735 	}
736 	if (rate != ~0UL) {
737 		u64 len = (u64)plen * NSEC_PER_SEC;
738 
739 		if (likely(rate))
740 			len = div64_ul(len, rate);
741 		/* Since socket rate can change later,
742 		 * clamp the delay to 1 second.
743 		 * Really, providers of too big packets should be fixed !
744 		 */
745 		if (unlikely(len > NSEC_PER_SEC)) {
746 			len = NSEC_PER_SEC;
747 			q->stat_pkts_too_long++;
748 		}
749 		/* Account for schedule/timers drifts.
750 		 * f->time_next_packet was set when prior packet was sent,
751 		 * and current time (@now) can be too late by tens of us.
752 		 */
753 		if (f->time_next_packet)
754 			len -= min(len/2, now - f->time_next_packet);
755 		f->time_next_packet = now + len;
756 	}
757 out:
758 	qdisc_bstats_update(sch, skb);
759 	return skb;
760 }
761 
762 static void fq_flow_purge(struct fq_flow *flow)
763 {
764 	struct rb_node *p = rb_first(&flow->t_root);
765 
766 	while (p) {
767 		struct sk_buff *skb = rb_to_skb(p);
768 
769 		p = rb_next(p);
770 		rb_erase(&skb->rbnode, &flow->t_root);
771 		rtnl_kfree_skbs(skb, skb);
772 	}
773 	rtnl_kfree_skbs(flow->head, flow->tail);
774 	flow->head = NULL;
775 	flow->qlen = 0;
776 }
777 
778 static void fq_reset(struct Qdisc *sch)
779 {
780 	struct fq_sched_data *q = qdisc_priv(sch);
781 	struct rb_root *root;
782 	struct rb_node *p;
783 	struct fq_flow *f;
784 	unsigned int idx;
785 
786 	sch->q.qlen = 0;
787 	sch->qstats.backlog = 0;
788 
789 	fq_flow_purge(&q->internal);
790 
791 	if (!q->fq_root)
792 		return;
793 
794 	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
795 		root = &q->fq_root[idx];
796 		while ((p = rb_first(root)) != NULL) {
797 			f = rb_entry(p, struct fq_flow, fq_node);
798 			rb_erase(p, root);
799 
800 			fq_flow_purge(f);
801 
802 			kmem_cache_free(fq_flow_cachep, f);
803 		}
804 	}
805 	for (idx = 0; idx < FQ_BANDS; idx++) {
806 		q->band_flows[idx].new_flows.first = NULL;
807 		q->band_flows[idx].old_flows.first = NULL;
808 	}
809 	q->delayed		= RB_ROOT;
810 	q->flows		= 0;
811 	q->inactive_flows	= 0;
812 	q->throttled_flows	= 0;
813 }
814 
815 static void fq_rehash(struct fq_sched_data *q,
816 		      struct rb_root *old_array, u32 old_log,
817 		      struct rb_root *new_array, u32 new_log)
818 {
819 	struct rb_node *op, **np, *parent;
820 	struct rb_root *oroot, *nroot;
821 	struct fq_flow *of, *nf;
822 	int fcnt = 0;
823 	u32 idx;
824 
825 	for (idx = 0; idx < (1U << old_log); idx++) {
826 		oroot = &old_array[idx];
827 		while ((op = rb_first(oroot)) != NULL) {
828 			rb_erase(op, oroot);
829 			of = rb_entry(op, struct fq_flow, fq_node);
830 			if (fq_gc_candidate(of)) {
831 				fcnt++;
832 				kmem_cache_free(fq_flow_cachep, of);
833 				continue;
834 			}
835 			nroot = &new_array[hash_ptr(of->sk, new_log)];
836 
837 			np = &nroot->rb_node;
838 			parent = NULL;
839 			while (*np) {
840 				parent = *np;
841 
842 				nf = rb_entry(parent, struct fq_flow, fq_node);
843 				BUG_ON(nf->sk == of->sk);
844 
845 				if (nf->sk > of->sk)
846 					np = &parent->rb_right;
847 				else
848 					np = &parent->rb_left;
849 			}
850 
851 			rb_link_node(&of->fq_node, parent, np);
852 			rb_insert_color(&of->fq_node, nroot);
853 		}
854 	}
855 	q->flows -= fcnt;
856 	q->inactive_flows -= fcnt;
857 	q->stat_gc_flows += fcnt;
858 }
859 
860 static void fq_free(void *addr)
861 {
862 	kvfree(addr);
863 }
864 
865 static int fq_resize(struct Qdisc *sch, u32 log)
866 {
867 	struct fq_sched_data *q = qdisc_priv(sch);
868 	struct rb_root *array;
869 	void *old_fq_root;
870 	u32 idx;
871 
872 	if (q->fq_root && log == q->fq_trees_log)
873 		return 0;
874 
875 	/* If XPS was setup, we can allocate memory on right NUMA node */
876 	array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
877 			      netdev_queue_numa_node_read(sch->dev_queue));
878 	if (!array)
879 		return -ENOMEM;
880 
881 	for (idx = 0; idx < (1U << log); idx++)
882 		array[idx] = RB_ROOT;
883 
884 	sch_tree_lock(sch);
885 
886 	old_fq_root = q->fq_root;
887 	if (old_fq_root)
888 		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
889 
890 	q->fq_root = array;
891 	q->fq_trees_log = log;
892 
893 	sch_tree_unlock(sch);
894 
895 	fq_free(old_fq_root);
896 
897 	return 0;
898 }
899 
900 static const struct netlink_range_validation iq_range = {
901 	.max = INT_MAX,
902 };
903 
904 static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
905 	[TCA_FQ_UNSPEC]			= { .strict_start_type = TCA_FQ_TIMER_SLACK },
906 
907 	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
908 	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
909 	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
910 	[TCA_FQ_INITIAL_QUANTUM]	= NLA_POLICY_FULL_RANGE(NLA_U32, &iq_range),
911 	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 },
912 	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
913 	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
914 	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
915 	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
916 	[TCA_FQ_ORPHAN_MASK]		= { .type = NLA_U32 },
917 	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
918 	[TCA_FQ_CE_THRESHOLD]		= { .type = NLA_U32 },
919 	[TCA_FQ_TIMER_SLACK]		= { .type = NLA_U32 },
920 	[TCA_FQ_HORIZON]		= { .type = NLA_U32 },
921 	[TCA_FQ_HORIZON_DROP]		= { .type = NLA_U8 },
922 	[TCA_FQ_PRIOMAP]		= NLA_POLICY_EXACT_LEN(sizeof(struct tc_prio_qopt)),
923 	[TCA_FQ_WEIGHTS]		= NLA_POLICY_EXACT_LEN(FQ_BANDS * sizeof(s32)),
924 };
925 
926 /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
927 static void fq_prio2band_compress_crumb(const u8 *in, u8 *out)
928 {
929 	const int num_elems = TC_PRIO_MAX + 1;
930 	int i;
931 
932 	memset(out, 0, num_elems / 4);
933 	for (i = 0; i < num_elems; i++)
934 		out[i / 4] |= in[i] << (2 * (i & 0x3));
935 }
936 
937 static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
938 {
939 	const int num_elems = TC_PRIO_MAX + 1;
940 	int i;
941 
942 	for (i = 0; i < num_elems; i++)
943 		out[i] = fq_prio2band(in, i);
944 }
945 
946 static int fq_load_weights(struct fq_sched_data *q,
947 			   const struct nlattr *attr,
948 			   struct netlink_ext_ack *extack)
949 {
950 	s32 *weights = nla_data(attr);
951 	int i;
952 
953 	for (i = 0; i < FQ_BANDS; i++) {
954 		if (weights[i] < FQ_MIN_WEIGHT) {
955 			NL_SET_ERR_MSG_FMT_MOD(extack, "Weight %d less that minimum allowed %d",
956 					       weights[i], FQ_MIN_WEIGHT);
957 			return -EINVAL;
958 		}
959 	}
960 	for (i = 0; i < FQ_BANDS; i++)
961 		q->band_flows[i].quantum = weights[i];
962 	return 0;
963 }
964 
965 static int fq_load_priomap(struct fq_sched_data *q,
966 			   const struct nlattr *attr,
967 			   struct netlink_ext_ack *extack)
968 {
969 	const struct tc_prio_qopt *map = nla_data(attr);
970 	int i;
971 
972 	if (map->bands != FQ_BANDS) {
973 		NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands");
974 		return -EINVAL;
975 	}
976 	for (i = 0; i < TC_PRIO_MAX + 1; i++) {
977 		if (map->priomap[i] >= FQ_BANDS) {
978 			NL_SET_ERR_MSG_FMT_MOD(extack, "FQ priomap field %d maps to a too high band %d",
979 					       i, map->priomap[i]);
980 			return -EINVAL;
981 		}
982 	}
983 	fq_prio2band_compress_crumb(map->priomap, q->prio2band);
984 	return 0;
985 }
986 
987 static int fq_change(struct Qdisc *sch, struct nlattr *opt,
988 		     struct netlink_ext_ack *extack)
989 {
990 	struct fq_sched_data *q = qdisc_priv(sch);
991 	struct nlattr *tb[TCA_FQ_MAX + 1];
992 	int err, drop_count = 0;
993 	unsigned drop_len = 0;
994 	u32 fq_log;
995 
996 	err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
997 					  NULL);
998 	if (err < 0)
999 		return err;
1000 
1001 	sch_tree_lock(sch);
1002 
1003 	fq_log = q->fq_trees_log;
1004 
1005 	if (tb[TCA_FQ_BUCKETS_LOG]) {
1006 		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
1007 
1008 		if (nval >= 1 && nval <= ilog2(256*1024))
1009 			fq_log = nval;
1010 		else
1011 			err = -EINVAL;
1012 	}
1013 	if (tb[TCA_FQ_PLIMIT])
1014 		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
1015 
1016 	if (tb[TCA_FQ_FLOW_PLIMIT])
1017 		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
1018 
1019 	if (tb[TCA_FQ_QUANTUM]) {
1020 		u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
1021 
1022 		if (quantum > 0 && quantum <= (1 << 20)) {
1023 			q->quantum = quantum;
1024 		} else {
1025 			NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
1026 			err = -EINVAL;
1027 		}
1028 	}
1029 
1030 	if (tb[TCA_FQ_INITIAL_QUANTUM])
1031 		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
1032 
1033 	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
1034 		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
1035 				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
1036 
1037 	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
1038 		u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
1039 
1040 		q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
1041 	}
1042 	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
1043 		q->low_rate_threshold =
1044 			nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
1045 
1046 	if (tb[TCA_FQ_RATE_ENABLE]) {
1047 		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
1048 
1049 		if (enable <= 1)
1050 			q->rate_enable = enable;
1051 		else
1052 			err = -EINVAL;
1053 	}
1054 
1055 	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
1056 		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
1057 
1058 		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
1059 	}
1060 
1061 	if (!err && tb[TCA_FQ_PRIOMAP])
1062 		err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack);
1063 
1064 	if (!err && tb[TCA_FQ_WEIGHTS])
1065 		err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack);
1066 
1067 	if (tb[TCA_FQ_ORPHAN_MASK])
1068 		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
1069 
1070 	if (tb[TCA_FQ_CE_THRESHOLD])
1071 		q->ce_threshold = (u64)NSEC_PER_USEC *
1072 				  nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
1073 
1074 	if (tb[TCA_FQ_TIMER_SLACK])
1075 		q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);
1076 
1077 	if (tb[TCA_FQ_HORIZON])
1078 		q->horizon = (u64)NSEC_PER_USEC *
1079 				  nla_get_u32(tb[TCA_FQ_HORIZON]);
1080 
1081 	if (tb[TCA_FQ_HORIZON_DROP])
1082 		q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]);
1083 
1084 	if (!err) {
1085 
1086 		sch_tree_unlock(sch);
1087 		err = fq_resize(sch, fq_log);
1088 		sch_tree_lock(sch);
1089 	}
1090 	while (sch->q.qlen > sch->limit) {
1091 		struct sk_buff *skb = fq_dequeue(sch);
1092 
1093 		if (!skb)
1094 			break;
1095 		drop_len += qdisc_pkt_len(skb);
1096 		rtnl_kfree_skbs(skb, skb);
1097 		drop_count++;
1098 	}
1099 	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
1100 
1101 	sch_tree_unlock(sch);
1102 	return err;
1103 }
1104 
1105 static void fq_destroy(struct Qdisc *sch)
1106 {
1107 	struct fq_sched_data *q = qdisc_priv(sch);
1108 
1109 	fq_reset(sch);
1110 	fq_free(q->fq_root);
1111 	qdisc_watchdog_cancel(&q->watchdog);
1112 }
1113 
1114 static int fq_init(struct Qdisc *sch, struct nlattr *opt,
1115 		   struct netlink_ext_ack *extack)
1116 {
1117 	struct fq_sched_data *q = qdisc_priv(sch);
1118 	int i, err;
1119 
1120 	sch->limit		= 10000;
1121 	q->flow_plimit		= 100;
1122 	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
1123 	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
1124 	q->flow_refill_delay	= msecs_to_jiffies(40);
1125 	q->flow_max_rate	= ~0UL;
1126 	q->time_next_delayed_flow = ~0ULL;
1127 	q->rate_enable		= 1;
1128 	for (i = 0; i < FQ_BANDS; i++) {
1129 		q->band_flows[i].new_flows.first = NULL;
1130 		q->band_flows[i].old_flows.first = NULL;
1131 	}
1132 	q->band_flows[0].quantum = 9 << 16;
1133 	q->band_flows[1].quantum = 3 << 16;
1134 	q->band_flows[2].quantum = 1 << 16;
1135 	q->delayed		= RB_ROOT;
1136 	q->fq_root		= NULL;
1137 	q->fq_trees_log		= ilog2(1024);
1138 	q->orphan_mask		= 1024 - 1;
1139 	q->low_rate_threshold	= 550000 / 8;
1140 
1141 	q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */
1142 
1143 	q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
1144 	q->horizon_drop = 1; /* by default, drop packets beyond horizon */
1145 
1146 	/* Default ce_threshold of 4294 seconds */
1147 	q->ce_threshold		= (u64)NSEC_PER_USEC * ~0U;
1148 
1149 	fq_prio2band_compress_crumb(sch_default_prio2band, q->prio2band);
1150 	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
1151 
1152 	if (opt)
1153 		err = fq_change(sch, opt, extack);
1154 	else
1155 		err = fq_resize(sch, q->fq_trees_log);
1156 
1157 	return err;
1158 }
1159 
1160 static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
1161 {
1162 	struct fq_sched_data *q = qdisc_priv(sch);
1163 	u64 ce_threshold = q->ce_threshold;
1164 	struct tc_prio_qopt prio = {
1165 		.bands = FQ_BANDS,
1166 	};
1167 	u64 horizon = q->horizon;
1168 	struct nlattr *opts;
1169 	s32 weights[3];
1170 
1171 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
1172 	if (opts == NULL)
1173 		goto nla_put_failure;
1174 
1175 	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
1176 
1177 	do_div(ce_threshold, NSEC_PER_USEC);
1178 	do_div(horizon, NSEC_PER_USEC);
1179 
1180 	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
1181 	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
1182 	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
1183 	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
1184 	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
1185 	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
1186 			min_t(unsigned long, q->flow_max_rate, ~0U)) ||
1187 	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
1188 			jiffies_to_usecs(q->flow_refill_delay)) ||
1189 	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
1190 	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
1191 			q->low_rate_threshold) ||
1192 	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
1193 	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
1194 	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) ||
1195 	    nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
1196 	    nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
1197 		goto nla_put_failure;
1198 
1199 	fq_prio2band_decompress_crumb(q->prio2band, prio.priomap);
1200 	if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio))
1201 		goto nla_put_failure;
1202 
1203 	weights[0] = q->band_flows[0].quantum;
1204 	weights[1] = q->band_flows[1].quantum;
1205 	weights[2] = q->band_flows[2].quantum;
1206 	if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights))
1207 		goto nla_put_failure;
1208 
1209 	return nla_nest_end(skb, opts);
1210 
1211 nla_put_failure:
1212 	return -1;
1213 }
1214 
1215 static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1216 {
1217 	struct fq_sched_data *q = qdisc_priv(sch);
1218 	struct tc_fq_qd_stats st;
1219 	int i;
1220 
1221 	st.pad = 0;
1222 
1223 	sch_tree_lock(sch);
1224 
1225 	st.gc_flows		  = q->stat_gc_flows;
1226 	st.highprio_packets	  = 0;
1227 	st.fastpath_packets	  = q->internal.stat_fastpath_packets;
1228 	st.tcp_retrans		  = 0;
1229 	st.throttled		  = q->stat_throttled;
1230 	st.flows_plimit		  = q->stat_flows_plimit;
1231 	st.pkts_too_long	  = q->stat_pkts_too_long;
1232 	st.allocation_errors	  = q->stat_allocation_errors;
1233 	st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
1234 				    ktime_get_ns();
1235 	st.flows		  = q->flows;
1236 	st.inactive_flows	  = q->inactive_flows;
1237 	st.throttled_flows	  = q->throttled_flows;
1238 	st.unthrottle_latency_ns  = min_t(unsigned long,
1239 					  q->unthrottle_latency_ns, ~0U);
1240 	st.ce_mark		  = q->stat_ce_mark;
1241 	st.horizon_drops	  = q->stat_horizon_drops;
1242 	st.horizon_caps		  = q->stat_horizon_caps;
1243 	for (i = 0; i < FQ_BANDS; i++) {
1244 		st.band_drops[i]  = q->stat_band_drops[i];
1245 		st.band_pkt_count[i] = q->band_pkt_count[i];
1246 	}
1247 	sch_tree_unlock(sch);
1248 
1249 	return gnet_stats_copy_app(d, &st, sizeof(st));
1250 }
1251 
1252 static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
1253 	.id		=	"fq",
1254 	.priv_size	=	sizeof(struct fq_sched_data),
1255 
1256 	.enqueue	=	fq_enqueue,
1257 	.dequeue	=	fq_dequeue,
1258 	.peek		=	qdisc_peek_dequeued,
1259 	.init		=	fq_init,
1260 	.reset		=	fq_reset,
1261 	.destroy	=	fq_destroy,
1262 	.change		=	fq_change,
1263 	.dump		=	fq_dump,
1264 	.dump_stats	=	fq_dump_stats,
1265 	.owner		=	THIS_MODULE,
1266 };
1267 
1268 static int __init fq_module_init(void)
1269 {
1270 	int ret;
1271 
1272 	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
1273 					   sizeof(struct fq_flow),
1274 					   0, SLAB_HWCACHE_ALIGN, NULL);
1275 	if (!fq_flow_cachep)
1276 		return -ENOMEM;
1277 
1278 	ret = register_qdisc(&fq_qdisc_ops);
1279 	if (ret)
1280 		kmem_cache_destroy(fq_flow_cachep);
1281 	return ret;
1282 }
1283 
1284 static void __exit fq_module_exit(void)
1285 {
1286 	unregister_qdisc(&fq_qdisc_ops);
1287 	kmem_cache_destroy(fq_flow_cachep);
1288 }
1289 
1290 module_init(fq_module_init)
1291 module_exit(fq_module_exit)
1292 MODULE_AUTHOR("Eric Dumazet");
1293 MODULE_LICENSE("GPL");
1294 MODULE_DESCRIPTION("Fair Queue Packet Scheduler");
1295