/*
 * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $FreeBSD$
 */

#ifdef _KERNEL
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <net/if.h>	/* IFNAMSIZ */
#include <netinet/in.h>
#include <netinet/ip_var.h>		/* ipfw_rule_ref */
#include <netinet/ip_fw.h>	/* flow_id */
#include <netinet/ip_dummynet.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
#ifdef NEW_AQM
#include <netpfil/ipfw/dn_aqm.h>
#endif
#include <netpfil/ipfw/dn_sched.h>
#else
#include <dn_test.h>
#endif

#ifdef QFQ_DEBUG
#define _P64	unsigned long long	/* cast for printing uint64_t */
struct qfq_sched;
static void dump_sched(struct qfq_sched *q, const char *msg);
#define	NO(x)	x
#else
#define NO(x)
#endif
#define DN_SCHED_QFQ	4 // XXX Where?
typedef	unsigned long	bitmap;

/*
 * Bitmap ops are critical. Some Linux versions have __fls
 * and the bitmap ops; some machines have ffs.
 * NOTE: fls() returns 1 for the least significant bit,
 *       __fls() returns 0 for the same case.
 * We use the base-0 version __fls() to match the description in
 * the ToN QFQ paper.
 */
#if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24))
int fls(unsigned int n)
{
	int i = 0;
	for (i = 0; n > 0; n >>= 1, i++)
		;
	return i;
}
#endif

#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24))
static inline unsigned long __fls(unsigned long word)
{
	return fls(word) - 1;
}
#endif
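
/*
 * Illustrative check of the two conventions above (comment added for
 * clarity, not in the original): for the same word the results differ
 * by one, e.g.
 *
 *	fls(0x10) == 5	(base-1 position of the msb)
 *	__fls(0x10) == 4	(base-0 position of the same bit)
 *	fls(1) == 1, __fls(1) == 0
 *
 * The group-index computations below want log_2() semantics, hence __fls().
 */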

#if !defined(_KERNEL) || !defined(__linux__)
#ifdef QFQ_DEBUG
static int test_bit(int ix, bitmap *p)
{
	if (ix < 0 || ix > 31)
		D("bad index %d", ix);
	return *p & (1<<ix);
}
static void __set_bit(int ix, bitmap *p)
{
	if (ix < 0 || ix > 31)
		D("bad index %d", ix);
	*p |= (1<<ix);
}
static void __clear_bit(int ix, bitmap *p)
{
	if (ix < 0 || ix > 31)
		D("bad index %d", ix);
	*p &= ~(1<<ix);
}
#else /* !QFQ_DEBUG */
/* XXX do we have a fast version, or leave it to the compiler? */
#define test_bit(ix, pData)	((*pData) & (1<<(ix)))
#define __set_bit(ix, pData)	(*pData) |= (1<<(ix))
#define __clear_bit(ix, pData)	(*pData) &= ~(1<<(ix))
#endif /* !QFQ_DEBUG */
#endif /* !__linux__ */

#ifdef __MIPSEL__
#define __clear_bit(ix, pData)	(*pData) &= ~(1<<(ix))
#endif

/*-------------------------------------------*/
/*

Virtual time computations.

S, F and V are all computed in fixed point arithmetic with
FRAC_BITS fractional bits.

   QFQ_MAX_INDEX is the maximum index allowed for a group. We need
  	one bit per index.
   QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
   The layout of the bits is as below:

                   [ MTU_SHIFT ][      FRAC_BITS    ]
                   [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
  				 ^.__grp->index = 0
  				 *.__grp->slot_shift

   where MIN_SLOT_SHIFT is derived by difference from the others.

The max group index corresponds to Lmax/w_min, where
Lmax = 1<<MTU_SHIFT and w_min = 1.
From this, and knowing how many groups (MAX_INDEX) we want,
we can derive the shift corresponding to each group.

Because we often need to compute
	F = S + len/w_i  and V = V + len/wsum
instead of storing w_i we store the value
	inv_w = (1<<FRAC_BITS)/w_i
so we can do F = S + len * inv_w.
We use W_TOT in the formulas so we can easily move between
static and adaptive weight sum.

The per-scheduler-instance data contain all the data structures
for the scheduler: bitmaps and bucket lists.

 */
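
/*
 * Worked fixed point example (an illustrative addition, using the
 * constants defined below): with FRAC_BITS = 30, a flow with weight
 * w = 4 stores inv_w = ONE_FP/4 = 1<<28, so a len = 1500 byte packet
 * advances its finish time by
 *
 *	len * inv_w = 1500 << 28 = (1500/4) << 30
 *
 * i.e. len/w = 375 expressed in FRAC_BITS fixed point, exactly the
 * F = S + len/w_i above. V advances the same way via iwsum = ONE_FP/wsum.
 */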
/*
 * Maximum number of consecutive slots occupied by backlogged classes
 * inside a group. This is approx lmax/lmin + 5.
 * XXX check because it poses constraints on MAX_INDEX
 */
#define QFQ_MAX_SLOTS	32
/*
 * Shifts used for class<->group mapping. Class weights are
 * in the range [1, QFQ_MAX_WEIGHT], and we need to map each class i to
 * the group with the smallest index that can support the L_i / r_i
 * configured for the class.
 *
 * grp->index is the index of the group; and grp->slot_shift
 * is the shift for the corresponding (scaled) sigma_i.
 *
 * When computing the group index, we do (len<<FP_SHIFT)/weight,
 * then compute an FLS (which is like a log2()), and if the result
 * is below the MAX_INDEX region we use 0 (which is the same as
 * using a larger len).
 */
#define QFQ_MAX_INDEX		19
#define QFQ_MAX_WSHIFT		16	/* log2(max_weight) */

#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT)
#define QFQ_MAX_WSUM		(2*QFQ_MAX_WEIGHT)

#define FRAC_BITS		30	/* fixed point arithmetic */
#define ONE_FP			(1UL << FRAC_BITS)

#define QFQ_MTU_SHIFT		11	/* log2(max_len) */
#define QFQ_MIN_SLOT_SHIFT	(FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
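
/*
 * With the values above, QFQ_MIN_SLOT_SHIFT = 30 + 11 - 19 = 22: group 0
 * uses slots of 2^22 fixed point units, and each subsequent group doubles
 * the slot size (see the slot_shift initialization in qfq_new_sched()).
 */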

/*
 * Possible group states, also indexes for the bitmaps array in
 * struct qfq_queue. We rely on ER, IR, EB, IB being numbered 0..3
 */
enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
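
/*
 * Naming, for reference (comment added, following the QFQ paper):
 * E/I = eligible/ineligible, i.e. whether grp->S <= q->V;
 * R/B = ready/blocked, i.e. whether an eligible group with a smaller
 * finish time stands in front of this one. The 0..3 encoding matters:
 * bit 0 set means ineligible, bit 1 set means blocked, which is
 * exactly what qfq_calc_state() computes.
 */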

struct qfq_group;
/*
 * Additional queue info. Some of this info comes from the flowset;
 * we copy it here for faster processing.
 * This is an overlay of the struct dn_queue
 */
struct qfq_class {
	struct dn_queue _q;
	uint64_t S, F;		/* flow timestamps (exact) */
	struct qfq_class *next; /* Link for the slot list. */

	/* group we belong to. In principle we would need the index,
	 * which is log_2(lmax/weight), but we never reference it
	 * directly, only the group.
	 */
	struct qfq_group *grp;

	/* these are copied from the flowset. */
	uint32_t	inv_w;	/* ONE_FP/weight */
	uint32_t 	lmax;	/* Max packet size for this flow. */
};

/* Group descriptor, see the paper for details.
 * Basically this contains the bucket lists
 */
struct qfq_group {
	uint64_t S, F;			/* group timestamps (approx). */
	unsigned int slot_shift;	/* Slot shift. */
	unsigned int index;		/* Group index. */
	unsigned int front;		/* Index of the front slot. */
	bitmap full_slots;		/* non-empty slots */

	/* Array of lists of active classes. */
	struct qfq_class *slots[QFQ_MAX_SLOTS];
};

/* scheduler instance descriptor. */
struct qfq_sched {
	uint64_t	V;		/* Precise virtual time. */
	uint32_t	wsum;		/* weight sum */
	uint32_t	iwsum;		/* inverse weight sum */
	NO(uint32_t	i_wsum;)	/* ONE_FP/w_sum */
	NO(uint32_t	queued;)	/* debugging */
	NO(uint32_t	loops;)		/* debugging */
	bitmap bitmaps[QFQ_MAX_STATE];	/* Group bitmaps. */
	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
};

/*---- support functions ----------------------------*/

/* Generic comparison function, handling wraparound. */
static inline int qfq_gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}
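
/*
 * Example (illustrative): timestamps may wrap around 2^64, so
 * qfq_gt(0, ~0ULL) == 1 because 0 - (2^64 - 1) equals 1 when
 * reinterpreted as int64_t; a plain 'a > b' would get this wrong.
 */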

/* Round a precise timestamp to its slotted value. */
static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift)
{
	return ts & ~((1ULL << shift) - 1);
}
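
/* E.g. qfq_round_down(29, 3) == 24: the low 'shift' bits are cleared. */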

/* return the pointer to the group with lowest index in the bitmap */
static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
					unsigned long bitmap)
{
	int index = ffs(bitmap) - 1; // zero-based
	return &q->groups[index];
}

/*
 * Calculate a flow index, given its weight and maximum packet length.
 * index = log_2(maxlen/weight) but we need to apply the scaling.
 * This is used only once at flow creation.
 */
static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen)
{
	uint64_t slot_size = (uint64_t)maxlen * inv_w;
	unsigned long size_map;
	int index = 0;

	size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT);
	if (!size_map)
		goto out;

	index = __fls(size_map) + 1;	// basically a log_2()
	index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));

	if (index < 0)
		index = 0;

out:
	ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index);
	return index;
}
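
/*
 * Worked example of the mapping above (illustrative addition, compiled
 * out; the numbers follow from FRAC_BITS = 30 and QFQ_MIN_SLOT_SHIFT = 22).
 */
#if 0	/* example only */
static void
qfq_calc_index_example(void)
{
	/* w = 1: slot_size = 1500 << 30, size_map = 1500 << 8,
	 * __fls() gives 18, so index = 19, i.e. QFQ_MAX_INDEX. */
	int a = qfq_calc_index(ONE_FP, 1500);

	/* maxlen = 1024 makes slot_size exactly 1 << 40; the second
	 * adjustment fires and the flow fits in group 18 instead. */
	int b = qfq_calc_index(ONE_FP, 1024);
}
#endif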
/*---- end support functions ----*/

/*-------- API calls --------------------------------*/
/*
 * Validate and copy parameters from flowset.
 */
static int
qfq_new_queue(struct dn_queue *_q)
{
	struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
	struct qfq_class *cl = (struct qfq_class *)_q;
	int i;
	uint32_t w;	/* approximated weight */

	/* import parameters from the flowset. They should be correct
	 * already.
	 */
	w = _q->fs->fs.par[0];
	cl->lmax = _q->fs->fs.par[1];
	if (!w || w > QFQ_MAX_WEIGHT) {
		w = 1;
		D("rounding weight to 1");
	}
	cl->inv_w = ONE_FP/w;
	w = ONE_FP/cl->inv_w;
	if (q->wsum + w > QFQ_MAX_WSUM)
		return EINVAL;

	i = qfq_calc_index(cl->inv_w, cl->lmax);
	cl->grp = &q->groups[i];
	q->wsum += w;
	q->iwsum = ONE_FP / q->wsum; /* XXX note theory */
	// XXX cl->S = q->V; ?
	return 0;
}
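
/*
 * For context (an illustrative note, not from the original file): the two
 * flowset parameters consumed above are par[0], the queue weight, and
 * par[1], the maximum packet length (see qfq_new_fsk() below). In a
 * typical dummynet setup they would come from an ipfw configuration
 * along the lines of
 *
 *	ipfw sched 10 config type qfq
 *	ipfw queue 1 config sched 10 weight 5
 *
 * (a sketch only; see ipfw(8) for the authoritative syntax).
 */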

/* remove an empty queue */
static int
qfq_free_queue(struct dn_queue *_q)
{
	struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
	struct qfq_class *cl = (struct qfq_class *)_q;
	if (cl->inv_w) {
		q->wsum -= ONE_FP/cl->inv_w;
		if (q->wsum != 0)
			q->iwsum = ONE_FP / q->wsum;
		cl->inv_w = 0; /* reset weight so we do not run twice */
	}
	return 0;
}

/* Calculate a mask to mimic what would be ffs_from(). */
static inline unsigned long
mask_from(unsigned long bitmap, int from)
{
	return bitmap & ~((1UL << from) - 1);
}
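
/* E.g. mask_from(0x36, 2) == 0x34: bits below position 'from' are cleared. */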

/*
 * The state computation relies on ER=0, IR=1, EB=2, IB=3
 * First compute eligibility comparing grp->S, q->V,
 * then check if someone is blocking us and possibly add EB
 */
static inline unsigned int
qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp)
{
	/* if S > V we are not eligible */
	unsigned int state = qfq_gt(grp->S, q->V);
	unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
	struct qfq_group *next;

	if (mask) {
		next = qfq_ffs(q, mask);
		if (qfq_gt(grp->F, next->F))
			state |= EB;
	}

	return state;
}

/*
 * In principle
 *	q->bitmaps[dst] |= q->bitmaps[src] & mask;
 *	q->bitmaps[src] &= ~mask;
 * but we should make sure that src != dst
 */
static inline void
qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst)
{
	q->bitmaps[dst] |= q->bitmaps[src] & mask;
	q->bitmaps[src] &= ~mask;
}

static inline void
qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish)
{
	unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
	struct qfq_group *next;

	if (mask) {
		next = qfq_ffs(q, mask);
		if (!qfq_gt(next->F, old_finish))
			return;
	}

	mask = (1UL << index) - 1;
	qfq_move_groups(q, mask, EB, ER);
	qfq_move_groups(q, mask, IB, IR);
}

/*
 * perhaps
 *
	old_V ^= q->V;
	old_V >>= QFQ_MIN_SLOT_SHIFT;
	if (old_V) {
		...
	}
 *
 */
static inline void
qfq_make_eligible(struct qfq_sched *q, uint64_t old_V)
{
	unsigned long mask, vslot, old_vslot;

	vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
	old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;

	if (vslot != old_vslot) {
		/* must be 2ULL, see ToN QFQ article fig.5, we use base-0 fls */
		mask = (2ULL << (__fls(vslot ^ old_vslot))) - 1;
		qfq_move_groups(q, mask, IR, ER);
		qfq_move_groups(q, mask, IB, EB);
	}
}

/*
 * XXX we should make sure that slot stays below QFQ_MAX_SLOTS (32);
 * this is guaranteed by the input values.
 * roundedS is always cl->S rounded on grp->slot_shift bits.
 */
static inline void
qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS)
{
	uint64_t slot = (roundedS - grp->S) >> grp->slot_shift;
	unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS;

	cl->next = grp->slots[i];
	grp->slots[i] = cl;
	__set_bit(slot, &grp->full_slots);
}

/*
 * remove the entry from the slot
 */
static inline void
qfq_front_slot_remove(struct qfq_group *grp)
{
	struct qfq_class **h = &grp->slots[grp->front];

	*h = (*h)->next;
	if (!*h)
		__clear_bit(0, &grp->full_slots);
}

/*
 * Returns the first full queue in a group. As a side effect,
 * adjust the bucket list so the first non-empty bucket is at
 * position 0 in full_slots.
 */
static inline struct qfq_class *
qfq_slot_scan(struct qfq_group *grp)
{
	int i;

	ND("grp %d full %x", grp->index, grp->full_slots);
	if (!grp->full_slots)
		return NULL;

	i = ffs(grp->full_slots) - 1; // zero-based
	if (i > 0) {
		grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
		grp->full_slots >>= i;
	}

	return grp->slots[grp->front];
}

/*
 * adjust the bucket list. When the start time of a group decreases,
 * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
 * move the objects. The mask of occupied slots must be shifted
 * because we use ffs() to find the first non-empty slot.
 * This covers decreases in the group's start time, but what about
 * increases of the start time?
 * Here too we should make sure that i is less than 32.
 */
static inline void
qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS)
{
	unsigned int i = (grp->S - roundedS) >> grp->slot_shift;

	(void)q;
	grp->full_slots <<= i;
	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
}


static inline void
qfq_update_eligible(struct qfq_sched *q, uint64_t old_V)
{
	bitmap ineligible;

	ineligible = q->bitmaps[IR] | q->bitmaps[IB];
	if (ineligible) {
		if (!q->bitmaps[ER]) {
			struct qfq_group *grp;
			grp = qfq_ffs(q, ineligible);
			if (qfq_gt(grp->S, q->V))
				q->V = grp->S;
		}
		qfq_make_eligible(q, old_V);
	}
}

/*
 * Updates the class, returns true if the group also needs to be updated.
 */
static inline int
qfq_update_class(struct qfq_sched *q, struct qfq_group *grp,
	    struct qfq_class *cl)
{

	(void)q;
	cl->S = cl->F;
	if (cl->_q.mq.head == NULL)  {
		qfq_front_slot_remove(grp);
	} else {
		unsigned int len;
		uint64_t roundedS;

		len = cl->_q.mq.head->m_pkthdr.len;
		cl->F = cl->S + (uint64_t)len * cl->inv_w;
		roundedS = qfq_round_down(cl->S, grp->slot_shift);
		if (roundedS == grp->S)
			return 0;

		qfq_front_slot_remove(grp);
		qfq_slot_insert(grp, cl, roundedS);
	}
	return 1;
}

static struct mbuf *
qfq_dequeue(struct dn_sch_inst *si)
{
	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
	struct qfq_group *grp;
	struct qfq_class *cl;
	struct mbuf *m;
	uint64_t old_V;

	NO(q->loops++;)
	if (!q->bitmaps[ER]) {
		NO(if (q->queued)
			dump_sched(q, "start dequeue");)
		return NULL;
	}

	grp = qfq_ffs(q, q->bitmaps[ER]);

	cl = grp->slots[grp->front];
	/* extract from the first bucket in the bucket list */
	m = dn_dequeue(&cl->_q);

	if (!m) {
		D("BUG/* non-workconserving leaf */");
		return NULL;
	}
	NO(q->queued--;)
	old_V = q->V;
	q->V += (uint64_t)m->m_pkthdr.len * q->iwsum;
	ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V);

	if (qfq_update_class(q, grp, cl)) {
		uint64_t old_F = grp->F;
		cl = qfq_slot_scan(grp);
		if (!cl) { /* group gone, remove from ER */
			__clear_bit(grp->index, &q->bitmaps[ER]);
			// grp->S = grp->F + 1; // XXX debugging only
		} else {
			uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift);
			unsigned int s;

			if (grp->S == roundedS)
				goto skip_unblock;
			grp->S = roundedS;
			grp->F = roundedS + (2ULL << grp->slot_shift);
			/* remove from ER and put in the new set */
			__clear_bit(grp->index, &q->bitmaps[ER]);
			s = qfq_calc_state(q, grp);
			__set_bit(grp->index, &q->bitmaps[s]);
		}
		/* we need to unblock even if the group has gone away */
		qfq_unblock_groups(q, grp->index, old_F);
	}

skip_unblock:
	qfq_update_eligible(q, old_V);
	NO(if (!q->bitmaps[ER] && q->queued)
		dump_sched(q, "end dequeue");)

	return m;
}

/*
 * Assign a reasonable start time for a new flow k in group i.
 * Admissible values for \hat(F) are multiples of \sigma_i
 * no greater than V+\sigma_i. Larger values mean that
 * we had a wraparound so we consider the timestamp to be stale.
 *
 * If F is not stale and F >= V then we set S = F.
 * Otherwise we should assign S = V, but this may violate
 * the ordering in ER. So, if we have groups in ER, set S to
 * the F_j of the first group j which would be blocking us.
 * We are guaranteed not to move S backward because
 * otherwise our group i would still be blocked.
 */
static inline void
qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
{
	unsigned long mask;
	uint64_t limit, roundedF;
	int slot_shift = cl->grp->slot_shift;

	roundedF = qfq_round_down(cl->F, slot_shift);
	limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);

	if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
		/* timestamp was stale */
		mask = mask_from(q->bitmaps[ER], cl->grp->index);
		if (mask) {
			struct qfq_group *next = qfq_ffs(q, mask);
			if (qfq_gt(roundedF, next->F)) {
				/* from pv 71261956973ba9e0637848a5adb4a5819b4bae83 */
				if (qfq_gt(limit, next->F))
					cl->S = next->F;
				else /* preserve timestamp correctness */
					cl->S = limit;
				return;
			}
		}
		cl->S = q->V;
	} else { /* timestamp is not stale */
		cl->S = cl->F;
	}
}

static int
qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m)
{
	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
	struct qfq_group *grp;
	struct qfq_class *cl = (struct qfq_class *)_q;
	uint64_t roundedS;
	int s;

	NO(q->loops++;)
	DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len,
		_q, cl->inv_w, cl->grp->index);
	/* XXX verify that the packet obeys the parameters */
	if (m != _q->mq.head) {
		if (dn_enqueue(_q, m, 0)) /* packet was dropped */
			return 1;
		NO(q->queued++;)
		if (m != _q->mq.head)
			return 0;
	}
	/* If we reach this point, queue q was idle */
	grp = cl->grp;
	qfq_update_start(q, cl); /* adjust start time */
	/* compute new finish time and rounded start. */
	cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w;
	roundedS = qfq_round_down(cl->S, grp->slot_shift);

	/*
	 * insert cl in the correct bucket.
	 * If cl->S >= grp->S we don't need to adjust the
	 * bucket list and simply go to the insertion phase.
	 * Otherwise grp->S is decreasing, we must make room
	 * in the bucket list, and also recompute the group state.
	 * Finally, if there were no flows in this group and nobody
	 * was in ER make sure to adjust V.
	 */
	if (grp->full_slots) {
		if (!qfq_gt(grp->S, cl->S))
			goto skip_update;
		/* create a slot for this cl->S */
		qfq_slot_rotate(q, grp, roundedS);
		/* group was surely ineligible, remove */
		__clear_bit(grp->index, &q->bitmaps[IR]);
		__clear_bit(grp->index, &q->bitmaps[IB]);
	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
		q->V = roundedS;

	grp->S = roundedS;
	grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i
	s = qfq_calc_state(q, grp);
	__set_bit(grp->index, &q->bitmaps[s]);
	ND("new state %d 0x%x", s, q->bitmaps[s]);
	ND("S %llx F %llx V %llx", cl->S, cl->F, q->V);
skip_update:
	qfq_slot_insert(grp, cl, roundedS);

	return 0;
}


#if 0
static inline void
qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
	struct qfq_class *cl, struct qfq_class **pprev)
{
	unsigned int i, offset;
	uint64_t roundedS;

	roundedS = qfq_round_down(cl->S, grp->slot_shift);
	offset = (roundedS - grp->S) >> grp->slot_shift;
	i = (grp->front + offset) % QFQ_MAX_SLOTS;

#ifdef notyet
	if (!pprev) {
		pprev = &grp->slots[i];
		while (*pprev && *pprev != cl)
			pprev = &(*pprev)->next;
	}
#endif

	*pprev = cl->next;
	if (!grp->slots[i])
		__clear_bit(offset, &grp->full_slots);
}

/*
 * called to forcibly destroy a queue.
 * If the queue is not in the front bucket, or if it has
 * other queues in the front bucket, we can simply remove
 * the queue with no other side effects.
 * Otherwise we must propagate the event up.
 * XXX description to be completed.
 */
static void
qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
				 struct qfq_class **pprev)
{
	struct qfq_group *grp = &q->groups[cl->index];
	unsigned long mask;
	uint64_t roundedS;
	int s;

	cl->F = cl->S;	// not needed if the class goes away.
	qfq_slot_remove(q, grp, cl, pprev);

	if (!grp->full_slots) {
		/* nothing left in the group, remove from all sets.
		 * Do ER last because if we were blocking other groups
		 * we must unblock them.
		 */
		__clear_bit(grp->index, &q->bitmaps[IR]);
		__clear_bit(grp->index, &q->bitmaps[EB]);
		__clear_bit(grp->index, &q->bitmaps[IB]);

		if (test_bit(grp->index, &q->bitmaps[ER]) &&
		    !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
			mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
			if (mask)
				mask = ~((1UL << __fls(mask)) - 1);
			else
				mask = ~0UL;
			qfq_move_groups(q, mask, EB, ER);
			qfq_move_groups(q, mask, IB, IR);
		}
		__clear_bit(grp->index, &q->bitmaps[ER]);
	} else if (!grp->slots[grp->front]) {
		cl = qfq_slot_scan(grp);
		roundedS = qfq_round_down(cl->S, grp->slot_shift);
		if (grp->S != roundedS) {
			__clear_bit(grp->index, &q->bitmaps[ER]);
			__clear_bit(grp->index, &q->bitmaps[IR]);
			__clear_bit(grp->index, &q->bitmaps[EB]);
			__clear_bit(grp->index, &q->bitmaps[IB]);
			grp->S = roundedS;
			grp->F = roundedS + (2ULL << grp->slot_shift);
			s = qfq_calc_state(q, grp);
			__set_bit(grp->index, &q->bitmaps[s]);
		}
	}
	qfq_update_eligible(q, q->V);
}
#endif

static int
qfq_new_fsk(struct dn_fsk *f)
{
	ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight");
	ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen");
	ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]);
	return 0;
}

/*
 * initialize a new scheduler instance
 */
static int
qfq_new_sched(struct dn_sch_inst *si)
{
	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
	struct qfq_group *grp;
	int i;

	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
		grp = &q->groups[i];
		grp->index = i;
		/* equivalent to QFQ_MIN_SLOT_SHIFT + i */
		grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS -
					(QFQ_MAX_INDEX - i);
	}
	return 0;
}

/*
 * QFQ scheduler descriptor
 */
static struct dn_alg qfq_desc = {
	_SI( .type = ) DN_SCHED_QFQ,
	_SI( .name = ) "QFQ",
	_SI( .flags = ) DN_MULTIQUEUE,

	_SI( .schk_datalen = ) 0,
	_SI( .si_datalen = ) sizeof(struct qfq_sched),
	_SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue),

	_SI( .enqueue = ) qfq_enqueue,
	_SI( .dequeue = ) qfq_dequeue,

	_SI( .config = )  NULL,
	_SI( .destroy = )  NULL,
	_SI( .new_sched = ) qfq_new_sched,
	_SI( .free_sched = )  NULL,
	_SI( .new_fsk = ) qfq_new_fsk,
	_SI( .free_fsk = )  NULL,
	_SI( .new_queue = ) qfq_new_queue,
	_SI( .free_queue = ) qfq_free_queue,
#ifdef NEW_AQM
	_SI( .getconfig = )  NULL,
#endif
};

DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc);

#ifdef QFQ_DEBUG
static void
dump_groups(struct qfq_sched *q, uint32_t mask)
{
	int i, j;

	for (i = 0; i < QFQ_MAX_INDEX + 1; i++) {
		struct qfq_group *g = &q->groups[i];

		if (0 == (mask & (1<<i)))
			continue;
		for (j = 0; j < QFQ_MAX_SLOTS; j++) {
			if (g->slots[j])
				D("    bucket %d %p", j, g->slots[j]);
		}
		D("full_slots 0x%llx", (_P64)g->full_slots);
		D("        %2d S 0x%20llx F 0x%llx %c", i,
			(_P64)g->S, (_P64)g->F,
			mask & (1<<i) ? '1' : '0');
	}
}

static void
dump_sched(struct qfq_sched *q, const char *msg)
{
	D("--- in %s: ---", msg);
	D("loops %d queued %d V 0x%llx", q->loops, q->queued, (_P64)q->V);
	D("    ER 0x%08x", (unsigned)q->bitmaps[ER]);
	D("    EB 0x%08x", (unsigned)q->bitmaps[EB]);
	D("    IR 0x%08x", (unsigned)q->bitmaps[IR]);
	D("    IB 0x%08x", (unsigned)q->bitmaps[IB]);
	dump_groups(q, 0xffffffff);
}
#endif /* QFQ_DEBUG */