/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
 * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 */

#ifdef _KERNEL
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/rwlock.h>
#include <net/if.h>	/* IFNAMSIZ */
#include <netinet/in.h>
#include <netinet/ip_var.h>	/* ipfw_rule_ref */
#include <netinet/ip_fw.h>	/* flow_id */
#include <netinet/ip_dummynet.h>
#include <netpfil/ipfw/ip_fw_private.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
#ifdef NEW_AQM
#include <netpfil/ipfw/dn_aqm.h>
#endif
#include <netpfil/ipfw/dn_sched.h>
#else
#include <dn_test.h>
#endif
#ifndef MAX64
#define MAX64(x,y)  ((((int64_t)((y) - (x))) > 0) ? (y) : (x))
#endif

/*
 * timestamps are computed on 64 bit using fixed point arithmetic.
 * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len
 * and sum of weights, respectively. FRAC_BITS is the number of
 * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large
 * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w
 * using an unsigned 32-bit division, and to avoid wraparounds we need
 * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64
 * As an example
 * FRAC_BITS = 26, LMAX_BITS = 14, WMAX_BITS = 19
 */
#ifndef FRAC_BITS
#define FRAC_BITS	28	/* shift for fixed point arithmetic */
#define ONE_FP		(1UL << FRAC_BITS)
#endif
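
/*
 * Illustrative example (numbers assumed for clarity, not taken from a
 * running system): with FRAC_BITS = 28, ONE_FP is 1 << 28 = 268435456.
 * A flow configured with weight 5 gets inv_w = ONE_FP / 5 = 53687091,
 * so a 1500-byte packet advances its finish time by
 * 1500 * 53687091 ~= 8.05e10, comfortably within the 64-bit
 * timestamps used below.
 */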

/*
 * Private information for the scheduler instance:
 * sch_heap (key is Finish time) returns the next queue to serve
 * ne_heap (key is Start time) stores not-eligible queues
 * idle_heap (key = start/finish time) stores idle flows. It must
 *	support extract-from-middle.
 * A flow is only in 1 of the three heaps.
 * XXX todo: use a more efficient data structure, e.g. a tree sorted
 * by F with min_subtree(S) in each node
 */
struct wf2qp_si {
	struct dn_heap sch_heap;	/* top extract - key Finish time */
	struct dn_heap ne_heap;		/* top extract - key Start time */
	struct dn_heap idle_heap;	/* random extract - key Start=Finish time */
	uint64_t V;			/* virtual time */
	uint32_t inv_wsum;		/* inverse of sum of weights */
	uint32_t wsum;			/* sum of weights */
};

struct wf2qp_queue {
	struct dn_queue _q;
	uint64_t S, F;		/* start time, finish time */
	uint32_t inv_w;		/* ONE_FP / weight */
	int32_t heap_pos;	/* position (index) of struct in heap */
};
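
/*
 * Note on layout (see the scheduler descriptor at the end of the file):
 * the framework allocates si_datalen extra bytes right after each
 * struct dn_sch_inst, so the code below reaches the private area with
 * a cast like
 *	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
 * Queues instead embed struct dn_queue as their first member, so a
 * struct dn_queue pointer is cast directly to struct wf2qp_queue and
 * q_datalen only covers the extra fields.
 */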

/*
 * This file implements a WF2Q+ scheduler as it has been in dummynet
 * since 2000.
 * The scheduler supports per-flow queues and has O(log N) complexity.
 *
 * WF2Q+ needs to drain entries from the idle heap so that we
 * can keep the sum of weights up to date. We can do it whenever
 * we get a chance, or periodically, or following some other
 * strategy. The function idle_check() drains at most N elements
 * from the idle heap.
 */
static void
idle_check(struct wf2qp_si *si, int n, int force)
{
	struct dn_heap *h = &si->idle_heap;
	while (n-- > 0 && h->elements > 0 &&
		    (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) {
		struct dn_queue *q = HEAP_TOP(h)->object;
		struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;

		heap_extract(h, NULL);
		/* XXX to let the flowset delete the queue we should
		 * mark it as 'unused' by the scheduler.
		 */
		alg_fq->S = alg_fq->F + 1;	/* Mark timestamp as invalid. */
		si->wsum -= q->fs->fs.par[0];	/* adjust sum of weights */
		if (si->wsum > 0)
			si->inv_wsum = ONE_FP/si->wsum;
	}
}
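
/*
 * For reference, wf2qp_dequeue() below calls idle_check(si, 1, 0) to
 * drain one element per dequeue, and idle_check(si, 0x7fffffff, 1) to
 * flush the whole idle heap when both the scheduling and the
 * not-eligible heaps are empty.
 */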

static int
wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
{
	struct dn_fsk *fs = q->fs;
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
	struct wf2qp_queue *alg_fq;
	uint64_t len = m->m_pkthdr.len;

	if (m != q->mq.head) {
		if (dn_enqueue(q, m, 0))	/* packet was dropped */
			return 1;
		if (m != q->mq.head)	/* queue was already busy */
			return 0;
	}

	/* If we reach this point, queue q was idle. */
	alg_fq = (struct wf2qp_queue *)q;

	if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
		/* F < S means timestamps are invalid -> brand new queue. */
		alg_fq->S = si->V;		/* init start time */
		si->wsum += fs->fs.par[0];	/* add weight of new queue. */
		si->inv_wsum = ONE_FP/si->wsum;
	} else { /* if it was idle then it was in the idle heap */
		if (!heap_extract(&si->idle_heap, q))
			return 1;
		alg_fq->S = MAX64(alg_fq->F, si->V);	/* compute new S */
	}
	alg_fq->F = alg_fq->S + len * alg_fq->inv_w;

	/* if nothing is backlogged, make sure this flow is eligible */
	if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
		si->V = MAX64(alg_fq->S, si->V);

	/*
	 * Look at eligibility. A flow is not eligible if S > V (when
	 * this happens, it means that there is some other flow already
	 * scheduled for the same pipe, so the sch_heap cannot be
	 * empty). If the flow is not eligible we just store it in the
	 * ne_heap. Otherwise, we store in the sch_heap.
	 * Note that for all flows in sch_heap (SCH), S_i <= V,
	 * and for all flows in ne_heap (NEH), S_i > V.
	 * So when we need to compute max(V, min(S_i)) forall i in
	 * SCH+NEH, we only need to look into NEH.
	 */
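	/*
	 * Illustrative example (assumed numbers): with V = 5 * ONE_FP,
	 * a flow whose new S is 7 * ONE_FP is not eligible and goes
	 * into ne_heap; it moves to sch_heap only once the dequeue
	 * path advances V past 7 * ONE_FP.
	 */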
	if (DN_KEY_LT(si->V, alg_fq->S)) {
		/* S > V means flow not eligible. */
		if (si->sch_heap.elements == 0)
			D("++ ouch! not eligible but empty scheduler!");
		heap_insert(&si->ne_heap, alg_fq->S, q);
	} else {
		heap_insert(&si->sch_heap, alg_fq->F, q);
	}
	return 0;
}

/* XXX invariant: sch > 0 || V >= min(S in neh) */
static struct mbuf *
wf2qp_dequeue(struct dn_sch_inst *_si)
{
	/* Access scheduler instance private data */
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
	struct mbuf *m;
	struct dn_queue *q;
	struct dn_heap *sch = &si->sch_heap;
	struct dn_heap *neh = &si->ne_heap;
	struct wf2qp_queue *alg_fq;

	if (sch->elements == 0 && neh->elements == 0) {
		/* we have nothing to do. We could kill the idle heap
		 * altogether and reset V
		 */
		idle_check(si, 0x7fffffff, 1);
		si->V = 0;
		si->wsum = 0;	/* should be set already */
		return NULL;	/* quick return if nothing to do */
	}
	idle_check(si, 1, 0);	/* drain something from the idle heap */

	/* make sure at least one element is eligible, bumping V
	 * and moving entries that have become eligible.
	 * We need to repeat the first part twice, before and
	 * after extracting the candidate, or enqueue() will
	 * find the data structure in a wrong state.
	 */
	m = NULL;
	for (;;) {
		/*
		 * Compute V = max(V, min(S_i)). Remember that all elements
		 * in sch have by definition S_i <= V so if sch is not empty,
		 * V is surely the max and we must not update it. Conversely,
		 * if sch is empty we only need to look at neh.
		 * We don't need to move the queues, as it will be done at the
		 * next enqueue
		 */
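		/*
		 * Example (assumed numbers): if sch is empty and the
		 * smallest start time in neh is 12 * ONE_FP while V is
		 * 9 * ONE_FP, V is bumped to 12 * ONE_FP and that flow
		 * is moved to sch by the loop below.
		 */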
		if (sch->elements == 0 && neh->elements > 0) {
			si->V = MAX64(si->V, HEAP_TOP(neh)->key);
		}
		while (neh->elements > 0 &&
			    DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {
			q = HEAP_TOP(neh)->object;
			alg_fq = (struct wf2qp_queue *)q;
			heap_extract(neh, NULL);
			heap_insert(sch, alg_fq->F, q);
		}
		if (m)	/* pkt found in previous iteration */
			break;
		/* ok we have at least one eligible pkt */
		q = HEAP_TOP(sch)->object;
		alg_fq = (struct wf2qp_queue *)q;
		m = dn_dequeue(q);
		if (m == NULL)
			return NULL;
		heap_extract(sch, NULL);	/* Remove queue from heap. */
		si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
		alg_fq->S = alg_fq->F;		/* Update start time. */
		if (q->mq.head == 0) {		/* not backlogged any more. */
			heap_insert(&si->idle_heap, alg_fq->F, q);
		} else {			/* Still backlogged. */
			/* Update F, store in neh or sch */
			uint64_t len = q->mq.head->m_pkthdr.len;
			alg_fq->F += len * alg_fq->inv_w;
			if (DN_KEY_LEQ(alg_fq->S, si->V)) {
				heap_insert(sch, alg_fq->F, q);
			} else {
				heap_insert(neh, alg_fq->S, q);
			}
		}
	}
	return m;
}

static int
wf2qp_new_sched(struct dn_sch_inst *_si)
{
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
	int ofs = offsetof(struct wf2qp_queue, heap_pos);

	/* all heaps support extract from middle */
	if (heap_init(&si->idle_heap, 16, ofs) ||
	    heap_init(&si->sch_heap, 16, ofs) ||
	    heap_init(&si->ne_heap, 16, ofs)) {
		heap_free(&si->ne_heap);
		heap_free(&si->sch_heap);
		heap_free(&si->idle_heap);
		return ENOMEM;
	}
	return 0;
}

static int
wf2qp_free_sched(struct dn_sch_inst *_si)
{
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);

	heap_free(&si->sch_heap);
	heap_free(&si->ne_heap);
	heap_free(&si->idle_heap);

	return 0;
}

static int
wf2qp_new_fsk(struct dn_fsk *fs)
{
	ipdn_bound_var(&fs->fs.par[0], 1,
	    1, 100, "WF2Q+ weight");
	return 0;
}

static int
wf2qp_new_queue(struct dn_queue *_q)
{
	struct wf2qp_queue *q = (struct wf2qp_queue *)_q;

	_q->ni.oid.subtype = DN_SCHED_WF2QP;
	q->F = 0;		/* not strictly necessary */
	q->S = q->F + 1;	/* mark timestamp as invalid. */
	q->inv_w = ONE_FP / _q->fs->fs.par[0];
	if (_q->mq.head != NULL) {
		wf2qp_enqueue(_q->_si, _q, _q->mq.head);
	}
	return 0;
}

/*
 * Called when the infrastructure removes a queue (e.g. flowset
 * is reconfigured). Nothing to do if we did not 'own' the queue,
 * otherwise remove it from the right heap and adjust the sum
 * of weights.
 */
static int
wf2qp_free_queue(struct dn_queue *q)
{
	struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
	struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1);

	if (alg_fq->S >= alg_fq->F + 1)
		return 0;	/* nothing to do, not in any heap */
	si->wsum -= q->fs->fs.par[0];
	if (si->wsum > 0)
		si->inv_wsum = ONE_FP/si->wsum;

	/* extract from the heap. XXX TODO we may need to adjust V
	 * to make sure the invariants hold.
	 */
	heap_extract(&si->idle_heap, q);
	heap_extract(&si->ne_heap, q);
	heap_extract(&si->sch_heap, q);

	return 0;
}

/*
 * WF2Q+ scheduler descriptor
 * contains the type of the scheduler, the name, the size of the
 * structures and function pointers.
 */
static struct dn_alg wf2qp_desc = {
	_SI( .type = ) DN_SCHED_WF2QP,
	_SI( .name = ) "WF2Q+",
	_SI( .flags = ) DN_MULTIQUEUE,

	/* we need extra space in the si and the queue */
	_SI( .schk_datalen = ) 0,
	_SI( .si_datalen = ) sizeof(struct wf2qp_si),
	_SI( .q_datalen = ) sizeof(struct wf2qp_queue) -
				sizeof(struct dn_queue),

	_SI( .enqueue = ) wf2qp_enqueue,
	_SI( .dequeue = ) wf2qp_dequeue,

	_SI( .config = ) NULL,
	_SI( .destroy = ) NULL,
	_SI( .new_sched = ) wf2qp_new_sched,
	_SI( .free_sched = ) wf2qp_free_sched,

	_SI( .new_fsk = ) wf2qp_new_fsk,
	_SI( .free_fsk = ) NULL,

	_SI( .new_queue = ) wf2qp_new_queue,
	_SI( .free_queue = ) wf2qp_free_queue,
#ifdef NEW_AQM
	_SI( .getconfig = ) NULL,
#endif

};

DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc);