1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa 5 * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa 6 * All rights reserved 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 /* 31 * $FreeBSD$ 32 */ 33 34 #ifdef _KERNEL 35 #include <sys/malloc.h> 36 #include <sys/socket.h> 37 #include <sys/socketvar.h> 38 #include <sys/kernel.h> 39 #include <sys/lock.h> 40 #include <sys/mbuf.h> 41 #include <sys/module.h> 42 #include <sys/rwlock.h> 43 #include <net/if.h> /* IFNAMSIZ */ 44 #include <netinet/in.h> 45 #include <netinet/ip_var.h> /* ipfw_rule_ref */ 46 #include <netinet/ip_fw.h> /* flow_id */ 47 #include <netinet/ip_dummynet.h> 48 #include <netpfil/ipfw/ip_fw_private.h> 49 #include <netpfil/ipfw/dn_heap.h> 50 #include <netpfil/ipfw/ip_dn_private.h> 51 #ifdef NEW_AQM 52 #include <netpfil/ipfw/dn_aqm.h> 53 #endif 54 #include <netpfil/ipfw/dn_sched.h> 55 #else 56 #include <dn_test.h> 57 #endif 58 59 #ifndef MAX64 60 #define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) 61 #endif 62 63 /* 64 * timestamps are computed on 64 bit using fixed point arithmetic. 65 * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len 66 * and sum of weights, respectively. FRAC_BITS is the number of 67 * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large 68 * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w 69 * using an unsigned 32-bit division, and to avoid wraparounds we need 70 * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64 71 * As an example 72 * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19 73 */ 74 #ifndef FRAC_BITS 75 #define FRAC_BITS 28 /* shift for fixed point arithmetic */ 76 #define ONE_FP (1UL << FRAC_BITS) 77 #endif 78 79 /* 80 * Private information for the scheduler instance: 81 * sch_heap (key is Finish time) returns the next queue to serve 82 * ne_heap (key is Start time) stores not-eligible queues 83 * idle_heap (key=start/finish time) stores idle flows. It must 84 * support extract-from-middle. 85 * A flow is only in 1 of the three heaps. 86 * XXX todo: use a more efficient data structure, e.g. a tree sorted 87 * by F with min_subtree(S) in each node 88 */ 89 struct wf2qp_si { 90 struct dn_heap sch_heap; /* top extract - key Finish time */ 91 struct dn_heap ne_heap; /* top extract - key Start time */ 92 struct dn_heap idle_heap; /* random extract - key Start=Finish time */ 93 uint64_t V; /* virtual time */ 94 uint32_t inv_wsum; /* inverse of sum of weights */ 95 uint32_t wsum; /* sum of weights */ 96 }; 97 98 struct wf2qp_queue { 99 struct dn_queue _q; 100 uint64_t S, F; /* start time, finish time */ 101 uint32_t inv_w; /* ONE_FP / weight */ 102 int32_t heap_pos; /* position (index) of struct in heap */ 103 }; 104 105 /* 106 * This file implements a WF2Q+ scheduler as it has been in dummynet 107 * since 2000. 108 * The scheduler supports per-flow queues and has O(log N) complexity. 109 * 110 * WF2Q+ needs to drain entries from the idle heap so that we 111 * can keep the sum of weights up to date. We can do it whenever 112 * we get a chance, or periodically, or following some other 113 * strategy. The function idle_check() drains at most N elements 114 * from the idle heap. 115 */ 116 static void 117 idle_check(struct wf2qp_si *si, int n, int force) 118 { 119 struct dn_heap *h = &si->idle_heap; 120 while (n-- > 0 && h->elements > 0 && 121 (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) { 122 struct dn_queue *q = HEAP_TOP(h)->object; 123 struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; 124 125 heap_extract(h, NULL); 126 /* XXX to let the flowset delete the queue we should 127 * mark it as 'unused' by the scheduler. 128 */ 129 alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */ 130 si->wsum -= q->fs->fs.par[0]; /* adjust sum of weights */ 131 if (si->wsum > 0) 132 si->inv_wsum = ONE_FP/si->wsum; 133 } 134 } 135 136 static int 137 wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) 138 { 139 struct dn_fsk *fs = q->fs; 140 struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); 141 struct wf2qp_queue *alg_fq; 142 uint64_t len = m->m_pkthdr.len; 143 144 if (m != q->mq.head) { 145 if (dn_enqueue(q, m, 0)) /* packet was dropped */ 146 return 1; 147 if (m != q->mq.head) /* queue was already busy */ 148 return 0; 149 } 150 151 /* If reach this point, queue q was idle */ 152 alg_fq = (struct wf2qp_queue *)q; 153 154 if (DN_KEY_LT(alg_fq->F, alg_fq->S)) { 155 /* F<S means timestamps are invalid ->brand new queue. */ 156 alg_fq->S = si->V; /* init start time */ 157 si->wsum += fs->fs.par[0]; /* add weight of new queue. */ 158 si->inv_wsum = ONE_FP/si->wsum; 159 } else { /* if it was idle then it was in the idle heap */ 160 heap_extract(&si->idle_heap, q); 161 alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */ 162 } 163 alg_fq->F = alg_fq->S + len * alg_fq->inv_w; 164 165 /* if nothing is backlogged, make sure this flow is eligible */ 166 if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0) 167 si->V = MAX64(alg_fq->S, si->V); 168 169 /* 170 * Look at eligibility. A flow is not eligibile if S>V (when 171 * this happens, it means that there is some other flow already 172 * scheduled for the same pipe, so the sch_heap cannot be 173 * empty). If the flow is not eligible we just store it in the 174 * ne_heap. Otherwise, we store in the sch_heap. 175 * Note that for all flows in sch_heap (SCH), S_i <= V, 176 * and for all flows in ne_heap (NEH), S_i > V. 177 * So when we need to compute max(V, min(S_i)) forall i in 178 * SCH+NEH, we only need to look into NEH. 179 */ 180 if (DN_KEY_LT(si->V, alg_fq->S)) { 181 /* S>V means flow Not eligible. */ 182 if (si->sch_heap.elements == 0) 183 D("++ ouch! not eligible but empty scheduler!"); 184 heap_insert(&si->ne_heap, alg_fq->S, q); 185 } else { 186 heap_insert(&si->sch_heap, alg_fq->F, q); 187 } 188 return 0; 189 } 190 191 /* XXX invariant: sch > 0 || V >= min(S in neh) */ 192 static struct mbuf * 193 wf2qp_dequeue(struct dn_sch_inst *_si) 194 { 195 /* Access scheduler instance private data */ 196 struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); 197 struct mbuf *m; 198 struct dn_queue *q; 199 struct dn_heap *sch = &si->sch_heap; 200 struct dn_heap *neh = &si->ne_heap; 201 struct wf2qp_queue *alg_fq; 202 203 if (sch->elements == 0 && neh->elements == 0) { 204 /* we have nothing to do. We could kill the idle heap 205 * altogether and reset V 206 */ 207 idle_check(si, 0x7fffffff, 1); 208 si->V = 0; 209 si->wsum = 0; /* should be set already */ 210 return NULL; /* quick return if nothing to do */ 211 } 212 idle_check(si, 1, 0); /* drain something from the idle heap */ 213 214 /* make sure at least one element is eligible, bumping V 215 * and moving entries that have become eligible. 216 * We need to repeat the first part twice, before and 217 * after extracting the candidate, or enqueue() will 218 * find the data structure in a wrong state. 219 */ 220 m = NULL; 221 for(;;) { 222 /* 223 * Compute V = max(V, min(S_i)). Remember that all elements 224 * in sch have by definition S_i <= V so if sch is not empty, 225 * V is surely the max and we must not update it. Conversely, 226 * if sch is empty we only need to look at neh. 227 * We don't need to move the queues, as it will be done at the 228 * next enqueue 229 */ 230 if (sch->elements == 0 && neh->elements > 0) { 231 si->V = MAX64(si->V, HEAP_TOP(neh)->key); 232 } 233 while (neh->elements > 0 && 234 DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) { 235 q = HEAP_TOP(neh)->object; 236 alg_fq = (struct wf2qp_queue *)q; 237 heap_extract(neh, NULL); 238 heap_insert(sch, alg_fq->F, q); 239 } 240 if (m) /* pkt found in previous iteration */ 241 break; 242 /* ok we have at least one eligible pkt */ 243 q = HEAP_TOP(sch)->object; 244 alg_fq = (struct wf2qp_queue *)q; 245 m = dn_dequeue(q); 246 heap_extract(sch, NULL); /* Remove queue from heap. */ 247 si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum; 248 alg_fq->S = alg_fq->F; /* Update start time. */ 249 if (q->mq.head == 0) { /* not backlogged any more. */ 250 heap_insert(&si->idle_heap, alg_fq->F, q); 251 } else { /* Still backlogged. */ 252 /* Update F, store in neh or sch */ 253 uint64_t len = q->mq.head->m_pkthdr.len; 254 alg_fq->F += len * alg_fq->inv_w; 255 if (DN_KEY_LEQ(alg_fq->S, si->V)) { 256 heap_insert(sch, alg_fq->F, q); 257 } else { 258 heap_insert(neh, alg_fq->S, q); 259 } 260 } 261 } 262 return m; 263 } 264 265 static int 266 wf2qp_new_sched(struct dn_sch_inst *_si) 267 { 268 struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); 269 int ofs = offsetof(struct wf2qp_queue, heap_pos); 270 271 /* all heaps support extract from middle */ 272 if (heap_init(&si->idle_heap, 16, ofs) || 273 heap_init(&si->sch_heap, 16, ofs) || 274 heap_init(&si->ne_heap, 16, ofs)) { 275 heap_free(&si->ne_heap); 276 heap_free(&si->sch_heap); 277 heap_free(&si->idle_heap); 278 return ENOMEM; 279 } 280 return 0; 281 } 282 283 static int 284 wf2qp_free_sched(struct dn_sch_inst *_si) 285 { 286 struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); 287 288 heap_free(&si->sch_heap); 289 heap_free(&si->ne_heap); 290 heap_free(&si->idle_heap); 291 292 return 0; 293 } 294 295 static int 296 wf2qp_new_fsk(struct dn_fsk *fs) 297 { 298 ipdn_bound_var(&fs->fs.par[0], 1, 299 1, 100, "WF2Q+ weight"); 300 return 0; 301 } 302 303 static int 304 wf2qp_new_queue(struct dn_queue *_q) 305 { 306 struct wf2qp_queue *q = (struct wf2qp_queue *)_q; 307 308 _q->ni.oid.subtype = DN_SCHED_WF2QP; 309 q->F = 0; /* not strictly necessary */ 310 q->S = q->F + 1; /* mark timestamp as invalid. */ 311 q->inv_w = ONE_FP / _q->fs->fs.par[0]; 312 if (_q->mq.head != NULL) { 313 wf2qp_enqueue(_q->_si, _q, _q->mq.head); 314 } 315 return 0; 316 } 317 318 /* 319 * Called when the infrastructure removes a queue (e.g. flowset 320 * is reconfigured). Nothing to do if we did not 'own' the queue, 321 * otherwise remove it from the right heap and adjust the sum 322 * of weights. 323 */ 324 static int 325 wf2qp_free_queue(struct dn_queue *q) 326 { 327 struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; 328 struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1); 329 330 if (alg_fq->S >= alg_fq->F + 1) 331 return 0; /* nothing to do, not in any heap */ 332 si->wsum -= q->fs->fs.par[0]; 333 if (si->wsum > 0) 334 si->inv_wsum = ONE_FP/si->wsum; 335 336 /* extract from the heap. XXX TODO we may need to adjust V 337 * to make sure the invariants hold. 338 */ 339 if (q->mq.head == NULL) { 340 heap_extract(&si->idle_heap, q); 341 } else if (DN_KEY_LT(si->V, alg_fq->S)) { 342 heap_extract(&si->ne_heap, q); 343 } else { 344 heap_extract(&si->sch_heap, q); 345 } 346 return 0; 347 } 348 349 /* 350 * WF2Q+ scheduler descriptor 351 * contains the type of the scheduler, the name, the size of the 352 * structures and function pointers. 353 */ 354 static struct dn_alg wf2qp_desc = { 355 _SI( .type = ) DN_SCHED_WF2QP, 356 _SI( .name = ) "WF2Q+", 357 _SI( .flags = ) DN_MULTIQUEUE, 358 359 /* we need extra space in the si and the queue */ 360 _SI( .schk_datalen = ) 0, 361 _SI( .si_datalen = ) sizeof(struct wf2qp_si), 362 _SI( .q_datalen = ) sizeof(struct wf2qp_queue) - 363 sizeof(struct dn_queue), 364 365 _SI( .enqueue = ) wf2qp_enqueue, 366 _SI( .dequeue = ) wf2qp_dequeue, 367 368 _SI( .config = ) NULL, 369 _SI( .destroy = ) NULL, 370 _SI( .new_sched = ) wf2qp_new_sched, 371 _SI( .free_sched = ) wf2qp_free_sched, 372 373 _SI( .new_fsk = ) wf2qp_new_fsk, 374 _SI( .free_fsk = ) NULL, 375 376 _SI( .new_queue = ) wf2qp_new_queue, 377 _SI( .free_queue = ) wf2qp_free_queue, 378 #ifdef NEW_AQM 379 _SI( .getconfig = ) NULL, 380 #endif 381 382 }; 383 384 DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc); 385