1 /* Copyright (C) 2013 Cisco Systems, Inc, 2013. 2 * 3 * This program is free software; you can redistribute it and/or 4 * modify it under the terms of the GNU General Public License 5 * as published by the Free Software Foundation; either version 2 6 * of the License. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * GNU General Public License for more details. 12 * 13 * Author: Vijay Subramanian <vijaynsu@cisco.com> 14 * Author: Mythili Prabhu <mysuryan@cisco.com> 15 * 16 * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no> 17 * University of Oslo, Norway. 18 */ 19 20 #include <linux/module.h> 21 #include <linux/slab.h> 22 #include <linux/types.h> 23 #include <linux/kernel.h> 24 #include <linux/errno.h> 25 #include <linux/skbuff.h> 26 #include <net/pkt_sched.h> 27 #include <net/inet_ecn.h> 28 29 #define QUEUE_THRESHOLD 10000 30 #define DQCOUNT_INVALID -1 31 #define MAX_PROB 0xffffffff 32 #define PIE_SCALE 8 33 34 /* parameters used */ 35 struct pie_params { 36 psched_time_t target; /* user specified target delay in pschedtime */ 37 u32 tupdate; /* timer frequency (in jiffies) */ 38 u32 limit; /* number of packets that can be enqueued */ 39 u32 alpha; /* alpha and beta are between -4 and 4 */ 40 u32 beta; /* and are used for shift relative to 1 */ 41 bool ecn; /* true if ecn is enabled */ 42 bool bytemode; /* to scale drop early prob based on pkt size */ 43 }; 44 45 /* variables used */ 46 struct pie_vars { 47 u32 prob; /* probability but scaled by u32 limit. */ 48 psched_time_t burst_time; 49 psched_time_t qdelay; 50 psched_time_t qdelay_old; 51 u64 dq_count; /* measured in bytes */ 52 psched_time_t dq_tstamp; /* drain rate */ 53 u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ 54 u32 qlen_old; /* in bytes */ 55 }; 56 57 /* statistics gathering */ 58 struct pie_stats { 59 u32 packets_in; /* total number of packets enqueued */ 60 u32 dropped; /* packets dropped due to pie_action */ 61 u32 overlimit; /* dropped due to lack of space in queue */ 62 u32 maxq; /* maximum queue size */ 63 u32 ecn_mark; /* packets marked with ECN */ 64 }; 65 66 /* private data for the Qdisc */ 67 struct pie_sched_data { 68 struct pie_params params; 69 struct pie_vars vars; 70 struct pie_stats stats; 71 struct timer_list adapt_timer; 72 }; 73 74 static void pie_params_init(struct pie_params *params) 75 { 76 params->alpha = 2; 77 params->beta = 20; 78 params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */ 79 params->limit = 1000; /* default of 1000 packets */ 80 params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */ 81 params->ecn = false; 82 params->bytemode = false; 83 } 84 85 static void pie_vars_init(struct pie_vars *vars) 86 { 87 vars->dq_count = DQCOUNT_INVALID; 88 vars->avg_dq_rate = 0; 89 /* default of 100 ms in pschedtime */ 90 vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC); 91 } 92 93 static bool drop_early(struct Qdisc *sch, u32 packet_size) 94 { 95 struct pie_sched_data *q = qdisc_priv(sch); 96 u32 rnd; 97 u32 local_prob = q->vars.prob; 98 u32 mtu = psched_mtu(qdisc_dev(sch)); 99 100 /* If there is still burst allowance left skip random early drop */ 101 if (q->vars.burst_time > 0) 102 return false; 103 104 /* If current delay is less than half of target, and 105 * if drop prob is low already, disable early_drop 106 */ 107 if ((q->vars.qdelay < q->params.target / 2) 108 && (q->vars.prob < MAX_PROB / 5)) 109 return false; 110 111 /* If we have fewer than 2 mtu-sized packets, disable drop_early, 112 * similar to min_th in RED 113 */ 114 if (sch->qstats.backlog < 2 * mtu) 115 return false; 116 117 /* If bytemode is turned on, use packet size to compute new 118 * probablity. Smaller packets will have lower drop prob in this case 119 */ 120 if (q->params.bytemode && packet_size <= mtu) 121 local_prob = (local_prob / mtu) * packet_size; 122 else 123 local_prob = q->vars.prob; 124 125 rnd = prandom_u32(); 126 if (rnd < local_prob) 127 return true; 128 129 return false; 130 } 131 132 static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) 133 { 134 struct pie_sched_data *q = qdisc_priv(sch); 135 bool enqueue = false; 136 137 if (unlikely(qdisc_qlen(sch) >= sch->limit)) { 138 q->stats.overlimit++; 139 goto out; 140 } 141 142 if (!drop_early(sch, skb->len)) { 143 enqueue = true; 144 } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && 145 INET_ECN_set_ce(skb)) { 146 /* If packet is ecn capable, mark it if drop probability 147 * is lower than 10%, else drop it. 148 */ 149 q->stats.ecn_mark++; 150 enqueue = true; 151 } 152 153 /* we can enqueue the packet */ 154 if (enqueue) { 155 q->stats.packets_in++; 156 if (qdisc_qlen(sch) > q->stats.maxq) 157 q->stats.maxq = qdisc_qlen(sch); 158 159 return qdisc_enqueue_tail(skb, sch); 160 } 161 162 out: 163 q->stats.dropped++; 164 return qdisc_drop(skb, sch); 165 } 166 167 static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { 168 [TCA_PIE_TARGET] = {.type = NLA_U32}, 169 [TCA_PIE_LIMIT] = {.type = NLA_U32}, 170 [TCA_PIE_TUPDATE] = {.type = NLA_U32}, 171 [TCA_PIE_ALPHA] = {.type = NLA_U32}, 172 [TCA_PIE_BETA] = {.type = NLA_U32}, 173 [TCA_PIE_ECN] = {.type = NLA_U32}, 174 [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, 175 }; 176 177 static int pie_change(struct Qdisc *sch, struct nlattr *opt) 178 { 179 struct pie_sched_data *q = qdisc_priv(sch); 180 struct nlattr *tb[TCA_PIE_MAX + 1]; 181 unsigned int qlen; 182 int err; 183 184 if (!opt) 185 return -EINVAL; 186 187 err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy); 188 if (err < 0) 189 return err; 190 191 sch_tree_lock(sch); 192 193 /* convert from microseconds to pschedtime */ 194 if (tb[TCA_PIE_TARGET]) { 195 /* target is in us */ 196 u32 target = nla_get_u32(tb[TCA_PIE_TARGET]); 197 198 /* convert to pschedtime */ 199 q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); 200 } 201 202 /* tupdate is in jiffies */ 203 if (tb[TCA_PIE_TUPDATE]) 204 q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); 205 206 if (tb[TCA_PIE_LIMIT]) { 207 u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); 208 209 q->params.limit = limit; 210 sch->limit = limit; 211 } 212 213 if (tb[TCA_PIE_ALPHA]) 214 q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]); 215 216 if (tb[TCA_PIE_BETA]) 217 q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]); 218 219 if (tb[TCA_PIE_ECN]) 220 q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]); 221 222 if (tb[TCA_PIE_BYTEMODE]) 223 q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); 224 225 /* Drop excess packets if new limit is lower */ 226 qlen = sch->q.qlen; 227 while (sch->q.qlen > sch->limit) { 228 struct sk_buff *skb = __skb_dequeue(&sch->q); 229 230 sch->qstats.backlog -= qdisc_pkt_len(skb); 231 qdisc_drop(skb, sch); 232 } 233 qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); 234 235 sch_tree_unlock(sch); 236 return 0; 237 } 238 239 static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) 240 { 241 242 struct pie_sched_data *q = qdisc_priv(sch); 243 int qlen = sch->qstats.backlog; /* current queue size in bytes */ 244 245 /* If current queue is about 10 packets or more and dq_count is unset 246 * we have enough packets to calculate the drain rate. Save 247 * current time as dq_tstamp and start measurement cycle. 248 */ 249 if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) { 250 q->vars.dq_tstamp = psched_get_time(); 251 q->vars.dq_count = 0; 252 } 253 254 /* Calculate the average drain rate from this value. If queue length 255 * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset 256 * the dq_count to -1 as we don't have enough packets to calculate the 257 * drain rate anymore The following if block is entered only when we 258 * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) 259 * and we calculate the drain rate for the threshold here. dq_count is 260 * in bytes, time difference in psched_time, hence rate is in 261 * bytes/psched_time. 262 */ 263 if (q->vars.dq_count != DQCOUNT_INVALID) { 264 q->vars.dq_count += skb->len; 265 266 if (q->vars.dq_count >= QUEUE_THRESHOLD) { 267 psched_time_t now = psched_get_time(); 268 u32 dtime = now - q->vars.dq_tstamp; 269 u32 count = q->vars.dq_count << PIE_SCALE; 270 271 if (dtime == 0) 272 return; 273 274 count = count / dtime; 275 276 if (q->vars.avg_dq_rate == 0) 277 q->vars.avg_dq_rate = count; 278 else 279 q->vars.avg_dq_rate = 280 (q->vars.avg_dq_rate - 281 (q->vars.avg_dq_rate >> 3)) + (count >> 3); 282 283 /* If the queue has receded below the threshold, we hold 284 * on to the last drain rate calculated, else we reset 285 * dq_count to 0 to re-enter the if block when the next 286 * packet is dequeued 287 */ 288 if (qlen < QUEUE_THRESHOLD) 289 q->vars.dq_count = DQCOUNT_INVALID; 290 else { 291 q->vars.dq_count = 0; 292 q->vars.dq_tstamp = psched_get_time(); 293 } 294 295 if (q->vars.burst_time > 0) { 296 if (q->vars.burst_time > dtime) 297 q->vars.burst_time -= dtime; 298 else 299 q->vars.burst_time = 0; 300 } 301 } 302 } 303 } 304 305 static void calculate_probability(struct Qdisc *sch) 306 { 307 struct pie_sched_data *q = qdisc_priv(sch); 308 u32 qlen = sch->qstats.backlog; /* queue size in bytes */ 309 psched_time_t qdelay = 0; /* in pschedtime */ 310 psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */ 311 s32 delta = 0; /* determines the change in probability */ 312 u32 oldprob; 313 u32 alpha, beta; 314 bool update_prob = true; 315 316 q->vars.qdelay_old = q->vars.qdelay; 317 318 if (q->vars.avg_dq_rate > 0) 319 qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; 320 else 321 qdelay = 0; 322 323 /* If qdelay is zero and qlen is not, it means qlen is very small, less 324 * than dequeue_rate, so we do not update probabilty in this round 325 */ 326 if (qdelay == 0 && qlen != 0) 327 update_prob = false; 328 329 /* Add ranges for alpha and beta, more aggressive for high dropping 330 * mode and gentle steps for light dropping mode 331 * In light dropping mode, take gentle steps; in medium dropping mode, 332 * take medium steps; in high dropping mode, take big steps. 333 */ 334 if (q->vars.prob < MAX_PROB / 100) { 335 alpha = 336 (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; 337 beta = 338 (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; 339 } else if (q->vars.prob < MAX_PROB / 10) { 340 alpha = 341 (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; 342 beta = 343 (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; 344 } else { 345 alpha = 346 (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; 347 beta = 348 (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; 349 } 350 351 /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ 352 delta += alpha * ((qdelay - q->params.target)); 353 delta += beta * ((qdelay - qdelay_old)); 354 355 oldprob = q->vars.prob; 356 357 /* to ensure we increase probability in steps of no more than 2% */ 358 if (delta > (s32) (MAX_PROB / (100 / 2)) && 359 q->vars.prob >= MAX_PROB / 10) 360 delta = (MAX_PROB / 100) * 2; 361 362 /* Non-linear drop: 363 * Tune drop probability to increase quickly for high delays(>= 250ms) 364 * 250ms is derived through experiments and provides error protection 365 */ 366 367 if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) 368 delta += MAX_PROB / (100 / 2); 369 370 q->vars.prob += delta; 371 372 if (delta > 0) { 373 /* prevent overflow */ 374 if (q->vars.prob < oldprob) { 375 q->vars.prob = MAX_PROB; 376 /* Prevent normalization error. If probability is at 377 * maximum value already, we normalize it here, and 378 * skip the check to do a non-linear drop in the next 379 * section. 380 */ 381 update_prob = false; 382 } 383 } else { 384 /* prevent underflow */ 385 if (q->vars.prob > oldprob) 386 q->vars.prob = 0; 387 } 388 389 /* Non-linear drop in probability: Reduce drop probability quickly if 390 * delay is 0 for 2 consecutive Tupdate periods. 391 */ 392 393 if ((qdelay == 0) && (qdelay_old == 0) && update_prob) 394 q->vars.prob = (q->vars.prob * 98) / 100; 395 396 q->vars.qdelay = qdelay; 397 q->vars.qlen_old = qlen; 398 399 /* We restart the measurement cycle if the following conditions are met 400 * 1. If the delay has been low for 2 consecutive Tupdate periods 401 * 2. Calculated drop probability is zero 402 * 3. We have atleast one estimate for the avg_dq_rate ie., 403 * is a non-zero value 404 */ 405 if ((q->vars.qdelay < q->params.target / 2) && 406 (q->vars.qdelay_old < q->params.target / 2) && 407 (q->vars.prob == 0) && 408 (q->vars.avg_dq_rate > 0)) 409 pie_vars_init(&q->vars); 410 } 411 412 static void pie_timer(unsigned long arg) 413 { 414 struct Qdisc *sch = (struct Qdisc *)arg; 415 struct pie_sched_data *q = qdisc_priv(sch); 416 spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); 417 418 spin_lock(root_lock); 419 calculate_probability(sch); 420 421 /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */ 422 if (q->params.tupdate) 423 mod_timer(&q->adapt_timer, jiffies + q->params.tupdate); 424 spin_unlock(root_lock); 425 426 } 427 428 static int pie_init(struct Qdisc *sch, struct nlattr *opt) 429 { 430 struct pie_sched_data *q = qdisc_priv(sch); 431 432 pie_params_init(&q->params); 433 pie_vars_init(&q->vars); 434 sch->limit = q->params.limit; 435 436 setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch); 437 mod_timer(&q->adapt_timer, jiffies + HZ / 2); 438 439 if (opt) { 440 int err = pie_change(sch, opt); 441 442 if (err) 443 return err; 444 } 445 446 return 0; 447 } 448 449 static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) 450 { 451 struct pie_sched_data *q = qdisc_priv(sch); 452 struct nlattr *opts; 453 454 opts = nla_nest_start(skb, TCA_OPTIONS); 455 if (opts == NULL) 456 goto nla_put_failure; 457 458 /* convert target from pschedtime to us */ 459 if (nla_put_u32(skb, TCA_PIE_TARGET, 460 ((u32) PSCHED_TICKS2NS(q->params.target)) / 461 NSEC_PER_USEC) || 462 nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || 463 nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) || 464 nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || 465 nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || 466 nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || 467 nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode)) 468 goto nla_put_failure; 469 470 return nla_nest_end(skb, opts); 471 472 nla_put_failure: 473 nla_nest_cancel(skb, opts); 474 return -1; 475 476 } 477 478 static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 479 { 480 struct pie_sched_data *q = qdisc_priv(sch); 481 struct tc_pie_xstats st = { 482 .prob = q->vars.prob, 483 .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) / 484 NSEC_PER_USEC, 485 /* unscale and return dq_rate in bytes per sec */ 486 .avg_dq_rate = q->vars.avg_dq_rate * 487 (PSCHED_TICKS_PER_SEC) >> PIE_SCALE, 488 .packets_in = q->stats.packets_in, 489 .overlimit = q->stats.overlimit, 490 .maxq = q->stats.maxq, 491 .dropped = q->stats.dropped, 492 .ecn_mark = q->stats.ecn_mark, 493 }; 494 495 return gnet_stats_copy_app(d, &st, sizeof(st)); 496 } 497 498 static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) 499 { 500 struct sk_buff *skb; 501 skb = __qdisc_dequeue_head(sch, &sch->q); 502 503 if (!skb) 504 return NULL; 505 506 pie_process_dequeue(sch, skb); 507 return skb; 508 } 509 510 static void pie_reset(struct Qdisc *sch) 511 { 512 struct pie_sched_data *q = qdisc_priv(sch); 513 qdisc_reset_queue(sch); 514 pie_vars_init(&q->vars); 515 } 516 517 static void pie_destroy(struct Qdisc *sch) 518 { 519 struct pie_sched_data *q = qdisc_priv(sch); 520 q->params.tupdate = 0; 521 del_timer_sync(&q->adapt_timer); 522 } 523 524 static struct Qdisc_ops pie_qdisc_ops __read_mostly = { 525 .id = "pie", 526 .priv_size = sizeof(struct pie_sched_data), 527 .enqueue = pie_qdisc_enqueue, 528 .dequeue = pie_qdisc_dequeue, 529 .peek = qdisc_peek_dequeued, 530 .init = pie_init, 531 .destroy = pie_destroy, 532 .reset = pie_reset, 533 .change = pie_change, 534 .dump = pie_dump, 535 .dump_stats = pie_dump_stats, 536 .owner = THIS_MODULE, 537 }; 538 539 static int __init pie_module_init(void) 540 { 541 return register_qdisc(&pie_qdisc_ops); 542 } 543 544 static void __exit pie_module_exit(void) 545 { 546 unregister_qdisc(&pie_qdisc_ops); 547 } 548 549 module_init(pie_module_init); 550 module_exit(pie_module_exit); 551 552 MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler"); 553 MODULE_AUTHOR("Vijay Subramanian"); 554 MODULE_AUTHOR("Mythili Prabhu"); 555 MODULE_LICENSE("GPL"); 556