// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_est.c: simple rate estimator for IPVS
 *
 * Authors:	Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * Changes:	Hans Schillstrom <hans.schillstrom@ericsson.com>
 *		Network name space (netns) aware.
 *		Global data moved to netns, i.e. struct netns_ipvs.
 *		Affected data: est_list and est_lock.
 *		estimation_timer() runs with a timer per netns.
 *		get_stats() does the per-CPU summing.
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/list.h>
#include <linux/rcupdate_wait.h>

#include <net/ip_vs.h>

/*
  This code estimates the rate over a short interval (such as 8
  seconds) for virtual services and real servers.  To measure the rate
  over a long interval, it is easy to implement a user-level daemon
  which periodically reads these statistical counters and computes the
  rate.

  We measure the rate during the last 8 seconds, every 2 seconds:

    avgrate = avgrate*(1-W) + rate*W

    where W = 2^(-2)

  NOTES.

  * Average bps is scaled by 2^5, while average pps and cps are scaled
    by 2^10 (see the worked example after this comment).

  * Netlink users can see 64-bit values but sockopt users are restricted
    to 32-bit values for conns, packets, bps, cps and pps.

  * A lot of code is taken from net/core/gen_estimator.c

  KEY POINTS:
  - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled
  - kthreads read the cpustats to update the estimators (svcs, dests, total)
  - the states of estimators can be read (get stats) or modified (zero stats)
    from processes

  KTHREADS:
  - estimators are added initially to est_temp_list and later kthread 0
    distributes them to one or many kthreads for estimation
  - kthread contexts are created and attached to an array
  - the kthread tasks are started when the first service is added; before
    that the total stats are not estimated
  - when the configuration (cpulist/nice) is changed, the tasks are
    restarted by work (est_reload_work)
  - kthread tasks are stopped while the cpulist is empty
  - the kthread context holds lists with estimators (chains) which are
    processed every 2 seconds
  - as estimators can be added dynamically and in bursts, we try to spread
    them over multiple chains which are estimated at different times
  - on start, kthread 0 enters a calculation phase to determine the chain
    limits and the limit of estimators per kthread
  - est_add_ktid: ktid where to add new ests, can point to an empty slot
    where we should add kt data
 */
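
/* Worked example of the fixed-point math described above (illustrative
 * numbers only, not taken from running code): per-second rates are kept
 * scaled, cps/pps by 2^10 and bps by 2^5, and are updated every 2 seconds
 * with W = 1/4:
 *
 *	delta = conns_now - conns_2s_ago;	// e.g. 20 new connections
 *	rate  = delta << 9;			// 20/2 = 10 cps, scaled by 2^10
 *	e->cps += ((s64)rate - (s64)e->cps) >> 2;
 *
 * With a steady 10 conns/s the estimate converges to 10 << 10 = 10240,
 * which ip_vs_read_estimator() later decodes as (10240 + 0x1FF) >> 10 = 10.
 */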

static struct lock_class_key __ipvs_est_key;

static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);

static void ip_vs_chain_estimation(struct hlist_head *chain)
{
	struct ip_vs_estimator *e;
	struct ip_vs_cpu_stats *c;
	struct ip_vs_stats *s;
	u64 rate;

	hlist_for_each_entry_rcu(e, chain, list) {
		u64 conns, inpkts, outpkts, inbytes, outbytes;
		u64 kconns = 0, kinpkts = 0, koutpkts = 0;
		u64 kinbytes = 0, koutbytes = 0;
		unsigned int start;
		int i;

		if (kthread_should_stop())
			break;

		s = container_of(e, struct ip_vs_stats, est);
		for_each_possible_cpu(i) {
			c = per_cpu_ptr(s->cpustats, i);
			do {
				start = u64_stats_fetch_begin(&c->syncp);
				conns = u64_stats_read(&c->cnt.conns);
				inpkts = u64_stats_read(&c->cnt.inpkts);
				outpkts = u64_stats_read(&c->cnt.outpkts);
				inbytes = u64_stats_read(&c->cnt.inbytes);
				outbytes = u64_stats_read(&c->cnt.outbytes);
			} while (u64_stats_fetch_retry(&c->syncp, start));
			kconns += conns;
			kinpkts += inpkts;
			koutpkts += outpkts;
			kinbytes += inbytes;
			koutbytes += outbytes;
		}

		spin_lock(&s->lock);

		s->kstats.conns = kconns;
		s->kstats.inpkts = kinpkts;
		s->kstats.outpkts = koutpkts;
		s->kstats.inbytes = kinbytes;
		s->kstats.outbytes = koutbytes;

		/* scaled by 2^10, but divided by the 2-second interval */
		rate = (s->kstats.conns - e->last_conns) << 9;
		e->last_conns = s->kstats.conns;
		e->cps += ((s64)rate - (s64)e->cps) >> 2;

		rate = (s->kstats.inpkts - e->last_inpkts) << 9;
		e->last_inpkts = s->kstats.inpkts;
		e->inpps += ((s64)rate - (s64)e->inpps) >> 2;

		rate = (s->kstats.outpkts - e->last_outpkts) << 9;
		e->last_outpkts = s->kstats.outpkts;
		e->outpps += ((s64)rate - (s64)e->outpps) >> 2;

		/* scaled by 2^5, but divided by the 2-second interval */
		rate = (s->kstats.inbytes - e->last_inbytes) << 4;
		e->last_inbytes = s->kstats.inbytes;
		e->inbps += ((s64)rate - (s64)e->inbps) >> 2;

		rate = (s->kstats.outbytes - e->last_outbytes) << 4;
		e->last_outbytes = s->kstats.outbytes;
		e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
		spin_unlock(&s->lock);
	}
}

static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
{
	struct ip_vs_est_tick_data *td;
	int cid;

	rcu_read_lock();
	td = rcu_dereference(kd->ticks[row]);
	if (!td)
		goto out;
	for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
		if (kthread_should_stop())
			break;
		ip_vs_chain_estimation(&td->chains[cid]);
		cond_resched_rcu();
		td = rcu_dereference(kd->ticks[row]);
		if (!td)
			break;
	}

out:
	rcu_read_unlock();
}

static int ip_vs_estimation_kthread(void *data)
{
	struct ip_vs_est_kt_data *kd = data;
	struct netns_ipvs *ipvs = kd->ipvs;
	int row = kd->est_row;
	unsigned long now;
	int id = kd->id;
	long gap;

	if (id > 0) {
		if (!ipvs->est_chain_max)
			return 0;
	} else {
		if (!ipvs->est_chain_max) {
			ipvs->est_calc_phase = 1;
			/* commit est_calc_phase before reading est_genid */
			smp_mb();
		}

		/* kthread 0 will handle the calc phase */
		if (ipvs->est_calc_phase)
			ip_vs_est_calc_phase(ipvs);
	}

	while (1) {
		if (!id && !hlist_empty(&ipvs->est_temp_list))
			ip_vs_est_drain_temp_list(ipvs);
		set_current_state(TASK_IDLE);
		if (kthread_should_stop())
			break;

		/* before estimation, check if we should sleep */
		now = jiffies;
		gap = kd->est_timer - now;
		if (gap > 0) {
			if (gap > IPVS_EST_TICK) {
				kd->est_timer = now + IPVS_EST_TICK;
				gap = IPVS_EST_TICK;
			}
			schedule_timeout(gap);
		} else {
			__set_current_state(TASK_RUNNING);
			if (gap < -8 * IPVS_EST_TICK)
				kd->est_timer = now;
		}

		if (kd->tick_len[row])
			ip_vs_tick_estimation(kd, row);

		row++;
		if (row >= IPVS_EST_NTICKS)
			row = 0;
		WRITE_ONCE(kd->est_row, row);
		kd->est_timer += IPVS_EST_TICK;
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}
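
/* Timing sketch for the loop above, assuming the usual definitions of
 * IPVS_EST_NTICKS and IPVS_EST_TICK (the 2-second period split into
 * NTICKS ticks): every wakeup handles a single row of chains and then
 * advances est_row and est_timer by one tick, so each row, and therefore
 * each estimator, is revisited once per 2-second period.  Rows with
 * tick_len == 0 are skipped cheaply, and est_timer is resynced to "now"
 * if the task fell more than 8 ticks behind.
 */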

/* Schedule stop/start for kthread tasks */
void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
{
	/* Ignore reloads before first service is added */
	if (!READ_ONCE(ipvs->enable))
		return;
	ip_vs_est_stopped_recalc(ipvs);
	/* Bump the kthread configuration genid */
	atomic_inc(&ipvs->est_genid);
	queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
}

/* Start kthread task with current configuration */
int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
			    struct ip_vs_est_kt_data *kd)
{
	unsigned long now;
	int ret = 0;
	long gap;

	lockdep_assert_held(&ipvs->est_mutex);

	if (kd->task)
		goto out;
	now = jiffies;
	gap = kd->est_timer - now;
	/* Sync est_timer if task is starting later */
	if (abs(gap) > 4 * IPVS_EST_TICK)
		kd->est_timer = now;
	kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
				  ipvs->gen, kd->id);
	if (IS_ERR(kd->task)) {
		ret = PTR_ERR(kd->task);
		kd->task = NULL;
		goto out;
	}

	set_user_nice(kd->task, sysctl_est_nice(ipvs));
	if (sysctl_est_preferred_cpulist(ipvs))
		kthread_affine_preferred(kd->task,
					 sysctl_est_preferred_cpulist(ipvs));

	pr_info("starting estimator thread %d...\n", kd->id);
	wake_up_process(kd->task);

out:
	return ret;
}

void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
{
	if (kd->task) {
		pr_info("stopping estimator thread %d...\n", kd->id);
		kthread_stop(kd->task);
		kd->task = NULL;
	}
}

/* Apply parameters to kthread */
static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
				 struct ip_vs_est_kt_data *kd)
{
	kd->chain_max = ipvs->est_chain_max;
	/* We are using single chain on RCU preemption */
	if (IPVS_EST_TICK_CHAINS == 1)
		kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
	kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
	kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
}

/* Create and start estimation kthread in a free or new array slot */
static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
{
	struct ip_vs_est_kt_data *kd = NULL;
	int id = ipvs->est_kt_count;
	int ret = -ENOMEM;
	void *arr = NULL;
	int i;

	if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
	    READ_ONCE(ipvs->enable) && ipvs->est_max_threads)
		return -EINVAL;

	mutex_lock(&ipvs->est_mutex);

	for (i = 0; i < id; i++) {
		if (!ipvs->est_kt_arr[i])
			break;
	}
	if (i >= id) {
		arr = krealloc_array(ipvs->est_kt_arr, id + 1,
				     sizeof(struct ip_vs_est_kt_data *),
				     GFP_KERNEL);
		if (!arr)
			goto out;
		ipvs->est_kt_arr = arr;
	} else {
		id = i;
	}

	kd = kzalloc(sizeof(*kd), GFP_KERNEL);
	if (!kd)
		goto out;
	kd->ipvs = ipvs;
	bitmap_fill(kd->avail, IPVS_EST_NTICKS);
	kd->est_timer = jiffies;
	kd->id = id;
	ip_vs_est_set_params(ipvs, kd);

	/* Pre-allocate stats used in calc phase */
	if (!id && !kd->calc_stats) {
		kd->calc_stats = ip_vs_stats_alloc();
		if (!kd->calc_stats)
			goto out;
	}

	/* Start kthread tasks only when services are present */
	if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) {
		ret = ip_vs_est_kthread_start(ipvs, kd);
		if (ret < 0)
			goto out;
	}

	if (arr)
		ipvs->est_kt_count++;
	ipvs->est_kt_arr[id] = kd;
	kd = NULL;
	/* Use most recent kthread for new ests */
	ipvs->est_add_ktid = id;
	ret = 0;

out:
	mutex_unlock(&ipvs->est_mutex);
	if (kd) {
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}

	return ret;
}
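
/* Capacity sketch based on ip_vs_est_set_params() above, with purely
 * illustrative numbers: if the calc phase picks est_chain_max = 100,
 * then tick_max = IPVS_EST_TICK_CHAINS * 100 estimators fit into one
 * tick and est_max_count = IPVS_EST_NTICKS * tick_max fit into one
 * kthread; ip_vs_enqueue_estimator() switches to another (possibly new)
 * kthread context only after the current one reaches est_max_count,
 * bounded by est_max_threads.
 */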

/* Select ktid where to add new ests: available, unused or new slot */
static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
{
	int ktid, best = ipvs->est_kt_count;
	struct ip_vs_est_kt_data *kd;

	for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
		kd = ipvs->est_kt_arr[ktid];
		if (kd) {
			if (kd->est_count < kd->est_max_count) {
				best = ktid;
				break;
			}
		} else if (ktid < best) {
			best = ktid;
		}
	}
	ipvs->est_add_ktid = best;
}

/* Add estimator to current kthread (est_add_ktid) */
static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
				   struct ip_vs_estimator *est)
{
	struct ip_vs_est_kt_data *kd = NULL;
	struct ip_vs_est_tick_data *td;
	int ktid, row, crow, cid, ret;
	int delay = est->ktrow;

	BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
			 "Too many chains for ktcid");

	if (ipvs->est_add_ktid < ipvs->est_kt_count) {
		kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
		if (kd)
			goto add_est;
	}

	ret = ip_vs_est_add_kthread(ipvs);
	if (ret < 0)
		goto out;
	kd = ipvs->est_kt_arr[ipvs->est_add_ktid];

add_est:
	ktid = kd->id;
	/* For a small number of estimators prefer to use few ticks,
	 * otherwise try to add into the last estimated row.
	 * est_row and add_row point after the row we should use.
	 */
	if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
		crow = READ_ONCE(kd->est_row);
	else
		crow = kd->add_row;
	crow += delay;
	if (crow >= IPVS_EST_NTICKS)
		crow -= IPVS_EST_NTICKS;
	/* Assume initial delay ? */
	if (delay >= IPVS_EST_NTICKS - 1) {
		/* Preserve initial delay or decrease it if no space in tick */
		row = crow;
		if (crow < IPVS_EST_NTICKS - 1) {
			crow++;
			row = find_last_bit(kd->avail, crow);
		}
		if (row >= crow)
			row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
	} else {
		/* Preserve delay or increase it if no space in tick */
		row = IPVS_EST_NTICKS;
		if (crow > 0)
			row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
		if (row >= IPVS_EST_NTICKS)
			row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
	}

	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td) {
		td = kzalloc(sizeof(*td), GFP_KERNEL);
		if (!td) {
			ret = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(kd->ticks[row], td);
	}

	cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);

	kd->est_count++;
	kd->tick_len[row]++;
	if (!td->chain_len[cid])
		__set_bit(cid, td->present);
	td->chain_len[cid]++;
	est->ktid = ktid;
	est->ktrow = row;
	est->ktcid = cid;
	hlist_add_head_rcu(&est->list, &td->chains[cid]);

	if (td->chain_len[cid] >= kd->chain_max) {
		__set_bit(cid, td->full);
		if (kd->tick_len[row] >= kd->tick_max)
			__clear_bit(row, kd->avail);
	}

	/* Update est_add_ktid to point to first available/empty kt slot */
	if (kd->est_count == kd->est_max_count)
		ip_vs_est_update_ktid(ipvs);

	ret = 0;

out:
	return ret;
}
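
/* Illustrative placement example (made-up values): assume IPVS_EST_NTICKS
 * is 50 and a new estimator arrives with est->ktrow == 2, i.e. a requested
 * delay of two ticks.  If kd->est_row is currently 10, the preferred row
 * is crow = 12; if that tick is already full (bit 12 cleared in kd->avail),
 * find_next_bit() moves the estimator to the next available row, increasing
 * its delay instead of overloading a tick.  Within the row,
 * find_first_zero_bit(td->full, ...) picks the first chain that has not yet
 * reached chain_max, and the estimator is then addressed by the triple
 * (est->ktid, est->ktrow, est->ktcid).
 */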

/* Start estimation for stats */
int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	int ret;

	if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable))
		ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);

	est->ktid = -1;
	est->ktrow = IPVS_EST_NTICKS - 1;	/* Initial delay */

	/* We prefer this code to be short, kthread 0 will requeue the
	 * estimator to available chain. If tasks are disabled, we
	 * will not allocate much memory, just for kt 0.
	 */
	ret = 0;
	if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
		ret = ip_vs_est_add_kthread(ipvs);
	if (ret >= 0)
		hlist_add_head(&est->list, &ipvs->est_temp_list);
	else
		INIT_HLIST_NODE(&est->list);
	return ret;
}

static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
{
	if (kd) {
		if (kd->task) {
			pr_info("stop unused estimator thread %d...\n",
				kd->id);
			kthread_stop(kd->task);
		}
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}
}

/* Unlink estimator from chain */
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	int ktid = est->ktid;
	int row = est->ktrow;
	int cid = est->ktcid;

	/* Failed to add to chain ? */
	if (hlist_unhashed(&est->list))
		return;

	/* On return, estimator can be freed, dequeue it now */

	/* In est_temp_list ? */
	if (ktid < 0) {
		hlist_del(&est->list);
		goto end_kt0;
	}

	hlist_del_rcu(&est->list);
	kd = ipvs->est_kt_arr[ktid];
	td = rcu_dereference_protected(kd->ticks[row], 1);
	__clear_bit(cid, td->full);
	td->chain_len[cid]--;
	if (!td->chain_len[cid])
		__clear_bit(cid, td->present);
	kd->tick_len[row]--;
	__set_bit(row, kd->avail);
	if (!kd->tick_len[row]) {
		RCU_INIT_POINTER(kd->ticks[row], NULL);
		kfree_rcu(td, rcu_head);
	}
	kd->est_count--;
	if (kd->est_count) {
		/* This kt slot can become available just now, prefer it */
		if (ktid < ipvs->est_add_ktid)
			ipvs->est_add_ktid = ktid;
		return;
	}

	if (ktid > 0) {
		mutex_lock(&ipvs->est_mutex);
		ip_vs_est_kthread_destroy(kd);
		ipvs->est_kt_arr[ktid] = NULL;
		if (ktid == ipvs->est_kt_count - 1) {
			ipvs->est_kt_count--;
			while (ipvs->est_kt_count > 1 &&
			       !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
				ipvs->est_kt_count--;
		}
		mutex_unlock(&ipvs->est_mutex);

		/* This slot is now empty, prefer another available kt slot */
		if (ktid == ipvs->est_add_ktid)
			ip_vs_est_update_ktid(ipvs);
	}

end_kt0:
	/* kt 0 is freed after all other kthreads and chains are empty */
	if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
		kd = ipvs->est_kt_arr[0];
		if (!kd || !kd->est_count) {
			mutex_lock(&ipvs->est_mutex);
			if (kd) {
				ip_vs_est_kthread_destroy(kd);
				ipvs->est_kt_arr[0] = NULL;
			}
			ipvs->est_kt_count--;
			mutex_unlock(&ipvs->est_mutex);
			ipvs->est_add_ktid = 0;
		}
	}
}

/* Register all ests from est_temp_list to kthreads */
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
{
	struct ip_vs_estimator *est;

	while (1) {
		int max = 16;

		mutex_lock(&__ip_vs_mutex);

		while (max-- > 0) {
			est = hlist_entry_safe(ipvs->est_temp_list.first,
					       struct ip_vs_estimator, list);
			if (est) {
				if (kthread_should_stop())
					goto unlock;
				hlist_del_init(&est->list);
				if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
					continue;
				est->ktid = -1;
				hlist_add_head(&est->list,
					       &ipvs->est_temp_list);
				/* Abort, some entries will not be estimated
				 * until next attempt
				 */
			}
			goto unlock;
		}
		mutex_unlock(&__ip_vs_mutex);
		cond_resched();
	}

unlock:
	mutex_unlock(&__ip_vs_mutex);
}
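
/* Estimator life cycle, in short: ip_vs_start_estimator() only links the
 * estimator into est_temp_list with ktid == -1 and the maximum initial
 * delay; kthread 0 later calls ip_vs_est_drain_temp_list(), which moves
 * entries in small batches into a kthread's tick/chain structures via
 * ip_vs_enqueue_estimator(); ip_vs_stop_estimator() unlinks the estimator
 * from whichever of the two places it currently lives in.
 */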

/* Calculate limits for all kthreads */
static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
{
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
	struct ip_vs_est_kt_data *kd;
	struct hlist_head chain;
	struct ip_vs_stats *s;
	int cache_factor = 4;
	int i, loops, ntest;
	s32 min_est = 0;
	ktime_t t1, t2;
	int max = 8;
	int ret = 1;
	s64 diff;
	u64 val;

	INIT_HLIST_HEAD(&chain);
	mutex_lock(&__ip_vs_mutex);
	kd = ipvs->est_kt_arr[0];
	mutex_unlock(&__ip_vs_mutex);
	s = kd ? kd->calc_stats : NULL;
	if (!s)
		goto out;
	hlist_add_head(&s->est.list, &chain);

	loops = 1;
	/* Get best result from many tests */
	for (ntest = 0; ntest < 12; ntest++) {
		if (!(ntest & 3)) {
			/* Wait for cpufreq frequency transition */
			wait_event_idle_timeout(wq, kthread_should_stop(),
						HZ / 50);
			if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
				goto stop;
		}

		local_bh_disable();
		rcu_read_lock();

		/* Put stats in cache */
		ip_vs_chain_estimation(&chain);

		t1 = ktime_get();
		for (i = loops * cache_factor; i > 0; i--)
			ip_vs_chain_estimation(&chain);
		t2 = ktime_get();

		rcu_read_unlock();
		local_bh_enable();

		if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
			goto stop;
		cond_resched();

		diff = ktime_to_ns(ktime_sub(t2, t1));
		if (diff <= 1 * NSEC_PER_USEC) {
			/* Do more loops on low time resolution */
			loops *= 2;
			continue;
		}
		if (diff >= NSEC_PER_SEC)
			continue;
		val = diff;
		do_div(val, loops);
		if (!min_est || val < min_est) {
			min_est = val;
			/* goal: 95usec per chain */
			val = 95 * NSEC_PER_USEC;
			if (val >= min_est) {
				do_div(val, min_est);
				max = (int)val;
			} else {
				max = 1;
			}
		}
	}

out:
	if (s)
		hlist_del_init(&s->est.list);
	*chain_max = max;
	return ret;

stop:
	ret = 0;
	goto out;
}
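
/* Worked example for the "goal: 95usec per chain" calculation above, with
 * made-up timings: the calibration loop estimates a single-entry chain
 * loops * cache_factor times, so min_est ends up roughly cache_factor
 * times the cost of one estimation.  If the best measurement gives
 * min_est = 500ns, then
 *
 *	chain_max = 95 * NSEC_PER_USEC / min_est = 95000 / 500 = 190
 *
 * i.e. a chain of up to ~190 estimators should still be processed in
 * about 95 microseconds per tick.
 */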

/* Calculate the parameters and apply them in context of kt #0
 * ECP: est_calc_phase
 * ECM: est_chain_max
 * ECP	ECM	Insert Chain	enable	Description
 * ---------------------------------------------------------------------------
 * 0	0	est_temp_list	0	create kt #0 context
 * 0	0	est_temp_list	0->1	service added, start kthread #0 task
 * 0->1	0	est_temp_list	1	kt task #0 started, enters calc phase
 * 1	0	est_temp_list	1	kt #0: determine est_chain_max,
 *					stop tasks, move ests to est_temp_list
 *					and free kd for kthreads 1..last
 * 1->0	0->N	kt chains	1	ests can go to kthreads
 * 0	N	kt chains	1	drain est_temp_list, create new kthread
 *					contexts, start tasks, estimate
 */
static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
{
	int genid = atomic_read(&ipvs->est_genid);
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	struct ip_vs_estimator *est;
	struct ip_vs_stats *stats;
	int id, row, cid, delay;
	bool last, last_td;
	int chain_max;
	int step;

	if (!ip_vs_est_calc_limits(ipvs, &chain_max))
		return;

	mutex_lock(&__ip_vs_mutex);

	/* Stop all other tasks, so that we can immediately move the
	 * estimators to est_temp_list without RCU grace period
	 */
	mutex_lock(&ipvs->est_mutex);
	for (id = 1; id < ipvs->est_kt_count; id++) {
		/* netns clean up started, abort */
		if (!READ_ONCE(ipvs->enable))
			goto unlock2;
		kd = ipvs->est_kt_arr[id];
		if (!kd)
			continue;
		ip_vs_est_kthread_stop(kd);
	}
	mutex_unlock(&ipvs->est_mutex);

	/* Move all estimators to est_temp_list, but carefully: all
	 * estimators and kthread data can be released while we
	 * reschedule, even for kthread 0.
	 */
	step = 0;

	/* Order entries in est_temp_list by ascending delay, so walk
	 * delay (desc), id (desc), cid (asc)
	 */
	delay = IPVS_EST_NTICKS;

next_delay:
	delay--;
	if (delay < 0)
		goto end_dequeue;

last_kt:
	/* Destroy contexts backwards */
	id = ipvs->est_kt_count;

next_kt:
	if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
		goto unlock;
	id--;
	if (id < 0)
		goto next_delay;
	kd = ipvs->est_kt_arr[id];
	if (!kd)
		goto next_kt;
	/* kt 0 can exist with empty chains */
	if (!id && kd->est_count <= 1)
		goto next_delay;

	row = kd->est_row + delay;
	if (row >= IPVS_EST_NTICKS)
		row -= IPVS_EST_NTICKS;
	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td)
		goto next_kt;

	cid = 0;

walk_chain:
	if (kthread_should_stop())
		goto unlock;
	step++;
	if (!(step & 63)) {
		/* Give estimators a chance to be added (to est_temp_list)
		 * and deleted (releasing kthread contexts)
		 */
		mutex_unlock(&__ip_vs_mutex);
		cond_resched();
		mutex_lock(&__ip_vs_mutex);

		/* Current kt released ? */
		if (id >= ipvs->est_kt_count)
			goto last_kt;
		if (kd != ipvs->est_kt_arr[id])
			goto next_kt;
		/* Current td released ? */
		if (td != rcu_dereference_protected(kd->ticks[row], 1))
			goto next_kt;
		/* No fatal changes on the current kd and td */
	}
	est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
			       list);
	if (!est) {
		cid++;
		if (cid >= IPVS_EST_TICK_CHAINS)
			goto next_kt;
		goto walk_chain;
	}
	/* We can cheat and increase est_count to protect kt 0 context
	 * from release but we prefer to keep the last estimator
	 */
	last = kd->est_count <= 1;
	/* Do not free kt #0 data */
	if (!id && last)
		goto next_delay;
	last_td = kd->tick_len[row] <= 1;
	stats = container_of(est, struct ip_vs_stats, est);
	ip_vs_stop_estimator(ipvs, stats);
	/* Tasks are stopped, move without RCU grace period */
	est->ktid = -1;
	est->ktrow = row - kd->est_row;
	if (est->ktrow < 0)
		est->ktrow += IPVS_EST_NTICKS;
	hlist_add_head(&est->list, &ipvs->est_temp_list);
	/* kd freed ? */
	if (last)
		goto next_kt;
	/* td freed ? */
	if (last_td)
		goto next_kt;
	goto walk_chain;

end_dequeue:
	/* All estimators removed while calculating ? */
	if (!ipvs->est_kt_count)
		goto unlock;
	kd = ipvs->est_kt_arr[0];
	if (!kd)
		goto unlock;
	kd->add_row = kd->est_row;
	ipvs->est_chain_max = chain_max;
	ip_vs_est_set_params(ipvs, kd);

	pr_info("using max %d ests per chain, %d per kthread\n",
		kd->chain_max, kd->est_max_count);

	/* Try to keep tot_stats in kt0, enqueue it early */
	if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
	    ipvs->tot_stats->s.est.ktid == -1) {
		hlist_del(&ipvs->tot_stats->s.est.list);
		hlist_add_head(&ipvs->tot_stats->s.est.list,
			       &ipvs->est_temp_list);
	}

	mutex_lock(&ipvs->est_mutex);

	/* We completed the calc phase, new calc phase not requested */
	if (genid == atomic_read(&ipvs->est_genid))
		ipvs->est_calc_phase = 0;

unlock2:
	mutex_unlock(&ipvs->est_mutex);

unlock:
	mutex_unlock(&__ip_vs_mutex);
}

void ip_vs_zero_estimator(struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	struct ip_vs_kstats *k = &stats->kstats;

	/* reset counters, caller must hold stats->lock */
	est->last_inbytes = k->inbytes;
	est->last_outbytes = k->outbytes;
	est->last_conns = k->conns;
	est->last_inpkts = k->inpkts;
	est->last_outpkts = k->outpkts;
	est->cps = 0;
	est->inpps = 0;
	est->outpps = 0;
	est->inbps = 0;
	est->outbps = 0;
}

/* Get decoded rates */
void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *e = &stats->est;

	dst->cps = (e->cps + 0x1FF) >> 10;
	dst->inpps = (e->inpps + 0x1FF) >> 10;
	dst->outpps = (e->outpps + 0x1FF) >> 10;
	dst->inbps = (e->inbps + 0xF) >> 5;
	dst->outbps = (e->outbps + 0xF) >> 5;
}

int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
	INIT_HLIST_HEAD(&ipvs->est_temp_list);
	ipvs->est_kt_arr = NULL;
	ipvs->est_max_threads = 0;
	ipvs->est_calc_phase = 0;
	ipvs->est_chain_max = 0;
	ipvs->est_kt_count = 0;
	ipvs->est_add_ktid = 0;
	atomic_set(&ipvs->est_genid, 0);
	atomic_set(&ipvs->est_genid_done, 0);
	__mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
	return 0;
}

void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
	int i;

	for (i = 0; i < ipvs->est_kt_count; i++)
		ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
	kfree(ipvs->est_kt_arr);
	mutex_destroy(&ipvs->est_mutex);
}