1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * ip_vs_est.c: simple rate estimator for IPVS
4 *
5 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
6 *
7 * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
8 * Network name space (netns) aware.
9 * Global data moved to netns i.e struct netns_ipvs
10 * Affected data: est_list and est_lock.
11 * estimation_timer() runs with timer per netns.
 *              get_stats() does the per-cpu summing.
13 */
14
/* Message prefix: pr_fmt() prepends "IPVS: " to the format strings of the
 * pr_info() calls in this file
 */
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
17
18 #include <linux/kernel.h>
19 #include <linux/jiffies.h>
20 #include <linux/types.h>
21 #include <linux/interrupt.h>
22 #include <linux/sysctl.h>
23 #include <linux/list.h>
24 #include <linux/rcupdate_wait.h>
25
26 #include <net/ip_vs.h>
27
28 /*
  This code estimates rates over a short interval (such as 8 seconds)
  for virtual services and real servers. To measure rates over a long
  interval, it is easy to implement a user level daemon which
  periodically reads those statistical counters and measures the rate.
33
34 We measure rate during the last 8 seconds every 2 seconds:
35
36 avgrate = avgrate*(1-W) + rate*W
37
38 where W = 2^(-2)
39
40 NOTES.
41
42 * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10.
43
44 * Netlink users can see 64-bit values but sockopt users are restricted
45 to 32-bit values for conns, packets, bps, cps and pps.
46
47 * A lot of code is taken from net/core/gen_estimator.c
48
49 KEY POINTS:
50 - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled
51 - kthreads read the cpustats to update the estimators (svcs, dests, total)
52 - the states of estimators can be read (get stats) or modified (zero stats)
53 from processes
54
55 KTHREADS:
56 - estimators are added initially to est_temp_list and later kthread 0
57 distributes them to one or many kthreads for estimation
58 - kthread contexts are created and attached to array
59 - the kthread tasks are started when first service is added, before that
60 the total stats are not estimated
61 - when configuration (cpulist/nice) is changed, the tasks are restarted
62 by work (est_reload_work)
63 - kthread tasks are stopped while the cpulist is empty
64 - the kthread context holds lists with estimators (chains) which are
65 processed every 2 seconds
  - as estimators can be added dynamically and in bursts, we try to spread
    them to multiple chains which are estimated at different times
68 - on start, kthread 0 enters calculation phase to determine the chain limits
69 and the limit of estimators per kthread
70 - est_add_ktid: ktid where to add new ests, can point to empty slot where
71 we should add kt data
72 */
73
/* Lock class key handed to __mutex_init() for the per-netns est_mutex */
static struct lock_class_key __ipvs_est_key;

/* Forward declarations, both are called from ip_vs_estimation_kthread() */
static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);
78
ip_vs_chain_estimation(struct hlist_head * chain)79 static void ip_vs_chain_estimation(struct hlist_head *chain)
80 {
81 struct ip_vs_estimator *e;
82 struct ip_vs_cpu_stats *c;
83 struct ip_vs_stats *s;
84 u64 rate;
85
86 hlist_for_each_entry_rcu(e, chain, list) {
87 u64 conns, inpkts, outpkts, inbytes, outbytes;
88 u64 kconns = 0, kinpkts = 0, koutpkts = 0;
89 u64 kinbytes = 0, koutbytes = 0;
90 unsigned int start;
91 int i;
92
93 if (kthread_should_stop())
94 break;
95
96 s = container_of(e, struct ip_vs_stats, est);
97 for_each_possible_cpu(i) {
98 c = per_cpu_ptr(s->cpustats, i);
99 do {
100 start = u64_stats_fetch_begin(&c->syncp);
101 conns = u64_stats_read(&c->cnt.conns);
102 inpkts = u64_stats_read(&c->cnt.inpkts);
103 outpkts = u64_stats_read(&c->cnt.outpkts);
104 inbytes = u64_stats_read(&c->cnt.inbytes);
105 outbytes = u64_stats_read(&c->cnt.outbytes);
106 } while (u64_stats_fetch_retry(&c->syncp, start));
107 kconns += conns;
108 kinpkts += inpkts;
109 koutpkts += outpkts;
110 kinbytes += inbytes;
111 koutbytes += outbytes;
112 }
113
114 spin_lock(&s->lock);
115
116 s->kstats.conns = kconns;
117 s->kstats.inpkts = kinpkts;
118 s->kstats.outpkts = koutpkts;
119 s->kstats.inbytes = kinbytes;
120 s->kstats.outbytes = koutbytes;
121
122 /* scaled by 2^10, but divided 2 seconds */
123 rate = (s->kstats.conns - e->last_conns) << 9;
124 e->last_conns = s->kstats.conns;
125 e->cps += ((s64)rate - (s64)e->cps) >> 2;
126
127 rate = (s->kstats.inpkts - e->last_inpkts) << 9;
128 e->last_inpkts = s->kstats.inpkts;
129 e->inpps += ((s64)rate - (s64)e->inpps) >> 2;
130
131 rate = (s->kstats.outpkts - e->last_outpkts) << 9;
132 e->last_outpkts = s->kstats.outpkts;
133 e->outpps += ((s64)rate - (s64)e->outpps) >> 2;
134
135 /* scaled by 2^5, but divided 2 seconds */
136 rate = (s->kstats.inbytes - e->last_inbytes) << 4;
137 e->last_inbytes = s->kstats.inbytes;
138 e->inbps += ((s64)rate - (s64)e->inbps) >> 2;
139
140 rate = (s->kstats.outbytes - e->last_outbytes) << 4;
141 e->last_outbytes = s->kstats.outbytes;
142 e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
143 spin_unlock(&s->lock);
144 }
145 }
146
ip_vs_tick_estimation(struct ip_vs_est_kt_data * kd,int row)147 static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
148 {
149 struct ip_vs_est_tick_data *td;
150 int cid;
151
152 rcu_read_lock();
153 td = rcu_dereference(kd->ticks[row]);
154 if (!td)
155 goto out;
156 for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
157 if (kthread_should_stop())
158 break;
159 ip_vs_chain_estimation(&td->chains[cid]);
160 cond_resched_rcu();
161 td = rcu_dereference(kd->ticks[row]);
162 if (!td)
163 break;
164 }
165
166 out:
167 rcu_read_unlock();
168 }
169
ip_vs_estimation_kthread(void * data)170 static int ip_vs_estimation_kthread(void *data)
171 {
172 struct ip_vs_est_kt_data *kd = data;
173 struct netns_ipvs *ipvs = kd->ipvs;
174 int row = kd->est_row;
175 unsigned long now;
176 int id = kd->id;
177 long gap;
178
179 if (id > 0) {
180 if (!ipvs->est_chain_max)
181 return 0;
182 } else {
183 if (!ipvs->est_chain_max) {
184 ipvs->est_calc_phase = 1;
185 /* commit est_calc_phase before reading est_genid */
186 smp_mb();
187 }
188
189 /* kthread 0 will handle the calc phase */
190 if (ipvs->est_calc_phase)
191 ip_vs_est_calc_phase(ipvs);
192 }
193
194 while (1) {
195 if (!id && !hlist_empty(&ipvs->est_temp_list))
196 ip_vs_est_drain_temp_list(ipvs);
197 set_current_state(TASK_IDLE);
198 if (kthread_should_stop())
199 break;
200
201 /* before estimation, check if we should sleep */
202 now = jiffies;
203 gap = kd->est_timer - now;
204 if (gap > 0) {
205 if (gap > IPVS_EST_TICK) {
206 kd->est_timer = now - IPVS_EST_TICK;
207 gap = IPVS_EST_TICK;
208 }
209 schedule_timeout(gap);
210 } else {
211 __set_current_state(TASK_RUNNING);
212 if (gap < -8 * IPVS_EST_TICK)
213 kd->est_timer = now;
214 }
215
216 if (kd->tick_len[row])
217 ip_vs_tick_estimation(kd, row);
218
219 row++;
220 if (row >= IPVS_EST_NTICKS)
221 row = 0;
222 WRITE_ONCE(kd->est_row, row);
223 kd->est_timer += IPVS_EST_TICK;
224 }
225 __set_current_state(TASK_RUNNING);
226
227 return 0;
228 }
229
230 /* Schedule stop/start for kthread tasks */
ip_vs_est_reload_start(struct netns_ipvs * ipvs)231 void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
232 {
233 /* Ignore reloads before first service is added */
234 if (!ipvs->enable)
235 return;
236 ip_vs_est_stopped_recalc(ipvs);
237 /* Bump the kthread configuration genid */
238 atomic_inc(&ipvs->est_genid);
239 queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
240 }
241
242 /* Start kthread task with current configuration */
ip_vs_est_kthread_start(struct netns_ipvs * ipvs,struct ip_vs_est_kt_data * kd)243 int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
244 struct ip_vs_est_kt_data *kd)
245 {
246 unsigned long now;
247 int ret = 0;
248 long gap;
249
250 lockdep_assert_held(&ipvs->est_mutex);
251
252 if (kd->task)
253 goto out;
254 now = jiffies;
255 gap = kd->est_timer - now;
256 /* Sync est_timer if task is starting later */
257 if (abs(gap) > 4 * IPVS_EST_TICK)
258 kd->est_timer = now;
259 kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
260 ipvs->gen, kd->id);
261 if (IS_ERR(kd->task)) {
262 ret = PTR_ERR(kd->task);
263 kd->task = NULL;
264 goto out;
265 }
266
267 set_user_nice(kd->task, sysctl_est_nice(ipvs));
268 set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));
269
270 pr_info("starting estimator thread %d...\n", kd->id);
271 wake_up_process(kd->task);
272
273 out:
274 return ret;
275 }
276
ip_vs_est_kthread_stop(struct ip_vs_est_kt_data * kd)277 void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
278 {
279 if (kd->task) {
280 pr_info("stopping estimator thread %d...\n", kd->id);
281 kthread_stop(kd->task);
282 kd->task = NULL;
283 }
284 }
285
286 /* Apply parameters to kthread */
ip_vs_est_set_params(struct netns_ipvs * ipvs,struct ip_vs_est_kt_data * kd)287 static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
288 struct ip_vs_est_kt_data *kd)
289 {
290 kd->chain_max = ipvs->est_chain_max;
291 /* We are using single chain on RCU preemption */
292 if (IPVS_EST_TICK_CHAINS == 1)
293 kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
294 kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
295 kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
296 }
297
298 /* Create and start estimation kthread in a free or new array slot */
ip_vs_est_add_kthread(struct netns_ipvs * ipvs)299 static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
300 {
301 struct ip_vs_est_kt_data *kd = NULL;
302 int id = ipvs->est_kt_count;
303 int ret = -ENOMEM;
304 void *arr = NULL;
305 int i;
306
307 if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
308 ipvs->enable && ipvs->est_max_threads)
309 return -EINVAL;
310
311 mutex_lock(&ipvs->est_mutex);
312
313 for (i = 0; i < id; i++) {
314 if (!ipvs->est_kt_arr[i])
315 break;
316 }
317 if (i >= id) {
318 arr = krealloc_array(ipvs->est_kt_arr, id + 1,
319 sizeof(struct ip_vs_est_kt_data *),
320 GFP_KERNEL);
321 if (!arr)
322 goto out;
323 ipvs->est_kt_arr = arr;
324 } else {
325 id = i;
326 }
327
328 kd = kzalloc(sizeof(*kd), GFP_KERNEL);
329 if (!kd)
330 goto out;
331 kd->ipvs = ipvs;
332 bitmap_fill(kd->avail, IPVS_EST_NTICKS);
333 kd->est_timer = jiffies;
334 kd->id = id;
335 ip_vs_est_set_params(ipvs, kd);
336
337 /* Pre-allocate stats used in calc phase */
338 if (!id && !kd->calc_stats) {
339 kd->calc_stats = ip_vs_stats_alloc();
340 if (!kd->calc_stats)
341 goto out;
342 }
343
344 /* Start kthread tasks only when services are present */
345 if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
346 ret = ip_vs_est_kthread_start(ipvs, kd);
347 if (ret < 0)
348 goto out;
349 }
350
351 if (arr)
352 ipvs->est_kt_count++;
353 ipvs->est_kt_arr[id] = kd;
354 kd = NULL;
355 /* Use most recent kthread for new ests */
356 ipvs->est_add_ktid = id;
357 ret = 0;
358
359 out:
360 mutex_unlock(&ipvs->est_mutex);
361 if (kd) {
362 ip_vs_stats_free(kd->calc_stats);
363 kfree(kd);
364 }
365
366 return ret;
367 }
368
369 /* Select ktid where to add new ests: available, unused or new slot */
ip_vs_est_update_ktid(struct netns_ipvs * ipvs)370 static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
371 {
372 int ktid, best = ipvs->est_kt_count;
373 struct ip_vs_est_kt_data *kd;
374
375 for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
376 kd = ipvs->est_kt_arr[ktid];
377 if (kd) {
378 if (kd->est_count < kd->est_max_count) {
379 best = ktid;
380 break;
381 }
382 } else if (ktid < best) {
383 best = ktid;
384 }
385 }
386 ipvs->est_add_ktid = best;
387 }
388
/* Add estimator to current kthread (est_add_ktid)
 *
 * Links @est into one chain of one tick (row) of the kthread context
 * selected by ipvs->est_add_ktid. If that slot is empty or beyond
 * est_kt_count, a new context is created with ip_vs_est_add_kthread().
 * On entry est->ktrow holds the desired delay in ticks. On success
 * est->ktid, est->ktrow and est->ktcid tell where the estimator is linked.
 *
 * Returns 0 on success or a negative value: the error from
 * ip_vs_est_add_kthread() or -ENOMEM if the tick data can not be allocated.
 *
 * NOTE(review): the row and cid results of the bitmap searches are used
 * without range checks; presumably est_add_ktid always selects a context
 * with free space - confirm.
 */
static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
				   struct ip_vs_estimator *est)
{
	struct ip_vs_est_kt_data *kd = NULL;
	struct ip_vs_est_tick_data *td;
	int ktid, row, crow, cid, ret;
	int delay = est->ktrow;

	BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
			 "Too many chains for ktcid");

	/* Use the existing context, if est_add_ktid points to one */
	if (ipvs->est_add_ktid < ipvs->est_kt_count) {
		kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
		if (kd)
			goto add_est;
	}

	/* On success, est_add_ktid is set to the slot of the new context */
	ret = ip_vs_est_add_kthread(ipvs);
	if (ret < 0)
		goto out;
	kd = ipvs->est_kt_arr[ipvs->est_add_ktid];

add_est:
	ktid = kd->id;
	/* For small number of estimators prefer to use few ticks,
	 * otherwise try to add into the last estimated row.
	 * est_row and add_row point after the row we should use
	 */
	if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
		crow = READ_ONCE(kd->est_row);
	else
		crow = kd->add_row;
	/* Apply the delay to the base row, wrapping around the rows */
	crow += delay;
	if (crow >= IPVS_EST_NTICKS)
		crow -= IPVS_EST_NTICKS;
	/* Assume initial delay ? */
	if (delay >= IPVS_EST_NTICKS - 1) {
		/* Preserve initial delay or decrease it if no space in tick */
		row = crow;
		if (crow < IPVS_EST_NTICKS - 1) {
			crow++;
			row = find_last_bit(kd->avail, crow);
		}
		/* Nothing available up to crow, search all rows */
		if (row >= crow)
			row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
	} else {
		/* Preserve delay or increase it if no space in tick */
		row = IPVS_EST_NTICKS;
		if (crow > 0)
			row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
		/* Nothing available from crow on, search from the first row */
		if (row >= IPVS_EST_NTICKS)
			row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
	}

	/* Tick data is allocated when the row gets its first estimator */
	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td) {
		td = kzalloc(sizeof(*td), GFP_KERNEL);
		if (!td) {
			ret = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(kd->ticks[row], td);
	}

	/* Use the first chain that is not marked as full */
	cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);

	/* Account the estimator in the kthread, tick and chain counters */
	kd->est_count++;
	kd->tick_len[row]++;
	if (!td->chain_len[cid])
		__set_bit(cid, td->present);
	td->chain_len[cid]++;
	/* Remember the position, ip_vs_stop_estimator() uses it to unlink */
	est->ktid = ktid;
	est->ktrow = row;
	est->ktcid = cid;
	hlist_add_head_rcu(&est->list, &td->chains[cid]);

	/* Chain reached its limit: mark it as full and, when the tick
	 * reached its limit too, make the row unavailable
	 */
	if (td->chain_len[cid] >= kd->chain_max) {
		__set_bit(cid, td->full);
		if (kd->tick_len[row] >= kd->tick_max)
			__clear_bit(row, kd->avail);
	}

	/* Update est_add_ktid to point to first available/empty kt slot */
	if (kd->est_count == kd->est_max_count)
		ip_vs_est_update_ktid(ipvs);

	ret = 0;

out:
	return ret;
}
481
482 /* Start estimation for stats */
ip_vs_start_estimator(struct netns_ipvs * ipvs,struct ip_vs_stats * stats)483 int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
484 {
485 struct ip_vs_estimator *est = &stats->est;
486 int ret;
487
488 if (!ipvs->est_max_threads && ipvs->enable)
489 ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
490
491 est->ktid = -1;
492 est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */
493
494 /* We prefer this code to be short, kthread 0 will requeue the
495 * estimator to available chain. If tasks are disabled, we
496 * will not allocate much memory, just for kt 0.
497 */
498 ret = 0;
499 if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
500 ret = ip_vs_est_add_kthread(ipvs);
501 if (ret >= 0)
502 hlist_add_head(&est->list, &ipvs->est_temp_list);
503 else
504 INIT_HLIST_NODE(&est->list);
505 return ret;
506 }
507
ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data * kd)508 static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
509 {
510 if (kd) {
511 if (kd->task) {
512 pr_info("stop unused estimator thread %d...\n", kd->id);
513 kthread_stop(kd->task);
514 }
515 ip_vs_stats_free(kd->calc_stats);
516 kfree(kd);
517 }
518 }
519
520 /* Unlink estimator from chain */
ip_vs_stop_estimator(struct netns_ipvs * ipvs,struct ip_vs_stats * stats)521 void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
522 {
523 struct ip_vs_estimator *est = &stats->est;
524 struct ip_vs_est_tick_data *td;
525 struct ip_vs_est_kt_data *kd;
526 int ktid = est->ktid;
527 int row = est->ktrow;
528 int cid = est->ktcid;
529
530 /* Failed to add to chain ? */
531 if (hlist_unhashed(&est->list))
532 return;
533
534 /* On return, estimator can be freed, dequeue it now */
535
536 /* In est_temp_list ? */
537 if (ktid < 0) {
538 hlist_del(&est->list);
539 goto end_kt0;
540 }
541
542 hlist_del_rcu(&est->list);
543 kd = ipvs->est_kt_arr[ktid];
544 td = rcu_dereference_protected(kd->ticks[row], 1);
545 __clear_bit(cid, td->full);
546 td->chain_len[cid]--;
547 if (!td->chain_len[cid])
548 __clear_bit(cid, td->present);
549 kd->tick_len[row]--;
550 __set_bit(row, kd->avail);
551 if (!kd->tick_len[row]) {
552 RCU_INIT_POINTER(kd->ticks[row], NULL);
553 kfree_rcu(td, rcu_head);
554 }
555 kd->est_count--;
556 if (kd->est_count) {
557 /* This kt slot can become available just now, prefer it */
558 if (ktid < ipvs->est_add_ktid)
559 ipvs->est_add_ktid = ktid;
560 return;
561 }
562
563 if (ktid > 0) {
564 mutex_lock(&ipvs->est_mutex);
565 ip_vs_est_kthread_destroy(kd);
566 ipvs->est_kt_arr[ktid] = NULL;
567 if (ktid == ipvs->est_kt_count - 1) {
568 ipvs->est_kt_count--;
569 while (ipvs->est_kt_count > 1 &&
570 !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
571 ipvs->est_kt_count--;
572 }
573 mutex_unlock(&ipvs->est_mutex);
574
575 /* This slot is now empty, prefer another available kt slot */
576 if (ktid == ipvs->est_add_ktid)
577 ip_vs_est_update_ktid(ipvs);
578 }
579
580 end_kt0:
581 /* kt 0 is freed after all other kthreads and chains are empty */
582 if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
583 kd = ipvs->est_kt_arr[0];
584 if (!kd || !kd->est_count) {
585 mutex_lock(&ipvs->est_mutex);
586 if (kd) {
587 ip_vs_est_kthread_destroy(kd);
588 ipvs->est_kt_arr[0] = NULL;
589 }
590 ipvs->est_kt_count--;
591 mutex_unlock(&ipvs->est_mutex);
592 ipvs->est_add_ktid = 0;
593 }
594 }
595 }
596
597 /* Register all ests from est_temp_list to kthreads */
ip_vs_est_drain_temp_list(struct netns_ipvs * ipvs)598 static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
599 {
600 struct ip_vs_estimator *est;
601
602 while (1) {
603 int max = 16;
604
605 mutex_lock(&__ip_vs_mutex);
606
607 while (max-- > 0) {
608 est = hlist_entry_safe(ipvs->est_temp_list.first,
609 struct ip_vs_estimator, list);
610 if (est) {
611 if (kthread_should_stop())
612 goto unlock;
613 hlist_del_init(&est->list);
614 if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
615 continue;
616 est->ktid = -1;
617 hlist_add_head(&est->list,
618 &ipvs->est_temp_list);
619 /* Abort, some entries will not be estimated
620 * until next attempt
621 */
622 }
623 goto unlock;
624 }
625 mutex_unlock(&__ip_vs_mutex);
626 cond_resched();
627 }
628
629 unlock:
630 mutex_unlock(&__ip_vs_mutex);
631 }
632
633 /* Calculate limits for all kthreads */
ip_vs_est_calc_limits(struct netns_ipvs * ipvs,int * chain_max)634 static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
635 {
636 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
637 struct ip_vs_est_kt_data *kd;
638 struct hlist_head chain;
639 struct ip_vs_stats *s;
640 int cache_factor = 4;
641 int i, loops, ntest;
642 s32 min_est = 0;
643 ktime_t t1, t2;
644 int max = 8;
645 int ret = 1;
646 s64 diff;
647 u64 val;
648
649 INIT_HLIST_HEAD(&chain);
650 mutex_lock(&__ip_vs_mutex);
651 kd = ipvs->est_kt_arr[0];
652 mutex_unlock(&__ip_vs_mutex);
653 s = kd ? kd->calc_stats : NULL;
654 if (!s)
655 goto out;
656 hlist_add_head(&s->est.list, &chain);
657
658 loops = 1;
659 /* Get best result from many tests */
660 for (ntest = 0; ntest < 12; ntest++) {
661 if (!(ntest & 3)) {
662 /* Wait for cpufreq frequency transition */
663 wait_event_idle_timeout(wq, kthread_should_stop(),
664 HZ / 50);
665 if (!ipvs->enable || kthread_should_stop())
666 goto stop;
667 }
668
669 local_bh_disable();
670 rcu_read_lock();
671
672 /* Put stats in cache */
673 ip_vs_chain_estimation(&chain);
674
675 t1 = ktime_get();
676 for (i = loops * cache_factor; i > 0; i--)
677 ip_vs_chain_estimation(&chain);
678 t2 = ktime_get();
679
680 rcu_read_unlock();
681 local_bh_enable();
682
683 if (!ipvs->enable || kthread_should_stop())
684 goto stop;
685 cond_resched();
686
687 diff = ktime_to_ns(ktime_sub(t2, t1));
688 if (diff <= 1 * NSEC_PER_USEC) {
689 /* Do more loops on low time resolution */
690 loops *= 2;
691 continue;
692 }
693 if (diff >= NSEC_PER_SEC)
694 continue;
695 val = diff;
696 do_div(val, loops);
697 if (!min_est || val < min_est) {
698 min_est = val;
699 /* goal: 95usec per chain */
700 val = 95 * NSEC_PER_USEC;
701 if (val >= min_est) {
702 do_div(val, min_est);
703 max = (int)val;
704 } else {
705 max = 1;
706 }
707 }
708 }
709
710 out:
711 if (s)
712 hlist_del_init(&s->est.list);
713 *chain_max = max;
714 return ret;
715
716 stop:
717 ret = 0;
718 goto out;
719 }
720
721 /* Calculate the parameters and apply them in context of kt #0
722 * ECP: est_calc_phase
723 * ECM: est_chain_max
724 * ECP ECM Insert Chain enable Description
725 * ---------------------------------------------------------------------------
726 * 0 0 est_temp_list 0 create kt #0 context
727 * 0 0 est_temp_list 0->1 service added, start kthread #0 task
728 * 0->1 0 est_temp_list 1 kt task #0 started, enters calc phase
729 * 1 0 est_temp_list 1 kt #0: determine est_chain_max,
730 * stop tasks, move ests to est_temp_list
731 * and free kd for kthreads 1..last
732 * 1->0 0->N kt chains 1 ests can go to kthreads
733 * 0 N kt chains 1 drain est_temp_list, create new kthread
734 * contexts, start tasks, estimate
735 */
ip_vs_est_calc_phase(struct netns_ipvs * ipvs)736 static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
737 {
738 int genid = atomic_read(&ipvs->est_genid);
739 struct ip_vs_est_tick_data *td;
740 struct ip_vs_est_kt_data *kd;
741 struct ip_vs_estimator *est;
742 struct ip_vs_stats *stats;
743 int id, row, cid, delay;
744 bool last, last_td;
745 int chain_max;
746 int step;
747
748 if (!ip_vs_est_calc_limits(ipvs, &chain_max))
749 return;
750
751 mutex_lock(&__ip_vs_mutex);
752
753 /* Stop all other tasks, so that we can immediately move the
754 * estimators to est_temp_list without RCU grace period
755 */
756 mutex_lock(&ipvs->est_mutex);
757 for (id = 1; id < ipvs->est_kt_count; id++) {
758 /* netns clean up started, abort */
759 if (!ipvs->enable)
760 goto unlock2;
761 kd = ipvs->est_kt_arr[id];
762 if (!kd)
763 continue;
764 ip_vs_est_kthread_stop(kd);
765 }
766 mutex_unlock(&ipvs->est_mutex);
767
768 /* Move all estimators to est_temp_list but carefully,
769 * all estimators and kthread data can be released while
770 * we reschedule. Even for kthread 0.
771 */
772 step = 0;
773
774 /* Order entries in est_temp_list in ascending delay, so now
775 * walk delay(desc), id(desc), cid(asc)
776 */
777 delay = IPVS_EST_NTICKS;
778
779 next_delay:
780 delay--;
781 if (delay < 0)
782 goto end_dequeue;
783
784 last_kt:
785 /* Destroy contexts backwards */
786 id = ipvs->est_kt_count;
787
788 next_kt:
789 if (!ipvs->enable || kthread_should_stop())
790 goto unlock;
791 id--;
792 if (id < 0)
793 goto next_delay;
794 kd = ipvs->est_kt_arr[id];
795 if (!kd)
796 goto next_kt;
797 /* kt 0 can exist with empty chains */
798 if (!id && kd->est_count <= 1)
799 goto next_delay;
800
801 row = kd->est_row + delay;
802 if (row >= IPVS_EST_NTICKS)
803 row -= IPVS_EST_NTICKS;
804 td = rcu_dereference_protected(kd->ticks[row], 1);
805 if (!td)
806 goto next_kt;
807
808 cid = 0;
809
810 walk_chain:
811 if (kthread_should_stop())
812 goto unlock;
813 step++;
814 if (!(step & 63)) {
815 /* Give chance estimators to be added (to est_temp_list)
816 * and deleted (releasing kthread contexts)
817 */
818 mutex_unlock(&__ip_vs_mutex);
819 cond_resched();
820 mutex_lock(&__ip_vs_mutex);
821
822 /* Current kt released ? */
823 if (id >= ipvs->est_kt_count)
824 goto last_kt;
825 if (kd != ipvs->est_kt_arr[id])
826 goto next_kt;
827 /* Current td released ? */
828 if (td != rcu_dereference_protected(kd->ticks[row], 1))
829 goto next_kt;
830 /* No fatal changes on the current kd and td */
831 }
832 est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
833 list);
834 if (!est) {
835 cid++;
836 if (cid >= IPVS_EST_TICK_CHAINS)
837 goto next_kt;
838 goto walk_chain;
839 }
840 /* We can cheat and increase est_count to protect kt 0 context
841 * from release but we prefer to keep the last estimator
842 */
843 last = kd->est_count <= 1;
844 /* Do not free kt #0 data */
845 if (!id && last)
846 goto next_delay;
847 last_td = kd->tick_len[row] <= 1;
848 stats = container_of(est, struct ip_vs_stats, est);
849 ip_vs_stop_estimator(ipvs, stats);
850 /* Tasks are stopped, move without RCU grace period */
851 est->ktid = -1;
852 est->ktrow = row - kd->est_row;
853 if (est->ktrow < 0)
854 est->ktrow += IPVS_EST_NTICKS;
855 hlist_add_head(&est->list, &ipvs->est_temp_list);
856 /* kd freed ? */
857 if (last)
858 goto next_kt;
859 /* td freed ? */
860 if (last_td)
861 goto next_kt;
862 goto walk_chain;
863
864 end_dequeue:
865 /* All estimators removed while calculating ? */
866 if (!ipvs->est_kt_count)
867 goto unlock;
868 kd = ipvs->est_kt_arr[0];
869 if (!kd)
870 goto unlock;
871 kd->add_row = kd->est_row;
872 ipvs->est_chain_max = chain_max;
873 ip_vs_est_set_params(ipvs, kd);
874
875 pr_info("using max %d ests per chain, %d per kthread\n",
876 kd->chain_max, kd->est_max_count);
877
878 /* Try to keep tot_stats in kt0, enqueue it early */
879 if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
880 ipvs->tot_stats->s.est.ktid == -1) {
881 hlist_del(&ipvs->tot_stats->s.est.list);
882 hlist_add_head(&ipvs->tot_stats->s.est.list,
883 &ipvs->est_temp_list);
884 }
885
886 mutex_lock(&ipvs->est_mutex);
887
888 /* We completed the calc phase, new calc phase not requested */
889 if (genid == atomic_read(&ipvs->est_genid))
890 ipvs->est_calc_phase = 0;
891
892 unlock2:
893 mutex_unlock(&ipvs->est_mutex);
894
895 unlock:
896 mutex_unlock(&__ip_vs_mutex);
897 }
898
ip_vs_zero_estimator(struct ip_vs_stats * stats)899 void ip_vs_zero_estimator(struct ip_vs_stats *stats)
900 {
901 struct ip_vs_estimator *est = &stats->est;
902 struct ip_vs_kstats *k = &stats->kstats;
903
904 /* reset counters, caller must hold the stats->lock lock */
905 est->last_inbytes = k->inbytes;
906 est->last_outbytes = k->outbytes;
907 est->last_conns = k->conns;
908 est->last_inpkts = k->inpkts;
909 est->last_outpkts = k->outpkts;
910 est->cps = 0;
911 est->inpps = 0;
912 est->outpps = 0;
913 est->inbps = 0;
914 est->outbps = 0;
915 }
916
917 /* Get decoded rates */
ip_vs_read_estimator(struct ip_vs_kstats * dst,struct ip_vs_stats * stats)918 void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
919 {
920 struct ip_vs_estimator *e = &stats->est;
921
922 dst->cps = (e->cps + 0x1FF) >> 10;
923 dst->inpps = (e->inpps + 0x1FF) >> 10;
924 dst->outpps = (e->outpps + 0x1FF) >> 10;
925 dst->inbps = (e->inbps + 0xF) >> 5;
926 dst->outbps = (e->outbps + 0xF) >> 5;
927 }
928
ip_vs_estimator_net_init(struct netns_ipvs * ipvs)929 int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
930 {
931 INIT_HLIST_HEAD(&ipvs->est_temp_list);
932 ipvs->est_kt_arr = NULL;
933 ipvs->est_max_threads = 0;
934 ipvs->est_calc_phase = 0;
935 ipvs->est_chain_max = 0;
936 ipvs->est_kt_count = 0;
937 ipvs->est_add_ktid = 0;
938 atomic_set(&ipvs->est_genid, 0);
939 atomic_set(&ipvs->est_genid_done, 0);
940 __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
941 return 0;
942 }
943
ip_vs_estimator_net_cleanup(struct netns_ipvs * ipvs)944 void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
945 {
946 int i;
947
948 for (i = 0; i < ipvs->est_kt_count; i++)
949 ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
950 kfree(ipvs->est_kt_arr);
951 mutex_destroy(&ipvs->est_mutex);
952 }
953