// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_est.c: simple rate estimator for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
 *              Network name space (netns) aware.
 *              Global data moved to netns i.e. struct netns_ipvs
 *              Affected data: est_list and est_lock.
 *              estimation_timer() runs with timer per netns.
 *              get_stats() does the per cpu summing.
 */

#define pr_fmt(fmt) "IPVS: " fmt

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/list.h>
#include <linux/rcupdate_wait.h>

#include <net/ip_vs.h>

/*
  This code is to estimate rate in a shorter interval (such as 8
  seconds) for virtual services and real servers. To measure the rate
  over a long interval, it is easy to implement a user-level daemon
  which periodically reads those statistical counters and measures the
  rate.

  We measure rate during the last 8 seconds every 2 seconds:

    avgrate = avgrate*(1-W) + rate*W

    where W = 2^(-2)

  NOTES.

  * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10.

  * Netlink users can see 64-bit values but sockopt users are restricted
    to 32-bit values for conns, packets, bps, cps and pps.

  * A lot of code is taken from net/core/gen_estimator.c

  KEY POINTS:
  - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled
  - kthreads read the cpustats to update the estimators (svcs, dests, total)
  - the states of estimators can be read (get stats) or modified (zero stats)
    from processes

  KTHREADS:
  - estimators are added initially to est_temp_list and later kthread 0
    distributes them to one or many kthreads for estimation
  - kthread contexts are created and attached to an array
  - the kthread tasks are started when the first service is added, before
    that the total stats are not estimated
  - when configuration (cpulist/nice) is changed, the tasks are restarted
    by work (est_reload_work)
  - kthread tasks are stopped while the cpulist is empty
  - the kthread context holds lists with estimators (chains) which are
    processed every 2 seconds
  - as estimators can be added dynamically and in bursts, we try to spread
    them to multiple chains which are estimated at different times
  - on start, kthread 0 enters calculation phase to determine the chain limits
    and the limit of estimators per kthread
  - est_add_ktid: ktid where to add new ests, can point to an empty slot
    where we should add kt data
  - data protected by service_mutex: est_temp_list, est_add_ktid,
    est_kt_count(R/W), est_kt_arr(R/W), est_genid_done, kd->needed(R/W)
  - data protected by est_mutex: est_genid, est_max_threads, sysctl_est_cpulist,
    est_cpulist_valid, sysctl_est_nice, est_stopped, sysctl_run_estimation,
    est_kt_count(R), est_kt_arr(R), kd->needed(R), kd->task (id > 0)
 */
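
/* Illustrative sketch only (values assumed, not taken from the code below):
 * in integer arithmetic the smoothing above with W = 2^(-2) reduces to
 *
 *	avg += (rate - avg) >> 2;
 *
 * and a cps/pps sample of D events per 2-second interval enters pre-scaled
 * by 2^10 and divided by the interval, i.e. rate = (D << 10) / 2 = D << 9.
 * E.g. D = 100 conns per interval gives rate = 51200, which decodes back
 * to 51200 >> 10 = 50 conns/s in ip_vs_read_estimator().
 */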

static struct lock_class_key __ipvs_est_key;

static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);

static void ip_vs_chain_estimation(struct hlist_head *chain)
{
	struct ip_vs_estimator *e;
	struct ip_vs_cpu_stats *c;
	struct ip_vs_stats *s;
	u64 rate;

	hlist_for_each_entry_rcu(e, chain, list) {
		u64 conns, inpkts, outpkts, inbytes, outbytes;
		u64 kconns = 0, kinpkts = 0, koutpkts = 0;
		u64 kinbytes = 0, koutbytes = 0;
		unsigned int start;
		int i;

		if (kthread_should_stop())
			break;

		s = container_of(e, struct ip_vs_stats, est);
		for_each_possible_cpu(i) {
			c = per_cpu_ptr(s->cpustats, i);
			do {
				start = u64_stats_fetch_begin(&c->syncp);
				conns = u64_stats_read(&c->cnt.conns);
				inpkts = u64_stats_read(&c->cnt.inpkts);
				outpkts = u64_stats_read(&c->cnt.outpkts);
				inbytes = u64_stats_read(&c->cnt.inbytes);
				outbytes = u64_stats_read(&c->cnt.outbytes);
			} while (u64_stats_fetch_retry(&c->syncp, start));
			kconns += conns;
			kinpkts += inpkts;
			koutpkts += outpkts;
			kinbytes += inbytes;
			koutbytes += outbytes;
		}

		spin_lock(&s->lock);

		s->kstats.conns = kconns;
		s->kstats.inpkts = kinpkts;
		s->kstats.outpkts = koutpkts;
		s->kstats.inbytes = kinbytes;
		s->kstats.outbytes = koutbytes;

		/* scaled by 2^10, but divided by 2 seconds */
		rate = (s->kstats.conns - e->last_conns) << 9;
		e->last_conns = s->kstats.conns;
		e->cps += ((s64)rate - (s64)e->cps) >> 2;

		rate = (s->kstats.inpkts - e->last_inpkts) << 9;
		e->last_inpkts = s->kstats.inpkts;
		e->inpps += ((s64)rate - (s64)e->inpps) >> 2;

		rate = (s->kstats.outpkts - e->last_outpkts) << 9;
		e->last_outpkts = s->kstats.outpkts;
		e->outpps += ((s64)rate - (s64)e->outpps) >> 2;

		/* scaled by 2^5, but divided by 2 seconds */
		rate = (s->kstats.inbytes - e->last_inbytes) << 4;
		e->last_inbytes = s->kstats.inbytes;
		e->inbps += ((s64)rate - (s64)e->inbps) >> 2;

		rate = (s->kstats.outbytes - e->last_outbytes) << 4;
		e->last_outbytes = s->kstats.outbytes;
		e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
		spin_unlock(&s->lock);
	}
}

static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
{
	struct ip_vs_est_tick_data *td;
	int cid;

	rcu_read_lock();
	td = rcu_dereference(kd->ticks[row]);
	if (!td)
		goto out;
	for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
		if (kthread_should_stop())
			break;
		ip_vs_chain_estimation(&td->chains[cid]);
		cond_resched_rcu();
		td = rcu_dereference(kd->ticks[row]);
		if (!td)
			break;
	}

out:
	rcu_read_unlock();
}

static int ip_vs_estimation_kthread(void *data)
{
	struct ip_vs_est_kt_data *kd = data;
	struct netns_ipvs *ipvs = kd->ipvs;
	int row = kd->est_row;
	unsigned long now;
	int id = kd->id;
	long gap;

	if (id > 0) {
		if (!ipvs->est_chain_max)
			return 0;
	} else {
		if (!ipvs->est_chain_max) {
			ipvs->est_calc_phase = 1;
			/* commit est_calc_phase before reading est_genid */
			smp_mb();
		}

		/* kthread 0 will handle the calc phase */
		if (ipvs->est_calc_phase)
			ip_vs_est_calc_phase(ipvs);
	}

	while (1) {
		if (!id && !hlist_empty(&ipvs->est_temp_list))
			ip_vs_est_drain_temp_list(ipvs);
		set_current_state(TASK_IDLE);
		if (kthread_should_stop())
			break;

		/* before estimation, check if we should sleep */
		now = jiffies;
		gap = kd->est_timer - now;
		if (gap > 0) {
			if (gap > IPVS_EST_TICK) {
				kd->est_timer = now + IPVS_EST_TICK;
				gap = IPVS_EST_TICK;
			}
			schedule_timeout(gap);
		} else {
			__set_current_state(TASK_RUNNING);
			if (gap < -8 * IPVS_EST_TICK)
				kd->est_timer = now;
		}

		if (kd->tick_len[row])
			ip_vs_tick_estimation(kd, row);

		row++;
		if (row >= IPVS_EST_NTICKS)
			row = 0;
		WRITE_ONCE(kd->est_row, row);
		kd->est_timer += IPVS_EST_TICK;
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}

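/* Timing sketch (illustrative, assuming the IPVS_EST_* values from
 * net/ip_vs.h): the task above wakes up every IPVS_EST_TICK jiffies and
 * processes one of the IPVS_EST_NTICKS tick rows, so every chain is
 * revisited once per IPVS_EST_NTICKS * IPVS_EST_TICK = 2 seconds,
 * matching the 2-second sampling interval of the estimation formula.
 */
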
/* Schedule stop/start for kthread tasks */
void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool restart)
{
	lockdep_assert_held(&ipvs->est_mutex);

	/* Ignore reloads before first service is added */
	if (!READ_ONCE(ipvs->enable))
		return;
	ip_vs_est_stopped_recalc(ipvs);
	/* Bump the kthread configuration genid if restart is requested */
	if (restart)
		atomic_inc(&ipvs->est_genid);
	queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
}

/* Start kthread task with current configuration */
int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
			    struct ip_vs_est_kt_data *kd)
{
	unsigned long now;
	int ret = 0;
	long gap;

	lockdep_assert_held(&ipvs->est_mutex);

	if (kd->task)
		goto out;
	now = jiffies;
	gap = kd->est_timer - now;
	/* Sync est_timer if task is starting later */
	if (abs(gap) > 4 * IPVS_EST_TICK)
		kd->est_timer = now;
	kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
				  ipvs->gen, kd->id);
	if (IS_ERR(kd->task)) {
		ret = PTR_ERR(kd->task);
		kd->task = NULL;
		goto out;
	}

	set_user_nice(kd->task, sysctl_est_nice(ipvs));
	if (sysctl_est_preferred_cpulist(ipvs))
		kthread_affine_preferred(kd->task,
					 sysctl_est_preferred_cpulist(ipvs));

	pr_info("starting estimator thread %d...\n", kd->id);
	wake_up_process(kd->task);

out:
	return ret;
}

void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
{
	if (kd->task) {
		pr_info("stopping estimator thread %d...\n", kd->id);
		kthread_stop(kd->task);
		kd->task = NULL;
	}
}

/* Apply parameters to kthread */
static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
				 struct ip_vs_est_kt_data *kd)
{
	kd->chain_max = ipvs->est_chain_max;
	/* We are using single chain on RCU preemption */
	if (IPVS_EST_TICK_CHAINS == 1)
		kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
	kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
	kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
}

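/* Capacity example (illustrative numbers only, assuming
 * IPVS_EST_TICK_CHAINS = 48 and IPVS_EST_NTICKS = 50 from net/ip_vs.h):
 * a measured est_chain_max of 38 gives tick_max = 48 * 38 = 1824
 * estimators per tick and est_max_count = 50 * 1824 = 91200 estimators
 * per kthread before a new kthread slot is preferred.
 */
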
/* Create and start estimation kthread in a free or new array slot */
static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
{
	struct ip_vs_est_kt_data *kd = NULL;
	int id = ipvs->est_kt_count;
	int ret = -ENOMEM;
	void *arr = NULL;
	int i;

	mutex_lock(&ipvs->est_mutex);

	/* Allow kt 0 data to be created before the services are added
	 * and limit the kthreads when services are present.
	 */
	if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
	    READ_ONCE(ipvs->enable) && ipvs->est_max_threads) {
		ret = -EINVAL;
		goto out;
	}

	for (i = 0; i < id; i++) {
		if (!ipvs->est_kt_arr[i])
			break;
	}
	if (i >= id) {
		arr = krealloc_array(ipvs->est_kt_arr, id + 1,
				     sizeof(struct ip_vs_est_kt_data *),
				     GFP_KERNEL);
		if (!arr)
			goto out;
		ipvs->est_kt_arr = arr;
	} else {
		id = i;
	}

	kd = kzalloc(sizeof(*kd), GFP_KERNEL);
	if (!kd)
		goto out;
	kd->ipvs = ipvs;
	bitmap_fill(kd->avail, IPVS_EST_NTICKS);
	kd->est_timer = jiffies;
	kd->id = id;
	ip_vs_est_set_params(ipvs, kd);
	kd->needed = 1;

	/* Pre-allocate stats used in calc phase */
	if (!id && !kd->calc_stats) {
		kd->calc_stats = ip_vs_stats_alloc();
		if (!kd->calc_stats)
			goto out;
	}

	/* Request kthread to be started */
	ip_vs_est_reload_start(ipvs, false);

	if (arr)
		ipvs->est_kt_count++;
	ipvs->est_kt_arr[id] = kd;
	kd = NULL;
	/* Use most recent kthread for new ests */
	ipvs->est_add_ktid = id;
	ret = 0;

out:
	mutex_unlock(&ipvs->est_mutex);
	if (kd) {
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}

	return ret;
}

/* Select ktid where to add new ests: available, unused or new slot */
static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
{
	int ktid, best = ipvs->est_kt_count;
	struct ip_vs_est_kt_data *kd;

	for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
		kd = ipvs->est_kt_arr[ktid];
		if (kd) {
			if (kd->est_count < kd->est_max_count) {
				best = ktid;
				break;
			}
		} else if (ktid < best) {
			best = ktid;
		}
	}
	ipvs->est_add_ktid = best;
}

/* Add estimator to current kthread (est_add_ktid) */
static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
				   struct ip_vs_estimator *est)
{
	struct ip_vs_est_kt_data *kd = NULL;
	struct ip_vs_est_tick_data *td;
	int ktid, row, crow, cid, ret;
	int delay = est->ktrow;

	BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
			 "Too many chains for ktcid");

	if (ipvs->est_add_ktid < ipvs->est_kt_count) {
		kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
		if (kd)
			goto add_est;
	}

	ret = ip_vs_est_add_kthread(ipvs);
	if (ret < 0)
		goto out;
	kd = ipvs->est_kt_arr[ipvs->est_add_ktid];

add_est:
	ktid = kd->id;
	/* For a small number of estimators prefer to use few ticks,
	 * otherwise try to add into the last estimated row.
	 * est_row and add_row point after the row we should use
	 */
	if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
		crow = READ_ONCE(kd->est_row);
	else
		crow = kd->add_row;
	crow += delay;
	if (crow >= IPVS_EST_NTICKS)
		crow -= IPVS_EST_NTICKS;
	/* Assume initial delay ? */
	if (delay >= IPVS_EST_NTICKS - 1) {
		/* Preserve initial delay or decrease it if no space in tick */
		row = crow;
		if (crow < IPVS_EST_NTICKS - 1) {
			crow++;
			row = find_last_bit(kd->avail, crow);
		}
		if (row >= crow)
			row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
	} else {
		/* Preserve delay or increase it if no space in tick */
		row = IPVS_EST_NTICKS;
		if (crow > 0)
			row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
		if (row >= IPVS_EST_NTICKS)
			row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
	}

	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td) {
		td = kzalloc(sizeof(*td), GFP_KERNEL);
		if (!td) {
			ret = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(kd->ticks[row], td);
	}

	cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);

	kd->est_count++;
	kd->tick_len[row]++;
	if (!td->chain_len[cid])
		__set_bit(cid, td->present);
	td->chain_len[cid]++;
	est->ktid = ktid;
	est->ktrow = row;
	est->ktcid = cid;
	hlist_add_head_rcu(&est->list, &td->chains[cid]);

	if (td->chain_len[cid] >= kd->chain_max) {
		__set_bit(cid, td->full);
		if (kd->tick_len[row] >= kd->tick_max)
			__clear_bit(row, kd->avail);
	}

	/* Update est_add_ktid to point to first available/empty kt slot */
	if (kd->est_count == kd->est_max_count)
		ip_vs_est_update_ktid(ipvs);

	ret = 0;

out:
	return ret;
}

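/* Placement sketch (illustrative): an estimator still carrying the
 * initial delay (est->ktrow == IPVS_EST_NTICKS - 1) lands as far as
 * possible after the currently estimated row, so bursts of additions
 * are spread over the whole 2-second cycle instead of piling into the
 * next tick.
 */
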
/* Start estimation for stats */
int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_est_kt_data *kd = ipvs->est_kt_count > 0 ?
				       ipvs->est_kt_arr[0] : NULL;
	struct ip_vs_estimator *est = &stats->est;
	int ret;

	est->ktid = -1;
	est->ktrow = IPVS_EST_NTICKS - 1;	/* Initial delay */

	/* We prefer this code to be short, kthread 0 will requeue the
	 * estimator to an available chain. If tasks are disabled, we
	 * will not allocate much memory, just for kt 0.
	 */
	ret = 0;
	if (!kd) {
		ret = ip_vs_est_add_kthread(ipvs);
	} else if (!kd->needed) {
		mutex_lock(&ipvs->est_mutex);
		/* We have a job for the kt 0 task */
		kd->needed = 1;
		ip_vs_est_reload_start(ipvs, true);
		mutex_unlock(&ipvs->est_mutex);
	}
	if (ret >= 0)
		hlist_add_head(&est->list, &ipvs->est_temp_list);
	else
		INIT_HLIST_NODE(&est->list);
	return ret;
}

static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
{
	if (kd) {
		if (kd->task) {
			pr_info("stop unused estimator thread %d...\n",
				kd->id);
			kthread_stop(kd->task);
		}
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}
}

/* Unlink estimator from chain */
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	int ktid = est->ktid;
	int row = est->ktrow;
	int cid = est->ktcid;

	/* Failed to add to chain ? */
	if (hlist_unhashed(&est->list))
		return;

	/* On return, estimator can be freed, dequeue it now */

	/* In est_temp_list ? */
	if (ktid < 0) {
		hlist_del(&est->list);
		goto end_kt0;
	}

	hlist_del_rcu(&est->list);
	kd = ipvs->est_kt_arr[ktid];
	td = rcu_dereference_protected(kd->ticks[row], 1);
	__clear_bit(cid, td->full);
	td->chain_len[cid]--;
	if (!td->chain_len[cid])
		__clear_bit(cid, td->present);
	kd->tick_len[row]--;
	__set_bit(row, kd->avail);
	if (!kd->tick_len[row]) {
		RCU_INIT_POINTER(kd->ticks[row], NULL);
		kfree_rcu(td, rcu_head);
	}
	kd->est_count--;
	if (kd->est_count) {
		/* This kt slot can become available just now, prefer it */
		if (ktid < ipvs->est_add_ktid)
			ipvs->est_add_ktid = ktid;
		return;
	}

	if (ktid > 0) {
		mutex_lock(&ipvs->est_mutex);
		ip_vs_est_kthread_destroy(kd);
		ipvs->est_kt_arr[ktid] = NULL;
		if (ktid == ipvs->est_kt_count - 1) {
			ipvs->est_kt_count--;
			while (ipvs->est_kt_count > 1 &&
			       !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
				ipvs->est_kt_count--;
		}
		mutex_unlock(&ipvs->est_mutex);

		/* This slot is now empty, prefer another available kt slot */
		if (ktid == ipvs->est_add_ktid)
			ip_vs_est_update_ktid(ipvs);
	}

end_kt0:
	/* kt 0 task is stopped after all other kt slots and chains are empty */
	if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
		kd = ipvs->est_kt_arr[0];
		if (kd && !kd->est_count) {
			mutex_lock(&ipvs->est_mutex);
			/* Keep the kt0 data but request kthread_stop */
			kd->needed = 0;
			ip_vs_est_reload_start(ipvs, true);
			mutex_unlock(&ipvs->est_mutex);
			ipvs->est_add_ktid = 0;
		}
	}
}

/* Register all ests from est_temp_list to kthreads */
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
{
	struct ip_vs_estimator *est;

	while (1) {
		int max = 16;

		mutex_lock(&ipvs->service_mutex);

		while (max-- > 0) {
			est = hlist_entry_safe(ipvs->est_temp_list.first,
					       struct ip_vs_estimator, list);
			if (est) {
				if (kthread_should_stop())
					goto unlock;
				hlist_del_init(&est->list);
				if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
					continue;
				est->ktid = -1;
				hlist_add_head(&est->list,
					       &ipvs->est_temp_list);
				/* Abort, some entries will not be estimated
				 * until next attempt
				 */
			}
			goto unlock;
		}
		mutex_unlock(&ipvs->service_mutex);
		cond_resched();
	}

unlock:
	mutex_unlock(&ipvs->service_mutex);
}


/* Calculate limits for all kthreads */
static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
{
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
	struct ip_vs_est_kt_data *kd;
	struct hlist_head chain;
	struct ip_vs_stats *s;
	int cache_factor = 4;
	int i, loops, ntest;
	s32 min_est = 0;
	ktime_t t1, t2;
	int max = 8;
	int ret = 1;
	s64 diff;
	u64 val;

	INIT_HLIST_HEAD(&chain);
	mutex_lock(&ipvs->est_mutex);
	kd = ipvs->est_kt_arr[0];
	mutex_unlock(&ipvs->est_mutex);
	s = kd ? kd->calc_stats : NULL;
	if (!s)
		goto out;
	hlist_add_head(&s->est.list, &chain);

	loops = 1;
	/* Get best result from many tests */
	for (ntest = 0; ntest < 12; ntest++) {
		if (!(ntest & 3)) {
			/* Wait for cpufreq frequency transition */
			wait_event_idle_timeout(wq, kthread_should_stop(),
						HZ / 50);
			if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
				goto stop;
		}

		local_bh_disable();
		rcu_read_lock();

		/* Put stats in cache */
		ip_vs_chain_estimation(&chain);

		t1 = ktime_get();
		for (i = loops * cache_factor; i > 0; i--)
			ip_vs_chain_estimation(&chain);
		t2 = ktime_get();

		rcu_read_unlock();
		local_bh_enable();

		if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
			goto stop;
		cond_resched();

		diff = ktime_to_ns(ktime_sub(t2, t1));
		if (diff <= 1 * NSEC_PER_USEC) {
			/* Do more loops on low time resolution */
			loops *= 2;
			continue;
		}
		if (diff >= NSEC_PER_SEC)
			continue;
		val = diff;
		do_div(val, loops);
		if (!min_est || val < min_est) {
			min_est = val;
			/* goal: 95usec per chain */
			val = 95 * NSEC_PER_USEC;
			if (val >= min_est) {
				do_div(val, min_est);
				max = (int)val;
			} else {
				max = 1;
			}
		}
	}

out:
	if (s)
		hlist_del_init(&s->est.list);
	*chain_max = max;
	return ret;

stop:
	ret = 0;
	goto out;
}

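/* Worked example (numbers assumed): if the best test run above measures
 * min_est = 2500 ns per chain pass, the 95 usec per-chain goal yields
 * chain_max = 95000 / 2500 = 38 estimators per chain.
 */
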
/* Calculate the parameters and apply them in context of kt #0
 * ECP: est_calc_phase
 * ECM: est_chain_max
 * ECP	ECM	Insert Chain	enable	Description
 * ---------------------------------------------------------------------------
 * 0	0	est_temp_list	0	create kt #0 context
 * 0	0	est_temp_list	0->1	service added, start kthread #0 task
 * 0->1	0	est_temp_list	1	kt task #0 started, enters calc phase
 * 1	0	est_temp_list	1	kt #0: determine est_chain_max,
 *					stop tasks, move ests to est_temp_list
 *					and free kd for kthreads 1..last
 * 1->0	0->N	kt chains	1	ests can go to kthreads
 * 0	N	kt chains	1	drain est_temp_list, create new kthread
 *					contexts, start tasks, estimate
 */
static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
{
	int genid = atomic_read(&ipvs->est_genid);
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	struct ip_vs_estimator *est;
	struct ip_vs_stats *stats;
	int id, row, cid, delay;
	bool last, last_td;
	int chain_max;
	int step;

	if (!ip_vs_est_calc_limits(ipvs, &chain_max))
		return;

	/* Stop all other tasks, so that we can immediately move the
	 * estimators to est_temp_list without RCU grace period
	 */
	mutex_lock(&ipvs->est_mutex);
	for (id = 1; id < ipvs->est_kt_count; id++) {
		/* netns clean up started, abort */
		if (kthread_should_stop() || !READ_ONCE(ipvs->enable)) {
			mutex_unlock(&ipvs->est_mutex);
			return;
		}
		kd = ipvs->est_kt_arr[id];
		if (!kd)
			continue;
		ip_vs_est_kthread_stop(kd);
	}
	mutex_unlock(&ipvs->est_mutex);

	mutex_lock(&ipvs->service_mutex);

	/* Move all estimators to est_temp_list but carefully,
	 * all estimators and kthread data can be released while
	 * we reschedule.
	 */
	step = 0;

	/* Order entries in est_temp_list in ascending delay, so now
	 * walk delay(desc), id(desc), cid(asc)
	 */
	delay = IPVS_EST_NTICKS;

next_delay:
	delay--;
	if (delay < 0)
		goto end_dequeue;

last_kt:
	/* Destroy contexts backwards */
	id = ipvs->est_kt_count;

next_kt:
	if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
		goto unlock;
	id--;
	if (id < 0)
		goto next_delay;
	kd = ipvs->est_kt_arr[id];
	if (!kd)
		goto next_kt;
	/* kt 0 can exist with empty chains */
	if (!id && kd->est_count <= 1)
		goto next_delay;

	row = kd->est_row + delay;
	if (row >= IPVS_EST_NTICKS)
		row -= IPVS_EST_NTICKS;
	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td)
		goto next_kt;

	cid = 0;

walk_chain:
	if (kthread_should_stop())
		goto unlock;
	step++;
	if (!(step & 63)) {
		/* Give estimators a chance to be added (to est_temp_list)
		 * and deleted (releasing kthread contexts)
		 */
		mutex_unlock(&ipvs->service_mutex);
		cond_resched();
		mutex_lock(&ipvs->service_mutex);

		/* Current kt released ? */
		if (id >= ipvs->est_kt_count)
			goto last_kt;
		if (kd != ipvs->est_kt_arr[id])
			goto next_kt;
		/* Current td released ? */
		if (td != rcu_dereference_protected(kd->ticks[row], 1))
			goto next_kt;
		/* No fatal changes on the current kd and td */
	}
	est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
			       list);
	if (!est) {
		cid++;
		if (cid >= IPVS_EST_TICK_CHAINS)
			goto next_kt;
		goto walk_chain;
	}
	/* We can cheat and increase est_count to protect kt 0 context
	 * from release but we prefer to keep the last estimator
	 */
	last = kd->est_count <= 1;
	/* Do not free kt #0 data */
	if (!id && last)
		goto next_delay;
	last_td = kd->tick_len[row] <= 1;
	stats = container_of(est, struct ip_vs_stats, est);
	ip_vs_stop_estimator(ipvs, stats);
	/* Tasks are stopped, move without RCU grace period */
	est->ktid = -1;
	est->ktrow = delay;
	hlist_add_head(&est->list, &ipvs->est_temp_list);
	/* kd freed ? */
	if (last)
		goto next_kt;
	/* td freed ? */
	if (last_td)
		goto next_kt;
	goto walk_chain;

end_dequeue:
	/* All estimators removed while calculating ? */
	if (!ipvs->est_kt_count)
		goto unlock;
	kd = ipvs->est_kt_arr[0];
	if (!kd)
		goto unlock;
	kd->add_row = kd->est_row;
	ipvs->est_chain_max = chain_max;
	ip_vs_est_set_params(ipvs, kd);

	pr_info("using max %d ests per chain, %d per kthread\n",
		kd->chain_max, kd->est_max_count);

	/* Try to keep tot_stats in kt0, enqueue it early */
	if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
	    ipvs->tot_stats->s.est.ktid == -1) {
		hlist_del(&ipvs->tot_stats->s.est.list);
		hlist_add_head(&ipvs->tot_stats->s.est.list,
			       &ipvs->est_temp_list);
	}

	mutex_lock(&ipvs->est_mutex);

	/* We completed the calc phase, new calc phase not requested */
	if (genid == atomic_read(&ipvs->est_genid))
		ipvs->est_calc_phase = 0;

	mutex_unlock(&ipvs->est_mutex);

unlock:
	mutex_unlock(&ipvs->service_mutex);
}

void ip_vs_zero_estimator(struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	struct ip_vs_kstats *k = &stats->kstats;

	/* reset counters, caller must hold the stats->lock */
	est->last_inbytes = k->inbytes;
	est->last_outbytes = k->outbytes;
	est->last_conns = k->conns;
	est->last_inpkts = k->inpkts;
	est->last_outpkts = k->outpkts;
	est->cps = 0;
	est->inpps = 0;
	est->outpps = 0;
	est->inbps = 0;
	est->outbps = 0;
}

/* Get decoded rates */
void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *e = &stats->est;

	dst->cps = (e->cps + 0x1FF) >> 10;
	dst->inpps = (e->inpps + 0x1FF) >> 10;
	dst->outpps = (e->outpps + 0x1FF) >> 10;
	dst->inbps = (e->inbps + 0xF) >> 5;
	dst->outbps = (e->outbps + 0xF) >> 5;
}

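/* Decode example (value assumed): an internal e->cps of 51200 reads back
 * as (51200 + 0x1FF) >> 10 = 50 conns/s; the +0x1FF and +0xF terms
 * compensate for truncation in the right shift, rounding to the nearest
 * unit.
 */
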
int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
	INIT_HLIST_HEAD(&ipvs->est_temp_list);
	ipvs->est_kt_arr = NULL;
	ipvs->est_max_threads = 0;
	ipvs->est_calc_phase = 0;
	ipvs->est_chain_max = 0;
	ipvs->est_kt_count = 0;
	ipvs->est_add_ktid = 0;
	atomic_set(&ipvs->est_genid, 0);
	atomic_set(&ipvs->est_genid_done, 0);
	__mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
	return 0;
}

void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
	int i;

	for (i = 0; i < ipvs->est_kt_count; i++)
		ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
	kfree(ipvs->est_kt_arr);
	mutex_destroy(&ipvs->est_mutex);
}