// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_est.c: simple rate estimator for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
 *              Network name space (netns) aware.
 *              Global data moved to netns, i.e. struct netns_ipvs.
 *              Affected data: est_list and est_lock.
 *              estimation_timer() runs with one timer per netns.
 *              get_stats() does the per-CPU summing.
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/list.h>
#include <linux/rcupdate_wait.h>

#include <net/ip_vs.h>

/*
  This code estimates the rate over a short interval (such as 8
  seconds) for virtual services and real servers. To measure the rate
  over a long interval, it is easy to implement a user-level daemon
  which periodically reads those statistical counters and computes the
  rate.

  We measure the rate during the last 8 seconds, every 2 seconds:

    avgrate = avgrate*(1-W) + rate*W

    where W = 2^(-2)

  NOTES.

  * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10.

  * Netlink users can see 64-bit values but sockopt users are restricted
    to 32-bit values for conns, packets, bps, cps and pps.

  * A lot of code is taken from net/core/gen_estimator.c

  KEY POINTS:
  - cpustats counters are updated per-CPU in SoftIRQ context with BH disabled
  - kthreads read the cpustats to update the estimators (svcs, dests, total)
  - the states of estimators can be read (get stats) or modified (zero stats)
    from process context

  KTHREADS:
  - estimators are added initially to est_temp_list and later kthread 0
    distributes them to one or many kthreads for estimation
  - kthread contexts are created and attached to an array
  - the kthread tasks are started when the first service is added, before
    that the total stats are not estimated
  - when the configuration (cpulist/nice) is changed, the tasks are restarted
    by work (est_reload_work)
  - kthread tasks are stopped while the cpulist is empty
  - the kthread context holds lists with estimators (chains) which are
    processed every 2 seconds
  - as estimators can be added dynamically and in bursts, we try to spread
    them to multiple chains which are estimated at different times
  - on start, kthread 0 enters a calculation phase to determine the chain
    limits and the limit of estimators per kthread
  - est_add_ktid: ktid where to add new ests, can point to an empty slot
    where we should add kt data
 */
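
/*
  Worked example of the scaling used below: cps and pps values are stored
  scaled by 2^10 and bps values by 2^5, all measured over a 2 second
  interval. If 200 new connections are seen in one 2 second period, that is
  100 conns/s; the code computes rate = 200 << 9 = 102400, i.e. 100 scaled
  by 2^10, and folds it into e->cps with e->cps += (rate - e->cps) >> 2,
  the W = 2^(-2) weight from the formula above. ip_vs_read_estimator()
  later decodes the value with rounding: (102400 + 0x1FF) >> 10 = 100.
 */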

static struct lock_class_key __ipvs_est_key;

static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);

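/* Walk one chain of estimators: sum the per-CPU counters for each estimator
 * and update its scaled rates (cps, pps, bps) using the 2 second EWMA.
 */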
static void ip_vs_chain_estimation(struct hlist_head *chain)
{
	struct ip_vs_estimator *e;
	struct ip_vs_cpu_stats *c;
	struct ip_vs_stats *s;
	u64 rate;

	hlist_for_each_entry_rcu(e, chain, list) {
		u64 conns, inpkts, outpkts, inbytes, outbytes;
		u64 kconns = 0, kinpkts = 0, koutpkts = 0;
		u64 kinbytes = 0, koutbytes = 0;
		unsigned int start;
		int i;

		if (kthread_should_stop())
			break;

		s = container_of(e, struct ip_vs_stats, est);
		for_each_possible_cpu(i) {
			c = per_cpu_ptr(s->cpustats, i);
			do {
				start = u64_stats_fetch_begin(&c->syncp);
				conns = u64_stats_read(&c->cnt.conns);
				inpkts = u64_stats_read(&c->cnt.inpkts);
				outpkts = u64_stats_read(&c->cnt.outpkts);
				inbytes = u64_stats_read(&c->cnt.inbytes);
				outbytes = u64_stats_read(&c->cnt.outbytes);
			} while (u64_stats_fetch_retry(&c->syncp, start));
			kconns += conns;
			kinpkts += inpkts;
			koutpkts += outpkts;
			kinbytes += inbytes;
			koutbytes += outbytes;
		}

		spin_lock(&s->lock);

		s->kstats.conns = kconns;
		s->kstats.inpkts = kinpkts;
		s->kstats.outpkts = koutpkts;
		s->kstats.inbytes = kinbytes;
		s->kstats.outbytes = koutbytes;

		/* scaled by 2^10, but divided by 2 seconds */
		rate = (s->kstats.conns - e->last_conns) << 9;
		e->last_conns = s->kstats.conns;
		e->cps += ((s64)rate - (s64)e->cps) >> 2;

		rate = (s->kstats.inpkts - e->last_inpkts) << 9;
		e->last_inpkts = s->kstats.inpkts;
		e->inpps += ((s64)rate - (s64)e->inpps) >> 2;

		rate = (s->kstats.outpkts - e->last_outpkts) << 9;
		e->last_outpkts = s->kstats.outpkts;
		e->outpps += ((s64)rate - (s64)e->outpps) >> 2;

		/* scaled by 2^5, but divided by 2 seconds */
		rate = (s->kstats.inbytes - e->last_inbytes) << 4;
		e->last_inbytes = s->kstats.inbytes;
		e->inbps += ((s64)rate - (s64)e->inbps) >> 2;

		rate = (s->kstats.outbytes - e->last_outbytes) << 4;
		e->last_outbytes = s->kstats.outbytes;
		e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
		spin_unlock(&s->lock);
	}
}

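/* Estimate all chains that are present in one tick (row) of the kthread
 * data. The tick pointer is re-read after cond_resched_rcu() because the
 * tick data can be released while we reschedule.
 */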
static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
{
	struct ip_vs_est_tick_data *td;
	int cid;

	rcu_read_lock();
	td = rcu_dereference(kd->ticks[row]);
	if (!td)
		goto out;
	for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
		if (kthread_should_stop())
			break;
		ip_vs_chain_estimation(&td->chains[cid]);
		cond_resched_rcu();
		td = rcu_dereference(kd->ticks[row]);
		if (!td)
			break;
	}

out:
	rcu_read_unlock();
}

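/* Estimation kthread: kthread 0 also runs the calc phase and drains
 * est_temp_list; every kthread then walks its tick rows, sleeping up to
 * one IPVS_EST_TICK between rows and resyncing est_timer if it falls too
 * far behind.
 */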
static int ip_vs_estimation_kthread(void *data)
{
	struct ip_vs_est_kt_data *kd = data;
	struct netns_ipvs *ipvs = kd->ipvs;
	int row = kd->est_row;
	unsigned long now;
	int id = kd->id;
	long gap;

	if (id > 0) {
		if (!ipvs->est_chain_max)
			return 0;
	} else {
		if (!ipvs->est_chain_max) {
			ipvs->est_calc_phase = 1;
			/* commit est_calc_phase before reading est_genid */
			smp_mb();
		}

		/* kthread 0 will handle the calc phase */
		if (ipvs->est_calc_phase)
			ip_vs_est_calc_phase(ipvs);
	}

	while (1) {
		if (!id && !hlist_empty(&ipvs->est_temp_list))
			ip_vs_est_drain_temp_list(ipvs);
		set_current_state(TASK_IDLE);
		if (kthread_should_stop())
			break;

		/* before estimation, check if we should sleep */
		now = jiffies;
		gap = kd->est_timer - now;
		if (gap > 0) {
			if (gap > IPVS_EST_TICK) {
				kd->est_timer = now - IPVS_EST_TICK;
				gap = IPVS_EST_TICK;
			}
			schedule_timeout(gap);
		} else {
			__set_current_state(TASK_RUNNING);
			if (gap < -8 * IPVS_EST_TICK)
				kd->est_timer = now;
		}

		if (kd->tick_len[row])
			ip_vs_tick_estimation(kd, row);

		row++;
		if (row >= IPVS_EST_NTICKS)
			row = 0;
		WRITE_ONCE(kd->est_row, row);
		kd->est_timer += IPVS_EST_TICK;
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}

/* Schedule stop/start for kthread tasks */
void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
{
	/* Ignore reloads before first service is added */
	if (!ipvs->enable)
		return;
	ip_vs_est_stopped_recalc(ipvs);
	/* Bump the kthread configuration genid */
	atomic_inc(&ipvs->est_genid);
	queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
}

/* Start kthread task with current configuration */
int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
			    struct ip_vs_est_kt_data *kd)
{
	unsigned long now;
	int ret = 0;
	long gap;

	lockdep_assert_held(&ipvs->est_mutex);

	if (kd->task)
		goto out;
	now = jiffies;
	gap = kd->est_timer - now;
	/* Sync est_timer if task is starting later */
	if (abs(gap) > 4 * IPVS_EST_TICK)
		kd->est_timer = now;
	kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
				  ipvs->gen, kd->id);
	if (IS_ERR(kd->task)) {
		ret = PTR_ERR(kd->task);
		kd->task = NULL;
		goto out;
	}

	set_user_nice(kd->task, sysctl_est_nice(ipvs));
	if (sysctl_est_preferred_cpulist(ipvs))
		kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs));

	pr_info("starting estimator thread %d...\n", kd->id);
	wake_up_process(kd->task);

out:
	return ret;
}

void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
{
	if (kd->task) {
		pr_info("stopping estimator thread %d...\n", kd->id);
		kthread_stop(kd->task);
		kd->task = NULL;
	}
}

/* Apply parameters to kthread */
static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
				 struct ip_vs_est_kt_data *kd)
{
	kd->chain_max = ipvs->est_chain_max;
	/* We are using a single chain on RCU preemption */
	if (IPVS_EST_TICK_CHAINS == 1)
		kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
	kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
	kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
}

/* Create and start estimation kthread in a free or new array slot */
static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
{
	struct ip_vs_est_kt_data *kd = NULL;
	int id = ipvs->est_kt_count;
	int ret = -ENOMEM;
	void *arr = NULL;
	int i;

	if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
	    ipvs->enable && ipvs->est_max_threads)
		return -EINVAL;

	mutex_lock(&ipvs->est_mutex);

	for (i = 0; i < id; i++) {
		if (!ipvs->est_kt_arr[i])
			break;
	}
	if (i >= id) {
		arr = krealloc_array(ipvs->est_kt_arr, id + 1,
				     sizeof(struct ip_vs_est_kt_data *),
				     GFP_KERNEL);
		if (!arr)
			goto out;
		ipvs->est_kt_arr = arr;
	} else {
		id = i;
	}

	kd = kzalloc(sizeof(*kd), GFP_KERNEL);
	if (!kd)
		goto out;
	kd->ipvs = ipvs;
	bitmap_fill(kd->avail, IPVS_EST_NTICKS);
	kd->est_timer = jiffies;
	kd->id = id;
	ip_vs_est_set_params(ipvs, kd);

	/* Pre-allocate stats used in calc phase */
	if (!id && !kd->calc_stats) {
		kd->calc_stats = ip_vs_stats_alloc();
		if (!kd->calc_stats)
			goto out;
	}

	/* Start kthread tasks only when services are present */
	if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
		ret = ip_vs_est_kthread_start(ipvs, kd);
		if (ret < 0)
			goto out;
	}

	if (arr)
		ipvs->est_kt_count++;
	ipvs->est_kt_arr[id] = kd;
	kd = NULL;
	/* Use most recent kthread for new ests */
	ipvs->est_add_ktid = id;
	ret = 0;

out:
	mutex_unlock(&ipvs->est_mutex);
	if (kd) {
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}

	return ret;
}

/* Select ktid where to add new ests: available, unused or new slot */
static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
{
	int ktid, best = ipvs->est_kt_count;
	struct ip_vs_est_kt_data *kd;

	for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
		kd = ipvs->est_kt_arr[ktid];
		if (kd) {
			if (kd->est_count < kd->est_max_count) {
				best = ktid;
				break;
			}
		} else if (ktid < best) {
			best = ktid;
		}
	}
	ipvs->est_add_ktid = best;
}

/* Add estimator to current kthread (est_add_ktid) */
static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
				   struct ip_vs_estimator *est)
{
	struct ip_vs_est_kt_data *kd = NULL;
	struct ip_vs_est_tick_data *td;
	int ktid, row, crow, cid, ret;
	int delay = est->ktrow;

	BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
			 "Too many chains for ktcid");

	if (ipvs->est_add_ktid < ipvs->est_kt_count) {
		kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
		if (kd)
			goto add_est;
	}

	ret = ip_vs_est_add_kthread(ipvs);
	if (ret < 0)
		goto out;
	kd = ipvs->est_kt_arr[ipvs->est_add_ktid];

add_est:
	ktid = kd->id;
	/* For a small number of estimators prefer to use few ticks,
	 * otherwise try to add into the last estimated row.
	 * est_row and add_row point just after the row we should use.
	 */
	if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
		crow = READ_ONCE(kd->est_row);
	else
		crow = kd->add_row;
	crow += delay;
	if (crow >= IPVS_EST_NTICKS)
		crow -= IPVS_EST_NTICKS;
	/* Are we still in the initial delay ? */
	if (delay >= IPVS_EST_NTICKS - 1) {
		/* Preserve initial delay or decrease it if no space in tick */
		row = crow;
		if (crow < IPVS_EST_NTICKS - 1) {
			crow++;
			row = find_last_bit(kd->avail, crow);
		}
		if (row >= crow)
			row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
	} else {
		/* Preserve delay or increase it if no space in tick */
		row = IPVS_EST_NTICKS;
		if (crow > 0)
			row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
		if (row >= IPVS_EST_NTICKS)
			row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
	}

	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td) {
		td = kzalloc(sizeof(*td), GFP_KERNEL);
		if (!td) {
			ret = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(kd->ticks[row], td);
	}

	cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);

	kd->est_count++;
	kd->tick_len[row]++;
	if (!td->chain_len[cid])
		__set_bit(cid, td->present);
	td->chain_len[cid]++;
	est->ktid = ktid;
	est->ktrow = row;
	est->ktcid = cid;
	hlist_add_head_rcu(&est->list, &td->chains[cid]);

	if (td->chain_len[cid] >= kd->chain_max) {
		__set_bit(cid, td->full);
		if (kd->tick_len[row] >= kd->tick_max)
			__clear_bit(row, kd->avail);
	}

	/* Update est_add_ktid to point to first available/empty kt slot */
	if (kd->est_count == kd->est_max_count)
		ip_vs_est_update_ktid(ipvs);

	ret = 0;

out:
	return ret;
}

/* Start estimation for stats */
int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	int ret;

	if (!ipvs->est_max_threads && ipvs->enable)
		ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);

	est->ktid = -1;
	est->ktrow = IPVS_EST_NTICKS - 1;	/* Initial delay */

	/* We prefer this code to be short, kthread 0 will requeue the
	 * estimator to an available chain. If tasks are disabled, we
	 * will not allocate much memory, just for kt 0.
	 */
	ret = 0;
	if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
		ret = ip_vs_est_add_kthread(ipvs);
	if (ret >= 0)
		hlist_add_head(&est->list, &ipvs->est_temp_list);
	else
		INIT_HLIST_NODE(&est->list);
	return ret;
}

static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
{
	if (kd) {
		if (kd->task) {
			pr_info("stop unused estimator thread %d...\n", kd->id);
			kthread_stop(kd->task);
		}
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}
}

/* Unlink estimator from chain */
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	int ktid = est->ktid;
	int row = est->ktrow;
	int cid = est->ktcid;

	/* Failed to add to chain ? */
	if (hlist_unhashed(&est->list))
		return;

	/* On return, estimator can be freed, dequeue it now */

	/* In est_temp_list ? */
	if (ktid < 0) {
		hlist_del(&est->list);
		goto end_kt0;
	}

	hlist_del_rcu(&est->list);
	kd = ipvs->est_kt_arr[ktid];
	td = rcu_dereference_protected(kd->ticks[row], 1);
	__clear_bit(cid, td->full);
	td->chain_len[cid]--;
	if (!td->chain_len[cid])
		__clear_bit(cid, td->present);
	kd->tick_len[row]--;
	__set_bit(row, kd->avail);
	if (!kd->tick_len[row]) {
		RCU_INIT_POINTER(kd->ticks[row], NULL);
		kfree_rcu(td, rcu_head);
	}
	kd->est_count--;
	if (kd->est_count) {
		/* This kt slot may have just become available, prefer it */
		if (ktid < ipvs->est_add_ktid)
			ipvs->est_add_ktid = ktid;
		return;
	}

	if (ktid > 0) {
		mutex_lock(&ipvs->est_mutex);
		ip_vs_est_kthread_destroy(kd);
		ipvs->est_kt_arr[ktid] = NULL;
		if (ktid == ipvs->est_kt_count - 1) {
			ipvs->est_kt_count--;
			while (ipvs->est_kt_count > 1 &&
			       !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
				ipvs->est_kt_count--;
		}
		mutex_unlock(&ipvs->est_mutex);

		/* This slot is now empty, prefer another available kt slot */
		if (ktid == ipvs->est_add_ktid)
			ip_vs_est_update_ktid(ipvs);
	}

end_kt0:
	/* kt 0 is freed after all other kthreads and chains are empty */
	if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
		kd = ipvs->est_kt_arr[0];
		if (!kd || !kd->est_count) {
			mutex_lock(&ipvs->est_mutex);
			if (kd) {
				ip_vs_est_kthread_destroy(kd);
				ipvs->est_kt_arr[0] = NULL;
			}
			ipvs->est_kt_count--;
			mutex_unlock(&ipvs->est_mutex);
			ipvs->est_add_ktid = 0;
		}
	}
}

/* Register all ests from est_temp_list to kthreads */
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
{
	struct ip_vs_estimator *est;

	while (1) {
		int max = 16;

		mutex_lock(&__ip_vs_mutex);

		while (max-- > 0) {
			est = hlist_entry_safe(ipvs->est_temp_list.first,
					       struct ip_vs_estimator, list);
			if (est) {
				if (kthread_should_stop())
					goto unlock;
				hlist_del_init(&est->list);
				if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
					continue;
				est->ktid = -1;
				hlist_add_head(&est->list,
					       &ipvs->est_temp_list);
				/* Abort, some entries will not be estimated
				 * until next attempt
				 */
			}
			goto unlock;
		}
		mutex_unlock(&__ip_vs_mutex);
		cond_resched();
	}

unlock:
	mutex_unlock(&__ip_vs_mutex);
}

/* Calculate limits for all kthreads */
static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
{
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
	struct ip_vs_est_kt_data *kd;
	struct hlist_head chain;
	struct ip_vs_stats *s;
	int cache_factor = 4;
	int i, loops, ntest;
	s32 min_est = 0;
	ktime_t t1, t2;
	int max = 8;
	int ret = 1;
	s64 diff;
	u64 val;

	INIT_HLIST_HEAD(&chain);
	mutex_lock(&__ip_vs_mutex);
	kd = ipvs->est_kt_arr[0];
	mutex_unlock(&__ip_vs_mutex);
	s = kd ? kd->calc_stats : NULL;
	if (!s)
		goto out;
	hlist_add_head(&s->est.list, &chain);

	loops = 1;
	/* Get best result from many tests */
	for (ntest = 0; ntest < 12; ntest++) {
		if (!(ntest & 3)) {
			/* Wait for cpufreq frequency transition */
			wait_event_idle_timeout(wq, kthread_should_stop(),
						HZ / 50);
			if (!ipvs->enable || kthread_should_stop())
				goto stop;
		}

		local_bh_disable();
		rcu_read_lock();

		/* Put stats in cache */
		ip_vs_chain_estimation(&chain);

		t1 = ktime_get();
		for (i = loops * cache_factor; i > 0; i--)
			ip_vs_chain_estimation(&chain);
		t2 = ktime_get();

		rcu_read_unlock();
		local_bh_enable();

		if (!ipvs->enable || kthread_should_stop())
			goto stop;
		cond_resched();

		diff = ktime_to_ns(ktime_sub(t2, t1));
		if (diff <= 1 * NSEC_PER_USEC) {
			/* Do more loops on low time resolution */
			loops *= 2;
			continue;
		}
		if (diff >= NSEC_PER_SEC)
			continue;
		val = diff;
		do_div(val, loops);
		if (!min_est || val < min_est) {
			min_est = val;
			/* goal: 95usec per chain */
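			/* e.g. a best measured min_est of 2000 ns gives
			 * max = 95000 / 2000 = 47 ests per chain
			 */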
			val = 95 * NSEC_PER_USEC;
			if (val >= min_est) {
				do_div(val, min_est);
				max = (int)val;
			} else {
				max = 1;
			}
		}
	}

out:
	if (s)
		hlist_del_init(&s->est.list);
	*chain_max = max;
	return ret;

stop:
	ret = 0;
	goto out;
}

/* Calculate the parameters and apply them in context of kt #0
 * ECP: est_calc_phase
 * ECM: est_chain_max
 * ECP	ECM	Insert Chain	enable	Description
 * ---------------------------------------------------------------------------
 * 0	0	est_temp_list	0	create kt #0 context
 * 0	0	est_temp_list	0->1	service added, start kthread #0 task
 * 0->1	0	est_temp_list	1	kt task #0 started, enters calc phase
 * 1	0	est_temp_list	1	kt #0: determine est_chain_max,
 *					stop tasks, move ests to est_temp_list
 *					and free kd for kthreads 1..last
 * 1->0	0->N	kt chains	1	ests can go to kthreads
 * 0	N	kt chains	1	drain est_temp_list, create new kthread
 *					contexts, start tasks, estimate
 */
static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
{
	int genid = atomic_read(&ipvs->est_genid);
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	struct ip_vs_estimator *est;
	struct ip_vs_stats *stats;
	int id, row, cid, delay;
	bool last, last_td;
	int chain_max;
	int step;

	if (!ip_vs_est_calc_limits(ipvs, &chain_max))
		return;

	mutex_lock(&__ip_vs_mutex);

	/* Stop all other tasks, so that we can immediately move the
	 * estimators to est_temp_list without RCU grace period
	 */
	mutex_lock(&ipvs->est_mutex);
	for (id = 1; id < ipvs->est_kt_count; id++) {
		/* netns cleanup started, abort */
		if (!ipvs->enable)
			goto unlock2;
		kd = ipvs->est_kt_arr[id];
		if (!kd)
			continue;
		ip_vs_est_kthread_stop(kd);
	}
	mutex_unlock(&ipvs->est_mutex);

	/* Move all estimators to est_temp_list but carefully,
	 * all estimators and kthread data can be released while
	 * we reschedule. Even for kthread 0.
	 */
	step = 0;

	/* Order entries in est_temp_list in ascending delay, so now
	 * walk delay(desc), id(desc), cid(asc)
	 */
	delay = IPVS_EST_NTICKS;

next_delay:
	delay--;
	if (delay < 0)
		goto end_dequeue;

last_kt:
	/* Destroy contexts backwards */
	id = ipvs->est_kt_count;

next_kt:
	if (!ipvs->enable || kthread_should_stop())
		goto unlock;
	id--;
	if (id < 0)
		goto next_delay;
	kd = ipvs->est_kt_arr[id];
	if (!kd)
		goto next_kt;
	/* kt 0 can exist with empty chains */
	if (!id && kd->est_count <= 1)
		goto next_delay;

	row = kd->est_row + delay;
	if (row >= IPVS_EST_NTICKS)
		row -= IPVS_EST_NTICKS;
	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td)
		goto next_kt;

	cid = 0;

walk_chain:
	if (kthread_should_stop())
		goto unlock;
	step++;
	if (!(step & 63)) {
		/* Give a chance for estimators to be added (to est_temp_list)
		 * and deleted (releasing kthread contexts)
		 */
		mutex_unlock(&__ip_vs_mutex);
		cond_resched();
		mutex_lock(&__ip_vs_mutex);

		/* Current kt released ? */
		if (id >= ipvs->est_kt_count)
			goto last_kt;
		if (kd != ipvs->est_kt_arr[id])
			goto next_kt;
		/* Current td released ? */
		if (td != rcu_dereference_protected(kd->ticks[row], 1))
			goto next_kt;
		/* No fatal changes on the current kd and td */
	}
	est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
			       list);
	if (!est) {
		cid++;
		if (cid >= IPVS_EST_TICK_CHAINS)
			goto next_kt;
		goto walk_chain;
	}
	/* We can cheat and increase est_count to protect kt 0 context
	 * from release but we prefer to keep the last estimator
	 */
	last = kd->est_count <= 1;
	/* Do not free kt #0 data */
	if (!id && last)
		goto next_delay;
	last_td = kd->tick_len[row] <= 1;
	stats = container_of(est, struct ip_vs_stats, est);
	ip_vs_stop_estimator(ipvs, stats);
	/* Tasks are stopped, move without RCU grace period */
	est->ktid = -1;
	est->ktrow = row - kd->est_row;
	if (est->ktrow < 0)
		est->ktrow += IPVS_EST_NTICKS;
	hlist_add_head(&est->list, &ipvs->est_temp_list);
	/* kd freed ? */
	if (last)
		goto next_kt;
	/* td freed ? */
	if (last_td)
		goto next_kt;
	goto walk_chain;

end_dequeue:
	/* All estimators removed while calculating ? */
	if (!ipvs->est_kt_count)
		goto unlock;
	kd = ipvs->est_kt_arr[0];
	if (!kd)
		goto unlock;
	kd->add_row = kd->est_row;
	ipvs->est_chain_max = chain_max;
	ip_vs_est_set_params(ipvs, kd);

	pr_info("using max %d ests per chain, %d per kthread\n",
		kd->chain_max, kd->est_max_count);

	/* Try to keep tot_stats in kt0, enqueue it early */
	if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
	    ipvs->tot_stats->s.est.ktid == -1) {
		hlist_del(&ipvs->tot_stats->s.est.list);
		hlist_add_head(&ipvs->tot_stats->s.est.list,
			       &ipvs->est_temp_list);
	}

	mutex_lock(&ipvs->est_mutex);

	/* We completed the calc phase, new calc phase not requested */
	if (genid == atomic_read(&ipvs->est_genid))
		ipvs->est_calc_phase = 0;

unlock2:
	mutex_unlock(&ipvs->est_mutex);

unlock:
	mutex_unlock(&__ip_vs_mutex);
}

void ip_vs_zero_estimator(struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	struct ip_vs_kstats *k = &stats->kstats;

	/* reset counters, caller must hold the stats->lock */
	est->last_inbytes = k->inbytes;
	est->last_outbytes = k->outbytes;
	est->last_conns = k->conns;
	est->last_inpkts = k->inpkts;
	est->last_outpkts = k->outpkts;
	est->cps = 0;
	est->inpps = 0;
	est->outpps = 0;
	est->inbps = 0;
	est->outbps = 0;
}

/* Get decoded rates */
void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *e = &stats->est;

	dst->cps = (e->cps + 0x1FF) >> 10;
	dst->inpps = (e->inpps + 0x1FF) >> 10;
	dst->outpps = (e->outpps + 0x1FF) >> 10;
	dst->inbps = (e->inbps + 0xF) >> 5;
	dst->outbps = (e->outbps + 0xF) >> 5;
}

int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
	INIT_HLIST_HEAD(&ipvs->est_temp_list);
	ipvs->est_kt_arr = NULL;
	ipvs->est_max_threads = 0;
	ipvs->est_calc_phase = 0;
	ipvs->est_chain_max = 0;
	ipvs->est_kt_count = 0;
	ipvs->est_add_ktid = 0;
	atomic_set(&ipvs->est_genid, 0);
	atomic_set(&ipvs->est_genid_done, 0);
	__mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
	return 0;
}

void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
	int i;

	for (i = 0; i < ipvs->est_kt_count; i++)
		ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
	kfree(ipvs->est_kt_arr);
	mutex_destroy(&ipvs->est_mutex);
}
954