// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_est.c: simple rate estimator for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
 *              Network name space (netns) aware.
 *              Global data moved to netns, i.e. struct netns_ipvs.
 *              Affected data: est_list and est_lock.
 *              estimation_timer() runs with a timer per netns.
 *              get_stats() does the per-cpu summing.
 */

#define pr_fmt(fmt) "IPVS: " fmt

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/list.h>
#include <linux/rcupdate_wait.h>

#include <net/ip_vs.h>

/*
  This code estimates the rate over a short interval (such as 8
  seconds) for virtual services and real servers. To measure the rate
  over a long interval, it is easy to implement a user-level daemon
  that periodically reads these statistical counters and computes the
  rate.

  We measure the rate during the last 8 seconds, every 2 seconds:

    avgrate = avgrate*(1-W) + rate*W

    where W = 2^(-2)

  NOTES.

  * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10.

  * Netlink users can see 64-bit values but sockopt users are restricted
    to 32-bit values for conns, packets, bps, cps and pps.

  * A lot of code is taken from net/core/gen_estimator.c

  KEY POINTS:
  - cpustats counters are updated per-CPU in SoftIRQ context with BH disabled
  - kthreads read the cpustats to update the estimators (svcs, dests, total)
  - the states of estimators can be read (get stats) or modified (zero stats)
    from processes

  KTHREADS:
  - estimators are added initially to est_temp_list and later kthread 0
    distributes them to one or many kthreads for estimation
  - kthread contexts are created and attached to an array
  - the kthread tasks are started when the first service is added; before
    that the total stats are not estimated
  - when the configuration (cpulist/nice) is changed, the tasks are restarted
    by work (est_reload_work)
  - kthread tasks are stopped while the cpulist is empty
  - the kthread context holds lists with estimators (chains) which are
    processed every 2 seconds
  - as estimators can be added dynamically and in bursts, we try to spread
    them over multiple chains which are estimated at different times
  - on start, kthread 0 enters a calculation phase to determine the chain
    limits and the limit of estimators per kthread
  - est_add_ktid: ktid where to add new ests; can point to an empty slot
    where we should add kt data
  - data protected by service_mutex: est_temp_list, est_add_ktid,
    est_kt_count(R/W), est_kt_arr(R/W), est_genid_done, kd->needed(R/W)
  - data protected by est_mutex: est_genid, est_max_threads, sysctl_est_cpulist,
    est_cpulist_valid, sysctl_est_nice, est_stopped, sysctl_run_estimation,
    est_kt_count(R), est_kt_arr(R), kd->needed(R), kd->task (id > 0)
 */

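/* Editorial sketch (not part of the kernel code; the helper name is
 * ours): in fixed point, avgrate = avgrate*(1-W) + rate*W with
 * W = 2^-2 collapses to a single shift-and-add per sample, which is
 * the form the estimation loop below uses for cps, pps and bps.
 */
static inline s64 ip_vs_est_ewma_sketch(s64 avg, s64 rate)
{
	/* avg += (rate - avg) * 2^-2 */
	return avg + ((rate - avg) >> 2);
}
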
static struct lock_class_key __ipvs_est_key;

static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);

static void ip_vs_chain_estimation(struct hlist_head *chain)
{
	struct ip_vs_estimator *e;
	struct ip_vs_cpu_stats *c;
	struct ip_vs_stats *s;
	u64 rate;

	hlist_for_each_entry_rcu(e, chain, list) {
		u64 conns, inpkts, outpkts, inbytes, outbytes;
		u64 kconns = 0, kinpkts = 0, koutpkts = 0;
		u64 kinbytes = 0, koutbytes = 0;
		unsigned int start;
		int i;

		if (kthread_should_stop())
			break;

		s = container_of(e, struct ip_vs_stats, est);
		for_each_possible_cpu(i) {
			c = per_cpu_ptr(s->cpustats, i);
			do {
				start = u64_stats_fetch_begin(&c->syncp);
				conns = u64_stats_read(&c->cnt.conns);
				inpkts = u64_stats_read(&c->cnt.inpkts);
				outpkts = u64_stats_read(&c->cnt.outpkts);
				inbytes = u64_stats_read(&c->cnt.inbytes);
				outbytes = u64_stats_read(&c->cnt.outbytes);
			} while (u64_stats_fetch_retry(&c->syncp, start));
			kconns += conns;
			kinpkts += inpkts;
			koutpkts += outpkts;
			kinbytes += inbytes;
			koutbytes += outbytes;
		}

		spin_lock(&s->lock);

		s->kstats.conns = kconns;
		s->kstats.inpkts = kinpkts;
		s->kstats.outpkts = koutpkts;
		s->kstats.inbytes = kinbytes;
		s->kstats.outbytes = koutbytes;

		/* scaled by 2^10, but divided by 2 seconds */
		rate = (s->kstats.conns - e->last_conns) << 9;
		e->last_conns = s->kstats.conns;
		e->cps += ((s64)rate - (s64)e->cps) >> 2;

		rate = (s->kstats.inpkts - e->last_inpkts) << 9;
		e->last_inpkts = s->kstats.inpkts;
		e->inpps += ((s64)rate - (s64)e->inpps) >> 2;

		rate = (s->kstats.outpkts - e->last_outpkts) << 9;
		e->last_outpkts = s->kstats.outpkts;
		e->outpps += ((s64)rate - (s64)e->outpps) >> 2;

		/* scaled by 2^5, but divided by 2 seconds */
		rate = (s->kstats.inbytes - e->last_inbytes) << 4;
		e->last_inbytes = s->kstats.inbytes;
		e->inbps += ((s64)rate - (s64)e->inbps) >> 2;

		rate = (s->kstats.outbytes - e->last_outbytes) << 4;
		e->last_outbytes = s->kstats.outbytes;
		e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
		spin_unlock(&s->lock);
	}
}

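/* Editorial note on the shifts above: rates are kept scaled by 2^10
 * (cps/pps) or 2^5 (bps) and each sample covers 2 seconds, so
 *
 *	rate = delta * 2^10 / 2 = delta << 9	(conns, packets)
 *	rate = delta * 2^5  / 2 = delta << 4	(bytes)
 *
 * e.g. 1000 new connections in one 2-second sample give
 * rate = 1000 << 9 = 512000, i.e. 500 conns/s in 2^10 fixed point,
 * which then feeds the avg += (rate - avg) >> 2 EWMA step.
 */
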
static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
{
	struct ip_vs_est_tick_data *td;
	int cid;

	rcu_read_lock();
	td = rcu_dereference(kd->ticks[row]);
	if (!td)
		goto out;
	for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
		if (kthread_should_stop())
			break;
		ip_vs_chain_estimation(&td->chains[cid]);
		cond_resched_rcu();
		td = rcu_dereference(kd->ticks[row]);
		if (!td)
			break;
	}

out:
	rcu_read_unlock();
}

static int ip_vs_estimation_kthread(void *data)
{
	struct ip_vs_est_kt_data *kd = data;
	struct netns_ipvs *ipvs = kd->ipvs;
	int row = kd->est_row;
	unsigned long now;
	int id = kd->id;
	long gap;

	if (id > 0) {
		if (!ipvs->est_chain_max)
			return 0;
	} else {
		if (!ipvs->est_chain_max) {
			ipvs->est_calc_phase = 1;
			/* commit est_calc_phase before reading est_genid */
			smp_mb();
		}

		/* kthread 0 will handle the calc phase */
		if (ipvs->est_calc_phase)
			ip_vs_est_calc_phase(ipvs);
	}

	while (1) {
		if (!id && !hlist_empty(&ipvs->est_temp_list))
			ip_vs_est_drain_temp_list(ipvs);
		set_current_state(TASK_IDLE);
		if (kthread_should_stop())
			break;

		/* before estimation, check if we should sleep */
		now = jiffies;
		gap = kd->est_timer - now;
		if (gap > 0) {
			if (gap > IPVS_EST_TICK) {
				kd->est_timer = now - IPVS_EST_TICK;
				gap = IPVS_EST_TICK;
			}
			schedule_timeout(gap);
		} else {
			__set_current_state(TASK_RUNNING);
			if (gap < -8 * IPVS_EST_TICK)
				kd->est_timer = now;
		}

		if (kd->tick_len[row])
			ip_vs_tick_estimation(kd, row);

		row++;
		if (row >= IPVS_EST_NTICKS)
			row = 0;
		WRITE_ONCE(kd->est_row, row);
		kd->est_timer += IPVS_EST_TICK;
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}

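/* Editorial note on the loop above: one row is processed per tick and
 * the row index wraps at IPVS_EST_NTICKS, so every estimator is visited
 * once per full pass over the rows, i.e. once per the 2-second
 * estimation period described in the header comment. est_timer is
 * resynced when it drifts too far: clamped when more than one tick
 * ahead of jiffies, reset when more than 8 ticks behind.
 */
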
/* Schedule stop/start for kthread tasks */
void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool restart)
{
	lockdep_assert_held(&ipvs->est_mutex);

	/* Ignore reloads before first service is added */
	if (!READ_ONCE(ipvs->enable))
		return;
	ip_vs_est_stopped_recalc(ipvs);
	/* Bump the kthread configuration genid if stopping is requested */
	if (restart)
		atomic_inc(&ipvs->est_genid);
	queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
}

/* Start kthread task with current configuration */
int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
			    struct ip_vs_est_kt_data *kd)
{
	unsigned long now;
	int ret = 0;
	long gap;

	lockdep_assert_held(&ipvs->est_mutex);

	if (kd->task)
		goto out;
	now = jiffies;
	gap = kd->est_timer - now;
	/* Sync est_timer if task is starting later */
	if (abs(gap) > 4 * IPVS_EST_TICK)
		kd->est_timer = now;
	kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
				  ipvs->gen, kd->id);
	if (IS_ERR(kd->task)) {
		ret = PTR_ERR(kd->task);
		kd->task = NULL;
		goto out;
	}

	set_user_nice(kd->task, sysctl_est_nice(ipvs));
	if (sysctl_est_preferred_cpulist(ipvs))
		kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs));

	pr_info("starting estimator thread %d...\n", kd->id);
	wake_up_process(kd->task);

out:
	return ret;
}

void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
{
	if (kd->task) {
		pr_info("stopping estimator thread %d...\n", kd->id);
		kthread_stop(kd->task);
		kd->task = NULL;
	}
}

/* Apply parameters to kthread */
static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
				 struct ip_vs_est_kt_data *kd)
{
	kd->chain_max = ipvs->est_chain_max;
	/* We are using a single chain on RCU preemption */
	if (IPVS_EST_TICK_CHAINS == 1)
		kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
	kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
	kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
}

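/* Editorial example of the sizing above (the value 300 is purely
 * illustrative): with est_chain_max = 300 from the calc phase, one
 * kthread carries up to tick_max = IPVS_EST_TICK_CHAINS * 300
 * estimators per tick and est_max_count = IPVS_EST_NTICKS * tick_max
 * estimators in total, so per-kthread capacity scales linearly with
 * the measured per-chain limit.
 */
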
/* Create and start estimation kthread in a free or new array slot */
static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
{
	struct ip_vs_est_kt_data *kd = NULL;
	int id = ipvs->est_kt_count;
	int ret = -ENOMEM;
	void *arr = NULL;
	int i;

	mutex_lock(&ipvs->est_mutex);

	/* Allow kt 0 data to be created before the services are added
	 * and limit the kthreads when services are present.
	 */
	if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
	    READ_ONCE(ipvs->enable) && ipvs->est_max_threads) {
		ret = -EINVAL;
		goto out;
	}

	for (i = 0; i < id; i++) {
		if (!ipvs->est_kt_arr[i])
			break;
	}
	if (i >= id) {
		arr = krealloc_array(ipvs->est_kt_arr, id + 1,
				     sizeof(struct ip_vs_est_kt_data *),
				     GFP_KERNEL);
		if (!arr)
			goto out;
		ipvs->est_kt_arr = arr;
	} else {
		id = i;
	}

	kd = kzalloc_obj(*kd);
	if (!kd)
		goto out;
	kd->ipvs = ipvs;
	bitmap_fill(kd->avail, IPVS_EST_NTICKS);
	kd->est_timer = jiffies;
	kd->id = id;
	ip_vs_est_set_params(ipvs, kd);
	kd->needed = 1;

	/* Pre-allocate stats used in calc phase */
	if (!id && !kd->calc_stats) {
		kd->calc_stats = ip_vs_stats_alloc();
		if (!kd->calc_stats)
			goto out;
	}

	/* Request kthread to be started */
	ip_vs_est_reload_start(ipvs, false);

	if (arr)
		ipvs->est_kt_count++;
	ipvs->est_kt_arr[id] = kd;
	kd = NULL;
	/* Use most recent kthread for new ests */
	ipvs->est_add_ktid = id;
	ret = 0;

out:
	mutex_unlock(&ipvs->est_mutex);
	if (kd) {
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}

	return ret;
}

/* Select ktid where to add new ests: available, unused or new slot */
static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
{
	int ktid, best = ipvs->est_kt_count;
	struct ip_vs_est_kt_data *kd;

	for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
		kd = ipvs->est_kt_arr[ktid];
		if (kd) {
			if (kd->est_count < kd->est_max_count) {
				best = ktid;
				break;
			}
		} else if (ktid < best) {
			best = ktid;
		}
	}
	ipvs->est_add_ktid = best;
}

/* Add estimator to current kthread (est_add_ktid) */
static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
				   struct ip_vs_estimator *est)
{
	struct ip_vs_est_kt_data *kd = NULL;
	struct ip_vs_est_tick_data *td;
	int ktid, row, crow, cid, ret;
	int delay = est->ktrow;

	BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
			 "Too many chains for ktcid");

	if (ipvs->est_add_ktid < ipvs->est_kt_count) {
		kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
		if (kd)
			goto add_est;
	}

	ret = ip_vs_est_add_kthread(ipvs);
	if (ret < 0)
		goto out;
	kd = ipvs->est_kt_arr[ipvs->est_add_ktid];

add_est:
	ktid = kd->id;
	/* For a small number of estimators prefer to use few ticks,
	 * otherwise try to add into the last estimated row.
	 * est_row and add_row point after the row we should use.
	 */
	if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
		crow = READ_ONCE(kd->est_row);
	else
		crow = kd->add_row;
	crow += delay;
	if (crow >= IPVS_EST_NTICKS)
		crow -= IPVS_EST_NTICKS;
	/* Assume initial delay ? */
	if (delay >= IPVS_EST_NTICKS - 1) {
		/* Preserve initial delay or decrease it if no space in tick */
		row = crow;
		if (crow < IPVS_EST_NTICKS - 1) {
			crow++;
			row = find_last_bit(kd->avail, crow);
		}
		if (row >= crow)
			row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
	} else {
		/* Preserve delay or increase it if no space in tick */
		row = IPVS_EST_NTICKS;
		if (crow > 0)
			row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
		if (row >= IPVS_EST_NTICKS)
			row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
	}

	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td) {
		td = kzalloc_obj(*td);
		if (!td) {
			ret = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(kd->ticks[row], td);
	}

	cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);

	kd->est_count++;
	kd->tick_len[row]++;
	if (!td->chain_len[cid])
		__set_bit(cid, td->present);
	td->chain_len[cid]++;
	est->ktid = ktid;
	est->ktrow = row;
	est->ktcid = cid;
	hlist_add_head_rcu(&est->list, &td->chains[cid]);

	if (td->chain_len[cid] >= kd->chain_max) {
		__set_bit(cid, td->full);
		if (kd->tick_len[row] >= kd->tick_max)
			__clear_bit(row, kd->avail);
	}

	/* Update est_add_ktid to point to first available/empty kt slot */
	if (kd->est_count == kd->est_max_count)
		ip_vs_est_update_ktid(ipvs);

	ret = 0;

out:
	return ret;
}

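/* Editorial sketch (not called anywhere; the helper name is ours) of
 * the circular row search used above when a delay should be preserved
 * or increased: scan forward from crow for a row with free space,
 * wrapping to the first available row when the end of the bitmap is
 * reached.
 */
static inline int ip_vs_est_row_sketch(const unsigned long *avail, int crow)
{
	int row = IPVS_EST_NTICKS;

	if (crow > 0)
		row = find_next_bit(avail, IPVS_EST_NTICKS, crow);
	if (row >= IPVS_EST_NTICKS)
		row = find_first_bit(avail, IPVS_EST_NTICKS);
	return row;	/* == IPVS_EST_NTICKS if no row has space */
}
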
/* Start estimation for stats */
int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_est_kt_data *kd = ipvs->est_kt_count > 0 ?
				       ipvs->est_kt_arr[0] : NULL;
	struct ip_vs_estimator *est = &stats->est;
	int ret;

	est->ktid = -1;
	est->ktrow = IPVS_EST_NTICKS - 1;	/* Initial delay */

	/* We prefer this code to be short, kthread 0 will requeue the
	 * estimator to an available chain. If tasks are disabled, we
	 * will not allocate much memory, just for kt 0.
	 */
	ret = 0;
	if (!kd) {
		ret = ip_vs_est_add_kthread(ipvs);
	} else if (!kd->needed) {
		mutex_lock(&ipvs->est_mutex);
		/* We have a job for the kt 0 task */
		kd->needed = 1;
		ip_vs_est_reload_start(ipvs, true);
		mutex_unlock(&ipvs->est_mutex);
	}
	if (ret >= 0)
		hlist_add_head(&est->list, &ipvs->est_temp_list);
	else
		INIT_HLIST_NODE(&est->list);
	return ret;
}

static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
{
	if (kd) {
		if (kd->task) {
			pr_info("stop unused estimator thread %d...\n", kd->id);
			kthread_stop(kd->task);
		}
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}
}

/* Unlink estimator from chain */
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	int ktid = est->ktid;
	int row = est->ktrow;
	int cid = est->ktcid;

	/* Failed to add to chain ? */
	if (hlist_unhashed(&est->list))
		return;

	/* On return, estimator can be freed, dequeue it now */

	/* In est_temp_list ? */
	if (ktid < 0) {
		hlist_del(&est->list);
		goto end_kt0;
	}

	hlist_del_rcu(&est->list);
	kd = ipvs->est_kt_arr[ktid];
	td = rcu_dereference_protected(kd->ticks[row], 1);
	__clear_bit(cid, td->full);
	td->chain_len[cid]--;
	if (!td->chain_len[cid])
		__clear_bit(cid, td->present);
	kd->tick_len[row]--;
	__set_bit(row, kd->avail);
	if (!kd->tick_len[row]) {
		RCU_INIT_POINTER(kd->ticks[row], NULL);
		kfree_rcu(td, rcu_head);
	}
	kd->est_count--;
	if (kd->est_count) {
		/* This kt slot can become available just now, prefer it */
		if (ktid < ipvs->est_add_ktid)
			ipvs->est_add_ktid = ktid;
		return;
	}

	if (ktid > 0) {
		mutex_lock(&ipvs->est_mutex);
		ip_vs_est_kthread_destroy(kd);
		ipvs->est_kt_arr[ktid] = NULL;
		if (ktid == ipvs->est_kt_count - 1) {
			ipvs->est_kt_count--;
			while (ipvs->est_kt_count > 1 &&
			       !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
				ipvs->est_kt_count--;
		}
		mutex_unlock(&ipvs->est_mutex);

		/* This slot is now empty, prefer another available kt slot */
		if (ktid == ipvs->est_add_ktid)
			ip_vs_est_update_ktid(ipvs);
	}

end_kt0:
	/* kt 0 task is stopped after all other kt slots and chains are empty */
	if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
		kd = ipvs->est_kt_arr[0];
		if (kd && !kd->est_count) {
			mutex_lock(&ipvs->est_mutex);
			/* Keep the kt0 data but request kthread_stop */
			kd->needed = 0;
			ip_vs_est_reload_start(ipvs, true);
			mutex_unlock(&ipvs->est_mutex);
			ipvs->est_add_ktid = 0;
		}
	}
}

/* Register all ests from est_temp_list to kthreads */
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
{
	struct ip_vs_estimator *est;

	while (1) {
		int max = 16;

		mutex_lock(&ipvs->service_mutex);

		while (max-- > 0) {
			est = hlist_entry_safe(ipvs->est_temp_list.first,
					       struct ip_vs_estimator, list);
			if (est) {
				if (kthread_should_stop())
					goto unlock;
				hlist_del_init(&est->list);
				if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
					continue;
				est->ktid = -1;
				hlist_add_head(&est->list,
					       &ipvs->est_temp_list);
				/* Abort, some entries will not be estimated
				 * until next attempt
				 */
			}
			goto unlock;
		}
		mutex_unlock(&ipvs->service_mutex);
		cond_resched();
	}

unlock:
	mutex_unlock(&ipvs->service_mutex);
}

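/* Editorial sketch (helper and callback names are ours) of the
 * batching shape used above: hold the mutex for at most a small burst
 * of items, then release it and reschedule so other mutex users are
 * not starved while a long est_temp_list is drained.
 */
static inline void ip_vs_est_burst_sketch(struct mutex *m,
					  bool (*one_item)(void *arg),
					  void *arg)
{
	while (1) {
		int max = 16;

		mutex_lock(m);
		while (max-- > 0) {
			/* one_item() returns false when nothing is left */
			if (!one_item(arg)) {
				mutex_unlock(m);
				return;
			}
		}
		mutex_unlock(m);
		cond_resched();
	}
}
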
/* Calculate limits for all kthreads */
static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
{
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
	struct ip_vs_est_kt_data *kd;
	struct hlist_head chain;
	struct ip_vs_stats *s;
	int cache_factor = 4;
	int i, loops, ntest;
	s32 min_est = 0;
	ktime_t t1, t2;
	int max = 8;
	int ret = 1;
	s64 diff;
	u64 val;

	INIT_HLIST_HEAD(&chain);
	mutex_lock(&ipvs->est_mutex);
	kd = ipvs->est_kt_arr[0];
	mutex_unlock(&ipvs->est_mutex);
	s = kd ? kd->calc_stats : NULL;
	if (!s)
		goto out;
	hlist_add_head(&s->est.list, &chain);

	loops = 1;
	/* Get best result from many tests */
	for (ntest = 0; ntest < 12; ntest++) {
		if (!(ntest & 3)) {
			/* Wait for cpufreq frequency transition */
			wait_event_idle_timeout(wq, kthread_should_stop(),
						HZ / 50);
			if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
				goto stop;
		}

		local_bh_disable();
		rcu_read_lock();

		/* Put stats in cache */
		ip_vs_chain_estimation(&chain);

		t1 = ktime_get();
		for (i = loops * cache_factor; i > 0; i--)
			ip_vs_chain_estimation(&chain);
		t2 = ktime_get();

		rcu_read_unlock();
		local_bh_enable();

		if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
			goto stop;
		cond_resched();

		diff = ktime_to_ns(ktime_sub(t2, t1));
		if (diff <= 1 * NSEC_PER_USEC) {
			/* Do more loops on low time resolution */
			loops *= 2;
			continue;
		}
		if (diff >= NSEC_PER_SEC)
			continue;
		val = diff;
		do_div(val, loops);
		if (!min_est || val < min_est) {
			min_est = val;
			/* goal: 95usec per chain */
			val = 95 * NSEC_PER_USEC;
			if (val >= min_est) {
				do_div(val, min_est);
				max = (int)val;
			} else {
				max = 1;
			}
		}
	}

out:
	if (s)
		hlist_del_init(&s->est.list);
	*chain_max = max;
	return ret;

stop:
	ret = 0;
	goto out;
}

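/* Editorial example of the goal computation above (numbers are
 * illustrative): if the best measured cost settles at min_est = 5000ns,
 * then max = 95 * NSEC_PER_USEC / min_est = 95000 / 5000 = 19, i.e.
 * chains are sized so that estimating one full chain costs on the
 * order of the 95usec goal.
 */
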
/* Calculate the parameters and apply them in context of kt #0
 * ECP: est_calc_phase
 * ECM: est_chain_max
 * ECP	ECM	Insert Chain	enable	Description
 * ---------------------------------------------------------------------------
 * 0	0	est_temp_list	0	create kt #0 context
 * 0	0	est_temp_list	0->1	service added, start kthread #0 task
 * 0->1	0	est_temp_list	1	kt task #0 started, enters calc phase
 * 1	0	est_temp_list	1	kt #0: determine est_chain_max,
 *					stop tasks, move ests to est_temp_list
 *					and free kd for kthreads 1..last
 * 1->0	0->N	kt chains	1	ests can go to kthreads
 * 0	N	kt chains	1	drain est_temp_list, create new kthread
 *					contexts, start tasks, estimate
 */
static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
{
	int genid = atomic_read(&ipvs->est_genid);
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	struct ip_vs_estimator *est;
	struct ip_vs_stats *stats;
	int id, row, cid, delay;
	bool last, last_td;
	int chain_max;
	int step;

	if (!ip_vs_est_calc_limits(ipvs, &chain_max))
		return;

	/* Stop all other tasks, so that we can immediately move the
	 * estimators to est_temp_list without RCU grace period
	 */
	mutex_lock(&ipvs->est_mutex);
	for (id = 1; id < ipvs->est_kt_count; id++) {
		/* netns cleanup started, abort */
		if (kthread_should_stop() || !READ_ONCE(ipvs->enable)) {
			mutex_unlock(&ipvs->est_mutex);
			return;
		}
		kd = ipvs->est_kt_arr[id];
		if (!kd)
			continue;
		ip_vs_est_kthread_stop(kd);
	}
	mutex_unlock(&ipvs->est_mutex);

	mutex_lock(&ipvs->service_mutex);

	/* Move all estimators to est_temp_list but carefully,
	 * all estimators and kthread data can be released while
	 * we reschedule.
	 */
	step = 0;

	/* Order entries in est_temp_list in ascending delay, so now
	 * walk delay(desc), id(desc), cid(asc)
	 */
	delay = IPVS_EST_NTICKS;

next_delay:
	delay--;
	if (delay < 0)
		goto end_dequeue;

last_kt:
	/* Destroy contexts backwards */
	id = ipvs->est_kt_count;

next_kt:
	if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
		goto unlock;
	id--;
	if (id < 0)
		goto next_delay;
	kd = ipvs->est_kt_arr[id];
	if (!kd)
		goto next_kt;
	/* kt 0 can exist with empty chains */
	if (!id && kd->est_count <= 1)
		goto next_delay;

	row = kd->est_row + delay;
	if (row >= IPVS_EST_NTICKS)
		row -= IPVS_EST_NTICKS;
	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td)
		goto next_kt;

	cid = 0;

walk_chain:
	if (kthread_should_stop())
		goto unlock;
	step++;
	if (!(step & 63)) {
		/* Give estimators a chance to be added (to est_temp_list)
		 * and deleted (releasing kthread contexts)
		 */
		mutex_unlock(&ipvs->service_mutex);
		cond_resched();
		mutex_lock(&ipvs->service_mutex);

		/* Current kt released ? */
		if (id >= ipvs->est_kt_count)
			goto last_kt;
		if (kd != ipvs->est_kt_arr[id])
			goto next_kt;
		/* Current td released ? */
		if (td != rcu_dereference_protected(kd->ticks[row], 1))
			goto next_kt;
		/* No fatal changes on the current kd and td */
	}
	est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
			       list);
	if (!est) {
		cid++;
		if (cid >= IPVS_EST_TICK_CHAINS)
			goto next_kt;
		goto walk_chain;
	}
	/* We can cheat and increase est_count to protect kt 0 context
	 * from release but we prefer to keep the last estimator
	 */
	last = kd->est_count <= 1;
	/* Do not free kt #0 data */
	if (!id && last)
		goto next_delay;
	last_td = kd->tick_len[row] <= 1;
	stats = container_of(est, struct ip_vs_stats, est);
	ip_vs_stop_estimator(ipvs, stats);
	/* Tasks are stopped, move without RCU grace period */
	est->ktid = -1;
	est->ktrow = delay;
	hlist_add_head(&est->list, &ipvs->est_temp_list);
	/* kd freed ? */
	if (last)
		goto next_kt;
	/* td freed ? */
	if (last_td)
		goto next_kt;
	goto walk_chain;

end_dequeue:
	/* All estimators removed while calculating ? */
	if (!ipvs->est_kt_count)
		goto unlock;
	kd = ipvs->est_kt_arr[0];
	if (!kd)
		goto unlock;
	kd->add_row = kd->est_row;
	ipvs->est_chain_max = chain_max;
	ip_vs_est_set_params(ipvs, kd);

	pr_info("using max %d ests per chain, %d per kthread\n",
		kd->chain_max, kd->est_max_count);

	/* Try to keep tot_stats in kt0, enqueue it early */
	if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
	    ipvs->tot_stats->s.est.ktid == -1) {
		hlist_del(&ipvs->tot_stats->s.est.list);
		hlist_add_head(&ipvs->tot_stats->s.est.list,
			       &ipvs->est_temp_list);
	}

	mutex_lock(&ipvs->est_mutex);

	/* We completed the calc phase, new calc phase not requested */
	if (genid == atomic_read(&ipvs->est_genid))
		ipvs->est_calc_phase = 0;

	mutex_unlock(&ipvs->est_mutex);

unlock:
	mutex_unlock(&ipvs->service_mutex);
}

void ip_vs_zero_estimator(struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	struct ip_vs_kstats *k = &stats->kstats;

	/* reset counters, caller must hold the stats->lock */
	est->last_inbytes = k->inbytes;
	est->last_outbytes = k->outbytes;
	est->last_conns = k->conns;
	est->last_inpkts = k->inpkts;
	est->last_outpkts = k->outpkts;
	est->cps = 0;
	est->inpps = 0;
	est->outpps = 0;
	est->inbps = 0;
	est->outbps = 0;
}

/* Get decoded rates */
void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *e = &stats->est;

	dst->cps = (e->cps + 0x1FF) >> 10;
	dst->inpps = (e->inpps + 0x1FF) >> 10;
	dst->outpps = (e->outpps + 0x1FF) >> 10;
	dst->inbps = (e->inbps + 0xF) >> 5;
	dst->outbps = (e->outbps + 0xF) >> 5;
}

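/* Editorial example of the decode above: with e->cps = 512000 in the
 * 2^10 fixed-point format, (512000 + 0x1FF) >> 10 = 500 conns/s. The
 * 0x1FF and 0xF biases make the shift round to the nearest integer
 * (exact halves round down) instead of always truncating.
 */
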
int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
	INIT_HLIST_HEAD(&ipvs->est_temp_list);
	ipvs->est_kt_arr = NULL;
	ipvs->est_max_threads = 0;
	ipvs->est_calc_phase = 0;
	ipvs->est_chain_max = 0;
	ipvs->est_kt_count = 0;
	ipvs->est_add_ktid = 0;
	atomic_set(&ipvs->est_genid, 0);
	atomic_set(&ipvs->est_genid_done, 0);
	__mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
	return 0;
}

void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
	int i;

	for (i = 0; i < ipvs->est_kt_count; i++)
		ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
	kfree(ipvs->est_kt_arr);
	mutex_destroy(&ipvs->est_mutex);
}
966