// SPDX-License-Identifier: GPL-2.0
/* IPVS:	Maglev Hashing scheduling module
 *
 * Authors:	Inju Song <inju.song@navercorp.com>
 *
 */

/* The mh algorithm assigns each destination a preference list over all
 * lookup table positions and populates the table with each destination's
 * most-preferred available position. A destination is then selected by
 * hashing the source IP address and looking the key up in the table.
 *
 * The algorithm is detailed in:
 * [3.4 Consistent Hashing]
 * https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
 *
 */

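/* A small worked example (illustrative numbers, not from the paper):
 * with table size M = 7 and two equal-weight destinations A and B,
 * suppose the hashes give A (offset 3, skip 4) and B (offset 0, skip 2).
 * The preference lists (offset + i*skip mod M) are
 *
 *   A: 3, 0, 4, 1, 5, 2, 6
 *   B: 0, 2, 4, 6, 1, 3, 5
 *
 * Taking turns and skipping slots that are already taken, A claims
 * 3, 4, 1, 5 and B claims 0, 2, 6, giving
 *
 *   lookup[] = { B, A, B, A, A, A, B }
 *
 * so the 7 buckets split 4/3 between the two destinations.
 */
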
#define pr_fmt(fmt) "IPVS: " fmt

#include <linux/ip.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>

#include <net/ip_vs.h>

#include <linux/siphash.h>
#include <linux/bitops.h>
#include <linux/gcd.h>

#define IP_VS_SVC_F_SCHED_MH_FALLBACK	IP_VS_SVC_F_SCHED1 /* MH fallback */
#define IP_VS_SVC_F_SCHED_MH_PORT	IP_VS_SVC_F_SCHED2 /* MH use port */
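
/* Both flags map onto the generic IP_VS_SVC_F_SCHED* bits and are set
 * from user space; with a recent ipvsadm this is the scheduler-flags
 * option, e.g. "ipvsadm -A -t <vip>:<port> -s mh -b mh-fallback,mh-port"
 * (flag names as documented in the ipvsadm man page; verify against
 * your version).
 */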

struct ip_vs_mh_lookup {
	struct ip_vs_dest __rcu	*dest;	/* real server (cache) */
};

struct ip_vs_mh_dest_setup {
	unsigned int	offset; /* starting offset */
	unsigned int	skip;	/* skip */
	unsigned int	perm;	/* next_offset */
	int		turns;	/* weight / gcd() and rshift */
};

/* Available prime numbers for MH table */
static int primes[] = {251, 509, 1021, 2039, 4093,
		       8191, 16381, 32749, 65521, 131071};

/* For IPVS MH entry hash table */
#ifndef CONFIG_IP_VS_MH_TAB_INDEX
#define CONFIG_IP_VS_MH_TAB_INDEX	12
#endif
#define IP_VS_MH_TAB_BITS		(CONFIG_IP_VS_MH_TAB_INDEX / 2)
#define IP_VS_MH_TAB_INDEX		(CONFIG_IP_VS_MH_TAB_INDEX - 8)
#define IP_VS_MH_TAB_SIZE		primes[IP_VS_MH_TAB_INDEX]

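/* Worked example for the default CONFIG_IP_VS_MH_TAB_INDEX of 12:
 *   IP_VS_MH_TAB_BITS  = 12 / 2 = 6
 *   IP_VS_MH_TAB_INDEX = 12 - 8 = 4
 *   IP_VS_MH_TAB_SIZE  = primes[4] = 4093
 * A prime table size keeps every (offset, skip) pair a full permutation
 * of the table, as required by Maglev hashing.
 */
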
struct ip_vs_mh_state {
	struct rcu_head			rcu_head;
	struct ip_vs_mh_lookup		*lookup;
	struct ip_vs_mh_dest_setup	*dest_setup;
	hsiphash_key_t			hash1, hash2;
	int				gcd;
	int				rshift;
};

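/* The hash keys below are fixed constants rather than random bytes,
 * presumably so that the resulting permutations, and therefore the
 * lookup table, come out identical on every director for a given set
 * of destinations. 2654435761 is 0x9e3779b1, the 32-bit golden-ratio
 * constant familiar from multiplicative hashing; the second constant
 * appears to be arbitrary.
 */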
static inline void generate_hash_secret(hsiphash_key_t *hash1,
					hsiphash_key_t *hash2)
{
	hash1->key[0] = 2654435761UL;
	hash1->key[1] = 2654435761UL;

	hash2->key[0] = 2654446892UL;
	hash2->key[1] = 2654446892UL;
}

/* Helper function to determine if server is unavailable */
static inline bool is_unavailable(struct ip_vs_dest *dest)
{
	return atomic_read(&dest->weight) <= 0 ||
	       dest->flags & IP_VS_DEST_F_OVERLOAD;
}

/* Returns hash value for IPVS MH entry */
static inline unsigned int
ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
		 __be16 port, hsiphash_key_t *key, unsigned int offset)
{
	unsigned int v;
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
			    addr->ip6[2] ^ addr->ip6[3];
#endif
	v = (offset + ntohs(port) + ntohl(addr_fold));
	return hsiphash(&v, sizeof(v), key);
}

/* Reset all the hash buckets of the specified table. */
static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
{
	int i;
	struct ip_vs_mh_lookup *l;
	struct ip_vs_dest *dest;

	l = &s->lookup[0];
	for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
		dest = rcu_dereference_protected(l->dest, 1);
		if (dest) {
			ip_vs_dest_put(dest);
			RCU_INIT_POINTER(l->dest, NULL);
		}
		l++;
	}
}

static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
			      struct ip_vs_service *svc)
{
	struct list_head *p;
	struct ip_vs_mh_dest_setup *ds;
	struct ip_vs_dest *dest;
	int lw;

	/* If gcd is smaller than 1, there are no dests or
	 * all dests have a zero last_weight, so skip the
	 * permutation for the dests.
	 */
	if (s->gcd < 1)
		return 0;

	/* Set dest_setup for the dests permutation */
	p = &svc->destinations;
	ds = &s->dest_setup[0];
	while ((p = p->next) != &svc->destinations) {
		dest = list_entry(p, struct ip_vs_dest, n_list);

		ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
					      dest->port, &s->hash1, 0) %
					      IP_VS_MH_TAB_SIZE;
		ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
					    dest->port, &s->hash2, 0) %
					    (IP_VS_MH_TAB_SIZE - 1) + 1;
		ds->perm = ds->offset;

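		/* turns is the number of consecutive table slots this
		 * dest claims per round of population. For example
		 * (illustrative numbers), with last_weights {16, 32, 48}
		 * we get gcd = 16 and rshift = 0, so turns become
		 * {1, 2, 3}; the "?: (lw != 0)" fallback keeps any dest
		 * with a small nonzero weight at one turn instead of
		 * rounding it down to zero.
		 */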
		lw = atomic_read(&dest->last_weight);
		ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
		ds++;
	}

	return 0;
}

static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
			     struct ip_vs_service *svc)
{
	int n, c, dt_count;
	unsigned long *table;
	struct list_head *p;
	struct ip_vs_mh_dest_setup *ds;
	struct ip_vs_dest *dest, *new_dest;

	/* If gcd is smaller than 1, there are no dests or
	 * all dests have a zero last_weight, so skip the
	 * population for the dests and reset the lookup table.
	 */
	if (s->gcd < 1) {
		ip_vs_mh_reset(s);
		return 0;
	}

	table = bitmap_zalloc(IP_VS_MH_TAB_SIZE, GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	p = &svc->destinations;
	n = 0;
	dt_count = 0;
	while (n < IP_VS_MH_TAB_SIZE) {
		if (p == &svc->destinations)
			p = p->next;

		ds = &s->dest_setup[0];
		while (p != &svc->destinations) {
			/* Ignore added server with zero weight */
			if (ds->turns < 1) {
				p = p->next;
				ds++;
				continue;
			}

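			/* Walk this dest's permutation until a free slot
			 * is found. Because IP_VS_MH_TAB_SIZE is prime
			 * and skip is in [1, size - 1], the probe
			 * sequence visits every slot, so this loop
			 * terminates as long as any slot is still free
			 * (guaranteed by the n < IP_VS_MH_TAB_SIZE bound
			 * above).
			 */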
			c = ds->perm;
			while (test_bit(c, table)) {
				/* Add skip, mod IP_VS_MH_TAB_SIZE */
				ds->perm += ds->skip;
				if (ds->perm >= IP_VS_MH_TAB_SIZE)
					ds->perm -= IP_VS_MH_TAB_SIZE;
				c = ds->perm;
			}

			__set_bit(c, table);

			dest = rcu_dereference_protected(s->lookup[c].dest, 1);
			new_dest = list_entry(p, struct ip_vs_dest, n_list);
			if (dest != new_dest) {
				if (dest)
					ip_vs_dest_put(dest);
				ip_vs_dest_hold(new_dest);
				RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
			}

			if (++n == IP_VS_MH_TAB_SIZE)
				goto out;

			if (++dt_count >= ds->turns) {
				dt_count = 0;
				p = p->next;
				ds++;
			}
		}
	}

out:
	bitmap_free(table);
	return 0;
}

/* Get ip_vs_dest associated with supplied parameters. */
static inline struct ip_vs_dest *
ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
	     const union nf_inet_addr *addr, __be16 port)
{
	unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
					     % IP_VS_MH_TAB_SIZE;
	struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);

	return (!dest || is_unavailable(dest)) ? NULL : dest;
}

/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
static inline struct ip_vs_dest *
ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
		      const union nf_inet_addr *addr, __be16 port)
{
	unsigned int offset, roffset;
	unsigned int hash, ihash;
	struct ip_vs_dest *dest;

	/* First try the dest it's supposed to go to */
	ihash = ip_vs_mh_hashkey(svc->af, addr, port,
				 &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
	dest = rcu_dereference(s->lookup[ihash].dest);
	if (!dest)
		return NULL;
	if (!is_unavailable(dest))
		return dest;

	IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));

	/* If the original dest is unavailable, loop around the table
	 * starting from ihash to find a new dest
	 */
	for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
		roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
		hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
					roffset) % IP_VS_MH_TAB_SIZE;
		dest = rcu_dereference(s->lookup[hash].dest);
		if (!dest)
			break;
		if (!is_unavailable(dest))
			return dest;
		IP_VS_DBG_BUF(6,
			      "MH: selected unavailable server %s:%u (offset %u), reselecting",
			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
			      ntohs(dest->port), roffset);
	}

	return NULL;
}

/* Assign all the hash buckets of the specified table to the service's
 * destinations.
 */
static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
			     struct ip_vs_service *svc)
{
	int ret;

	if (svc->num_dests > IP_VS_MH_TAB_SIZE)
		return -EINVAL;

	if (svc->num_dests >= 1) {
		s->dest_setup = kcalloc(svc->num_dests,
					sizeof(struct ip_vs_mh_dest_setup),
					GFP_KERNEL);
		if (!s->dest_setup)
			return -ENOMEM;
	}

	ip_vs_mh_permutate(s, svc);

	ret = ip_vs_mh_populate(s, svc);
	if (ret < 0)
		goto out;

	IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
		      IP_VS_DBG_ADDR(svc->af, &svc->addr),
		      ntohs(svc->port));

out:
	if (svc->num_dests >= 1) {
		kfree(s->dest_setup);
		s->dest_setup = NULL;
	}
	return ret;
}

static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest;
	int weight;
	int g = 0;

	list_for_each_entry(dest, &svc->destinations, n_list) {
		weight = atomic_read(&dest->last_weight);
		if (weight > 0) {
			if (g > 0)
				g = gcd(weight, g);
			else
				g = weight;
		}
	}
	return g;
}

/* To avoid assigning a huge number of turns per dest when weights are
 * large, calculate a right-shift value that scales weight/gcd down to
 * the table's resolution.
 */
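/* Worked example (illustrative numbers): with last_weights {1000, 3000},
 * gcd = 1000 and the maximum mw = 3000 / 1000 = 3, so fls(3) = 2 and
 * shift = max(2 - IP_VS_MH_TAB_BITS, 0) = 0; the turns stay {1, 3}.
 * With weights {1, 1000}, mw = 1000, fls(1000) = 10 and shift = 10 - 6
 * = 4, so turns become {1 >> 4 ?: 1, 1000 >> 4} = {1, 62}, keeping the
 * per-round claim counts near the table's resolution of 2^6 buckets.
 */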
static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
{
	struct ip_vs_dest *dest;
	int new_weight, weight = 0;
	int mw, shift;

	/* If gcd is smaller than 1, there are no dests or
	 * all dests have a zero last_weight, so return a
	 * shift value of zero.
	 */
	if (gcd < 1)
		return 0;

	list_for_each_entry(dest, &svc->destinations, n_list) {
		new_weight = atomic_read(&dest->last_weight);
		if (new_weight > weight)
			weight = new_weight;
	}

	/* Because gcd is greater than zero,
	 * the maximum weight and gcd are always greater than zero
	 */
	mw = weight / gcd;

	/* shift = occupied bits of weight/gcd - MH highest bits */
	shift = fls(mw) - IP_VS_MH_TAB_BITS;
	return (shift >= 0) ? shift : 0;
}

static void ip_vs_mh_state_free(struct rcu_head *head)
{
	struct ip_vs_mh_state *s;

	s = container_of(head, struct ip_vs_mh_state, rcu_head);
	kfree(s->lookup);
	kfree(s);
}

static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
{
	int ret;
	struct ip_vs_mh_state *s;

	/* Allocate the MH table for this service */
	s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;

	s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
			    GFP_KERNEL);
	if (!s->lookup) {
		kfree(s);
		return -ENOMEM;
	}

	generate_hash_secret(&s->hash1, &s->hash2);
	s->gcd = ip_vs_mh_gcd_weight(svc);
	s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);

	IP_VS_DBG(6,
		  "MH lookup table (memory=%zdbytes) allocated for current service\n",
		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);

	/* Assign the lookup table with current dests */
	ret = ip_vs_mh_reassign(s, svc);
	if (ret < 0) {
		ip_vs_mh_reset(s);
		ip_vs_mh_state_free(&s->rcu_head);
		return ret;
	}

	/* No more failures, attach state */
	svc->sched_data = s;
	return 0;
}

static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_mh_state *s = svc->sched_data;

	/* Got to clean up lookup entry here */
	ip_vs_mh_reset(s);

	call_rcu(&s->rcu_head, ip_vs_mh_state_free);
	IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
}

static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
				 struct ip_vs_dest *dest)
{
	struct ip_vs_mh_state *s = svc->sched_data;

	s->gcd = ip_vs_mh_gcd_weight(svc);
	s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);

	/* Assign the lookup table with the updated service */
	return ip_vs_mh_reassign(s, svc);
}

/* Helper function to get port number */
static inline __be16
ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
{
	__be16 _ports[2], *ports;

	/* At this point we know that we have a valid packet of some kind.
	 * Because ICMP packets are only guaranteed to have the first 8
	 * bytes, let's just grab the ports.  Fortunately they're in the
	 * same position for all three of the protocols we care about.
	 */
	switch (iph->protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
		ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
					   &_ports);
		if (unlikely(!ports))
			return 0;

		if (likely(!ip_vs_iph_inverse(iph)))
			return ports[0];
		else
			return ports[1];
	default:
		return 0;
	}
}

/* Maglev Hashing scheduling */
static struct ip_vs_dest *
ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
		  struct ip_vs_iphdr *iph)
{
	struct ip_vs_dest *dest;
	struct ip_vs_mh_state *s;
	__be16 port = 0;
	const union nf_inet_addr *hash_addr;

	hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;

	IP_VS_DBG(6, "%s : Scheduling...\n", __func__);

	if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
		port = ip_vs_mh_get_port(skb, iph);

	s = (struct ip_vs_mh_state *)svc->sched_data;

	if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
		dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
	else
		dest = ip_vs_mh_get(svc, s, hash_addr, port);

	if (!dest) {
		ip_vs_scheduler_err(svc, "no destination available");
		return NULL;
	}

	IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
		      IP_VS_DBG_ADDR(svc->af, hash_addr),
		      ntohs(port),
		      IP_VS_DBG_ADDR(dest->af, &dest->addr),
		      ntohs(dest->port));

	return dest;
}

/* IPVS MH Scheduler structure */
static struct ip_vs_scheduler ip_vs_mh_scheduler = {
	.name =			"mh",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
	.init_service =		ip_vs_mh_init_svc,
	.done_service =		ip_vs_mh_done_svc,
	.add_dest =		ip_vs_mh_dest_changed,
	.del_dest =		ip_vs_mh_dest_changed,
	.upd_dest =		ip_vs_mh_dest_changed,
	.schedule =		ip_vs_mh_schedule,
};

static int __init ip_vs_mh_init(void)
{
	return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
}

static void __exit ip_vs_mh_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
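	/* Wait for any pending ip_vs_mh_state_free() callbacks queued by
	 * call_rcu(); the callback lives in this module, so it must not
	 * run after the module text is unloaded.
	 */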
	rcu_barrier();
}

module_init(ip_vs_mh_init);
module_exit(ip_vs_mh_cleanup);
MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");