// SPDX-License-Identifier: GPL-2.0
/* IPVS:	Maglev Hashing scheduling module
 *
 * Authors:	Inju Song <inju.song@navercorp.com>
 *
 */

/* The mh algorithm assigns a preference list of all the lookup table
 * positions to each destination and populates the table with each
 * destination's most-preferred position. A destination is then
 * selected by hashing the source IP address and looking the hash key
 * up in the lookup table.
 *
 * The algorithm is detailed in:
 * [3.4 Consistent Hashing]
 * https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
 *
 */

#define pr_fmt(fmt) "IPVS: " fmt

#include <linux/ip.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>

#include <net/ip_vs.h>

#include <linux/siphash.h>
#include <linux/bitops.h>
#include <linux/gcd.h>

#define IP_VS_SVC_F_SCHED_MH_FALLBACK	IP_VS_SVC_F_SCHED1 /* MH fallback */
#define IP_VS_SVC_F_SCHED_MH_PORT	IP_VS_SVC_F_SCHED2 /* MH use port */

struct ip_vs_mh_lookup {
	struct ip_vs_dest __rcu	*dest;	/* real server (cache) */
};

struct ip_vs_mh_dest_setup {
	unsigned int	offset;	/* starting offset */
	unsigned int	skip;	/* skip */
	unsigned int	perm;	/* next_offset */
	int		turns;	/* weight / gcd() and rshift */
};

/* Available prime numbers for MH table */
static int primes[] = {251, 509, 1021, 2039, 4093,
		       8191, 16381, 32749, 65521, 131071};

/* For IPVS MH entry hash table */
#ifndef CONFIG_IP_VS_MH_TAB_INDEX
#define CONFIG_IP_VS_MH_TAB_INDEX	12
#endif
#define IP_VS_MH_TAB_BITS		(CONFIG_IP_VS_MH_TAB_INDEX / 2)
#define IP_VS_MH_TAB_INDEX		(CONFIG_IP_VS_MH_TAB_INDEX - 8)
#define IP_VS_MH_TAB_SIZE		primes[IP_VS_MH_TAB_INDEX]
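
/* Sizing note (worked example, not part of the original source): with
 * the default CONFIG_IP_VS_MH_TAB_INDEX of 12, IP_VS_MH_TAB_BITS is
 * 12 / 2 = 6, IP_VS_MH_TAB_INDEX is 12 - 8 = 4, and IP_VS_MH_TAB_SIZE
 * is primes[4] = 4093.  The table size must be prime: every skip value
 * in [1, IP_VS_MH_TAB_SIZE - 1] is then coprime to the size, so the
 * probe sequence (offset + j * skip) % IP_VS_MH_TAB_SIZE visits each
 * of the 4093 slots exactly once as j runs from 0 to 4092.
 */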

struct ip_vs_mh_state {
	struct rcu_head			rcu_head;
	struct ip_vs_mh_lookup		*lookup;
	struct ip_vs_mh_dest_setup	*dest_setup;
	hsiphash_key_t			hash1, hash2;
	int				gcd;
	int				rshift;
};

static inline void generate_hash_secret(hsiphash_key_t *hash1,
					hsiphash_key_t *hash2)
{
	hash1->key[0] = 2654435761UL;
	hash1->key[1] = 2654435761UL;

	hash2->key[0] = 2654446892UL;
	hash2->key[1] = 2654446892UL;
}

/* Helper function to determine if server is unavailable */
static inline bool is_unavailable(struct ip_vs_dest *dest)
{
	return atomic_read(&dest->weight) <= 0 ||
	       dest->flags & IP_VS_DEST_F_OVERLOAD;
}

/* Returns hash value for IPVS MH entry */
static inline unsigned int
ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
		 __be16 port, hsiphash_key_t *key, unsigned int offset)
{
	unsigned int v;
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
			    addr->ip6[2] ^ addr->ip6[3];
#endif
	v = (offset + ntohs(port) + ntohl(addr_fold));
	return hsiphash(&v, sizeof(v), key);
}

/* Reset all the hash buckets of the specified table. */
static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
{
	int i;
	struct ip_vs_mh_lookup *l;
	struct ip_vs_dest *dest;

	l = &s->lookup[0];
	for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
		dest = rcu_dereference_protected(l->dest, 1);
		if (dest) {
			ip_vs_dest_put(dest);
			RCU_INIT_POINTER(l->dest, NULL);
		}
		l++;
	}
}

static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
			      struct ip_vs_service *svc)
{
	struct list_head *p;
	struct ip_vs_mh_dest_setup *ds;
	struct ip_vs_dest *dest;
	int lw;

	/* If gcd is smaller than 1, the number of dests or all
	 * last_weight of dests are zero. So, skip the permutation
	 * for the dests.
	 */
	if (s->gcd < 1)
		return 0;

	/* Set dest_setup for the dests permutation */
	p = &svc->destinations;
	ds = &s->dest_setup[0];
	while ((p = p->next) != &svc->destinations) {
		dest = list_entry(p, struct ip_vs_dest, n_list);

		ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
					      dest->port, &s->hash1, 0) %
					      IP_VS_MH_TAB_SIZE;
		ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
					    dest->port, &s->hash2, 0) %
					    (IP_VS_MH_TAB_SIZE - 1) + 1;
		ds->perm = ds->offset;

		lw = atomic_read(&dest->last_weight);
		ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
		ds++;
	}

	return 0;
}

static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
			     struct ip_vs_service *svc)
{
	int n, c, dt_count;
	unsigned long *table;
	struct list_head *p;
	struct ip_vs_mh_dest_setup *ds;
	struct ip_vs_dest *dest, *new_dest;

	/* If gcd is smaller than 1, the number of dests or all
	 * last_weight of dests are zero. So, skip the population
	 * for the dests and reset the lookup table.
	 */
	if (s->gcd < 1) {
		ip_vs_mh_reset(s);
		return 0;
	}

	table = bitmap_zalloc(IP_VS_MH_TAB_SIZE, GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	p = &svc->destinations;
	n = 0;
	dt_count = 0;
	while (n < IP_VS_MH_TAB_SIZE) {
		if (p == &svc->destinations)
			p = p->next;

		ds = &s->dest_setup[0];
		while (p != &svc->destinations) {
			/* Ignore added server with zero weight */
			if (ds->turns < 1) {
				p = p->next;
				ds++;
				continue;
			}

			c = ds->perm;
			while (test_bit(c, table)) {
				/* Add skip, mod IP_VS_MH_TAB_SIZE */
				ds->perm += ds->skip;
				if (ds->perm >= IP_VS_MH_TAB_SIZE)
					ds->perm -= IP_VS_MH_TAB_SIZE;
				c = ds->perm;
			}

			__set_bit(c, table);

			dest = rcu_dereference_protected(s->lookup[c].dest, 1);
			new_dest = list_entry(p, struct ip_vs_dest, n_list);
			if (dest != new_dest) {
				if (dest)
					ip_vs_dest_put(dest);
				ip_vs_dest_hold(new_dest);
				RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
			}

			if (++n == IP_VS_MH_TAB_SIZE)
				goto out;

			if (++dt_count >= ds->turns) {
				dt_count = 0;
				p = p->next;
				ds++;
			}
		}
	}

out:
	bitmap_free(table);
	return 0;
}
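
/* Population walk-through (numbers are purely illustrative; real
 * offsets and skips come from hsiphash and the real table size is a
 * large prime such as 4093): assume a table of size 7 and three dests
 * with one turn each, d0 {offset 3, skip 4}, d1 {offset 0, skip 2},
 * d2 {offset 3, skip 1}.
 *
 *   pass 1: d0 takes slot 3; d1 takes slot 0; d2 finds 3 taken and
 *           takes slot 4.
 *   pass 2: d0 probes 3, 0, 4 and takes slot 1; d1 probes 0 and takes
 *           slot 2; d2 probes 4 and takes slot 5.
 *   pass 3: d0 probes 1, 5, 2 and takes slot 6; the table is full.
 *
 * Result: [d1, d0, d1, d0, d2, d2, d0].  Each dest keeps most of its
 * slots when another dest is added or removed, which is the consistent
 * hashing property the Maglev paper aims for.
 */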

/* Get ip_vs_dest associated with supplied parameters. */
static inline struct ip_vs_dest *
ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
	     const union nf_inet_addr *addr, __be16 port)
{
	unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
					     % IP_VS_MH_TAB_SIZE;
	struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);

	return (!dest || is_unavailable(dest)) ? NULL : dest;
}

/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
static inline struct ip_vs_dest *
ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
		      const union nf_inet_addr *addr, __be16 port)
{
	unsigned int offset, roffset;
	unsigned int hash, ihash;
	struct ip_vs_dest *dest;

	/* First try the dest it's supposed to go to */
	ihash = ip_vs_mh_hashkey(svc->af, addr, port,
				 &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
	dest = rcu_dereference(s->lookup[ihash].dest);
	if (!dest)
		return NULL;
	if (!is_unavailable(dest))
		return dest;

	IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));

	/* If the original dest is unavailable, loop around the table
	 * starting from ihash to find a new dest
	 */
	for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
		roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
		hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
					roffset) % IP_VS_MH_TAB_SIZE;
		dest = rcu_dereference(s->lookup[hash].dest);
		if (!dest)
			break;
		if (!is_unavailable(dest))
			return dest;
		IP_VS_DBG_BUF(6,
			      "MH: selected unavailable server %s:%u (offset %u), reselecting",
			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
			      ntohs(dest->port), roffset);
	}

	return NULL;
}

/* Assign all the hash buckets of the specified table with the service. */
static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
			     struct ip_vs_service *svc)
{
	int ret;

	if (svc->num_dests > IP_VS_MH_TAB_SIZE)
		return -EINVAL;

	if (svc->num_dests >= 1) {
		s->dest_setup = kcalloc(svc->num_dests,
					sizeof(struct ip_vs_mh_dest_setup),
					GFP_KERNEL);
		if (!s->dest_setup)
			return -ENOMEM;
	}

	ip_vs_mh_permutate(s, svc);

	ret = ip_vs_mh_populate(s, svc);
	if (ret < 0)
		goto out;

	IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
		      IP_VS_DBG_ADDR(svc->af, &svc->addr),
		      ntohs(svc->port));

out:
	if (svc->num_dests >= 1) {
		kfree(s->dest_setup);
		s->dest_setup = NULL;
	}
	return ret;
}

static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest;
	int weight;
	int g = 0;

	list_for_each_entry(dest, &svc->destinations, n_list) {
		weight = atomic_read(&dest->last_weight);
		if (weight > 0) {
			if (g > 0)
				g = gcd(weight, g);
			else
				g = weight;
		}
	}
	return g;
}

/* To avoid assigning huge weight for the MH table,
 * calculate shift value with gcd.
 */
static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
{
	struct ip_vs_dest *dest;
	int new_weight, weight = 0;
	int mw, shift;

	/* If gcd is smaller than 1, the number of dests or all
	 * last_weight of dests are zero. So, return the shift
	 * value as zero.
	 */
	if (gcd < 1)
		return 0;

	list_for_each_entry(dest, &svc->destinations, n_list) {
		new_weight = atomic_read(&dest->last_weight);
		if (new_weight > weight)
			weight = new_weight;
	}

	/* Because gcd is greater than zero,
	 * the maximum weight and gcd are always greater than zero
	 */
	mw = weight / gcd;

	/* shift = occupied bits of weight/gcd - MH highest bits */
	shift = fls(mw) - IP_VS_MH_TAB_BITS;
	return (shift >= 0) ? shift : 0;
}
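
/* Weight arithmetic, worked example (numbers are illustrative): with
 * the default IP_VS_MH_TAB_BITS of 6 and two dests whose last_weight
 * values are 1 and 1000, gcd is 1 and the maximum weight/gcd ratio is
 * mw = 1000, so shift = fls(1000) - 6 = 10 - 6 = 4.  In
 * ip_vs_mh_permutate() the dest with weight 1000 then gets
 * (1000 / 1) >> 4 = 62 turns, while the dest with weight 1 gets
 * (1 / 1) >> 4 = 0, which the "?: (lw != 0)" fallback raises to one
 * turn so that any dest with a nonzero weight keeps at least one slot.
 */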

static void ip_vs_mh_state_free(struct rcu_head *head)
{
	struct ip_vs_mh_state *s;

	s = container_of(head, struct ip_vs_mh_state, rcu_head);
	kfree(s->lookup);
	kfree(s);
}

static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
{
	int ret;
	struct ip_vs_mh_state *s;

	/* Allocate the MH table for this service */
	s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;

	s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
			    GFP_KERNEL);
	if (!s->lookup) {
		kfree(s);
		return -ENOMEM;
	}

	generate_hash_secret(&s->hash1, &s->hash2);
	s->gcd = ip_vs_mh_gcd_weight(svc);
	s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);

	IP_VS_DBG(6,
		  "MH lookup table (memory=%zdbytes) allocated for current service\n",
		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);

	/* Assign the lookup table with current dests */
	ret = ip_vs_mh_reassign(s, svc);
	if (ret < 0) {
		ip_vs_mh_reset(s);
		ip_vs_mh_state_free(&s->rcu_head);
		return ret;
	}

	/* No more failures, attach state */
	svc->sched_data = s;
	return 0;
}

static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_mh_state *s = svc->sched_data;

	/* Got to clean up lookup entry here */
	ip_vs_mh_reset(s);

	call_rcu(&s->rcu_head, ip_vs_mh_state_free);
	IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
}

static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
				 struct ip_vs_dest *dest)
{
	struct ip_vs_mh_state *s = svc->sched_data;

	s->gcd = ip_vs_mh_gcd_weight(svc);
	s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);

	/* Assign the lookup table with the updated service */
	return ip_vs_mh_reassign(s, svc);
}

/* Helper function to get port number */
static inline __be16
ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
{
	__be16 _ports[2], *ports;

	/* At this point we know that we have a valid packet of some kind.
	 * Because ICMP packets are only guaranteed to have the first 8
	 * bytes, let's just grab the ports. Fortunately they're in the
	 * same position for all three of the protocols we care about.
	 */
	switch (iph->protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
		ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
					   &_ports);
		if (unlikely(!ports))
			return 0;

		if (likely(!ip_vs_iph_inverse(iph)))
			return ports[0];
		else
			return ports[1];
	default:
		return 0;
	}
}
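
/* Usage sketch (assumes an ipvsadm build that knows the "mh"
 * scheduler; the generic flag-1 and flag-2 scheduler flags map to
 * IP_VS_SVC_F_SCHED_MH_FALLBACK and IP_VS_SVC_F_SCHED_MH_PORT above):
 *
 *   ipvsadm -A -t 192.0.2.1:80 -s mh -b flag-1,flag-2
 *
 * With flag-2 set, ip_vs_mh_schedule() below hashes on the source
 * address plus the source port; with flag-1 set it probes alternative
 * buckets via ip_vs_mh_get_fallback() when the hashed dest is
 * unavailable.
 */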

/* Maglev Hashing scheduling */
static struct ip_vs_dest *
ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
		  struct ip_vs_iphdr *iph)
{
	struct ip_vs_dest *dest;
	struct ip_vs_mh_state *s;
	__be16 port = 0;
	const union nf_inet_addr *hash_addr;

	hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;

	IP_VS_DBG(6, "%s : Scheduling...\n", __func__);

	if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
		port = ip_vs_mh_get_port(skb, iph);

	s = (struct ip_vs_mh_state *)svc->sched_data;

	if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
		dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
	else
		dest = ip_vs_mh_get(svc, s, hash_addr, port);

	if (!dest) {
		ip_vs_scheduler_err(svc, "no destination available");
		return NULL;
	}

	IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
		      IP_VS_DBG_ADDR(svc->af, hash_addr),
		      ntohs(port),
		      IP_VS_DBG_ADDR(dest->af, &dest->addr),
		      ntohs(dest->port));

	return dest;
}

/* IPVS MH Scheduler structure */
static struct ip_vs_scheduler ip_vs_mh_scheduler = {
	.name =			"mh",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
	.init_service =		ip_vs_mh_init_svc,
	.done_service =		ip_vs_mh_done_svc,
	.add_dest =		ip_vs_mh_dest_changed,
	.del_dest =		ip_vs_mh_dest_changed,
	.upd_dest =		ip_vs_mh_dest_changed,
	.schedule =		ip_vs_mh_schedule,
};

static int __init ip_vs_mh_init(void)
{
	return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
}

static void __exit ip_vs_mh_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
	rcu_barrier();
}

module_init(ip_vs_mh_init);
module_exit(ip_vs_mh_cleanup);
MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");
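
/* Tuning note (build-time assumption): the table size is selected at
 * compile time by CONFIG_IP_VS_MH_TAB_INDEX (the IP_VS_MH_TAB_INDEX
 * option in net/netfilter/ipvs/Kconfig), which indexes primes[] after
 * subtracting 8; e.g. 8 selects primes[0] = 251 and 17 selects
 * primes[9] = 131071, with 12 (primes[4] = 4093) as the default.
 */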