// SPDX-License-Identifier: GPL-2.0
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Version 1,   is capable of handling both version 0 and 1 messages.
 *              Version 0 is the plain old format.
 *              Note Version 0 receivers will just drop Ver 1 messages.
 *              Version 1 is capable of handling IPv6, Persistence data,
 *              time-outs, and firewall marks.
 *              In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
 *              Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
 *
 * Definitions  Message: is a complete datagram
 *              Sync_conn: is a part of a Message
 *              Param Data is an option to a Sync_conn.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * ip_vs_sync:  sync connection info from master load balancer to backups
 *              through multicast
 *
 * Changes:
 *	Alexandre Cassen	:	Added master & backup support at a time.
 *	Alexandre Cassen	:	Added SyncID support for incoming sync
 *					messages filtering.
 *	Justin Ossevoort	:	Fix endian problem on sync message size.
 *	Hans Schillstrom	:	Added Version 1: i.e. IPv6,
 *					Persistence support, fwmark and time-out.
*/

#define pr_fmt(fmt) "IPVS: " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/inetdevice.h>
#include <linux/net.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/igmp.h>                 /* for ip_mc_join_group */
#include <linux/udp.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>

#include <linux/unaligned.h>		/* Used for ntoh_seq and hton_seq */

#include <net/ip.h>
#include <net/sock.h>

#include <net/ip_vs.h>

#define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
#define IP_VS_SYNC_PORT  8848          /* multicast port */

#define SYNC_PROTO_VER  1		/* Protocol version in header */

/*
 * NOTE(review): not referenced in the part of the file visible here;
 * presumably a lockdep class for the sync sockets - confirm at its user.
 */
static struct lock_class_key __ipvs_sync_key;
/*
 *	IPVS sync connection entry
 *	Version 0, i.e. original version.
 *
 *	Fixed part of one connection record in a version 0 message.
 *	ip_vs_sync_conn_v0() copies the port and address fields straight
 *	from the connection and converts flags/state with htons().
 */
struct ip_vs_sync_conn_v0 {
	__u8			reserved;	/* written as 0 by the sender */

	/* Protocol, addresses and port numbers */
	__u8			protocol;       /* Which protocol (TCP/UDP) */
	__be16			cport;
	__be16                  vport;
	__be16                  dport;
	__be32                  caddr;          /* client address */
	__be32                  vaddr;          /* virtual address */
	__be32                  daddr;          /* destination address */

	/* Flags and state transition */
	__be16                  flags;          /* status flags */
	__be16                  state;          /* state info */

	/* The sequence options start here */
};

/*
 *	Sequence adjustment data for one connection.
 *	In version 0 records it directly follows struct ip_vs_sync_conn_v0
 *	when the record's flags have IP_VS_CONN_F_SEQ_MASK bits set; in
 *	version 1 it is carried as the IPVS_OPT_SEQ_DATA option.
 */
struct ip_vs_sync_conn_options {
	struct ip_vs_seq        in_seq;         /* incoming seq. struct */
	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
};

/*
  Sync Connection format (sync_conn)

       0                   1                   2                   3
       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |    Type       |    Protocol   | Ver.  |        Size           |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                             Flags                             |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |            State              |         cport                 |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |            vport              |         dport                 |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                             fwmark                            |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                             timeout  (in sec.)                |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                              ...                              |
      |                        IP-Addresses  (v4 or v6)               |
      |                              ...                              |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  Optional Parameters.
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      | Param. Type   | Param. Length |   Param. data                 |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+                               |
      |                              ...                              |
      |                               +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                               | Param Type    | Param. Length |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                           Param  data                         |
      |         Last Param data should be padded for 32 bit alignment |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/

/*
 *  Type 0, IPv4 sync connection format
 *
 *  Fixed part of a version 1 connection record with IPv4 addresses.
 *  The fields up to and including timeout are laid out identically in
 *  struct ip_vs_sync_v6, which is why the code in this file accesses them
 *  through the v4 member of union ip_vs_sync_conn for both families.
 */
struct ip_vs_sync_v4 {
	__u8			type;
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			ver_size;	/* Version msb 4 bits */
	/* Flags and state transition */
	__be32			flags;		/* status flags */
	__be16			state;		/* state info 	*/
	/* Protocol, addresses and port numbers */
	__be16			cport;
	__be16			vport;
	__be16			dport;
	__be32			fwmark;		/* Firewall mark from skb */
	__be32			timeout;	/* cp timeout */
	__be32			caddr;		/* client address */
	__be32			vaddr;		/* virtual address */
	__be32			daddr;		/* destination address */
	/* The sequence options start here */
	/* PE data padded to 32bit alignment after seq. options */
};
/*
 * Type 2 messages IPv6
 *
 * Same record as struct ip_vs_sync_v4 but with 128-bit addresses; selected
 * when the STYPE_F_INET6 bit is set in the type byte.
 */
struct ip_vs_sync_v6 {
	__u8			type;
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			ver_size;	/* Version msb 4 bits */
	/* Flags and state transition */
	__be32			flags;		/* status flags */
	__be16			state;		/* state info 	*/
	/* Protocol, addresses and port numbers */
	__be16			cport;
	__be16			vport;
	__be16			dport;
	__be32			fwmark;		/* Firewall mark from skb */
	__be32			timeout;	/* cp timeout */
	struct in6_addr		caddr;		/* client address */
	struct in6_addr		vaddr;		/* virtual address */
	struct in6_addr		daddr;		/* destination address */
	/* The sequence options start here */
	/* PE data padded to 32bit alignment after seq. options */
};

/* One version 1 connection record, viewed as either address family */
union ip_vs_sync_conn {
	struct ip_vs_sync_v4	v4;
	struct ip_vs_sync_v6	v6;
};

/* Bits in Type field in above */
#define STYPE_INET6		0
#define STYPE_F_INET6		(1 << STYPE_INET6)

/*
 * ver_size layout: the record version lives in the top 4 bits and the
 * record size in the low 12 bits (host order, after ntohs()).
 */
#define SVER_SHIFT		12		/* Shift to get version */
#define SVER_MASK		0x0fff		/* Mask to strip version */

/* Option type codes carried in the "Param. Type" byte of a v1 record */
#define IPVS_OPT_SEQ_DATA	1
#define IPVS_OPT_PE_DATA	2
#define IPVS_OPT_PE_NAME	3
#define IPVS_OPT_PARAM		7

/*
 * One flag bit per option type, used by the receiver to detect options
 * that occur twice.  IPVS_OPT_F_PARAM is additionally masked out of the
 * received type byte: an unknown option type without that bit is treated
 * as mandatory and makes the receiver skip the record.
 */
#define IPVS_OPT_F_SEQ_DATA	(1 << (IPVS_OPT_SEQ_DATA-1))
#define IPVS_OPT_F_PE_DATA	(1 << (IPVS_OPT_PE_DATA-1))
#define IPVS_OPT_F_PE_NAME	(1 << (IPVS_OPT_PE_NAME-1))
#define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM-1))

/*
 * Per sync thread bookkeeping.  sb_queue_tail() wakes the task stored in
 * ipvs->master_tinfo[id].task; the remaining fields are used by code that
 * is not visible in this part of the file.
 */
struct ip_vs_sync_thread_data {
	struct task_struct *task;
	struct netns_ipvs *ipvs;
	struct socket *sock;
	char *buf;
	int id;
};

/* Version 0 definition of packet sizes */
#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE  \
(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))


/*
  The master multicasts messages (Datagrams) to the backup load balancers
  in the following format.
213 214 Version 1: 215 Note, first byte should be Zero, so ver 0 receivers will drop the packet. 216 217 0 1 2 3 218 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 219 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 220 | 0 | SyncID | Size | 221 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 222 | Count Conns | Version | Reserved, set to Zero | 223 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 224 | | 225 | IPVS Sync Connection (1) | 226 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 227 | . | 228 ~ . ~ 229 | . | 230 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 231 | | 232 | IPVS Sync Connection (n) | 233 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 234 235 Version 0 Header 236 0 1 2 3 237 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 238 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 239 | Count Conns | SyncID | Size | 240 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 241 | IPVS Sync Connection (1) | 242 */ 243 244 /* Version 0 header */ 245 struct ip_vs_sync_mesg_v0 { 246 __u8 nr_conns; 247 __u8 syncid; 248 __be16 size; 249 250 /* ip_vs_sync_conn entries start here */ 251 }; 252 253 /* Version 1 header */ 254 struct ip_vs_sync_mesg { 255 __u8 reserved; /* must be zero */ 256 __u8 syncid; 257 __be16 size; 258 __u8 nr_conns; 259 __s8 version; /* SYNC_PROTO_VER */ 260 __u16 spare; 261 /* ip_vs_sync_conn entries start here */ 262 }; 263 264 union ipvs_sockaddr { 265 struct sockaddr_in in; 266 struct sockaddr_in6 in6; 267 }; 268 269 struct ip_vs_sync_buff { 270 struct list_head list; 271 unsigned long firstuse; 272 273 /* pointers for the message data */ 274 struct ip_vs_sync_mesg *mesg; 275 unsigned char *head; 276 unsigned char *end; 277 }; 278 279 /* 280 * Copy of struct ip_vs_seq 281 * From unaligned network order to aligned host order 282 */ 283 static void ntoh_seq(struct 
ip_vs_seq *no, struct ip_vs_seq *ho) 284 { 285 memset(ho, 0, sizeof(*ho)); 286 ho->init_seq = get_unaligned_be32(&no->init_seq); 287 ho->delta = get_unaligned_be32(&no->delta); 288 ho->previous_delta = get_unaligned_be32(&no->previous_delta); 289 } 290 291 /* 292 * Copy of struct ip_vs_seq 293 * From Aligned host order to unaligned network order 294 */ 295 static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) 296 { 297 put_unaligned_be32(ho->init_seq, &no->init_seq); 298 put_unaligned_be32(ho->delta, &no->delta); 299 put_unaligned_be32(ho->previous_delta, &no->previous_delta); 300 } 301 302 static inline struct ip_vs_sync_buff * 303 sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 304 { 305 struct ip_vs_sync_buff *sb; 306 307 spin_lock_bh(&ipvs->sync_lock); 308 if (list_empty(&ms->sync_queue)) { 309 sb = NULL; 310 __set_current_state(TASK_INTERRUPTIBLE); 311 } else { 312 sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, 313 list); 314 list_del(&sb->list); 315 ms->sync_queue_len--; 316 if (!ms->sync_queue_len) 317 ms->sync_queue_delay = 0; 318 } 319 spin_unlock_bh(&ipvs->sync_lock); 320 321 return sb; 322 } 323 324 /* 325 * Create a new sync buffer for Version 1 proto. 326 */ 327 static inline struct ip_vs_sync_buff * 328 ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) 329 { 330 struct ip_vs_sync_buff *sb; 331 332 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 333 return NULL; 334 335 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), 336 ipvs->mcfg.sync_maxlen); 337 sb->mesg = kmalloc(len, GFP_ATOMIC); 338 if (!sb->mesg) { 339 kfree(sb); 340 return NULL; 341 } 342 sb->mesg->reserved = 0; /* old nr_conns i.e. 
must be zero now */ 343 sb->mesg->version = SYNC_PROTO_VER; 344 sb->mesg->syncid = ipvs->mcfg.syncid; 345 sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); 346 sb->mesg->nr_conns = 0; 347 sb->mesg->spare = 0; 348 sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); 349 sb->end = (unsigned char *)sb->mesg + len; 350 351 sb->firstuse = jiffies; 352 return sb; 353 } 354 355 static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) 356 { 357 kfree(sb->mesg); 358 kfree(sb); 359 } 360 361 static inline void sb_queue_tail(struct netns_ipvs *ipvs, 362 struct ipvs_master_sync_state *ms) 363 { 364 struct ip_vs_sync_buff *sb = ms->sync_buff; 365 366 spin_lock(&ipvs->sync_lock); 367 if (ipvs->sync_state & IP_VS_STATE_MASTER && 368 ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { 369 if (!ms->sync_queue_len) 370 schedule_delayed_work(&ms->master_wakeup_work, 371 max(IPVS_SYNC_SEND_DELAY, 1)); 372 ms->sync_queue_len++; 373 list_add_tail(&sb->list, &ms->sync_queue); 374 if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) { 375 int id = (int)(ms - ipvs->ms); 376 377 wake_up_process(ipvs->master_tinfo[id].task); 378 } 379 } else 380 ip_vs_sync_buff_release(sb); 381 spin_unlock(&ipvs->sync_lock); 382 } 383 384 /* 385 * Get the current sync buffer if it has been created for more 386 * than the specified time or the specified time is zero. 
387 */ 388 static inline struct ip_vs_sync_buff * 389 get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, 390 unsigned long time) 391 { 392 struct ip_vs_sync_buff *sb; 393 394 spin_lock_bh(&ipvs->sync_buff_lock); 395 sb = ms->sync_buff; 396 if (sb && time_after_eq(jiffies - sb->firstuse, time)) { 397 ms->sync_buff = NULL; 398 __set_current_state(TASK_RUNNING); 399 } else 400 sb = NULL; 401 spin_unlock_bh(&ipvs->sync_buff_lock); 402 return sb; 403 } 404 405 static inline int 406 select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) 407 { 408 return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; 409 } 410 411 /* 412 * Create a new sync buffer for Version 0 proto. 413 */ 414 static inline struct ip_vs_sync_buff * 415 ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) 416 { 417 struct ip_vs_sync_buff *sb; 418 struct ip_vs_sync_mesg_v0 *mesg; 419 420 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 421 return NULL; 422 423 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), 424 ipvs->mcfg.sync_maxlen); 425 sb->mesg = kmalloc(len, GFP_ATOMIC); 426 if (!sb->mesg) { 427 kfree(sb); 428 return NULL; 429 } 430 mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; 431 mesg->nr_conns = 0; 432 mesg->syncid = ipvs->mcfg.syncid; 433 mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); 434 sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); 435 sb->end = (unsigned char *)mesg + len; 436 sb->firstuse = jiffies; 437 return sb; 438 } 439 440 /* Check if connection is controlled by persistence */ 441 static inline bool in_persistence(struct ip_vs_conn *cp) 442 { 443 for (cp = cp->control; cp; cp = cp->control) { 444 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 445 return true; 446 } 447 return false; 448 } 449 450 /* Check if conn should be synced. 451 * pkts: conn packets, use sysctl_sync_threshold to avoid packet check 452 * - (1) sync_refresh_period: reduce sync rate. 
*	Additionally, retry
 *	sync_retries times with period of sync_refresh_period/8
 * - (2) if both sync_refresh_period and sync_period are 0 send sync only
 *	for state changes or only once when pkts matches sync_threshold
 * - (3) templates: rate can be reduced only with sync_refresh_period or
 *	with (2)
 *
 * cp->sync_endtime stores (jiffies + cp->timeout) of the last accepted
 * sync with its two low bits reused as a retry counter.  The new value is
 * installed with cmpxchg() against the value read on entry.
 *
 * Returns non-zero when a sync message should be sent for cp, i.e. when
 * this caller installed the new end time or the state change is forced;
 * returns 0 otherwise.  Also updates cp->old_state on the paths that
 * reach the "set" label.
 */
static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
				  struct ip_vs_conn *cp, int pkts)
{
	unsigned long orig = READ_ONCE(cp->sync_endtime);
	unsigned long now = jiffies;
	/* candidate end time; low 2 bits cleared to make room for retries */
	unsigned long n = (now + cp->timeout) & ~3UL;
	unsigned int sync_refresh_period;
	int sync_period;
	int force;

	/* Check if we sync in current state */
	if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
		force = 0;
	else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
		return 0;
	else if (likely(cp->protocol == IPPROTO_TCP)) {
		/* only the TCP states listed here are ever synced */
		if (!((1 << cp->state) &
		      ((1 << IP_VS_TCP_S_ESTABLISHED) |
		       (1 << IP_VS_TCP_S_FIN_WAIT) |
		       (1 << IP_VS_TCP_S_CLOSE) |
		       (1 << IP_VS_TCP_S_CLOSE_WAIT) |
		       (1 << IP_VS_TCP_S_TIME_WAIT))))
			return 0;
		force = cp->state != cp->old_state;
		/* a change to a non-established state skips all rate limits */
		if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
			goto set;
	} else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
		/* same scheme as TCP, with the SCTP state set */
		if (!((1 << cp->state) &
		      ((1 << IP_VS_SCTP_S_ESTABLISHED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
		       (1 << IP_VS_SCTP_S_CLOSED))))
			return 0;
		force = cp->state != cp->old_state;
		if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
			goto set;
	} else {
		/* UDP or another protocol with single state */
		force = 0;
	}

	sync_refresh_period = sysctl_sync_refresh_period(ipvs);
	if (sync_refresh_period > 0) {
		long diff = n - orig;
		long min_diff = max(cp->timeout >> 1, 10UL * HZ);

		/* Avoid sync if difference is below sync_refresh_period
		 * and below the half timeout.
		 */
		if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
			/* retry counter kept in the low bits of sync_endtime */
			int retries = orig & 3;

			if (retries >= sysctl_sync_retries(ipvs))
				return 0;
			/* retries are spaced sync_refresh_period/8 apart */
			if (time_before(now, orig - cp->timeout +
					(sync_refresh_period >> 3)))
				return 0;
			n |= retries + 1;
		}
	}
	sync_period = sysctl_sync_period(ipvs);
	if (sync_period > 0) {
		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
		    pkts % sync_period != sysctl_sync_threshold(ipvs))
			return 0;
	} else if (!sync_refresh_period &&
		   pkts != sysctl_sync_threshold(ipvs))
		return 0;

set:
	cp->old_state = cp->state;
	/* n now holds the previous value: equal to orig iff we won the race */
	n = cmpxchg(&cp->sync_endtime, orig, n);
	return n == orig || force;
}

/*
 *      Version 0 , could be switched in by sys_ctl.
 *      Add an ip_vs_conn information into the current sync_buff.
 *
 *      Only AF_INET connections without IP_VS_CONN_F_ONE_PACKET are
 *      handled, and only when ip_vs_sync_conn_needed() agrees.  The record
 *      is appended to the buffer of the master state selected for cp under
 *      ipvs->sync_buff_lock; a full buffer, or one that was started in
 *      Version 1 format, is queued for sending first.  Afterwards the
 *      controlling connection (cp->control), if any, is passed to
 *      ip_vs_sync_conn().
 */
static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
			       int pkts)
{
	struct ip_vs_sync_mesg_v0 *m;
	struct ip_vs_sync_conn_v0 *s;
	struct ip_vs_sync_buff *buff;
	struct ipvs_master_sync_state *ms;
	int id;
	unsigned int len;

	if (unlikely(cp->af != AF_INET))
		return;
	/* Do not sync ONE PACKET */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return;

	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
		return;

	spin_lock_bh(&ipvs->sync_buff_lock);
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		spin_unlock_bh(&ipvs->sync_buff_lock);
		return;
	}

	id = select_master_thread_id(ipvs, cp);
	ms = &ipvs->ms[id];
	buff = ms->sync_buff;
	/* record size depends on whether seq options are appended */
	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
		SIMPLE_CONN_SIZE;
	if (buff) {
		m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
		/* Send buffer if it is for v1 */
		/* (a v1 header has a zero first byte, which reads as nr_conns) */
		if (buff->head + len > buff->end || !m->nr_conns) {
			sb_queue_tail(ipvs, ms);
			ms->sync_buff = NULL;
			buff = NULL;
		}
	}
	if (!buff) {
		buff = ip_vs_sync_buff_create_v0(ipvs, len);
		if (!buff) {
			spin_unlock_bh(&ipvs->sync_buff_lock);
			pr_err("ip_vs_sync_buff_create failed.\n");
			return;
		}
		ms->sync_buff = buff;
	}

	m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
	s = (struct ip_vs_sync_conn_v0 *) buff->head;

	/* copy members */
	s->reserved = 0;
	s->protocol = cp->protocol;
	s->cport = cp->cport;
	s->vport = cp->vport;
	s->dport = cp->dport;
	s->caddr = cp->caddr.ip;
	s->vaddr = cp->vaddr.ip;
	s->daddr = cp->daddr.ip;
	/* IP_VS_CONN_F_HASHED is never put on the wire */
	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
	s->state = htons(cp->state);
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
		/* options are copied as stored, without byte order conversion */
		struct ip_vs_sync_conn_options *opt =
			(struct ip_vs_sync_conn_options *)&s[1];
		memcpy(opt, &cp->sync_conn_opt, sizeof(*opt));
	}

	m->nr_conns++;
	m->size = htons(ntohs(m->size) + len);
	buff->head += len;
	spin_unlock_bh(&ipvs->sync_buff_lock);

	/* synchronize its controller if it has */
	cp = cp->control;
	if (cp) {
		if (cp->flags & IP_VS_CONN_F_TEMPLATE)
			pkts = atomic_inc_return(&cp->in_pkts);
		else
			pkts = sysctl_sync_threshold(ipvs);
		ip_vs_sync_conn(ipvs, cp, pkts);
	}
}

/*
 *      Add an ip_vs_conn information into the current sync_buff.
 *      Called by ip_vs_in.
627 * Sending Version 1 messages 628 */ 629 void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) 630 { 631 struct ip_vs_sync_mesg *m; 632 union ip_vs_sync_conn *s; 633 struct ip_vs_sync_buff *buff; 634 struct ipvs_master_sync_state *ms; 635 int id; 636 __u8 *p; 637 unsigned int len, pe_name_len, pad; 638 639 /* Handle old version of the protocol */ 640 if (sysctl_sync_ver(ipvs) == 0) { 641 ip_vs_sync_conn_v0(ipvs, cp, pkts); 642 return; 643 } 644 /* Do not sync ONE PACKET */ 645 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 646 goto control; 647 sloop: 648 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 649 goto control; 650 651 /* Sanity checks */ 652 pe_name_len = 0; 653 if (cp->pe_data_len) { 654 if (!cp->pe_data || !cp->dest) { 655 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); 656 return; 657 } 658 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); 659 } 660 661 spin_lock_bh(&ipvs->sync_buff_lock); 662 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 663 spin_unlock_bh(&ipvs->sync_buff_lock); 664 return; 665 } 666 667 id = select_master_thread_id(ipvs, cp); 668 ms = &ipvs->ms[id]; 669 670 #ifdef CONFIG_IP_VS_IPV6 671 if (cp->af == AF_INET6) 672 len = sizeof(struct ip_vs_sync_v6); 673 else 674 #endif 675 len = sizeof(struct ip_vs_sync_v4); 676 677 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) 678 len += sizeof(struct ip_vs_sync_conn_options) + 2; 679 680 if (cp->pe_data_len) 681 len += cp->pe_data_len + 2; /* + Param hdr field */ 682 if (pe_name_len) 683 len += pe_name_len + 2; 684 685 /* check if there is a space for this one */ 686 pad = 0; 687 buff = ms->sync_buff; 688 if (buff) { 689 m = buff->mesg; 690 pad = (4 - (size_t) buff->head) & 3; 691 /* Send buffer if it is for v0 */ 692 if (buff->head + len + pad > buff->end || m->reserved) { 693 sb_queue_tail(ipvs, ms); 694 ms->sync_buff = NULL; 695 buff = NULL; 696 pad = 0; 697 } 698 } 699 700 if (!buff) { 701 buff = ip_vs_sync_buff_create(ipvs, len); 702 if (!buff) { 703 
spin_unlock_bh(&ipvs->sync_buff_lock); 704 pr_err("ip_vs_sync_buff_create failed.\n"); 705 return; 706 } 707 ms->sync_buff = buff; 708 m = buff->mesg; 709 } 710 711 p = buff->head; 712 buff->head += pad + len; 713 m->size = htons(ntohs(m->size) + pad + len); 714 /* Add ev. padding from prev. sync_conn */ 715 while (pad--) 716 *(p++) = 0; 717 718 s = (union ip_vs_sync_conn *)p; 719 720 /* Set message type & copy members */ 721 s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0); 722 s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ 723 s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); 724 s->v4.state = htons(cp->state); 725 s->v4.protocol = cp->protocol; 726 s->v4.cport = cp->cport; 727 s->v4.vport = cp->vport; 728 s->v4.dport = cp->dport; 729 s->v4.fwmark = htonl(cp->fwmark); 730 s->v4.timeout = htonl(cp->timeout / HZ); 731 m->nr_conns++; 732 733 #ifdef CONFIG_IP_VS_IPV6 734 if (cp->af == AF_INET6) { 735 p += sizeof(struct ip_vs_sync_v6); 736 s->v6.caddr = cp->caddr.in6; 737 s->v6.vaddr = cp->vaddr.in6; 738 s->v6.daddr = cp->daddr.in6; 739 } else 740 #endif 741 { 742 p += sizeof(struct ip_vs_sync_v4); /* options ptr */ 743 s->v4.caddr = cp->caddr.ip; 744 s->v4.vaddr = cp->vaddr.ip; 745 s->v4.daddr = cp->daddr.ip; 746 } 747 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 748 *(p++) = IPVS_OPT_SEQ_DATA; 749 *(p++) = sizeof(struct ip_vs_sync_conn_options); 750 hton_seq((struct ip_vs_seq *)p, &cp->in_seq); 751 p += sizeof(struct ip_vs_seq); 752 hton_seq((struct ip_vs_seq *)p, &cp->out_seq); 753 p += sizeof(struct ip_vs_seq); 754 } 755 /* Handle pe data */ 756 if (cp->pe_data_len && cp->pe_data) { 757 *(p++) = IPVS_OPT_PE_DATA; 758 *(p++) = cp->pe_data_len; 759 memcpy(p, cp->pe_data, cp->pe_data_len); 760 p += cp->pe_data_len; 761 if (pe_name_len) { 762 /* Add PE_NAME */ 763 *(p++) = IPVS_OPT_PE_NAME; 764 *(p++) = pe_name_len; 765 memcpy(p, cp->pe->name, pe_name_len); 766 p += pe_name_len; 767 } 768 } 769 770 spin_unlock_bh(&ipvs->sync_buff_lock); 771 772 
control: 773 /* synchronize its controller if it has */ 774 cp = cp->control; 775 if (!cp) 776 return; 777 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 778 pkts = atomic_inc_return(&cp->in_pkts); 779 else 780 pkts = sysctl_sync_threshold(ipvs); 781 goto sloop; 782 } 783 784 /* 785 * fill_param used by version 1 786 */ 787 static inline int 788 ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc, 789 struct ip_vs_conn_param *p, 790 __u8 *pe_data, unsigned int pe_data_len, 791 __u8 *pe_name, unsigned int pe_name_len) 792 { 793 #ifdef CONFIG_IP_VS_IPV6 794 if (af == AF_INET6) 795 ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol, 796 (const union nf_inet_addr *)&sc->v6.caddr, 797 sc->v6.cport, 798 (const union nf_inet_addr *)&sc->v6.vaddr, 799 sc->v6.vport, p); 800 else 801 #endif 802 ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol, 803 (const union nf_inet_addr *)&sc->v4.caddr, 804 sc->v4.cport, 805 (const union nf_inet_addr *)&sc->v4.vaddr, 806 sc->v4.vport, p); 807 /* Handle pe data */ 808 if (pe_data_len) { 809 if (pe_name_len) { 810 char buff[IP_VS_PENAME_MAXLEN+1]; 811 812 memcpy(buff, pe_name, pe_name_len); 813 buff[pe_name_len]=0; 814 p->pe = __ip_vs_pe_getbyname(buff); 815 if (!p->pe) { 816 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", 817 buff); 818 return 1; 819 } 820 } else { 821 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); 822 return 1; 823 } 824 825 p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); 826 if (!p->pe_data) { 827 module_put(p->pe->module); 828 return -ENOMEM; 829 } 830 p->pe_data_len = pe_data_len; 831 } 832 return 0; 833 } 834 835 /* 836 * Connection Add / Update. 837 * Common for version 0 and 1 reception of backup sync_conns. 838 * Param: ... 839 * timeout is in sec. 
840 */ 841 static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param, 842 unsigned int flags, unsigned int state, 843 unsigned int protocol, unsigned int type, 844 const union nf_inet_addr *daddr, __be16 dport, 845 unsigned long timeout, __u32 fwmark, 846 struct ip_vs_sync_conn_options *opt) 847 { 848 struct ip_vs_dest *dest; 849 struct ip_vs_conn *cp; 850 851 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 852 cp = ip_vs_conn_in_get(param); 853 if (cp && ((cp->dport != dport) || 854 !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) { 855 if (!(flags & IP_VS_CONN_F_INACTIVE)) { 856 ip_vs_conn_expire_now(cp); 857 __ip_vs_conn_put(cp); 858 cp = NULL; 859 } else { 860 /* This is the expiration message for the 861 * connection that was already replaced, so we 862 * just ignore it. 863 */ 864 __ip_vs_conn_put(cp); 865 kfree(param->pe_data); 866 return; 867 } 868 } 869 } else { 870 cp = ip_vs_ct_in_get(param); 871 } 872 873 if (cp) { 874 /* Free pe_data */ 875 kfree(param->pe_data); 876 877 dest = cp->dest; 878 spin_lock_bh(&cp->lock); 879 if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && 880 !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { 881 if (flags & IP_VS_CONN_F_INACTIVE) { 882 atomic_dec(&dest->activeconns); 883 atomic_inc(&dest->inactconns); 884 } else { 885 atomic_inc(&dest->activeconns); 886 atomic_dec(&dest->inactconns); 887 } 888 } 889 flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; 890 flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; 891 cp->flags = flags; 892 spin_unlock_bh(&cp->lock); 893 if (!dest) 894 ip_vs_try_bind_dest(cp); 895 } else { 896 /* 897 * Find the appropriate destination for the connection. 898 * If it is not found the connection will remain unbound 899 * but still handled. 900 */ 901 rcu_read_lock(); 902 /* This function is only invoked by the synchronization 903 * code. 
We do not currently support heterogeneous pools 904 * with synchronization, so we can make the assumption that 905 * the svc_af is the same as the dest_af 906 */ 907 dest = ip_vs_find_dest(ipvs, type, type, daddr, dport, 908 param->vaddr, param->vport, protocol, 909 fwmark, flags); 910 911 cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest, 912 fwmark); 913 rcu_read_unlock(); 914 if (!cp) { 915 kfree(param->pe_data); 916 IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); 917 return; 918 } 919 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 920 kfree(param->pe_data); 921 } 922 923 if (opt) { 924 cp->in_seq = opt->in_seq; 925 cp->out_seq = opt->out_seq; 926 } 927 atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); 928 cp->state = state; 929 cp->old_state = cp->state; 930 /* 931 * For Ver 0 messages style 932 * - Not possible to recover the right timeout for templates 933 * - can not find the right fwmark 934 * virtual service. If needed, we can do it for 935 * non-fwmark persistent services. 936 * Ver 1 messages style. 937 * - No problem. 
938 */ 939 if (timeout) { 940 if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) 941 timeout = MAX_SCHEDULE_TIMEOUT / HZ; 942 cp->timeout = timeout*HZ; 943 } else { 944 struct ip_vs_proto_data *pd; 945 946 pd = ip_vs_proto_data_get(ipvs, protocol); 947 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) 948 cp->timeout = pd->timeout_table[state]; 949 else 950 cp->timeout = (3*60*HZ); 951 } 952 ip_vs_conn_put(cp); 953 } 954 955 /* 956 * Process received multicast message for Version 0 957 */ 958 static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer, 959 const size_t buflen) 960 { 961 struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; 962 struct ip_vs_sync_conn_v0 *s; 963 struct ip_vs_sync_conn_options *opt; 964 struct ip_vs_protocol *pp; 965 struct ip_vs_conn_param param; 966 char *p; 967 int i; 968 969 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); 970 for (i=0; i<m->nr_conns; i++) { 971 unsigned int flags, state; 972 973 if (p + SIMPLE_CONN_SIZE > buffer+buflen) { 974 IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); 975 return; 976 } 977 s = (struct ip_vs_sync_conn_v0 *) p; 978 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; 979 flags &= ~IP_VS_CONN_F_HASHED; 980 if (flags & IP_VS_CONN_F_SEQ_MASK) { 981 opt = (struct ip_vs_sync_conn_options *)&s[1]; 982 p += FULL_CONN_SIZE; 983 if (p > buffer+buflen) { 984 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); 985 return; 986 } 987 } else { 988 opt = NULL; 989 p += SIMPLE_CONN_SIZE; 990 } 991 992 state = ntohs(s->state); 993 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 994 pp = ip_vs_proto_get(s->protocol); 995 if (!pp) { 996 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", 997 s->protocol); 998 continue; 999 } 1000 if (state >= pp->num_states) { 1001 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", 1002 pp->name, state); 1003 continue; 1004 } 1005 } else { 1006 if (state >= IP_VS_CTPL_S_LAST) 1007 IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n", 1008 state); 
1009 } 1010 1011 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, 1012 (const union nf_inet_addr *)&s->caddr, 1013 s->cport, 1014 (const union nf_inet_addr *)&s->vaddr, 1015 s->vport, ¶m); 1016 1017 /* Send timeout as Zero */ 1018 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET, 1019 (union nf_inet_addr *)&s->daddr, s->dport, 1020 0, 0, opt); 1021 } 1022 } 1023 1024 /* 1025 * Handle options 1026 */ 1027 static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, 1028 __u32 *opt_flags, 1029 struct ip_vs_sync_conn_options *opt) 1030 { 1031 struct ip_vs_sync_conn_options *topt; 1032 1033 topt = (struct ip_vs_sync_conn_options *)p; 1034 1035 if (plen != sizeof(struct ip_vs_sync_conn_options)) { 1036 IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); 1037 return -EINVAL; 1038 } 1039 if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { 1040 IP_VS_DBG(2, "BACKUP, conn options found twice\n"); 1041 return -EINVAL; 1042 } 1043 ntoh_seq(&topt->in_seq, &opt->in_seq); 1044 ntoh_seq(&topt->out_seq, &opt->out_seq); 1045 *opt_flags |= IPVS_OPT_F_SEQ_DATA; 1046 return 0; 1047 } 1048 1049 static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, 1050 __u8 **data, unsigned int maxlen, 1051 __u32 *opt_flags, __u32 flag) 1052 { 1053 if (plen > maxlen) { 1054 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); 1055 return -EINVAL; 1056 } 1057 if (*opt_flags & flag) { 1058 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); 1059 return -EINVAL; 1060 } 1061 *data_len = plen; 1062 *data = p; 1063 *opt_flags |= flag; 1064 return 0; 1065 } 1066 /* 1067 * Process a Version 1 sync. 
connection 1068 */ 1069 static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end) 1070 { 1071 struct ip_vs_sync_conn_options opt; 1072 union ip_vs_sync_conn *s; 1073 struct ip_vs_protocol *pp; 1074 struct ip_vs_conn_param param; 1075 __u32 flags; 1076 unsigned int af, state, pe_data_len=0, pe_name_len=0; 1077 __u8 *pe_data=NULL, *pe_name=NULL; 1078 __u32 opt_flags=0; 1079 int retc=0; 1080 1081 s = (union ip_vs_sync_conn *) p; 1082 1083 if (s->v6.type & STYPE_F_INET6) { 1084 #ifdef CONFIG_IP_VS_IPV6 1085 af = AF_INET6; 1086 p += sizeof(struct ip_vs_sync_v6); 1087 #else 1088 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); 1089 retc = 10; 1090 goto out; 1091 #endif 1092 } else if (!s->v4.type) { 1093 af = AF_INET; 1094 p += sizeof(struct ip_vs_sync_v4); 1095 } else { 1096 return -10; 1097 } 1098 if (p > msg_end) 1099 return -20; 1100 1101 /* Process optional params check Type & Len. */ 1102 while (p < msg_end) { 1103 int ptype; 1104 int plen; 1105 1106 if (p+2 > msg_end) 1107 return -30; 1108 ptype = *(p++); 1109 plen = *(p++); 1110 1111 if (!plen || ((p + plen) > msg_end)) 1112 return -40; 1113 /* Handle seq option p = param data */ 1114 switch (ptype & ~IPVS_OPT_F_PARAM) { 1115 case IPVS_OPT_SEQ_DATA: 1116 if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) 1117 return -50; 1118 break; 1119 1120 case IPVS_OPT_PE_DATA: 1121 if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, 1122 IP_VS_PEDATA_MAXLEN, &opt_flags, 1123 IPVS_OPT_F_PE_DATA)) 1124 return -60; 1125 break; 1126 1127 case IPVS_OPT_PE_NAME: 1128 if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, 1129 IP_VS_PENAME_MAXLEN, &opt_flags, 1130 IPVS_OPT_F_PE_NAME)) 1131 return -70; 1132 break; 1133 1134 default: 1135 /* Param data mandatory ? 
*/ 1136 if (!(ptype & IPVS_OPT_F_PARAM)) { 1137 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", 1138 ptype & ~IPVS_OPT_F_PARAM); 1139 retc = 20; 1140 goto out; 1141 } 1142 } 1143 p += plen; /* Next option */ 1144 } 1145 1146 /* Get flags and Mask off unsupported */ 1147 flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; 1148 flags |= IP_VS_CONN_F_SYNC; 1149 state = ntohs(s->v4.state); 1150 1151 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 1152 pp = ip_vs_proto_get(s->v4.protocol); 1153 if (!pp) { 1154 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", 1155 s->v4.protocol); 1156 retc = 30; 1157 goto out; 1158 } 1159 if (state >= pp->num_states) { 1160 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", 1161 pp->name, state); 1162 retc = 40; 1163 goto out; 1164 } 1165 } else { 1166 if (state >= IP_VS_CTPL_S_LAST) 1167 IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n", 1168 state); 1169 } 1170 if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data, 1171 pe_data_len, pe_name, pe_name_len)) { 1172 retc = 50; 1173 goto out; 1174 } 1175 /* If only IPv4, just silent skip IPv6 */ 1176 if (af == AF_INET) 1177 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af, 1178 (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, 1179 ntohl(s->v4.timeout), ntohl(s->v4.fwmark), 1180 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1181 ); 1182 #ifdef CONFIG_IP_VS_IPV6 1183 else 1184 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af, 1185 (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, 1186 ntohl(s->v6.timeout), ntohl(s->v6.fwmark), 1187 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1188 ); 1189 #endif 1190 ip_vs_pe_put(param.pe); 1191 return 0; 1192 /* Error exit */ 1193 out: 1194 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); 1195 return retc; 1196 1197 } 1198 /* 1199 * Process received multicast message and create the corresponding 1200 * ip_vs_conn entries. 
1201 * Handles Version 0 & 1 1202 */ 1203 static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, 1204 const size_t buflen) 1205 { 1206 struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; 1207 __u8 *p, *msg_end; 1208 int i, nr_conns; 1209 1210 if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { 1211 IP_VS_DBG(2, "BACKUP, message header too short\n"); 1212 return; 1213 } 1214 1215 if (buflen != ntohs(m2->size)) { 1216 IP_VS_DBG(2, "BACKUP, bogus message size\n"); 1217 return; 1218 } 1219 /* SyncID sanity check */ 1220 if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { 1221 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); 1222 return; 1223 } 1224 /* Handle version 1 message */ 1225 if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) 1226 && (m2->spare == 0)) { 1227 1228 msg_end = buffer + sizeof(struct ip_vs_sync_mesg); 1229 nr_conns = m2->nr_conns; 1230 1231 for (i=0; i<nr_conns; i++) { 1232 union ip_vs_sync_conn *s; 1233 unsigned int size; 1234 int retc; 1235 1236 p = msg_end; 1237 if (p + sizeof(s->v4) > buffer+buflen) { 1238 IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n"); 1239 return; 1240 } 1241 s = (union ip_vs_sync_conn *)p; 1242 size = ntohs(s->v4.ver_size) & SVER_MASK; 1243 msg_end = p + size; 1244 /* Basic sanity checks */ 1245 if (msg_end > buffer+buflen) { 1246 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n"); 1247 return; 1248 } 1249 if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { 1250 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", 1251 ntohs(s->v4.ver_size) >> SVER_SHIFT); 1252 return; 1253 } 1254 /* Process a single sync_conn */ 1255 retc = ip_vs_proc_sync_conn(ipvs, p, msg_end); 1256 if (retc < 0) { 1257 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", 1258 retc); 1259 return; 1260 } 1261 /* Make sure we have 32 bit alignment */ 1262 msg_end = p + ((size + 3) & ~3); 1263 } 1264 } else { 1265 /* Old type of message */ 1266 ip_vs_process_message_v0(ipvs, 
buffer, buflen); 1267 return; 1268 } 1269 } 1270 1271 1272 /* 1273 * Setup sndbuf (mode=1) or rcvbuf (mode=0) 1274 */ 1275 static void set_sock_size(struct sock *sk, int mode, int val) 1276 { 1277 /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ 1278 /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ 1279 lock_sock(sk); 1280 if (mode) { 1281 val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, 1282 READ_ONCE(sysctl_wmem_max)); 1283 sk->sk_sndbuf = val * 2; 1284 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1285 } else { 1286 val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, 1287 READ_ONCE(sysctl_rmem_max)); 1288 sk->sk_rcvbuf = val * 2; 1289 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 1290 } 1291 release_sock(sk); 1292 } 1293 1294 /* 1295 * Setup loopback of outgoing multicasts on a sending socket 1296 */ 1297 static void set_mcast_loop(struct sock *sk, u_char loop) 1298 { 1299 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ 1300 inet_assign_bit(MC_LOOP, sk, loop); 1301 #ifdef CONFIG_IP_VS_IPV6 1302 if (READ_ONCE(sk->sk_family) == AF_INET6) { 1303 /* IPV6_MULTICAST_LOOP */ 1304 inet6_assign_bit(MC6_LOOP, sk, loop); 1305 } 1306 #endif 1307 } 1308 1309 /* 1310 * Specify TTL for outgoing multicasts on a sending socket 1311 */ 1312 static void set_mcast_ttl(struct sock *sk, u_char ttl) 1313 { 1314 struct inet_sock *inet = inet_sk(sk); 1315 1316 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ 1317 lock_sock(sk); 1318 WRITE_ONCE(inet->mc_ttl, ttl); 1319 #ifdef CONFIG_IP_VS_IPV6 1320 if (sk->sk_family == AF_INET6) { 1321 struct ipv6_pinfo *np = inet6_sk(sk); 1322 1323 /* IPV6_MULTICAST_HOPS */ 1324 WRITE_ONCE(np->mcast_hops, ttl); 1325 } 1326 #endif 1327 release_sock(sk); 1328 } 1329 1330 /* Control fragmentation of messages */ 1331 static void set_mcast_pmtudisc(struct sock *sk, int val) 1332 { 1333 struct inet_sock *inet = inet_sk(sk); 1334 1335 /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); 
*/ 1336 lock_sock(sk); 1337 WRITE_ONCE(inet->pmtudisc, val); 1338 #ifdef CONFIG_IP_VS_IPV6 1339 if (sk->sk_family == AF_INET6) { 1340 struct ipv6_pinfo *np = inet6_sk(sk); 1341 1342 /* IPV6_MTU_DISCOVER */ 1343 WRITE_ONCE(np->pmtudisc, val); 1344 } 1345 #endif 1346 release_sock(sk); 1347 } 1348 1349 /* 1350 * Specifiy default interface for outgoing multicasts 1351 */ 1352 static int set_mcast_if(struct sock *sk, struct net_device *dev) 1353 { 1354 struct inet_sock *inet = inet_sk(sk); 1355 1356 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1357 return -EINVAL; 1358 1359 lock_sock(sk); 1360 inet->mc_index = dev->ifindex; 1361 /* inet->mc_addr = 0; */ 1362 #ifdef CONFIG_IP_VS_IPV6 1363 if (sk->sk_family == AF_INET6) { 1364 struct ipv6_pinfo *np = inet6_sk(sk); 1365 1366 /* IPV6_MULTICAST_IF */ 1367 WRITE_ONCE(np->mcast_oif, dev->ifindex); 1368 } 1369 #endif 1370 release_sock(sk); 1371 1372 return 0; 1373 } 1374 1375 1376 /* 1377 * Join a multicast group. 1378 * the group is specified by a class D multicast address 224.0.0.0/8 1379 * in the in_addr structure passed in as a parameter. 
 *
 *      Returns -EINVAL when the socket is already bound to a different
 *      device, otherwise the result of ip_mc_join_group().
 */
static int
join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev)
{
	struct ip_mreqn mreq;
	int ret;

	memset(&mreq, 0, sizeof(mreq));
	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));

	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
		return -EINVAL;

	mreq.imr_ifindex = dev->ifindex;

	lock_sock(sk);
	ret = ip_mc_join_group(sk, &mreq);
	release_sock(sk);

	return ret;
}

#ifdef CONFIG_IP_VS_IPV6
/*
 *      IPv6 variant of join_mcast_group(): join group @addr on @dev.
 *      Returns -EINVAL when the socket is already bound to a different
 *      device, otherwise the result of ipv6_sock_mc_join().
 */
static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
			     struct net_device *dev)
{
	int ret;

	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
		return -EINVAL;

	lock_sock(sk);
	ret = ipv6_sock_mc_join(sk, dev->ifindex, addr);
	release_sock(sk);

	return ret;
}
#endif

/*
 *      Bind @sock to the IPv4 address selected for @dev, port 0.
 *      A missing address is only reported; the bind is still attempted
 *      with address 0. Returns the result of kernel_bind().
 */
static int bind_mcastif_addr(struct socket *sock, struct net_device *dev)
{
	__be32 addr;
	struct sockaddr_in sin;

	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	if (!addr)
		pr_err("You probably need to specify IP address on "
		       "multicast interface.\n");

	IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
		  dev->name, &addr);

	/* Now bind the socket with the address of multicast interface */
	sin.sin_family	     = AF_INET;
	sin.sin_addr.s_addr  = addr;
	sin.sin_port         = 0;

	return kernel_bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin));
}

/*
 *      Build the multicast socket address for sync thread @id from the
 *      daemon configuration @c. Each thread uses port mcast_port + id.
 *      *salen receives the size of the address that was filled in.
 */
static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
			       struct ipvs_sync_daemon_cfg *c, int id)
{
	if (AF_INET6 == c->mcast_af) {
		sa->in6 = (struct sockaddr_in6) {
			.sin6_family = AF_INET6,
			.sin6_port = htons(c->mcast_port + id),
		};
		sa->in6.sin6_addr = c->mcast_group.in6;
		*salen = sizeof(sa->in6);
	} else {
		sa->in = (struct sockaddr_in) {
			.sin_family = AF_INET,
			.sin_port = htons(c->mcast_port + id),
		};
		sa->in.sin_addr = c->mcast_group.in;
		*salen = sizeof(sa->in);
	}
}

/*
 *      Set up sending multicast socket over UDP
 *
 *      The new socket is stored in *sock_ret as soon as it exists, so on
 *      a later failure it is NOT released here; start_sync_thread()
 *      releases it in its error path.
 *      Returns 0 on success or a negative error code.
 */
static int make_send_sock(struct netns_ipvs *ipvs, int id,
			  struct net_device *dev, struct socket **sock_ret)
{
	/* multicast addr */
	union ipvs_sockaddr mcast_addr;
	struct socket *sock;
	int result, salen;

	/* First create a socket */
	result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM,
				  IPPROTO_UDP, &sock);
	if (result < 0) {
		pr_err("Error during creation of socket; terminating\n");
		goto error;
	}
	*sock_ret = sock;
	result = set_mcast_if(sock->sk, dev);
	if (result < 0) {
		pr_err("Error setting outbound mcast interface\n");
		goto error;
	}

	set_mcast_loop(sock->sk, 0);
	set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl);
	/* Allow fragmentation if MTU changes */
	set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT);
	/* A sysctl value <= 0 leaves the send buffer size untouched */
	result = sysctl_sync_sock_size(ipvs);
	if (result > 0)
		set_sock_size(sock->sk, 1, result);

	/* Only an IPv4 socket is bound to the interface address */
	if (AF_INET == ipvs->mcfg.mcast_af)
		result = bind_mcastif_addr(sock, dev);
	else
		result = 0;
	if (result < 0) {
		pr_err("Error binding address of the mcast interface\n");
		goto error;
	}

	get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
	result = kernel_connect(sock, (struct sockaddr_unsized *)&mcast_addr,
				salen, 0);
	if (result < 0) {
		pr_err("Error connecting to the multicast addr\n");
		goto error;
	}

	return 0;

error:
	return result;
}


/*
 *      Set up receiving multicast socket over UDP
 *
 *      As in make_send_sock(), the socket is stored in *sock_ret right
 *      after creation and is left for the caller to release on failure.
 *      Returns 0 on success or a negative error code.
 */
static int make_receive_sock(struct netns_ipvs *ipvs, int id,
			     struct net_device *dev, struct socket **sock_ret)
{
	/* multicast addr */
	union ipvs_sockaddr mcast_addr;
	struct socket *sock;
	int result, salen;

	/* First create a socket */
	result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM,
				  IPPROTO_UDP, &sock);
	if (result < 0) {
		pr_err("Error during creation of socket; terminating\n");
		goto error;
	}
	*sock_ret = sock;
	/* it is equivalent to the REUSEADDR option in user-space */
	sock->sk->sk_reuse = SK_CAN_REUSE;
	/* A sysctl value <= 0 leaves the receive buffer size untouched */
	result = sysctl_sync_sock_size(ipvs);
	if (result > 0)
		set_sock_size(sock->sk, 0, result);

	get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
	/* Restrict the socket to the sync interface before binding */
	sock->sk->sk_bound_dev_if = dev->ifindex;
	result = kernel_bind(sock, (struct sockaddr_unsized *)&mcast_addr, salen);
	if (result < 0) {
		pr_err("Error binding to the multicast addr\n");
		goto error;
	}

	/* join the multicast group */
#ifdef CONFIG_IP_VS_IPV6
	if (ipvs->bcfg.mcast_af == AF_INET6)
		result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr,
					   dev);
	else
#endif
		result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr,
					  dev);
	if (result < 0) {
		pr_err("Error joining to the multicast group\n");
		goto error;
	}

	return 0;

error:
	return result;
}


/*
 *      Send @length bytes of @buffer on @sock without blocking
 *      (MSG_DONTWAIT | MSG_NOSIGNAL). Returns what kernel_sendmsg()
 *      returns.
 */
static int
ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
{
	struct msghdr	msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
	struct kvec	iov;
	int		len;

	iov.iov_base     = (void *)buffer;
	iov.iov_len      = length;

	len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));

	return len;
}

/*
 *      Send one sync message; its length is taken from the size field.
 *
 *      Returns the send result when it is >= 0 or -EAGAIN. Any other
 *      error is logged and reported as 0, so the caller's "< 0" retry
 *      loop only retries on -EAGAIN.
 */
static int
ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
{
	int msize;
	int ret;

	msize = ntohs(msg->size);

	ret = ip_vs_send_async(sock, (char *)msg, msize);
	if (ret >= 0 || ret == -EAGAIN)
		return ret;
	pr_err("ip_vs_send_async error %d\n", ret);
	return 0;
}

/*
 *      Receive one datagram into @buffer (at most @buflen bytes) without
 *      blocking. Returns the result of sock_recvmsg(): the number of
 *      bytes received or a negative error code.
 */
static int
ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
{
	struct msghdr		msg = {NULL,};
	struct kvec		iov = {buffer, buflen};
	int			len;

	/* Receive a packet */
	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, buflen);
	len = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
	/* Both paths return len; the error branch is kept for readability */
	if (len < 0)
		return len;

	return len;
}

/* Wakeup the master thread for sending */
static void master_wakeup_work_handler(struct work_struct *work)
{
	struct ipvs_master_sync_state *ms =
		container_of(work, struct ipvs_master_sync_state,
			     master_wakeup_work.work);
	struct netns_ipvs *ipvs = ms->ipvs;

	spin_lock_bh(&ipvs->sync_lock);
	/* Only wake the thread when buffers are queued and the delay
	 * counter has not yet reached IPVS_SYNC_WAKEUP_RATE.
	 */
	if (ms->sync_queue_len &&
	    ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
		/* Index of this state in ipvs->ms[] == index of its thread */
		int id = (int)(ms - ipvs->ms);

		ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
		wake_up_process(ipvs->master_tinfo[id].task);
	}
	spin_unlock_bh(&ipvs->sync_lock);
}

/* Get next buffer to send */
/* Queued buffers are preferred; otherwise the current buffer is requested
 * from get_curr_sync_buff() with IPVS_SYNC_FLUSH_TIME as its time argument.
 */
static inline struct ip_vs_sync_buff *
next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
{
	struct ip_vs_sync_buff *sb;

	sb = sb_dequeue(ipvs, ms);
	if (sb)
		return sb;
	/* Do not delay entries in buffer for more than 2 seconds */
	return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
}

/*
 *      Master sync kthread: takes sync buffers for its queue and sends
 *      them on its socket until kthread_stop() is called, then releases
 *      every buffer that is still pending. @data is the thread's
 *      struct ip_vs_sync_thread_data. Always returns 0.
 */
static int sync_thread_master(void *data)
{
	struct ip_vs_sync_thread_data *tinfo = data;
	struct netns_ipvs *ipvs = tinfo->ipvs;
	struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
	struct sock *sk = tinfo->sock->sk;
	struct ip_vs_sync_buff *sb;

	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
		"syncid = %d, id = %d\n",
		ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id);

	for (;;) {
		sb = next_sync_buff(ipvs, ms);
		/* sb may be non-NULL here; it is released after "done" */
		if (unlikely(kthread_should_stop()))
			break;
		if (!sb) {
			/* NOTE(review): the sleeping task state is presumably
			 * set inside sb_dequeue() when the queue is empty -
			 * confirm; it is not set in this function.
			 */
			schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
			continue;
		}
		/* Only -EAGAIN makes ip_vs_send_sync_msg() return < 0 */
		while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
			/* (Ab)use interruptible sleep to avoid increasing
			 * the load avg.
			 */
			__wait_event_interruptible(*sk_sleep(sk),
						   sock_writeable(sk) ||
						   kthread_should_stop());
			if (unlikely(kthread_should_stop()))
				goto done;
		}
		ip_vs_sync_buff_release(sb);
	}

done:
	__set_current_state(TASK_RUNNING);
	/* Buffer that was fetched but not (completely) sent */
	if (sb)
		ip_vs_sync_buff_release(sb);

	/* clean up the sync_buff queue */
	while ((sb = sb_dequeue(ipvs, ms)))
		ip_vs_sync_buff_release(sb);
	__set_current_state(TASK_RUNNING);

	/* clean up the current sync_buff */
	sb = get_curr_sync_buff(ipvs, ms, 0);
	if (sb)
		ip_vs_sync_buff_release(sb);

	return 0;
}


/*
 *      Backup sync kthread: sleeps until its socket has queued data,
 *      then receives datagrams into tinfo->buf and feeds them to
 *      ip_vs_process_message() until kthread_stop() is called.
 *      @data is the thread's struct ip_vs_sync_thread_data.
 *      Always returns 0.
 */
static int sync_thread_backup(void *data)
{
	struct ip_vs_sync_thread_data *tinfo = data;
	struct netns_ipvs *ipvs = tinfo->ipvs;
	struct sock *sk = tinfo->sock->sk;
	struct udp_sock *up = udp_sk(sk);
	int len;

	pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
		"syncid = %d, id = %d\n",
		ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);

	while (!kthread_should_stop()) {
		/* Data may sit in either the receive or the reader queue */
		wait_event_interruptible(*sk_sleep(sk),
			 !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
			 !skb_queue_empty_lockless(&up->reader_queue) ||
			 kthread_should_stop());

		/* do we have data now? */
		while (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
		       !skb_queue_empty_lockless(&up->reader_queue)) {
			len = ip_vs_receive(tinfo->sock, tinfo->buf,
					ipvs->bcfg.sync_maxlen);
			if (len <= 0) {
				if (len != -EAGAIN)
					pr_err("receiving message error\n");
				break;
			}

			ip_vs_process_message(ipvs, tinfo->buf, len);
		}
	}

	return 0;
}


/*
 *      Start the master or backup sync daemon.
 *
 *      ipvs:  per-netns IPVS state
 *      c:     daemon configuration; unset fields (address family/group,
 *             port, TTL, sync_maxlen) are filled in with defaults and the
 *             result is copied to ipvs->mcfg or ipvs->bcfg
 *      state: IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP
 *
 *      One socket and one kthread are created per sync port.
 *      Returns 0 on success, -ENOPROTOOPT when the module use count
 *      cannot be taken, -ENODEV for an unknown interface, -EEXIST when
 *      that daemon is already running, -EINVAL for any other state, or
 *      the error from allocation, socket setup or kthread creation.
 */
int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
		      int state)
{
	struct ip_vs_sync_thread_data *ti = NULL, *tinfo;
	struct task_struct *task;
	struct net_device *dev;
	char *name;
	int (*threadfn)(void *data);
	int id = 0, count, hlen;
	int result = -ENOMEM;
	u16 mtu, min_mtu;

	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
		  sizeof(struct ip_vs_sync_conn_v0));

	/* increase the module use count */
	if (!ip_vs_use_count_inc())
		return -ENOPROTOOPT;

	/* Do not hold one mutex and then to block on another */
	/* Block on one lock, only try the other; on failure drop the held
	 * lock and retry with the roles swapped until both are owned.
	 */
	for (;;) {
		rtnl_lock();
		if (mutex_trylock(&ipvs->sync_mutex))
			break;
		rtnl_unlock();
		mutex_lock(&ipvs->sync_mutex);
		if (rtnl_trylock())
			break;
		mutex_unlock(&ipvs->sync_mutex);
	}

	/* The thread count is fixed by whichever daemon starts first */
	if (!ipvs->sync_state) {
		count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
		ipvs->threads_mask = count - 1;
	} else
		count = ipvs->threads_mask + 1;

	/* Fill in defaults for everything the caller left unset */
	if (c->mcast_af == AF_UNSPEC) {
		c->mcast_af = AF_INET;
		c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP);
	}
	if (!c->mcast_port)
		c->mcast_port = IP_VS_SYNC_PORT;
	if (!c->mcast_ttl)
		c->mcast_ttl = 1;

	dev = __dev_get_by_name(ipvs->net, c->mcast_ifn);
	if (!dev) {
		pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
		result = -ENODEV;
		goto out_early;
	}
	/* IP + UDP header overhead for the chosen address family */
	hlen = (AF_INET6 == c->mcast_af) ?
	       sizeof(struct ipv6hdr) + sizeof(struct udphdr) :
	       sizeof(struct iphdr) + sizeof(struct udphdr);
	/* The backup sizes its default from the device MTU (1500..65535),
	 * the master always assumes 1500.
	 */
	mtu = (state == IP_VS_STATE_BACKUP) ?
		  clamp(dev->mtu, 1500U, 65535U) : 1500U;
	min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1;

	if (c->sync_maxlen)
		c->sync_maxlen = clamp_t(unsigned int,
					 c->sync_maxlen, min_mtu,
					 65535 - hlen);
	else
		c->sync_maxlen = mtu - hlen;

	if (state == IP_VS_STATE_MASTER) {
		result = -EEXIST;
		if (ipvs->ms)
			goto out_early;

		ipvs->mcfg = *c;
		name = "ipvs-m:%d:%d";
		threadfn = sync_thread_master;
	} else if (state == IP_VS_STATE_BACKUP) {
		result = -EEXIST;
		if (ipvs->backup_tinfo)
			goto out_early;

		ipvs->bcfg = *c;
		name = "ipvs-b:%d:%d";
		threadfn = sync_thread_backup;
	} else {
		result = -EINVAL;
		goto out_early;
	}

	if (state == IP_VS_STATE_MASTER) {
		struct ipvs_master_sync_state *ms;

		/* One queue/state per master thread */
		result = -ENOMEM;
		ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL);
		if (!ipvs->ms)
			goto out;
		ms = ipvs->ms;
		for (id = 0; id < count; id++, ms++) {
			INIT_LIST_HEAD(&ms->sync_queue);
			ms->sync_queue_len = 0;
			ms->sync_queue_delay = 0;
			INIT_DELAYED_WORK(&ms->master_wakeup_work,
					  master_wakeup_work_handler);
			ms->ipvs = ipvs;
		}
	}
	result = -ENOMEM;
	ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data),
		     GFP_KERNEL);
	if (!ti)
		goto out;

	/* On failure "id" tells the error path how far setup got */
	for (id = 0; id < count; id++) {
		tinfo = &ti[id];
		tinfo->ipvs = ipvs;
		if (state == IP_VS_STATE_BACKUP) {
			/* Receive buffer, only the backup needs one */
			result = -ENOMEM;
			tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
					     GFP_KERNEL);
			if (!tinfo->buf)
				goto out;
		}
		tinfo->id = id;
		if (state == IP_VS_STATE_MASTER)
			result = make_send_sock(ipvs, id, dev, &tinfo->sock);
		else
			result = make_receive_sock(ipvs, id, dev, &tinfo->sock);
		if (result < 0)
			goto out;

		task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
		if (IS_ERR(task)) {
			result = PTR_ERR(task);
			goto out;
		}
		tinfo->task = task;
	}

	/* mark as active */

	if (state == IP_VS_STATE_MASTER)
		ipvs->master_tinfo = ti;
	else
		ipvs->backup_tinfo = ti;
	spin_lock_bh(&ipvs->sync_buff_lock);
	ipvs->sync_state |= state;
	spin_unlock_bh(&ipvs->sync_buff_lock);

	mutex_unlock(&ipvs->sync_mutex);
	rtnl_unlock();

	return 0;

out:
	/* We do not need RTNL lock anymore, release it here so that
	 * sock_release below can use rtnl_lock to leave the mcast group.
	 */
	rtnl_unlock();
	/* id == count when the failure happened outside the setup loop */
	id = min(id, count - 1);
	if (ti) {
		/* Stop the threads that were already started */
		for (tinfo = ti + id; tinfo >= ti; tinfo--) {
			if (tinfo->task)
				kthread_stop(tinfo->task);
		}
	}
	/* Free the master state unless a master daemon is running */
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		kfree(ipvs->ms);
		ipvs->ms = NULL;
	}
	mutex_unlock(&ipvs->sync_mutex);

	/* No more mutexes, release socks */
	if (ti) {
		for (tinfo = ti + id; tinfo >= ti; tinfo--) {
			if (tinfo->sock)
				sock_release(tinfo->sock);
			kfree(tinfo->buf);
		}
		kfree(ti);
	}

	/* decrease the module use count */
	ip_vs_use_count_dec();
	return result;

out_early:
	/* Nothing was allocated yet: just drop both locks */
	mutex_unlock(&ipvs->sync_mutex);
	rtnl_unlock();

	/* decrease the module use count */
	ip_vs_use_count_dec();
	return result;
}


/*
 *      Stop the master or backup sync daemon.
 *
 *      state: IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP
 *
 *      Stops every thread of that daemon, then releases its sockets and
 *      buffers and drops the module use count.
 *      Returns -ESRCH when that daemon is not running, -EINVAL for any
 *      other state; otherwise 0 or the value returned by one of the
 *      kthread_stop() calls (see the retc handling in the loops).
 */
int stop_sync_thread(struct netns_ipvs *ipvs, int state)
{
	struct ip_vs_sync_thread_data *ti, *tinfo;
	int id;
	int retc = -EINVAL;

	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));

	mutex_lock(&ipvs->sync_mutex);
	if (state == IP_VS_STATE_MASTER) {
		retc = -ESRCH;
		if (!ipvs->ms)
			goto err;
		ti = ipvs->master_tinfo;

		/*
		 * The lock synchronizes with sb_queue_tail(), so that we don't
		 * add sync buffers to the queue, when we are already in
		 * progress of stopping the master sync daemon.
		 */

		spin_lock_bh(&ipvs->sync_buff_lock);
		spin_lock(&ipvs->sync_lock);
		ipvs->sync_state &= ~IP_VS_STATE_MASTER;
		spin_unlock(&ipvs->sync_lock);
		spin_unlock_bh(&ipvs->sync_buff_lock);

		retc = 0;
		for (id = ipvs->threads_mask; id >= 0; id--) {
			struct ipvs_master_sync_state *ms = &ipvs->ms[id];
			int ret;

			tinfo = &ti[id];
			pr_info("stopping master sync thread %d ...\n",
				task_pid_nr(tinfo->task));
			/* The work item is cancelled before its target
			 * thread is stopped.
			 */
			cancel_delayed_work_sync(&ms->master_wakeup_work);
			ret = kthread_stop(tinfo->task);
			/* Once retc is negative it is no longer overwritten */
			if (retc >= 0)
				retc = ret;
		}
		kfree(ipvs->ms);
		ipvs->ms = NULL;
		ipvs->master_tinfo = NULL;
	} else if (state == IP_VS_STATE_BACKUP) {
		retc = -ESRCH;
		if (!ipvs->backup_tinfo)
			goto err;
		ti = ipvs->backup_tinfo;

		ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
		retc = 0;
		for (id = ipvs->threads_mask; id >= 0; id--) {
			int ret;

			tinfo = &ti[id];
			pr_info("stopping backup sync thread %d ...\n",
				task_pid_nr(tinfo->task));
			ret = kthread_stop(tinfo->task);
			/* Once retc is negative it is no longer overwritten */
			if (retc >= 0)
				retc = ret;
		}
		ipvs->backup_tinfo = NULL;
	} else {
		goto err;
	}
	/* Remember the last index before the mutex is dropped */
	id = ipvs->threads_mask;
	mutex_unlock(&ipvs->sync_mutex);

	/* No more mutexes, release socks */
	for (tinfo = ti + id; tinfo >= ti; tinfo--) {
		if (tinfo->sock)
			sock_release(tinfo->sock);
		kfree(tinfo->buf);
	}
	kfree(ti);

	/* decrease the module use count */
	ip_vs_use_count_dec();
	return retc;

err:
	mutex_unlock(&ipvs->sync_mutex);
	return retc;
}

/*
 * Initialize data struct for each netns
 */
/* Sets up sync_mutex (with its own lock class key) and both spinlocks. */
int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs)
{
	__mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
	spin_lock_init(&ipvs->sync_lock);
	spin_lock_init(&ipvs->sync_buff_lock);
	return 0;
}

/*
 * Stop both sync daemons of a netns. -ESRCH (daemon not running) is not
 * treated as a failure; any other non-zero result is logged.
 */
void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
{
	int retc;

	retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER);
	if (retc && retc != -ESRCH)
		pr_err("Failed to stop Master Daemon\n");

	retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP);
	if (retc && retc != -ESRCH)
		pr_err("Failed to stop Backup Daemon\n");
}