1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * IPVS An implementation of the IP virtual server support for the 4 * LINUX operating system. IPVS is now implemented as a module 5 * over the NetFilter framework. IPVS can be used to build a 6 * high-performance and highly available server based on a 7 * cluster of servers. 8 * 9 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 10 * Peter Kese <peter.kese@ijs.si> 11 * Julian Anastasov <ja@ssi.bg> 12 * 13 * Changes: 14 */ 15 16 #define pr_fmt(fmt) "IPVS: " fmt 17 18 #include <linux/module.h> 19 #include <linux/init.h> 20 #include <linux/types.h> 21 #include <linux/capability.h> 22 #include <linux/fs.h> 23 #include <linux/sysctl.h> 24 #include <linux/proc_fs.h> 25 #include <linux/workqueue.h> 26 #include <linux/seq_file.h> 27 #include <linux/slab.h> 28 29 #include <linux/netfilter.h> 30 #include <linux/netfilter_ipv4.h> 31 #include <linux/mutex.h> 32 #include <linux/rcupdate_wait.h> 33 34 #include <net/net_namespace.h> 35 #include <linux/nsproxy.h> 36 #include <net/ip.h> 37 #ifdef CONFIG_IP_VS_IPV6 38 #include <net/ipv6.h> 39 #include <net/ip6_route.h> 40 #include <net/netfilter/ipv6/nf_defrag_ipv6.h> 41 #endif 42 #include <net/route.h> 43 #include <net/sock.h> 44 #include <net/genetlink.h> 45 46 #include <linux/uaccess.h> 47 48 #include <net/ip_vs.h> 49 50 MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); 51 52 static struct lock_class_key __ipvs_service_key; 53 54 /* sysctl variables */ 55 56 #ifdef CONFIG_IP_VS_DEBUG 57 static int sysctl_ip_vs_debug_level = 0; 58 59 int ip_vs_get_debug_level(void) 60 { 61 return sysctl_ip_vs_debug_level; 62 } 63 #endif 64 65 66 /* Protos */ 67 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); 68 69 70 #ifdef CONFIG_IP_VS_IPV6 71 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ 72 static bool __ip_vs_addr_is_local_v6(struct net *net, 73 const struct in6_addr *addr) 74 { 75 struct flowi6 fl6 = { 76 .daddr = *addr, 77 }; 78 struct dst_entry *dst = ip6_route_output(net, NULL, &fl6); 79 bool is_local; 80 81 is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK); 82 83 dst_release(dst); 84 return is_local; 85 } 86 #endif 87 88 #ifdef CONFIG_SYSCTL 89 /* 90 * update_defense_level is called from keventd and from sysctl, 91 * so it needs to protect itself from softirqs 92 */ 93 static void update_defense_level(struct netns_ipvs *ipvs) 94 { 95 struct sysinfo i; 96 int availmem; 97 int amemthresh; 98 int nomem; 99 int to_change = -1; 100 101 /* we only count free and buffered memory (in pages) */ 102 si_meminfo(&i); 103 availmem = i.freeram + i.bufferram; 104 /* however in linux 2.5 the i.bufferram is total page cache size, 105 we need adjust it */ 106 /* si_swapinfo(&i); */ 107 /* availmem = availmem - (i.totalswap - i.freeswap); */ 108 109 amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0); 110 nomem = (availmem < amemthresh); 111 112 local_bh_disable(); 113 114 /* drop_entry */ 115 spin_lock(&ipvs->dropentry_lock); 116 switch (ipvs->sysctl_drop_entry) { 117 case 0: 118 atomic_set(&ipvs->dropentry, 0); 119 break; 120 case 1: 121 if (nomem) { 122 atomic_set(&ipvs->dropentry, 1); 123 ipvs->sysctl_drop_entry = 2; 124 } else { 125 atomic_set(&ipvs->dropentry, 0); 126 } 127 break; 128 case 2: 129 if (nomem) { 130 atomic_set(&ipvs->dropentry, 1); 131 } else { 132 atomic_set(&ipvs->dropentry, 0); 133 ipvs->sysctl_drop_entry = 1; 134 } 135 break; 136 case 3: 137 atomic_set(&ipvs->dropentry, 1); 138 break; 139 } 140 spin_unlock(&ipvs->dropentry_lock); 141 142 /* drop_packet */ 143 spin_lock(&ipvs->droppacket_lock); 144 switch (ipvs->sysctl_drop_packet) { 145 case 0: 146 ipvs->drop_rate = 0; 147 break; 148 case 1: 149 if (nomem) { 150 ipvs->drop_counter = amemthresh / (amemthresh - availmem); 151 ipvs->drop_rate = ipvs->drop_counter; 152 ipvs->sysctl_drop_packet = 2; 153 } else { 154 ipvs->drop_rate = 0; 155 } 156 break; 157 case 2: 158 if (nomem) { 159 ipvs->drop_counter = amemthresh / (amemthresh - availmem); 160 ipvs->drop_rate = ipvs->drop_counter; 161 } else { 162 ipvs->drop_rate = 0; 163 ipvs->sysctl_drop_packet = 1; 164 } 165 break; 166 case 3: 167 ipvs->drop_rate = ipvs->sysctl_am_droprate; 168 break; 169 } 170 spin_unlock(&ipvs->droppacket_lock); 171 172 /* secure_tcp */ 173 spin_lock(&ipvs->securetcp_lock); 174 switch (ipvs->sysctl_secure_tcp) { 175 case 0: 176 if (ipvs->old_secure_tcp >= 2) 177 to_change = 0; 178 break; 179 case 1: 180 if (nomem) { 181 if (ipvs->old_secure_tcp < 2) 182 to_change = 1; 183 ipvs->sysctl_secure_tcp = 2; 184 } else { 185 if (ipvs->old_secure_tcp >= 2) 186 to_change = 0; 187 } 188 break; 189 case 2: 190 if (nomem) { 191 if (ipvs->old_secure_tcp < 2) 192 to_change = 1; 193 } else { 194 if (ipvs->old_secure_tcp >= 2) 195 to_change = 0; 196 ipvs->sysctl_secure_tcp = 1; 197 } 198 break; 199 case 3: 200 if (ipvs->old_secure_tcp < 2) 201 to_change = 1; 202 break; 203 } 204 ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp; 205 if (to_change >= 0) 206 ip_vs_protocol_timeout_change(ipvs, 207 ipvs->sysctl_secure_tcp > 1); 208 spin_unlock(&ipvs->securetcp_lock); 209 210 local_bh_enable(); 211 } 212 213 /* Handler for delayed work for expiring no 214 * destination connections 215 */ 216 static void expire_nodest_conn_handler(struct work_struct *work) 217 { 218 struct netns_ipvs *ipvs; 219 220 ipvs = container_of(work, struct netns_ipvs, 221 expire_nodest_conn_work.work); 222 ip_vs_expire_nodest_conn_flush(ipvs); 223 } 224 225 /* 226 * Timer for checking the defense 227 */ 228 #define DEFENSE_TIMER_PERIOD 1*HZ 229 230 static void defense_work_handler(struct work_struct *work) 231 { 232 struct netns_ipvs *ipvs = 233 container_of(work, struct netns_ipvs, defense_work.work); 234 235 update_defense_level(ipvs); 236 if (atomic_read(&ipvs->dropentry)) 237 ip_vs_random_dropentry(ipvs); 238 queue_delayed_work(system_long_wq, &ipvs->defense_work, 239 DEFENSE_TIMER_PERIOD); 240 } 241 #endif 242 243 static void est_reload_work_handler(struct work_struct *work) 244 { 245 struct netns_ipvs *ipvs = 246 container_of(work, struct netns_ipvs, est_reload_work.work); 247 int genid_done = atomic_read(&ipvs->est_genid_done); 248 unsigned long delay = HZ / 10; /* repeat startups after failure */ 249 bool repeat = false; 250 int genid; 251 int id; 252 253 mutex_lock(&ipvs->est_mutex); 254 genid = atomic_read(&ipvs->est_genid); 255 for (id = 0; id < ipvs->est_kt_count; id++) { 256 struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; 257 258 /* netns clean up started, abort delayed work */ 259 if (!READ_ONCE(ipvs->enable)) 260 goto unlock; 261 if (!kd) 262 continue; 263 /* New config ? Stop kthread tasks */ 264 if (genid != genid_done) { 265 if (!id) { 266 /* Only we can stop kt 0 but not under mutex */ 267 mutex_unlock(&ipvs->est_mutex); 268 ip_vs_est_kthread_stop(kd); 269 mutex_lock(&ipvs->est_mutex); 270 if (!READ_ONCE(ipvs->enable)) 271 goto unlock; 272 /* kd for kt 0 is never destroyed */ 273 } else { 274 ip_vs_est_kthread_stop(kd); 275 } 276 } 277 if (!kd->task && !ip_vs_est_stopped(ipvs)) { 278 bool start; 279 280 /* Do not start kthreads above 0 in calc phase */ 281 if (id) 282 start = !ipvs->est_calc_phase; 283 else 284 start = kd->needed; 285 if (start && ip_vs_est_kthread_start(ipvs, kd) < 0) 286 repeat = true; 287 } 288 } 289 290 atomic_set(&ipvs->est_genid_done, genid); 291 292 if (repeat) 293 queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 294 delay); 295 296 unlock: 297 mutex_unlock(&ipvs->est_mutex); 298 } 299 300 static int get_conn_tab_size(struct netns_ipvs *ipvs) 301 { 302 const struct ip_vs_rht *t; 303 int size = 0; 304 305 rcu_read_lock(); 306 t = rcu_dereference(ipvs->conn_tab); 307 if (t) 308 size = t->size; 309 rcu_read_unlock(); 310 311 return size; 312 } 313 314 int 315 ip_vs_use_count_inc(void) 316 { 317 return try_module_get(THIS_MODULE); 318 } 319 320 void 321 ip_vs_use_count_dec(void) 322 { 323 module_put(THIS_MODULE); 324 } 325 326 327 /* Service hashing: 328 * Operation Locking order 329 * --------------------------------------------------------------------------- 330 * add first table service_mutex 331 * attach new table service_mutex 332 * add/del service service_mutex, RCU, bit lock 333 * move between tables (rehash) svc_resize_sem(W), seqcount_t(W), bit lock 334 * replace old with attached svc_resize_sem(W), svc_replace_sem(W) 335 * find service RCU, seqcount_t(R) 336 * walk services(blocking) service_mutex, svc_resize_sem(R) 337 * walk services(non-blocking) RCU, seqcount_t(R) 338 * walk services(non-blocking) svc_resize_sem(R), RCU, seqcount_t(R) 339 * walk services(non-blocking) svc_replace_sem(R), RCU, seqcount_t(R) 340 * del table service_mutex after stopped work 341 * 342 * - new table is attached on resizing under service_mutex and all operations 343 * can run in parallel in 2 tables until the new table is registered as current 344 * one 345 * - two contexts can modify buckets: config and table resize (work), both in 346 * process context 347 * - only table resizer can move entries, so we do not protect t->seqc[] 348 * items with t->lock[] 349 * - lookups occur under RCU lock and seqcount reader lock to detect if 350 * services are moved to new table 351 * - move operations may disturb readers: find operation will not miss entries 352 * but walkers may see same entry twice if they are forced to retry chains 353 * or to walk the newly attached second table 354 * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to check 355 * svc_table_changes and repeat the RCU read section if new table is installed 356 * - walkers may serialize with the whole resizing process (svc_resize_sem) 357 * to prevent seeing same service twice or just with the svc_table 358 * replace (svc_replace_sem) when we can see entries twice but we 359 * prefer to run concurrently with the rehashing. 360 */ 361 362 /* 363 * Returns hash value for virtual service 364 */ 365 static inline u32 366 ip_vs_svc_hashval(struct ip_vs_rht *t, int af, unsigned int proto, 367 const union nf_inet_addr *addr, __be16 port) 368 { 369 return ip_vs_rht_hash_linfo(t, af, addr, ntohs(port), proto); 370 } 371 372 /* 373 * Returns hash value of fwmark for virtual service lookup 374 */ 375 static inline u32 ip_vs_svc_fwm_hashval(struct ip_vs_rht *t, int af, 376 __u32 fwmark) 377 { 378 return jhash_2words(fwmark, af, (u32)t->hash_key.key[0]); 379 } 380 381 /* Hashes a service in the svc_table by <proto,addr,port> or by fwmark */ 382 static int ip_vs_svc_hash(struct ip_vs_service *svc) 383 { 384 struct netns_ipvs *ipvs = svc->ipvs; 385 struct hlist_bl_head *head; 386 struct ip_vs_rht *t; 387 u32 hash; 388 389 if (svc->flags & IP_VS_SVC_F_HASHED) { 390 pr_err("%s(): request for already hashed, called from %pS\n", 391 __func__, __builtin_return_address(0)); 392 return 0; 393 } 394 395 /* increase its refcnt because it is referenced by the svc table */ 396 atomic_inc(&svc->refcnt); 397 398 /* We know if new table is attached under service_mutex but rely on 399 * RCU to hold the old table to be freed in resizer 400 */ 401 rcu_read_lock(); 402 403 /* This can be the old or the new table */ 404 t = rcu_dereference(ipvs->svc_table); 405 406 /* New entries go into recent table */ 407 t = rcu_dereference(t->new_tbl); 408 409 if (svc->fwmark == 0) { 410 /* 411 * Hash it by <protocol,addr,port> 412 */ 413 hash = ip_vs_svc_hashval(t, svc->af, svc->protocol, 414 &svc->addr, svc->port); 415 } else { 416 /* 417 * Hash it by fwmark 418 */ 419 hash = ip_vs_svc_fwm_hashval(t, svc->af, svc->fwmark); 420 } 421 head = t->buckets + (hash & t->mask); 422 hlist_bl_lock(head); 423 WRITE_ONCE(svc->hash_key, ip_vs_rht_build_hash_key(t, hash)); 424 svc->flags |= IP_VS_SVC_F_HASHED; 425 hlist_bl_add_head_rcu(&svc->s_list, head); 426 hlist_bl_unlock(head); 427 428 rcu_read_unlock(); 429 430 return 1; 431 } 432 433 434 /* 435 * Unhashes a service from svc_table. 436 * Should be called with locked tables. 437 */ 438 static int ip_vs_svc_unhash(struct ip_vs_service *svc) 439 { 440 struct netns_ipvs *ipvs = svc->ipvs; 441 struct hlist_bl_head *head; 442 struct ip_vs_rht *t; 443 u32 hash_key2; 444 u32 hash_key; 445 446 if (!(svc->flags & IP_VS_SVC_F_HASHED)) { 447 pr_err("%s(): request for unhash flagged, called from %pS\n", 448 __func__, __builtin_return_address(0)); 449 return 0; 450 } 451 452 /* We know if new table is attached under service_mutex but rely on 453 * RCU to hold the old table to be freed in resizer 454 */ 455 rcu_read_lock(); 456 457 /* This can be the old or the new table */ 458 t = rcu_dereference(ipvs->svc_table); 459 hash_key = READ_ONCE(svc->hash_key); 460 /* We need to lock the bucket in the right table */ 461 if (ip_vs_rht_same_table(t, hash_key)) { 462 head = t->buckets + (hash_key & t->mask); 463 hlist_bl_lock(head); 464 /* Ensure hash_key is read under lock */ 465 hash_key2 = READ_ONCE(svc->hash_key); 466 /* Moved to new table ? */ 467 if (hash_key != hash_key2) { 468 hlist_bl_unlock(head); 469 t = rcu_dereference(t->new_tbl); 470 head = t->buckets + (hash_key2 & t->mask); 471 hlist_bl_lock(head); 472 } 473 } else { 474 /* It is already moved to new table */ 475 t = rcu_dereference(t->new_tbl); 476 head = t->buckets + (hash_key & t->mask); 477 hlist_bl_lock(head); 478 } 479 /* Remove it from svc_table */ 480 hlist_bl_del_rcu(&svc->s_list); 481 482 svc->flags &= ~IP_VS_SVC_F_HASHED; 483 atomic_dec(&svc->refcnt); 484 hlist_bl_unlock(head); 485 486 rcu_read_unlock(); 487 return 1; 488 } 489 490 491 /* 492 * Get service by {netns, proto,addr,port} in the service table. 493 */ 494 static inline struct ip_vs_service * 495 __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, 496 const union nf_inet_addr *vaddr, __be16 vport) 497 { 498 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 499 struct hlist_bl_head *head; 500 struct ip_vs_service *svc; 501 struct ip_vs_rht *t, *p; 502 struct hlist_bl_node *e; 503 u32 hash, hash_key; 504 505 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) { 506 /* Check for "full" addressed entries */ 507 hash = ip_vs_svc_hashval(t, af, protocol, vaddr, vport); 508 509 hash_key = ip_vs_rht_build_hash_key(t, hash); 510 ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { 511 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 512 if (READ_ONCE(svc->hash_key) == hash_key && 513 svc->af == af && 514 ip_vs_addr_equal(af, &svc->addr, vaddr) && 515 svc->port == vport && 516 svc->protocol == protocol && !svc->fwmark) { 517 /* HIT */ 518 return svc; 519 } 520 } 521 } 522 } 523 524 return NULL; 525 } 526 527 528 /* 529 * Get service by {fwmark} in the service table. 530 */ 531 static inline struct ip_vs_service * 532 __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) 533 { 534 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 535 struct hlist_bl_head *head; 536 struct ip_vs_service *svc; 537 struct ip_vs_rht *t, *p; 538 struct hlist_bl_node *e; 539 u32 hash, hash_key; 540 541 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) { 542 /* Check for fwmark addressed entries */ 543 hash = ip_vs_svc_fwm_hashval(t, af, fwmark); 544 545 hash_key = ip_vs_rht_build_hash_key(t, hash); 546 ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { 547 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 548 if (READ_ONCE(svc->hash_key) == hash_key && 549 svc->fwmark == fwmark && svc->af == af) { 550 /* HIT */ 551 return svc; 552 } 553 } 554 } 555 } 556 557 return NULL; 558 } 559 560 /* Find service, called under RCU lock */ 561 struct ip_vs_service * 562 ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, 563 const union nf_inet_addr *vaddr, __be16 vport) 564 { 565 struct ip_vs_service *svc = NULL; 566 int af_id = ip_vs_af_index(af); 567 568 /* 569 * Check the table hashed by fwmark first 570 */ 571 if (fwmark && atomic_read(&ipvs->fwm_services[af_id])) { 572 svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark); 573 if (svc) 574 goto out; 575 } 576 577 if (!atomic_read(&ipvs->nonfwm_services[af_id])) 578 goto out; 579 580 /* 581 * Check the table hashed by <protocol,addr,port> 582 * for "full" addressed entries 583 */ 584 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); 585 if (svc) 586 goto out; 587 588 if (protocol == IPPROTO_TCP && 589 atomic_read(&ipvs->ftpsvc_counter[af_id]) && 590 (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) { 591 /* 592 * Check if ftp service entry exists, the packet 593 * might belong to FTP data connections. 594 */ 595 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT); 596 if (svc) 597 goto out; 598 } 599 600 if (atomic_read(&ipvs->nullsvc_counter[af_id])) { 601 /* 602 * Check if the catch-all port (port zero) exists 603 */ 604 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0); 605 } 606 607 out: 608 IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", 609 fwmark, ip_vs_proto_name(protocol), 610 IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), 611 svc ? "hit" : "not hit"); 612 613 return svc; 614 } 615 616 /* Return the number of registered services */ 617 static int ip_vs_get_num_services(struct netns_ipvs *ipvs) 618 { 619 int ns = 0, ni = IP_VS_AF_MAX; 620 621 while (--ni >= 0) 622 ns += atomic_read(&ipvs->num_services[ni]); 623 return ns; 624 } 625 626 /* Get default load factor to map num_services/u_thresh to t->size */ 627 static int ip_vs_svc_default_load_factor(struct netns_ipvs *ipvs) 628 { 629 int factor; 630 631 if (net_eq(ipvs->net, &init_net)) 632 factor = -3; /* grow if load is above 12.5% */ 633 else 634 factor = -2; /* grow if load is above 25% */ 635 return factor; 636 } 637 638 /* Get the desired svc_table size */ 639 static int ip_vs_svc_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, 640 int lfactor) 641 { 642 return ip_vs_rht_desired_size(ipvs, t, ip_vs_get_num_services(ipvs), 643 lfactor, IP_VS_SVC_TAB_MIN_BITS, 644 IP_VS_SVC_TAB_MAX_BITS); 645 } 646 647 /* Allocate svc_table */ 648 static struct ip_vs_rht *ip_vs_svc_table_alloc(struct netns_ipvs *ipvs, 649 int buckets, int lfactor) 650 { 651 struct ip_vs_rht *t; 652 int scounts, locks; 653 654 /* No frequent lookups to race with resizing, so use max of 64 655 * seqcounts. Only resizer moves entries, so use 0 locks. 656 */ 657 scounts = clamp(buckets >> 4, 1, 64); 658 locks = 0; 659 660 t = ip_vs_rht_alloc(buckets, scounts, locks); 661 if (!t) 662 return NULL; 663 t->lfactor = lfactor; 664 ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_SVC_TAB_MIN_BITS, 665 IP_VS_SVC_TAB_MAX_BITS); 666 return t; 667 } 668 669 /* svc_table resizer work */ 670 static void svc_resize_work_handler(struct work_struct *work) 671 { 672 struct hlist_bl_head *head, *head2; 673 struct ip_vs_rht *t_free = NULL; 674 unsigned int resched_score = 0; 675 struct hlist_bl_node *cn, *nn; 676 struct ip_vs_rht *t, *t_new; 677 struct ip_vs_service *svc; 678 struct netns_ipvs *ipvs; 679 bool more_work = true; 680 seqcount_t *sc; 681 int limit = 0; 682 int new_size; 683 int lfactor; 684 u32 bucket; 685 686 ipvs = container_of(work, struct netns_ipvs, svc_resize_work.work); 687 688 if (!down_write_trylock(&ipvs->svc_resize_sem)) 689 goto out; 690 if (!mutex_trylock(&ipvs->service_mutex)) 691 goto unlock_sem; 692 more_work = false; 693 clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags); 694 if (!READ_ONCE(ipvs->enable)) 695 goto unlock_m; 696 t = rcu_dereference_protected(ipvs->svc_table, 1); 697 /* Do nothing if table is removed */ 698 if (!t) 699 goto unlock_m; 700 /* New table already attached? BUG! */ 701 if (t != rcu_access_pointer(t->new_tbl)) 702 goto unlock_m; 703 704 lfactor = sysctl_svc_lfactor(ipvs); 705 /* Should we resize ? */ 706 new_size = ip_vs_svc_desired_size(ipvs, t, lfactor); 707 if (new_size == t->size && lfactor == t->lfactor) 708 goto unlock_m; 709 710 t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); 711 if (!t_new) { 712 more_work = true; 713 goto unlock_m; 714 } 715 /* Flip the table_id */ 716 t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; 717 718 /* Attach new table */ 719 rcu_assign_pointer(t->new_tbl, t_new); 720 /* Allow add/del to new_tbl while moving from old table */ 721 mutex_unlock(&ipvs->service_mutex); 722 723 ip_vs_rht_for_each_bucket(t, bucket, head) { 724 same_bucket: 725 if (++limit >= 16) { 726 /* Check if work is stopped */ 727 if (test_bit(IP_VS_WORK_SVC_NORESIZE, 728 &ipvs->work_flags)) 729 goto unlock_sem; 730 if (resched_score >= 100) { 731 resched_score = 0; 732 cond_resched(); 733 } 734 limit = 0; 735 } 736 if (hlist_bl_empty(head)) { 737 resched_score++; 738 continue; 739 } 740 /* Preemption calls ahead... */ 741 resched_score = 0; 742 743 sc = &t->seqc[bucket & t->seqc_mask]; 744 /* seqcount_t usage considering PREEMPT_RT rules: 745 * - we are the only writer => preemption can be allowed 746 * - readers (SoftIRQ) => disable BHs 747 * - readers (processes) => preemption should be disabled 748 */ 749 local_bh_disable(); 750 preempt_disable_nested(); 751 write_seqcount_begin(sc); 752 hlist_bl_lock(head); 753 754 hlist_bl_for_each_entry_safe(svc, cn, nn, head, s_list) { 755 u32 hash; 756 757 /* New hash for the new table */ 758 if (svc->fwmark == 0) { 759 /* Hash it by <protocol,addr,port> */ 760 hash = ip_vs_svc_hashval(t_new, svc->af, 761 svc->protocol, 762 &svc->addr, svc->port); 763 } else { 764 /* Hash it by fwmark */ 765 hash = ip_vs_svc_fwm_hashval(t_new, svc->af, 766 svc->fwmark); 767 } 768 hlist_bl_del_rcu(&svc->s_list); 769 head2 = t_new->buckets + (hash & t_new->mask); 770 771 hlist_bl_lock(head2); 772 WRITE_ONCE(svc->hash_key, 773 ip_vs_rht_build_hash_key(t_new, hash)); 774 /* t_new->seqc are not used at this stage, we race 775 * only with add/del, so only lock the bucket. 776 */ 777 hlist_bl_add_head_rcu(&svc->s_list, head2); 778 hlist_bl_unlock(head2); 779 /* Too long chain? Do it in steps */ 780 if (++limit >= 64) 781 break; 782 } 783 784 hlist_bl_unlock(head); 785 write_seqcount_end(sc); 786 preempt_enable_nested(); 787 local_bh_enable(); 788 if (limit >= 64) 789 goto same_bucket; 790 } 791 792 /* Serialize with readers that don't like svc_table changes */ 793 down_write(&ipvs->svc_replace_sem); 794 795 /* Check if work is stopped to avoid synchronize_rcu() */ 796 if (test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) 797 goto unlock_repl; 798 799 rcu_assign_pointer(ipvs->svc_table, t_new); 800 /* Inform readers that new table is installed */ 801 smp_mb__before_atomic(); 802 atomic_inc(&ipvs->svc_table_changes); 803 t_free = t; 804 805 unlock_repl: 806 up_write(&ipvs->svc_replace_sem); 807 808 unlock_sem: 809 up_write(&ipvs->svc_resize_sem); 810 811 if (t_free) { 812 /* RCU readers should not see more than two tables in chain. 813 * To prevent new table to be attached wait here instead of 814 * freeing the old table in RCU callback. 815 */ 816 synchronize_rcu(); 817 ip_vs_rht_free(t_free); 818 } 819 820 out: 821 if (!READ_ONCE(ipvs->enable) || !more_work || 822 test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) 823 return; 824 queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1); 825 return; 826 827 unlock_m: 828 mutex_unlock(&ipvs->service_mutex); 829 goto unlock_sem; 830 } 831 832 static inline void 833 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) 834 { 835 atomic_inc(&svc->refcnt); 836 rcu_assign_pointer(dest->svc, svc); 837 } 838 839 static void ip_vs_service_free(struct ip_vs_service *svc) 840 { 841 ip_vs_stats_release(&svc->stats); 842 kfree(svc); 843 } 844 845 static void ip_vs_service_rcu_free(struct rcu_head *head) 846 { 847 struct ip_vs_service *svc; 848 849 svc = container_of(head, struct ip_vs_service, rcu_head); 850 ip_vs_service_free(svc); 851 } 852 853 static void __ip_vs_svc_put(struct ip_vs_service *svc) 854 { 855 if (atomic_dec_and_test(&svc->refcnt)) { 856 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", 857 svc->fwmark, 858 IP_VS_DBG_ADDR(svc->af, &svc->addr), 859 ntohs(svc->port)); 860 call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); 861 } 862 } 863 864 865 /* 866 * Returns hash value for real service 867 */ 868 static inline unsigned int ip_vs_rs_hashkey(int af, 869 const union nf_inet_addr *addr, 870 __be16 port) 871 { 872 unsigned int porth = ntohs(port); 873 __be32 addr_fold = addr->ip; 874 875 #ifdef CONFIG_IP_VS_IPV6 876 if (af == AF_INET6) 877 addr_fold = addr->ip6[0]^addr->ip6[1]^ 878 addr->ip6[2]^addr->ip6[3]; 879 #endif 880 881 return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) 882 & IP_VS_RTAB_MASK; 883 } 884 885 /* Hash ip_vs_dest in rs_table by <proto,addr,port>. */ 886 static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) 887 { 888 unsigned int hash; 889 __be16 port; 890 891 if (dest->in_rs_table) 892 return; 893 894 switch (IP_VS_DFWD_METHOD(dest)) { 895 case IP_VS_CONN_F_MASQ: 896 port = dest->port; 897 break; 898 case IP_VS_CONN_F_TUNNEL: 899 switch (dest->tun_type) { 900 case IP_VS_CONN_F_TUNNEL_TYPE_GUE: 901 port = dest->tun_port; 902 break; 903 case IP_VS_CONN_F_TUNNEL_TYPE_IPIP: 904 case IP_VS_CONN_F_TUNNEL_TYPE_GRE: 905 port = 0; 906 break; 907 default: 908 return; 909 } 910 break; 911 default: 912 return; 913 } 914 915 /* 916 * Hash by proto,addr,port, 917 * which are the parameters of the real service. 918 */ 919 hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port); 920 921 hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); 922 dest->in_rs_table = 1; 923 } 924 925 /* Unhash ip_vs_dest from rs_table. */ 926 static void ip_vs_rs_unhash(struct ip_vs_dest *dest) 927 { 928 /* 929 * Remove it from the rs_table table. 930 */ 931 if (dest->in_rs_table) { 932 hlist_del_rcu(&dest->d_list); 933 dest->in_rs_table = 0; 934 } 935 } 936 937 /* Check if real service by <proto,addr,port> is present */ 938 bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, 939 const union nf_inet_addr *daddr, __be16 dport) 940 { 941 unsigned int hash; 942 struct ip_vs_dest *dest; 943 944 /* Check for "full" addressed entries */ 945 hash = ip_vs_rs_hashkey(af, daddr, dport); 946 947 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { 948 if (dest->port == dport && 949 dest->af == af && 950 ip_vs_addr_equal(af, &dest->addr, daddr) && 951 (dest->protocol == protocol || dest->vfwmark) && 952 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { 953 /* HIT */ 954 return true; 955 } 956 } 957 958 return false; 959 } 960 961 /* Find real service record by <proto,addr,port>. 962 * In case of multiple records with the same <proto,addr,port>, only 963 * the first found record is returned. 964 * 965 * To be called under RCU lock. 966 */ 967 struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, 968 __u16 protocol, 969 const union nf_inet_addr *daddr, 970 __be16 dport) 971 { 972 unsigned int hash; 973 struct ip_vs_dest *dest; 974 975 /* Check for "full" addressed entries */ 976 hash = ip_vs_rs_hashkey(af, daddr, dport); 977 978 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { 979 if (dest->port == dport && 980 dest->af == af && 981 ip_vs_addr_equal(af, &dest->addr, daddr) && 982 (dest->protocol == protocol || dest->vfwmark) && 983 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { 984 /* HIT */ 985 return dest; 986 } 987 } 988 989 return NULL; 990 } 991 992 /* Find real service record by <af,addr,tun_port>. 993 * In case of multiple records with the same <af,addr,tun_port>, only 994 * the first found record is returned. 995 * 996 * To be called under RCU lock. 997 */ 998 struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, 999 const union nf_inet_addr *daddr, 1000 __be16 tun_port) 1001 { 1002 struct ip_vs_dest *dest; 1003 unsigned int hash; 1004 1005 /* Check for "full" addressed entries */ 1006 hash = ip_vs_rs_hashkey(af, daddr, tun_port); 1007 1008 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { 1009 if (dest->tun_port == tun_port && 1010 dest->af == af && 1011 ip_vs_addr_equal(af, &dest->addr, daddr) && 1012 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) { 1013 /* HIT */ 1014 return dest; 1015 } 1016 } 1017 1018 return NULL; 1019 } 1020 1021 /* Lookup destination by {addr,port} in the given service 1022 * Called under RCU lock. 1023 */ 1024 static struct ip_vs_dest * 1025 ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af, 1026 const union nf_inet_addr *daddr, __be16 dport) 1027 { 1028 struct ip_vs_dest *dest; 1029 1030 /* 1031 * Find the destination for the given service 1032 */ 1033 list_for_each_entry_rcu(dest, &svc->destinations, n_list) { 1034 if ((dest->af == dest_af) && 1035 ip_vs_addr_equal(dest_af, &dest->addr, daddr) && 1036 (dest->port == dport)) { 1037 /* HIT */ 1038 return dest; 1039 } 1040 } 1041 1042 return NULL; 1043 } 1044 1045 /* 1046 * Find destination by {daddr,dport,vaddr,protocol} 1047 * Created to be used in ip_vs_process_message() in 1048 * the backup synchronization daemon. It finds the 1049 * destination to be bound to the received connection 1050 * on the backup. 1051 * Called under RCU lock, no refcnt is returned. 1052 */ 1053 struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af, 1054 const union nf_inet_addr *daddr, 1055 __be16 dport, 1056 const union nf_inet_addr *vaddr, 1057 __be16 vport, __u16 protocol, __u32 fwmark, 1058 __u32 flags) 1059 { 1060 struct ip_vs_dest *dest; 1061 struct ip_vs_service *svc; 1062 __be16 port = dport; 1063 1064 svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport); 1065 if (!svc) 1066 return NULL; 1067 if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) 1068 port = 0; 1069 dest = ip_vs_lookup_dest(svc, dest_af, daddr, port); 1070 if (!dest) 1071 dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport); 1072 return dest; 1073 } 1074 1075 void ip_vs_dest_dst_rcu_free(struct rcu_head *head) 1076 { 1077 struct ip_vs_dest_dst *dest_dst = container_of(head, 1078 struct ip_vs_dest_dst, 1079 rcu_head); 1080 1081 dst_release(dest_dst->dst_cache); 1082 kfree(dest_dst); 1083 } 1084 1085 /* Release dest_dst and dst_cache for dest in user context */ 1086 static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) 1087 { 1088 struct ip_vs_dest_dst *old; 1089 1090 old = rcu_dereference_protected(dest->dest_dst, 1); 1091 if (old) { 1092 RCU_INIT_POINTER(dest->dest_dst, NULL); 1093 call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); 1094 } 1095 } 1096 1097 /* 1098 * Lookup dest by {svc,addr,port} in the destination trash. 1099 * The destination trash is used to hold the destinations that are removed 1100 * from the service table but are still referenced by some conn entries. 1101 * The reason to add the destination trash is when the dest is temporary 1102 * down (either by administrator or by monitor program), the dest can be 1103 * picked back from the trash, the remaining connections to the dest can 1104 * continue, and the counting information of the dest is also useful for 1105 * scheduling. 1106 */ 1107 static struct ip_vs_dest * 1108 ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, 1109 const union nf_inet_addr *daddr, __be16 dport) 1110 { 1111 struct ip_vs_dest *dest; 1112 struct netns_ipvs *ipvs = svc->ipvs; 1113 1114 /* 1115 * Find the destination in trash 1116 */ 1117 spin_lock_bh(&ipvs->dest_trash_lock); 1118 list_for_each_entry(dest, &ipvs->dest_trash, t_list) { 1119 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " 1120 "dest->refcnt=%d\n", 1121 dest->vfwmark, 1122 IP_VS_DBG_ADDR(dest->af, &dest->addr), 1123 ntohs(dest->port), 1124 refcount_read(&dest->refcnt)); 1125 if (dest->af == dest_af && 1126 ip_vs_addr_equal(dest_af, &dest->addr, daddr) && 1127 dest->port == dport && 1128 dest->vfwmark == svc->fwmark && 1129 dest->protocol == svc->protocol && 1130 (svc->fwmark || 1131 (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && 1132 dest->vport == svc->port))) { 1133 /* HIT */ 1134 list_del(&dest->t_list); 1135 goto out; 1136 } 1137 } 1138 1139 dest = NULL; 1140 1141 out: 1142 spin_unlock_bh(&ipvs->dest_trash_lock); 1143 1144 return dest; 1145 } 1146 1147 /* Put destination in trash */ 1148 static void ip_vs_trash_put_dest(struct netns_ipvs *ipvs, 1149 struct ip_vs_dest *dest, unsigned long istart, 1150 bool cleanup) 1151 { 1152 spin_lock_bh(&ipvs->dest_trash_lock); 1153 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", 1154 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), 1155 refcount_read(&dest->refcnt)); 1156 if (list_empty(&ipvs->dest_trash) && !cleanup) 1157 mod_timer(&ipvs->dest_trash_timer, 1158 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); 1159 /* dest lives in trash with reference */ 1160 list_add(&dest->t_list, &ipvs->dest_trash); 1161 dest->idle_start = istart; 1162 spin_unlock_bh(&ipvs->dest_trash_lock); 1163 } 1164 1165 static void ip_vs_dest_rcu_free(struct rcu_head *head) 1166 { 1167 struct ip_vs_dest *dest; 1168 1169 dest = container_of(head, struct ip_vs_dest, rcu_head); 1170 ip_vs_stats_release(&dest->stats); 1171 ip_vs_dest_put_and_free(dest); 1172 } 1173 1174 static void ip_vs_dest_free(struct ip_vs_dest *dest) 1175 { 1176 struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); 1177 1178 __ip_vs_svc_put(svc); 1179 call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free); 1180 } 1181 1182 /* 1183 * Clean up all the destinations in the trash 1184 * Called by the ip_vs_control_cleanup() 1185 * 1186 * When the ip_vs_control_clearup is activated by ipvs module exit, 1187 * the service tables must have been flushed and all the connections 1188 * are expired, and the refcnt of each destination in the trash must 1189 * be 1, so we simply release them here. 1190 */ 1191 static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) 1192 { 1193 struct ip_vs_dest *dest, *nxt; 1194 1195 timer_delete_sync(&ipvs->dest_trash_timer); 1196 /* No need to use dest_trash_lock */ 1197 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { 1198 list_del(&dest->t_list); 1199 ip_vs_dest_free(dest); 1200 } 1201 } 1202 1203 static void ip_vs_stats_rcu_free(struct rcu_head *head) 1204 { 1205 struct ip_vs_stats_rcu *rs = container_of(head, 1206 struct ip_vs_stats_rcu, 1207 rcu_head); 1208 1209 ip_vs_stats_release(&rs->s); 1210 kfree(rs); 1211 } 1212 1213 static void 1214 ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) 1215 { 1216 #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c 1217 1218 spin_lock(&src->lock); 1219 1220 IP_VS_SHOW_STATS_COUNTER(conns); 1221 IP_VS_SHOW_STATS_COUNTER(inpkts); 1222 IP_VS_SHOW_STATS_COUNTER(outpkts); 1223 IP_VS_SHOW_STATS_COUNTER(inbytes); 1224 IP_VS_SHOW_STATS_COUNTER(outbytes); 1225 1226 ip_vs_read_estimator(dst, src); 1227 1228 spin_unlock(&src->lock); 1229 } 1230 1231 static void 1232 ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src) 1233 { 1234 dst->conns = (u32)src->conns; 1235 dst->inpkts = (u32)src->inpkts; 1236 dst->outpkts = (u32)src->outpkts; 1237 dst->inbytes = src->inbytes; 1238 dst->outbytes = src->outbytes; 1239 dst->cps = (u32)src->cps; 1240 dst->inpps = (u32)src->inpps; 1241 dst->outpps = (u32)src->outpps; 1242 dst->inbps = (u32)src->inbps; 1243 dst->outbps = (u32)src->outbps; 1244 } 1245 1246 static void 1247 ip_vs_zero_stats(struct ip_vs_stats *stats) 1248 { 1249 spin_lock(&stats->lock); 1250 1251 /* get current counters as zero point, rates are zeroed */ 1252 1253 #define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c 1254 1255 IP_VS_ZERO_STATS_COUNTER(conns); 1256 IP_VS_ZERO_STATS_COUNTER(inpkts); 1257 IP_VS_ZERO_STATS_COUNTER(outpkts); 1258 IP_VS_ZERO_STATS_COUNTER(inbytes); 1259 IP_VS_ZERO_STATS_COUNTER(outbytes); 1260 1261 ip_vs_zero_estimator(stats); 1262 1263 spin_unlock(&stats->lock); 1264 } 1265 1266 /* Allocate fields after kzalloc */ 1267 int ip_vs_stats_init_alloc(struct ip_vs_stats *s) 1268 { 1269 int i; 1270 1271 spin_lock_init(&s->lock); 1272 s->cpustats = alloc_percpu(struct ip_vs_cpu_stats); 1273 if (!s->cpustats) 1274 return -ENOMEM; 1275 1276 for_each_possible_cpu(i) { 1277 struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i); 1278 1279 u64_stats_init(&cs->syncp); 1280 } 1281 return 0; 1282 } 1283 1284 struct ip_vs_stats *ip_vs_stats_alloc(void) 1285 { 1286 struct ip_vs_stats *s = kzalloc_obj(*s); 1287 1288 if (s && ip_vs_stats_init_alloc(s) >= 0) 1289 return s; 1290 kfree(s); 1291 return NULL; 1292 } 1293 1294 void ip_vs_stats_release(struct ip_vs_stats *stats) 1295 { 1296 free_percpu(stats->cpustats); 1297 } 1298 1299 void ip_vs_stats_free(struct ip_vs_stats *stats) 1300 { 1301 if (stats) { 1302 ip_vs_stats_release(stats); 1303 kfree(stats); 1304 } 1305 } 1306 1307 /* 1308 * Update a destination in the given service 1309 */ 1310 static void 1311 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, 1312 struct ip_vs_dest_user_kern *udest, int add) 1313 { 1314 struct netns_ipvs *ipvs = svc->ipvs; 1315 struct ip_vs_service *old_svc; 1316 struct ip_vs_scheduler *sched; 1317 int conn_flags; 1318 1319 /* We cannot modify an address and change the address family */ 1320 BUG_ON(!add && udest->af != dest->af); 1321 1322 if (add && udest->af != svc->af) 1323 ipvs->mixed_address_family_dests++; 1324 1325 /* keep the last_weight with latest non-0 weight */ 1326 if (add || udest->weight != 0) 1327 atomic_set(&dest->last_weight, udest->weight); 1328 1329 /* set the weight and the flags */ 1330 atomic_set(&dest->weight, udest->weight); 1331 conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; 1332 conn_flags |= IP_VS_CONN_F_INACTIVE; 1333 1334 /* Need to rehash? */ 1335 if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) != 1336 IP_VS_DFWD_METHOD(dest) || 1337 udest->tun_type != dest->tun_type || 1338 udest->tun_port != dest->tun_port) 1339 ip_vs_rs_unhash(dest); 1340 1341 /* set the tunnel info */ 1342 dest->tun_type = udest->tun_type; 1343 dest->tun_port = udest->tun_port; 1344 dest->tun_flags = udest->tun_flags; 1345 1346 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ 1347 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { 1348 conn_flags |= IP_VS_CONN_F_NOOUTPUT; 1349 } else { 1350 /* FTP-NAT requires conntrack for mangling */ 1351 if (svc->port == FTPPORT) 1352 ip_vs_register_conntrack(svc); 1353 } 1354 atomic_set(&dest->conn_flags, conn_flags); 1355 /* Put the real service in rs_table if not present. */ 1356 ip_vs_rs_hash(ipvs, dest); 1357 1358 /* bind the service */ 1359 old_svc = rcu_dereference_protected(dest->svc, 1); 1360 if (!old_svc) { 1361 __ip_vs_bind_svc(dest, svc); 1362 } else { 1363 if (old_svc != svc) { 1364 ip_vs_zero_stats(&dest->stats); 1365 __ip_vs_bind_svc(dest, svc); 1366 __ip_vs_svc_put(old_svc); 1367 } 1368 } 1369 1370 /* set the dest status flags */ 1371 dest->flags |= IP_VS_DEST_F_AVAILABLE; 1372 1373 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) 1374 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 1375 dest->u_threshold = udest->u_threshold; 1376 dest->l_threshold = udest->l_threshold; 1377 1378 dest->af = udest->af; 1379 1380 if (add) { 1381 list_add_rcu(&dest->n_list, &svc->destinations); 1382 svc->num_dests++; 1383 sched = rcu_dereference_protected(svc->scheduler, 1); 1384 if (sched && sched->add_dest) 1385 sched->add_dest(svc, dest); 1386 } else { 1387 spin_lock_bh(&dest->dst_lock); 1388 __ip_vs_dst_cache_reset(dest); 1389 spin_unlock_bh(&dest->dst_lock); 1390 1391 sched = rcu_dereference_protected(svc->scheduler, 1); 1392 if (sched && sched->upd_dest) 1393 sched->upd_dest(svc, dest); 1394 } 1395 } 1396 1397 1398 /* 1399 * Create a destination for the given service 1400 */ 1401 static int 1402 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1403 { 1404 struct ip_vs_dest *dest; 1405 unsigned int atype; 1406 int ret; 1407 1408 #ifdef CONFIG_IP_VS_IPV6 1409 if (udest->af == AF_INET6) { 1410 atype = ipv6_addr_type(&udest->addr.in6); 1411 if ((!(atype & IPV6_ADDR_UNICAST) || 1412 atype & IPV6_ADDR_LINKLOCAL) && 1413 !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6)) 1414 return -EINVAL; 1415 1416 ret = nf_defrag_ipv6_enable(svc->ipvs->net); 1417 if (ret) 1418 return ret; 1419 } else 1420 #endif 1421 { 1422 atype = inet_addr_type(svc->ipvs->net, udest->addr.ip); 1423 if (atype != RTN_LOCAL && atype != RTN_UNICAST) 1424 return -EINVAL; 1425 } 1426 1427 dest = kzalloc_obj(struct ip_vs_dest); 1428 if (dest == NULL) 1429 return -ENOMEM; 1430 1431 ret = ip_vs_stats_init_alloc(&dest->stats); 1432 if (ret < 0) 1433 goto err_alloc; 1434 1435 ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); 1436 if (ret < 0) 1437 goto err_stats; 1438 1439 dest->af = udest->af; 1440 dest->protocol = svc->protocol; 1441 dest->vaddr = svc->addr; 1442 dest->vport = svc->port; 1443 dest->vfwmark = svc->fwmark; 1444 ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr); 1445 dest->port = udest->port; 1446 1447 atomic_set(&dest->activeconns, 0); 1448 atomic_set(&dest->inactconns, 0); 1449 atomic_set(&dest->persistconns, 0); 1450 refcount_set(&dest->refcnt, 1); 1451 1452 INIT_HLIST_NODE(&dest->d_list); 1453 spin_lock_init(&dest->dst_lock); 1454 __ip_vs_update_dest(svc, dest, udest, 1); 1455 1456 return 0; 1457 1458 err_stats: 1459 ip_vs_stats_release(&dest->stats); 1460 1461 err_alloc: 1462 kfree(dest); 1463 return ret; 1464 } 1465 1466 1467 /* 1468 * Add a destination into an existing service 1469 */ 1470 static int 1471 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1472 { 1473 struct ip_vs_dest *dest; 1474 union nf_inet_addr daddr; 1475 __be16 dport = udest->port; 1476 int ret; 1477 1478 if (udest->weight < 0) { 1479 pr_err("%s(): server weight less than zero\n", __func__); 1480 return -ERANGE; 1481 } 1482 1483 if (udest->l_threshold > udest->u_threshold) { 1484 pr_err("%s(): lower threshold is higher than upper threshold\n", 1485 __func__); 1486 return -ERANGE; 1487 } 1488 1489 if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1490 if (udest->tun_port == 0) { 1491 pr_err("%s(): tunnel port is zero\n", __func__); 1492 return -EINVAL; 1493 } 1494 } 1495 1496 ip_vs_addr_copy(udest->af, &daddr, &udest->addr); 1497 1498 /* We use function that requires RCU lock */ 1499 rcu_read_lock(); 1500 dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); 1501 rcu_read_unlock(); 1502 1503 if (dest != NULL) { 1504 IP_VS_DBG(1, "%s(): dest already exists\n", __func__); 1505 return -EEXIST; 1506 } 1507 1508 /* 1509 * Check if the dest already exists in the trash and 1510 * is from the same service 1511 */ 1512 dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport); 1513 1514 if (dest != NULL) { 1515 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " 1516 "dest->refcnt=%d, service %u/%s:%u\n", 1517 IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), 1518 refcount_read(&dest->refcnt), 1519 dest->vfwmark, 1520 IP_VS_DBG_ADDR(svc->af, &dest->vaddr), 1521 ntohs(dest->vport)); 1522 1523 ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); 1524 /* On error put back dest into the trash */ 1525 if (ret < 0) 1526 ip_vs_trash_put_dest(svc->ipvs, dest, dest->idle_start, 1527 false); 1528 else 1529 __ip_vs_update_dest(svc, dest, udest, 1); 1530 } else { 1531 /* 1532 * Allocate and initialize the dest structure 1533 */ 1534 ret = ip_vs_new_dest(svc, udest); 1535 } 1536 1537 return ret; 1538 } 1539 1540 1541 /* 1542 * Edit a destination in the given service 1543 */ 1544 static int 1545 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1546 { 1547 struct ip_vs_dest *dest; 1548 union nf_inet_addr daddr; 1549 __be16 dport = udest->port; 1550 1551 if (udest->weight < 0) { 1552 pr_err("%s(): server weight less than zero\n", __func__); 1553 return -ERANGE; 1554 } 1555 1556 if (udest->l_threshold > udest->u_threshold) { 1557 pr_err("%s(): lower threshold is higher than upper threshold\n", 1558 __func__); 1559 return -ERANGE; 1560 } 1561 1562 if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1563 if (udest->tun_port == 0) { 1564 pr_err("%s(): tunnel port is zero\n", __func__); 1565 return -EINVAL; 1566 } 1567 } 1568 1569 ip_vs_addr_copy(udest->af, &daddr, &udest->addr); 1570 1571 /* We use function that requires RCU lock */ 1572 rcu_read_lock(); 1573 dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); 1574 rcu_read_unlock(); 1575 1576 if (dest == NULL) { 1577 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); 1578 return -ENOENT; 1579 } 1580 1581 __ip_vs_update_dest(svc, dest, udest, 0); 1582 1583 return 0; 1584 } 1585 1586 /* 1587 * Delete a destination (must be already unlinked from the service) 1588 */ 1589 static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, 1590 bool cleanup) 1591 { 1592 ip_vs_stop_estimator(ipvs, &dest->stats); 1593 1594 /* 1595 * Remove it from the d-linked list with the real services. 1596 */ 1597 ip_vs_rs_unhash(dest); 1598 1599 ip_vs_trash_put_dest(ipvs, dest, 0, cleanup); 1600 1601 /* Queue up delayed work to expire all no destination connections. 1602 * No-op when CONFIG_SYSCTL is disabled. 1603 */ 1604 if (!cleanup) 1605 ip_vs_enqueue_expire_nodest_conns(ipvs); 1606 } 1607 1608 1609 /* 1610 * Unlink a destination from the given service 1611 */ 1612 static void __ip_vs_unlink_dest(struct ip_vs_service *svc, 1613 struct ip_vs_dest *dest, 1614 int svcupd) 1615 { 1616 dest->flags &= ~IP_VS_DEST_F_AVAILABLE; 1617 1618 spin_lock_bh(&dest->dst_lock); 1619 __ip_vs_dst_cache_reset(dest); 1620 spin_unlock_bh(&dest->dst_lock); 1621 1622 /* 1623 * Remove it from the d-linked destination list. 1624 */ 1625 list_del_rcu(&dest->n_list); 1626 svc->num_dests--; 1627 1628 if (dest->af != svc->af) 1629 svc->ipvs->mixed_address_family_dests--; 1630 1631 if (svcupd) { 1632 struct ip_vs_scheduler *sched; 1633 1634 sched = rcu_dereference_protected(svc->scheduler, 1); 1635 if (sched && sched->del_dest) 1636 sched->del_dest(svc, dest); 1637 } 1638 } 1639 1640 1641 /* 1642 * Delete a destination server in the given service 1643 */ 1644 static int 1645 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1646 { 1647 struct ip_vs_dest *dest; 1648 __be16 dport = udest->port; 1649 1650 /* We use function that requires RCU lock */ 1651 rcu_read_lock(); 1652 dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); 1653 rcu_read_unlock(); 1654 1655 if (dest == NULL) { 1656 IP_VS_DBG(1, "%s(): destination not found!\n", __func__); 1657 return -ENOENT; 1658 } 1659 1660 /* 1661 * Unlink dest from the service 1662 */ 1663 __ip_vs_unlink_dest(svc, dest, 1); 1664 1665 /* 1666 * Delete the destination 1667 */ 1668 __ip_vs_del_dest(svc->ipvs, dest, false); 1669 1670 return 0; 1671 } 1672 1673 static void ip_vs_dest_trash_expire(struct timer_list *t) 1674 { 1675 struct netns_ipvs *ipvs = timer_container_of(ipvs, t, 1676 dest_trash_timer); 1677 struct ip_vs_dest *dest, *next; 1678 unsigned long now = jiffies; 1679 1680 spin_lock(&ipvs->dest_trash_lock); 1681 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { 1682 if (refcount_read(&dest->refcnt) > 1) 1683 continue; 1684 if (dest->idle_start) { 1685 if (time_before(now, dest->idle_start + 1686 IP_VS_DEST_TRASH_PERIOD)) 1687 continue; 1688 } else { 1689 dest->idle_start = max(1UL, now); 1690 continue; 1691 } 1692 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", 1693 dest->vfwmark, 1694 IP_VS_DBG_ADDR(dest->af, &dest->addr), 1695 ntohs(dest->port)); 1696 list_del(&dest->t_list); 1697 ip_vs_dest_free(dest); 1698 } 1699 if (!list_empty(&ipvs->dest_trash)) 1700 mod_timer(&ipvs->dest_trash_timer, 1701 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); 1702 spin_unlock(&ipvs->dest_trash_lock); 1703 } 1704 1705 /* 1706 * Add a service into the service hash table 1707 */ 1708 static int 1709 ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, 1710 struct ip_vs_service **svc_p) 1711 { 1712 struct ip_vs_scheduler *sched = NULL; 1713 struct ip_vs_rht *tc_new = NULL; 1714 struct ip_vs_rht *t, *t_new = NULL; 1715 int af_id = ip_vs_af_index(u->af); 1716 struct ip_vs_service *svc = NULL; 1717 struct ip_vs_pe *pe = NULL; 1718 int ret_hooks = -1; 1719 int ret = 0; 1720 bool grow; 1721 1722 /* increase the module use count */ 1723 if (!ip_vs_use_count_inc()) 1724 return -ENOPROTOOPT; 1725 1726 /* Lookup the scheduler by 'u->sched_name' */ 1727 if (strcmp(u->sched_name, "none")) { 1728 sched = ip_vs_scheduler_get(u->sched_name); 1729 if (!sched) { 1730 pr_info("Scheduler module ip_vs_%s not found\n", 1731 u->sched_name); 1732 ret = -ENOENT; 1733 goto out_err; 1734 } 1735 } 1736 1737 if (u->pe_name && *u->pe_name) { 1738 pe = ip_vs_pe_getbyname(u->pe_name); 1739 if (pe == NULL) { 1740 pr_info("persistence engine module ip_vs_pe_%s " 1741 "not found\n", u->pe_name); 1742 ret = -ENOENT; 1743 goto out_err; 1744 } 1745 } 1746 1747 #ifdef CONFIG_IP_VS_IPV6 1748 if (u->af == AF_INET6) { 1749 __u32 plen = (__force __u32) u->netmask; 1750 1751 if (plen < 1 || plen > 128) { 1752 ret = -EINVAL; 1753 goto out_err; 1754 } 1755 1756 ret = nf_defrag_ipv6_enable(ipvs->net); 1757 if (ret) 1758 goto out_err; 1759 } 1760 #endif 1761 1762 /* The old table can be freed, protect it with RCU */ 1763 rcu_read_lock(); 1764 t = rcu_dereference(ipvs->svc_table); 1765 if (!t) { 1766 int lfactor = sysctl_svc_lfactor(ipvs); 1767 int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor); 1768 1769 rcu_read_unlock(); 1770 t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); 1771 if (!t_new) { 1772 ret = -ENOMEM; 1773 goto out_err; 1774 } 1775 grow = false; 1776 } else { 1777 /* Even the currently attached new table may need to grow */ 1778 t = rcu_dereference(t->new_tbl); 1779 grow = ip_vs_get_num_services(ipvs) + 1 > t->u_thresh; 1780 rcu_read_unlock(); 1781 } 1782 1783 if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { 1784 int lfactor = sysctl_conn_lfactor(ipvs); 1785 int new_size = ip_vs_conn_desired_size(ipvs, NULL, lfactor); 1786 1787 tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); 1788 if (!tc_new) { 1789 ret = -ENOMEM; 1790 goto out_err; 1791 } 1792 } 1793 1794 if (!atomic_read(&ipvs->num_services[af_id])) { 1795 ret = ip_vs_register_hooks(ipvs, u->af); 1796 if (ret < 0) 1797 goto out_err; 1798 ret_hooks = ret; 1799 } 1800 1801 svc = kzalloc_obj(struct ip_vs_service); 1802 if (svc == NULL) { 1803 IP_VS_DBG(1, "%s(): no memory\n", __func__); 1804 ret = -ENOMEM; 1805 goto out_err; 1806 } 1807 ret = ip_vs_stats_init_alloc(&svc->stats); 1808 if (ret < 0) 1809 goto out_err; 1810 1811 /* I'm the first user of the service */ 1812 atomic_set(&svc->refcnt, 0); 1813 1814 svc->af = u->af; 1815 svc->protocol = u->protocol; 1816 ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); 1817 svc->port = u->port; 1818 svc->fwmark = u->fwmark; 1819 svc->flags = u->flags & ~IP_VS_SVC_F_HASHED; 1820 svc->timeout = u->timeout * HZ; 1821 svc->netmask = u->netmask; 1822 svc->ipvs = ipvs; 1823 1824 INIT_LIST_HEAD(&svc->destinations); 1825 spin_lock_init(&svc->sched_lock); 1826 1827 /* Bind the scheduler */ 1828 if (sched) { 1829 ret = ip_vs_bind_scheduler(svc, sched); 1830 if (ret) 1831 goto out_err; 1832 } 1833 1834 ret = ip_vs_start_estimator(ipvs, &svc->stats); 1835 if (ret < 0) 1836 goto out_err; 1837 1838 if (t_new) { 1839 /* Add table for first time */ 1840 clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); 1841 rcu_assign_pointer(ipvs->svc_table, t_new); 1842 t_new = NULL; 1843 } 1844 if (tc_new) { 1845 rcu_assign_pointer(ipvs->conn_tab, tc_new); 1846 tc_new = NULL; 1847 } 1848 1849 /* Update the virtual service counters */ 1850 if (svc->port == FTPPORT) 1851 atomic_inc(&ipvs->ftpsvc_counter[af_id]); 1852 else if (!svc->port && !svc->fwmark) 1853 atomic_inc(&ipvs->nullsvc_counter[af_id]); 1854 if (pe && pe->conn_out) 1855 atomic_inc(&ipvs->conn_out_counter[af_id]); 1856 1857 /* Bind the ct retriever */ 1858 RCU_INIT_POINTER(svc->pe, pe); 1859 pe = NULL; 1860 1861 if (svc->fwmark) 1862 atomic_inc(&ipvs->fwm_services[af_id]); 1863 else 1864 atomic_inc(&ipvs->nonfwm_services[af_id]); 1865 atomic_inc(&ipvs->num_services[af_id]); 1866 1867 /* Hash the service into the service table */ 1868 ip_vs_svc_hash(svc); 1869 1870 /* Schedule resize work */ 1871 if (grow && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) 1872 queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1873 1); 1874 1875 *svc_p = svc; 1876 1877 if (!READ_ONCE(ipvs->enable)) { 1878 mutex_lock(&ipvs->est_mutex); 1879 1880 /* Now there is a service - full throttle */ 1881 WRITE_ONCE(ipvs->enable, 1); 1882 1883 ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); 1884 1885 /* Start estimation for first time */ 1886 ip_vs_est_reload_start(ipvs, true); 1887 mutex_unlock(&ipvs->est_mutex); 1888 } 1889 1890 return 0; 1891 1892 1893 out_err: 1894 if (tc_new) 1895 ip_vs_rht_free(tc_new); 1896 if (t_new) 1897 ip_vs_rht_free(t_new); 1898 if (ret_hooks >= 0) 1899 ip_vs_unregister_hooks(ipvs, u->af); 1900 if (svc != NULL) { 1901 ip_vs_unbind_scheduler(svc); 1902 ip_vs_service_free(svc); 1903 } 1904 ip_vs_scheduler_put(sched); 1905 ip_vs_pe_put(pe); 1906 1907 /* decrease the module use count */ 1908 ip_vs_use_count_dec(); 1909 1910 return ret; 1911 } 1912 1913 1914 /* 1915 * Edit a service and bind it with a new scheduler 1916 */ 1917 static int 1918 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) 1919 { 1920 struct ip_vs_scheduler *sched = NULL, *old_sched; 1921 struct ip_vs_pe *pe = NULL, *old_pe = NULL; 1922 int ret = 0; 1923 bool new_pe_conn_out, old_pe_conn_out; 1924 struct netns_ipvs *ipvs = svc->ipvs; 1925 int af_id = ip_vs_af_index(svc->af); 1926 1927 /* 1928 * Lookup the scheduler, by 'u->sched_name' 1929 */ 1930 if (strcmp(u->sched_name, "none")) { 1931 sched = ip_vs_scheduler_get(u->sched_name); 1932 if (!sched) { 1933 pr_info("Scheduler module ip_vs_%s not found\n", 1934 u->sched_name); 1935 return -ENOENT; 1936 } 1937 } 1938 old_sched = sched; 1939 1940 if (u->pe_name && *u->pe_name) { 1941 pe = ip_vs_pe_getbyname(u->pe_name); 1942 if (pe == NULL) { 1943 pr_info("persistence engine module ip_vs_pe_%s " 1944 "not found\n", u->pe_name); 1945 ret = -ENOENT; 1946 goto out; 1947 } 1948 old_pe = pe; 1949 } 1950 1951 #ifdef CONFIG_IP_VS_IPV6 1952 if (u->af == AF_INET6) { 1953 __u32 plen = (__force __u32) u->netmask; 1954 1955 if (plen < 1 || plen > 128) { 1956 ret = -EINVAL; 1957 goto out; 1958 } 1959 } 1960 #endif 1961 1962 old_sched = rcu_dereference_protected(svc->scheduler, 1); 1963 if (sched != old_sched) { 1964 if (old_sched) { 1965 ip_vs_unbind_scheduler(svc); 1966 /* Wait all svc->scheduler/sched_data users */ 1967 synchronize_rcu(); 1968 } 1969 /* Bind the new scheduler */ 1970 if (sched) { 1971 ret = ip_vs_bind_scheduler(svc, sched); 1972 if (ret) { 1973 ip_vs_scheduler_put(sched); 1974 /* Try to restore the old_sched */ 1975 if (old_sched && 1976 !ip_vs_bind_scheduler(svc, old_sched)) 1977 old_sched = NULL; 1978 goto out; 1979 } 1980 } 1981 } 1982 1983 /* 1984 * Set the flags and timeout value 1985 */ 1986 svc->flags = u->flags | IP_VS_SVC_F_HASHED; 1987 svc->timeout = u->timeout * HZ; 1988 svc->netmask = u->netmask; 1989 1990 old_pe = rcu_dereference_protected(svc->pe, 1); 1991 if (pe != old_pe) { 1992 rcu_assign_pointer(svc->pe, pe); 1993 /* check for optional methods in new pe */ 1994 new_pe_conn_out = (pe && pe->conn_out) ? true : false; 1995 old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false; 1996 if (new_pe_conn_out && !old_pe_conn_out) 1997 atomic_inc(&ipvs->conn_out_counter[af_id]); 1998 if (old_pe_conn_out && !new_pe_conn_out) 1999 atomic_dec(&ipvs->conn_out_counter[af_id]); 2000 } 2001 2002 out: 2003 ip_vs_scheduler_put(old_sched); 2004 ip_vs_pe_put(old_pe); 2005 return ret; 2006 } 2007 2008 /* 2009 * Delete a service from the service list 2010 * - The service must be unlinked, unlocked and not referenced! 2011 * - We are called under _bh lock 2012 */ 2013 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) 2014 { 2015 struct ip_vs_dest *dest, *nxt; 2016 struct ip_vs_scheduler *old_sched; 2017 struct ip_vs_pe *old_pe; 2018 struct netns_ipvs *ipvs = svc->ipvs; 2019 int af_id = ip_vs_af_index(svc->af); 2020 2021 atomic_dec(&ipvs->num_services[af_id]); 2022 if (!atomic_read(&ipvs->num_services[af_id])) 2023 ip_vs_unregister_hooks(ipvs, svc->af); 2024 if (svc->fwmark) 2025 atomic_dec(&ipvs->fwm_services[af_id]); 2026 else 2027 atomic_dec(&ipvs->nonfwm_services[af_id]); 2028 2029 ip_vs_stop_estimator(svc->ipvs, &svc->stats); 2030 2031 /* Unbind scheduler */ 2032 old_sched = rcu_dereference_protected(svc->scheduler, 1); 2033 ip_vs_unbind_scheduler(svc); 2034 ip_vs_scheduler_put(old_sched); 2035 2036 /* Unbind persistence engine, keep svc->pe */ 2037 old_pe = rcu_dereference_protected(svc->pe, 1); 2038 if (old_pe && old_pe->conn_out) 2039 atomic_dec(&ipvs->conn_out_counter[af_id]); 2040 ip_vs_pe_put(old_pe); 2041 2042 /* 2043 * Unlink the whole destination list 2044 */ 2045 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { 2046 __ip_vs_unlink_dest(svc, dest, 0); 2047 __ip_vs_del_dest(svc->ipvs, dest, cleanup); 2048 } 2049 2050 /* 2051 * Update the virtual service counters 2052 */ 2053 if (svc->port == FTPPORT) 2054 atomic_dec(&ipvs->ftpsvc_counter[af_id]); 2055 else if (!svc->port && !svc->fwmark) 2056 atomic_dec(&ipvs->nullsvc_counter[af_id]); 2057 2058 /* 2059 * Free the service if nobody refers to it 2060 */ 2061 __ip_vs_svc_put(svc); 2062 2063 /* decrease the module use count */ 2064 ip_vs_use_count_dec(); 2065 } 2066 2067 /* 2068 * Unlink a service from list and try to delete it if its refcnt reached 0 2069 */ 2070 static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) 2071 { 2072 ip_vs_unregister_conntrack(svc); 2073 /* Hold svc to avoid double release from dest_trash */ 2074 atomic_inc(&svc->refcnt); 2075 /* 2076 * Unhash it from the service table 2077 */ 2078 ip_vs_svc_unhash(svc); 2079 2080 __ip_vs_del_service(svc, cleanup); 2081 } 2082 2083 /* 2084 * Delete a service from the service list 2085 */ 2086 static int ip_vs_del_service(struct ip_vs_service *svc) 2087 { 2088 struct netns_ipvs *ipvs; 2089 struct ip_vs_rht *t, *p; 2090 int ns; 2091 2092 if (svc == NULL) 2093 return -EEXIST; 2094 ipvs = svc->ipvs; 2095 ip_vs_unlink_service(svc, false); 2096 2097 /* Drop the table if no more services */ 2098 ns = ip_vs_get_num_services(ipvs); 2099 if (!ns) { 2100 /* Stop the resizer and drop the tables */ 2101 set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); 2102 cancel_delayed_work_sync(&ipvs->svc_resize_work); 2103 t = rcu_dereference_protected(ipvs->svc_table, 1); 2104 if (t) { 2105 rcu_assign_pointer(ipvs->svc_table, NULL); 2106 /* Inform readers that table is removed */ 2107 smp_mb__before_atomic(); 2108 atomic_inc(&ipvs->svc_table_changes); 2109 while (1) { 2110 p = rcu_dereference_protected(t->new_tbl, 1); 2111 call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); 2112 if (p == t) 2113 break; 2114 t = p; 2115 } 2116 } 2117 } else { 2118 bool shrink; 2119 2120 rcu_read_lock(); 2121 t = rcu_dereference(ipvs->svc_table); 2122 /* Even the currently attached new table may need to shrink */ 2123 t = rcu_dereference(t->new_tbl); 2124 shrink = ns <= t->l_thresh; 2125 rcu_read_unlock(); 2126 if (shrink && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, 2127 &ipvs->work_flags)) 2128 queue_delayed_work(system_unbound_wq, 2129 &ipvs->svc_resize_work, 1); 2130 } 2131 return 0; 2132 } 2133 2134 2135 /* 2136 * Flush all the virtual services 2137 */ 2138 static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) 2139 { 2140 DECLARE_IP_VS_RHT_WALK_BUCKETS(); 2141 struct hlist_bl_head *head; 2142 struct ip_vs_service *svc; 2143 struct hlist_bl_node *ne; 2144 struct hlist_bl_node *e; 2145 struct ip_vs_rht *t, *p; 2146 2147 /* Stop the resizer and drop the tables */ 2148 if (!test_and_set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) 2149 cancel_delayed_work_sync(&ipvs->svc_resize_work); 2150 /* No resizer, so now we have exclusive write access */ 2151 2152 if (ip_vs_get_num_services(ipvs)) { 2153 ip_vs_rht_walk_buckets(ipvs->svc_table, head) { 2154 hlist_bl_for_each_entry_safe(svc, e, ne, head, s_list) 2155 ip_vs_unlink_service(svc, cleanup); 2156 } 2157 } 2158 2159 /* Unregister the hash table and release it after RCU grace period */ 2160 t = rcu_dereference_protected(ipvs->svc_table, 1); 2161 if (t) { 2162 rcu_assign_pointer(ipvs->svc_table, NULL); 2163 /* Inform readers that table is removed */ 2164 smp_mb__before_atomic(); 2165 atomic_inc(&ipvs->svc_table_changes); 2166 while (1) { 2167 p = rcu_dereference_protected(t->new_tbl, 1); 2168 call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); 2169 if (p == t) 2170 break; 2171 t = p; 2172 } 2173 } 2174 /* Stop the tot_stats estimator early under service_mutex 2175 * to avoid locking it again later. 2176 */ 2177 if (cleanup) 2178 ip_vs_stop_estimator_tot_stats(ipvs); 2179 return 0; 2180 } 2181 2182 /* 2183 * Delete service by {netns} in the service table. 2184 * Called by __ip_vs_batch_cleanup() 2185 */ 2186 void ip_vs_service_nets_cleanup(struct list_head *net_list) 2187 { 2188 struct netns_ipvs *ipvs; 2189 struct net *net; 2190 2191 /* Check for "full" addressed entries */ 2192 list_for_each_entry(net, net_list, exit_list) { 2193 ipvs = net_ipvs(net); 2194 mutex_lock(&ipvs->service_mutex); 2195 ip_vs_flush(ipvs, true); 2196 mutex_unlock(&ipvs->service_mutex); 2197 } 2198 } 2199 2200 /* Put all references for device (dst_cache) */ 2201 static inline void 2202 ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) 2203 { 2204 struct ip_vs_dest_dst *dest_dst; 2205 2206 spin_lock_bh(&dest->dst_lock); 2207 dest_dst = rcu_dereference_protected(dest->dest_dst, 1); 2208 if (dest_dst && dest_dst->dst_cache->dev == dev) { 2209 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", 2210 dev->name, 2211 IP_VS_DBG_ADDR(dest->af, &dest->addr), 2212 ntohs(dest->port), 2213 refcount_read(&dest->refcnt)); 2214 __ip_vs_dst_cache_reset(dest); 2215 } 2216 spin_unlock_bh(&dest->dst_lock); 2217 2218 } 2219 /* Netdev event receiver 2220 * Currently only NETDEV_DOWN is handled to release refs to cached dsts 2221 */ 2222 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, 2223 void *ptr) 2224 { 2225 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 2226 struct net *net = dev_net(dev); 2227 struct netns_ipvs *ipvs = net_ipvs(net); 2228 DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); 2229 unsigned int resched_score = 0; 2230 struct hlist_bl_head *head; 2231 struct ip_vs_service *svc; 2232 struct hlist_bl_node *e; 2233 struct ip_vs_dest *dest; 2234 int old_gen; 2235 2236 if (event != NETDEV_DOWN || !ipvs) 2237 return NOTIFY_DONE; 2238 IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); 2239 2240 /* Allow concurrent rehashing on resize but to avoid loop 2241 * serialize with installing the new table. 2242 */ 2243 down_read(&ipvs->svc_replace_sem); 2244 2245 old_gen = atomic_read(&ipvs->svc_table_changes); 2246 2247 rcu_read_lock(); 2248 2249 smp_rmb(); /* ipvs->svc_table and svc_table_changes */ 2250 ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { 2251 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 2252 list_for_each_entry_rcu(dest, &svc->destinations, 2253 n_list) { 2254 ip_vs_forget_dev(dest, dev); 2255 resched_score += 10; 2256 } 2257 resched_score++; 2258 } 2259 resched_score++; 2260 if (resched_score >= 100) { 2261 cond_resched_rcu(); 2262 /* Flushed? So no more dev refs */ 2263 if (atomic_read(&ipvs->svc_table_changes) != old_gen) 2264 goto done; 2265 resched_score = 0; 2266 } 2267 } 2268 2269 done: 2270 rcu_read_unlock(); 2271 up_read(&ipvs->svc_replace_sem); 2272 2273 return NOTIFY_DONE; 2274 } 2275 2276 /* 2277 * Zero counters in a service or all services 2278 */ 2279 static int ip_vs_zero_service(struct ip_vs_service *svc) 2280 { 2281 struct ip_vs_dest *dest; 2282 2283 list_for_each_entry(dest, &svc->destinations, n_list) { 2284 ip_vs_zero_stats(&dest->stats); 2285 } 2286 ip_vs_zero_stats(&svc->stats); 2287 return 0; 2288 } 2289 2290 static int ip_vs_zero_all(struct netns_ipvs *ipvs) 2291 { 2292 DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); 2293 unsigned int resched_score = 0; 2294 struct hlist_bl_head *head; 2295 struct ip_vs_service *svc; 2296 struct hlist_bl_node *e; 2297 2298 /* svc_table can not be replaced (svc_replace_sem) or 2299 * removed (service_mutex) 2300 */ 2301 down_read(&ipvs->svc_replace_sem); 2302 rcu_read_lock(); 2303 2304 ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { 2305 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 2306 ip_vs_zero_service(svc); 2307 resched_score += 10; 2308 } 2309 resched_score++; 2310 if (resched_score >= 100) { 2311 resched_score = 0; 2312 cond_resched_rcu(); 2313 } 2314 } 2315 2316 rcu_read_unlock(); 2317 up_read(&ipvs->svc_replace_sem); 2318 2319 ip_vs_zero_stats(&ipvs->tot_stats->s); 2320 return 0; 2321 } 2322 2323 #ifdef CONFIG_SYSCTL 2324 2325 static int 2326 proc_do_defense_mode(const struct ctl_table *table, int write, 2327 void *buffer, size_t *lenp, loff_t *ppos) 2328 { 2329 struct netns_ipvs *ipvs = table->extra2; 2330 int *valp = table->data; 2331 int val = *valp; 2332 int rc; 2333 2334 struct ctl_table tmp = { 2335 .data = &val, 2336 .maxlen = sizeof(int), 2337 .mode = table->mode, 2338 }; 2339 2340 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 2341 if (write && (*valp != val)) { 2342 if (val < 0 || val > 3) { 2343 rc = -EINVAL; 2344 } else { 2345 *valp = val; 2346 update_defense_level(ipvs); 2347 } 2348 } 2349 return rc; 2350 } 2351 2352 static int 2353 proc_do_sync_threshold(const struct ctl_table *table, int write, 2354 void *buffer, size_t *lenp, loff_t *ppos) 2355 { 2356 struct netns_ipvs *ipvs = table->extra2; 2357 int *valp = table->data; 2358 int val[2]; 2359 int rc; 2360 struct ctl_table tmp = { 2361 .data = &val, 2362 .maxlen = table->maxlen, 2363 .mode = table->mode, 2364 }; 2365 2366 mutex_lock(&ipvs->sync_mutex); 2367 memcpy(val, valp, sizeof(val)); 2368 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 2369 if (write) { 2370 if (val[0] < 0 || val[1] < 0 || 2371 (val[0] >= val[1] && val[1])) 2372 rc = -EINVAL; 2373 else 2374 memcpy(valp, val, sizeof(val)); 2375 } 2376 mutex_unlock(&ipvs->sync_mutex); 2377 return rc; 2378 } 2379 2380 static int 2381 proc_do_sync_ports(const struct ctl_table *table, int write, 2382 void *buffer, size_t *lenp, loff_t *ppos) 2383 { 2384 int *valp = table->data; 2385 int val = *valp; 2386 int rc; 2387 2388 struct ctl_table tmp = { 2389 .data = &val, 2390 .maxlen = sizeof(int), 2391 .mode = table->mode, 2392 }; 2393 2394 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 2395 if (write && (*valp != val)) { 2396 if (val < 1 || !is_power_of_2(val)) 2397 rc = -EINVAL; 2398 else 2399 *valp = val; 2400 } 2401 return rc; 2402 } 2403 2404 static int ipvs_proc_est_cpumask_set(const struct ctl_table *table, 2405 void *buffer) 2406 { 2407 struct netns_ipvs *ipvs = table->extra2; 2408 cpumask_var_t *valp = table->data; 2409 cpumask_var_t newmask; 2410 int ret; 2411 2412 if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) 2413 return -ENOMEM; 2414 2415 ret = cpulist_parse(buffer, newmask); 2416 if (ret) 2417 goto out; 2418 2419 mutex_lock(&ipvs->est_mutex); 2420 2421 if (!ipvs->est_cpulist_valid) { 2422 if (!zalloc_cpumask_var(valp, GFP_KERNEL)) { 2423 ret = -ENOMEM; 2424 goto unlock; 2425 } 2426 ipvs->est_cpulist_valid = 1; 2427 } 2428 cpumask_and(newmask, newmask, ¤t->cpus_mask); 2429 cpumask_copy(*valp, newmask); 2430 /* est_max_threads may depend on cpulist size */ 2431 ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); 2432 ipvs->est_calc_phase = 1; 2433 ip_vs_est_reload_start(ipvs, true); 2434 2435 unlock: 2436 mutex_unlock(&ipvs->est_mutex); 2437 2438 out: 2439 free_cpumask_var(newmask); 2440 return ret; 2441 } 2442 2443 static int ipvs_proc_est_cpumask_get(const struct ctl_table *table, 2444 void *buffer, size_t size) 2445 { 2446 struct netns_ipvs *ipvs = table->extra2; 2447 cpumask_var_t *valp = table->data; 2448 struct cpumask *mask; 2449 int ret; 2450 2451 mutex_lock(&ipvs->est_mutex); 2452 2453 /* HK_TYPE_KTHREAD cpumask needs RCU protection */ 2454 scoped_guard(rcu) { 2455 if (ipvs->est_cpulist_valid) 2456 mask = *valp; 2457 else 2458 mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); 2459 ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); 2460 } 2461 2462 mutex_unlock(&ipvs->est_mutex); 2463 2464 return ret; 2465 } 2466 2467 static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write, 2468 void *buffer, size_t *lenp, loff_t *ppos) 2469 { 2470 int ret; 2471 2472 /* Ignore both read and write(append) if *ppos not 0 */ 2473 if (*ppos || !*lenp) { 2474 *lenp = 0; 2475 return 0; 2476 } 2477 if (write) { 2478 /* proc_sys_call_handler() appends terminator */ 2479 ret = ipvs_proc_est_cpumask_set(table, buffer); 2480 if (ret >= 0) 2481 *ppos += *lenp; 2482 } else { 2483 /* proc_sys_call_handler() allocates 1 byte for terminator */ 2484 ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1); 2485 if (ret >= 0) { 2486 *lenp = ret; 2487 *ppos += *lenp; 2488 ret = 0; 2489 } 2490 } 2491 return ret; 2492 } 2493 2494 static int ipvs_proc_est_nice(const struct ctl_table *table, int write, 2495 void *buffer, size_t *lenp, loff_t *ppos) 2496 { 2497 struct netns_ipvs *ipvs = table->extra2; 2498 int *valp = table->data; 2499 int val = *valp; 2500 int ret; 2501 2502 struct ctl_table tmp_table = { 2503 .data = &val, 2504 .maxlen = sizeof(int), 2505 .mode = table->mode, 2506 }; 2507 2508 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2509 if (write && ret >= 0) { 2510 if (val < MIN_NICE || val > MAX_NICE) { 2511 ret = -EINVAL; 2512 } else { 2513 mutex_lock(&ipvs->est_mutex); 2514 if (*valp != val) { 2515 *valp = val; 2516 ip_vs_est_reload_start(ipvs, true); 2517 } 2518 mutex_unlock(&ipvs->est_mutex); 2519 } 2520 } 2521 return ret; 2522 } 2523 2524 static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, 2525 void *buffer, size_t *lenp, loff_t *ppos) 2526 { 2527 struct netns_ipvs *ipvs = table->extra2; 2528 int *valp = table->data; 2529 int val = *valp; 2530 int ret; 2531 2532 struct ctl_table tmp_table = { 2533 .data = &val, 2534 .maxlen = sizeof(int), 2535 .mode = table->mode, 2536 }; 2537 2538 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2539 if (write && ret >= 0) { 2540 mutex_lock(&ipvs->est_mutex); 2541 if (*valp != val) { 2542 *valp = val; 2543 ip_vs_est_reload_start(ipvs, true); 2544 } 2545 mutex_unlock(&ipvs->est_mutex); 2546 } 2547 return ret; 2548 } 2549 2550 static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write, 2551 void *buffer, size_t *lenp, loff_t *ppos) 2552 { 2553 struct netns_ipvs *ipvs = table->extra2; 2554 int *valp = table->data; 2555 int val = *valp; 2556 int ret; 2557 2558 struct ctl_table tmp_table = { 2559 .data = &val, 2560 .maxlen = sizeof(int), 2561 }; 2562 2563 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2564 if (write && ret >= 0) { 2565 if (val < -8 || val > 8) { 2566 ret = -EINVAL; 2567 } else { 2568 WRITE_ONCE(*valp, val); 2569 if (rcu_access_pointer(ipvs->conn_tab)) 2570 mod_delayed_work(system_unbound_wq, 2571 &ipvs->conn_resize_work, 0); 2572 } 2573 } 2574 return ret; 2575 } 2576 2577 static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write, 2578 void *buffer, size_t *lenp, loff_t *ppos) 2579 { 2580 struct netns_ipvs *ipvs = table->extra2; 2581 int *valp = table->data; 2582 int val = *valp; 2583 int ret; 2584 2585 struct ctl_table tmp_table = { 2586 .data = &val, 2587 .maxlen = sizeof(int), 2588 }; 2589 2590 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2591 if (write && ret >= 0) { 2592 if (val < -8 || val > 8) { 2593 ret = -EINVAL; 2594 } else { 2595 mutex_lock(&ipvs->service_mutex); 2596 WRITE_ONCE(*valp, val); 2597 /* Make sure the services are present */ 2598 if (rcu_access_pointer(ipvs->svc_table) && 2599 READ_ONCE(ipvs->enable) && 2600 !test_bit(IP_VS_WORK_SVC_NORESIZE, 2601 &ipvs->work_flags)) 2602 mod_delayed_work(system_unbound_wq, 2603 &ipvs->svc_resize_work, 0); 2604 mutex_unlock(&ipvs->service_mutex); 2605 } 2606 } 2607 return ret; 2608 } 2609 2610 /* 2611 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) 2612 * Do not change order or insert new entries without 2613 * align with netns init in ip_vs_control_net_init() 2614 */ 2615 2616 static struct ctl_table vs_vars[] = { 2617 { 2618 .procname = "amemthresh", 2619 .maxlen = sizeof(int), 2620 .mode = 0644, 2621 .proc_handler = proc_dointvec, 2622 }, 2623 { 2624 .procname = "am_droprate", 2625 .maxlen = sizeof(int), 2626 .mode = 0644, 2627 .proc_handler = proc_dointvec, 2628 }, 2629 { 2630 .procname = "drop_entry", 2631 .maxlen = sizeof(int), 2632 .mode = 0644, 2633 .proc_handler = proc_do_defense_mode, 2634 }, 2635 { 2636 .procname = "drop_packet", 2637 .maxlen = sizeof(int), 2638 .mode = 0644, 2639 .proc_handler = proc_do_defense_mode, 2640 }, 2641 #ifdef CONFIG_IP_VS_NFCT 2642 { 2643 .procname = "conntrack", 2644 .maxlen = sizeof(int), 2645 .mode = 0644, 2646 .proc_handler = &proc_dointvec, 2647 }, 2648 #endif 2649 { 2650 .procname = "secure_tcp", 2651 .maxlen = sizeof(int), 2652 .mode = 0644, 2653 .proc_handler = proc_do_defense_mode, 2654 }, 2655 { 2656 .procname = "snat_reroute", 2657 .maxlen = sizeof(int), 2658 .mode = 0644, 2659 .proc_handler = &proc_dointvec, 2660 }, 2661 { 2662 .procname = "sync_version", 2663 .maxlen = sizeof(int), 2664 .mode = 0644, 2665 .proc_handler = proc_dointvec_minmax, 2666 .extra1 = SYSCTL_ZERO, 2667 .extra2 = SYSCTL_ONE, 2668 }, 2669 { 2670 .procname = "sync_ports", 2671 .maxlen = sizeof(int), 2672 .mode = 0644, 2673 .proc_handler = proc_do_sync_ports, 2674 }, 2675 { 2676 .procname = "sync_persist_mode", 2677 .maxlen = sizeof(int), 2678 .mode = 0644, 2679 .proc_handler = proc_dointvec, 2680 }, 2681 { 2682 .procname = "sync_qlen_max", 2683 .maxlen = sizeof(unsigned long), 2684 .mode = 0644, 2685 .proc_handler = proc_doulongvec_minmax, 2686 }, 2687 { 2688 .procname = "sync_sock_size", 2689 .maxlen = sizeof(int), 2690 .mode = 0644, 2691 .proc_handler = proc_dointvec, 2692 }, 2693 { 2694 .procname = "cache_bypass", 2695 .maxlen = sizeof(int), 2696 .mode = 0644, 2697 .proc_handler = proc_dointvec, 2698 }, 2699 { 2700 .procname = "expire_nodest_conn", 2701 .maxlen = sizeof(int), 2702 .mode = 0644, 2703 .proc_handler = proc_dointvec, 2704 }, 2705 { 2706 .procname = "sloppy_tcp", 2707 .maxlen = sizeof(int), 2708 .mode = 0644, 2709 .proc_handler = proc_dointvec, 2710 }, 2711 { 2712 .procname = "sloppy_sctp", 2713 .maxlen = sizeof(int), 2714 .mode = 0644, 2715 .proc_handler = proc_dointvec, 2716 }, 2717 { 2718 .procname = "expire_quiescent_template", 2719 .maxlen = sizeof(int), 2720 .mode = 0644, 2721 .proc_handler = proc_dointvec, 2722 }, 2723 { 2724 .procname = "sync_threshold", 2725 .maxlen = 2726 sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), 2727 .mode = 0644, 2728 .proc_handler = proc_do_sync_threshold, 2729 }, 2730 { 2731 .procname = "sync_refresh_period", 2732 .maxlen = sizeof(int), 2733 .mode = 0644, 2734 .proc_handler = proc_dointvec_jiffies, 2735 }, 2736 { 2737 .procname = "sync_retries", 2738 .maxlen = sizeof(int), 2739 .mode = 0644, 2740 .proc_handler = proc_dointvec_minmax, 2741 .extra1 = SYSCTL_ZERO, 2742 .extra2 = SYSCTL_THREE, 2743 }, 2744 { 2745 .procname = "nat_icmp_send", 2746 .maxlen = sizeof(int), 2747 .mode = 0644, 2748 .proc_handler = proc_dointvec, 2749 }, 2750 { 2751 .procname = "pmtu_disc", 2752 .maxlen = sizeof(int), 2753 .mode = 0644, 2754 .proc_handler = proc_dointvec, 2755 }, 2756 { 2757 .procname = "backup_only", 2758 .maxlen = sizeof(int), 2759 .mode = 0644, 2760 .proc_handler = proc_dointvec, 2761 }, 2762 { 2763 .procname = "conn_reuse_mode", 2764 .maxlen = sizeof(int), 2765 .mode = 0644, 2766 .proc_handler = proc_dointvec, 2767 }, 2768 { 2769 .procname = "schedule_icmp", 2770 .maxlen = sizeof(int), 2771 .mode = 0644, 2772 .proc_handler = proc_dointvec, 2773 }, 2774 { 2775 .procname = "ignore_tunneled", 2776 .maxlen = sizeof(int), 2777 .mode = 0644, 2778 .proc_handler = proc_dointvec, 2779 }, 2780 { 2781 .procname = "run_estimation", 2782 .maxlen = sizeof(int), 2783 .mode = 0644, 2784 .proc_handler = ipvs_proc_run_estimation, 2785 }, 2786 { 2787 .procname = "est_cpulist", 2788 .maxlen = NR_CPUS, /* unused */ 2789 .mode = 0644, 2790 .proc_handler = ipvs_proc_est_cpulist, 2791 }, 2792 { 2793 .procname = "est_nice", 2794 .maxlen = sizeof(int), 2795 .mode = 0644, 2796 .proc_handler = ipvs_proc_est_nice, 2797 }, 2798 { 2799 .procname = "conn_lfactor", 2800 .maxlen = sizeof(int), 2801 .mode = 0644, 2802 .proc_handler = ipvs_proc_conn_lfactor, 2803 }, 2804 { 2805 .procname = "svc_lfactor", 2806 .maxlen = sizeof(int), 2807 .mode = 0644, 2808 .proc_handler = ipvs_proc_svc_lfactor, 2809 }, 2810 #ifdef CONFIG_IP_VS_DEBUG 2811 { 2812 .procname = "debug_level", 2813 .data = &sysctl_ip_vs_debug_level, 2814 .maxlen = sizeof(int), 2815 .mode = 0644, 2816 .proc_handler = proc_dointvec, 2817 }, 2818 #endif 2819 }; 2820 2821 #endif 2822 2823 #ifdef CONFIG_PROC_FS 2824 2825 struct ip_vs_iter { 2826 struct seq_net_private p; /* Do not move this, netns depends upon it*/ 2827 struct ip_vs_rht *t; 2828 u32 bucket; 2829 }; 2830 2831 /* 2832 * Write the contents of the VS rule table to a PROCfs file. 2833 * (It is kept just for backward compatibility) 2834 */ 2835 static inline const char *ip_vs_fwd_name(unsigned int flags) 2836 { 2837 switch (flags & IP_VS_CONN_F_FWD_MASK) { 2838 case IP_VS_CONN_F_LOCALNODE: 2839 return "Local"; 2840 case IP_VS_CONN_F_TUNNEL: 2841 return "Tunnel"; 2842 case IP_VS_CONN_F_DROUTE: 2843 return "Route"; 2844 default: 2845 return "Masq"; 2846 } 2847 } 2848 2849 /* Do not expect consistent view during add, del and move(table resize). 2850 * We may miss entries and even show duplicates. 2851 */ 2852 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) 2853 { 2854 struct ip_vs_iter *iter = seq->private; 2855 struct ip_vs_rht *t = iter->t; 2856 struct ip_vs_service *svc; 2857 struct hlist_bl_node *e; 2858 int idx; 2859 2860 if (!t) 2861 return NULL; 2862 for (idx = 0; idx < t->size; idx++) { 2863 hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[idx], s_list) { 2864 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2865 break; 2866 if (pos-- == 0) { 2867 iter->bucket = idx; 2868 return svc; 2869 } 2870 } 2871 } 2872 return NULL; 2873 } 2874 2875 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) 2876 __acquires(RCU) 2877 { 2878 struct ip_vs_iter *iter = seq->private; 2879 struct net *net = seq_file_net(seq); 2880 struct netns_ipvs *ipvs = net_ipvs(net); 2881 2882 rcu_read_lock(); 2883 iter->t = rcu_dereference(ipvs->svc_table); 2884 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; 2885 } 2886 2887 2888 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2889 { 2890 struct ip_vs_service *svc; 2891 struct ip_vs_iter *iter; 2892 struct hlist_bl_node *e; 2893 struct ip_vs_rht *t; 2894 2895 ++*pos; 2896 if (v == SEQ_START_TOKEN) 2897 return ip_vs_info_array(seq,0); 2898 2899 svc = v; 2900 iter = seq->private; 2901 t = iter->t; 2902 if (!t) 2903 return NULL; 2904 2905 hlist_bl_for_each_entry_continue_rcu(svc, e, s_list) { 2906 /* Our cursor was moved to new table ? */ 2907 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2908 break; 2909 return svc; 2910 } 2911 2912 while (++iter->bucket < t->size) { 2913 hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[iter->bucket], 2914 s_list) { 2915 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2916 break; 2917 return svc; 2918 } 2919 } 2920 return NULL; 2921 } 2922 2923 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) 2924 __releases(RCU) 2925 { 2926 rcu_read_unlock(); 2927 } 2928 2929 2930 static int ip_vs_info_seq_show(struct seq_file *seq, void *v) 2931 { 2932 struct net *net = seq_file_net(seq); 2933 struct netns_ipvs *ipvs = net_ipvs(net); 2934 2935 if (v == SEQ_START_TOKEN) { 2936 seq_printf(seq, 2937 "IP Virtual Server version %d.%d.%d (size=%d)\n", 2938 NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs)); 2939 seq_puts(seq, 2940 "Prot LocalAddress:Port Scheduler Flags\n"); 2941 seq_puts(seq, 2942 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); 2943 } else { 2944 const struct ip_vs_service *svc = v; 2945 const struct ip_vs_dest *dest; 2946 struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); 2947 char *sched_name = sched ? sched->name : "none"; 2948 2949 if (!svc->fwmark) { 2950 #ifdef CONFIG_IP_VS_IPV6 2951 if (svc->af == AF_INET6) 2952 seq_printf(seq, "%s [%pI6]:%04X %s ", 2953 ip_vs_proto_name(svc->protocol), 2954 &svc->addr.in6, 2955 ntohs(svc->port), 2956 sched_name); 2957 else 2958 #endif 2959 seq_printf(seq, "%s %08X:%04X %s %s ", 2960 ip_vs_proto_name(svc->protocol), 2961 ntohl(svc->addr.ip), 2962 ntohs(svc->port), 2963 sched_name, 2964 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); 2965 } else { 2966 seq_printf(seq, "FWM %08X %s %s", 2967 svc->fwmark, sched_name, 2968 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); 2969 } 2970 2971 if (svc->flags & IP_VS_SVC_F_PERSISTENT) 2972 seq_printf(seq, "persistent %d %08X\n", 2973 svc->timeout, 2974 ntohl(svc->netmask)); 2975 else 2976 seq_putc(seq, '\n'); 2977 2978 list_for_each_entry_rcu(dest, &svc->destinations, n_list) { 2979 #ifdef CONFIG_IP_VS_IPV6 2980 if (dest->af == AF_INET6) 2981 seq_printf(seq, 2982 " -> [%pI6]:%04X" 2983 " %-7s %-6d %-10d %-10d\n", 2984 &dest->addr.in6, 2985 ntohs(dest->port), 2986 ip_vs_fwd_name(atomic_read(&dest->conn_flags)), 2987 atomic_read(&dest->weight), 2988 atomic_read(&dest->activeconns), 2989 atomic_read(&dest->inactconns)); 2990 else 2991 #endif 2992 seq_printf(seq, 2993 " -> %08X:%04X " 2994 "%-7s %-6d %-10d %-10d\n", 2995 ntohl(dest->addr.ip), 2996 ntohs(dest->port), 2997 ip_vs_fwd_name(atomic_read(&dest->conn_flags)), 2998 atomic_read(&dest->weight), 2999 atomic_read(&dest->activeconns), 3000 atomic_read(&dest->inactconns)); 3001 3002 } 3003 } 3004 return 0; 3005 } 3006 3007 static const struct seq_operations ip_vs_info_seq_ops = { 3008 .start = ip_vs_info_seq_start, 3009 .next = ip_vs_info_seq_next, 3010 .stop = ip_vs_info_seq_stop, 3011 .show = ip_vs_info_seq_show, 3012 }; 3013 3014 static int ip_vs_stats_show(struct seq_file *seq, void *v) 3015 { 3016 struct net *net = seq_file_single_net(seq); 3017 struct ip_vs_kstats show; 3018 3019 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 3020 seq_puts(seq, 3021 " Total Incoming Outgoing Incoming Outgoing\n"); 3022 seq_puts(seq, 3023 " Conns Packets Packets Bytes Bytes\n"); 3024 3025 ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s); 3026 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", 3027 (unsigned long long)show.conns, 3028 (unsigned long long)show.inpkts, 3029 (unsigned long long)show.outpkts, 3030 (unsigned long long)show.inbytes, 3031 (unsigned long long)show.outbytes); 3032 3033 /* 01234567 01234567 01234567 0123456701234567 0123456701234567*/ 3034 seq_puts(seq, 3035 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); 3036 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n", 3037 (unsigned long long)show.cps, 3038 (unsigned long long)show.inpps, 3039 (unsigned long long)show.outpps, 3040 (unsigned long long)show.inbps, 3041 (unsigned long long)show.outbps); 3042 3043 return 0; 3044 } 3045 3046 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) 3047 { 3048 struct net *net = seq_file_single_net(seq); 3049 struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s; 3050 struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; 3051 struct ip_vs_kstats kstats; 3052 int i; 3053 3054 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 3055 seq_puts(seq, 3056 " Total Incoming Outgoing Incoming Outgoing\n"); 3057 seq_puts(seq, 3058 "CPU Conns Packets Packets Bytes Bytes\n"); 3059 3060 for_each_possible_cpu(i) { 3061 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); 3062 unsigned int start; 3063 u64 conns, inpkts, outpkts, inbytes, outbytes; 3064 3065 do { 3066 start = u64_stats_fetch_begin(&u->syncp); 3067 conns = u64_stats_read(&u->cnt.conns); 3068 inpkts = u64_stats_read(&u->cnt.inpkts); 3069 outpkts = u64_stats_read(&u->cnt.outpkts); 3070 inbytes = u64_stats_read(&u->cnt.inbytes); 3071 outbytes = u64_stats_read(&u->cnt.outbytes); 3072 } while (u64_stats_fetch_retry(&u->syncp, start)); 3073 3074 seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", 3075 i, (u64)conns, (u64)inpkts, 3076 (u64)outpkts, (u64)inbytes, 3077 (u64)outbytes); 3078 } 3079 3080 ip_vs_copy_stats(&kstats, tot_stats); 3081 3082 seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n", 3083 (unsigned long long)kstats.conns, 3084 (unsigned long long)kstats.inpkts, 3085 (unsigned long long)kstats.outpkts, 3086 (unsigned long long)kstats.inbytes, 3087 (unsigned long long)kstats.outbytes); 3088 3089 /* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 3090 seq_puts(seq, 3091 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); 3092 seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n", 3093 kstats.cps, 3094 kstats.inpps, 3095 kstats.outpps, 3096 kstats.inbps, 3097 kstats.outbps); 3098 3099 return 0; 3100 } 3101 3102 static int ip_vs_status_show(struct seq_file *seq, void *v) 3103 { 3104 struct net *net = seq_file_single_net(seq); 3105 struct netns_ipvs *ipvs = net_ipvs(net); 3106 unsigned int resched_score = 0; 3107 struct ip_vs_conn_hnode *hn; 3108 struct hlist_bl_head *head; 3109 struct ip_vs_service *svc; 3110 struct ip_vs_rht *t, *pt; 3111 struct hlist_bl_node *e; 3112 int old_gen, new_gen; 3113 u32 counts[8]; 3114 u32 bucket; 3115 u32 count; 3116 int loops; 3117 u32 sum1; 3118 u32 sum; 3119 int i; 3120 3121 /* Info for conns */ 3122 rcu_read_lock(); 3123 3124 t = rcu_dereference(ipvs->conn_tab); 3125 3126 seq_printf(seq, "Conns:\t%d\n", atomic_read(&ipvs->conn_count)); 3127 seq_printf(seq, "Conn buckets:\t%d (%d bits, lfactor %d)\n", 3128 t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0); 3129 3130 if (!atomic_read(&ipvs->conn_count)) 3131 goto after_conns; 3132 old_gen = atomic_read(&ipvs->conn_tab_changes); 3133 loops = 0; 3134 3135 repeat_conn: 3136 smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ 3137 memset(counts, 0, sizeof(counts)); 3138 ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { 3139 for (bucket = 0; bucket < t->size; bucket++) { 3140 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 3141 3142 count = 0; 3143 resched_score++; 3144 ip_vs_rht_walk_bucket_rcu(t, bucket, head) { 3145 count = 0; 3146 hlist_bl_for_each_entry_rcu(hn, e, head, node) { 3147 count++; 3148 if (count >= ARRAY_SIZE(counts) - 1) 3149 break; 3150 } 3151 } 3152 resched_score += count; 3153 if (resched_score >= 100) { 3154 resched_score = 0; 3155 cond_resched_rcu(); 3156 new_gen = atomic_read(&ipvs->conn_tab_changes); 3157 /* New table installed ? */ 3158 if (old_gen != new_gen) { 3159 /* Too many changes? */ 3160 if (++loops >= 5) 3161 goto after_conns; 3162 old_gen = new_gen; 3163 goto repeat_conn; 3164 } 3165 } 3166 counts[count]++; 3167 } 3168 } 3169 for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) 3170 sum += counts[i]; 3171 sum1 = sum - counts[0]; 3172 seq_printf(seq, "Conn buckets empty:\t%u (%llu%%)\n", 3173 counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U))); 3174 for (i = 1; i < ARRAY_SIZE(counts); i++) { 3175 if (!counts[i]) 3176 continue; 3177 seq_printf(seq, "Conn buckets len-%d:\t%u (%llu%%)\n", 3178 i, counts[i], 3179 div_u64((u64)counts[i] * 100U, max(sum1, 1U))); 3180 } 3181 3182 after_conns: 3183 rcu_read_unlock(); 3184 3185 /* Info for services */ 3186 down_read(&ipvs->svc_replace_sem); 3187 rcu_read_lock(); 3188 3189 t = rcu_dereference(ipvs->svc_table); 3190 3191 count = ip_vs_get_num_services(ipvs); 3192 seq_printf(seq, "Services:\t%u\n", count); 3193 seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n", 3194 t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0); 3195 3196 if (!count) 3197 goto after_svc; 3198 old_gen = atomic_read(&ipvs->svc_table_changes); 3199 3200 smp_rmb(); /* ipvs->svc_table and svc_table_changes */ 3201 memset(counts, 0, sizeof(counts)); 3202 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) { 3203 for (bucket = 0; bucket < t->size; bucket++) { 3204 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 3205 3206 count = 0; 3207 resched_score++; 3208 ip_vs_rht_walk_bucket_rcu(t, bucket, head) { 3209 count = 0; 3210 hlist_bl_for_each_entry_rcu(svc, e, head, 3211 s_list) { 3212 count++; 3213 if (count >= ARRAY_SIZE(counts) - 1) 3214 break; 3215 } 3216 } 3217 resched_score += count; 3218 if (resched_score >= 100) { 3219 resched_score = 0; 3220 cond_resched_rcu(); 3221 /* Flushed? */ 3222 if (atomic_read(&ipvs->svc_table_changes) != 3223 old_gen) 3224 goto after_svc; 3225 } 3226 counts[count]++; 3227 } 3228 } 3229 for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) 3230 sum += counts[i]; 3231 sum1 = sum - counts[0]; 3232 seq_printf(seq, "Service buckets empty:\t%u (%llu%%)\n", 3233 counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U))); 3234 for (i = 1; i < ARRAY_SIZE(counts); i++) { 3235 if (!counts[i]) 3236 continue; 3237 seq_printf(seq, "Service buckets len-%d:\t%u (%llu%%)\n", 3238 i, counts[i], 3239 div_u64((u64)counts[i] * 100U, max(sum1, 1U))); 3240 } 3241 3242 after_svc: 3243 rcu_read_unlock(); 3244 up_read(&ipvs->svc_replace_sem); 3245 3246 seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n", 3247 ipvs->est_kt_count, ipvs->est_max_threads); 3248 seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max); 3249 seq_printf(seq, "Stats thread ests:\t%d\n", 3250 ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR * 3251 IPVS_EST_NTICKS); 3252 3253 return 0; 3254 } 3255 3256 #endif 3257 3258 /* 3259 * Set timeout values for tcp tcpfin udp in the timeout_table. 3260 */ 3261 static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) 3262 { 3263 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) 3264 struct ip_vs_proto_data *pd; 3265 #endif 3266 3267 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", 3268 u->tcp_timeout, 3269 u->tcp_fin_timeout, 3270 u->udp_timeout); 3271 3272 #ifdef CONFIG_IP_VS_PROTO_TCP 3273 if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) || 3274 u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) { 3275 return -EINVAL; 3276 } 3277 #endif 3278 3279 #ifdef CONFIG_IP_VS_PROTO_UDP 3280 if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ)) 3281 return -EINVAL; 3282 #endif 3283 3284 #ifdef CONFIG_IP_VS_PROTO_TCP 3285 if (u->tcp_timeout) { 3286 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); 3287 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] 3288 = u->tcp_timeout * HZ; 3289 } 3290 3291 if (u->tcp_fin_timeout) { 3292 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); 3293 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] 3294 = u->tcp_fin_timeout * HZ; 3295 } 3296 #endif 3297 3298 #ifdef CONFIG_IP_VS_PROTO_UDP 3299 if (u->udp_timeout) { 3300 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); 3301 pd->timeout_table[IP_VS_UDP_S_NORMAL] 3302 = u->udp_timeout * HZ; 3303 } 3304 #endif 3305 return 0; 3306 } 3307 3308 #define CMDID(cmd) (cmd - IP_VS_BASE_CTL) 3309 3310 struct ip_vs_svcdest_user { 3311 struct ip_vs_service_user s; 3312 struct ip_vs_dest_user d; 3313 }; 3314 3315 static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = { 3316 [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user), 3317 [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user), 3318 [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user), 3319 [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user), 3320 [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user), 3321 [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user), 3322 [CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), 3323 [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user), 3324 [CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user), 3325 [CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user), 3326 }; 3327 3328 union ip_vs_set_arglen { 3329 struct ip_vs_service_user field_IP_VS_SO_SET_ADD; 3330 struct ip_vs_service_user field_IP_VS_SO_SET_EDIT; 3331 struct ip_vs_service_user field_IP_VS_SO_SET_DEL; 3332 struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST; 3333 struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST; 3334 struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST; 3335 struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT; 3336 struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON; 3337 struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON; 3338 struct ip_vs_service_user field_IP_VS_SO_SET_ZERO; 3339 }; 3340 3341 #define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen) 3342 3343 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, 3344 struct ip_vs_service_user *usvc_compat) 3345 { 3346 memset(usvc, 0, sizeof(*usvc)); 3347 3348 usvc->af = AF_INET; 3349 usvc->protocol = usvc_compat->protocol; 3350 usvc->addr.ip = usvc_compat->addr; 3351 usvc->port = usvc_compat->port; 3352 usvc->fwmark = usvc_compat->fwmark; 3353 3354 /* Deep copy of sched_name is not needed here */ 3355 usvc->sched_name = usvc_compat->sched_name; 3356 3357 usvc->flags = usvc_compat->flags; 3358 usvc->timeout = usvc_compat->timeout; 3359 usvc->netmask = usvc_compat->netmask; 3360 } 3361 3362 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, 3363 struct ip_vs_dest_user *udest_compat) 3364 { 3365 memset(udest, 0, sizeof(*udest)); 3366 3367 udest->addr.ip = udest_compat->addr; 3368 udest->port = udest_compat->port; 3369 udest->conn_flags = udest_compat->conn_flags; 3370 udest->weight = udest_compat->weight; 3371 udest->u_threshold = udest_compat->u_threshold; 3372 udest->l_threshold = udest_compat->l_threshold; 3373 udest->af = AF_INET; 3374 udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP; 3375 } 3376 3377 static int 3378 do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) 3379 { 3380 struct net *net = sock_net(sk); 3381 int ret; 3382 unsigned char arg[MAX_SET_ARGLEN]; 3383 struct ip_vs_service_user *usvc_compat; 3384 struct ip_vs_service_user_kern usvc; 3385 struct ip_vs_service *svc; 3386 struct ip_vs_dest_user *udest_compat; 3387 struct ip_vs_dest_user_kern udest; 3388 struct netns_ipvs *ipvs = net_ipvs(net); 3389 3390 BUILD_BUG_ON(sizeof(arg) > 255); 3391 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3392 return -EPERM; 3393 3394 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) 3395 return -EINVAL; 3396 if (len != set_arglen[CMDID(cmd)]) { 3397 IP_VS_DBG(1, "set_ctl: len %u != %u\n", 3398 len, set_arglen[CMDID(cmd)]); 3399 return -EINVAL; 3400 } 3401 3402 if (copy_from_sockptr(arg, ptr, len) != 0) 3403 return -EFAULT; 3404 3405 /* Handle daemons since they have another lock */ 3406 if (cmd == IP_VS_SO_SET_STARTDAEMON || 3407 cmd == IP_VS_SO_SET_STOPDAEMON) { 3408 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; 3409 3410 if (cmd == IP_VS_SO_SET_STARTDAEMON) { 3411 struct ipvs_sync_daemon_cfg cfg; 3412 3413 memset(&cfg, 0, sizeof(cfg)); 3414 ret = -EINVAL; 3415 if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, 3416 sizeof(cfg.mcast_ifn)) <= 0) 3417 return ret; 3418 cfg.syncid = dm->syncid; 3419 ret = start_sync_thread(ipvs, &cfg, dm->state); 3420 } else { 3421 ret = stop_sync_thread(ipvs, dm->state); 3422 } 3423 return ret; 3424 } 3425 3426 mutex_lock(&ipvs->service_mutex); 3427 if (cmd == IP_VS_SO_SET_FLUSH) { 3428 /* Flush the virtual service */ 3429 ret = ip_vs_flush(ipvs, false); 3430 goto out_unlock; 3431 } else if (cmd == IP_VS_SO_SET_TIMEOUT) { 3432 /* Set timeout values for (tcp tcpfin udp) */ 3433 ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg); 3434 goto out_unlock; 3435 } else if (!len) { 3436 /* No more commands with len == 0 below */ 3437 ret = -EINVAL; 3438 goto out_unlock; 3439 } 3440 3441 usvc_compat = (struct ip_vs_service_user *)arg; 3442 udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); 3443 3444 /* We only use the new structs internally, so copy userspace compat 3445 * structs to extended internal versions */ 3446 ip_vs_copy_usvc_compat(&usvc, usvc_compat); 3447 ip_vs_copy_udest_compat(&udest, udest_compat); 3448 3449 if (cmd == IP_VS_SO_SET_ZERO) { 3450 /* if no service address is set, zero counters in all */ 3451 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { 3452 ret = ip_vs_zero_all(ipvs); 3453 goto out_unlock; 3454 } 3455 } 3456 3457 if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) && 3458 strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) == 3459 IP_VS_SCHEDNAME_MAXLEN) { 3460 ret = -EINVAL; 3461 goto out_unlock; 3462 } 3463 3464 /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */ 3465 if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP && 3466 usvc.protocol != IPPROTO_SCTP) { 3467 pr_err("set_ctl: invalid protocol: %d %pI4:%d\n", 3468 usvc.protocol, &usvc.addr.ip, 3469 ntohs(usvc.port)); 3470 ret = -EFAULT; 3471 goto out_unlock; 3472 } 3473 3474 /* Lookup the exact service by <protocol, addr, port> or fwmark */ 3475 rcu_read_lock(); 3476 if (usvc.fwmark == 0) 3477 svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol, 3478 &usvc.addr, usvc.port); 3479 else 3480 svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark); 3481 rcu_read_unlock(); 3482 3483 if (cmd != IP_VS_SO_SET_ADD 3484 && (svc == NULL || svc->protocol != usvc.protocol)) { 3485 ret = -ESRCH; 3486 goto out_unlock; 3487 } 3488 3489 switch (cmd) { 3490 case IP_VS_SO_SET_ADD: 3491 if (svc != NULL) 3492 ret = -EEXIST; 3493 else 3494 ret = ip_vs_add_service(ipvs, &usvc, &svc); 3495 break; 3496 case IP_VS_SO_SET_EDIT: 3497 ret = ip_vs_edit_service(svc, &usvc); 3498 break; 3499 case IP_VS_SO_SET_DEL: 3500 ret = ip_vs_del_service(svc); 3501 if (!ret) 3502 goto out_unlock; 3503 break; 3504 case IP_VS_SO_SET_ZERO: 3505 ret = ip_vs_zero_service(svc); 3506 break; 3507 case IP_VS_SO_SET_ADDDEST: 3508 ret = ip_vs_add_dest(svc, &udest); 3509 break; 3510 case IP_VS_SO_SET_EDITDEST: 3511 ret = ip_vs_edit_dest(svc, &udest); 3512 break; 3513 case IP_VS_SO_SET_DELDEST: 3514 ret = ip_vs_del_dest(svc, &udest); 3515 break; 3516 default: 3517 WARN_ON_ONCE(1); 3518 ret = -EINVAL; 3519 break; 3520 } 3521 3522 out_unlock: 3523 mutex_unlock(&ipvs->service_mutex); 3524 return ret; 3525 } 3526 3527 3528 static void 3529 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) 3530 { 3531 struct ip_vs_scheduler *sched; 3532 struct ip_vs_kstats kstats; 3533 char *sched_name; 3534 3535 sched = rcu_dereference_protected(src->scheduler, 1); 3536 sched_name = sched ? sched->name : "none"; 3537 dst->protocol = src->protocol; 3538 dst->addr = src->addr.ip; 3539 dst->port = src->port; 3540 dst->fwmark = src->fwmark; 3541 strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); 3542 dst->flags = src->flags; 3543 dst->timeout = src->timeout / HZ; 3544 dst->netmask = src->netmask; 3545 dst->num_dests = src->num_dests; 3546 ip_vs_copy_stats(&kstats, &src->stats); 3547 ip_vs_export_stats_user(&dst->stats, &kstats); 3548 } 3549 3550 static inline int 3551 __ip_vs_get_service_entries(struct netns_ipvs *ipvs, 3552 const struct ip_vs_get_services *get, 3553 struct ip_vs_get_services __user *uptr) 3554 { 3555 struct ip_vs_service_entry entry; 3556 DECLARE_IP_VS_RHT_WALK_BUCKETS(); 3557 struct hlist_bl_head *head; 3558 struct ip_vs_service *svc; 3559 struct hlist_bl_node *e; 3560 int count = 0; 3561 int ret = 0; 3562 3563 lockdep_assert_held(&ipvs->svc_resize_sem); 3564 /* All svc_table modifications are disabled, go ahead */ 3565 ip_vs_rht_walk_buckets(ipvs->svc_table, head) { 3566 hlist_bl_for_each_entry(svc, e, head, s_list) { 3567 /* Only expose IPv4 entries to old interface */ 3568 if (svc->af != AF_INET) 3569 continue; 3570 3571 if (count >= get->num_services) 3572 goto out; 3573 memset(&entry, 0, sizeof(entry)); 3574 ip_vs_copy_service(&entry, svc); 3575 if (copy_to_user(&uptr->entrytable[count], 3576 &entry, sizeof(entry))) { 3577 ret = -EFAULT; 3578 goto out; 3579 } 3580 count++; 3581 } 3582 } 3583 3584 out: 3585 return ret; 3586 } 3587 3588 static inline int 3589 __ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get, 3590 struct ip_vs_get_dests __user *uptr) 3591 { 3592 struct ip_vs_service *svc; 3593 union nf_inet_addr addr = { .ip = get->addr }; 3594 int ret = 0; 3595 3596 rcu_read_lock(); 3597 if (get->fwmark) 3598 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark); 3599 else 3600 svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr, 3601 get->port); 3602 rcu_read_unlock(); 3603 3604 if (svc) { 3605 int count = 0; 3606 struct ip_vs_dest *dest; 3607 struct ip_vs_dest_entry entry; 3608 struct ip_vs_kstats kstats; 3609 3610 memset(&entry, 0, sizeof(entry)); 3611 list_for_each_entry(dest, &svc->destinations, n_list) { 3612 if (count >= get->num_dests) 3613 break; 3614 3615 /* Cannot expose heterogeneous members via sockopt 3616 * interface 3617 */ 3618 if (dest->af != svc->af) 3619 continue; 3620 3621 entry.addr = dest->addr.ip; 3622 entry.port = dest->port; 3623 entry.conn_flags = atomic_read(&dest->conn_flags); 3624 entry.weight = atomic_read(&dest->weight); 3625 entry.u_threshold = dest->u_threshold; 3626 entry.l_threshold = dest->l_threshold; 3627 entry.activeconns = atomic_read(&dest->activeconns); 3628 entry.inactconns = atomic_read(&dest->inactconns); 3629 entry.persistconns = atomic_read(&dest->persistconns); 3630 ip_vs_copy_stats(&kstats, &dest->stats); 3631 ip_vs_export_stats_user(&entry.stats, &kstats); 3632 if (copy_to_user(&uptr->entrytable[count], 3633 &entry, sizeof(entry))) { 3634 ret = -EFAULT; 3635 break; 3636 } 3637 count++; 3638 } 3639 } else 3640 ret = -ESRCH; 3641 return ret; 3642 } 3643 3644 static inline void 3645 __ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) 3646 { 3647 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) 3648 struct ip_vs_proto_data *pd; 3649 #endif 3650 3651 memset(u, 0, sizeof (*u)); 3652 3653 #ifdef CONFIG_IP_VS_PROTO_TCP 3654 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); 3655 u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; 3656 u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; 3657 #endif 3658 #ifdef CONFIG_IP_VS_PROTO_UDP 3659 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); 3660 u->udp_timeout = 3661 pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; 3662 #endif 3663 } 3664 3665 static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = { 3666 [CMDID(IP_VS_SO_GET_VERSION)] = 64, 3667 [CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo), 3668 [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services), 3669 [CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry), 3670 [CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests), 3671 [CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), 3672 [CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user), 3673 }; 3674 3675 union ip_vs_get_arglen { 3676 char field_IP_VS_SO_GET_VERSION[64]; 3677 struct ip_vs_getinfo field_IP_VS_SO_GET_INFO; 3678 struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES; 3679 struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE; 3680 struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS; 3681 struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT; 3682 struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2]; 3683 }; 3684 3685 #define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen) 3686 3687 static int 3688 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) 3689 { 3690 unsigned char arg[MAX_GET_ARGLEN]; 3691 int ret = 0; 3692 unsigned int copylen; 3693 struct net *net = sock_net(sk); 3694 struct netns_ipvs *ipvs = net_ipvs(net); 3695 3696 BUG_ON(!net); 3697 BUILD_BUG_ON(sizeof(arg) > 255); 3698 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3699 return -EPERM; 3700 3701 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) 3702 return -EINVAL; 3703 3704 copylen = get_arglen[CMDID(cmd)]; 3705 if (*len < (int) copylen) { 3706 IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen); 3707 return -EINVAL; 3708 } 3709 3710 if (copy_from_user(arg, user, copylen) != 0) 3711 return -EFAULT; 3712 /* 3713 * Handle daemons first since it has its own locking 3714 */ 3715 if (cmd == IP_VS_SO_GET_DAEMON) { 3716 struct ip_vs_daemon_user d[2]; 3717 3718 memset(&d, 0, sizeof(d)); 3719 mutex_lock(&ipvs->sync_mutex); 3720 if (ipvs->sync_state & IP_VS_STATE_MASTER) { 3721 d[0].state = IP_VS_STATE_MASTER; 3722 strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, 3723 sizeof(d[0].mcast_ifn)); 3724 d[0].syncid = ipvs->mcfg.syncid; 3725 } 3726 if (ipvs->sync_state & IP_VS_STATE_BACKUP) { 3727 d[1].state = IP_VS_STATE_BACKUP; 3728 strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, 3729 sizeof(d[1].mcast_ifn)); 3730 d[1].syncid = ipvs->bcfg.syncid; 3731 } 3732 if (copy_to_user(user, &d, sizeof(d)) != 0) 3733 ret = -EFAULT; 3734 mutex_unlock(&ipvs->sync_mutex); 3735 return ret; 3736 } 3737 3738 if (cmd == IP_VS_SO_GET_SERVICES) { 3739 struct ip_vs_get_services *get; 3740 size_t size; 3741 3742 get = (struct ip_vs_get_services *)arg; 3743 size = struct_size(get, entrytable, get->num_services); 3744 if (*len != size) { 3745 pr_err("length: %u != %zu\n", *len, size); 3746 return -EINVAL; 3747 } 3748 /* Prevent modifications to the list with services. 3749 * Try reverse locking, so that we do not hold the mutex 3750 * while waiting for semaphore. 3751 */ 3752 while (1) { 3753 ret = down_read_killable(&ipvs->svc_resize_sem); 3754 if (ret < 0) 3755 return ret; 3756 if (mutex_trylock(&ipvs->service_mutex)) 3757 break; 3758 up_read(&ipvs->svc_resize_sem); 3759 cond_resched(); 3760 } 3761 ret = __ip_vs_get_service_entries(ipvs, get, user); 3762 up_read(&ipvs->svc_resize_sem); 3763 mutex_unlock(&ipvs->service_mutex); 3764 return ret; 3765 } 3766 3767 mutex_lock(&ipvs->service_mutex); 3768 switch (cmd) { 3769 case IP_VS_SO_GET_VERSION: 3770 { 3771 char buf[64]; 3772 3773 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", 3774 NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs)); 3775 if (copy_to_user(user, buf, strlen(buf)+1) != 0) { 3776 ret = -EFAULT; 3777 goto out; 3778 } 3779 *len = strlen(buf)+1; 3780 } 3781 break; 3782 3783 case IP_VS_SO_GET_INFO: 3784 { 3785 struct ip_vs_getinfo info; 3786 3787 info.version = IP_VS_VERSION_CODE; 3788 info.size = get_conn_tab_size(ipvs); 3789 info.num_services = 3790 atomic_read(&ipvs->num_services[IP_VS_AF_INET]); 3791 if (copy_to_user(user, &info, sizeof(info)) != 0) 3792 ret = -EFAULT; 3793 } 3794 break; 3795 3796 case IP_VS_SO_GET_SERVICE: 3797 { 3798 struct ip_vs_service_entry *entry; 3799 struct ip_vs_service *svc; 3800 union nf_inet_addr addr; 3801 3802 entry = (struct ip_vs_service_entry *)arg; 3803 addr.ip = entry->addr; 3804 rcu_read_lock(); 3805 if (entry->fwmark) 3806 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark); 3807 else 3808 svc = __ip_vs_service_find(ipvs, AF_INET, 3809 entry->protocol, &addr, 3810 entry->port); 3811 rcu_read_unlock(); 3812 if (svc) { 3813 ip_vs_copy_service(entry, svc); 3814 if (copy_to_user(user, entry, sizeof(*entry)) != 0) 3815 ret = -EFAULT; 3816 } else 3817 ret = -ESRCH; 3818 } 3819 break; 3820 3821 case IP_VS_SO_GET_DESTS: 3822 { 3823 struct ip_vs_get_dests *get; 3824 size_t size; 3825 3826 get = (struct ip_vs_get_dests *)arg; 3827 size = struct_size(get, entrytable, get->num_dests); 3828 if (*len != size) { 3829 pr_err("length: %u != %zu\n", *len, size); 3830 ret = -EINVAL; 3831 goto out; 3832 } 3833 ret = __ip_vs_get_dest_entries(ipvs, get, user); 3834 } 3835 break; 3836 3837 case IP_VS_SO_GET_TIMEOUT: 3838 { 3839 struct ip_vs_timeout_user t; 3840 3841 __ip_vs_get_timeouts(ipvs, &t); 3842 if (copy_to_user(user, &t, sizeof(t)) != 0) 3843 ret = -EFAULT; 3844 } 3845 break; 3846 3847 default: 3848 ret = -EINVAL; 3849 } 3850 3851 out: 3852 mutex_unlock(&ipvs->service_mutex); 3853 return ret; 3854 } 3855 3856 3857 static struct nf_sockopt_ops ip_vs_sockopts = { 3858 .pf = PF_INET, 3859 .set_optmin = IP_VS_BASE_CTL, 3860 .set_optmax = IP_VS_SO_SET_MAX+1, 3861 .set = do_ip_vs_set_ctl, 3862 .get_optmin = IP_VS_BASE_CTL, 3863 .get_optmax = IP_VS_SO_GET_MAX+1, 3864 .get = do_ip_vs_get_ctl, 3865 .owner = THIS_MODULE, 3866 }; 3867 3868 /* 3869 * Generic Netlink interface 3870 */ 3871 3872 /* IPVS genetlink family */ 3873 static struct genl_family ip_vs_genl_family; 3874 3875 /* Policy used for first-level command attributes */ 3876 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { 3877 [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, 3878 [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, 3879 [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, 3880 [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, 3881 [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, 3882 [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, 3883 }; 3884 3885 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ 3886 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { 3887 [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, 3888 [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, 3889 .len = IP_VS_IFNAME_MAXLEN - 1 }, 3890 [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, 3891 [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, 3892 [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, 3893 [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) }, 3894 [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 }, 3895 [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 }, 3896 }; 3897 3898 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ 3899 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { 3900 [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, 3901 [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, 3902 [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, 3903 .len = sizeof(union nf_inet_addr) }, 3904 [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, 3905 [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, 3906 [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, 3907 .len = IP_VS_SCHEDNAME_MAXLEN - 1 }, 3908 [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, 3909 .len = IP_VS_PENAME_MAXLEN }, 3910 [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, 3911 .len = sizeof(struct ip_vs_flags) }, 3912 [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, 3913 [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, 3914 [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, 3915 }; 3916 3917 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ 3918 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { 3919 [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, 3920 .len = sizeof(union nf_inet_addr) }, 3921 [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, 3922 [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, 3923 [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, 3924 [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, 3925 [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, 3926 [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, 3927 [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, 3928 [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, 3929 [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, 3930 [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, 3931 [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, 3932 [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, 3933 [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, 3934 }; 3935 3936 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, 3937 struct ip_vs_kstats *kstats) 3938 { 3939 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); 3940 3941 if (!nl_stats) 3942 return -EMSGSIZE; 3943 3944 if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || 3945 nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || 3946 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || 3947 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, 3948 IPVS_STATS_ATTR_PAD) || 3949 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, 3950 IPVS_STATS_ATTR_PAD) || 3951 nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || 3952 nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || 3953 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || 3954 nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) || 3955 nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps)) 3956 goto nla_put_failure; 3957 nla_nest_end(skb, nl_stats); 3958 3959 return 0; 3960 3961 nla_put_failure: 3962 nla_nest_cancel(skb, nl_stats); 3963 return -EMSGSIZE; 3964 } 3965 3966 static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, 3967 struct ip_vs_kstats *kstats) 3968 { 3969 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); 3970 3971 if (!nl_stats) 3972 return -EMSGSIZE; 3973 3974 if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns, 3975 IPVS_STATS_ATTR_PAD) || 3976 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts, 3977 IPVS_STATS_ATTR_PAD) || 3978 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts, 3979 IPVS_STATS_ATTR_PAD) || 3980 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, 3981 IPVS_STATS_ATTR_PAD) || 3982 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, 3983 IPVS_STATS_ATTR_PAD) || 3984 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps, 3985 IPVS_STATS_ATTR_PAD) || 3986 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps, 3987 IPVS_STATS_ATTR_PAD) || 3988 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps, 3989 IPVS_STATS_ATTR_PAD) || 3990 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps, 3991 IPVS_STATS_ATTR_PAD) || 3992 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps, 3993 IPVS_STATS_ATTR_PAD)) 3994 goto nla_put_failure; 3995 nla_nest_end(skb, nl_stats); 3996 3997 return 0; 3998 3999 nla_put_failure: 4000 nla_nest_cancel(skb, nl_stats); 4001 return -EMSGSIZE; 4002 } 4003 4004 static int ip_vs_genl_fill_service(struct sk_buff *skb, 4005 struct ip_vs_service *svc) 4006 { 4007 struct ip_vs_scheduler *sched; 4008 struct ip_vs_pe *pe; 4009 struct nlattr *nl_service; 4010 struct ip_vs_flags flags = { .flags = svc->flags, 4011 .mask = ~0 }; 4012 struct ip_vs_kstats kstats; 4013 char *sched_name; 4014 4015 nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE); 4016 if (!nl_service) 4017 return -EMSGSIZE; 4018 4019 if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) 4020 goto nla_put_failure; 4021 if (svc->fwmark) { 4022 if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) 4023 goto nla_put_failure; 4024 } else { 4025 if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || 4026 nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || 4027 nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) 4028 goto nla_put_failure; 4029 } 4030 4031 sched = rcu_dereference(svc->scheduler); 4032 sched_name = sched ? sched->name : "none"; 4033 pe = rcu_dereference(svc->pe); 4034 if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) || 4035 (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || 4036 nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || 4037 nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || 4038 nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) 4039 goto nla_put_failure; 4040 ip_vs_copy_stats(&kstats, &svc->stats); 4041 if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats)) 4042 goto nla_put_failure; 4043 if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats)) 4044 goto nla_put_failure; 4045 4046 nla_nest_end(skb, nl_service); 4047 4048 return 0; 4049 4050 nla_put_failure: 4051 nla_nest_cancel(skb, nl_service); 4052 return -EMSGSIZE; 4053 } 4054 4055 static int ip_vs_genl_dump_service(struct sk_buff *skb, 4056 struct ip_vs_service *svc, 4057 struct netlink_callback *cb) 4058 { 4059 void *hdr; 4060 4061 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 4062 &ip_vs_genl_family, NLM_F_MULTI, 4063 IPVS_CMD_NEW_SERVICE); 4064 if (!hdr) 4065 return -EMSGSIZE; 4066 4067 if (ip_vs_genl_fill_service(skb, svc) < 0) 4068 goto nla_put_failure; 4069 4070 genlmsg_end(skb, hdr); 4071 return 0; 4072 4073 nla_put_failure: 4074 genlmsg_cancel(skb, hdr); 4075 return -EMSGSIZE; 4076 } 4077 4078 static int ip_vs_genl_dump_services(struct sk_buff *skb, 4079 struct netlink_callback *cb) 4080 { 4081 DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); 4082 struct net *net = sock_net(skb->sk); 4083 struct netns_ipvs *ipvs = net_ipvs(net); 4084 struct hlist_bl_head *head; 4085 struct ip_vs_service *svc; 4086 struct hlist_bl_node *e; 4087 int start = cb->args[0]; 4088 int idx = 0; 4089 4090 /* Make sure we do not see same service twice during resize */ 4091 down_read(&ipvs->svc_resize_sem); 4092 rcu_read_lock(); 4093 ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) { 4094 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 4095 if (++idx <= start) 4096 continue; 4097 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { 4098 idx--; 4099 goto nla_put_failure; 4100 } 4101 } 4102 } 4103 4104 nla_put_failure: 4105 rcu_read_unlock(); 4106 up_read(&ipvs->svc_resize_sem); 4107 cb->args[0] = idx; 4108 4109 return skb->len; 4110 } 4111 4112 static bool ip_vs_is_af_valid(int af) 4113 { 4114 if (af == AF_INET) 4115 return true; 4116 #ifdef CONFIG_IP_VS_IPV6 4117 if (af == AF_INET6 && ipv6_mod_enabled()) 4118 return true; 4119 #endif 4120 return false; 4121 } 4122 4123 static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, 4124 struct ip_vs_service_user_kern *usvc, 4125 struct nlattr *nla, bool full_entry, 4126 struct ip_vs_service **ret_svc) 4127 { 4128 struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; 4129 struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; 4130 struct ip_vs_service *svc; 4131 4132 /* Parse mandatory identifying service fields first */ 4133 if (nla == NULL || 4134 nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL)) 4135 return -EINVAL; 4136 4137 nla_af = attrs[IPVS_SVC_ATTR_AF]; 4138 nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; 4139 nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; 4140 nla_port = attrs[IPVS_SVC_ATTR_PORT]; 4141 nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; 4142 4143 if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) 4144 return -EINVAL; 4145 4146 memset(usvc, 0, sizeof(*usvc)); 4147 4148 usvc->af = nla_get_u16(nla_af); 4149 if (!ip_vs_is_af_valid(usvc->af)) 4150 return -EAFNOSUPPORT; 4151 4152 if (nla_fwmark) { 4153 usvc->protocol = IPPROTO_TCP; 4154 usvc->fwmark = nla_get_u32(nla_fwmark); 4155 } else { 4156 usvc->protocol = nla_get_u16(nla_protocol); 4157 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); 4158 usvc->port = nla_get_be16(nla_port); 4159 usvc->fwmark = 0; 4160 } 4161 4162 if (usvc->fwmark) 4163 svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark); 4164 else 4165 svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol, 4166 &usvc->addr, usvc->port); 4167 *ret_svc = svc; 4168 4169 /* If a full entry was requested, check for the additional fields */ 4170 if (full_entry) { 4171 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout, 4172 *nla_netmask; 4173 struct ip_vs_flags flags; 4174 4175 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; 4176 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME]; 4177 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; 4178 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; 4179 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; 4180 4181 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) 4182 return -EINVAL; 4183 4184 nla_memcpy(&flags, nla_flags, sizeof(flags)); 4185 4186 /* prefill flags from service if it already exists */ 4187 if (svc) 4188 usvc->flags = svc->flags; 4189 4190 /* set new flags from userland */ 4191 usvc->flags = (usvc->flags & ~flags.mask) | 4192 (flags.flags & flags.mask); 4193 usvc->sched_name = nla_data(nla_sched); 4194 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; 4195 usvc->timeout = nla_get_u32(nla_timeout); 4196 usvc->netmask = nla_get_be32(nla_netmask); 4197 } 4198 4199 return 0; 4200 } 4201 4202 static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs, 4203 struct nlattr *nla) 4204 { 4205 struct ip_vs_service_user_kern usvc; 4206 struct ip_vs_service *svc; 4207 int ret; 4208 4209 ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc); 4210 return ret ? ERR_PTR(ret) : svc; 4211 } 4212 4213 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) 4214 { 4215 struct nlattr *nl_dest; 4216 struct ip_vs_kstats kstats; 4217 4218 nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST); 4219 if (!nl_dest) 4220 return -EMSGSIZE; 4221 4222 if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || 4223 nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || 4224 nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, 4225 (atomic_read(&dest->conn_flags) & 4226 IP_VS_CONN_F_FWD_MASK)) || 4227 nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, 4228 atomic_read(&dest->weight)) || 4229 nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE, 4230 dest->tun_type) || 4231 nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, 4232 dest->tun_port) || 4233 nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS, 4234 dest->tun_flags) || 4235 nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || 4236 nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || 4237 nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, 4238 atomic_read(&dest->activeconns)) || 4239 nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, 4240 atomic_read(&dest->inactconns)) || 4241 nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, 4242 atomic_read(&dest->persistconns)) || 4243 nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) 4244 goto nla_put_failure; 4245 ip_vs_copy_stats(&kstats, &dest->stats); 4246 if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats)) 4247 goto nla_put_failure; 4248 if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats)) 4249 goto nla_put_failure; 4250 4251 nla_nest_end(skb, nl_dest); 4252 4253 return 0; 4254 4255 nla_put_failure: 4256 nla_nest_cancel(skb, nl_dest); 4257 return -EMSGSIZE; 4258 } 4259 4260 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, 4261 struct netlink_callback *cb) 4262 { 4263 void *hdr; 4264 4265 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 4266 &ip_vs_genl_family, NLM_F_MULTI, 4267 IPVS_CMD_NEW_DEST); 4268 if (!hdr) 4269 return -EMSGSIZE; 4270 4271 if (ip_vs_genl_fill_dest(skb, dest) < 0) 4272 goto nla_put_failure; 4273 4274 genlmsg_end(skb, hdr); 4275 return 0; 4276 4277 nla_put_failure: 4278 genlmsg_cancel(skb, hdr); 4279 return -EMSGSIZE; 4280 } 4281 4282 static int ip_vs_genl_dump_dests(struct sk_buff *skb, 4283 struct netlink_callback *cb) 4284 { 4285 int idx = 0; 4286 int start = cb->args[0]; 4287 struct ip_vs_service *svc; 4288 struct ip_vs_dest *dest; 4289 struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; 4290 struct net *net = sock_net(skb->sk); 4291 struct netns_ipvs *ipvs = net_ipvs(net); 4292 4293 rcu_read_lock(); 4294 4295 /* Try to find the service for which to dump destinations */ 4296 if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack)) 4297 goto out_err; 4298 4299 4300 svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]); 4301 if (IS_ERR_OR_NULL(svc)) 4302 goto out_err; 4303 4304 /* Dump the destinations */ 4305 list_for_each_entry_rcu(dest, &svc->destinations, n_list) { 4306 if (++idx <= start) 4307 continue; 4308 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { 4309 idx--; 4310 goto nla_put_failure; 4311 } 4312 } 4313 4314 nla_put_failure: 4315 cb->args[0] = idx; 4316 4317 out_err: 4318 rcu_read_unlock(); 4319 4320 return skb->len; 4321 } 4322 4323 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, 4324 struct nlattr *nla, bool full_entry) 4325 { 4326 struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; 4327 struct nlattr *nla_addr, *nla_port; 4328 struct nlattr *nla_addr_family; 4329 4330 /* Parse mandatory identifying destination fields first */ 4331 if (nla == NULL || 4332 nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL)) 4333 return -EINVAL; 4334 4335 nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; 4336 nla_port = attrs[IPVS_DEST_ATTR_PORT]; 4337 nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY]; 4338 4339 if (!(nla_addr && nla_port)) 4340 return -EINVAL; 4341 4342 memset(udest, 0, sizeof(*udest)); 4343 4344 nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); 4345 udest->port = nla_get_be16(nla_port); 4346 4347 udest->af = nla_get_u16_default(nla_addr_family, 0); 4348 4349 /* If a full entry was requested, check for the additional fields */ 4350 if (full_entry) { 4351 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, 4352 *nla_l_thresh, *nla_tun_type, *nla_tun_port, 4353 *nla_tun_flags; 4354 4355 nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; 4356 nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; 4357 nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; 4358 nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; 4359 nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; 4360 nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; 4361 nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS]; 4362 4363 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) 4364 return -EINVAL; 4365 4366 udest->conn_flags = nla_get_u32(nla_fwd) 4367 & IP_VS_CONN_F_FWD_MASK; 4368 udest->weight = nla_get_u32(nla_weight); 4369 udest->u_threshold = nla_get_u32(nla_u_thresh); 4370 udest->l_threshold = nla_get_u32(nla_l_thresh); 4371 4372 if (nla_tun_type) 4373 udest->tun_type = nla_get_u8(nla_tun_type); 4374 4375 if (nla_tun_port) 4376 udest->tun_port = nla_get_be16(nla_tun_port); 4377 4378 if (nla_tun_flags) 4379 udest->tun_flags = nla_get_u16(nla_tun_flags); 4380 } 4381 4382 return 0; 4383 } 4384 4385 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, 4386 struct ipvs_sync_daemon_cfg *c) 4387 { 4388 struct nlattr *nl_daemon; 4389 4390 nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON); 4391 if (!nl_daemon) 4392 return -EMSGSIZE; 4393 4394 if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || 4395 nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || 4396 nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || 4397 nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) || 4398 nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) || 4399 nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl)) 4400 goto nla_put_failure; 4401 #ifdef CONFIG_IP_VS_IPV6 4402 if (c->mcast_af == AF_INET6) { 4403 if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6, 4404 &c->mcast_group.in6)) 4405 goto nla_put_failure; 4406 } else 4407 #endif 4408 if (c->mcast_af == AF_INET && 4409 nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP, 4410 c->mcast_group.ip)) 4411 goto nla_put_failure; 4412 nla_nest_end(skb, nl_daemon); 4413 4414 return 0; 4415 4416 nla_put_failure: 4417 nla_nest_cancel(skb, nl_daemon); 4418 return -EMSGSIZE; 4419 } 4420 4421 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, 4422 struct ipvs_sync_daemon_cfg *c, 4423 struct netlink_callback *cb) 4424 { 4425 void *hdr; 4426 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 4427 &ip_vs_genl_family, NLM_F_MULTI, 4428 IPVS_CMD_NEW_DAEMON); 4429 if (!hdr) 4430 return -EMSGSIZE; 4431 4432 if (ip_vs_genl_fill_daemon(skb, state, c)) 4433 goto nla_put_failure; 4434 4435 genlmsg_end(skb, hdr); 4436 return 0; 4437 4438 nla_put_failure: 4439 genlmsg_cancel(skb, hdr); 4440 return -EMSGSIZE; 4441 } 4442 4443 static int ip_vs_genl_dump_daemons(struct sk_buff *skb, 4444 struct netlink_callback *cb) 4445 { 4446 struct net *net = sock_net(skb->sk); 4447 struct netns_ipvs *ipvs = net_ipvs(net); 4448 4449 mutex_lock(&ipvs->sync_mutex); 4450 if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { 4451 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, 4452 &ipvs->mcfg, cb) < 0) 4453 goto nla_put_failure; 4454 4455 cb->args[0] = 1; 4456 } 4457 4458 if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { 4459 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, 4460 &ipvs->bcfg, cb) < 0) 4461 goto nla_put_failure; 4462 4463 cb->args[1] = 1; 4464 } 4465 4466 nla_put_failure: 4467 mutex_unlock(&ipvs->sync_mutex); 4468 4469 return skb->len; 4470 } 4471 4472 static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) 4473 { 4474 struct ipvs_sync_daemon_cfg c; 4475 struct nlattr *a; 4476 int ret; 4477 4478 memset(&c, 0, sizeof(c)); 4479 if (!(attrs[IPVS_DAEMON_ATTR_STATE] && 4480 attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && 4481 attrs[IPVS_DAEMON_ATTR_SYNC_ID])) 4482 return -EINVAL; 4483 strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), 4484 sizeof(c.mcast_ifn)); 4485 c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); 4486 4487 a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN]; 4488 if (a) 4489 c.sync_maxlen = nla_get_u16(a); 4490 4491 a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP]; 4492 if (a) { 4493 c.mcast_af = AF_INET; 4494 c.mcast_group.ip = nla_get_in_addr(a); 4495 if (!ipv4_is_multicast(c.mcast_group.ip)) 4496 return -EINVAL; 4497 } else { 4498 a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6]; 4499 if (a) { 4500 #ifdef CONFIG_IP_VS_IPV6 4501 int addr_type; 4502 4503 c.mcast_af = AF_INET6; 4504 c.mcast_group.in6 = nla_get_in6_addr(a); 4505 addr_type = ipv6_addr_type(&c.mcast_group.in6); 4506 if (!(addr_type & IPV6_ADDR_MULTICAST)) 4507 return -EINVAL; 4508 #else 4509 return -EAFNOSUPPORT; 4510 #endif 4511 } 4512 } 4513 4514 a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT]; 4515 if (a) 4516 c.mcast_port = nla_get_u16(a); 4517 4518 a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL]; 4519 if (a) 4520 c.mcast_ttl = nla_get_u8(a); 4521 4522 /* The synchronization protocol is incompatible with mixed family 4523 * services 4524 */ 4525 if (ipvs->mixed_address_family_dests > 0) 4526 return -EINVAL; 4527 4528 ret = start_sync_thread(ipvs, &c, 4529 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); 4530 return ret; 4531 } 4532 4533 static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) 4534 { 4535 int ret; 4536 4537 if (!attrs[IPVS_DAEMON_ATTR_STATE]) 4538 return -EINVAL; 4539 4540 ret = stop_sync_thread(ipvs, 4541 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); 4542 return ret; 4543 } 4544 4545 static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs) 4546 { 4547 struct ip_vs_timeout_user t; 4548 4549 __ip_vs_get_timeouts(ipvs, &t); 4550 4551 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) 4552 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); 4553 4554 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) 4555 t.tcp_fin_timeout = 4556 nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); 4557 4558 if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) 4559 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); 4560 4561 return ip_vs_set_timeout(ipvs, &t); 4562 } 4563 4564 static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) 4565 { 4566 int ret = -EINVAL, cmd; 4567 struct net *net = sock_net(skb->sk); 4568 struct netns_ipvs *ipvs = net_ipvs(net); 4569 4570 cmd = info->genlhdr->cmd; 4571 4572 if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { 4573 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; 4574 4575 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || 4576 nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack)) 4577 goto out; 4578 4579 if (cmd == IPVS_CMD_NEW_DAEMON) 4580 ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs); 4581 else 4582 ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs); 4583 } 4584 4585 out: 4586 return ret; 4587 } 4588 4589 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) 4590 { 4591 bool need_full_svc = false, need_full_dest = false; 4592 struct ip_vs_service *svc = NULL; 4593 struct ip_vs_service_user_kern usvc; 4594 struct ip_vs_dest_user_kern udest; 4595 int ret = 0, cmd; 4596 struct net *net = sock_net(skb->sk); 4597 struct netns_ipvs *ipvs = net_ipvs(net); 4598 4599 cmd = info->genlhdr->cmd; 4600 4601 mutex_lock(&ipvs->service_mutex); 4602 4603 if (cmd == IPVS_CMD_FLUSH) { 4604 ret = ip_vs_flush(ipvs, false); 4605 goto out; 4606 } else if (cmd == IPVS_CMD_SET_CONFIG) { 4607 ret = ip_vs_genl_set_config(ipvs, info->attrs); 4608 goto out; 4609 } else if (cmd == IPVS_CMD_ZERO && 4610 !info->attrs[IPVS_CMD_ATTR_SERVICE]) { 4611 ret = ip_vs_zero_all(ipvs); 4612 goto out; 4613 } 4614 4615 /* All following commands require a service argument, so check if we 4616 * received a valid one. We need a full service specification when 4617 * adding / editing a service. Only identifying members otherwise. */ 4618 if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) 4619 need_full_svc = true; 4620 4621 /* We use function that requires RCU lock (hlist_bl) */ 4622 rcu_read_lock(); 4623 ret = ip_vs_genl_parse_service(ipvs, &usvc, 4624 info->attrs[IPVS_CMD_ATTR_SERVICE], 4625 need_full_svc, &svc); 4626 rcu_read_unlock(); 4627 if (ret) 4628 goto out; 4629 4630 /* Unless we're adding a new service, the service must already exist */ 4631 if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { 4632 ret = -ESRCH; 4633 goto out; 4634 } 4635 4636 /* Destination commands require a valid destination argument. For 4637 * adding / editing a destination, we need a full destination 4638 * specification. */ 4639 if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || 4640 cmd == IPVS_CMD_DEL_DEST) { 4641 if (cmd != IPVS_CMD_DEL_DEST) 4642 need_full_dest = true; 4643 4644 ret = ip_vs_genl_parse_dest(&udest, 4645 info->attrs[IPVS_CMD_ATTR_DEST], 4646 need_full_dest); 4647 if (ret) 4648 goto out; 4649 4650 /* Old protocols did not allow the user to specify address 4651 * family, so we set it to zero instead. We also didn't 4652 * allow heterogeneous pools in the old code, so it's safe 4653 * to assume that this will have the same address family as 4654 * the service. 4655 */ 4656 if (udest.af == 0) 4657 udest.af = svc->af; 4658 4659 if (!ip_vs_is_af_valid(udest.af)) { 4660 ret = -EAFNOSUPPORT; 4661 goto out; 4662 } 4663 4664 if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) { 4665 /* The synchronization protocol is incompatible 4666 * with mixed family services 4667 */ 4668 if (ipvs->sync_state) { 4669 ret = -EINVAL; 4670 goto out; 4671 } 4672 4673 /* Which connection types do we support? */ 4674 switch (udest.conn_flags) { 4675 case IP_VS_CONN_F_TUNNEL: 4676 /* We are able to forward this */ 4677 break; 4678 default: 4679 ret = -EINVAL; 4680 goto out; 4681 } 4682 } 4683 } 4684 4685 switch (cmd) { 4686 case IPVS_CMD_NEW_SERVICE: 4687 if (svc == NULL) 4688 ret = ip_vs_add_service(ipvs, &usvc, &svc); 4689 else 4690 ret = -EEXIST; 4691 break; 4692 case IPVS_CMD_SET_SERVICE: 4693 ret = ip_vs_edit_service(svc, &usvc); 4694 break; 4695 case IPVS_CMD_DEL_SERVICE: 4696 ret = ip_vs_del_service(svc); 4697 /* do not use svc, it can be freed */ 4698 break; 4699 case IPVS_CMD_NEW_DEST: 4700 ret = ip_vs_add_dest(svc, &udest); 4701 break; 4702 case IPVS_CMD_SET_DEST: 4703 ret = ip_vs_edit_dest(svc, &udest); 4704 break; 4705 case IPVS_CMD_DEL_DEST: 4706 ret = ip_vs_del_dest(svc, &udest); 4707 break; 4708 case IPVS_CMD_ZERO: 4709 ret = ip_vs_zero_service(svc); 4710 break; 4711 default: 4712 ret = -EINVAL; 4713 } 4714 4715 out: 4716 mutex_unlock(&ipvs->service_mutex); 4717 4718 return ret; 4719 } 4720 4721 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) 4722 { 4723 struct sk_buff *msg; 4724 void *reply; 4725 int ret, cmd, reply_cmd; 4726 struct net *net = sock_net(skb->sk); 4727 struct netns_ipvs *ipvs = net_ipvs(net); 4728 4729 cmd = info->genlhdr->cmd; 4730 4731 if (cmd == IPVS_CMD_GET_SERVICE) 4732 reply_cmd = IPVS_CMD_NEW_SERVICE; 4733 else if (cmd == IPVS_CMD_GET_INFO) 4734 reply_cmd = IPVS_CMD_SET_INFO; 4735 else if (cmd == IPVS_CMD_GET_CONFIG) 4736 reply_cmd = IPVS_CMD_SET_CONFIG; 4737 else { 4738 pr_err("unknown Generic Netlink command\n"); 4739 return -EINVAL; 4740 } 4741 4742 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 4743 if (!msg) 4744 return -ENOMEM; 4745 4746 rcu_read_lock(); 4747 4748 reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); 4749 if (reply == NULL) 4750 goto nla_put_failure; 4751 4752 switch (cmd) { 4753 case IPVS_CMD_GET_SERVICE: 4754 { 4755 struct ip_vs_service *svc; 4756 4757 svc = ip_vs_genl_find_service(ipvs, 4758 info->attrs[IPVS_CMD_ATTR_SERVICE]); 4759 if (IS_ERR(svc)) { 4760 ret = PTR_ERR(svc); 4761 goto out_err; 4762 } else if (svc) { 4763 ret = ip_vs_genl_fill_service(msg, svc); 4764 if (ret) 4765 goto nla_put_failure; 4766 } else { 4767 ret = -ESRCH; 4768 goto out_err; 4769 } 4770 4771 break; 4772 } 4773 4774 case IPVS_CMD_GET_CONFIG: 4775 { 4776 struct ip_vs_timeout_user t; 4777 4778 __ip_vs_get_timeouts(ipvs, &t); 4779 #ifdef CONFIG_IP_VS_PROTO_TCP 4780 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, 4781 t.tcp_timeout) || 4782 nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, 4783 t.tcp_fin_timeout)) 4784 goto nla_put_failure; 4785 #endif 4786 #ifdef CONFIG_IP_VS_PROTO_UDP 4787 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout)) 4788 goto nla_put_failure; 4789 #endif 4790 4791 break; 4792 } 4793 4794 case IPVS_CMD_GET_INFO: 4795 if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, 4796 IP_VS_VERSION_CODE) || 4797 nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, 4798 get_conn_tab_size(ipvs))) 4799 goto nla_put_failure; 4800 break; 4801 } 4802 4803 genlmsg_end(msg, reply); 4804 ret = genlmsg_reply(msg, info); 4805 goto out; 4806 4807 nla_put_failure: 4808 pr_err("not enough space in Netlink message\n"); 4809 ret = -EMSGSIZE; 4810 4811 out_err: 4812 nlmsg_free(msg); 4813 out: 4814 rcu_read_unlock(); 4815 4816 return ret; 4817 } 4818 4819 4820 static const struct genl_small_ops ip_vs_genl_ops[] = { 4821 { 4822 .cmd = IPVS_CMD_NEW_SERVICE, 4823 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4824 .flags = GENL_ADMIN_PERM, 4825 .doit = ip_vs_genl_set_cmd, 4826 }, 4827 { 4828 .cmd = IPVS_CMD_SET_SERVICE, 4829 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4830 .flags = GENL_ADMIN_PERM, 4831 .doit = ip_vs_genl_set_cmd, 4832 }, 4833 { 4834 .cmd = IPVS_CMD_DEL_SERVICE, 4835 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4836 .flags = GENL_ADMIN_PERM, 4837 .doit = ip_vs_genl_set_cmd, 4838 }, 4839 { 4840 .cmd = IPVS_CMD_GET_SERVICE, 4841 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4842 .flags = GENL_ADMIN_PERM, 4843 .doit = ip_vs_genl_get_cmd, 4844 .dumpit = ip_vs_genl_dump_services, 4845 }, 4846 { 4847 .cmd = IPVS_CMD_NEW_DEST, 4848 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4849 .flags = GENL_ADMIN_PERM, 4850 .doit = ip_vs_genl_set_cmd, 4851 }, 4852 { 4853 .cmd = IPVS_CMD_SET_DEST, 4854 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4855 .flags = GENL_ADMIN_PERM, 4856 .doit = ip_vs_genl_set_cmd, 4857 }, 4858 { 4859 .cmd = IPVS_CMD_DEL_DEST, 4860 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4861 .flags = GENL_ADMIN_PERM, 4862 .doit = ip_vs_genl_set_cmd, 4863 }, 4864 { 4865 .cmd = IPVS_CMD_GET_DEST, 4866 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4867 .flags = GENL_ADMIN_PERM, 4868 .dumpit = ip_vs_genl_dump_dests, 4869 }, 4870 { 4871 .cmd = IPVS_CMD_NEW_DAEMON, 4872 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4873 .flags = GENL_ADMIN_PERM, 4874 .doit = ip_vs_genl_set_daemon, 4875 }, 4876 { 4877 .cmd = IPVS_CMD_DEL_DAEMON, 4878 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4879 .flags = GENL_ADMIN_PERM, 4880 .doit = ip_vs_genl_set_daemon, 4881 }, 4882 { 4883 .cmd = IPVS_CMD_GET_DAEMON, 4884 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4885 .flags = GENL_ADMIN_PERM, 4886 .dumpit = ip_vs_genl_dump_daemons, 4887 }, 4888 { 4889 .cmd = IPVS_CMD_SET_CONFIG, 4890 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4891 .flags = GENL_ADMIN_PERM, 4892 .doit = ip_vs_genl_set_cmd, 4893 }, 4894 { 4895 .cmd = IPVS_CMD_GET_CONFIG, 4896 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4897 .flags = GENL_ADMIN_PERM, 4898 .doit = ip_vs_genl_get_cmd, 4899 }, 4900 { 4901 .cmd = IPVS_CMD_GET_INFO, 4902 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4903 .flags = GENL_ADMIN_PERM, 4904 .doit = ip_vs_genl_get_cmd, 4905 }, 4906 { 4907 .cmd = IPVS_CMD_ZERO, 4908 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4909 .flags = GENL_ADMIN_PERM, 4910 .doit = ip_vs_genl_set_cmd, 4911 }, 4912 { 4913 .cmd = IPVS_CMD_FLUSH, 4914 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4915 .flags = GENL_ADMIN_PERM, 4916 .doit = ip_vs_genl_set_cmd, 4917 }, 4918 }; 4919 4920 static struct genl_family ip_vs_genl_family __ro_after_init = { 4921 .hdrsize = 0, 4922 .name = IPVS_GENL_NAME, 4923 .version = IPVS_GENL_VERSION, 4924 .maxattr = IPVS_CMD_ATTR_MAX, 4925 .policy = ip_vs_cmd_policy, 4926 .netnsok = true, /* Make ipvsadm to work on netns */ 4927 .module = THIS_MODULE, 4928 .small_ops = ip_vs_genl_ops, 4929 .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops), 4930 .resv_start_op = IPVS_CMD_FLUSH + 1, 4931 .parallel_ops = 1, 4932 }; 4933 4934 static int __init ip_vs_genl_register(void) 4935 { 4936 return genl_register_family(&ip_vs_genl_family); 4937 } 4938 4939 static void ip_vs_genl_unregister(void) 4940 { 4941 genl_unregister_family(&ip_vs_genl_family); 4942 } 4943 4944 /* End of Generic Netlink interface definitions */ 4945 4946 /* 4947 * per netns intit/exit func. 4948 */ 4949 #ifdef CONFIG_SYSCTL 4950 static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) 4951 { 4952 struct net *net = ipvs->net; 4953 struct ctl_table *tbl; 4954 int idx, ret; 4955 size_t ctl_table_size = ARRAY_SIZE(vs_vars); 4956 bool unpriv = net->user_ns != &init_user_ns; 4957 4958 atomic_set(&ipvs->dropentry, 0); 4959 spin_lock_init(&ipvs->dropentry_lock); 4960 spin_lock_init(&ipvs->droppacket_lock); 4961 spin_lock_init(&ipvs->securetcp_lock); 4962 INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); 4963 INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, 4964 expire_nodest_conn_handler); 4965 ipvs->est_stopped = 0; 4966 4967 if (!net_eq(net, &init_net)) { 4968 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); 4969 if (tbl == NULL) 4970 return -ENOMEM; 4971 } else 4972 tbl = vs_vars; 4973 /* Initialize sysctl defaults */ 4974 for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) { 4975 if (tbl[idx].proc_handler == proc_do_defense_mode) 4976 tbl[idx].extra2 = ipvs; 4977 } 4978 idx = 0; 4979 ipvs->sysctl_amemthresh = 1024; 4980 tbl[idx++].data = &ipvs->sysctl_amemthresh; 4981 ipvs->sysctl_am_droprate = 10; 4982 tbl[idx++].data = &ipvs->sysctl_am_droprate; 4983 tbl[idx++].data = &ipvs->sysctl_drop_entry; 4984 tbl[idx++].data = &ipvs->sysctl_drop_packet; 4985 #ifdef CONFIG_IP_VS_NFCT 4986 tbl[idx++].data = &ipvs->sysctl_conntrack; 4987 #endif 4988 tbl[idx++].data = &ipvs->sysctl_secure_tcp; 4989 ipvs->sysctl_snat_reroute = 1; 4990 tbl[idx++].data = &ipvs->sysctl_snat_reroute; 4991 ipvs->sysctl_sync_ver = 1; 4992 tbl[idx++].data = &ipvs->sysctl_sync_ver; 4993 ipvs->sysctl_sync_ports = 1; 4994 tbl[idx++].data = &ipvs->sysctl_sync_ports; 4995 tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; 4996 4997 ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; 4998 if (unpriv) 4999 tbl[idx].mode = 0444; 5000 tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; 5001 5002 ipvs->sysctl_sync_sock_size = 0; 5003 if (unpriv) 5004 tbl[idx].mode = 0444; 5005 tbl[idx++].data = &ipvs->sysctl_sync_sock_size; 5006 5007 tbl[idx++].data = &ipvs->sysctl_cache_bypass; 5008 tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; 5009 tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; 5010 tbl[idx++].data = &ipvs->sysctl_sloppy_sctp; 5011 tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; 5012 ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; 5013 ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; 5014 tbl[idx].data = &ipvs->sysctl_sync_threshold; 5015 tbl[idx].extra2 = ipvs; 5016 tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); 5017 ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; 5018 tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; 5019 ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); 5020 tbl[idx++].data = &ipvs->sysctl_sync_retries; 5021 tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; 5022 ipvs->sysctl_pmtu_disc = 1; 5023 tbl[idx++].data = &ipvs->sysctl_pmtu_disc; 5024 tbl[idx++].data = &ipvs->sysctl_backup_only; 5025 ipvs->sysctl_conn_reuse_mode = 1; 5026 tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; 5027 tbl[idx++].data = &ipvs->sysctl_schedule_icmp; 5028 tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; 5029 5030 ipvs->sysctl_run_estimation = 1; 5031 if (unpriv) 5032 tbl[idx].mode = 0444; 5033 tbl[idx].extra2 = ipvs; 5034 tbl[idx++].data = &ipvs->sysctl_run_estimation; 5035 5036 ipvs->est_cpulist_valid = 0; 5037 if (unpriv) 5038 tbl[idx].mode = 0444; 5039 tbl[idx].extra2 = ipvs; 5040 tbl[idx++].data = &ipvs->sysctl_est_cpulist; 5041 5042 ipvs->sysctl_est_nice = IPVS_EST_NICE; 5043 if (unpriv) 5044 tbl[idx].mode = 0444; 5045 tbl[idx].extra2 = ipvs; 5046 tbl[idx++].data = &ipvs->sysctl_est_nice; 5047 5048 if (unpriv) 5049 tbl[idx].mode = 0444; 5050 tbl[idx].extra2 = ipvs; 5051 tbl[idx++].data = &ipvs->sysctl_conn_lfactor; 5052 5053 if (unpriv) 5054 tbl[idx].mode = 0444; 5055 tbl[idx].extra2 = ipvs; 5056 tbl[idx++].data = &ipvs->sysctl_svc_lfactor; 5057 5058 #ifdef CONFIG_IP_VS_DEBUG 5059 /* Global sysctls must be ro in non-init netns */ 5060 if (!net_eq(net, &init_net)) 5061 tbl[idx++].mode = 0444; 5062 #endif 5063 5064 ret = -ENOMEM; 5065 ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl, 5066 ctl_table_size); 5067 if (!ipvs->sysctl_hdr) 5068 goto err; 5069 ipvs->sysctl_tbl = tbl; 5070 5071 ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); 5072 if (ret < 0) 5073 goto err; 5074 5075 /* Schedule defense work */ 5076 queue_delayed_work(system_long_wq, &ipvs->defense_work, 5077 DEFENSE_TIMER_PERIOD); 5078 5079 return 0; 5080 5081 err: 5082 unregister_net_sysctl_table(ipvs->sysctl_hdr); 5083 if (!net_eq(net, &init_net)) 5084 kfree(tbl); 5085 return ret; 5086 } 5087 5088 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) 5089 { 5090 struct net *net = ipvs->net; 5091 5092 cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work); 5093 cancel_delayed_work_sync(&ipvs->defense_work); 5094 cancel_work_sync(&ipvs->defense_work.work); 5095 unregister_net_sysctl_table(ipvs->sysctl_hdr); 5096 if (ipvs->tot_stats->s.est.ktid != -2) { 5097 /* Not stopped yet? This happens only on netns init error and 5098 * we even do not need to lock the service_mutex for this case. 5099 */ 5100 mutex_lock(&ipvs->service_mutex); 5101 ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); 5102 mutex_unlock(&ipvs->service_mutex); 5103 } 5104 5105 if (ipvs->est_cpulist_valid) 5106 free_cpumask_var(ipvs->sysctl_est_cpulist); 5107 5108 if (!net_eq(net, &init_net)) 5109 kfree(ipvs->sysctl_tbl); 5110 } 5111 5112 #else 5113 5114 static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; } 5115 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { } 5116 5117 #endif 5118 5119 static struct notifier_block ip_vs_dst_notifier = { 5120 .notifier_call = ip_vs_dst_event, 5121 #ifdef CONFIG_IP_VS_IPV6 5122 .priority = ADDRCONF_NOTIFY_PRIORITY + 5, 5123 #endif 5124 }; 5125 5126 int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) 5127 { 5128 int ret = -ENOMEM; 5129 int idx; 5130 5131 /* Initialize service_mutex, svc_table per netns */ 5132 __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); 5133 init_rwsem(&ipvs->svc_resize_sem); 5134 init_rwsem(&ipvs->svc_replace_sem); 5135 INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler); 5136 atomic_set(&ipvs->svc_table_changes, 0); 5137 RCU_INIT_POINTER(ipvs->svc_table, NULL); 5138 5139 /* Initialize rs_table */ 5140 for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) 5141 INIT_HLIST_HEAD(&ipvs->rs_table[idx]); 5142 5143 INIT_LIST_HEAD(&ipvs->dest_trash); 5144 spin_lock_init(&ipvs->dest_trash_lock); 5145 timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0); 5146 for (idx = 0; idx < IP_VS_AF_MAX; idx++) { 5147 atomic_set(&ipvs->num_services[idx], 0); 5148 atomic_set(&ipvs->fwm_services[idx], 0); 5149 atomic_set(&ipvs->nonfwm_services[idx], 0); 5150 atomic_set(&ipvs->ftpsvc_counter[idx], 0); 5151 atomic_set(&ipvs->nullsvc_counter[idx], 0); 5152 atomic_set(&ipvs->conn_out_counter[idx], 0); 5153 } 5154 5155 INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); 5156 ipvs->sysctl_svc_lfactor = ip_vs_svc_default_load_factor(ipvs); 5157 5158 /* procfs stats */ 5159 ipvs->tot_stats = kzalloc_obj(*ipvs->tot_stats); 5160 if (!ipvs->tot_stats) 5161 goto out; 5162 if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0) 5163 goto err_tot_stats; 5164 5165 #ifdef CONFIG_PROC_FS 5166 if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, 5167 &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter))) 5168 goto err_vs; 5169 if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net, 5170 ip_vs_stats_show, NULL)) 5171 goto err_stats; 5172 if (!proc_create_net_single("ip_vs_stats_percpu", 0, 5173 ipvs->net->proc_net, 5174 ip_vs_stats_percpu_show, NULL)) 5175 goto err_percpu; 5176 if (!proc_create_net_single("ip_vs_status", 0440, ipvs->net->proc_net, 5177 ip_vs_status_show, NULL)) 5178 goto err_status; 5179 #endif 5180 5181 ret = ip_vs_control_net_init_sysctl(ipvs); 5182 if (ret < 0) 5183 goto err; 5184 5185 return 0; 5186 5187 err: 5188 #ifdef CONFIG_PROC_FS 5189 remove_proc_entry("ip_vs_status", ipvs->net->proc_net); 5190 5191 err_status: 5192 remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); 5193 5194 err_percpu: 5195 remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); 5196 5197 err_stats: 5198 remove_proc_entry("ip_vs", ipvs->net->proc_net); 5199 5200 err_vs: 5201 #endif 5202 ip_vs_stats_release(&ipvs->tot_stats->s); 5203 5204 err_tot_stats: 5205 kfree(ipvs->tot_stats); 5206 5207 out: 5208 return ret; 5209 } 5210 5211 void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) 5212 { 5213 ip_vs_trash_cleanup(ipvs); 5214 ip_vs_control_net_cleanup_sysctl(ipvs); 5215 cancel_delayed_work_sync(&ipvs->est_reload_work); 5216 #ifdef CONFIG_PROC_FS 5217 remove_proc_entry("ip_vs_status", ipvs->net->proc_net); 5218 remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); 5219 remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); 5220 remove_proc_entry("ip_vs", ipvs->net->proc_net); 5221 #endif 5222 call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free); 5223 } 5224 5225 int __init ip_vs_register_nl_ioctl(void) 5226 { 5227 int ret; 5228 5229 ret = nf_register_sockopt(&ip_vs_sockopts); 5230 if (ret) { 5231 pr_err("cannot register sockopt.\n"); 5232 goto err_sock; 5233 } 5234 5235 ret = ip_vs_genl_register(); 5236 if (ret) { 5237 pr_err("cannot register Generic Netlink interface.\n"); 5238 goto err_genl; 5239 } 5240 return 0; 5241 5242 err_genl: 5243 nf_unregister_sockopt(&ip_vs_sockopts); 5244 err_sock: 5245 return ret; 5246 } 5247 5248 void ip_vs_unregister_nl_ioctl(void) 5249 { 5250 ip_vs_genl_unregister(); 5251 nf_unregister_sockopt(&ip_vs_sockopts); 5252 } 5253 5254 int __init ip_vs_control_init(void) 5255 { 5256 int ret; 5257 5258 ret = register_netdevice_notifier(&ip_vs_dst_notifier); 5259 if (ret < 0) 5260 return ret; 5261 5262 return 0; 5263 } 5264 5265 5266 void ip_vs_control_cleanup(void) 5267 { 5268 unregister_netdevice_notifier(&ip_vs_dst_notifier); 5269 /* relying on common rcu_barrier() in ip_vs_cleanup() */ 5270 } 5271