1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * IPVS An implementation of the IP virtual server support for the 4 * LINUX operating system. IPVS is now implemented as a module 5 * over the NetFilter framework. IPVS can be used to build a 6 * high-performance and highly available server based on a 7 * cluster of servers. 8 * 9 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 10 * Peter Kese <peter.kese@ijs.si> 11 * Julian Anastasov <ja@ssi.bg> 12 * 13 * Changes: 14 */ 15 16 #define pr_fmt(fmt) "IPVS: " fmt 17 18 #include <linux/module.h> 19 #include <linux/init.h> 20 #include <linux/types.h> 21 #include <linux/capability.h> 22 #include <linux/fs.h> 23 #include <linux/sysctl.h> 24 #include <linux/proc_fs.h> 25 #include <linux/workqueue.h> 26 #include <linux/seq_file.h> 27 #include <linux/slab.h> 28 29 #include <linux/netfilter.h> 30 #include <linux/netfilter_ipv4.h> 31 #include <linux/mutex.h> 32 #include <linux/rcupdate_wait.h> 33 34 #include <net/net_namespace.h> 35 #include <linux/nsproxy.h> 36 #include <net/ip.h> 37 #ifdef CONFIG_IP_VS_IPV6 38 #include <net/ipv6.h> 39 #include <net/ip6_route.h> 40 #include <net/netfilter/ipv6/nf_defrag_ipv6.h> 41 #endif 42 #include <net/route.h> 43 #include <net/sock.h> 44 #include <net/genetlink.h> 45 46 #include <linux/uaccess.h> 47 48 #include <net/ip_vs.h> 49 50 MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); 51 52 static struct lock_class_key __ipvs_service_key; 53 54 /* sysctl variables */ 55 56 #ifdef CONFIG_IP_VS_DEBUG 57 static int sysctl_ip_vs_debug_level = 0; 58 59 int ip_vs_get_debug_level(void) 60 { 61 return sysctl_ip_vs_debug_level; 62 } 63 #endif 64 65 66 /* Protos */ 67 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); 68 69 70 #ifdef CONFIG_IP_VS_IPV6 71 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? 
*/ 72 static bool __ip_vs_addr_is_local_v6(struct net *net, 73 const struct in6_addr *addr) 74 { 75 struct flowi6 fl6 = { 76 .daddr = *addr, 77 }; 78 struct dst_entry *dst = ip6_route_output(net, NULL, &fl6); 79 bool is_local; 80 81 is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK); 82 83 dst_release(dst); 84 return is_local; 85 } 86 #endif 87 88 #ifdef CONFIG_SYSCTL 89 /* 90 * update_defense_level is called from keventd and from sysctl, 91 * so it needs to protect itself from softirqs 92 */ 93 static void update_defense_level(struct netns_ipvs *ipvs) 94 { 95 struct sysinfo i; 96 int availmem; 97 int amemthresh; 98 int nomem; 99 int to_change = -1; 100 101 /* we only count free and buffered memory (in pages) */ 102 si_meminfo(&i); 103 availmem = i.freeram + i.bufferram; 104 /* however in linux 2.5 the i.bufferram is total page cache size, 105 we need adjust it */ 106 /* si_swapinfo(&i); */ 107 /* availmem = availmem - (i.totalswap - i.freeswap); */ 108 109 amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0); 110 nomem = (availmem < amemthresh); 111 112 local_bh_disable(); 113 114 /* drop_entry */ 115 spin_lock(&ipvs->dropentry_lock); 116 switch (ipvs->sysctl_drop_entry) { 117 case 0: 118 atomic_set(&ipvs->dropentry, 0); 119 break; 120 case 1: 121 if (nomem) { 122 atomic_set(&ipvs->dropentry, 1); 123 ipvs->sysctl_drop_entry = 2; 124 } else { 125 atomic_set(&ipvs->dropentry, 0); 126 } 127 break; 128 case 2: 129 if (nomem) { 130 atomic_set(&ipvs->dropentry, 1); 131 } else { 132 atomic_set(&ipvs->dropentry, 0); 133 ipvs->sysctl_drop_entry = 1; 134 } 135 break; 136 case 3: 137 atomic_set(&ipvs->dropentry, 1); 138 break; 139 } 140 spin_unlock(&ipvs->dropentry_lock); 141 142 /* drop_packet */ 143 spin_lock(&ipvs->droppacket_lock); 144 switch (ipvs->sysctl_drop_packet) { 145 case 0: 146 ipvs->drop_rate = 0; 147 break; 148 case 1: 149 if (nomem) { 150 ipvs->drop_counter = amemthresh / (amemthresh - availmem); 151 ipvs->drop_rate = 
ipvs->drop_counter; 152 ipvs->sysctl_drop_packet = 2; 153 } else { 154 ipvs->drop_rate = 0; 155 } 156 break; 157 case 2: 158 if (nomem) { 159 ipvs->drop_counter = amemthresh / (amemthresh - availmem); 160 ipvs->drop_rate = ipvs->drop_counter; 161 } else { 162 ipvs->drop_rate = 0; 163 ipvs->sysctl_drop_packet = 1; 164 } 165 break; 166 case 3: 167 ipvs->drop_rate = ipvs->sysctl_am_droprate; 168 break; 169 } 170 spin_unlock(&ipvs->droppacket_lock); 171 172 /* secure_tcp */ 173 spin_lock(&ipvs->securetcp_lock); 174 switch (ipvs->sysctl_secure_tcp) { 175 case 0: 176 if (ipvs->old_secure_tcp >= 2) 177 to_change = 0; 178 break; 179 case 1: 180 if (nomem) { 181 if (ipvs->old_secure_tcp < 2) 182 to_change = 1; 183 ipvs->sysctl_secure_tcp = 2; 184 } else { 185 if (ipvs->old_secure_tcp >= 2) 186 to_change = 0; 187 } 188 break; 189 case 2: 190 if (nomem) { 191 if (ipvs->old_secure_tcp < 2) 192 to_change = 1; 193 } else { 194 if (ipvs->old_secure_tcp >= 2) 195 to_change = 0; 196 ipvs->sysctl_secure_tcp = 1; 197 } 198 break; 199 case 3: 200 if (ipvs->old_secure_tcp < 2) 201 to_change = 1; 202 break; 203 } 204 ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp; 205 if (to_change >= 0) 206 ip_vs_protocol_timeout_change(ipvs, 207 ipvs->sysctl_secure_tcp > 1); 208 spin_unlock(&ipvs->securetcp_lock); 209 210 local_bh_enable(); 211 } 212 213 /* Handler for delayed work for expiring no 214 * destination connections 215 */ 216 static void expire_nodest_conn_handler(struct work_struct *work) 217 { 218 struct netns_ipvs *ipvs; 219 220 ipvs = container_of(work, struct netns_ipvs, 221 expire_nodest_conn_work.work); 222 ip_vs_expire_nodest_conn_flush(ipvs); 223 } 224 225 /* 226 * Timer for checking the defense 227 */ 228 #define DEFENSE_TIMER_PERIOD 1*HZ 229 230 static void defense_work_handler(struct work_struct *work) 231 { 232 struct netns_ipvs *ipvs = 233 container_of(work, struct netns_ipvs, defense_work.work); 234 235 update_defense_level(ipvs); 236 if (atomic_read(&ipvs->dropentry)) 237 
ip_vs_random_dropentry(ipvs); 238 queue_delayed_work(system_long_wq, &ipvs->defense_work, 239 DEFENSE_TIMER_PERIOD); 240 } 241 #endif 242 243 static void est_reload_work_handler(struct work_struct *work) 244 { 245 struct netns_ipvs *ipvs = 246 container_of(work, struct netns_ipvs, est_reload_work.work); 247 int genid_done = atomic_read(&ipvs->est_genid_done); 248 unsigned long delay = HZ / 10; /* repeat startups after failure */ 249 bool repeat = false; 250 int genid; 251 int id; 252 253 mutex_lock(&ipvs->est_mutex); 254 genid = atomic_read(&ipvs->est_genid); 255 for (id = 0; id < ipvs->est_kt_count; id++) { 256 struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; 257 258 /* netns clean up started, abort delayed work */ 259 if (!READ_ONCE(ipvs->enable)) 260 goto unlock; 261 if (!kd) 262 continue; 263 /* New config ? Stop kthread tasks */ 264 if (genid != genid_done) { 265 if (!id) { 266 /* Only we can stop kt 0 but not under mutex */ 267 mutex_unlock(&ipvs->est_mutex); 268 ip_vs_est_kthread_stop(kd); 269 mutex_lock(&ipvs->est_mutex); 270 if (!READ_ONCE(ipvs->enable)) 271 goto unlock; 272 /* kd for kt 0 is never destroyed */ 273 } else { 274 ip_vs_est_kthread_stop(kd); 275 } 276 } 277 if (!kd->task && !ip_vs_est_stopped(ipvs)) { 278 bool start; 279 280 /* Do not start kthreads above 0 in calc phase */ 281 if (id) 282 start = !ipvs->est_calc_phase; 283 else 284 start = kd->needed; 285 if (start && ip_vs_est_kthread_start(ipvs, kd) < 0) 286 repeat = true; 287 } 288 } 289 290 atomic_set(&ipvs->est_genid_done, genid); 291 292 if (repeat) 293 queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 294 delay); 295 296 unlock: 297 mutex_unlock(&ipvs->est_mutex); 298 } 299 300 static int get_conn_tab_size(struct netns_ipvs *ipvs) 301 { 302 const struct ip_vs_rht *t; 303 int size = 0; 304 305 rcu_read_lock(); 306 t = rcu_dereference(ipvs->conn_tab); 307 if (t) 308 size = t->size; 309 rcu_read_unlock(); 310 311 return size; 312 } 313 314 int 315 ip_vs_use_count_inc(void) 
316 { 317 return try_module_get(THIS_MODULE); 318 } 319 320 void 321 ip_vs_use_count_dec(void) 322 { 323 module_put(THIS_MODULE); 324 } 325 326 327 /* Service hashing: 328 * Operation Locking order 329 * --------------------------------------------------------------------------- 330 * add first table service_mutex 331 * attach new table service_mutex 332 * add/del service service_mutex, RCU, bit lock 333 * move between tables (rehash) svc_resize_sem(W), seqcount_t(W), bit lock 334 * replace old with attached svc_resize_sem(W), svc_replace_sem(W) 335 * find service RCU, seqcount_t(R) 336 * walk services(blocking) service_mutex, svc_resize_sem(R) 337 * walk services(non-blocking) RCU, seqcount_t(R) 338 * walk services(non-blocking) svc_resize_sem(R), RCU, seqcount_t(R) 339 * walk services(non-blocking) svc_replace_sem(R), RCU, seqcount_t(R) 340 * del table service_mutex after stopped work 341 * 342 * - new table is attached on resizing under service_mutex and all operations 343 * can run in parallel in 2 tables until the new table is registered as current 344 * one 345 * - two contexts can modify buckets: config and table resize (work), both in 346 * process context 347 * - only table resizer can move entries, so we do not protect t->seqc[] 348 * items with t->lock[] 349 * - lookups occur under RCU lock and seqcount reader lock to detect if 350 * services are moved to new table 351 * - move operations may disturb readers: find operation will not miss entries 352 * but walkers may see same entry twice if they are forced to retry chains 353 * or to walk the newly attached second table 354 * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to check 355 * svc_table_changes and repeat the RCU read section if new table is installed 356 * - walkers may serialize with the whole resizing process (svc_resize_sem) 357 * to prevent seeing same service twice or just with the svc_table 358 * replace (svc_replace_sem) when we can see entries twice but we 359 * prefer to 
run concurrently with the rehashing. 360 */ 361 362 /* 363 * Returns hash value for virtual service 364 */ 365 static inline u32 366 ip_vs_svc_hashval(struct ip_vs_rht *t, int af, unsigned int proto, 367 const union nf_inet_addr *addr, __be16 port) 368 { 369 return ip_vs_rht_hash_linfo(t, af, addr, ntohs(port), proto); 370 } 371 372 /* 373 * Returns hash value of fwmark for virtual service lookup 374 */ 375 static inline u32 ip_vs_svc_fwm_hashval(struct ip_vs_rht *t, int af, 376 __u32 fwmark) 377 { 378 return jhash_2words(fwmark, af, (u32)t->hash_key.key[0]); 379 } 380 381 /* Hashes a service in the svc_table by <proto,addr,port> or by fwmark */ 382 static int ip_vs_svc_hash(struct ip_vs_service *svc) 383 { 384 struct netns_ipvs *ipvs = svc->ipvs; 385 struct hlist_bl_head *head; 386 struct ip_vs_rht *t; 387 u32 hash; 388 389 if (svc->flags & IP_VS_SVC_F_HASHED) { 390 pr_err("%s(): request for already hashed, called from %pS\n", 391 __func__, __builtin_return_address(0)); 392 return 0; 393 } 394 395 /* increase its refcnt because it is referenced by the svc table */ 396 atomic_inc(&svc->refcnt); 397 398 /* We know if new table is attached under service_mutex but rely on 399 * RCU to hold the old table to be freed in resizer 400 */ 401 rcu_read_lock(); 402 403 /* This can be the old or the new table */ 404 t = rcu_dereference(ipvs->svc_table); 405 406 /* New entries go into recent table */ 407 t = rcu_dereference(t->new_tbl); 408 409 if (svc->fwmark == 0) { 410 /* 411 * Hash it by <protocol,addr,port> 412 */ 413 hash = ip_vs_svc_hashval(t, svc->af, svc->protocol, 414 &svc->addr, svc->port); 415 } else { 416 /* 417 * Hash it by fwmark 418 */ 419 hash = ip_vs_svc_fwm_hashval(t, svc->af, svc->fwmark); 420 } 421 head = t->buckets + (hash & t->mask); 422 hlist_bl_lock(head); 423 WRITE_ONCE(svc->hash_key, ip_vs_rht_build_hash_key(t, hash)); 424 svc->flags |= IP_VS_SVC_F_HASHED; 425 hlist_bl_add_head_rcu(&svc->s_list, head); 426 hlist_bl_unlock(head); 427 428 
rcu_read_unlock(); 429 430 return 1; 431 } 432 433 434 /* 435 * Unhashes a service from svc_table. 436 * Should be called with locked tables. 437 */ 438 static int ip_vs_svc_unhash(struct ip_vs_service *svc) 439 { 440 struct netns_ipvs *ipvs = svc->ipvs; 441 struct hlist_bl_head *head; 442 struct ip_vs_rht *t; 443 u32 hash_key2; 444 u32 hash_key; 445 446 if (!(svc->flags & IP_VS_SVC_F_HASHED)) { 447 pr_err("%s(): request for unhash flagged, called from %pS\n", 448 __func__, __builtin_return_address(0)); 449 return 0; 450 } 451 452 /* We know if new table is attached under service_mutex but rely on 453 * RCU to hold the old table to be freed in resizer 454 */ 455 rcu_read_lock(); 456 457 /* This can be the old or the new table */ 458 t = rcu_dereference(ipvs->svc_table); 459 hash_key = READ_ONCE(svc->hash_key); 460 /* We need to lock the bucket in the right table */ 461 if (ip_vs_rht_same_table(t, hash_key)) { 462 head = t->buckets + (hash_key & t->mask); 463 hlist_bl_lock(head); 464 /* Ensure hash_key is read under lock */ 465 hash_key2 = READ_ONCE(svc->hash_key); 466 /* Moved to new table ? */ 467 if (hash_key != hash_key2) { 468 hlist_bl_unlock(head); 469 t = rcu_dereference(t->new_tbl); 470 head = t->buckets + (hash_key2 & t->mask); 471 hlist_bl_lock(head); 472 } 473 } else { 474 /* It is already moved to new table */ 475 t = rcu_dereference(t->new_tbl); 476 head = t->buckets + (hash_key & t->mask); 477 hlist_bl_lock(head); 478 } 479 /* Remove it from svc_table */ 480 hlist_bl_del_rcu(&svc->s_list); 481 482 svc->flags &= ~IP_VS_SVC_F_HASHED; 483 atomic_dec(&svc->refcnt); 484 hlist_bl_unlock(head); 485 486 rcu_read_unlock(); 487 return 1; 488 } 489 490 491 /* 492 * Get service by {netns, proto,addr,port} in the service table. 
493 */ 494 static inline struct ip_vs_service * 495 __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, 496 const union nf_inet_addr *vaddr, __be16 vport) 497 { 498 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 499 struct hlist_bl_head *head; 500 struct ip_vs_service *svc; 501 struct ip_vs_rht *t, *p; 502 struct hlist_bl_node *e; 503 u32 hash, hash_key; 504 505 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) { 506 /* Check for "full" addressed entries */ 507 hash = ip_vs_svc_hashval(t, af, protocol, vaddr, vport); 508 509 hash_key = ip_vs_rht_build_hash_key(t, hash); 510 ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { 511 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 512 if (READ_ONCE(svc->hash_key) == hash_key && 513 svc->af == af && 514 ip_vs_addr_equal(af, &svc->addr, vaddr) && 515 svc->port == vport && 516 svc->protocol == protocol && !svc->fwmark) { 517 /* HIT */ 518 return svc; 519 } 520 } 521 } 522 } 523 524 return NULL; 525 } 526 527 528 /* 529 * Get service by {fwmark} in the service table. 
*/
static inline struct ip_vs_service *
__ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark)
{
	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct ip_vs_rht *t, *p;
	struct hlist_bl_node *e;
	u32 hash, hash_key;

	/* Visit every table reachable from svc_table */
	ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
		/* Check for fwmark addressed entries */
		hash = ip_vs_svc_fwm_hashval(t, af, fwmark);

		hash_key = ip_vs_rht_build_hash_key(t, hash);
		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
			hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
				/* hash_key is compared first, then the
				 * identifying fields
				 */
				if (READ_ONCE(svc->hash_key) == hash_key &&
				    svc->fwmark == fwmark && svc->af == af) {
					/* HIT */
					return svc;
				}
			}
		}
	}

	return NULL;
}

/* Find service, called under RCU lock
 *
 * Lookup order, first match wins:
 *  1. by fwmark, when @fwmark is set and fwmark services exist for the family
 *  2. by the exact <protocol,vaddr,vport>
 *  3. for TCP, by the FTP control port (FTPPORT) on the same address
 *  4. by the catch-all port 0 on the same address
 * Steps 2-4 are skipped when the family has no non-fwmark services.
 * Returns the service or NULL.
 */
struct ip_vs_service *
ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol,
		   const union nf_inet_addr *vaddr, __be16 vport)
{
	struct ip_vs_service *svc = NULL;
	int af_id = ip_vs_af_index(af);

	/*
	 *	Check the table hashed by fwmark first
	 */
	if (fwmark && atomic_read(&ipvs->fwm_services[af_id])) {
		svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark);
		if (svc)
			goto out;
	}

	/* No address based services for this family, nothing more to try */
	if (!atomic_read(&ipvs->nonfwm_services[af_id]))
		goto out;

	/*
	 *	Check the table hashed by <protocol,addr,port>
	 *	for "full" addressed entries
	 */
	svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
	if (svc)
		goto out;

	if (protocol == IPPROTO_TCP &&
	    atomic_read(&ipvs->ftpsvc_counter[af_id]) &&
	    (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) {
		/*
		 * Check if ftp service entry exists, the packet
		 * might belong to FTP data connections.
		 */
		svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT);
		if (svc)
			goto out;
	}

	if (atomic_read(&ipvs->nullsvc_counter[af_id])) {
		/*
		 * Check if the catch-all port (port zero) exists
		 */
		svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0);
	}

  out:
	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
		      fwmark, ip_vs_proto_name(protocol),
		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
		      svc ? "hit" : "not hit");

	return svc;
}

/* Return the number of registered services */
static int ip_vs_get_num_services(struct netns_ipvs *ipvs)
{
	int ns = 0, ni = IP_VS_AF_MAX;

	/* Sum the per address family counters */
	while (--ni >= 0)
		ns += atomic_read(&ipvs->num_services[ni]);
	return ns;
}

/* Get default load factor to map num_services/u_thresh to t->size */
static int ip_vs_svc_default_load_factor(struct netns_ipvs *ipvs)
{
	int factor;

	/* The initial netns gets the lower load threshold */
	if (net_eq(ipvs->net, &init_net))
		factor = -3;	/* grow if load is above 12.5% */
	else
		factor = -2;	/* grow if load is above 25% */
	return factor;
}

/* Get the desired svc_table size */
static int ip_vs_svc_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t,
				  int lfactor)
{
	/* Size is derived from the current number of services, bounded by
	 * IP_VS_SVC_TAB_MIN_BITS and IP_VS_SVC_TAB_MAX_BITS
	 */
	return ip_vs_rht_desired_size(ipvs, t, ip_vs_get_num_services(ipvs),
				      lfactor, IP_VS_SVC_TAB_MIN_BITS,
				      IP_VS_SVC_TAB_MAX_BITS);
}

/* Allocate svc_table
 *
 * Returns a table with @buckets buckets, @lfactor recorded in it and the
 * thresholds set for its size, or NULL when the allocation fails.
 */
static struct ip_vs_rht *ip_vs_svc_table_alloc(struct netns_ipvs *ipvs,
					       int buckets, int lfactor)
{
	struct ip_vs_rht *t;
	int scounts, locks;

	/* No frequent lookups to race with resizing, so use max of 64
	 * seqcounts. Only resizer moves entries, so use 0 locks.
	 */
	scounts = clamp(buckets >> 4, 1, 64);
	locks = 0;

	t = ip_vs_rht_alloc(buckets, scounts, locks);
	if (!t)
		return NULL;
	t->lfactor = lfactor;
	ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_SVC_TAB_MIN_BITS,
				 IP_VS_SVC_TAB_MAX_BITS);
	return t;
}

/* svc_table resizer work
 *
 * Steps:
 *  - try to take svc_resize_sem (write) and service_mutex without blocking;
 *    if either is busy the work is requeued (more_work stays true)
 *  - decide if a resize is needed, allocate the new table and attach it as
 *    t->new_tbl, then release service_mutex so add/del can use the new table
 *  - move the services bucket by bucket from the old table to the new one
 *  - under svc_replace_sem publish the new table in ipvs->svc_table
 *  - wait for RCU readers and free the old table
 *
 * The work stops early when IP_VS_WORK_SVC_NORESIZE is set and is not
 * requeued when ipvs->enable is cleared.
 */
static void svc_resize_work_handler(struct work_struct *work)
{
	struct hlist_bl_head *head, *head2;
	struct ip_vs_rht *t_free = NULL;
	unsigned int resched_score = 0;
	struct hlist_bl_node *cn, *nn;
	struct ip_vs_rht *t, *t_new;
	struct ip_vs_service *svc;
	struct netns_ipvs *ipvs;
	bool more_work = true;
	seqcount_t *sc;
	int limit = 0;
	int new_size;
	int lfactor;
	u32 bucket;

	ipvs = container_of(work, struct netns_ipvs, svc_resize_work.work);

	/* Never block here: on contention fall through to the requeue at out */
	if (!down_write_trylock(&ipvs->svc_resize_sem))
		goto out;
	if (!mutex_trylock(&ipvs->service_mutex))
		goto unlock_sem;
	/* Both locks held: from now on requeue only on allocation failure */
	more_work = false;
	clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags);
	if (!READ_ONCE(ipvs->enable))
		goto unlock_m;
	t = rcu_dereference_protected(ipvs->svc_table, 1);
	/* Do nothing if table is removed */
	if (!t)
		goto unlock_m;
	/* New table already attached? BUG! */
	if (t != rcu_access_pointer(t->new_tbl))
		goto unlock_m;

	lfactor = sysctl_svc_lfactor(ipvs);
	/* Should we resize ? */
	new_size = ip_vs_svc_desired_size(ipvs, t, lfactor);
	if (new_size == t->size && lfactor == t->lfactor)
		goto unlock_m;

	t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
	if (!t_new) {
		/* Out of memory, try again later */
		more_work = true;
		goto unlock_m;
	}
	/* Flip the table_id */
	t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK;

	/* Attach new table */
	rcu_assign_pointer(t->new_tbl, t_new);
	/* Allow add/del to new_tbl while moving from old table */
	mutex_unlock(&ipvs->service_mutex);

	ip_vs_rht_for_each_bucket(t, bucket, head) {
same_bucket:
		/* limit counts visited buckets and moved entries; every 16
		 * of them the stop flag is checked and, after 100
		 * consecutive empty buckets, the CPU is yielded
		 */
		if (++limit >= 16) {
			/* Check if work is stopped */
			if (test_bit(IP_VS_WORK_SVC_NORESIZE,
				     &ipvs->work_flags))
				goto unlock_sem;
			if (resched_score >= 100) {
				resched_score = 0;
				cond_resched();
			}
			limit = 0;
		}
		if (hlist_bl_empty(head)) {
			resched_score++;
			continue;
		}
		/* Preemption calls ahead... */
		resched_score = 0;

		sc = &t->seqc[bucket & t->seqc_mask];
		/* seqcount_t usage considering PREEMPT_RT rules:
		 * - we are the only writer => preemption can be allowed
		 * - readers (SoftIRQ) => disable BHs
		 * - readers (processes) => preemption should be disabled
		 */
		local_bh_disable();
		preempt_disable_nested();
		write_seqcount_begin(sc);
		hlist_bl_lock(head);

		hlist_bl_for_each_entry_safe(svc, cn, nn, head, s_list) {
			u32 hash;

			/* New hash for the new table */
			if (svc->fwmark == 0) {
				/*  Hash it by <protocol,addr,port> */
				hash = ip_vs_svc_hashval(t_new, svc->af,
							 svc->protocol,
							 &svc->addr, svc->port);
			} else {
				/* Hash it by fwmark */
				hash = ip_vs_svc_fwm_hashval(t_new, svc->af,
							     svc->fwmark);
			}
			hlist_bl_del_rcu(&svc->s_list);
			head2 = t_new->buckets + (hash & t_new->mask);

			hlist_bl_lock(head2);
			WRITE_ONCE(svc->hash_key,
				   ip_vs_rht_build_hash_key(t_new, hash));
			/* t_new->seqc are not used at this stage, we race
			 * only with add/del, so only lock the bucket.
			 */
			hlist_bl_add_head_rcu(&svc->s_list, head2);
			hlist_bl_unlock(head2);
			/* Too long chain? Do it in steps */
			if (++limit >= 64)
				break;
		}

		hlist_bl_unlock(head);
		write_seqcount_end(sc);
		preempt_enable_nested();
		local_bh_enable();
		/* Chain was cut short above, continue with the same bucket */
		if (limit >= 64)
			goto same_bucket;
	}

	/* Serialize with readers that don't like svc_table changes */
	down_write(&ipvs->svc_replace_sem);

	/* Check if work is stopped to avoid synchronize_rcu() */
	if (test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		goto unlock_repl;

	rcu_assign_pointer(ipvs->svc_table, t_new);
	/* Inform readers that new table is installed */
	smp_mb__before_atomic();
	atomic_inc(&ipvs->svc_table_changes);
	/* The old table is freed below, after both semaphores are released */
	t_free = t;

unlock_repl:
	up_write(&ipvs->svc_replace_sem);

unlock_sem:
	up_write(&ipvs->svc_resize_sem);

	if (t_free) {
		/* RCU readers should not see more than two tables in chain.
		 * To prevent new table to be attached wait here instead of
		 * freeing the old table in RCU callback.
		 */
		synchronize_rcu();
		ip_vs_rht_free(t_free);
	}

out:
	/* Requeue (after 1 jiffy) only if there is work left and resizing
	 * is still allowed
	 */
	if (!READ_ONCE(ipvs->enable) || !more_work ||
	    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		return;
	queue_delayed_work(system_dfl_long_wq, &ipvs->svc_resize_work, 1);
	return;

unlock_m:
	mutex_unlock(&ipvs->service_mutex);
	goto unlock_sem;
}

/* Bind @dest to @svc; the destination holds a reference on the service */
static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	atomic_inc(&svc->refcnt);
	rcu_assign_pointer(dest->svc, svc);
}

/* Release the stats of @svc and free the service itself */
static void ip_vs_service_free(struct ip_vs_service *svc)
{
	ip_vs_stats_release(&svc->stats);
	kfree(svc);
}

/* RCU callback: free the service embedding @head */
static void ip_vs_service_rcu_free(struct rcu_head *head)
{
	struct ip_vs_service *svc;

	svc = container_of(head, struct ip_vs_service, rcu_head);
	ip_vs_service_free(svc);
}

/* Drop a reference on @svc; the last put frees it via an RCU callback */
static void __ip_vs_svc_put(struct ip_vs_service *svc)
{
	if (atomic_dec_and_test(&svc->refcnt)) {
		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
			      svc->fwmark,
			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
			      ntohs(svc->port));
		call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
	}
}


/*
 *	Returns hash value for real service
 *
 *	The IPv4 address (or the XOR of the four IPv6 address words) is
 *	mixed with the host order port and masked with IP_VS_RTAB_MASK.
 */
static inline unsigned int ip_vs_rs_hashkey(int af,
					    const union nf_inet_addr *addr,
					    __be16 port)
{
	unsigned int porth = ntohs(port);
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif

	return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
		& IP_VS_RTAB_MASK;
}

/* Hash ip_vs_dest in rs_table by <proto,addr,port>.
*/ 886 static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) 887 { 888 unsigned int hash; 889 __be16 port; 890 891 if (dest->in_rs_table) 892 return; 893 894 switch (IP_VS_DFWD_METHOD(dest)) { 895 case IP_VS_CONN_F_MASQ: 896 port = dest->port; 897 break; 898 case IP_VS_CONN_F_TUNNEL: 899 switch (dest->tun_type) { 900 case IP_VS_CONN_F_TUNNEL_TYPE_GUE: 901 port = dest->tun_port; 902 break; 903 case IP_VS_CONN_F_TUNNEL_TYPE_IPIP: 904 case IP_VS_CONN_F_TUNNEL_TYPE_GRE: 905 port = 0; 906 break; 907 default: 908 return; 909 } 910 break; 911 default: 912 return; 913 } 914 915 /* 916 * Hash by proto,addr,port, 917 * which are the parameters of the real service. 918 */ 919 hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port); 920 921 hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); 922 dest->in_rs_table = 1; 923 } 924 925 /* Unhash ip_vs_dest from rs_table. */ 926 static void ip_vs_rs_unhash(struct ip_vs_dest *dest) 927 { 928 /* 929 * Remove it from the rs_table table. 930 */ 931 if (dest->in_rs_table) { 932 hlist_del_rcu(&dest->d_list); 933 dest->in_rs_table = 0; 934 } 935 } 936 937 /* Check if real service by <proto,addr,port> is present */ 938 bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, 939 const union nf_inet_addr *daddr, __be16 dport) 940 { 941 unsigned int hash; 942 struct ip_vs_dest *dest; 943 944 /* Check for "full" addressed entries */ 945 hash = ip_vs_rs_hashkey(af, daddr, dport); 946 947 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { 948 if (dest->port == dport && 949 dest->af == af && 950 ip_vs_addr_equal(af, &dest->addr, daddr) && 951 (dest->protocol == protocol || dest->vfwmark) && 952 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { 953 /* HIT */ 954 return true; 955 } 956 } 957 958 return false; 959 } 960 961 /* Find real service record by <proto,addr,port>. 962 * In case of multiple records with the same <proto,addr,port>, only 963 * the first found record is returned. 
964 * 965 * To be called under RCU lock. 966 */ 967 struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, 968 __u16 protocol, 969 const union nf_inet_addr *daddr, 970 __be16 dport) 971 { 972 unsigned int hash; 973 struct ip_vs_dest *dest; 974 975 /* Check for "full" addressed entries */ 976 hash = ip_vs_rs_hashkey(af, daddr, dport); 977 978 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { 979 if (dest->port == dport && 980 dest->af == af && 981 ip_vs_addr_equal(af, &dest->addr, daddr) && 982 (dest->protocol == protocol || dest->vfwmark) && 983 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { 984 /* HIT */ 985 return dest; 986 } 987 } 988 989 return NULL; 990 } 991 992 /* Find real service record by <af,addr,tun_port>. 993 * In case of multiple records with the same <af,addr,tun_port>, only 994 * the first found record is returned. 995 * 996 * To be called under RCU lock. 997 */ 998 struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, 999 const union nf_inet_addr *daddr, 1000 __be16 tun_port) 1001 { 1002 struct ip_vs_dest *dest; 1003 unsigned int hash; 1004 1005 /* Check for "full" addressed entries */ 1006 hash = ip_vs_rs_hashkey(af, daddr, tun_port); 1007 1008 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { 1009 if (dest->tun_port == tun_port && 1010 dest->af == af && 1011 ip_vs_addr_equal(af, &dest->addr, daddr) && 1012 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) { 1013 /* HIT */ 1014 return dest; 1015 } 1016 } 1017 1018 return NULL; 1019 } 1020 1021 /* Lookup destination by {addr,port} in the given service 1022 * Called under RCU lock. 
1023 */ 1024 static struct ip_vs_dest * 1025 ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af, 1026 const union nf_inet_addr *daddr, __be16 dport) 1027 { 1028 struct ip_vs_dest *dest; 1029 1030 /* 1031 * Find the destination for the given service 1032 */ 1033 list_for_each_entry_rcu(dest, &svc->destinations, n_list) { 1034 if ((dest->af == dest_af) && 1035 ip_vs_addr_equal(dest_af, &dest->addr, daddr) && 1036 (dest->port == dport)) { 1037 /* HIT */ 1038 return dest; 1039 } 1040 } 1041 1042 return NULL; 1043 } 1044 1045 /* 1046 * Find destination by {daddr,dport,vaddr,protocol} 1047 * Created to be used in ip_vs_process_message() in 1048 * the backup synchronization daemon. It finds the 1049 * destination to be bound to the received connection 1050 * on the backup. 1051 * Called under RCU lock, no refcnt is returned. 1052 */ 1053 struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af, 1054 const union nf_inet_addr *daddr, 1055 __be16 dport, 1056 const union nf_inet_addr *vaddr, 1057 __be16 vport, __u16 protocol, __u32 fwmark, 1058 __u32 flags) 1059 { 1060 struct ip_vs_dest *dest; 1061 struct ip_vs_service *svc; 1062 __be16 port = dport; 1063 1064 svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport); 1065 if (!svc) 1066 return NULL; 1067 if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) 1068 port = 0; 1069 dest = ip_vs_lookup_dest(svc, dest_af, daddr, port); 1070 if (!dest) 1071 dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport); 1072 return dest; 1073 } 1074 1075 void ip_vs_dest_dst_rcu_free(struct rcu_head *head) 1076 { 1077 struct ip_vs_dest_dst *dest_dst = container_of(head, 1078 struct ip_vs_dest_dst, 1079 rcu_head); 1080 1081 dst_release(dest_dst->dst_cache); 1082 kfree(dest_dst); 1083 } 1084 1085 /* Release dest_dst and dst_cache for dest in user context */ 1086 static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) 1087 { 1088 struct ip_vs_dest_dst *old; 1089 1090 old = 
rcu_dereference_protected(dest->dest_dst, 1); 1091 if (old) { 1092 RCU_INIT_POINTER(dest->dest_dst, NULL); 1093 call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); 1094 } 1095 } 1096 1097 /* 1098 * Lookup dest by {svc,addr,port} in the destination trash. 1099 * The destination trash is used to hold the destinations that are removed 1100 * from the service table but are still referenced by some conn entries. 1101 * The reason to add the destination trash is when the dest is temporary 1102 * down (either by administrator or by monitor program), the dest can be 1103 * picked back from the trash, the remaining connections to the dest can 1104 * continue, and the counting information of the dest is also useful for 1105 * scheduling. 1106 */ 1107 static struct ip_vs_dest * 1108 ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, 1109 const union nf_inet_addr *daddr, __be16 dport) 1110 { 1111 struct ip_vs_dest *dest; 1112 struct netns_ipvs *ipvs = svc->ipvs; 1113 1114 /* 1115 * Find the destination in trash 1116 */ 1117 spin_lock_bh(&ipvs->dest_trash_lock); 1118 list_for_each_entry(dest, &ipvs->dest_trash, t_list) { 1119 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " 1120 "dest->refcnt=%d\n", 1121 dest->vfwmark, 1122 IP_VS_DBG_ADDR(dest->af, &dest->addr), 1123 ntohs(dest->port), 1124 refcount_read(&dest->refcnt)); 1125 if (dest->af == dest_af && 1126 ip_vs_addr_equal(dest_af, &dest->addr, daddr) && 1127 dest->port == dport && 1128 dest->vfwmark == svc->fwmark && 1129 dest->protocol == svc->protocol && 1130 (svc->fwmark || 1131 (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && 1132 dest->vport == svc->port))) { 1133 /* HIT */ 1134 list_del(&dest->t_list); 1135 goto out; 1136 } 1137 } 1138 1139 dest = NULL; 1140 1141 out: 1142 spin_unlock_bh(&ipvs->dest_trash_lock); 1143 1144 return dest; 1145 } 1146 1147 /* Put destination in trash */ 1148 static void ip_vs_trash_put_dest(struct netns_ipvs *ipvs, 1149 struct ip_vs_dest *dest, unsigned long istart, 
1150 bool cleanup) 1151 { 1152 spin_lock_bh(&ipvs->dest_trash_lock); 1153 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", 1154 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), 1155 refcount_read(&dest->refcnt)); 1156 if (list_empty(&ipvs->dest_trash) && !cleanup) 1157 mod_timer(&ipvs->dest_trash_timer, 1158 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); 1159 /* dest lives in trash with reference */ 1160 list_add(&dest->t_list, &ipvs->dest_trash); 1161 dest->idle_start = istart; 1162 spin_unlock_bh(&ipvs->dest_trash_lock); 1163 } 1164 1165 static void ip_vs_dest_rcu_free(struct rcu_head *head) 1166 { 1167 struct ip_vs_dest *dest; 1168 1169 dest = container_of(head, struct ip_vs_dest, rcu_head); 1170 ip_vs_stats_release(&dest->stats); 1171 ip_vs_dest_put_and_free(dest); 1172 } 1173 1174 static void ip_vs_dest_free(struct ip_vs_dest *dest) 1175 { 1176 struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); 1177 1178 __ip_vs_svc_put(svc); 1179 call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free); 1180 } 1181 1182 /* 1183 * Clean up all the destinations in the trash 1184 * Called by the ip_vs_control_cleanup() 1185 * 1186 * When the ip_vs_control_clearup is activated by ipvs module exit, 1187 * the service tables must have been flushed and all the connections 1188 * are expired, and the refcnt of each destination in the trash must 1189 * be 1, so we simply release them here. 
1190 */ 1191 static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) 1192 { 1193 struct ip_vs_dest *dest, *nxt; 1194 1195 timer_delete_sync(&ipvs->dest_trash_timer); 1196 /* No need to use dest_trash_lock */ 1197 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { 1198 list_del(&dest->t_list); 1199 ip_vs_dest_free(dest); 1200 } 1201 } 1202 1203 static void ip_vs_stats_rcu_free(struct rcu_head *head) 1204 { 1205 struct ip_vs_stats_rcu *rs = container_of(head, 1206 struct ip_vs_stats_rcu, 1207 rcu_head); 1208 1209 ip_vs_stats_release(&rs->s); 1210 kfree(rs); 1211 } 1212 1213 static void 1214 ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) 1215 { 1216 #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c 1217 1218 spin_lock(&src->lock); 1219 1220 IP_VS_SHOW_STATS_COUNTER(conns); 1221 IP_VS_SHOW_STATS_COUNTER(inpkts); 1222 IP_VS_SHOW_STATS_COUNTER(outpkts); 1223 IP_VS_SHOW_STATS_COUNTER(inbytes); 1224 IP_VS_SHOW_STATS_COUNTER(outbytes); 1225 1226 ip_vs_read_estimator(dst, src); 1227 1228 spin_unlock(&src->lock); 1229 } 1230 1231 static void 1232 ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src) 1233 { 1234 dst->conns = (u32)src->conns; 1235 dst->inpkts = (u32)src->inpkts; 1236 dst->outpkts = (u32)src->outpkts; 1237 dst->inbytes = src->inbytes; 1238 dst->outbytes = src->outbytes; 1239 dst->cps = (u32)src->cps; 1240 dst->inpps = (u32)src->inpps; 1241 dst->outpps = (u32)src->outpps; 1242 dst->inbps = (u32)src->inbps; 1243 dst->outbps = (u32)src->outbps; 1244 } 1245 1246 static void 1247 ip_vs_zero_stats(struct ip_vs_stats *stats) 1248 { 1249 spin_lock(&stats->lock); 1250 1251 /* get current counters as zero point, rates are zeroed */ 1252 1253 #define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c 1254 1255 IP_VS_ZERO_STATS_COUNTER(conns); 1256 IP_VS_ZERO_STATS_COUNTER(inpkts); 1257 IP_VS_ZERO_STATS_COUNTER(outpkts); 1258 IP_VS_ZERO_STATS_COUNTER(inbytes); 1259 
IP_VS_ZERO_STATS_COUNTER(outbytes); 1260 1261 ip_vs_zero_estimator(stats); 1262 1263 spin_unlock(&stats->lock); 1264 } 1265 1266 /* Allocate fields after kzalloc */ 1267 int ip_vs_stats_init_alloc(struct ip_vs_stats *s) 1268 { 1269 int i; 1270 1271 spin_lock_init(&s->lock); 1272 s->cpustats = alloc_percpu(struct ip_vs_cpu_stats); 1273 if (!s->cpustats) 1274 return -ENOMEM; 1275 1276 for_each_possible_cpu(i) { 1277 struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i); 1278 1279 u64_stats_init(&cs->syncp); 1280 } 1281 return 0; 1282 } 1283 1284 struct ip_vs_stats *ip_vs_stats_alloc(void) 1285 { 1286 struct ip_vs_stats *s = kzalloc_obj(*s); 1287 1288 if (s && ip_vs_stats_init_alloc(s) >= 0) 1289 return s; 1290 kfree(s); 1291 return NULL; 1292 } 1293 1294 void ip_vs_stats_release(struct ip_vs_stats *stats) 1295 { 1296 free_percpu(stats->cpustats); 1297 } 1298 1299 void ip_vs_stats_free(struct ip_vs_stats *stats) 1300 { 1301 if (stats) { 1302 ip_vs_stats_release(stats); 1303 kfree(stats); 1304 } 1305 } 1306 1307 /* 1308 * Update a destination in the given service 1309 */ 1310 static void 1311 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, 1312 struct ip_vs_dest_user_kern *udest, int add) 1313 { 1314 struct netns_ipvs *ipvs = svc->ipvs; 1315 struct ip_vs_service *old_svc; 1316 struct ip_vs_scheduler *sched; 1317 int conn_flags; 1318 1319 /* We cannot modify an address and change the address family */ 1320 BUG_ON(!add && udest->af != dest->af); 1321 1322 if (add && udest->af != svc->af) 1323 ipvs->mixed_address_family_dests++; 1324 1325 /* keep the last_weight with latest non-0 weight */ 1326 if (add || udest->weight != 0) 1327 atomic_set(&dest->last_weight, udest->weight); 1328 1329 /* set the weight and the flags */ 1330 atomic_set(&dest->weight, udest->weight); 1331 conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; 1332 conn_flags |= IP_VS_CONN_F_INACTIVE; 1333 1334 /* Need to rehash? 
*/ 1335 if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) != 1336 IP_VS_DFWD_METHOD(dest) || 1337 udest->tun_type != dest->tun_type || 1338 udest->tun_port != dest->tun_port) 1339 ip_vs_rs_unhash(dest); 1340 1341 /* set the tunnel info */ 1342 dest->tun_type = udest->tun_type; 1343 dest->tun_port = udest->tun_port; 1344 dest->tun_flags = udest->tun_flags; 1345 1346 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ 1347 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { 1348 conn_flags |= IP_VS_CONN_F_NOOUTPUT; 1349 } else { 1350 /* FTP-NAT requires conntrack for mangling */ 1351 if (svc->port == FTPPORT) 1352 ip_vs_register_conntrack(svc); 1353 } 1354 atomic_set(&dest->conn_flags, conn_flags); 1355 /* Put the real service in rs_table if not present. */ 1356 ip_vs_rs_hash(ipvs, dest); 1357 1358 /* bind the service */ 1359 old_svc = rcu_dereference_protected(dest->svc, 1); 1360 if (!old_svc) { 1361 __ip_vs_bind_svc(dest, svc); 1362 } else { 1363 if (old_svc != svc) { 1364 ip_vs_zero_stats(&dest->stats); 1365 __ip_vs_bind_svc(dest, svc); 1366 __ip_vs_svc_put(old_svc); 1367 } 1368 } 1369 1370 /* set the dest status flags */ 1371 dest->flags |= IP_VS_DEST_F_AVAILABLE; 1372 1373 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) 1374 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 1375 dest->u_threshold = udest->u_threshold; 1376 dest->l_threshold = udest->l_threshold; 1377 1378 dest->af = udest->af; 1379 1380 if (add) { 1381 list_add_rcu(&dest->n_list, &svc->destinations); 1382 svc->num_dests++; 1383 sched = rcu_dereference_protected(svc->scheduler, 1); 1384 if (sched && sched->add_dest) 1385 sched->add_dest(svc, dest); 1386 } else { 1387 spin_lock_bh(&dest->dst_lock); 1388 __ip_vs_dst_cache_reset(dest); 1389 spin_unlock_bh(&dest->dst_lock); 1390 1391 sched = rcu_dereference_protected(svc->scheduler, 1); 1392 if (sched && sched->upd_dest) 1393 sched->upd_dest(svc, dest); 1394 } 1395 } 1396 1397 1398 /* 1399 * Create a destination 
for the given service 1400 */ 1401 static int 1402 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1403 { 1404 struct ip_vs_dest *dest; 1405 unsigned int atype; 1406 int ret; 1407 1408 #ifdef CONFIG_IP_VS_IPV6 1409 if (udest->af == AF_INET6) { 1410 atype = ipv6_addr_type(&udest->addr.in6); 1411 if ((!(atype & IPV6_ADDR_UNICAST) || 1412 atype & IPV6_ADDR_LINKLOCAL) && 1413 !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6)) 1414 return -EINVAL; 1415 1416 ret = nf_defrag_ipv6_enable(svc->ipvs->net); 1417 if (ret) 1418 return ret; 1419 } else 1420 #endif 1421 { 1422 atype = inet_addr_type(svc->ipvs->net, udest->addr.ip); 1423 if (atype != RTN_LOCAL && atype != RTN_UNICAST) 1424 return -EINVAL; 1425 } 1426 1427 dest = kzalloc_obj(struct ip_vs_dest); 1428 if (dest == NULL) 1429 return -ENOMEM; 1430 1431 ret = ip_vs_stats_init_alloc(&dest->stats); 1432 if (ret < 0) 1433 goto err_alloc; 1434 1435 ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); 1436 if (ret < 0) 1437 goto err_stats; 1438 1439 dest->af = udest->af; 1440 dest->protocol = svc->protocol; 1441 dest->vaddr = svc->addr; 1442 dest->vport = svc->port; 1443 dest->vfwmark = svc->fwmark; 1444 ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr); 1445 dest->port = udest->port; 1446 1447 atomic_set(&dest->activeconns, 0); 1448 atomic_set(&dest->inactconns, 0); 1449 atomic_set(&dest->persistconns, 0); 1450 refcount_set(&dest->refcnt, 1); 1451 1452 INIT_HLIST_NODE(&dest->d_list); 1453 spin_lock_init(&dest->dst_lock); 1454 __ip_vs_update_dest(svc, dest, udest, 1); 1455 1456 return 0; 1457 1458 err_stats: 1459 ip_vs_stats_release(&dest->stats); 1460 1461 err_alloc: 1462 kfree(dest); 1463 return ret; 1464 } 1465 1466 1467 /* 1468 * Add a destination into an existing service 1469 */ 1470 static int 1471 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1472 { 1473 struct ip_vs_dest *dest; 1474 union nf_inet_addr daddr; 1475 __be16 dport = udest->port; 
1476 int ret; 1477 1478 if (udest->weight < 0) { 1479 pr_err("%s(): server weight less than zero\n", __func__); 1480 return -ERANGE; 1481 } 1482 1483 if (udest->l_threshold > udest->u_threshold) { 1484 pr_err("%s(): lower threshold is higher than upper threshold\n", 1485 __func__); 1486 return -ERANGE; 1487 } 1488 1489 if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1490 if (udest->tun_port == 0) { 1491 pr_err("%s(): tunnel port is zero\n", __func__); 1492 return -EINVAL; 1493 } 1494 } 1495 1496 ip_vs_addr_copy(udest->af, &daddr, &udest->addr); 1497 1498 /* We use function that requires RCU lock */ 1499 rcu_read_lock(); 1500 dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); 1501 rcu_read_unlock(); 1502 1503 if (dest != NULL) { 1504 IP_VS_DBG(1, "%s(): dest already exists\n", __func__); 1505 return -EEXIST; 1506 } 1507 1508 /* 1509 * Check if the dest already exists in the trash and 1510 * is from the same service 1511 */ 1512 dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport); 1513 1514 if (dest != NULL) { 1515 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " 1516 "dest->refcnt=%d, service %u/%s:%u\n", 1517 IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), 1518 refcount_read(&dest->refcnt), 1519 dest->vfwmark, 1520 IP_VS_DBG_ADDR(svc->af, &dest->vaddr), 1521 ntohs(dest->vport)); 1522 1523 ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); 1524 /* On error put back dest into the trash */ 1525 if (ret < 0) 1526 ip_vs_trash_put_dest(svc->ipvs, dest, dest->idle_start, 1527 false); 1528 else 1529 __ip_vs_update_dest(svc, dest, udest, 1); 1530 } else { 1531 /* 1532 * Allocate and initialize the dest structure 1533 */ 1534 ret = ip_vs_new_dest(svc, udest); 1535 } 1536 1537 return ret; 1538 } 1539 1540 1541 /* 1542 * Edit a destination in the given service 1543 */ 1544 static int 1545 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1546 { 1547 struct ip_vs_dest *dest; 1548 union nf_inet_addr daddr; 1549 __be16 
dport = udest->port; 1550 1551 if (udest->weight < 0) { 1552 pr_err("%s(): server weight less than zero\n", __func__); 1553 return -ERANGE; 1554 } 1555 1556 if (udest->l_threshold > udest->u_threshold) { 1557 pr_err("%s(): lower threshold is higher than upper threshold\n", 1558 __func__); 1559 return -ERANGE; 1560 } 1561 1562 if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1563 if (udest->tun_port == 0) { 1564 pr_err("%s(): tunnel port is zero\n", __func__); 1565 return -EINVAL; 1566 } 1567 } 1568 1569 ip_vs_addr_copy(udest->af, &daddr, &udest->addr); 1570 1571 /* We use function that requires RCU lock */ 1572 rcu_read_lock(); 1573 dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); 1574 rcu_read_unlock(); 1575 1576 if (dest == NULL) { 1577 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); 1578 return -ENOENT; 1579 } 1580 1581 __ip_vs_update_dest(svc, dest, udest, 0); 1582 1583 return 0; 1584 } 1585 1586 /* 1587 * Delete a destination (must be already unlinked from the service) 1588 */ 1589 static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, 1590 bool cleanup) 1591 { 1592 ip_vs_stop_estimator(ipvs, &dest->stats); 1593 1594 /* 1595 * Remove it from the d-linked list with the real services. 1596 */ 1597 ip_vs_rs_unhash(dest); 1598 1599 ip_vs_trash_put_dest(ipvs, dest, 0, cleanup); 1600 1601 /* Queue up delayed work to expire all no destination connections. 1602 * No-op when CONFIG_SYSCTL is disabled. 1603 */ 1604 if (!cleanup) 1605 ip_vs_enqueue_expire_nodest_conns(ipvs); 1606 } 1607 1608 1609 /* 1610 * Unlink a destination from the given service 1611 */ 1612 static void __ip_vs_unlink_dest(struct ip_vs_service *svc, 1613 struct ip_vs_dest *dest, 1614 int svcupd) 1615 { 1616 dest->flags &= ~IP_VS_DEST_F_AVAILABLE; 1617 1618 spin_lock_bh(&dest->dst_lock); 1619 __ip_vs_dst_cache_reset(dest); 1620 spin_unlock_bh(&dest->dst_lock); 1621 1622 /* 1623 * Remove it from the d-linked destination list. 
1624 */ 1625 list_del_rcu(&dest->n_list); 1626 svc->num_dests--; 1627 1628 if (dest->af != svc->af) 1629 svc->ipvs->mixed_address_family_dests--; 1630 1631 if (svcupd) { 1632 struct ip_vs_scheduler *sched; 1633 1634 sched = rcu_dereference_protected(svc->scheduler, 1); 1635 if (sched && sched->del_dest) 1636 sched->del_dest(svc, dest); 1637 } 1638 } 1639 1640 1641 /* 1642 * Delete a destination server in the given service 1643 */ 1644 static int 1645 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1646 { 1647 struct ip_vs_dest *dest; 1648 __be16 dport = udest->port; 1649 1650 /* We use function that requires RCU lock */ 1651 rcu_read_lock(); 1652 dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); 1653 rcu_read_unlock(); 1654 1655 if (dest == NULL) { 1656 IP_VS_DBG(1, "%s(): destination not found!\n", __func__); 1657 return -ENOENT; 1658 } 1659 1660 /* 1661 * Unlink dest from the service 1662 */ 1663 __ip_vs_unlink_dest(svc, dest, 1); 1664 1665 /* 1666 * Delete the destination 1667 */ 1668 __ip_vs_del_dest(svc->ipvs, dest, false); 1669 1670 return 0; 1671 } 1672 1673 static void ip_vs_dest_trash_expire(struct timer_list *t) 1674 { 1675 struct netns_ipvs *ipvs = timer_container_of(ipvs, t, 1676 dest_trash_timer); 1677 struct ip_vs_dest *dest, *next; 1678 unsigned long now = jiffies; 1679 1680 spin_lock(&ipvs->dest_trash_lock); 1681 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { 1682 if (refcount_read(&dest->refcnt) > 1) 1683 continue; 1684 if (dest->idle_start) { 1685 if (time_before(now, dest->idle_start + 1686 IP_VS_DEST_TRASH_PERIOD)) 1687 continue; 1688 } else { 1689 dest->idle_start = max(1UL, now); 1690 continue; 1691 } 1692 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", 1693 dest->vfwmark, 1694 IP_VS_DBG_ADDR(dest->af, &dest->addr), 1695 ntohs(dest->port)); 1696 list_del(&dest->t_list); 1697 ip_vs_dest_free(dest); 1698 } 1699 if (!list_empty(&ipvs->dest_trash)) 1700 
mod_timer(&ipvs->dest_trash_timer, 1701 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); 1702 spin_unlock(&ipvs->dest_trash_lock); 1703 } 1704 1705 /* 1706 * Add a service into the service hash table 1707 */ 1708 static int 1709 ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, 1710 struct ip_vs_service **svc_p) 1711 { 1712 struct ip_vs_scheduler *sched = NULL; 1713 struct ip_vs_rht *tc_new = NULL; 1714 struct ip_vs_rht *t, *t_new = NULL; 1715 int af_id = ip_vs_af_index(u->af); 1716 struct ip_vs_service *svc = NULL; 1717 struct ip_vs_pe *pe = NULL; 1718 int ret_hooks = -1; 1719 int ret = 0; 1720 bool grow; 1721 1722 /* increase the module use count */ 1723 if (!ip_vs_use_count_inc()) 1724 return -ENOPROTOOPT; 1725 1726 /* Lookup the scheduler by 'u->sched_name' */ 1727 if (strcmp(u->sched_name, "none")) { 1728 sched = ip_vs_scheduler_get(u->sched_name); 1729 if (!sched) { 1730 pr_info("Scheduler module ip_vs_%s not found\n", 1731 u->sched_name); 1732 ret = -ENOENT; 1733 goto out_err; 1734 } 1735 } 1736 1737 if (u->pe_name && *u->pe_name) { 1738 pe = ip_vs_pe_getbyname(u->pe_name); 1739 if (pe == NULL) { 1740 pr_info("persistence engine module ip_vs_pe_%s " 1741 "not found\n", u->pe_name); 1742 ret = -ENOENT; 1743 goto out_err; 1744 } 1745 } 1746 1747 #ifdef CONFIG_IP_VS_IPV6 1748 if (u->af == AF_INET6) { 1749 __u32 plen = (__force __u32) u->netmask; 1750 1751 if (plen < 1 || plen > 128) { 1752 ret = -EINVAL; 1753 goto out_err; 1754 } 1755 1756 ret = nf_defrag_ipv6_enable(ipvs->net); 1757 if (ret) 1758 goto out_err; 1759 } 1760 #endif 1761 1762 /* The old table can be freed, protect it with RCU */ 1763 rcu_read_lock(); 1764 t = rcu_dereference(ipvs->svc_table); 1765 if (!t) { 1766 int lfactor = sysctl_svc_lfactor(ipvs); 1767 int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor); 1768 1769 rcu_read_unlock(); 1770 t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); 1771 if (!t_new) { 1772 ret = -ENOMEM; 1773 goto out_err; 1774 } 1775 
grow = false; 1776 } else { 1777 /* Even the currently attached new table may need to grow */ 1778 t = rcu_dereference(t->new_tbl); 1779 grow = ip_vs_get_num_services(ipvs) + 1 > t->u_thresh; 1780 rcu_read_unlock(); 1781 } 1782 1783 if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { 1784 int lfactor = sysctl_conn_lfactor(ipvs); 1785 int new_size = ip_vs_conn_desired_size(ipvs, NULL, lfactor); 1786 1787 tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); 1788 if (!tc_new) { 1789 ret = -ENOMEM; 1790 goto out_err; 1791 } 1792 } 1793 1794 if (!atomic_read(&ipvs->num_services[af_id])) { 1795 ret = ip_vs_register_hooks(ipvs, u->af); 1796 if (ret < 0) 1797 goto out_err; 1798 ret_hooks = ret; 1799 } 1800 1801 svc = kzalloc_obj(struct ip_vs_service); 1802 if (svc == NULL) { 1803 IP_VS_DBG(1, "%s(): no memory\n", __func__); 1804 ret = -ENOMEM; 1805 goto out_err; 1806 } 1807 ret = ip_vs_stats_init_alloc(&svc->stats); 1808 if (ret < 0) 1809 goto out_err; 1810 1811 /* I'm the first user of the service */ 1812 atomic_set(&svc->refcnt, 0); 1813 1814 svc->af = u->af; 1815 svc->protocol = u->protocol; 1816 ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); 1817 svc->port = u->port; 1818 svc->fwmark = u->fwmark; 1819 svc->flags = u->flags & ~IP_VS_SVC_F_HASHED; 1820 svc->timeout = u->timeout * HZ; 1821 svc->netmask = u->netmask; 1822 svc->ipvs = ipvs; 1823 1824 INIT_LIST_HEAD(&svc->destinations); 1825 spin_lock_init(&svc->sched_lock); 1826 1827 /* Bind the scheduler */ 1828 if (sched) { 1829 ret = ip_vs_bind_scheduler(svc, sched); 1830 if (ret) 1831 goto out_err; 1832 } 1833 1834 ret = ip_vs_start_estimator(ipvs, &svc->stats); 1835 if (ret < 0) 1836 goto out_err; 1837 1838 if (t_new) { 1839 /* Add table for first time */ 1840 clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); 1841 rcu_assign_pointer(ipvs->svc_table, t_new); 1842 t_new = NULL; 1843 } 1844 if (tc_new) { 1845 rcu_assign_pointer(ipvs->conn_tab, tc_new); 1846 tc_new = NULL; 1847 } 1848 1849 /* Update the virtual 
service counters */ 1850 if (svc->port == FTPPORT) 1851 atomic_inc(&ipvs->ftpsvc_counter[af_id]); 1852 else if (!svc->port && !svc->fwmark) 1853 atomic_inc(&ipvs->nullsvc_counter[af_id]); 1854 if (pe && pe->conn_out) 1855 atomic_inc(&ipvs->conn_out_counter[af_id]); 1856 1857 /* Bind the ct retriever */ 1858 RCU_INIT_POINTER(svc->pe, pe); 1859 pe = NULL; 1860 1861 if (svc->fwmark) 1862 atomic_inc(&ipvs->fwm_services[af_id]); 1863 else 1864 atomic_inc(&ipvs->nonfwm_services[af_id]); 1865 atomic_inc(&ipvs->num_services[af_id]); 1866 1867 /* Hash the service into the service table */ 1868 ip_vs_svc_hash(svc); 1869 1870 /* Schedule resize work */ 1871 if (grow && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) 1872 queue_delayed_work(system_dfl_long_wq, &ipvs->svc_resize_work, 1873 1); 1874 1875 *svc_p = svc; 1876 1877 if (!READ_ONCE(ipvs->enable)) { 1878 mutex_lock(&ipvs->est_mutex); 1879 1880 /* Now there is a service - full throttle */ 1881 WRITE_ONCE(ipvs->enable, 1); 1882 1883 ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); 1884 1885 /* Start estimation for first time */ 1886 ip_vs_est_reload_start(ipvs, true); 1887 mutex_unlock(&ipvs->est_mutex); 1888 } 1889 1890 return 0; 1891 1892 1893 out_err: 1894 if (tc_new) 1895 ip_vs_rht_free(tc_new); 1896 if (t_new) 1897 ip_vs_rht_free(t_new); 1898 if (ret_hooks >= 0) 1899 ip_vs_unregister_hooks(ipvs, u->af); 1900 if (svc != NULL) { 1901 ip_vs_unbind_scheduler(svc); 1902 ip_vs_service_free(svc); 1903 } 1904 ip_vs_scheduler_put(sched); 1905 ip_vs_pe_put(pe); 1906 1907 /* decrease the module use count */ 1908 ip_vs_use_count_dec(); 1909 1910 return ret; 1911 } 1912 1913 1914 /* 1915 * Edit a service and bind it with a new scheduler 1916 */ 1917 static int 1918 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) 1919 { 1920 struct ip_vs_scheduler *sched = NULL, *old_sched; 1921 struct ip_vs_pe *pe = NULL, *old_pe = NULL; 1922 int ret = 0; 1923 bool new_pe_conn_out, 
old_pe_conn_out; 1924 struct netns_ipvs *ipvs = svc->ipvs; 1925 int af_id = ip_vs_af_index(svc->af); 1926 1927 /* 1928 * Lookup the scheduler, by 'u->sched_name' 1929 */ 1930 if (strcmp(u->sched_name, "none")) { 1931 sched = ip_vs_scheduler_get(u->sched_name); 1932 if (!sched) { 1933 pr_info("Scheduler module ip_vs_%s not found\n", 1934 u->sched_name); 1935 return -ENOENT; 1936 } 1937 } 1938 old_sched = sched; 1939 1940 if (u->pe_name && *u->pe_name) { 1941 pe = ip_vs_pe_getbyname(u->pe_name); 1942 if (pe == NULL) { 1943 pr_info("persistence engine module ip_vs_pe_%s " 1944 "not found\n", u->pe_name); 1945 ret = -ENOENT; 1946 goto out; 1947 } 1948 old_pe = pe; 1949 } 1950 1951 #ifdef CONFIG_IP_VS_IPV6 1952 if (u->af == AF_INET6) { 1953 __u32 plen = (__force __u32) u->netmask; 1954 1955 if (plen < 1 || plen > 128) { 1956 ret = -EINVAL; 1957 goto out; 1958 } 1959 } 1960 #endif 1961 1962 old_sched = rcu_dereference_protected(svc->scheduler, 1); 1963 if (sched != old_sched) { 1964 if (old_sched) { 1965 ip_vs_unbind_scheduler(svc); 1966 /* Wait all svc->scheduler/sched_data users */ 1967 synchronize_rcu(); 1968 } 1969 /* Bind the new scheduler */ 1970 if (sched) { 1971 ret = ip_vs_bind_scheduler(svc, sched); 1972 if (ret) { 1973 ip_vs_scheduler_put(sched); 1974 /* Try to restore the old_sched */ 1975 if (old_sched && 1976 !ip_vs_bind_scheduler(svc, old_sched)) 1977 old_sched = NULL; 1978 goto out; 1979 } 1980 } 1981 } 1982 1983 /* 1984 * Set the flags and timeout value 1985 */ 1986 svc->flags = u->flags | IP_VS_SVC_F_HASHED; 1987 svc->timeout = u->timeout * HZ; 1988 svc->netmask = u->netmask; 1989 1990 old_pe = rcu_dereference_protected(svc->pe, 1); 1991 if (pe != old_pe) { 1992 rcu_assign_pointer(svc->pe, pe); 1993 /* check for optional methods in new pe */ 1994 new_pe_conn_out = (pe && pe->conn_out) ? true : false; 1995 old_pe_conn_out = (old_pe && old_pe->conn_out) ? 
true : false; 1996 if (new_pe_conn_out && !old_pe_conn_out) 1997 atomic_inc(&ipvs->conn_out_counter[af_id]); 1998 if (old_pe_conn_out && !new_pe_conn_out) 1999 atomic_dec(&ipvs->conn_out_counter[af_id]); 2000 } 2001 2002 out: 2003 ip_vs_scheduler_put(old_sched); 2004 ip_vs_pe_put(old_pe); 2005 return ret; 2006 } 2007 2008 /* 2009 * Delete a service from the service list 2010 * - The service must be unlinked, unlocked and not referenced! 2011 * - We are called under _bh lock 2012 */ 2013 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) 2014 { 2015 struct ip_vs_dest *dest, *nxt; 2016 struct ip_vs_scheduler *old_sched; 2017 struct ip_vs_pe *old_pe; 2018 struct netns_ipvs *ipvs = svc->ipvs; 2019 int af_id = ip_vs_af_index(svc->af); 2020 2021 atomic_dec(&ipvs->num_services[af_id]); 2022 if (!atomic_read(&ipvs->num_services[af_id])) 2023 ip_vs_unregister_hooks(ipvs, svc->af); 2024 if (svc->fwmark) 2025 atomic_dec(&ipvs->fwm_services[af_id]); 2026 else 2027 atomic_dec(&ipvs->nonfwm_services[af_id]); 2028 2029 ip_vs_stop_estimator(svc->ipvs, &svc->stats); 2030 2031 /* Unbind scheduler */ 2032 old_sched = rcu_dereference_protected(svc->scheduler, 1); 2033 ip_vs_unbind_scheduler(svc); 2034 ip_vs_scheduler_put(old_sched); 2035 2036 /* Unbind persistence engine, keep svc->pe */ 2037 old_pe = rcu_dereference_protected(svc->pe, 1); 2038 if (old_pe && old_pe->conn_out) 2039 atomic_dec(&ipvs->conn_out_counter[af_id]); 2040 ip_vs_pe_put(old_pe); 2041 2042 /* 2043 * Unlink the whole destination list 2044 */ 2045 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { 2046 __ip_vs_unlink_dest(svc, dest, 0); 2047 __ip_vs_del_dest(svc->ipvs, dest, cleanup); 2048 } 2049 2050 /* 2051 * Update the virtual service counters 2052 */ 2053 if (svc->port == FTPPORT) 2054 atomic_dec(&ipvs->ftpsvc_counter[af_id]); 2055 else if (!svc->port && !svc->fwmark) 2056 atomic_dec(&ipvs->nullsvc_counter[af_id]); 2057 2058 /* 2059 * Free the service if nobody refers to it 
2060 */ 2061 __ip_vs_svc_put(svc); 2062 2063 /* decrease the module use count */ 2064 ip_vs_use_count_dec(); 2065 } 2066 2067 /* 2068 * Unlink a service from list and try to delete it if its refcnt reached 0 2069 */ 2070 static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) 2071 { 2072 ip_vs_unregister_conntrack(svc); 2073 /* Hold svc to avoid double release from dest_trash */ 2074 atomic_inc(&svc->refcnt); 2075 /* 2076 * Unhash it from the service table 2077 */ 2078 ip_vs_svc_unhash(svc); 2079 2080 __ip_vs_del_service(svc, cleanup); 2081 } 2082 2083 /* 2084 * Delete a service from the service list 2085 */ 2086 static int ip_vs_del_service(struct ip_vs_service *svc) 2087 { 2088 struct netns_ipvs *ipvs; 2089 struct ip_vs_rht *t, *p; 2090 int ns; 2091 2092 if (svc == NULL) 2093 return -EEXIST; 2094 ipvs = svc->ipvs; 2095 ip_vs_unlink_service(svc, false); 2096 2097 /* Drop the table if no more services */ 2098 ns = ip_vs_get_num_services(ipvs); 2099 if (!ns) { 2100 /* Stop the resizer and drop the tables */ 2101 set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); 2102 cancel_delayed_work_sync(&ipvs->svc_resize_work); 2103 t = rcu_dereference_protected(ipvs->svc_table, 1); 2104 if (t) { 2105 rcu_assign_pointer(ipvs->svc_table, NULL); 2106 /* Inform readers that table is removed */ 2107 smp_mb__before_atomic(); 2108 atomic_inc(&ipvs->svc_table_changes); 2109 while (1) { 2110 p = rcu_dereference_protected(t->new_tbl, 1); 2111 call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); 2112 if (p == t) 2113 break; 2114 t = p; 2115 } 2116 } 2117 } else { 2118 bool shrink; 2119 2120 rcu_read_lock(); 2121 t = rcu_dereference(ipvs->svc_table); 2122 /* Even the currently attached new table may need to shrink */ 2123 t = rcu_dereference(t->new_tbl); 2124 shrink = ns <= t->l_thresh; 2125 rcu_read_unlock(); 2126 if (shrink && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, 2127 &ipvs->work_flags)) 2128 queue_delayed_work(system_dfl_long_wq, 2129 &ipvs->svc_resize_work, 1); 2130 } 
2131 return 0; 2132 } 2133 2134 2135 /* 2136 * Flush all the virtual services 2137 */ 2138 static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) 2139 { 2140 DECLARE_IP_VS_RHT_WALK_BUCKETS(); 2141 struct hlist_bl_head *head; 2142 struct ip_vs_service *svc; 2143 struct hlist_bl_node *ne; 2144 struct hlist_bl_node *e; 2145 struct ip_vs_rht *t, *p; 2146 2147 /* Stop the resizer and drop the tables */ 2148 if (!test_and_set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) 2149 cancel_delayed_work_sync(&ipvs->svc_resize_work); 2150 /* No resizer, so now we have exclusive write access */ 2151 2152 if (ip_vs_get_num_services(ipvs)) { 2153 ip_vs_rht_walk_buckets(ipvs->svc_table, head) { 2154 hlist_bl_for_each_entry_safe(svc, e, ne, head, s_list) 2155 ip_vs_unlink_service(svc, cleanup); 2156 } 2157 } 2158 2159 /* Unregister the hash table and release it after RCU grace period */ 2160 t = rcu_dereference_protected(ipvs->svc_table, 1); 2161 if (t) { 2162 rcu_assign_pointer(ipvs->svc_table, NULL); 2163 /* Inform readers that table is removed */ 2164 smp_mb__before_atomic(); 2165 atomic_inc(&ipvs->svc_table_changes); 2166 while (1) { 2167 p = rcu_dereference_protected(t->new_tbl, 1); 2168 call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); 2169 if (p == t) 2170 break; 2171 t = p; 2172 } 2173 } 2174 /* Stop the tot_stats estimator early under service_mutex 2175 * to avoid locking it again later. 2176 */ 2177 if (cleanup) 2178 ip_vs_stop_estimator_tot_stats(ipvs); 2179 return 0; 2180 } 2181 2182 /* 2183 * Delete service by {netns} in the service table. 
2184 * Called by __ip_vs_batch_cleanup() 2185 */ 2186 void ip_vs_service_nets_cleanup(struct list_head *net_list) 2187 { 2188 struct netns_ipvs *ipvs; 2189 struct net *net; 2190 2191 /* Check for "full" addressed entries */ 2192 list_for_each_entry(net, net_list, exit_list) { 2193 ipvs = net_ipvs(net); 2194 mutex_lock(&ipvs->service_mutex); 2195 ip_vs_flush(ipvs, true); 2196 mutex_unlock(&ipvs->service_mutex); 2197 } 2198 } 2199 2200 /* Put all references for device (dst_cache) */ 2201 static inline void 2202 ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) 2203 { 2204 struct ip_vs_dest_dst *dest_dst; 2205 2206 spin_lock_bh(&dest->dst_lock); 2207 dest_dst = rcu_dereference_protected(dest->dest_dst, 1); 2208 if (dest_dst && dest_dst->dst_cache->dev == dev) { 2209 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", 2210 dev->name, 2211 IP_VS_DBG_ADDR(dest->af, &dest->addr), 2212 ntohs(dest->port), 2213 refcount_read(&dest->refcnt)); 2214 __ip_vs_dst_cache_reset(dest); 2215 } 2216 spin_unlock_bh(&dest->dst_lock); 2217 2218 } 2219 /* Netdev event receiver 2220 * Currently only NETDEV_DOWN is handled to release refs to cached dsts 2221 */ 2222 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, 2223 void *ptr) 2224 { 2225 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 2226 struct net *net = dev_net(dev); 2227 struct netns_ipvs *ipvs = net_ipvs(net); 2228 DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); 2229 unsigned int resched_score = 0; 2230 struct hlist_bl_head *head; 2231 struct ip_vs_service *svc; 2232 struct hlist_bl_node *e; 2233 struct ip_vs_dest *dest; 2234 int old_gen; 2235 2236 if (event != NETDEV_DOWN || !ipvs) 2237 return NOTIFY_DONE; 2238 IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); 2239 2240 /* Allow concurrent rehashing on resize but to avoid loop 2241 * serialize with installing the new table. 
2242 */ 2243 down_read(&ipvs->svc_replace_sem); 2244 2245 old_gen = atomic_read(&ipvs->svc_table_changes); 2246 2247 rcu_read_lock(); 2248 2249 smp_rmb(); /* ipvs->svc_table and svc_table_changes */ 2250 ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { 2251 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 2252 list_for_each_entry_rcu(dest, &svc->destinations, 2253 n_list) { 2254 ip_vs_forget_dev(dest, dev); 2255 resched_score += 10; 2256 } 2257 resched_score++; 2258 } 2259 resched_score++; 2260 if (resched_score >= 100) { 2261 cond_resched_rcu(); 2262 /* Flushed? So no more dev refs */ 2263 if (atomic_read(&ipvs->svc_table_changes) != old_gen) 2264 goto done; 2265 resched_score = 0; 2266 } 2267 } 2268 2269 done: 2270 rcu_read_unlock(); 2271 up_read(&ipvs->svc_replace_sem); 2272 2273 return NOTIFY_DONE; 2274 } 2275 2276 /* 2277 * Zero counters in a service or all services 2278 */ 2279 static int ip_vs_zero_service(struct ip_vs_service *svc) 2280 { 2281 struct ip_vs_dest *dest; 2282 2283 list_for_each_entry(dest, &svc->destinations, n_list) { 2284 ip_vs_zero_stats(&dest->stats); 2285 } 2286 ip_vs_zero_stats(&svc->stats); 2287 return 0; 2288 } 2289 2290 static int ip_vs_zero_all(struct netns_ipvs *ipvs) 2291 { 2292 DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); 2293 unsigned int resched_score = 0; 2294 struct hlist_bl_head *head; 2295 struct ip_vs_service *svc; 2296 struct hlist_bl_node *e; 2297 2298 /* svc_table can not be replaced (svc_replace_sem) or 2299 * removed (service_mutex) 2300 */ 2301 down_read(&ipvs->svc_replace_sem); 2302 rcu_read_lock(); 2303 2304 ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { 2305 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 2306 ip_vs_zero_service(svc); 2307 resched_score += 10; 2308 } 2309 resched_score++; 2310 if (resched_score >= 100) { 2311 resched_score = 0; 2312 cond_resched_rcu(); 2313 } 2314 } 2315 2316 rcu_read_unlock(); 2317 up_read(&ipvs->svc_replace_sem); 2318 2319 ip_vs_zero_stats(&ipvs->tot_stats->s); 2320 
return 0; 2321 } 2322 2323 #ifdef CONFIG_SYSCTL 2324 2325 static int 2326 proc_do_conn_max(const struct ctl_table *table, int write, 2327 void *buffer, size_t *lenp, loff_t *ppos) 2328 { 2329 int *valp = table->data; 2330 /* We can not use *valp to check if new value is provided, use INT_MIN 2331 * for this because different admins change different limits. 2332 */ 2333 int unset = INT_MIN; 2334 int val = write ? unset : READ_ONCE(*valp); 2335 int rc; 2336 2337 const struct ctl_table tmp = { 2338 .data = &val, 2339 .maxlen = sizeof(int), 2340 }; 2341 2342 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 2343 if (write && !rc && val != unset) { 2344 struct netns_ipvs *ipvs = table->extra2; 2345 bool priv = capable(CAP_NET_ADMIN); 2346 int max; 2347 2348 mutex_lock(&ipvs->service_mutex); 2349 /* Unprivileged admins can not go above the hard limit */ 2350 max = priv ? IP_VS_CONN_MAX : ipvs->conn_max_limit; 2351 if (val < 0 || val > max) { 2352 rc = -EINVAL; 2353 } else { 2354 /* Privileged admin changes both limits */ 2355 if (priv) 2356 ipvs->conn_max_limit = val; 2357 WRITE_ONCE(*valp, val); 2358 } 2359 mutex_unlock(&ipvs->service_mutex); 2360 } 2361 return rc; 2362 } 2363 2364 static int 2365 proc_do_defense_mode(const struct ctl_table *table, int write, 2366 void *buffer, size_t *lenp, loff_t *ppos) 2367 { 2368 struct netns_ipvs *ipvs = table->extra2; 2369 int *valp = table->data; 2370 int val = *valp; 2371 int rc; 2372 2373 struct ctl_table tmp = { 2374 .data = &val, 2375 .maxlen = sizeof(int), 2376 .mode = table->mode, 2377 }; 2378 2379 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 2380 if (write && (*valp != val)) { 2381 if (val < 0 || val > 3) { 2382 rc = -EINVAL; 2383 } else { 2384 *valp = val; 2385 update_defense_level(ipvs); 2386 } 2387 } 2388 return rc; 2389 } 2390 2391 static int 2392 proc_do_sync_threshold(const struct ctl_table *table, int write, 2393 void *buffer, size_t *lenp, loff_t *ppos) 2394 { 2395 struct netns_ipvs *ipvs = 
table->extra2; 2396 int *valp = table->data; 2397 int val[2]; 2398 int rc; 2399 struct ctl_table tmp = { 2400 .data = &val, 2401 .maxlen = table->maxlen, 2402 .mode = table->mode, 2403 }; 2404 2405 mutex_lock(&ipvs->sync_mutex); 2406 memcpy(val, valp, sizeof(val)); 2407 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 2408 if (write) { 2409 if (val[0] < 0 || val[1] < 0 || 2410 (val[0] >= val[1] && val[1])) 2411 rc = -EINVAL; 2412 else 2413 memcpy(valp, val, sizeof(val)); 2414 } 2415 mutex_unlock(&ipvs->sync_mutex); 2416 return rc; 2417 } 2418 2419 static int 2420 proc_do_sync_ports(const struct ctl_table *table, int write, 2421 void *buffer, size_t *lenp, loff_t *ppos) 2422 { 2423 int *valp = table->data; 2424 int val = *valp; 2425 int rc; 2426 2427 struct ctl_table tmp = { 2428 .data = &val, 2429 .maxlen = sizeof(int), 2430 .mode = table->mode, 2431 }; 2432 2433 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 2434 if (write && (*valp != val)) { 2435 if (val < 1 || !is_power_of_2(val)) 2436 rc = -EINVAL; 2437 else 2438 *valp = val; 2439 } 2440 return rc; 2441 } 2442 2443 static int ipvs_proc_est_cpumask_set(const struct ctl_table *table, 2444 void *buffer) 2445 { 2446 struct netns_ipvs *ipvs = table->extra2; 2447 cpumask_var_t *valp = table->data; 2448 cpumask_var_t newmask; 2449 int ret; 2450 2451 if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) 2452 return -ENOMEM; 2453 2454 ret = cpulist_parse(buffer, newmask); 2455 if (ret) 2456 goto out; 2457 2458 mutex_lock(&ipvs->est_mutex); 2459 2460 if (!ipvs->est_cpulist_valid) { 2461 if (!zalloc_cpumask_var(valp, GFP_KERNEL)) { 2462 ret = -ENOMEM; 2463 goto unlock; 2464 } 2465 ipvs->est_cpulist_valid = 1; 2466 } 2467 cpumask_and(newmask, newmask, ¤t->cpus_mask); 2468 cpumask_copy(*valp, newmask); 2469 /* est_max_threads may depend on cpulist size */ 2470 ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); 2471 ipvs->est_calc_phase = 1; 2472 ip_vs_est_reload_start(ipvs, true); 2473 2474 unlock: 2475 
mutex_unlock(&ipvs->est_mutex); 2476 2477 out: 2478 free_cpumask_var(newmask); 2479 return ret; 2480 } 2481 2482 static int ipvs_proc_est_cpumask_get(const struct ctl_table *table, 2483 void *buffer, size_t size) 2484 { 2485 struct netns_ipvs *ipvs = table->extra2; 2486 cpumask_var_t *valp = table->data; 2487 struct cpumask *mask; 2488 int ret; 2489 2490 mutex_lock(&ipvs->est_mutex); 2491 2492 /* HK_TYPE_KTHREAD cpumask needs RCU protection */ 2493 scoped_guard(rcu) { 2494 if (ipvs->est_cpulist_valid) 2495 mask = *valp; 2496 else 2497 mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); 2498 ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); 2499 } 2500 2501 mutex_unlock(&ipvs->est_mutex); 2502 2503 return ret; 2504 } 2505 2506 static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write, 2507 void *buffer, size_t *lenp, loff_t *ppos) 2508 { 2509 int ret; 2510 2511 /* Ignore both read and write(append) if *ppos not 0 */ 2512 if (*ppos || !*lenp) { 2513 *lenp = 0; 2514 return 0; 2515 } 2516 if (write) { 2517 /* proc_sys_call_handler() appends terminator */ 2518 ret = ipvs_proc_est_cpumask_set(table, buffer); 2519 if (ret >= 0) 2520 *ppos += *lenp; 2521 } else { 2522 /* proc_sys_call_handler() allocates 1 byte for terminator */ 2523 ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1); 2524 if (ret >= 0) { 2525 *lenp = ret; 2526 *ppos += *lenp; 2527 ret = 0; 2528 } 2529 } 2530 return ret; 2531 } 2532 2533 static int ipvs_proc_est_nice(const struct ctl_table *table, int write, 2534 void *buffer, size_t *lenp, loff_t *ppos) 2535 { 2536 struct netns_ipvs *ipvs = table->extra2; 2537 int *valp = table->data; 2538 int val = *valp; 2539 int ret; 2540 2541 struct ctl_table tmp_table = { 2542 .data = &val, 2543 .maxlen = sizeof(int), 2544 .mode = table->mode, 2545 }; 2546 2547 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2548 if (write && ret >= 0) { 2549 if (val < MIN_NICE || val > MAX_NICE) { 2550 ret = -EINVAL; 2551 
} else { 2552 mutex_lock(&ipvs->est_mutex); 2553 if (*valp != val) { 2554 *valp = val; 2555 ip_vs_est_reload_start(ipvs, true); 2556 } 2557 mutex_unlock(&ipvs->est_mutex); 2558 } 2559 } 2560 return ret; 2561 } 2562 2563 static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, 2564 void *buffer, size_t *lenp, loff_t *ppos) 2565 { 2566 struct netns_ipvs *ipvs = table->extra2; 2567 int *valp = table->data; 2568 int val = *valp; 2569 int ret; 2570 2571 struct ctl_table tmp_table = { 2572 .data = &val, 2573 .maxlen = sizeof(int), 2574 .mode = table->mode, 2575 }; 2576 2577 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2578 if (write && ret >= 0) { 2579 mutex_lock(&ipvs->est_mutex); 2580 if (*valp != val) { 2581 *valp = val; 2582 ip_vs_est_reload_start(ipvs, true); 2583 } 2584 mutex_unlock(&ipvs->est_mutex); 2585 } 2586 return ret; 2587 } 2588 2589 static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write, 2590 void *buffer, size_t *lenp, loff_t *ppos) 2591 { 2592 struct netns_ipvs *ipvs = table->extra2; 2593 int *valp = table->data; 2594 int val = *valp; 2595 int ret; 2596 2597 struct ctl_table tmp_table = { 2598 .data = &val, 2599 .maxlen = sizeof(int), 2600 }; 2601 2602 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2603 if (write && ret >= 0) { 2604 if (val < -8 || val > 8) { 2605 ret = -EINVAL; 2606 } else { 2607 WRITE_ONCE(*valp, val); 2608 if (rcu_access_pointer(ipvs->conn_tab)) 2609 mod_delayed_work(system_dfl_long_wq, 2610 &ipvs->conn_resize_work, 0); 2611 } 2612 } 2613 return ret; 2614 } 2615 2616 static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write, 2617 void *buffer, size_t *lenp, loff_t *ppos) 2618 { 2619 struct netns_ipvs *ipvs = table->extra2; 2620 int *valp = table->data; 2621 int val = *valp; 2622 int ret; 2623 2624 struct ctl_table tmp_table = { 2625 .data = &val, 2626 .maxlen = sizeof(int), 2627 }; 2628 2629 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 
2630 if (write && ret >= 0) { 2631 if (val < -8 || val > 8) { 2632 ret = -EINVAL; 2633 } else { 2634 mutex_lock(&ipvs->service_mutex); 2635 WRITE_ONCE(*valp, val); 2636 /* Make sure the services are present */ 2637 if (rcu_access_pointer(ipvs->svc_table) && 2638 READ_ONCE(ipvs->enable) && 2639 !test_bit(IP_VS_WORK_SVC_NORESIZE, 2640 &ipvs->work_flags)) 2641 mod_delayed_work(system_dfl_long_wq, 2642 &ipvs->svc_resize_work, 0); 2643 mutex_unlock(&ipvs->service_mutex); 2644 } 2645 } 2646 return ret; 2647 } 2648 2649 /* 2650 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) 2651 * Do not change order or insert new entries without 2652 * align with netns init in ip_vs_control_net_init() 2653 */ 2654 2655 static struct ctl_table vs_vars[] = { 2656 { 2657 .procname = "amemthresh", 2658 .maxlen = sizeof(int), 2659 .mode = 0644, 2660 .proc_handler = proc_dointvec, 2661 }, 2662 { 2663 .procname = "am_droprate", 2664 .maxlen = sizeof(int), 2665 .mode = 0644, 2666 .proc_handler = proc_dointvec, 2667 }, 2668 { 2669 .procname = "conn_max", 2670 .maxlen = sizeof(int), 2671 .mode = 0644, 2672 .proc_handler = proc_do_conn_max, 2673 }, 2674 { 2675 .procname = "drop_entry", 2676 .maxlen = sizeof(int), 2677 .mode = 0644, 2678 .proc_handler = proc_do_defense_mode, 2679 }, 2680 { 2681 .procname = "drop_packet", 2682 .maxlen = sizeof(int), 2683 .mode = 0644, 2684 .proc_handler = proc_do_defense_mode, 2685 }, 2686 #ifdef CONFIG_IP_VS_NFCT 2687 { 2688 .procname = "conntrack", 2689 .maxlen = sizeof(int), 2690 .mode = 0644, 2691 .proc_handler = &proc_dointvec, 2692 }, 2693 #endif 2694 { 2695 .procname = "secure_tcp", 2696 .maxlen = sizeof(int), 2697 .mode = 0644, 2698 .proc_handler = proc_do_defense_mode, 2699 }, 2700 { 2701 .procname = "snat_reroute", 2702 .maxlen = sizeof(int), 2703 .mode = 0644, 2704 .proc_handler = &proc_dointvec, 2705 }, 2706 { 2707 .procname = "sync_version", 2708 .maxlen = sizeof(int), 2709 .mode = 0644, 2710 .proc_handler = proc_dointvec_minmax, 2711 .extra1 = 
SYSCTL_ZERO, 2712 .extra2 = SYSCTL_ONE, 2713 }, 2714 { 2715 .procname = "sync_ports", 2716 .maxlen = sizeof(int), 2717 .mode = 0644, 2718 .proc_handler = proc_do_sync_ports, 2719 }, 2720 { 2721 .procname = "sync_persist_mode", 2722 .maxlen = sizeof(int), 2723 .mode = 0644, 2724 .proc_handler = proc_dointvec, 2725 }, 2726 { 2727 .procname = "sync_qlen_max", 2728 .maxlen = sizeof(unsigned long), 2729 .mode = 0644, 2730 .proc_handler = proc_doulongvec_minmax, 2731 }, 2732 { 2733 .procname = "sync_sock_size", 2734 .maxlen = sizeof(int), 2735 .mode = 0644, 2736 .proc_handler = proc_dointvec, 2737 }, 2738 { 2739 .procname = "cache_bypass", 2740 .maxlen = sizeof(int), 2741 .mode = 0644, 2742 .proc_handler = proc_dointvec, 2743 }, 2744 { 2745 .procname = "expire_nodest_conn", 2746 .maxlen = sizeof(int), 2747 .mode = 0644, 2748 .proc_handler = proc_dointvec, 2749 }, 2750 { 2751 .procname = "sloppy_tcp", 2752 .maxlen = sizeof(int), 2753 .mode = 0644, 2754 .proc_handler = proc_dointvec, 2755 }, 2756 { 2757 .procname = "sloppy_sctp", 2758 .maxlen = sizeof(int), 2759 .mode = 0644, 2760 .proc_handler = proc_dointvec, 2761 }, 2762 { 2763 .procname = "expire_quiescent_template", 2764 .maxlen = sizeof(int), 2765 .mode = 0644, 2766 .proc_handler = proc_dointvec, 2767 }, 2768 { 2769 .procname = "sync_threshold", 2770 .maxlen = 2771 sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), 2772 .mode = 0644, 2773 .proc_handler = proc_do_sync_threshold, 2774 }, 2775 { 2776 .procname = "sync_refresh_period", 2777 .maxlen = sizeof(int), 2778 .mode = 0644, 2779 .proc_handler = proc_dointvec_jiffies, 2780 }, 2781 { 2782 .procname = "sync_retries", 2783 .maxlen = sizeof(int), 2784 .mode = 0644, 2785 .proc_handler = proc_dointvec_minmax, 2786 .extra1 = SYSCTL_ZERO, 2787 .extra2 = SYSCTL_THREE, 2788 }, 2789 { 2790 .procname = "nat_icmp_send", 2791 .maxlen = sizeof(int), 2792 .mode = 0644, 2793 .proc_handler = proc_dointvec, 2794 }, 2795 { 2796 .procname = "pmtu_disc", 2797 .maxlen = 
sizeof(int), 2798 .mode = 0644, 2799 .proc_handler = proc_dointvec, 2800 }, 2801 { 2802 .procname = "backup_only", 2803 .maxlen = sizeof(int), 2804 .mode = 0644, 2805 .proc_handler = proc_dointvec, 2806 }, 2807 { 2808 .procname = "conn_reuse_mode", 2809 .maxlen = sizeof(int), 2810 .mode = 0644, 2811 .proc_handler = proc_dointvec, 2812 }, 2813 { 2814 .procname = "schedule_icmp", 2815 .maxlen = sizeof(int), 2816 .mode = 0644, 2817 .proc_handler = proc_dointvec, 2818 }, 2819 { 2820 .procname = "ignore_tunneled", 2821 .maxlen = sizeof(int), 2822 .mode = 0644, 2823 .proc_handler = proc_dointvec, 2824 }, 2825 { 2826 .procname = "run_estimation", 2827 .maxlen = sizeof(int), 2828 .mode = 0644, 2829 .proc_handler = ipvs_proc_run_estimation, 2830 }, 2831 { 2832 .procname = "est_cpulist", 2833 .maxlen = NR_CPUS, /* unused */ 2834 .mode = 0644, 2835 .proc_handler = ipvs_proc_est_cpulist, 2836 }, 2837 { 2838 .procname = "est_nice", 2839 .maxlen = sizeof(int), 2840 .mode = 0644, 2841 .proc_handler = ipvs_proc_est_nice, 2842 }, 2843 { 2844 .procname = "conn_lfactor", 2845 .maxlen = sizeof(int), 2846 .mode = 0644, 2847 .proc_handler = ipvs_proc_conn_lfactor, 2848 }, 2849 { 2850 .procname = "svc_lfactor", 2851 .maxlen = sizeof(int), 2852 .mode = 0644, 2853 .proc_handler = ipvs_proc_svc_lfactor, 2854 }, 2855 #ifdef CONFIG_IP_VS_DEBUG 2856 { 2857 .procname = "debug_level", 2858 .data = &sysctl_ip_vs_debug_level, 2859 .maxlen = sizeof(int), 2860 .mode = 0644, 2861 .proc_handler = proc_dointvec, 2862 }, 2863 #endif 2864 }; 2865 2866 #endif 2867 2868 #ifdef CONFIG_PROC_FS 2869 2870 struct ip_vs_iter { 2871 struct seq_net_private p; /* Do not move this, netns depends upon it*/ 2872 struct ip_vs_rht *t; 2873 u32 bucket; 2874 }; 2875 2876 /* 2877 * Write the contents of the VS rule table to a PROCfs file. 
2878 * (It is kept just for backward compatibility) 2879 */ 2880 static inline const char *ip_vs_fwd_name(unsigned int flags) 2881 { 2882 switch (flags & IP_VS_CONN_F_FWD_MASK) { 2883 case IP_VS_CONN_F_LOCALNODE: 2884 return "Local"; 2885 case IP_VS_CONN_F_TUNNEL: 2886 return "Tunnel"; 2887 case IP_VS_CONN_F_DROUTE: 2888 return "Route"; 2889 default: 2890 return "Masq"; 2891 } 2892 } 2893 2894 /* Do not expect consistent view during add, del and move(table resize). 2895 * We may miss entries and even show duplicates. 2896 */ 2897 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) 2898 { 2899 struct ip_vs_iter *iter = seq->private; 2900 struct ip_vs_rht *t = iter->t; 2901 struct ip_vs_service *svc; 2902 struct hlist_bl_node *e; 2903 int idx; 2904 2905 if (!t) 2906 return NULL; 2907 for (idx = 0; idx < t->size; idx++) { 2908 hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[idx], s_list) { 2909 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2910 break; 2911 if (pos-- == 0) { 2912 iter->bucket = idx; 2913 return svc; 2914 } 2915 } 2916 } 2917 return NULL; 2918 } 2919 2920 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) 2921 __acquires(RCU) 2922 { 2923 struct ip_vs_iter *iter = seq->private; 2924 struct net *net = seq_file_net(seq); 2925 struct netns_ipvs *ipvs = net_ipvs(net); 2926 2927 rcu_read_lock(); 2928 iter->t = rcu_dereference(ipvs->svc_table); 2929 return *pos ? 
ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; 2930 } 2931 2932 2933 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2934 { 2935 struct ip_vs_service *svc; 2936 struct ip_vs_iter *iter; 2937 struct hlist_bl_node *e; 2938 struct ip_vs_rht *t; 2939 2940 ++*pos; 2941 if (v == SEQ_START_TOKEN) 2942 return ip_vs_info_array(seq,0); 2943 2944 svc = v; 2945 iter = seq->private; 2946 t = iter->t; 2947 if (!t) 2948 return NULL; 2949 2950 hlist_bl_for_each_entry_continue_rcu(svc, e, s_list) { 2951 /* Our cursor was moved to new table ? */ 2952 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2953 break; 2954 return svc; 2955 } 2956 2957 while (++iter->bucket < t->size) { 2958 hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[iter->bucket], 2959 s_list) { 2960 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2961 break; 2962 return svc; 2963 } 2964 } 2965 return NULL; 2966 } 2967 2968 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) 2969 __releases(RCU) 2970 { 2971 rcu_read_unlock(); 2972 } 2973 2974 2975 static int ip_vs_info_seq_show(struct seq_file *seq, void *v) 2976 { 2977 struct net *net = seq_file_net(seq); 2978 struct netns_ipvs *ipvs = net_ipvs(net); 2979 2980 if (v == SEQ_START_TOKEN) { 2981 seq_printf(seq, 2982 "IP Virtual Server version %d.%d.%d (size=%d)\n", 2983 NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs)); 2984 seq_puts(seq, 2985 "Prot LocalAddress:Port Scheduler Flags\n"); 2986 seq_puts(seq, 2987 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); 2988 } else { 2989 const struct ip_vs_service *svc = v; 2990 const struct ip_vs_dest *dest; 2991 struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); 2992 char *sched_name = sched ? 
sched->name : "none"; 2993 2994 if (!svc->fwmark) { 2995 #ifdef CONFIG_IP_VS_IPV6 2996 if (svc->af == AF_INET6) 2997 seq_printf(seq, "%s [%pI6]:%04X %s ", 2998 ip_vs_proto_name(svc->protocol), 2999 &svc->addr.in6, 3000 ntohs(svc->port), 3001 sched_name); 3002 else 3003 #endif 3004 seq_printf(seq, "%s %08X:%04X %s %s ", 3005 ip_vs_proto_name(svc->protocol), 3006 ntohl(svc->addr.ip), 3007 ntohs(svc->port), 3008 sched_name, 3009 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); 3010 } else { 3011 seq_printf(seq, "FWM %08X %s %s", 3012 svc->fwmark, sched_name, 3013 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); 3014 } 3015 3016 if (svc->flags & IP_VS_SVC_F_PERSISTENT) 3017 seq_printf(seq, "persistent %d %08X\n", 3018 svc->timeout, 3019 ntohl(svc->netmask)); 3020 else 3021 seq_putc(seq, '\n'); 3022 3023 list_for_each_entry_rcu(dest, &svc->destinations, n_list) { 3024 #ifdef CONFIG_IP_VS_IPV6 3025 if (dest->af == AF_INET6) 3026 seq_printf(seq, 3027 " -> [%pI6]:%04X" 3028 " %-7s %-6d %-10d %-10d\n", 3029 &dest->addr.in6, 3030 ntohs(dest->port), 3031 ip_vs_fwd_name(atomic_read(&dest->conn_flags)), 3032 atomic_read(&dest->weight), 3033 atomic_read(&dest->activeconns), 3034 atomic_read(&dest->inactconns)); 3035 else 3036 #endif 3037 seq_printf(seq, 3038 " -> %08X:%04X " 3039 "%-7s %-6d %-10d %-10d\n", 3040 ntohl(dest->addr.ip), 3041 ntohs(dest->port), 3042 ip_vs_fwd_name(atomic_read(&dest->conn_flags)), 3043 atomic_read(&dest->weight), 3044 atomic_read(&dest->activeconns), 3045 atomic_read(&dest->inactconns)); 3046 3047 } 3048 } 3049 return 0; 3050 } 3051 3052 static const struct seq_operations ip_vs_info_seq_ops = { 3053 .start = ip_vs_info_seq_start, 3054 .next = ip_vs_info_seq_next, 3055 .stop = ip_vs_info_seq_stop, 3056 .show = ip_vs_info_seq_show, 3057 }; 3058 3059 static int ip_vs_stats_show(struct seq_file *seq, void *v) 3060 { 3061 struct net *net = seq_file_single_net(seq); 3062 struct ip_vs_kstats show; 3063 3064 /* 01234567 01234567 01234567 0123456701234567 
0123456701234567 */ 3065 seq_puts(seq, 3066 " Total Incoming Outgoing Incoming Outgoing\n"); 3067 seq_puts(seq, 3068 " Conns Packets Packets Bytes Bytes\n"); 3069 3070 ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s); 3071 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", 3072 (unsigned long long)show.conns, 3073 (unsigned long long)show.inpkts, 3074 (unsigned long long)show.outpkts, 3075 (unsigned long long)show.inbytes, 3076 (unsigned long long)show.outbytes); 3077 3078 /* 01234567 01234567 01234567 0123456701234567 0123456701234567*/ 3079 seq_puts(seq, 3080 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); 3081 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n", 3082 (unsigned long long)show.cps, 3083 (unsigned long long)show.inpps, 3084 (unsigned long long)show.outpps, 3085 (unsigned long long)show.inbps, 3086 (unsigned long long)show.outbps); 3087 3088 return 0; 3089 } 3090 3091 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) 3092 { 3093 struct net *net = seq_file_single_net(seq); 3094 struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s; 3095 struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; 3096 struct ip_vs_kstats kstats; 3097 int i; 3098 3099 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 3100 seq_puts(seq, 3101 " Total Incoming Outgoing Incoming Outgoing\n"); 3102 seq_puts(seq, 3103 "CPU Conns Packets Packets Bytes Bytes\n"); 3104 3105 for_each_possible_cpu(i) { 3106 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); 3107 unsigned int start; 3108 u64 conns, inpkts, outpkts, inbytes, outbytes; 3109 3110 do { 3111 start = u64_stats_fetch_begin(&u->syncp); 3112 conns = u64_stats_read(&u->cnt.conns); 3113 inpkts = u64_stats_read(&u->cnt.inpkts); 3114 outpkts = u64_stats_read(&u->cnt.outpkts); 3115 inbytes = u64_stats_read(&u->cnt.inbytes); 3116 outbytes = u64_stats_read(&u->cnt.outbytes); 3117 } while (u64_stats_fetch_retry(&u->syncp, start)); 3118 3119 seq_printf(seq, "%3X %8LX %8LX %8LX %16LX 
%16LX\n", 3120 i, (u64)conns, (u64)inpkts, 3121 (u64)outpkts, (u64)inbytes, 3122 (u64)outbytes); 3123 } 3124 3125 ip_vs_copy_stats(&kstats, tot_stats); 3126 3127 seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n", 3128 (unsigned long long)kstats.conns, 3129 (unsigned long long)kstats.inpkts, 3130 (unsigned long long)kstats.outpkts, 3131 (unsigned long long)kstats.inbytes, 3132 (unsigned long long)kstats.outbytes); 3133 3134 /* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 3135 seq_puts(seq, 3136 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); 3137 seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n", 3138 kstats.cps, 3139 kstats.inpps, 3140 kstats.outpps, 3141 kstats.inbps, 3142 kstats.outbps); 3143 3144 return 0; 3145 } 3146 3147 static int ip_vs_status_show(struct seq_file *seq, void *v) 3148 { 3149 struct net *net = seq_file_single_net(seq); 3150 struct netns_ipvs *ipvs = net_ipvs(net); 3151 unsigned int resched_score = 0; 3152 struct ip_vs_conn_hnode *hn; 3153 struct hlist_bl_head *head; 3154 struct ip_vs_service *svc; 3155 struct ip_vs_rht *t, *pt; 3156 struct hlist_bl_node *e; 3157 int old_gen, new_gen; 3158 u32 counts[8]; 3159 u32 bucket; 3160 u32 count; 3161 int loops; 3162 u32 sum1; 3163 u32 sum; 3164 int i; 3165 3166 /* Info for conns */ 3167 rcu_read_lock(); 3168 3169 t = rcu_dereference(ipvs->conn_tab); 3170 3171 seq_printf(seq, "Conns:\t%d\n", atomic_read(&ipvs->conn_count)); 3172 seq_printf(seq, "Conn buckets:\t%d (%d bits, lfactor %d)\n", 3173 t ? t->size : 0, t ? t->bits : 0, t ? 
t->lfactor : 0); 3174 3175 if (!atomic_read(&ipvs->conn_count)) 3176 goto after_conns; 3177 old_gen = atomic_read(&ipvs->conn_tab_changes); 3178 loops = 0; 3179 3180 repeat_conn: 3181 smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ 3182 memset(counts, 0, sizeof(counts)); 3183 ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { 3184 for (bucket = 0; bucket < t->size; bucket++) { 3185 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 3186 3187 count = 0; 3188 resched_score++; 3189 ip_vs_rht_walk_bucket_rcu(t, bucket, head) { 3190 count = 0; 3191 hlist_bl_for_each_entry_rcu(hn, e, head, node) { 3192 count++; 3193 if (count >= ARRAY_SIZE(counts) - 1) 3194 break; 3195 } 3196 } 3197 resched_score += count; 3198 if (resched_score >= 100) { 3199 resched_score = 0; 3200 cond_resched_rcu(); 3201 new_gen = atomic_read(&ipvs->conn_tab_changes); 3202 /* New table installed ? */ 3203 if (old_gen != new_gen) { 3204 /* Too many changes? */ 3205 if (++loops >= 5) 3206 goto after_conns; 3207 old_gen = new_gen; 3208 goto repeat_conn; 3209 } 3210 } 3211 counts[count]++; 3212 } 3213 } 3214 for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) 3215 sum += counts[i]; 3216 sum1 = sum - counts[0]; 3217 seq_printf(seq, "Conn buckets empty:\t%u (%llu%%)\n", 3218 counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U))); 3219 for (i = 1; i < ARRAY_SIZE(counts); i++) { 3220 if (!counts[i]) 3221 continue; 3222 seq_printf(seq, "Conn buckets len-%d:\t%u (%llu%%)\n", 3223 i, counts[i], 3224 div_u64((u64)counts[i] * 100U, max(sum1, 1U))); 3225 } 3226 3227 after_conns: 3228 rcu_read_unlock(); 3229 3230 /* Info for services */ 3231 down_read(&ipvs->svc_replace_sem); 3232 rcu_read_lock(); 3233 3234 t = rcu_dereference(ipvs->svc_table); 3235 3236 count = ip_vs_get_num_services(ipvs); 3237 seq_printf(seq, "Services:\t%u\n", count); 3238 seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n", 3239 t ? t->size : 0, t ? t->bits : 0, t ? 
t->lfactor : 0); 3240 3241 if (!count) 3242 goto after_svc; 3243 old_gen = atomic_read(&ipvs->svc_table_changes); 3244 3245 smp_rmb(); /* ipvs->svc_table and svc_table_changes */ 3246 memset(counts, 0, sizeof(counts)); 3247 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) { 3248 for (bucket = 0; bucket < t->size; bucket++) { 3249 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 3250 3251 count = 0; 3252 resched_score++; 3253 ip_vs_rht_walk_bucket_rcu(t, bucket, head) { 3254 count = 0; 3255 hlist_bl_for_each_entry_rcu(svc, e, head, 3256 s_list) { 3257 count++; 3258 if (count >= ARRAY_SIZE(counts) - 1) 3259 break; 3260 } 3261 } 3262 resched_score += count; 3263 if (resched_score >= 100) { 3264 resched_score = 0; 3265 cond_resched_rcu(); 3266 /* Flushed? */ 3267 if (atomic_read(&ipvs->svc_table_changes) != 3268 old_gen) 3269 goto after_svc; 3270 } 3271 counts[count]++; 3272 } 3273 } 3274 for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) 3275 sum += counts[i]; 3276 sum1 = sum - counts[0]; 3277 seq_printf(seq, "Service buckets empty:\t%u (%llu%%)\n", 3278 counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U))); 3279 for (i = 1; i < ARRAY_SIZE(counts); i++) { 3280 if (!counts[i]) 3281 continue; 3282 seq_printf(seq, "Service buckets len-%d:\t%u (%llu%%)\n", 3283 i, counts[i], 3284 div_u64((u64)counts[i] * 100U, max(sum1, 1U))); 3285 } 3286 3287 after_svc: 3288 rcu_read_unlock(); 3289 up_read(&ipvs->svc_replace_sem); 3290 3291 seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n", 3292 ipvs->est_kt_count, ipvs->est_max_threads); 3293 seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max); 3294 seq_printf(seq, "Stats thread ests:\t%d\n", 3295 ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR * 3296 IPVS_EST_NTICKS); 3297 3298 return 0; 3299 } 3300 3301 #endif 3302 3303 /* 3304 * Set timeout values for tcp tcpfin udp in the timeout_table. 
3305 */ 3306 static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) 3307 { 3308 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) 3309 struct ip_vs_proto_data *pd; 3310 #endif 3311 3312 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", 3313 u->tcp_timeout, 3314 u->tcp_fin_timeout, 3315 u->udp_timeout); 3316 3317 #ifdef CONFIG_IP_VS_PROTO_TCP 3318 if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) || 3319 u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) { 3320 return -EINVAL; 3321 } 3322 #endif 3323 3324 #ifdef CONFIG_IP_VS_PROTO_UDP 3325 if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ)) 3326 return -EINVAL; 3327 #endif 3328 3329 #ifdef CONFIG_IP_VS_PROTO_TCP 3330 if (u->tcp_timeout) { 3331 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); 3332 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] 3333 = u->tcp_timeout * HZ; 3334 } 3335 3336 if (u->tcp_fin_timeout) { 3337 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); 3338 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] 3339 = u->tcp_fin_timeout * HZ; 3340 } 3341 #endif 3342 3343 #ifdef CONFIG_IP_VS_PROTO_UDP 3344 if (u->udp_timeout) { 3345 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); 3346 pd->timeout_table[IP_VS_UDP_S_NORMAL] 3347 = u->udp_timeout * HZ; 3348 } 3349 #endif 3350 return 0; 3351 } 3352 3353 #define CMDID(cmd) (cmd - IP_VS_BASE_CTL) 3354 3355 struct ip_vs_svcdest_user { 3356 struct ip_vs_service_user s; 3357 struct ip_vs_dest_user d; 3358 }; 3359 3360 static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = { 3361 [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user), 3362 [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user), 3363 [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user), 3364 [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user), 3365 [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user), 3366 [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user), 3367 
[CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), 3368 [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user), 3369 [CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user), 3370 [CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user), 3371 }; 3372 3373 union ip_vs_set_arglen { 3374 struct ip_vs_service_user field_IP_VS_SO_SET_ADD; 3375 struct ip_vs_service_user field_IP_VS_SO_SET_EDIT; 3376 struct ip_vs_service_user field_IP_VS_SO_SET_DEL; 3377 struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST; 3378 struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST; 3379 struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST; 3380 struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT; 3381 struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON; 3382 struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON; 3383 struct ip_vs_service_user field_IP_VS_SO_SET_ZERO; 3384 }; 3385 3386 #define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen) 3387 3388 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, 3389 struct ip_vs_service_user *usvc_compat) 3390 { 3391 memset(usvc, 0, sizeof(*usvc)); 3392 3393 usvc->af = AF_INET; 3394 usvc->protocol = usvc_compat->protocol; 3395 usvc->addr.ip = usvc_compat->addr; 3396 usvc->port = usvc_compat->port; 3397 usvc->fwmark = usvc_compat->fwmark; 3398 3399 /* Deep copy of sched_name is not needed here */ 3400 usvc->sched_name = usvc_compat->sched_name; 3401 3402 usvc->flags = usvc_compat->flags; 3403 usvc->timeout = usvc_compat->timeout; 3404 usvc->netmask = usvc_compat->netmask; 3405 } 3406 3407 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, 3408 struct ip_vs_dest_user *udest_compat) 3409 { 3410 memset(udest, 0, sizeof(*udest)); 3411 3412 udest->addr.ip = udest_compat->addr; 3413 udest->port = udest_compat->port; 3414 udest->conn_flags = udest_compat->conn_flags; 3415 udest->weight = udest_compat->weight; 3416 udest->u_threshold = udest_compat->u_threshold; 
3417 udest->l_threshold = udest_compat->l_threshold; 3418 udest->af = AF_INET; 3419 udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP; 3420 } 3421 3422 static int 3423 do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) 3424 { 3425 struct net *net = sock_net(sk); 3426 int ret; 3427 unsigned char arg[MAX_SET_ARGLEN]; 3428 struct ip_vs_service_user *usvc_compat; 3429 struct ip_vs_service_user_kern usvc; 3430 struct ip_vs_service *svc; 3431 struct ip_vs_dest_user *udest_compat; 3432 struct ip_vs_dest_user_kern udest; 3433 struct netns_ipvs *ipvs = net_ipvs(net); 3434 3435 BUILD_BUG_ON(sizeof(arg) > 255); 3436 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3437 return -EPERM; 3438 3439 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) 3440 return -EINVAL; 3441 if (len != set_arglen[CMDID(cmd)]) { 3442 IP_VS_DBG(1, "set_ctl: len %u != %u\n", 3443 len, set_arglen[CMDID(cmd)]); 3444 return -EINVAL; 3445 } 3446 3447 if (copy_from_sockptr(arg, ptr, len) != 0) 3448 return -EFAULT; 3449 3450 /* Handle daemons since they have another lock */ 3451 if (cmd == IP_VS_SO_SET_STARTDAEMON || 3452 cmd == IP_VS_SO_SET_STOPDAEMON) { 3453 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; 3454 3455 if (cmd == IP_VS_SO_SET_STARTDAEMON) { 3456 struct ipvs_sync_daemon_cfg cfg; 3457 3458 memset(&cfg, 0, sizeof(cfg)); 3459 ret = -EINVAL; 3460 if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, 3461 sizeof(cfg.mcast_ifn)) <= 0) 3462 return ret; 3463 cfg.syncid = dm->syncid; 3464 ret = start_sync_thread(ipvs, &cfg, dm->state); 3465 } else { 3466 ret = stop_sync_thread(ipvs, dm->state); 3467 } 3468 return ret; 3469 } 3470 3471 mutex_lock(&ipvs->service_mutex); 3472 if (cmd == IP_VS_SO_SET_FLUSH) { 3473 /* Flush the virtual service */ 3474 ret = ip_vs_flush(ipvs, false); 3475 goto out_unlock; 3476 } else if (cmd == IP_VS_SO_SET_TIMEOUT) { 3477 /* Set timeout values for (tcp tcpfin udp) */ 3478 ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg); 3479 
goto out_unlock; 3480 } else if (!len) { 3481 /* No more commands with len == 0 below */ 3482 ret = -EINVAL; 3483 goto out_unlock; 3484 } 3485 3486 usvc_compat = (struct ip_vs_service_user *)arg; 3487 udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); 3488 3489 /* We only use the new structs internally, so copy userspace compat 3490 * structs to extended internal versions */ 3491 ip_vs_copy_usvc_compat(&usvc, usvc_compat); 3492 ip_vs_copy_udest_compat(&udest, udest_compat); 3493 3494 if (cmd == IP_VS_SO_SET_ZERO) { 3495 /* if no service address is set, zero counters in all */ 3496 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { 3497 ret = ip_vs_zero_all(ipvs); 3498 goto out_unlock; 3499 } 3500 } 3501 3502 if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) && 3503 strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) == 3504 IP_VS_SCHEDNAME_MAXLEN) { 3505 ret = -EINVAL; 3506 goto out_unlock; 3507 } 3508 3509 /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */ 3510 if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP && 3511 usvc.protocol != IPPROTO_SCTP) { 3512 pr_err("set_ctl: invalid protocol: %d %pI4:%d\n", 3513 usvc.protocol, &usvc.addr.ip, 3514 ntohs(usvc.port)); 3515 ret = -EFAULT; 3516 goto out_unlock; 3517 } 3518 3519 /* Lookup the exact service by <protocol, addr, port> or fwmark */ 3520 rcu_read_lock(); 3521 if (usvc.fwmark == 0) 3522 svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol, 3523 &usvc.addr, usvc.port); 3524 else 3525 svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark); 3526 rcu_read_unlock(); 3527 3528 if (cmd != IP_VS_SO_SET_ADD 3529 && (svc == NULL || svc->protocol != usvc.protocol)) { 3530 ret = -ESRCH; 3531 goto out_unlock; 3532 } 3533 3534 switch (cmd) { 3535 case IP_VS_SO_SET_ADD: 3536 if (svc != NULL) 3537 ret = -EEXIST; 3538 else 3539 ret = ip_vs_add_service(ipvs, &usvc, &svc); 3540 break; 3541 case IP_VS_SO_SET_EDIT: 3542 ret = ip_vs_edit_service(svc, &usvc); 3543 break; 3544 case 
IP_VS_SO_SET_DEL: 3545 ret = ip_vs_del_service(svc); 3546 if (!ret) 3547 goto out_unlock; 3548 break; 3549 case IP_VS_SO_SET_ZERO: 3550 ret = ip_vs_zero_service(svc); 3551 break; 3552 case IP_VS_SO_SET_ADDDEST: 3553 ret = ip_vs_add_dest(svc, &udest); 3554 break; 3555 case IP_VS_SO_SET_EDITDEST: 3556 ret = ip_vs_edit_dest(svc, &udest); 3557 break; 3558 case IP_VS_SO_SET_DELDEST: 3559 ret = ip_vs_del_dest(svc, &udest); 3560 break; 3561 default: 3562 WARN_ON_ONCE(1); 3563 ret = -EINVAL; 3564 break; 3565 } 3566 3567 out_unlock: 3568 mutex_unlock(&ipvs->service_mutex); 3569 return ret; 3570 } 3571 3572 3573 static void 3574 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) 3575 { 3576 struct ip_vs_scheduler *sched; 3577 struct ip_vs_kstats kstats; 3578 char *sched_name; 3579 3580 sched = rcu_dereference_protected(src->scheduler, 1); 3581 sched_name = sched ? sched->name : "none"; 3582 dst->protocol = src->protocol; 3583 dst->addr = src->addr.ip; 3584 dst->port = src->port; 3585 dst->fwmark = src->fwmark; 3586 strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); 3587 dst->flags = src->flags; 3588 dst->timeout = src->timeout / HZ; 3589 dst->netmask = src->netmask; 3590 dst->num_dests = src->num_dests; 3591 ip_vs_copy_stats(&kstats, &src->stats); 3592 ip_vs_export_stats_user(&dst->stats, &kstats); 3593 } 3594 3595 static inline int 3596 __ip_vs_get_service_entries(struct netns_ipvs *ipvs, 3597 const struct ip_vs_get_services *get, 3598 struct ip_vs_get_services __user *uptr) 3599 { 3600 struct ip_vs_service_entry entry; 3601 DECLARE_IP_VS_RHT_WALK_BUCKETS(); 3602 struct hlist_bl_head *head; 3603 struct ip_vs_service *svc; 3604 struct hlist_bl_node *e; 3605 int count = 0; 3606 int ret = 0; 3607 3608 lockdep_assert_held(&ipvs->svc_resize_sem); 3609 /* All svc_table modifications are disabled, go ahead */ 3610 ip_vs_rht_walk_buckets(ipvs->svc_table, head) { 3611 hlist_bl_for_each_entry(svc, e, head, s_list) { 3612 /* Only expose IPv4 
entries to old interface */ 3613 if (svc->af != AF_INET) 3614 continue; 3615 3616 if (count >= get->num_services) 3617 goto out; 3618 memset(&entry, 0, sizeof(entry)); 3619 ip_vs_copy_service(&entry, svc); 3620 if (copy_to_user(&uptr->entrytable[count], 3621 &entry, sizeof(entry))) { 3622 ret = -EFAULT; 3623 goto out; 3624 } 3625 count++; 3626 } 3627 } 3628 3629 out: 3630 return ret; 3631 } 3632 3633 static inline int 3634 __ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get, 3635 struct ip_vs_get_dests __user *uptr) 3636 { 3637 struct ip_vs_service *svc; 3638 union nf_inet_addr addr = { .ip = get->addr }; 3639 int ret = 0; 3640 3641 rcu_read_lock(); 3642 if (get->fwmark) 3643 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark); 3644 else 3645 svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr, 3646 get->port); 3647 rcu_read_unlock(); 3648 3649 if (svc) { 3650 int count = 0; 3651 struct ip_vs_dest *dest; 3652 struct ip_vs_dest_entry entry; 3653 struct ip_vs_kstats kstats; 3654 3655 memset(&entry, 0, sizeof(entry)); 3656 list_for_each_entry(dest, &svc->destinations, n_list) { 3657 if (count >= get->num_dests) 3658 break; 3659 3660 /* Cannot expose heterogeneous members via sockopt 3661 * interface 3662 */ 3663 if (dest->af != svc->af) 3664 continue; 3665 3666 entry.addr = dest->addr.ip; 3667 entry.port = dest->port; 3668 entry.conn_flags = atomic_read(&dest->conn_flags); 3669 entry.weight = atomic_read(&dest->weight); 3670 entry.u_threshold = dest->u_threshold; 3671 entry.l_threshold = dest->l_threshold; 3672 entry.activeconns = atomic_read(&dest->activeconns); 3673 entry.inactconns = atomic_read(&dest->inactconns); 3674 entry.persistconns = atomic_read(&dest->persistconns); 3675 ip_vs_copy_stats(&kstats, &dest->stats); 3676 ip_vs_export_stats_user(&entry.stats, &kstats); 3677 if (copy_to_user(&uptr->entrytable[count], 3678 &entry, sizeof(entry))) { 3679 ret = -EFAULT; 3680 break; 3681 } 3682 count++; 3683 } 3684 } else 
3685 ret = -ESRCH; 3686 return ret; 3687 } 3688 3689 static inline void 3690 __ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) 3691 { 3692 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) 3693 struct ip_vs_proto_data *pd; 3694 #endif 3695 3696 memset(u, 0, sizeof (*u)); 3697 3698 #ifdef CONFIG_IP_VS_PROTO_TCP 3699 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); 3700 u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; 3701 u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; 3702 #endif 3703 #ifdef CONFIG_IP_VS_PROTO_UDP 3704 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); 3705 u->udp_timeout = 3706 pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; 3707 #endif 3708 } 3709 3710 static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = { 3711 [CMDID(IP_VS_SO_GET_VERSION)] = 64, 3712 [CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo), 3713 [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services), 3714 [CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry), 3715 [CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests), 3716 [CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), 3717 [CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user), 3718 }; 3719 3720 union ip_vs_get_arglen { 3721 char field_IP_VS_SO_GET_VERSION[64]; 3722 struct ip_vs_getinfo field_IP_VS_SO_GET_INFO; 3723 struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES; 3724 struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE; 3725 struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS; 3726 struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT; 3727 struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2]; 3728 }; 3729 3730 #define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen) 3731 3732 static int 3733 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) 3734 { 3735 unsigned char arg[MAX_GET_ARGLEN]; 3736 int ret = 0; 3737 unsigned int copylen; 3738 struct net *net = 
sock_net(sk); 3739 struct netns_ipvs *ipvs = net_ipvs(net); 3740 3741 BUG_ON(!net); 3742 BUILD_BUG_ON(sizeof(arg) > 255); 3743 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3744 return -EPERM; 3745 3746 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) 3747 return -EINVAL; 3748 3749 copylen = get_arglen[CMDID(cmd)]; 3750 if (*len < (int) copylen) { 3751 IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen); 3752 return -EINVAL; 3753 } 3754 3755 if (copy_from_user(arg, user, copylen) != 0) 3756 return -EFAULT; 3757 /* 3758 * Handle daemons first since it has its own locking 3759 */ 3760 if (cmd == IP_VS_SO_GET_DAEMON) { 3761 struct ip_vs_daemon_user d[2]; 3762 3763 memset(&d, 0, sizeof(d)); 3764 mutex_lock(&ipvs->sync_mutex); 3765 if (ipvs->sync_state & IP_VS_STATE_MASTER) { 3766 d[0].state = IP_VS_STATE_MASTER; 3767 strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, 3768 sizeof(d[0].mcast_ifn)); 3769 d[0].syncid = ipvs->mcfg.syncid; 3770 } 3771 if (ipvs->sync_state & IP_VS_STATE_BACKUP) { 3772 d[1].state = IP_VS_STATE_BACKUP; 3773 strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, 3774 sizeof(d[1].mcast_ifn)); 3775 d[1].syncid = ipvs->bcfg.syncid; 3776 } 3777 if (copy_to_user(user, &d, sizeof(d)) != 0) 3778 ret = -EFAULT; 3779 mutex_unlock(&ipvs->sync_mutex); 3780 return ret; 3781 } 3782 3783 if (cmd == IP_VS_SO_GET_SERVICES) { 3784 struct ip_vs_get_services *get; 3785 size_t size; 3786 3787 get = (struct ip_vs_get_services *)arg; 3788 size = struct_size(get, entrytable, get->num_services); 3789 if (*len != size) { 3790 pr_err("length: %u != %zu\n", *len, size); 3791 return -EINVAL; 3792 } 3793 /* Prevent modifications to the list with services. 3794 * Try reverse locking, so that we do not hold the mutex 3795 * while waiting for semaphore. 
3796 */ 3797 while (1) { 3798 ret = down_read_killable(&ipvs->svc_resize_sem); 3799 if (ret < 0) 3800 return ret; 3801 if (mutex_trylock(&ipvs->service_mutex)) 3802 break; 3803 up_read(&ipvs->svc_resize_sem); 3804 cond_resched(); 3805 } 3806 ret = __ip_vs_get_service_entries(ipvs, get, user); 3807 up_read(&ipvs->svc_resize_sem); 3808 mutex_unlock(&ipvs->service_mutex); 3809 return ret; 3810 } 3811 3812 mutex_lock(&ipvs->service_mutex); 3813 switch (cmd) { 3814 case IP_VS_SO_GET_VERSION: 3815 { 3816 char buf[64]; 3817 3818 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", 3819 NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs)); 3820 if (copy_to_user(user, buf, strlen(buf)+1) != 0) { 3821 ret = -EFAULT; 3822 goto out; 3823 } 3824 *len = strlen(buf)+1; 3825 } 3826 break; 3827 3828 case IP_VS_SO_GET_INFO: 3829 { 3830 struct ip_vs_getinfo info; 3831 3832 info.version = IP_VS_VERSION_CODE; 3833 info.size = get_conn_tab_size(ipvs); 3834 info.num_services = 3835 atomic_read(&ipvs->num_services[IP_VS_AF_INET]); 3836 if (copy_to_user(user, &info, sizeof(info)) != 0) 3837 ret = -EFAULT; 3838 } 3839 break; 3840 3841 case IP_VS_SO_GET_SERVICE: 3842 { 3843 struct ip_vs_service_entry *entry; 3844 struct ip_vs_service *svc; 3845 union nf_inet_addr addr; 3846 3847 entry = (struct ip_vs_service_entry *)arg; 3848 addr.ip = entry->addr; 3849 rcu_read_lock(); 3850 if (entry->fwmark) 3851 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark); 3852 else 3853 svc = __ip_vs_service_find(ipvs, AF_INET, 3854 entry->protocol, &addr, 3855 entry->port); 3856 rcu_read_unlock(); 3857 if (svc) { 3858 ip_vs_copy_service(entry, svc); 3859 if (copy_to_user(user, entry, sizeof(*entry)) != 0) 3860 ret = -EFAULT; 3861 } else 3862 ret = -ESRCH; 3863 } 3864 break; 3865 3866 case IP_VS_SO_GET_DESTS: 3867 { 3868 struct ip_vs_get_dests *get; 3869 size_t size; 3870 3871 get = (struct ip_vs_get_dests *)arg; 3872 size = struct_size(get, entrytable, get->num_dests); 3873 if (*len != size) { 
3874 pr_err("length: %u != %zu\n", *len, size); 3875 ret = -EINVAL; 3876 goto out; 3877 } 3878 ret = __ip_vs_get_dest_entries(ipvs, get, user); 3879 } 3880 break; 3881 3882 case IP_VS_SO_GET_TIMEOUT: 3883 { 3884 struct ip_vs_timeout_user t; 3885 3886 __ip_vs_get_timeouts(ipvs, &t); 3887 if (copy_to_user(user, &t, sizeof(t)) != 0) 3888 ret = -EFAULT; 3889 } 3890 break; 3891 3892 default: 3893 ret = -EINVAL; 3894 } 3895 3896 out: 3897 mutex_unlock(&ipvs->service_mutex); 3898 return ret; 3899 } 3900 3901 3902 static struct nf_sockopt_ops ip_vs_sockopts = { 3903 .pf = PF_INET, 3904 .set_optmin = IP_VS_BASE_CTL, 3905 .set_optmax = IP_VS_SO_SET_MAX+1, 3906 .set = do_ip_vs_set_ctl, 3907 .get_optmin = IP_VS_BASE_CTL, 3908 .get_optmax = IP_VS_SO_GET_MAX+1, 3909 .get = do_ip_vs_get_ctl, 3910 .owner = THIS_MODULE, 3911 }; 3912 3913 /* 3914 * Generic Netlink interface 3915 */ 3916 3917 /* IPVS genetlink family */ 3918 static struct genl_family ip_vs_genl_family; 3919 3920 /* Policy used for first-level command attributes */ 3921 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { 3922 [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, 3923 [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, 3924 [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, 3925 [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, 3926 [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, 3927 [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, 3928 }; 3929 3930 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ 3931 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { 3932 [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, 3933 [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, 3934 .len = IP_VS_IFNAME_MAXLEN - 1 }, 3935 [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, 3936 [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, 3937 [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, 3938 [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = 
sizeof(struct in6_addr) }, 3939 [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 }, 3940 [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 }, 3941 }; 3942 3943 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ 3944 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { 3945 [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, 3946 [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, 3947 [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, 3948 .len = sizeof(union nf_inet_addr) }, 3949 [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, 3950 [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, 3951 [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, 3952 .len = IP_VS_SCHEDNAME_MAXLEN - 1 }, 3953 [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, 3954 .len = IP_VS_PENAME_MAXLEN }, 3955 [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, 3956 .len = sizeof(struct ip_vs_flags) }, 3957 [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, 3958 [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, 3959 [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, 3960 }; 3961 3962 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ 3963 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { 3964 [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, 3965 .len = sizeof(union nf_inet_addr) }, 3966 [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, 3967 [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, 3968 [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, 3969 [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, 3970 [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, 3971 [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, 3972 [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, 3973 [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, 3974 [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, 3975 [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, 3976 [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, 3977 [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, 3978 
[IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, 3979 }; 3980 3981 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, 3982 struct ip_vs_kstats *kstats) 3983 { 3984 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); 3985 3986 if (!nl_stats) 3987 return -EMSGSIZE; 3988 3989 if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || 3990 nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || 3991 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || 3992 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, 3993 IPVS_STATS_ATTR_PAD) || 3994 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, 3995 IPVS_STATS_ATTR_PAD) || 3996 nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || 3997 nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || 3998 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || 3999 nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) || 4000 nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps)) 4001 goto nla_put_failure; 4002 nla_nest_end(skb, nl_stats); 4003 4004 return 0; 4005 4006 nla_put_failure: 4007 nla_nest_cancel(skb, nl_stats); 4008 return -EMSGSIZE; 4009 } 4010 4011 static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, 4012 struct ip_vs_kstats *kstats) 4013 { 4014 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); 4015 4016 if (!nl_stats) 4017 return -EMSGSIZE; 4018 4019 if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns, 4020 IPVS_STATS_ATTR_PAD) || 4021 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts, 4022 IPVS_STATS_ATTR_PAD) || 4023 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts, 4024 IPVS_STATS_ATTR_PAD) || 4025 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, 4026 IPVS_STATS_ATTR_PAD) || 4027 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, 4028 IPVS_STATS_ATTR_PAD) || 4029 
nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps, 4030 IPVS_STATS_ATTR_PAD) || 4031 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps, 4032 IPVS_STATS_ATTR_PAD) || 4033 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps, 4034 IPVS_STATS_ATTR_PAD) || 4035 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps, 4036 IPVS_STATS_ATTR_PAD) || 4037 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps, 4038 IPVS_STATS_ATTR_PAD)) 4039 goto nla_put_failure; 4040 nla_nest_end(skb, nl_stats); 4041 4042 return 0; 4043 4044 nla_put_failure: 4045 nla_nest_cancel(skb, nl_stats); 4046 return -EMSGSIZE; 4047 } 4048 4049 static int ip_vs_genl_fill_service(struct sk_buff *skb, 4050 struct ip_vs_service *svc) 4051 { 4052 struct ip_vs_scheduler *sched; 4053 struct ip_vs_pe *pe; 4054 struct nlattr *nl_service; 4055 struct ip_vs_flags flags = { .flags = svc->flags, 4056 .mask = ~0 }; 4057 struct ip_vs_kstats kstats; 4058 char *sched_name; 4059 4060 nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE); 4061 if (!nl_service) 4062 return -EMSGSIZE; 4063 4064 if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) 4065 goto nla_put_failure; 4066 if (svc->fwmark) { 4067 if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) 4068 goto nla_put_failure; 4069 } else { 4070 if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || 4071 nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || 4072 nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) 4073 goto nla_put_failure; 4074 } 4075 4076 sched = rcu_dereference(svc->scheduler); 4077 sched_name = sched ? 
sched->name : "none"; 4078 pe = rcu_dereference(svc->pe); 4079 if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) || 4080 (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || 4081 nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || 4082 nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || 4083 nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) 4084 goto nla_put_failure; 4085 ip_vs_copy_stats(&kstats, &svc->stats); 4086 if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats)) 4087 goto nla_put_failure; 4088 if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats)) 4089 goto nla_put_failure; 4090 4091 nla_nest_end(skb, nl_service); 4092 4093 return 0; 4094 4095 nla_put_failure: 4096 nla_nest_cancel(skb, nl_service); 4097 return -EMSGSIZE; 4098 } 4099 4100 static int ip_vs_genl_dump_service(struct sk_buff *skb, 4101 struct ip_vs_service *svc, 4102 struct netlink_callback *cb) 4103 { 4104 void *hdr; 4105 4106 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 4107 &ip_vs_genl_family, NLM_F_MULTI, 4108 IPVS_CMD_NEW_SERVICE); 4109 if (!hdr) 4110 return -EMSGSIZE; 4111 4112 if (ip_vs_genl_fill_service(skb, svc) < 0) 4113 goto nla_put_failure; 4114 4115 genlmsg_end(skb, hdr); 4116 return 0; 4117 4118 nla_put_failure: 4119 genlmsg_cancel(skb, hdr); 4120 return -EMSGSIZE; 4121 } 4122 4123 static int ip_vs_genl_dump_services(struct sk_buff *skb, 4124 struct netlink_callback *cb) 4125 { 4126 DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); 4127 struct net *net = sock_net(skb->sk); 4128 struct netns_ipvs *ipvs = net_ipvs(net); 4129 struct hlist_bl_head *head; 4130 struct ip_vs_service *svc; 4131 struct hlist_bl_node *e; 4132 int start = cb->args[0]; 4133 int idx = 0; 4134 4135 /* Make sure we do not see same service twice during resize */ 4136 down_read(&ipvs->svc_resize_sem); 4137 rcu_read_lock(); 4138 ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) { 4139 hlist_bl_for_each_entry_rcu(svc, e, head, 
s_list) { 4140 if (++idx <= start) 4141 continue; 4142 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { 4143 idx--; 4144 goto nla_put_failure; 4145 } 4146 } 4147 } 4148 4149 nla_put_failure: 4150 rcu_read_unlock(); 4151 up_read(&ipvs->svc_resize_sem); 4152 cb->args[0] = idx; 4153 4154 return skb->len; 4155 } 4156 4157 static bool ip_vs_is_af_valid(int af) 4158 { 4159 if (af == AF_INET) 4160 return true; 4161 #ifdef CONFIG_IP_VS_IPV6 4162 if (af == AF_INET6 && ipv6_mod_enabled()) 4163 return true; 4164 #endif 4165 return false; 4166 } 4167 4168 static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, 4169 struct ip_vs_service_user_kern *usvc, 4170 struct nlattr *nla, bool full_entry, 4171 struct ip_vs_service **ret_svc) 4172 { 4173 struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; 4174 struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; 4175 struct ip_vs_service *svc; 4176 4177 /* Parse mandatory identifying service fields first */ 4178 if (nla == NULL || 4179 nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL)) 4180 return -EINVAL; 4181 4182 nla_af = attrs[IPVS_SVC_ATTR_AF]; 4183 nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; 4184 nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; 4185 nla_port = attrs[IPVS_SVC_ATTR_PORT]; 4186 nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; 4187 4188 if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) 4189 return -EINVAL; 4190 4191 memset(usvc, 0, sizeof(*usvc)); 4192 4193 usvc->af = nla_get_u16(nla_af); 4194 if (!ip_vs_is_af_valid(usvc->af)) 4195 return -EAFNOSUPPORT; 4196 4197 if (nla_fwmark) { 4198 usvc->protocol = IPPROTO_TCP; 4199 usvc->fwmark = nla_get_u32(nla_fwmark); 4200 } else { 4201 usvc->protocol = nla_get_u16(nla_protocol); 4202 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); 4203 usvc->port = nla_get_be16(nla_port); 4204 usvc->fwmark = 0; 4205 } 4206 4207 if (usvc->fwmark) 4208 svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark); 4209 else 4210 svc = 
__ip_vs_service_find(ipvs, usvc->af, usvc->protocol, 4211 &usvc->addr, usvc->port); 4212 *ret_svc = svc; 4213 4214 /* If a full entry was requested, check for the additional fields */ 4215 if (full_entry) { 4216 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout, 4217 *nla_netmask; 4218 struct ip_vs_flags flags; 4219 4220 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; 4221 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME]; 4222 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; 4223 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; 4224 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; 4225 4226 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) 4227 return -EINVAL; 4228 4229 nla_memcpy(&flags, nla_flags, sizeof(flags)); 4230 4231 /* prefill flags from service if it already exists */ 4232 if (svc) 4233 usvc->flags = svc->flags; 4234 4235 /* set new flags from userland */ 4236 usvc->flags = (usvc->flags & ~flags.mask) | 4237 (flags.flags & flags.mask); 4238 usvc->sched_name = nla_data(nla_sched); 4239 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; 4240 usvc->timeout = nla_get_u32(nla_timeout); 4241 usvc->netmask = nla_get_be32(nla_netmask); 4242 } 4243 4244 return 0; 4245 } 4246 4247 static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs, 4248 struct nlattr *nla) 4249 { 4250 struct ip_vs_service_user_kern usvc; 4251 struct ip_vs_service *svc; 4252 int ret; 4253 4254 ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc); 4255 return ret ? 
ERR_PTR(ret) : svc; 4256 } 4257 4258 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) 4259 { 4260 struct nlattr *nl_dest; 4261 struct ip_vs_kstats kstats; 4262 4263 nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST); 4264 if (!nl_dest) 4265 return -EMSGSIZE; 4266 4267 if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || 4268 nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || 4269 nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, 4270 (atomic_read(&dest->conn_flags) & 4271 IP_VS_CONN_F_FWD_MASK)) || 4272 nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, 4273 atomic_read(&dest->weight)) || 4274 nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE, 4275 dest->tun_type) || 4276 nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, 4277 dest->tun_port) || 4278 nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS, 4279 dest->tun_flags) || 4280 nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || 4281 nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || 4282 nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, 4283 atomic_read(&dest->activeconns)) || 4284 nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, 4285 atomic_read(&dest->inactconns)) || 4286 nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, 4287 atomic_read(&dest->persistconns)) || 4288 nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) 4289 goto nla_put_failure; 4290 ip_vs_copy_stats(&kstats, &dest->stats); 4291 if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats)) 4292 goto nla_put_failure; 4293 if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats)) 4294 goto nla_put_failure; 4295 4296 nla_nest_end(skb, nl_dest); 4297 4298 return 0; 4299 4300 nla_put_failure: 4301 nla_nest_cancel(skb, nl_dest); 4302 return -EMSGSIZE; 4303 } 4304 4305 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, 4306 struct netlink_callback *cb) 4307 { 4308 void *hdr; 4309 4310 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 4311 &ip_vs_genl_family, 
NLM_F_MULTI, 4312 IPVS_CMD_NEW_DEST); 4313 if (!hdr) 4314 return -EMSGSIZE; 4315 4316 if (ip_vs_genl_fill_dest(skb, dest) < 0) 4317 goto nla_put_failure; 4318 4319 genlmsg_end(skb, hdr); 4320 return 0; 4321 4322 nla_put_failure: 4323 genlmsg_cancel(skb, hdr); 4324 return -EMSGSIZE; 4325 } 4326 4327 static int ip_vs_genl_dump_dests(struct sk_buff *skb, 4328 struct netlink_callback *cb) 4329 { 4330 int idx = 0; 4331 int start = cb->args[0]; 4332 struct ip_vs_service *svc; 4333 struct ip_vs_dest *dest; 4334 struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; 4335 struct net *net = sock_net(skb->sk); 4336 struct netns_ipvs *ipvs = net_ipvs(net); 4337 4338 rcu_read_lock(); 4339 4340 /* Try to find the service for which to dump destinations */ 4341 if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack)) 4342 goto out_err; 4343 4344 4345 svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]); 4346 if (IS_ERR_OR_NULL(svc)) 4347 goto out_err; 4348 4349 /* Dump the destinations */ 4350 list_for_each_entry_rcu(dest, &svc->destinations, n_list) { 4351 if (++idx <= start) 4352 continue; 4353 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { 4354 idx--; 4355 goto nla_put_failure; 4356 } 4357 } 4358 4359 nla_put_failure: 4360 cb->args[0] = idx; 4361 4362 out_err: 4363 rcu_read_unlock(); 4364 4365 return skb->len; 4366 } 4367 4368 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, 4369 struct nlattr *nla, bool full_entry) 4370 { 4371 struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; 4372 struct nlattr *nla_addr, *nla_port; 4373 struct nlattr *nla_addr_family; 4374 4375 /* Parse mandatory identifying destination fields first */ 4376 if (nla == NULL || 4377 nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL)) 4378 return -EINVAL; 4379 4380 nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; 4381 nla_port = attrs[IPVS_DEST_ATTR_PORT]; 4382 nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY]; 4383 
4384 if (!(nla_addr && nla_port)) 4385 return -EINVAL; 4386 4387 memset(udest, 0, sizeof(*udest)); 4388 4389 nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); 4390 udest->port = nla_get_be16(nla_port); 4391 4392 udest->af = nla_get_u16_default(nla_addr_family, 0); 4393 4394 /* If a full entry was requested, check for the additional fields */ 4395 if (full_entry) { 4396 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, 4397 *nla_l_thresh, *nla_tun_type, *nla_tun_port, 4398 *nla_tun_flags; 4399 4400 nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; 4401 nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; 4402 nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; 4403 nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; 4404 nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; 4405 nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; 4406 nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS]; 4407 4408 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) 4409 return -EINVAL; 4410 4411 udest->conn_flags = nla_get_u32(nla_fwd) 4412 & IP_VS_CONN_F_FWD_MASK; 4413 udest->weight = nla_get_u32(nla_weight); 4414 udest->u_threshold = nla_get_u32(nla_u_thresh); 4415 udest->l_threshold = nla_get_u32(nla_l_thresh); 4416 4417 if (nla_tun_type) 4418 udest->tun_type = nla_get_u8(nla_tun_type); 4419 4420 if (nla_tun_port) 4421 udest->tun_port = nla_get_be16(nla_tun_port); 4422 4423 if (nla_tun_flags) 4424 udest->tun_flags = nla_get_u16(nla_tun_flags); 4425 } 4426 4427 return 0; 4428 } 4429 4430 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, 4431 struct ipvs_sync_daemon_cfg *c) 4432 { 4433 struct nlattr *nl_daemon; 4434 4435 nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON); 4436 if (!nl_daemon) 4437 return -EMSGSIZE; 4438 4439 if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || 4440 nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || 4441 nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || 4442 nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) || 
4443 nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) || 4444 nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl)) 4445 goto nla_put_failure; 4446 #ifdef CONFIG_IP_VS_IPV6 4447 if (c->mcast_af == AF_INET6) { 4448 if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6, 4449 &c->mcast_group.in6)) 4450 goto nla_put_failure; 4451 } else 4452 #endif 4453 if (c->mcast_af == AF_INET && 4454 nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP, 4455 c->mcast_group.ip)) 4456 goto nla_put_failure; 4457 nla_nest_end(skb, nl_daemon); 4458 4459 return 0; 4460 4461 nla_put_failure: 4462 nla_nest_cancel(skb, nl_daemon); 4463 return -EMSGSIZE; 4464 } 4465 4466 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, 4467 struct ipvs_sync_daemon_cfg *c, 4468 struct netlink_callback *cb) 4469 { 4470 void *hdr; 4471 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 4472 &ip_vs_genl_family, NLM_F_MULTI, 4473 IPVS_CMD_NEW_DAEMON); 4474 if (!hdr) 4475 return -EMSGSIZE; 4476 4477 if (ip_vs_genl_fill_daemon(skb, state, c)) 4478 goto nla_put_failure; 4479 4480 genlmsg_end(skb, hdr); 4481 return 0; 4482 4483 nla_put_failure: 4484 genlmsg_cancel(skb, hdr); 4485 return -EMSGSIZE; 4486 } 4487 4488 static int ip_vs_genl_dump_daemons(struct sk_buff *skb, 4489 struct netlink_callback *cb) 4490 { 4491 struct net *net = sock_net(skb->sk); 4492 struct netns_ipvs *ipvs = net_ipvs(net); 4493 4494 mutex_lock(&ipvs->sync_mutex); 4495 if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { 4496 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, 4497 &ipvs->mcfg, cb) < 0) 4498 goto nla_put_failure; 4499 4500 cb->args[0] = 1; 4501 } 4502 4503 if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { 4504 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, 4505 &ipvs->bcfg, cb) < 0) 4506 goto nla_put_failure; 4507 4508 cb->args[1] = 1; 4509 } 4510 4511 nla_put_failure: 4512 mutex_unlock(&ipvs->sync_mutex); 4513 4514 return skb->len; 4515 } 4516 
/*
 *	Handle IPVS_CMD_NEW_DAEMON: build a sync daemon configuration from
 *	the parsed nested attributes and start the sync thread.
 *
 *	@ipvs:	per-netns IPVS state
 *	@attrs:	parsed IPVS_DAEMON_ATTR_* attributes
 *
 *	STATE, MCAST_IFN and SYNC_ID are mandatory; all other attributes
 *	are optional and leave the zero-initialized default in place when
 *	absent.
 *
 *	Returns -EINVAL for missing mandatory attributes, for a group
 *	address that is not multicast, or when mixed address family
 *	destinations exist; -EAFNOSUPPORT for an IPv6 group when IPv6
 *	support is not built in; otherwise the result of
 *	start_sync_thread().
 */
static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
	struct ipvs_sync_daemon_cfg c;
	struct nlattr *a;
	int ret;

	memset(&c, 0, sizeof(c));
	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
		return -EINVAL;
	strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
		sizeof(c.mcast_ifn));
	c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);

	a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN];
	if (a)
		c.sync_maxlen = nla_get_u16(a);

	/* An IPv4 group takes precedence; the IPv6 group attribute is
	 * only consulted when no IPv4 group was supplied.
	 */
	a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP];
	if (a) {
		c.mcast_af = AF_INET;
		c.mcast_group.ip = nla_get_in_addr(a);
		if (!ipv4_is_multicast(c.mcast_group.ip))
			return -EINVAL;
	} else {
		a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6];
		if (a) {
#ifdef CONFIG_IP_VS_IPV6
			int addr_type;

			c.mcast_af = AF_INET6;
			c.mcast_group.in6 = nla_get_in6_addr(a);
			addr_type = ipv6_addr_type(&c.mcast_group.in6);
			if (!(addr_type & IPV6_ADDR_MULTICAST))
				return -EINVAL;
#else
			return -EAFNOSUPPORT;
#endif
		}
	}

	a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT];
	if (a)
		c.mcast_port = nla_get_u16(a);

	a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL];
	if (a)
		c.mcast_ttl = nla_get_u8(a);

	/* The synchronization protocol is incompatible with mixed family
	 * services
	 */
	if (ipvs->mixed_address_family_dests > 0)
		return -EINVAL;

	ret = start_sync_thread(ipvs, &c,
				nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
	return ret;
}

/*
 *	Handle IPVS_CMD_DEL_DAEMON: stop the sync thread selected by the
 *	mandatory IPVS_DAEMON_ATTR_STATE attribute.
 *
 *	Returns -EINVAL when the attribute is missing, otherwise the
 *	result of stop_sync_thread().
 */
static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
	int ret;

	if (!attrs[IPVS_DAEMON_ATTR_STATE])
		return -EINVAL;

	ret = stop_sync_thread(ipvs,
			       nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
	return ret;
}

/*
 *	Handle IPVS_CMD_SET_CONFIG: update the TCP, TCP-FIN and UDP
 *	timeouts.
 *
 *	The current timeouts are read first so that any timeout whose
 *	attribute is absent is passed back to ip_vs_set_timeout()
 *	unchanged.  Returns the result of ip_vs_set_timeout().
 */
static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
	struct ip_vs_timeout_user t;

	__ip_vs_get_timeouts(ipvs, &t);

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
		t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
		t.tcp_fin_timeout =
			nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
		t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);

	return ip_vs_set_timeout(ipvs, &t);
}

/*
 *	doit handler for IPVS_CMD_NEW_DAEMON and IPVS_CMD_DEL_DAEMON.
 *
 *	Parses the nested IPVS_CMD_ATTR_DAEMON attribute against
 *	ip_vs_daemon_policy and dispatches to the new/del helper.
 *
 *	Returns -EINVAL for any other command, when the nested attribute
 *	is missing or when it fails to parse; otherwise the helper's
 *	result.
 */
static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
{
	int ret = -EINVAL, cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
		struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];

		if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
		    nla_parse_nested_deprecated(daemon_attrs,
						IPVS_DAEMON_ATTR_MAX,
						info->attrs[IPVS_CMD_ATTR_DAEMON],
						ip_vs_daemon_policy,
						info->extack))
			goto out;

		if (cmd == IPVS_CMD_NEW_DAEMON)
			ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs);
		else
			ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs);
	}

out:
	return ret;
}

/*
 *	doit handler for the service/destination modifying commands and
 *	for FLUSH, ZERO and SET_CONFIG.
 *
 *	The whole handler runs under ipvs->service_mutex.  FLUSH,
 *	SET_CONFIG and ZERO without a service attribute are handled
 *	first; every other command needs a service attribute, and the
 *	destination commands additionally need a destination attribute.
 *
 *	Returns 0 or a negative errno: -ESRCH when the service must exist
 *	but was not found, -EEXIST when adding an existing service,
 *	-EAFNOSUPPORT for an invalid destination address family, -EINVAL
 *	for unsupported combinations or an unknown command, or whatever
 *	the parse/worker helper returned.
 */
static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
{
	bool need_full_svc = false, need_full_dest = false;
	struct ip_vs_service *svc = NULL;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_dest_user_kern udest;
	int ret = 0, cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	mutex_lock(&ipvs->service_mutex);

	if (cmd == IPVS_CMD_FLUSH) {
		ret = ip_vs_flush(ipvs, false);
		goto out;
	} else if (cmd == IPVS_CMD_SET_CONFIG) {
		ret = ip_vs_genl_set_config(ipvs, info->attrs);
		goto out;
	} else if (cmd == IPVS_CMD_ZERO &&
		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
		ret = ip_vs_zero_all(ipvs);
		goto out;
	}

	/* All following commands require a service argument, so check if we
	 * received a valid one. We need a full service specification when
	 * adding / editing a service. Only identifying members otherwise. */
	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
		need_full_svc = true;

	/* We use function that requires RCU lock (hlist_bl) */
	rcu_read_lock();
	ret = ip_vs_genl_parse_service(ipvs, &usvc,
				       info->attrs[IPVS_CMD_ATTR_SERVICE],
				       need_full_svc, &svc);
	rcu_read_unlock();
	if (ret)
		goto out;

	/* Unless we're adding a new service, the service must already exist */
	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
		ret = -ESRCH;
		goto out;
	}

	/* Destination commands require a valid destination argument. For
	 * adding / editing a destination, we need a full destination
	 * specification. */
	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
	    cmd == IPVS_CMD_DEL_DEST) {
		if (cmd != IPVS_CMD_DEL_DEST)
			need_full_dest = true;

		ret = ip_vs_genl_parse_dest(&udest,
					    info->attrs[IPVS_CMD_ATTR_DEST],
					    need_full_dest);
		if (ret)
			goto out;

		/* Old protocols did not allow the user to specify address
		 * family, so we set it to zero instead.  We also didn't
		 * allow heterogeneous pools in the old code, so it's safe
		 * to assume that this will have the same address family as
		 * the service.
		 */
		/* svc is non-NULL here: the -ESRCH check above already
		 * rejected a missing service for every destination command.
		 */
		if (udest.af == 0)
			udest.af = svc->af;

		if (!ip_vs_is_af_valid(udest.af)) {
			ret = -EAFNOSUPPORT;
			goto out;
		}

		if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) {
			/* The synchronization protocol is incompatible
			 * with mixed family services
			 */
			if (ipvs->sync_state) {
				ret = -EINVAL;
				goto out;
			}

			/* Which connection types do we support? */
			switch (udest.conn_flags) {
			case IP_VS_CONN_F_TUNNEL:
				/* We are able to forward this */
				break;
			default:
				ret = -EINVAL;
				goto out;
			}
		}
	}

	switch (cmd) {
	case IPVS_CMD_NEW_SERVICE:
		if (svc == NULL)
			ret = ip_vs_add_service(ipvs, &usvc, &svc);
		else
			ret = -EEXIST;
		break;
	case IPVS_CMD_SET_SERVICE:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IPVS_CMD_DEL_SERVICE:
		ret = ip_vs_del_service(svc);
		/* do not use svc, it can be freed */
		break;
	case IPVS_CMD_NEW_DEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IPVS_CMD_SET_DEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IPVS_CMD_DEL_DEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	case IPVS_CMD_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	default:
		ret = -EINVAL;
	}

out:
	mutex_unlock(&ipvs->service_mutex);

	return ret;
}

/*
 *	doit handler for IPVS_CMD_GET_SERVICE, IPVS_CMD_GET_INFO and
 *	IPVS_CMD_GET_CONFIG.
 *
 *	Builds a single reply message whose command is the matching
 *	NEW_SERVICE / SET_INFO / SET_CONFIG and sends it with
 *	genlmsg_reply().  The reply is built and sent inside an RCU
 *	read-side section.
 *
 *	Returns -EINVAL for any other command, -ENOMEM when the reply
 *	cannot be allocated, -EMSGSIZE when the reply does not fit,
 *	-ESRCH when the requested service does not exist, the error
 *	carried by ip_vs_genl_find_service(), or the result of
 *	genlmsg_reply().  msg is freed here only on the error paths.
 */
static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
	struct sk_buff *msg;
	void *reply;
	int ret, cmd, reply_cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	if (cmd == IPVS_CMD_GET_SERVICE)
		reply_cmd = IPVS_CMD_NEW_SERVICE;
	else if (cmd == IPVS_CMD_GET_INFO)
		reply_cmd = IPVS_CMD_SET_INFO;
	else if (cmd == IPVS_CMD_GET_CONFIG)
		reply_cmd = IPVS_CMD_SET_CONFIG;
	else {
		pr_err("unknown Generic Netlink command\n");
		return -EINVAL;
	}

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	rcu_read_lock();

	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
	if (reply == NULL)
		goto nla_put_failure;

	switch (cmd) {
	case IPVS_CMD_GET_SERVICE:
	{
		struct ip_vs_service *svc;

		svc = ip_vs_genl_find_service(ipvs,
					      info->attrs[IPVS_CMD_ATTR_SERVICE]);
		if (IS_ERR(svc)) {
			ret = PTR_ERR(svc);
			goto out_err;
		} else if (svc) {
			ret = ip_vs_genl_fill_service(msg, svc);
			if (ret)
				goto nla_put_failure;
		} else {
			ret = -ESRCH;
			goto out_err;
		}

		break;
	}

	case IPVS_CMD_GET_CONFIG:
	{
		struct ip_vs_timeout_user t;

		__ip_vs_get_timeouts(ipvs, &t);
		/* Each timeout is reported only when the corresponding
		 * protocol support is compiled in.
		 */
#ifdef CONFIG_IP_VS_PROTO_TCP
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
				t.tcp_timeout) ||
		    nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
				t.tcp_fin_timeout))
			goto nla_put_failure;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
			goto nla_put_failure;
#endif

		break;
	}

	case IPVS_CMD_GET_INFO:
		if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
				IP_VS_VERSION_CODE) ||
		    nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
				get_conn_tab_size(ipvs)))
			goto nla_put_failure;
		break;
	}

	genlmsg_end(msg, reply);
	ret = genlmsg_reply(msg, info);
	goto out;

nla_put_failure:
	/* Overrides any error code set before the jump with -EMSGSIZE */
	pr_err("not enough space in Netlink message\n");
	ret = -EMSGSIZE;

out_err:
	nlmsg_free(msg);
out:
	rcu_read_unlock();

	return ret;
}


/*
 *	Generic Netlink operations of the IPVS family.
 *
 *	Every command requires GENL_ADMIN_PERM and opts out of strict and
 *	dump validation.  IPVS_CMD_GET_SERVICE provides both a doit and a
 *	dumpit handler; IPVS_CMD_GET_DEST and IPVS_CMD_GET_DAEMON are
 *	dump-only.
 */
static const struct genl_small_ops ip_vs_genl_ops[] = {
	{
		.cmd	= IPVS_CMD_NEW_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_SET_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_DEL_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
		.dumpit	= ip_vs_genl_dump_services,
	},
	{
		.cmd	= IPVS_CMD_NEW_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_SET_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_DEL_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.dumpit	= ip_vs_genl_dump_dests,
	},
	{
		.cmd	= IPVS_CMD_NEW_DAEMON,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_daemon,
	},
	{
		.cmd	= IPVS_CMD_DEL_DAEMON,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_daemon,
	},
	{
		.cmd	= IPVS_CMD_GET_DAEMON,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.dumpit	= ip_vs_genl_dump_daemons,
	},
	{
		.cmd	= IPVS_CMD_SET_CONFIG,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_CONFIG,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_INFO,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
	},
	{
		.cmd	= IPVS_CMD_ZERO,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_FLUSH,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
};

/*
 *	The IPVS Generic Netlink family.  Top-level attributes are
 *	validated against ip_vs_cmd_policy, the family is usable from
 *	any network namespace (netnsok) and operations are marked as
 *	parallel (parallel_ops).
 */
static struct genl_family ip_vs_genl_family __ro_after_init = {
	.hdrsize	= 0,
	.name		= IPVS_GENL_NAME,
	.version	= IPVS_GENL_VERSION,
	.maxattr	= IPVS_CMD_ATTR_MAX,
	.policy = ip_vs_cmd_policy,
	.netnsok        = true,         /* Let ipvsadm work in any netns */
	.module		= THIS_MODULE,
	.small_ops	= ip_vs_genl_ops,
	.n_small_ops	= ARRAY_SIZE(ip_vs_genl_ops),
	.resv_start_op	= IPVS_CMD_FLUSH + 1,
	.parallel_ops	= 1,
};

/* Register the IPVS Generic Netlink family; returns the genl core result. */
static int __init ip_vs_genl_register(void)
{
	return genl_register_family(&ip_vs_genl_family);
}

/* Unregister the IPVS Generic Netlink family. */
static void ip_vs_genl_unregister(void)
{
	genl_unregister_family(&ip_vs_genl_family);
}

/* End of Generic Netlink interface definitions */

/*
 * per netns init/exit func.
 */
#ifdef CONFIG_SYSCTL
/*
 *	Per-netns sysctl setup: initialise the defense locks and delayed
 *	works, point every sysctl table entry at this netns' variables,
 *	register the table under "net/ipv4/vs", start the estimator for
 *	the total stats and schedule the defense work.
 *
 *	init_net uses vs_vars directly; every other netns works on a
 *	kmemdup() copy, which is freed again on failure.
 *
 *	Returns 0 on success, -ENOMEM when the table copy or the sysctl
 *	registration fails, or the error from ip_vs_start_estimator().
 */
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
	struct net *net = ipvs->net;
	struct ctl_table *tbl;
	int idx, ret;
	size_t ctl_table_size = ARRAY_SIZE(vs_vars);
	/* netns owned by a user namespace other than the initial one */
	bool unpriv = net->user_ns != &init_user_ns;

	atomic_set(&ipvs->dropentry, 0);
	spin_lock_init(&ipvs->dropentry_lock);
	spin_lock_init(&ipvs->droppacket_lock);
	spin_lock_init(&ipvs->securetcp_lock);
	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
	INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
			  expire_nodest_conn_handler);
	ipvs->est_stopped = 0;

	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
		if (tbl == NULL)
			return -ENOMEM;
	} else
		tbl = vs_vars;
	/* Initialize sysctl defaults */
	for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) {
		if (tbl[idx].proc_handler == proc_do_defense_mode)
			tbl[idx].extra2 = ipvs;
	}
	/* From here on tbl is walked positionally with idx++.
	 * NOTE(review): this presumably has to follow the entry order
	 * (and the #ifdef layout) of vs_vars, which is defined outside
	 * this part of the file - confirm before reordering anything.
	 */
	idx = 0;
	ipvs->sysctl_amemthresh = 1024;
	tbl[idx++].data = &ipvs->sysctl_amemthresh;
	ipvs->sysctl_am_droprate = 10;
	tbl[idx++].data = &ipvs->sysctl_am_droprate;

	/* Inherit both limits from init_net:conn_max */
	ipvs->conn_max_limit = net_eq(net, &init_net) ? IP_VS_CONN_MAX :
			       READ_ONCE(*(int *)vs_vars[idx].data);
	ipvs->sysctl_conn_max = ipvs->conn_max_limit;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_conn_max;

	tbl[idx++].data = &ipvs->sysctl_drop_entry;
	tbl[idx++].data = &ipvs->sysctl_drop_packet;
#ifdef CONFIG_IP_VS_NFCT
	tbl[idx++].data = &ipvs->sysctl_conntrack;
#endif
	tbl[idx++].data = &ipvs->sysctl_secure_tcp;
	ipvs->sysctl_snat_reroute = 1;
	tbl[idx++].data = &ipvs->sysctl_snat_reroute;
	ipvs->sysctl_sync_ver = 1;
	tbl[idx++].data = &ipvs->sysctl_sync_ver;
	ipvs->sysctl_sync_ports = 1;
	tbl[idx++].data = &ipvs->sysctl_sync_ports;
	tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;

	/* Entries guarded by "if (unpriv)" are made read-only (0444)
	 * for such a netns.
	 */
	ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;

	ipvs->sysctl_sync_sock_size = 0;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx++].data = &ipvs->sysctl_sync_sock_size;

	tbl[idx++].data = &ipvs->sysctl_cache_bypass;
	tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
	tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
	tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
	tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
	tbl[idx].data = &ipvs->sysctl_sync_threshold;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
	ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
	tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
	ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
	tbl[idx++].data = &ipvs->sysctl_sync_retries;
	tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
	ipvs->sysctl_pmtu_disc = 1;
	tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
	tbl[idx++].data = &ipvs->sysctl_backup_only;
	ipvs->sysctl_conn_reuse_mode = 1;
	tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
	tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
	tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;

	ipvs->sysctl_run_estimation = 1;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_run_estimation;

	ipvs->est_cpulist_valid = 0;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_est_cpulist;

	ipvs->sysctl_est_nice = IPVS_EST_NICE;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_est_nice;

	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_conn_lfactor;

	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_svc_lfactor;

#ifdef CONFIG_IP_VS_DEBUG
	/* Global sysctls must be ro in non-init netns */
	if (!net_eq(net, &init_net))
		tbl[idx++].mode = 0444;
#endif

	ret = -ENOMEM;
	ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl,
						  ctl_table_size);
	if (!ipvs->sysctl_hdr)
		goto err;
	ipvs->sysctl_tbl = tbl;

	ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
	if (ret < 0)
		goto err;

	/* Schedule defense work */
	queue_delayed_work(system_long_wq, &ipvs->defense_work,
			   DEFENSE_TIMER_PERIOD);

	return 0;

err:
	/* Also reached with sysctl_hdr == NULL when registration failed */
	unregister_net_sysctl_table(ipvs->sysctl_hdr);
	if (!net_eq(net, &init_net))
		kfree(tbl);
	return ret;
}

/*
 *	Per-netns sysctl teardown: cancel the delayed works set up by
 *	ip_vs_control_net_init_sysctl(), unregister the sysctl table,
 *	stop the total-stats estimator if it is still running, release
 *	the estimator cpulist when one is valid and free the per-netns
 *	table copy (never the shared table used by init_net).
 */
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
{
	struct net *net = ipvs->net;

	cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work);
	cancel_delayed_work_sync(&ipvs->defense_work);
	cancel_work_sync(&ipvs->defense_work.work);
	unregister_net_sysctl_table(ipvs->sysctl_hdr);
	if (ipvs->tot_stats->s.est.ktid != -2) {
		/* Not stopped yet? This happens only on netns init error and
		 * we even do not need to lock the service_mutex for this case.
		 */
		/* NOTE(review): the mutex is taken anyway, presumably
		 * because ip_vs_stop_estimator() expects it held - confirm.
		 */
		mutex_lock(&ipvs->service_mutex);
		ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
		mutex_unlock(&ipvs->service_mutex);
	}

	if (ipvs->est_cpulist_valid)
		free_cpumask_var(ipvs->sysctl_est_cpulist);

	if (!net_eq(net, &init_net))
		kfree(ipvs->sysctl_tbl);
}

#else

/* Without CONFIG_SYSCTL the per-netns sysctl setup/teardown are no-ops */
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; }
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { }

#endif

/*
 *	Netdevice notifier handled by ip_vs_dst_event().  With IPv6
 *	support it is given a priority of ADDRCONF_NOTIFY_PRIORITY + 5.
 */
static struct notifier_block ip_vs_dst_notifier = {
	.notifier_call = ip_vs_dst_event,
#ifdef CONFIG_IP_VS_IPV6
	.priority = ADDRCONF_NOTIFY_PRIORITY + 5,
#endif
};

/*
 *	Per-netns control initialisation: locks, service/real-server
 *	tables, destination trash, per-family counters, total stats,
 *	the procfs entries and finally the sysctl part.
 *
 *	Returns 0 on success.  On failure everything set up so far is
 *	undone in reverse order and a negative errno is returned
 *	(-ENOMEM for allocation and procfs failures, otherwise the error
 *	from ip_vs_control_net_init_sysctl()).
 */
int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
	int ret = -ENOMEM;
	int idx;

	/* Initialize service_mutex, svc_table per netns */
	__mutex_init(&ipvs->service_mutex, "ipvs->service_mutex",
		     &__ipvs_service_key);
	init_rwsem(&ipvs->svc_resize_sem);
	init_rwsem(&ipvs->svc_replace_sem);
	INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler);
	atomic_set(&ipvs->svc_table_changes, 0);
	RCU_INIT_POINTER(ipvs->svc_table, NULL);

	/* Initialize rs_table */
	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
		INIT_HLIST_HEAD(&ipvs->rs_table[idx]);

	INIT_LIST_HEAD(&ipvs->dest_trash);
	spin_lock_init(&ipvs->dest_trash_lock);
	timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0);
	/* Reset the per address family service counters */
	for (idx = 0; idx < IP_VS_AF_MAX; idx++) {
		atomic_set(&ipvs->num_services[idx], 0);
		atomic_set(&ipvs->fwm_services[idx], 0);
		atomic_set(&ipvs->nonfwm_services[idx], 0);
		atomic_set(&ipvs->ftpsvc_counter[idx], 0);
		atomic_set(&ipvs->nullsvc_counter[idx], 0);
		atomic_set(&ipvs->conn_out_counter[idx], 0);
	}

	INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
	ipvs->sysctl_svc_lfactor = ip_vs_svc_default_load_factor(ipvs);

	/* procfs stats */
	ipvs->tot_stats = kzalloc_obj(*ipvs->tot_stats);
	if (!ipvs->tot_stats)
		goto out;
	if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0)
		goto err_tot_stats;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
			     &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)))
		goto err_vs;
	if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
				    ip_vs_stats_show, NULL))
		goto err_stats;
	if (!proc_create_net_single("ip_vs_stats_percpu", 0,
				    ipvs->net->proc_net,
				    ip_vs_stats_percpu_show, NULL))
		goto err_percpu;
	if (!proc_create_net_single("ip_vs_status", 0440, ipvs->net->proc_net,
				    ip_vs_status_show, NULL))
		goto err_status;
#endif

	ret = ip_vs_control_net_init_sysctl(ipvs);
	if (ret < 0)
		goto err;

	return 0;

	/* Error unwinding: each label removes what was created before
	 * the step that failed, in reverse creation order.
	 */
err:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_vs_status", ipvs->net->proc_net);

err_status:
	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);

err_percpu:
	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);

err_stats:
	remove_proc_entry("ip_vs", ipvs->net->proc_net);

err_vs:
#endif
	ip_vs_stats_release(&ipvs->tot_stats->s);

err_tot_stats:
	kfree(ipvs->tot_stats);

out:
	return ret;
}

/*
 *	Per-netns control teardown: clean the destination trash, undo
 *	the sysctl part, cancel the estimator reload work, remove the
 *	procfs entries and hand the total stats to call_rcu() so that
 *	ip_vs_stats_rcu_free() runs after a grace period.
 */
void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
	ip_vs_trash_cleanup(ipvs);
	ip_vs_control_net_cleanup_sysctl(ipvs);
	cancel_delayed_work_sync(&ipvs->est_reload_work);
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_vs_status", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
	remove_proc_entry("ip_vs", ipvs->net->proc_net);
#endif
	call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free);
}

/*
 *	Register the user space interfaces: the sockopt handlers first,
 *	then the Generic Netlink family.  If the latter fails the
 *	sockopt registration is rolled back.
 *
 *	Returns 0 on success or the error of the step that failed.
 */
int __init ip_vs_register_nl_ioctl(void)
{
	int ret;

	ret = nf_register_sockopt(&ip_vs_sockopts);
	if (ret) {
		pr_err("cannot register sockopt.\n");
		goto err_sock;
	}

	ret = ip_vs_genl_register();
	if (ret) {
		pr_err("cannot register Generic Netlink interface.\n");
		goto err_genl;
	}
	return 0;

err_genl:
	nf_unregister_sockopt(&ip_vs_sockopts);
err_sock:
	return ret;
}

/* Unregister the user space interfaces in reverse registration order. */
void ip_vs_unregister_nl_ioctl(void)
{
	ip_vs_genl_unregister();
	nf_unregister_sockopt(&ip_vs_sockopts);
}

/*
 *	Module-wide control initialisation: registers the netdevice
 *	notifier.  Returns 0 on success or the negative error from
 *	register_netdevice_notifier().
 */
int __init ip_vs_control_init(void)
{
	int ret;

	ret = register_netdevice_notifier(&ip_vs_dst_notifier);
	if (ret < 0)
		return ret;

	return 0;
}


/* Module-wide control cleanup: unregisters the netdevice notifier. */
void ip_vs_control_cleanup(void)
{
	unregister_netdevice_notifier(&ip_vs_dst_notifier);
	/* relying on common rcu_barrier() in ip_vs_cleanup() */
}