// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <ja@ssi.bg>
 *
 * Changes:
 */

#define pr_fmt(fmt) "IPVS: " fmt

#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/mutex.h>
#include <linux/rcupdate_wait.h>

#include <net/net_namespace.h>
#include <linux/nsproxy.h>
#include <net/ip.h>
#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#endif
#include <net/route.h>
#include <net/sock.h>
#include <net/genetlink.h>

#include <linux/uaccess.h>

#include <net/ip_vs.h>

MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME);

static struct lock_class_key __ipvs_service_key;

/* sysctl variables */

#ifdef CONFIG_IP_VS_DEBUG
static int sysctl_ip_vs_debug_level = 0;

int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
#endif


/* Protos */
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);

#ifdef CONFIG_IP_VS_IPV6
/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
static bool __ip_vs_addr_is_local_v6(struct net *net,
				     const struct in6_addr *addr)
{
	struct flowi6 fl6 = {
		.daddr = *addr,
	};
	struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
	bool is_local;

	is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);

	dst_release(dst);
	return is_local;
}
#endif

#ifdef CONFIG_SYSCTL
/*
 *	update_defense_level is called from keventd and from sysctl,
 *	so it needs to protect itself from softirqs
 */
static void update_defense_level(struct netns_ipvs *ipvs)
{
	struct sysinfo i;
	int availmem;
	int amemthresh;
	int nomem;
	int to_change = -1;

	/* we only count free and buffered memory (in pages) */
	si_meminfo(&i);
	availmem = i.freeram + i.bufferram;
	/* however, in linux 2.5 the i.bufferram is total page cache size,
	   so we need to adjust it */
	/* si_swapinfo(&i); */
	/* availmem = availmem - (i.totalswap - i.freeswap); */

	amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0);
	nomem = (availmem < amemthresh);

	local_bh_disable();

	/* drop_entry */
	spin_lock(&ipvs->dropentry_lock);
	switch (ipvs->sysctl_drop_entry) {
	case 0:
		atomic_set(&ipvs->dropentry, 0);
		break;
	case 1:
		if (nomem) {
			atomic_set(&ipvs->dropentry, 1);
			ipvs->sysctl_drop_entry = 2;
		} else {
			atomic_set(&ipvs->dropentry, 0);
		}
		break;
	case 2:
		if (nomem) {
			atomic_set(&ipvs->dropentry, 1);
		} else {
			atomic_set(&ipvs->dropentry, 0);
			ipvs->sysctl_drop_entry = 1;
		}
		break;
	case 3:
		atomic_set(&ipvs->dropentry, 1);
		break;
	}
	spin_unlock(&ipvs->dropentry_lock);

	/* drop_packet */
	spin_lock(&ipvs->droppacket_lock);
	switch (ipvs->sysctl_drop_packet) {
	case 0:
		ipvs->drop_rate = 0;
		break;
	case 1:
		if (nomem) {
			ipvs->drop_counter = amemthresh / (amemthresh - availmem);
			ipvs->drop_rate = ipvs->drop_counter;
			ipvs->sysctl_drop_packet = 2;
		} else {
			ipvs->drop_rate = 0;
		}
		break;
	case 2:
		if (nomem) {
			ipvs->drop_counter = amemthresh / (amemthresh - availmem);
			ipvs->drop_rate = ipvs->drop_counter;
		} else {
			ipvs->drop_rate = 0;
			ipvs->sysctl_drop_packet = 1;
		}
		break;
	case 3:
		ipvs->drop_rate = ipvs->sysctl_am_droprate;
		break;
	}
	spin_unlock(&ipvs->droppacket_lock);

	/* secure_tcp */
	spin_lock(&ipvs->securetcp_lock);
	switch (ipvs->sysctl_secure_tcp) {
	case 0:
		if (ipvs->old_secure_tcp >= 2)
			to_change = 0;
		break;
	case 1:
		if (nomem) {
			if (ipvs->old_secure_tcp < 2)
				to_change = 1;
			ipvs->sysctl_secure_tcp = 2;
		} else {
			if (ipvs->old_secure_tcp >= 2)
				to_change = 0;
		}
		break;
	case 2:
		if (nomem) {
			if (ipvs->old_secure_tcp < 2)
				to_change = 1;
		} else {
			if (ipvs->old_secure_tcp >= 2)
				to_change = 0;
			ipvs->sysctl_secure_tcp = 1;
		}
		break;
	case 3:
		if (ipvs->old_secure_tcp < 2)
			to_change = 1;
		break;
	}
	ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp;
	if (to_change >= 0)
		ip_vs_protocol_timeout_change(ipvs,
					      ipvs->sysctl_secure_tcp > 1);
	spin_unlock(&ipvs->securetcp_lock);

	local_bh_enable();
}
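
/* Worked example (a sketch of the drop_packet math above): with
 * amemthresh at 1024 pages and availmem at 960 pages, drop_counter
 * becomes 1024 / (1024 - 960) = 16, i.e. roughly one of every 16
 * packets is dropped while memory stays scarce.  The closer availmem
 * falls toward zero, the smaller the ratio and the harsher the drop
 * rate.
 */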

/* Handler for delayed work for expiring no
 * destination connections
 */
static void expire_nodest_conn_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs;

	ipvs = container_of(work, struct netns_ipvs,
			    expire_nodest_conn_work.work);
	ip_vs_expire_nodest_conn_flush(ipvs);
}

/*
 *	Timer for checking the defense
 */
#define DEFENSE_TIMER_PERIOD	1*HZ

static void defense_work_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs =
		container_of(work, struct netns_ipvs, defense_work.work);

	update_defense_level(ipvs);
	if (atomic_read(&ipvs->dropentry))
		ip_vs_random_dropentry(ipvs);
	queue_delayed_work(system_long_wq, &ipvs->defense_work,
			   DEFENSE_TIMER_PERIOD);
}
#endif

static void est_reload_work_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs =
		container_of(work, struct netns_ipvs, est_reload_work.work);
	int genid_done = atomic_read(&ipvs->est_genid_done);
	unsigned long delay = HZ / 10;	/* repeat startups after failure */
	bool repeat = false;
	int genid;
	int id;

	mutex_lock(&ipvs->est_mutex);
	genid = atomic_read(&ipvs->est_genid);
	for (id = 0; id < ipvs->est_kt_count; id++) {
		struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];

		/* netns clean up started, abort delayed work */
		if (!READ_ONCE(ipvs->enable))
			goto unlock;
		if (!kd)
			continue;
		/* New config ? Stop kthread tasks */
		if (genid != genid_done) {
			if (!id) {
				/* Only we can stop kt 0 but not under mutex */
				mutex_unlock(&ipvs->est_mutex);
				ip_vs_est_kthread_stop(kd);
				mutex_lock(&ipvs->est_mutex);
				if (!READ_ONCE(ipvs->enable))
					goto unlock;
				/* kd for kt 0 is never destroyed */
			} else {
				ip_vs_est_kthread_stop(kd);
			}
		}
		if (!kd->task && !ip_vs_est_stopped(ipvs)) {
			bool start;

			/* Do not start kthreads above 0 in calc phase */
			if (id)
				start = !ipvs->est_calc_phase;
			else
				start = kd->needed;
			if (start && ip_vs_est_kthread_start(ipvs, kd) < 0)
				repeat = true;
		}
	}

	atomic_set(&ipvs->est_genid_done, genid);

	if (repeat)
		queue_delayed_work(system_long_wq, &ipvs->est_reload_work,
				   delay);

unlock:
	mutex_unlock(&ipvs->est_mutex);
}

static int get_conn_tab_size(struct netns_ipvs *ipvs)
{
	const struct ip_vs_rht *t;
	int size = 0;

	rcu_read_lock();
	t = rcu_dereference(ipvs->conn_tab);
	if (t)
		size = t->size;
	rcu_read_unlock();

	return size;
}

int
ip_vs_use_count_inc(void)
{
	return try_module_get(THIS_MODULE);
}

void
ip_vs_use_count_dec(void)
{
	module_put(THIS_MODULE);
}


/* Service hashing:
 * Operation			Locking order
 * ---------------------------------------------------------------------------
 * add table			service_mutex, svc_resize_sem(W)
 * del table			service_mutex
 * move between tables		svc_resize_sem(W), seqcount_t(W), bit lock
 * add/del service		service_mutex, bit lock
 * find service			RCU, seqcount_t(R)
 * walk services(blocking)	service_mutex, svc_resize_sem(R)
 * walk services(non-blocking)	RCU, seqcount_t(R)
 *
 * - new tables are linked/unlinked under service_mutex and svc_resize_sem
 * - new table is linked on resizing and all operations can run in parallel
 * in 2 tables until the new table is registered as current one
 * - two contexts can modify buckets: config and table resize, both in
 * process context
 * - only table resizer can move entries, so we do not protect t->seqc[]
 * items with t->lock[]
 * - lookups occur under RCU lock and seqcount reader lock to detect if
 * services are moved to new table
 * - move operations may disturb readers: find operation will not miss entries
 * but walkers may see same entry twice if they are forced to retry chains
 * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold
 * service_mutex to disallow new tables to be installed or to check
 * svc_table_changes and repeat the RCU read section if new table is installed
 */
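
/* Illustrative reader pattern (a sketch, not an API): non-blocking
 * walkers can detect a table switch by sampling svc_table_changes
 * around the RCU read section, as ip_vs_dst_event() near the end of
 * this file does:
 *
 *	old_gen = atomic_read(&ipvs->svc_table_changes);
 *	rcu_read_lock();
 * repeat:
 *	smp_rmb();
 *	... walk ipvs->svc_table buckets ...
 *	new_gen = atomic_read(&ipvs->svc_table_changes);
 *	if (new_gen != old_gen) { old_gen = new_gen; goto repeat; }
 *	rcu_read_unlock();
 */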

/*
 *	Returns hash value for virtual service
 */
static inline u32
ip_vs_svc_hashval(struct ip_vs_rht *t, int af, unsigned int proto,
		  const union nf_inet_addr *addr, __be16 port)
{
	return ip_vs_rht_hash_linfo(t, af, addr, ntohs(port), proto);
}

/*
 *	Returns hash value of fwmark for virtual service lookup
 */
static inline u32 ip_vs_svc_fwm_hashval(struct ip_vs_rht *t, int af,
					__u32 fwmark)
{
	return jhash_2words(fwmark, af, (u32)t->hash_key.key[0]);
}

/* Hashes a service in the svc_table by <proto,addr,port> or by fwmark */
static int ip_vs_svc_hash(struct ip_vs_service *svc)
{
	struct netns_ipvs *ipvs = svc->ipvs;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t;
	u32 hash;

	if (svc->flags & IP_VS_SVC_F_HASHED) {
		pr_err("%s(): request for already hashed, called from %pS\n",
		       __func__, __builtin_return_address(0));
		return 0;
	}

	/* increase its refcnt because it is referenced by the svc table */
	atomic_inc(&svc->refcnt);

	/* New entries go into recent table */
	t = rcu_dereference_protected(ipvs->svc_table, 1);
	t = rcu_dereference_protected(t->new_tbl, 1);

	if (svc->fwmark == 0) {
		/*
		 *	Hash it by <protocol,addr,port>
		 */
		hash = ip_vs_svc_hashval(t, svc->af, svc->protocol,
					 &svc->addr, svc->port);
	} else {
		/*
		 *	Hash it by fwmark
		 */
		hash = ip_vs_svc_fwm_hashval(t, svc->af, svc->fwmark);
	}
	head = t->buckets + (hash & t->mask);
	hlist_bl_lock(head);
	WRITE_ONCE(svc->hash_key, ip_vs_rht_build_hash_key(t, hash));
	svc->flags |= IP_VS_SVC_F_HASHED;
	hlist_bl_add_head_rcu(&svc->s_list, head);
	hlist_bl_unlock(head);

	return 1;
}


/*
 *	Unhashes a service from svc_table.
 *	Should be called with locked tables.
 */
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
{
	struct netns_ipvs *ipvs = svc->ipvs;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t;
	u32 hash_key2;
	u32 hash_key;

	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
		pr_err("%s(): request for unhash flagged, called from %pS\n",
		       __func__, __builtin_return_address(0));
		return 0;
	}

	t = rcu_dereference_protected(ipvs->svc_table, 1);
	hash_key = READ_ONCE(svc->hash_key);
	/* We need to lock the bucket in the right table */
	if (ip_vs_rht_same_table(t, hash_key)) {
		head = t->buckets + (hash_key & t->mask);
		hlist_bl_lock(head);
		/* Ensure hash_key is read under lock */
		hash_key2 = READ_ONCE(svc->hash_key);
		/* Moved to new table ? */
		if (hash_key != hash_key2) {
			hlist_bl_unlock(head);
			t = rcu_dereference_protected(t->new_tbl, 1);
			head = t->buckets + (hash_key2 & t->mask);
			hlist_bl_lock(head);
		}
	} else {
		/* It is already moved to new table */
		t = rcu_dereference_protected(t->new_tbl, 1);
		head = t->buckets + (hash_key & t->mask);
		hlist_bl_lock(head);
	}
	/* Remove it from svc_table */
	hlist_bl_del_rcu(&svc->s_list);

	svc->flags &= ~IP_VS_SVC_F_HASHED;
	atomic_dec(&svc->refcnt);
	hlist_bl_unlock(head);
	return 1;
}
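
/* Unhash locking note (summary of the dance above): svc->hash_key is
 * re-read after taking the bucket lock because the resizer may move
 * the entry, rewriting hash_key, between the first read and the lock.
 * The position read under the lock appears stable, since only the
 * resizer rewrites hash_key and it moves each entry at most once per
 * resize.
 */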

/*
 *	Get service by {netns, proto,addr,port} in the service table.
 */
static inline struct ip_vs_service *
__ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol,
		     const union nf_inet_addr *vaddr, __be16 vport)
{
	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct ip_vs_rht *t, *p;
	struct hlist_bl_node *e;
	u32 hash, hash_key;

	ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
		/* Check for "full" addressed entries */
		hash = ip_vs_svc_hashval(t, af, protocol, vaddr, vport);

		hash_key = ip_vs_rht_build_hash_key(t, hash);
		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
			hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
				if (READ_ONCE(svc->hash_key) == hash_key &&
				    svc->af == af &&
				    ip_vs_addr_equal(af, &svc->addr, vaddr) &&
				    svc->port == vport &&
				    svc->protocol == protocol && !svc->fwmark) {
					/* HIT */
					return svc;
				}
			}
		}
	}

	return NULL;
}


/*
 *	Get service by {fwmark} in the service table.
 */
static inline struct ip_vs_service *
__ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark)
{
	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct ip_vs_rht *t, *p;
	struct hlist_bl_node *e;
	u32 hash, hash_key;

	ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
		/* Check for fwmark addressed entries */
		hash = ip_vs_svc_fwm_hashval(t, af, fwmark);

		hash_key = ip_vs_rht_build_hash_key(t, hash);
		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
			hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
				if (READ_ONCE(svc->hash_key) == hash_key &&
				    svc->fwmark == fwmark && svc->af == af) {
					/* HIT */
					return svc;
				}
			}
		}
	}

	return NULL;
}
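
/* Lookup order used by ip_vs_service_find() below: fwmark match first,
 * then the exact <protocol,addr,port> tuple, then the FTP control port
 * (so FTP data connections can reuse the FTPPORT service), and finally
 * a catch-all service with port zero.  Each step is guarded by a
 * per-family counter so empty classes are skipped cheaply.
 */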
/* Find service, called under RCU lock */
struct ip_vs_service *
ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol,
		   const union nf_inet_addr *vaddr, __be16 vport)
{
	struct ip_vs_service *svc = NULL;
	int af_id = ip_vs_af_index(af);

	/*
	 *	Check the table hashed by fwmark first
	 */
	if (fwmark && atomic_read(&ipvs->fwm_services[af_id])) {
		svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark);
		if (svc)
			goto out;
	}

	if (!atomic_read(&ipvs->nonfwm_services[af_id]))
		goto out;

	/*
	 *	Check the table hashed by <protocol,addr,port>
	 *	for "full" addressed entries
	 */
	svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
	if (svc)
		goto out;

	if (protocol == IPPROTO_TCP &&
	    atomic_read(&ipvs->ftpsvc_counter[af_id]) &&
	    (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) {
		/*
		 * Check if ftp service entry exists, the packet
		 * might belong to FTP data connections.
		 */
		svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT);
		if (svc)
			goto out;
	}

	if (atomic_read(&ipvs->nullsvc_counter[af_id])) {
		/*
		 * Check if the catch-all port (port zero) exists
		 */
		svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0);
	}

out:
	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
		      fwmark, ip_vs_proto_name(protocol),
		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
		      svc ? "hit" : "not hit");

	return svc;
}

/* Return the number of registered services */
static int ip_vs_get_num_services(struct netns_ipvs *ipvs)
{
	int ns = 0, ni = IP_VS_AF_MAX;

	while (--ni >= 0)
		ns += atomic_read(&ipvs->num_services[ni]);
	return ns;
}

/* Get default load factor to map num_services/u_thresh to t->size */
static int ip_vs_svc_default_load_factor(struct netns_ipvs *ipvs)
{
	int factor;

	if (net_eq(ipvs->net, &init_net))
		factor = -3;	/* grow if load is above 12.5% */
	else
		factor = -2;	/* grow if load is above 25% */
	return factor;
}

/* Get the desired svc_table size */
static int ip_vs_svc_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t,
				  int lfactor)
{
	return ip_vs_rht_desired_size(ipvs, t, ip_vs_get_num_services(ipvs),
				      lfactor, IP_VS_SVC_TAB_MIN_BITS,
				      IP_VS_SVC_TAB_MAX_BITS);
}

/* Allocate svc_table */
static struct ip_vs_rht *ip_vs_svc_table_alloc(struct netns_ipvs *ipvs,
					       int buckets, int lfactor)
{
	struct ip_vs_rht *t;
	int scounts, locks;

	/* No frequent lookups to race with resizing, so use max of 64
	 * seqcounts. Only resizer moves entries, so use 0 locks.
	 */
	scounts = clamp(buckets >> 4, 1, 64);
	locks = 0;

	t = ip_vs_rht_alloc(buckets, scounts, locks);
	if (!t)
		return NULL;
	t->lfactor = lfactor;
	ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_SVC_TAB_MIN_BITS,
				 IP_VS_SVC_TAB_MAX_BITS);
	return t;
}
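
/* Worked example (a sketch; the exact threshold math lives in
 * ip_vs_rht_set_thresholds()): a negative lfactor of -2 means "grow
 * when load exceeds 25%", so a 256-bucket table would be scheduled for
 * resize once the service count climbs past roughly 64, and shrunk
 * again once it drops below a matching lower threshold.
 */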

/* svc_table resizer work */
static void svc_resize_work_handler(struct work_struct *work)
{
	struct hlist_bl_head *head, *head2;
	struct ip_vs_rht *t_free = NULL;
	unsigned int resched_score = 0;
	struct hlist_bl_node *cn, *nn;
	struct ip_vs_rht *t, *t_new;
	struct ip_vs_service *svc;
	struct netns_ipvs *ipvs;
	bool more_work = true;
	seqcount_t *sc;
	int limit = 0;
	int new_size;
	int lfactor;
	u32 bucket;

	ipvs = container_of(work, struct netns_ipvs, svc_resize_work.work);

	if (!down_write_trylock(&ipvs->svc_resize_sem))
		goto out;
	if (!mutex_trylock(&ipvs->service_mutex))
		goto unlock_sem;
	more_work = false;
	clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags);
	if (!READ_ONCE(ipvs->enable) ||
	    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		goto unlock_m;
	t = rcu_dereference_protected(ipvs->svc_table, 1);
	/* Do nothing if table is removed */
	if (!t)
		goto unlock_m;
	/* New table needs to be registered? BUG! */
	if (t != rcu_dereference_protected(t->new_tbl, 1))
		goto unlock_m;

	lfactor = sysctl_svc_lfactor(ipvs);
	/* Should we resize ? */
	new_size = ip_vs_svc_desired_size(ipvs, t, lfactor);
	if (new_size == t->size && lfactor == t->lfactor)
		goto unlock_m;

	t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
	if (!t_new) {
		more_work = true;
		goto unlock_m;
	}
	/* Flip the table_id */
	t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK;

	rcu_assign_pointer(t->new_tbl, t_new);
	/* Allow add/del to new_tbl while moving from old table */
	mutex_unlock(&ipvs->service_mutex);

	ip_vs_rht_for_each_bucket(t, bucket, head) {
same_bucket:
		if (++limit >= 16) {
			if (!READ_ONCE(ipvs->enable) ||
			    test_bit(IP_VS_WORK_SVC_NORESIZE,
				     &ipvs->work_flags))
				goto unlock_sem;
			if (resched_score >= 100) {
				resched_score = 0;
				cond_resched();
			}
			limit = 0;
		}
		if (hlist_bl_empty(head)) {
			resched_score++;
			continue;
		}
		/* Preemption calls ahead... */
		resched_score = 0;

		sc = &t->seqc[bucket & t->seqc_mask];
		/* seqcount_t usage considering PREEMPT_RT rules:
		 * - we are the only writer => preemption can be allowed
		 * - readers (SoftIRQ) => disable BHs
		 * - readers (processes) => preemption should be disabled
		 */
		local_bh_disable();
		preempt_disable_nested();
		write_seqcount_begin(sc);
		hlist_bl_lock(head);

		hlist_bl_for_each_entry_safe(svc, cn, nn, head, s_list) {
			u32 hash;

			/* New hash for the new table */
			if (svc->fwmark == 0) {
				/* Hash it by <protocol,addr,port> */
				hash = ip_vs_svc_hashval(t_new, svc->af,
							 svc->protocol,
							 &svc->addr, svc->port);
			} else {
				/* Hash it by fwmark */
				hash = ip_vs_svc_fwm_hashval(t_new, svc->af,
							     svc->fwmark);
			}
			hlist_bl_del_rcu(&svc->s_list);
			head2 = t_new->buckets + (hash & t_new->mask);

			hlist_bl_lock(head2);
			WRITE_ONCE(svc->hash_key,
				   ip_vs_rht_build_hash_key(t_new, hash));
			/* t_new->seqc are not used at this stage, we race
			 * only with add/del, so only lock the bucket.
			 */
			hlist_bl_add_head_rcu(&svc->s_list, head2);
			hlist_bl_unlock(head2);
			/* Too long chain? Do it in steps */
			if (++limit >= 64)
				break;
		}

		hlist_bl_unlock(head);
		write_seqcount_end(sc);
		preempt_enable_nested();
		local_bh_enable();
		if (limit >= 64)
			goto same_bucket;
	}

	/* Tables can be switched only under service_mutex */
	while (!mutex_trylock(&ipvs->service_mutex)) {
		cond_resched();
		if (!READ_ONCE(ipvs->enable) ||
		    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
			goto unlock_sem;
	}
	if (!READ_ONCE(ipvs->enable) ||
	    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		goto unlock_m;

	rcu_assign_pointer(ipvs->svc_table, t_new);
	/* Inform readers that new table is installed */
	smp_mb__before_atomic();
	atomic_inc(&ipvs->svc_table_changes);
	t_free = t;

unlock_m:
	mutex_unlock(&ipvs->service_mutex);

unlock_sem:
	up_write(&ipvs->svc_resize_sem);

	if (t_free) {
		/* RCU readers should not see more than two tables in chain.
		 * To prevent a new table from being attached, wait here
		 * instead of freeing the old table in RCU callback.
		 */
		synchronize_rcu();
		ip_vs_rht_free(t_free);
	}

out:
	if (!READ_ONCE(ipvs->enable) || !more_work ||
	    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		return;
	queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1);
}
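
/* Resize sequence (summary of the handler above): allocate t_new, link
 * it as t->new_tbl so add/del can run against both tables, migrate
 * buckets in bounded batches under seqcount write + bucket lock, then
 * switch ipvs->svc_table to t_new under service_mutex and free the old
 * table only after an RCU grace period.  Readers therefore never see
 * more than two linked tables.
 */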

static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	atomic_inc(&svc->refcnt);
	rcu_assign_pointer(dest->svc, svc);
}

static void ip_vs_service_free(struct ip_vs_service *svc)
{
	ip_vs_stats_release(&svc->stats);
	kfree(svc);
}

static void ip_vs_service_rcu_free(struct rcu_head *head)
{
	struct ip_vs_service *svc;

	svc = container_of(head, struct ip_vs_service, rcu_head);
	ip_vs_service_free(svc);
}

static void __ip_vs_svc_put(struct ip_vs_service *svc)
{
	if (atomic_dec_and_test(&svc->refcnt)) {
		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
			      svc->fwmark,
			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
			      ntohs(svc->port));
		call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
	}
}


/*
 *	Returns hash value for real service
 */
static inline unsigned int ip_vs_rs_hashkey(int af,
					    const union nf_inet_addr *addr,
					    __be16 port)
{
	unsigned int porth = ntohs(port);
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif

	return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
		& IP_VS_RTAB_MASK;
}

/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
	unsigned int hash;
	__be16 port;

	if (dest->in_rs_table)
		return;

	switch (IP_VS_DFWD_METHOD(dest)) {
	case IP_VS_CONN_F_MASQ:
		port = dest->port;
		break;
	case IP_VS_CONN_F_TUNNEL:
		switch (dest->tun_type) {
		case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
			port = dest->tun_port;
			break;
		case IP_VS_CONN_F_TUNNEL_TYPE_IPIP:
		case IP_VS_CONN_F_TUNNEL_TYPE_GRE:
			port = 0;
			break;
		default:
			return;
		}
		break;
	default:
		return;
	}

	/*
	 *	Hash by proto,addr,port,
	 *	which are the parameters of the real service.
	 */
	hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port);

	hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
	dest->in_rs_table = 1;
}

/* Unhash ip_vs_dest from rs_table. */
static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
	/*
	 *	Remove it from the rs_table table.
	 */
	if (dest->in_rs_table) {
		hlist_del_rcu(&dest->d_list);
		dest->in_rs_table = 0;
	}
}

/* Check if real service by <proto,addr,port> is present */
bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
			    const union nf_inet_addr *daddr, __be16 dport)
{
	unsigned int hash;
	struct ip_vs_dest *dest;

	/* Check for "full" addressed entries */
	hash = ip_vs_rs_hashkey(af, daddr, dport);

	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
		if (dest->port == dport &&
		    dest->af == af &&
		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
		    (dest->protocol == protocol || dest->vfwmark) &&
		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
			/* HIT */
			return true;
		}
	}

	return false;
}
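
/* Note on rs_table keys: NAT (MASQ) destinations hash with dest->port,
 * GUE tunnels with dest->tun_port, and IPIP/GRE tunnels with port 0,
 * matching the switch in ip_vs_rs_hash() above.  Lookups must use the
 * same port notion: ip_vs_find_real_service() below probes with the
 * packet's destination port, ip_vs_find_tunnel() with the tunnel port.
 */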

/* Find real service record by <proto,addr,port>.
 * In case of multiple records with the same <proto,addr,port>, only
 * the first found record is returned.
 *
 * To be called under RCU lock.
 */
struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
					   __u16 protocol,
					   const union nf_inet_addr *daddr,
					   __be16 dport)
{
	unsigned int hash;
	struct ip_vs_dest *dest;

	/* Check for "full" addressed entries */
	hash = ip_vs_rs_hashkey(af, daddr, dport);

	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
		if (dest->port == dport &&
		    dest->af == af &&
		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
		    (dest->protocol == protocol || dest->vfwmark) &&
		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
			/* HIT */
			return dest;
		}
	}

	return NULL;
}

/* Find real service record by <af,addr,tun_port>.
 * In case of multiple records with the same <af,addr,tun_port>, only
 * the first found record is returned.
 *
 * To be called under RCU lock.
 */
struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af,
				     const union nf_inet_addr *daddr,
				     __be16 tun_port)
{
	struct ip_vs_dest *dest;
	unsigned int hash;

	/* Check for "full" addressed entries */
	hash = ip_vs_rs_hashkey(af, daddr, tun_port);

	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
		if (dest->tun_port == tun_port &&
		    dest->af == af &&
		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) {
			/* HIT */
			return dest;
		}
	}

	return NULL;
}

/* Lookup destination by {addr,port} in the given service
 * Called under RCU lock.
 */
static struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
		  const union nf_inet_addr *daddr, __be16 dport)
{
	struct ip_vs_dest *dest;

	/*
	 * Find the destination for the given service
	 */
	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
		if ((dest->af == dest_af) &&
		    ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
		    (dest->port == dport)) {
			/* HIT */
			return dest;
		}
	}

	return NULL;
}
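
/* ip_vs_find_dest() below first probes with port (forced to 0 for
 * non-NAT fwmark services) and, on a miss, retries with port ^ dport.
 * Since port is either dport or 0, the XOR flips to the other value,
 * so the second lookup tries the alternate port interpretation without
 * extra branches.
 */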

/*
 * Find destination by {daddr,dport,vaddr,protocol}
 * Created to be used in ip_vs_process_message() in
 * the backup synchronization daemon. It finds the
 * destination to be bound to the received connection
 * on the backup.
 * Called under RCU lock, no refcnt is returned.
 */
struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af,
				   const union nf_inet_addr *daddr,
				   __be16 dport,
				   const union nf_inet_addr *vaddr,
				   __be16 vport, __u16 protocol, __u32 fwmark,
				   __u32 flags)
{
	struct ip_vs_dest *dest;
	struct ip_vs_service *svc;
	__be16 port = dport;

	svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport);
	if (!svc)
		return NULL;
	if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
		port = 0;
	dest = ip_vs_lookup_dest(svc, dest_af, daddr, port);
	if (!dest)
		dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport);
	return dest;
}

void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
{
	struct ip_vs_dest_dst *dest_dst = container_of(head,
						       struct ip_vs_dest_dst,
						       rcu_head);

	dst_release(dest_dst->dst_cache);
	kfree(dest_dst);
}

/* Release dest_dst and dst_cache for dest in user context */
static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
{
	struct ip_vs_dest_dst *old;

	old = rcu_dereference_protected(dest->dest_dst, 1);
	if (old) {
		RCU_INIT_POINTER(dest->dest_dst, NULL);
		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
	}
}

/*
 *	Lookup dest by {svc,addr,port} in the destination trash.
 *	The destination trash is used to hold the destinations that are removed
 *	from the service table but are still referenced by some conn entries.
 *	The reason for the destination trash is that when a dest is
 *	temporarily down (taken down either by the administrator or by a
 *	monitor program), it can be picked back from the trash: the remaining
 *	connections to the dest can continue, and the counting information of
 *	the dest is also useful for scheduling.
 */
static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
		     const union nf_inet_addr *daddr, __be16 dport)
{
	struct ip_vs_dest *dest;
	struct netns_ipvs *ipvs = svc->ipvs;

	/*
	 * Find the destination in trash
	 */
	spin_lock_bh(&ipvs->dest_trash_lock);
	list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
			      "dest->refcnt=%d\n",
			      dest->vfwmark,
			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
			      ntohs(dest->port),
			      refcount_read(&dest->refcnt));
		if (dest->af == dest_af &&
		    ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
		    dest->port == dport &&
		    dest->vfwmark == svc->fwmark &&
		    dest->protocol == svc->protocol &&
		    (svc->fwmark ||
		     (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
		      dest->vport == svc->port))) {
			/* HIT */
			list_del(&dest->t_list);
			goto out;
		}
	}

	dest = NULL;

out:
	spin_unlock_bh(&ipvs->dest_trash_lock);

	return dest;
}

/* Put destination in trash */
static void ip_vs_trash_put_dest(struct netns_ipvs *ipvs,
				 struct ip_vs_dest *dest, unsigned long istart,
				 bool cleanup)
{
	spin_lock_bh(&ipvs->dest_trash_lock);
	IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
		      refcount_read(&dest->refcnt));
	if (list_empty(&ipvs->dest_trash) && !cleanup)
		mod_timer(&ipvs->dest_trash_timer,
			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
	/* dest lives in trash with reference */
	list_add(&dest->t_list, &ipvs->dest_trash);
	dest->idle_start = istart;
	spin_unlock_bh(&ipvs->dest_trash_lock);
}

static void ip_vs_dest_rcu_free(struct rcu_head *head)
{
	struct ip_vs_dest *dest;

	dest = container_of(head, struct ip_vs_dest, rcu_head);
	ip_vs_stats_release(&dest->stats);
	ip_vs_dest_put_and_free(dest);
}

static void ip_vs_dest_free(struct ip_vs_dest *dest)
{
	struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);

	__ip_vs_svc_put(svc);
	call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free);
}

/*
 *	Clean up all the destinations in the trash
 *	Called by ip_vs_control_cleanup()
 *
 *	When ip_vs_control_cleanup is activated by ipvs module exit,
 *	the service tables must have been flushed and all the connections
 *	are expired, and the refcnt of each destination in the trash must
 *	be 1, so we simply release them here.
 */
static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
{
	struct ip_vs_dest *dest, *nxt;

	timer_delete_sync(&ipvs->dest_trash_timer);
	/* No need to use dest_trash_lock */
	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
		list_del(&dest->t_list);
		ip_vs_dest_free(dest);
	}
}

static void ip_vs_stats_rcu_free(struct rcu_head *head)
{
	struct ip_vs_stats_rcu *rs = container_of(head,
						  struct ip_vs_stats_rcu,
						  rcu_head);

	ip_vs_stats_release(&rs->s);
	kfree(rs);
}

static void
ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
{
#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c

	spin_lock(&src->lock);

	IP_VS_SHOW_STATS_COUNTER(conns);
	IP_VS_SHOW_STATS_COUNTER(inpkts);
	IP_VS_SHOW_STATS_COUNTER(outpkts);
	IP_VS_SHOW_STATS_COUNTER(inbytes);
	IP_VS_SHOW_STATS_COUNTER(outbytes);

	ip_vs_read_estimator(dst, src);

	spin_unlock(&src->lock);
}

static void
ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
{
	dst->conns = (u32)src->conns;
	dst->inpkts = (u32)src->inpkts;
	dst->outpkts = (u32)src->outpkts;
	dst->inbytes = src->inbytes;
	dst->outbytes = src->outbytes;
	dst->cps = (u32)src->cps;
	dst->inpps = (u32)src->inpps;
	dst->outpps = (u32)src->outpps;
	dst->inbps = (u32)src->inbps;
	dst->outbps = (u32)src->outbps;
}

static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
	spin_lock(&stats->lock);

	/* get current counters as zero point, rates are zeroed */

#define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c

	IP_VS_ZERO_STATS_COUNTER(conns);
	IP_VS_ZERO_STATS_COUNTER(inpkts);
	IP_VS_ZERO_STATS_COUNTER(outpkts);
	IP_VS_ZERO_STATS_COUNTER(inbytes);
	IP_VS_ZERO_STATS_COUNTER(outbytes);

	ip_vs_zero_estimator(stats);

	spin_unlock(&stats->lock);
}

/* Allocate fields after kzalloc */
int ip_vs_stats_init_alloc(struct ip_vs_stats *s)
{
	int i;

	spin_lock_init(&s->lock);
	s->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
	if (!s->cpustats)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i);

		u64_stats_init(&cs->syncp);
	}
	return 0;
}

struct ip_vs_stats *ip_vs_stats_alloc(void)
{
	struct ip_vs_stats *s = kzalloc_obj(*s);

	if (s && ip_vs_stats_init_alloc(s) >= 0)
		return s;
	kfree(s);
	return NULL;
}

void ip_vs_stats_release(struct ip_vs_stats *stats)
{
	free_percpu(stats->cpustats);
}

void ip_vs_stats_free(struct ip_vs_stats *stats)
{
	if (stats) {
		ip_vs_stats_release(stats);
		kfree(stats);
	}
}
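
/* "Zeroing" stats is non-destructive: ip_vs_zero_stats() only
 * snapshots the running counters into kstats0, and ip_vs_copy_stats()
 * reports kstats - kstats0, so the raw counters keep counting and a
 * later zero simply moves the baseline forward.
 */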

/*
 *	Update a destination in the given service
 */
static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
		    struct ip_vs_dest_user_kern *udest, int add)
{
	struct netns_ipvs *ipvs = svc->ipvs;
	struct ip_vs_service *old_svc;
	struct ip_vs_scheduler *sched;
	int conn_flags;

	/* We cannot modify an address and change the address family */
	BUG_ON(!add && udest->af != dest->af);

	if (add && udest->af != svc->af)
		ipvs->mixed_address_family_dests++;

	/* keep the last_weight with latest non-0 weight */
	if (add || udest->weight != 0)
		atomic_set(&dest->last_weight, udest->weight);

	/* set the weight and the flags */
	atomic_set(&dest->weight, udest->weight);
	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
	conn_flags |= IP_VS_CONN_F_INACTIVE;

	/* Need to rehash? */
	if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) !=
	    IP_VS_DFWD_METHOD(dest) ||
	    udest->tun_type != dest->tun_type ||
	    udest->tun_port != dest->tun_port)
		ip_vs_rs_unhash(dest);

	/* set the tunnel info */
	dest->tun_type = udest->tun_type;
	dest->tun_port = udest->tun_port;
	dest->tun_flags = udest->tun_flags;

	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
	} else {
		/* FTP-NAT requires conntrack for mangling */
		if (svc->port == FTPPORT)
			ip_vs_register_conntrack(svc);
	}
	atomic_set(&dest->conn_flags, conn_flags);
	/* Put the real service in rs_table if not present. */
	ip_vs_rs_hash(ipvs, dest);

	/* bind the service */
	old_svc = rcu_dereference_protected(dest->svc, 1);
	if (!old_svc) {
		__ip_vs_bind_svc(dest, svc);
	} else {
		if (old_svc != svc) {
			ip_vs_zero_stats(&dest->stats);
			__ip_vs_bind_svc(dest, svc);
			__ip_vs_svc_put(old_svc);
		}
	}

	/* set the dest status flags */
	dest->flags |= IP_VS_DEST_F_AVAILABLE;

	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	dest->u_threshold = udest->u_threshold;
	dest->l_threshold = udest->l_threshold;

	dest->af = udest->af;

	if (add) {
		list_add_rcu(&dest->n_list, &svc->destinations);
		svc->num_dests++;
		sched = rcu_dereference_protected(svc->scheduler, 1);
		if (sched && sched->add_dest)
			sched->add_dest(svc, dest);
	} else {
		spin_lock_bh(&dest->dst_lock);
		__ip_vs_dst_cache_reset(dest);
		spin_unlock_bh(&dest->dst_lock);

		sched = rcu_dereference_protected(svc->scheduler, 1);
		if (sched && sched->upd_dest)
			sched->upd_dest(svc, dest);
	}
}
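
/* A forwarding-method, tun_type or tun_port change above unhashes the
 * dest from rs_table first; ip_vs_rs_hash() then re-adds it under the
 * key matching the new method (dest->port for NAT, tun_port for GUE,
 * 0 for IPIP/GRE), keeping real-service lookups in sync.
 */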

/*
 *	Create a destination for the given service
 */
static int
ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
	struct ip_vs_dest *dest;
	unsigned int atype;
	int ret;

#ifdef CONFIG_IP_VS_IPV6
	if (udest->af == AF_INET6) {
		atype = ipv6_addr_type(&udest->addr.in6);
		if ((!(atype & IPV6_ADDR_UNICAST) ||
		     atype & IPV6_ADDR_LINKLOCAL) &&
		    !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6))
			return -EINVAL;

		ret = nf_defrag_ipv6_enable(svc->ipvs->net);
		if (ret)
			return ret;
	} else
#endif
	{
		atype = inet_addr_type(svc->ipvs->net, udest->addr.ip);
		if (atype != RTN_LOCAL && atype != RTN_UNICAST)
			return -EINVAL;
	}

	dest = kzalloc_obj(struct ip_vs_dest);
	if (dest == NULL)
		return -ENOMEM;

	ret = ip_vs_stats_init_alloc(&dest->stats);
	if (ret < 0)
		goto err_alloc;

	ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
	if (ret < 0)
		goto err_stats;

	dest->af = udest->af;
	dest->protocol = svc->protocol;
	dest->vaddr = svc->addr;
	dest->vport = svc->port;
	dest->vfwmark = svc->fwmark;
	ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr);
	dest->port = udest->port;

	atomic_set(&dest->activeconns, 0);
	atomic_set(&dest->inactconns, 0);
	atomic_set(&dest->persistconns, 0);
	refcount_set(&dest->refcnt, 1);

	INIT_HLIST_NODE(&dest->d_list);
	spin_lock_init(&dest->dst_lock);
	__ip_vs_update_dest(svc, dest, udest, 1);

	return 0;

err_stats:
	ip_vs_stats_release(&dest->stats);

err_alloc:
	kfree(dest);
	return ret;
}


/*
 *	Add a destination into an existing service
 */
static int
ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
	struct ip_vs_dest *dest;
	union nf_inet_addr daddr;
	__be16 dport = udest->port;
	int ret;

	if (udest->weight < 0) {
		pr_err("%s(): server weight less than zero\n", __func__);
		return -ERANGE;
	}

	if (udest->l_threshold > udest->u_threshold) {
		pr_err("%s(): lower threshold is higher than upper threshold\n",
		       __func__);
		return -ERANGE;
	}

	if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if (udest->tun_port == 0) {
			pr_err("%s(): tunnel port is zero\n", __func__);
			return -EINVAL;
		}
	}

	ip_vs_addr_copy(udest->af, &daddr, &udest->addr);

	/* We use function that requires RCU lock */
	rcu_read_lock();
	dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
	rcu_read_unlock();

	if (dest != NULL) {
		IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
		return -EEXIST;
	}

	/*
	 * Check if the dest already exists in the trash and
	 * is from the same service
	 */
	dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);

	if (dest != NULL) {
		IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
			      "dest->refcnt=%d, service %u/%s:%u\n",
			      IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
			      refcount_read(&dest->refcnt),
			      dest->vfwmark,
			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
			      ntohs(dest->vport));

		ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
		/* On error put back dest into the trash */
		if (ret < 0)
			ip_vs_trash_put_dest(svc->ipvs, dest, dest->idle_start,
					     false);
		else
			__ip_vs_update_dest(svc, dest, udest, 1);
	} else {
		/*
		 * Allocate and initialize the dest structure
		 */
		ret = ip_vs_new_dest(svc, udest);
	}

	return ret;
}
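
/* ip_vs_edit_dest() below shares the same user-input validation as
 * ip_vs_add_dest() above (non-negative weight, l_threshold not above
 * u_threshold, non-zero GUE tunnel port), but requires the dest to
 * already exist and updates it in place.
 */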

/*
 *	Edit a destination in the given service
 */
static int
ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
	struct ip_vs_dest *dest;
	union nf_inet_addr daddr;
	__be16 dport = udest->port;

	if (udest->weight < 0) {
		pr_err("%s(): server weight less than zero\n", __func__);
		return -ERANGE;
	}

	if (udest->l_threshold > udest->u_threshold) {
		pr_err("%s(): lower threshold is higher than upper threshold\n",
		       __func__);
		return -ERANGE;
	}

	if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if (udest->tun_port == 0) {
			pr_err("%s(): tunnel port is zero\n", __func__);
			return -EINVAL;
		}
	}

	ip_vs_addr_copy(udest->af, &daddr, &udest->addr);

	/* We use function that requires RCU lock */
	rcu_read_lock();
	dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
	rcu_read_unlock();

	if (dest == NULL) {
		IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
		return -ENOENT;
	}

	__ip_vs_update_dest(svc, dest, udest, 0);

	return 0;
}

/*
 *	Delete a destination (must be already unlinked from the service)
 */
static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
			     bool cleanup)
{
	ip_vs_stop_estimator(ipvs, &dest->stats);

	/*
	 *	Remove it from the d-linked list with the real services.
	 */
	ip_vs_rs_unhash(dest);

	ip_vs_trash_put_dest(ipvs, dest, 0, cleanup);

	/* Queue up delayed work to expire all no destination connections.
	 * No-op when CONFIG_SYSCTL is disabled.
	 */
	if (!cleanup)
		ip_vs_enqueue_expire_nodest_conns(ipvs);
}


/*
 *	Unlink a destination from the given service
 */
static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
				struct ip_vs_dest *dest,
				int svcupd)
{
	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;

	spin_lock_bh(&dest->dst_lock);
	__ip_vs_dst_cache_reset(dest);
	spin_unlock_bh(&dest->dst_lock);

	/*
	 *	Remove it from the d-linked destination list.
	 */
	list_del_rcu(&dest->n_list);
	svc->num_dests--;

	if (dest->af != svc->af)
		svc->ipvs->mixed_address_family_dests--;

	if (svcupd) {
		struct ip_vs_scheduler *sched;

		sched = rcu_dereference_protected(svc->scheduler, 1);
		if (sched && sched->del_dest)
			sched->del_dest(svc, dest);
	}
}


/*
 *	Delete a destination server in the given service
 */
static int
ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
	struct ip_vs_dest *dest;
	__be16 dport = udest->port;

	/* We use function that requires RCU lock */
	rcu_read_lock();
	dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
	rcu_read_unlock();

	if (dest == NULL) {
		IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
		return -ENOENT;
	}

	/*
	 *	Unlink dest from the service
	 */
	__ip_vs_unlink_dest(svc, dest, 1);

	/*
	 *	Delete the destination
	 */
	__ip_vs_del_dest(svc->ipvs, dest, false);

	return 0;
}

static void ip_vs_dest_trash_expire(struct timer_list *t)
{
	struct netns_ipvs *ipvs = timer_container_of(ipvs, t,
						     dest_trash_timer);
	struct ip_vs_dest *dest, *next;
	unsigned long now = jiffies;

	spin_lock(&ipvs->dest_trash_lock);
	list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
		if (refcount_read(&dest->refcnt) > 1)
			continue;
		if (dest->idle_start) {
			if (time_before(now, dest->idle_start +
					     IP_VS_DEST_TRASH_PERIOD))
				continue;
		} else {
			dest->idle_start = max(1UL, now);
			continue;
		}
		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
			      dest->vfwmark,
			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
			      ntohs(dest->port));
		list_del(&dest->t_list);
		ip_vs_dest_free(dest);
	}
	if (!list_empty(&ipvs->dest_trash))
		mod_timer(&ipvs->dest_trash_timer,
			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
	spin_unlock(&ipvs->dest_trash_lock);
}
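
/* Trash expiry above is two-phase: the first timer pass only stamps
 * idle_start on an unreferenced dest, and a later pass (at least
 * IP_VS_DEST_TRASH_PERIOD afterwards) actually frees it, so every dest
 * idles for a full period before removal.  The timer reschedules at
 * half the period while the trash is non-empty.
 */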

/*
 *	Add a service into the service hash table
 */
static int
ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
		  struct ip_vs_service **svc_p)
{
	struct ip_vs_scheduler *sched = NULL;
	struct ip_vs_rht *tc_new = NULL;
	struct ip_vs_rht *t, *t_new = NULL;
	int af_id = ip_vs_af_index(u->af);
	struct ip_vs_service *svc = NULL;
	struct ip_vs_pe *pe = NULL;
	int ret_hooks = -1;
	int ret = 0;

	/* increase the module use count */
	if (!ip_vs_use_count_inc())
		return -ENOPROTOOPT;

	/* Lookup the scheduler by 'u->sched_name' */
	if (strcmp(u->sched_name, "none")) {
		sched = ip_vs_scheduler_get(u->sched_name);
		if (!sched) {
			pr_info("Scheduler module ip_vs_%s not found\n",
				u->sched_name);
			ret = -ENOENT;
			goto out_err;
		}
	}

	if (u->pe_name && *u->pe_name) {
		pe = ip_vs_pe_getbyname(u->pe_name);
		if (pe == NULL) {
			pr_info("persistence engine module ip_vs_pe_%s "
				"not found\n", u->pe_name);
			ret = -ENOENT;
			goto out_err;
		}
	}

#ifdef CONFIG_IP_VS_IPV6
	if (u->af == AF_INET6) {
		__u32 plen = (__force __u32) u->netmask;

		if (plen < 1 || plen > 128) {
			ret = -EINVAL;
			goto out_err;
		}

		ret = nf_defrag_ipv6_enable(ipvs->net);
		if (ret)
			goto out_err;
	}
#endif

	t = rcu_dereference_protected(ipvs->svc_table, 1);
	if (!t) {
		int lfactor = sysctl_svc_lfactor(ipvs);
		int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor);

		t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
		if (!t_new) {
			ret = -ENOMEM;
			goto out_err;
		}
	}

	if (!rcu_dereference_protected(ipvs->conn_tab, 1)) {
		int lfactor = sysctl_conn_lfactor(ipvs);
		int new_size = ip_vs_conn_desired_size(ipvs, NULL, lfactor);

		tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor);
		if (!tc_new) {
			ret = -ENOMEM;
			goto out_err;
		}
	}

	if (!atomic_read(&ipvs->num_services[af_id])) {
		ret = ip_vs_register_hooks(ipvs, u->af);
		if (ret < 0)
			goto out_err;
		ret_hooks = ret;
	}

	svc = kzalloc_obj(struct ip_vs_service);
	if (svc == NULL) {
		IP_VS_DBG(1, "%s(): no memory\n", __func__);
		ret = -ENOMEM;
		goto out_err;
	}
	ret = ip_vs_stats_init_alloc(&svc->stats);
	if (ret < 0)
		goto out_err;

	/* I'm the first user of the service */
	atomic_set(&svc->refcnt, 0);

	svc->af = u->af;
	svc->protocol = u->protocol;
	ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
	svc->port = u->port;
	svc->fwmark = u->fwmark;
	svc->flags = u->flags & ~IP_VS_SVC_F_HASHED;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;
	svc->ipvs = ipvs;

	INIT_LIST_HEAD(&svc->destinations);
	spin_lock_init(&svc->sched_lock);

	/* Bind the scheduler */
	if (sched) {
		ret = ip_vs_bind_scheduler(svc, sched);
		if (ret)
			goto out_err;
	}

	ret = ip_vs_start_estimator(ipvs, &svc->stats);
	if (ret < 0)
		goto out_err;

	if (t_new) {
		clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags);
		rcu_assign_pointer(ipvs->svc_table, t_new);
		t_new = NULL;
	}
	if (tc_new) {
		rcu_assign_pointer(ipvs->conn_tab, tc_new);
		tc_new = NULL;
	}

	/* Update the virtual service counters */
	if (svc->port == FTPPORT)
		atomic_inc(&ipvs->ftpsvc_counter[af_id]);
	else if (!svc->port && !svc->fwmark)
		atomic_inc(&ipvs->nullsvc_counter[af_id]);
	if (pe && pe->conn_out)
		atomic_inc(&ipvs->conn_out_counter[af_id]);

	/* Bind the ct retriever */
	RCU_INIT_POINTER(svc->pe, pe);
	pe = NULL;

	if (svc->fwmark)
		atomic_inc(&ipvs->fwm_services[af_id]);
	else
		atomic_inc(&ipvs->nonfwm_services[af_id]);
	atomic_inc(&ipvs->num_services[af_id]);

	/* Hash the service into the service table */
	ip_vs_svc_hash(svc);

	/* Schedule resize work */
	if (t && ip_vs_get_num_services(ipvs) > t->u_thresh &&
	    !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags))
		queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work,
				   1);

	*svc_p = svc;

	if (!READ_ONCE(ipvs->enable)) {
		mutex_lock(&ipvs->est_mutex);

		/* Now there is a service - full throttle */
		WRITE_ONCE(ipvs->enable, 1);

		ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);

		/* Start estimation for first time */
		ip_vs_est_reload_start(ipvs, true);
		mutex_unlock(&ipvs->est_mutex);
	}

	return 0;


out_err:
	if (tc_new)
		ip_vs_rht_free(tc_new);
	if (t_new)
		ip_vs_rht_free(t_new);
	if (ret_hooks >= 0)
		ip_vs_unregister_hooks(ipvs, u->af);
	if (svc != NULL) {
		ip_vs_unbind_scheduler(svc, sched);
		ip_vs_service_free(svc);
	}
	ip_vs_scheduler_put(sched);
	ip_vs_pe_put(pe);

	/* decrease the module use count */
	ip_vs_use_count_dec();

	return ret;
}
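
/* The out_err path above unwinds in roughly reverse order of setup:
 * tables allocated but never published are freed, hooks registered for
 * the first service of a family are unregistered, and the scheduler
 * and pe references are dropped last, so a failed add leaves no trace.
 */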

/*
 *	Edit a service and bind it with a new scheduler
 */
static int
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
{
	struct ip_vs_scheduler *sched = NULL, *old_sched;
	struct ip_vs_pe *pe = NULL, *old_pe = NULL;
	int ret = 0;
	bool new_pe_conn_out, old_pe_conn_out;
	struct netns_ipvs *ipvs = svc->ipvs;
	int af_id = ip_vs_af_index(svc->af);

	/*
	 * Lookup the scheduler, by 'u->sched_name'
	 */
	if (strcmp(u->sched_name, "none")) {
		sched = ip_vs_scheduler_get(u->sched_name);
		if (!sched) {
			pr_info("Scheduler module ip_vs_%s not found\n",
				u->sched_name);
			return -ENOENT;
		}
	}
	old_sched = sched;

	if (u->pe_name && *u->pe_name) {
		pe = ip_vs_pe_getbyname(u->pe_name);
		if (pe == NULL) {
			pr_info("persistence engine module ip_vs_pe_%s "
				"not found\n", u->pe_name);
			ret = -ENOENT;
			goto out;
		}
		old_pe = pe;
	}

#ifdef CONFIG_IP_VS_IPV6
	if (u->af == AF_INET6) {
		__u32 plen = (__force __u32) u->netmask;

		if (plen < 1 || plen > 128) {
			ret = -EINVAL;
			goto out;
		}
	}
#endif

	old_sched = rcu_dereference_protected(svc->scheduler, 1);
	if (sched != old_sched) {
		if (old_sched) {
			ip_vs_unbind_scheduler(svc, old_sched);
			RCU_INIT_POINTER(svc->scheduler, NULL);
			/* Wait all svc->sched_data users */
			synchronize_rcu();
		}
		/* Bind the new scheduler */
		if (sched) {
			ret = ip_vs_bind_scheduler(svc, sched);
			if (ret) {
				ip_vs_scheduler_put(sched);
				goto out;
			}
		}
	}

	/*
	 * Set the flags and timeout value
	 */
	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;

	old_pe = rcu_dereference_protected(svc->pe, 1);
	if (pe != old_pe) {
		rcu_assign_pointer(svc->pe, pe);
		/* check for optional methods in new pe */
		new_pe_conn_out = (pe && pe->conn_out) ? true : false;
		old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false;
		if (new_pe_conn_out && !old_pe_conn_out)
			atomic_inc(&ipvs->conn_out_counter[af_id]);
		if (old_pe_conn_out && !new_pe_conn_out)
			atomic_dec(&ipvs->conn_out_counter[af_id]);
	}

out:
	ip_vs_scheduler_put(old_sched);
	ip_vs_pe_put(old_pe);
	return ret;
}
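
/* Swapping schedulers above clears svc->scheduler and waits a full RCU
 * grace period before binding the new one, so packet-path users of
 * svc->sched_data never see a half-switched scheduler.
 */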

/*
 *	Delete a service from the service list
 *	- The service must be unlinked, unlocked and not referenced!
 *	- We are called under _bh lock
 */
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
{
	struct ip_vs_dest *dest, *nxt;
	struct ip_vs_scheduler *old_sched;
	struct ip_vs_pe *old_pe;
	struct netns_ipvs *ipvs = svc->ipvs;
	int af_id = ip_vs_af_index(svc->af);

	atomic_dec(&ipvs->num_services[af_id]);
	if (!atomic_read(&ipvs->num_services[af_id]))
		ip_vs_unregister_hooks(ipvs, svc->af);
	if (svc->fwmark)
		atomic_dec(&ipvs->fwm_services[af_id]);
	else
		atomic_dec(&ipvs->nonfwm_services[af_id]);

	ip_vs_stop_estimator(svc->ipvs, &svc->stats);

	/* Unbind scheduler */
	old_sched = rcu_dereference_protected(svc->scheduler, 1);
	ip_vs_unbind_scheduler(svc, old_sched);
	ip_vs_scheduler_put(old_sched);

	/* Unbind persistence engine, keep svc->pe */
	old_pe = rcu_dereference_protected(svc->pe, 1);
	if (old_pe && old_pe->conn_out)
		atomic_dec(&ipvs->conn_out_counter[af_id]);
	ip_vs_pe_put(old_pe);

	/*
	 *	Unlink the whole destination list
	 */
	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
		__ip_vs_unlink_dest(svc, dest, 0);
		__ip_vs_del_dest(svc->ipvs, dest, cleanup);
	}

	/*
	 *	Update the virtual service counters
	 */
	if (svc->port == FTPPORT)
		atomic_dec(&ipvs->ftpsvc_counter[af_id]);
	else if (!svc->port && !svc->fwmark)
		atomic_dec(&ipvs->nullsvc_counter[af_id]);

	/*
	 *	Free the service if nobody refers to it
	 */
	__ip_vs_svc_put(svc);

	/* decrease the module use count */
	ip_vs_use_count_dec();
}

/*
 *	Unlink a service from list and try to delete it if its refcnt reached 0
 */
static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
{
	ip_vs_unregister_conntrack(svc);
	/* Hold svc to avoid double release from dest_trash */
	atomic_inc(&svc->refcnt);
	/*
	 * Unhash it from the service table
	 */
	ip_vs_svc_unhash(svc);

	__ip_vs_del_service(svc, cleanup);
}

/*
 *	Delete a service from the service list
 */
static int ip_vs_del_service(struct ip_vs_service *svc)
{
	struct netns_ipvs *ipvs;
	struct ip_vs_rht *t, *p;
	int ns;

	if (svc == NULL)
		return -EEXIST;
	ipvs = svc->ipvs;
	ip_vs_unlink_service(svc, false);
	t = rcu_dereference_protected(ipvs->svc_table, 1);

	/* Drop the table if no more services */
	ns = ip_vs_get_num_services(ipvs);
	if (!ns) {
		/* Stop the resizer and drop the tables */
		set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags);
		cancel_delayed_work_sync(&ipvs->svc_resize_work);
		if (t) {
			rcu_assign_pointer(ipvs->svc_table, NULL);
			/* Inform readers that table is removed */
			smp_mb__before_atomic();
			atomic_inc(&ipvs->svc_table_changes);
			while (1) {
				p = rcu_dereference_protected(t->new_tbl, 1);
				call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
				if (p == t)
					break;
				t = p;
			}
		}
	} else if (ns <= t->l_thresh &&
		   !test_and_set_bit(IP_VS_WORK_SVC_RESIZE,
				     &ipvs->work_flags)) {
		queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work,
				   1);
	}
	return 0;
}
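
/* The while (1) teardown loop above walks the t->new_tbl chain so that
 * a table caught mid-resize (old table still chained to its successor)
 * is freed together with that successor; the chain ends where new_tbl
 * points back to the table itself.
 */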
2072 call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); 2073 if (p == t) 2074 break; 2075 t = p; 2076 } 2077 } 2078 } else if (ns <= t->l_thresh && 2079 !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, 2080 &ipvs->work_flags)) { 2081 queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 2082 1); 2083 } 2084 return 0; 2085 } 2086 2087 2088 /* 2089 * Flush all the virtual services 2090 */ 2091 static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) 2092 { 2093 DECLARE_IP_VS_RHT_WALK_BUCKETS(); 2094 struct hlist_bl_head *head; 2095 struct ip_vs_service *svc; 2096 struct hlist_bl_node *ne; 2097 struct hlist_bl_node *e; 2098 struct ip_vs_rht *t, *p; 2099 2100 /* Stop the resizer and drop the tables */ 2101 if (!test_and_set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) 2102 cancel_delayed_work_sync(&ipvs->svc_resize_work); 2103 /* No resizer, so now we have exclusive write access */ 2104 2105 if (ip_vs_get_num_services(ipvs)) { 2106 ip_vs_rht_walk_buckets(ipvs->svc_table, head) { 2107 hlist_bl_for_each_entry_safe(svc, e, ne, head, s_list) 2108 ip_vs_unlink_service(svc, cleanup); 2109 } 2110 } 2111 2112 /* Unregister the hash table and release it after RCU grace period */ 2113 t = rcu_dereference_protected(ipvs->svc_table, 1); 2114 if (t) { 2115 rcu_assign_pointer(ipvs->svc_table, NULL); 2116 /* Inform readers that table is removed */ 2117 smp_mb__before_atomic(); 2118 atomic_inc(&ipvs->svc_table_changes); 2119 while (1) { 2120 p = rcu_dereference_protected(t->new_tbl, 1); 2121 call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); 2122 if (p == t) 2123 break; 2124 t = p; 2125 } 2126 } 2127 /* Stop the tot_stats estimator early under service_mutex 2128 * to avoid locking it again later. 2129 */ 2130 if (cleanup) 2131 ip_vs_stop_estimator_tot_stats(ipvs); 2132 return 0; 2133 } 2134 2135 /* 2136 * Delete service by {netns} in the service table. 
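 * Flushes all services of every netns on the exit list.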
2137 * Called by __ip_vs_batch_cleanup() 2138 */ 2139 void ip_vs_service_nets_cleanup(struct list_head *net_list) 2140 { 2141 struct netns_ipvs *ipvs; 2142 struct net *net; 2143 2144 /* Check for "full" addressed entries */ 2145 list_for_each_entry(net, net_list, exit_list) { 2146 ipvs = net_ipvs(net); 2147 mutex_lock(&ipvs->service_mutex); 2148 ip_vs_flush(ipvs, true); 2149 mutex_unlock(&ipvs->service_mutex); 2150 } 2151 } 2152 2153 /* Put all references for device (dst_cache) */ 2154 static inline void 2155 ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) 2156 { 2157 struct ip_vs_dest_dst *dest_dst; 2158 2159 spin_lock_bh(&dest->dst_lock); 2160 dest_dst = rcu_dereference_protected(dest->dest_dst, 1); 2161 if (dest_dst && dest_dst->dst_cache->dev == dev) { 2162 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", 2163 dev->name, 2164 IP_VS_DBG_ADDR(dest->af, &dest->addr), 2165 ntohs(dest->port), 2166 refcount_read(&dest->refcnt)); 2167 __ip_vs_dst_cache_reset(dest); 2168 } 2169 spin_unlock_bh(&dest->dst_lock); 2170 2171 } 2172 /* Netdev event receiver 2173 * Currently only NETDEV_DOWN is handled to release refs to cached dsts 2174 */ 2175 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, 2176 void *ptr) 2177 { 2178 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 2179 struct net *net = dev_net(dev); 2180 struct netns_ipvs *ipvs = net_ipvs(net); 2181 DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); 2182 unsigned int resched_score = 0; 2183 struct hlist_bl_head *head; 2184 struct ip_vs_service *svc; 2185 struct hlist_bl_node *e; 2186 struct ip_vs_dest *dest; 2187 int old_gen, new_gen; 2188 2189 if (event != NETDEV_DOWN || !ipvs) 2190 return NOTIFY_DONE; 2191 IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); 2192 2193 old_gen = atomic_read(&ipvs->svc_table_changes); 2194 2195 rcu_read_lock(); 2196 2197 repeat: 2198 smp_rmb(); /* ipvs->svc_table and svc_table_changes */ 2199 ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { 2200 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 2201 list_for_each_entry_rcu(dest, &svc->destinations, 2202 n_list) { 2203 ip_vs_forget_dev(dest, dev); 2204 resched_score += 10; 2205 } 2206 resched_score++; 2207 } 2208 resched_score++; 2209 if (resched_score >= 100) { 2210 resched_score = 0; 2211 cond_resched_rcu(); 2212 new_gen = atomic_read(&ipvs->svc_table_changes); 2213 /* New table installed ? 
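 * Restart the walk in that case so no destination is missed.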
*/ 2214 if (old_gen != new_gen) { 2215 old_gen = new_gen; 2216 goto repeat; 2217 } 2218 } 2219 } 2220 rcu_read_unlock(); 2221 2222 return NOTIFY_DONE; 2223 } 2224 2225 /* 2226 * Zero counters in a service or all services 2227 */ 2228 static int ip_vs_zero_service(struct ip_vs_service *svc) 2229 { 2230 struct ip_vs_dest *dest; 2231 2232 list_for_each_entry(dest, &svc->destinations, n_list) { 2233 ip_vs_zero_stats(&dest->stats); 2234 } 2235 ip_vs_zero_stats(&svc->stats); 2236 return 0; 2237 } 2238 2239 static int ip_vs_zero_all(struct netns_ipvs *ipvs) 2240 { 2241 DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); 2242 unsigned int resched_score = 0; 2243 struct hlist_bl_head *head; 2244 struct ip_vs_service *svc; 2245 struct hlist_bl_node *e; 2246 2247 rcu_read_lock(); 2248 2249 ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { 2250 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 2251 ip_vs_zero_service(svc); 2252 resched_score += 10; 2253 } 2254 resched_score++; 2255 if (resched_score >= 100) { 2256 resched_score = 0; 2257 cond_resched_rcu(); 2258 } 2259 } 2260 2261 rcu_read_unlock(); 2262 2263 ip_vs_zero_stats(&ipvs->tot_stats->s); 2264 return 0; 2265 } 2266 2267 #ifdef CONFIG_SYSCTL 2268 2269 static int 2270 proc_do_defense_mode(const struct ctl_table *table, int write, 2271 void *buffer, size_t *lenp, loff_t *ppos) 2272 { 2273 struct netns_ipvs *ipvs = table->extra2; 2274 int *valp = table->data; 2275 int val = *valp; 2276 int rc; 2277 2278 struct ctl_table tmp = { 2279 .data = &val, 2280 .maxlen = sizeof(int), 2281 .mode = table->mode, 2282 }; 2283 2284 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 2285 if (write && (*valp != val)) { 2286 if (val < 0 || val > 3) { 2287 rc = -EINVAL; 2288 } else { 2289 *valp = val; 2290 update_defense_level(ipvs); 2291 } 2292 } 2293 return rc; 2294 } 2295 2296 static int 2297 proc_do_sync_threshold(const struct ctl_table *table, int write, 2298 void *buffer, size_t *lenp, loff_t *ppos) 2299 { 2300 struct netns_ipvs *ipvs = table->extra2; 2301 int *valp = table->data; 2302 int val[2]; 2303 int rc; 2304 struct ctl_table tmp = { 2305 .data = &val, 2306 .maxlen = table->maxlen, 2307 .mode = table->mode, 2308 }; 2309 2310 mutex_lock(&ipvs->sync_mutex); 2311 memcpy(val, valp, sizeof(val)); 2312 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 2313 if (write) { 2314 if (val[0] < 0 || val[1] < 0 || 2315 (val[0] >= val[1] && val[1])) 2316 rc = -EINVAL; 2317 else 2318 memcpy(valp, val, sizeof(val)); 2319 } 2320 mutex_unlock(&ipvs->sync_mutex); 2321 return rc; 2322 } 2323 2324 static int 2325 proc_do_sync_ports(const struct ctl_table *table, int write, 2326 void *buffer, size_t *lenp, loff_t *ppos) 2327 { 2328 int *valp = table->data; 2329 int val = *valp; 2330 int rc; 2331 2332 struct ctl_table tmp = { 2333 .data = &val, 2334 .maxlen = sizeof(int), 2335 .mode = table->mode, 2336 }; 2337 2338 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 2339 if (write && (*valp != val)) { 2340 if (val < 1 || !is_power_of_2(val)) 2341 rc = -EINVAL; 2342 else 2343 *valp = val; 2344 } 2345 return rc; 2346 } 2347 2348 static int ipvs_proc_est_cpumask_set(const struct ctl_table *table, 2349 void *buffer) 2350 { 2351 struct netns_ipvs *ipvs = table->extra2; 2352 cpumask_var_t *valp = table->data; 2353 cpumask_var_t newmask; 2354 int ret; 2355 2356 if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) 2357 return -ENOMEM; 2358 2359 ret = cpulist_parse(buffer, newmask); 2360 if (ret) 2361 goto out; 2362 2363 mutex_lock(&ipvs->est_mutex); 2364 2365 if (!ipvs->est_cpulist_valid) { 2366 
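/* First write: allocate the per-netns cpumask lazily */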
if (!zalloc_cpumask_var(valp, GFP_KERNEL)) { 2367 ret = -ENOMEM; 2368 goto unlock; 2369 } 2370 ipvs->est_cpulist_valid = 1; 2371 }
2372 cpumask_and(newmask, newmask, &current->cpus_mask); 2373 cpumask_copy(*valp, newmask); 2374 /* est_max_threads may depend on cpulist size */
2375 ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); 2376 ipvs->est_calc_phase = 1; 2377 ip_vs_est_reload_start(ipvs, true); 2378
2379 unlock: 2380 mutex_unlock(&ipvs->est_mutex); 2381 2382 out: 2383 free_cpumask_var(newmask); 2384 return ret; 2385 } 2386
2387 static int ipvs_proc_est_cpumask_get(const struct ctl_table *table, 2388 void *buffer, size_t size) 2389 { 2390 struct netns_ipvs *ipvs = table->extra2;
2391 cpumask_var_t *valp = table->data; 2392 struct cpumask *mask; 2393 int ret; 2394 2395 mutex_lock(&ipvs->est_mutex); 2396
2397 /* HK_TYPE_KTHREAD cpumask needs RCU protection */ 2398 scoped_guard(rcu) { 2399 if (ipvs->est_cpulist_valid) 2400 mask = *valp; 2401 else
2402 mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); 2403 ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); 2404 } 2405
2406 mutex_unlock(&ipvs->est_mutex); 2407 2408 return ret; 2409 } 2410
2411 static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write, 2412 void *buffer, size_t *lenp, loff_t *ppos) 2413 { 2414 int ret; 2415
2416 /* Ignore both read and write(append) if *ppos not 0 */ 2417 if (*ppos || !*lenp) { 2418 *lenp = 0; 2419 return 0; 2420 } 2421 if (write) {
2422 /* proc_sys_call_handler() appends terminator */ 2423 ret = ipvs_proc_est_cpumask_set(table, buffer); 2424 if (ret >= 0) 2425 *ppos += *lenp; 2426 } else {
2427 /* proc_sys_call_handler() allocates 1 byte for terminator */ 2428 ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1); 2429 if (ret >= 0) { 2430 *lenp = ret;
2431 *ppos += *lenp; 2432 ret = 0; 2433 } 2434 } 2435 return ret; 2436 } 2437
2438 static int ipvs_proc_est_nice(const struct ctl_table *table, int write, 2439 void *buffer, size_t *lenp, loff_t *ppos) 2440 { 2441 struct netns_ipvs *ipvs = table->extra2;
2442 int *valp = table->data; 2443 int val = *valp; 2444 int ret; 2445 2446 struct ctl_table tmp_table = { 2447 .data = &val, 2448 .maxlen = sizeof(int), 2449 .mode = table->mode, 2450 }; 2451
2452 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2453 if (write && ret >= 0) { 2454 if (val < MIN_NICE || val > MAX_NICE) { 2455 ret = -EINVAL; 2456 } else {
2457 mutex_lock(&ipvs->est_mutex); 2458 if (*valp != val) { 2459 *valp = val; 2460 ip_vs_est_reload_start(ipvs, true); 2461 } 2462 mutex_unlock(&ipvs->est_mutex); 2463 } 2464 } 2465 return ret; 2466 } 2467
2468 static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, 2469 void *buffer, size_t *lenp, loff_t *ppos) 2470 { 2471 struct netns_ipvs *ipvs = table->extra2;
2472 int *valp = table->data; 2473 int val = *valp; 2474 int ret; 2475 2476 struct ctl_table tmp_table = { 2477 .data = &val, 2478 .maxlen = sizeof(int), 2479 .mode = table->mode, 2480 }; 2481
2482 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2483 if (write && ret >= 0) { 2484 mutex_lock(&ipvs->est_mutex); 2485 if (*valp != val) { 2486 *valp = val;
2487 ip_vs_est_reload_start(ipvs, true); 2488 } 2489 mutex_unlock(&ipvs->est_mutex); 2490 } 2491 return ret; 2492 } 2493
2494 static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write, 2495 void *buffer, size_t *lenp, loff_t *ppos) 2496 { 2497 struct netns_ipvs *ipvs = table->extra2;
2498 int *valp = table->data; 2499 int val = *valp; 2500 int
ret; 2501 2502 struct ctl_table tmp_table = { 2503 .data = &val, 2504 .maxlen = sizeof(int), 2505 }; 2506
2507 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2508 if (write && ret >= 0) { 2509 if (val < -8 || val > 8) { 2510 ret = -EINVAL; 2511 } else {
2512 WRITE_ONCE(*valp, val); 2513 if (rcu_access_pointer(ipvs->conn_tab)) 2514 mod_delayed_work(system_unbound_wq, 2515 &ipvs->conn_resize_work, 0); 2516 } 2517 } 2518 return ret; 2519 } 2520
2521 static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write, 2522 void *buffer, size_t *lenp, loff_t *ppos) 2523 { 2524 struct netns_ipvs *ipvs = table->extra2;
2525 int *valp = table->data; 2526 int val = *valp; 2527 int ret; 2528 2529 struct ctl_table tmp_table = { 2530 .data = &val, 2531 .maxlen = sizeof(int), 2532 }; 2533
2534 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2535 if (write && ret >= 0) { 2536 if (val < -8 || val > 8) { 2537 ret = -EINVAL; 2538 } else {
2539 mutex_lock(&ipvs->service_mutex); 2540 WRITE_ONCE(*valp, val); 2541 /* Make sure the services are present */ 2542 if (rcu_access_pointer(ipvs->svc_table) &&
2543 READ_ONCE(ipvs->enable) && 2544 !test_bit(IP_VS_WORK_SVC_NORESIZE, 2545 &ipvs->work_flags)) 2546 mod_delayed_work(system_unbound_wq, 2547 &ipvs->svc_resize_work, 0);
2548 mutex_unlock(&ipvs->service_mutex); 2549 } 2550 } 2551 return ret; 2552 } 2553
2554 /* 2555 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) 2556 * Do not change order or insert new entries without 2557 * aligning with netns init in ip_vs_control_net_init() 2558 */ 2559
2560 static struct ctl_table vs_vars[] = { 2561 { 2562 .procname = "amemthresh", 2563 .maxlen = sizeof(int), 2564 .mode = 0644, 2565 .proc_handler = proc_dointvec, 2566 },
2567 { 2568 .procname = "am_droprate", 2569 .maxlen = sizeof(int), 2570 .mode = 0644, 2571 .proc_handler = proc_dointvec, 2572 },
2573 { 2574 .procname = "drop_entry", 2575 .maxlen = sizeof(int), 2576 .mode = 0644, 2577 .proc_handler = proc_do_defense_mode, 2578 },
2579 { 2580 .procname = "drop_packet", 2581 .maxlen = sizeof(int), 2582 .mode = 0644, 2583 .proc_handler = proc_do_defense_mode, 2584 },
2585 #ifdef CONFIG_IP_VS_NFCT 2586 { 2587 .procname = "conntrack", 2588 .maxlen = sizeof(int), 2589 .mode = 0644, 2590 .proc_handler = proc_dointvec, 2591 }, 2592 #endif
2593 { 2594 .procname = "secure_tcp", 2595 .maxlen = sizeof(int), 2596 .mode = 0644, 2597 .proc_handler = proc_do_defense_mode, 2598 },
2599 { 2600 .procname = "snat_reroute", 2601 .maxlen = sizeof(int), 2602 .mode = 0644, 2603 .proc_handler = proc_dointvec, 2604 },
2605 { 2606 .procname = "sync_version", 2607 .maxlen = sizeof(int), 2608 .mode = 0644, 2609 .proc_handler = proc_dointvec_minmax, 2610 .extra1 = SYSCTL_ZERO, 2611 .extra2 = SYSCTL_ONE, 2612 },
2613 { 2614 .procname = "sync_ports", 2615 .maxlen = sizeof(int), 2616 .mode = 0644, 2617 .proc_handler = proc_do_sync_ports, 2618 },
2619 { 2620 .procname = "sync_persist_mode", 2621 .maxlen = sizeof(int), 2622 .mode = 0644, 2623 .proc_handler = proc_dointvec, 2624 },
2625 { 2626 .procname = "sync_qlen_max", 2627 .maxlen = sizeof(unsigned long), 2628 .mode = 0644, 2629 .proc_handler = proc_doulongvec_minmax, 2630 },
2631 { 2632 .procname = "sync_sock_size", 2633 .maxlen = sizeof(int), 2634 .mode = 0644, 2635 .proc_handler = proc_dointvec, 2636 },
2637 { 2638 .procname = "cache_bypass", 2639 .maxlen = sizeof(int), 2640 .mode = 0644, 2641 .proc_handler = proc_dointvec, 2642 },
2643 { 2644 .procname = "expire_nodest_conn", 2645 .maxlen = sizeof(int), 2646
.mode = 0644, 2647 .proc_handler = proc_dointvec, 2648 }, 2649 { 2650 .procname = "sloppy_tcp", 2651 .maxlen = sizeof(int), 2652 .mode = 0644, 2653 .proc_handler = proc_dointvec, 2654 }, 2655 { 2656 .procname = "sloppy_sctp", 2657 .maxlen = sizeof(int), 2658 .mode = 0644, 2659 .proc_handler = proc_dointvec, 2660 }, 2661 { 2662 .procname = "expire_quiescent_template", 2663 .maxlen = sizeof(int), 2664 .mode = 0644, 2665 .proc_handler = proc_dointvec, 2666 }, 2667 { 2668 .procname = "sync_threshold", 2669 .maxlen = 2670 sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), 2671 .mode = 0644, 2672 .proc_handler = proc_do_sync_threshold, 2673 }, 2674 { 2675 .procname = "sync_refresh_period", 2676 .maxlen = sizeof(int), 2677 .mode = 0644, 2678 .proc_handler = proc_dointvec_jiffies, 2679 }, 2680 { 2681 .procname = "sync_retries", 2682 .maxlen = sizeof(int), 2683 .mode = 0644, 2684 .proc_handler = proc_dointvec_minmax, 2685 .extra1 = SYSCTL_ZERO, 2686 .extra2 = SYSCTL_THREE, 2687 }, 2688 { 2689 .procname = "nat_icmp_send", 2690 .maxlen = sizeof(int), 2691 .mode = 0644, 2692 .proc_handler = proc_dointvec, 2693 }, 2694 { 2695 .procname = "pmtu_disc", 2696 .maxlen = sizeof(int), 2697 .mode = 0644, 2698 .proc_handler = proc_dointvec, 2699 }, 2700 { 2701 .procname = "backup_only", 2702 .maxlen = sizeof(int), 2703 .mode = 0644, 2704 .proc_handler = proc_dointvec, 2705 }, 2706 { 2707 .procname = "conn_reuse_mode", 2708 .maxlen = sizeof(int), 2709 .mode = 0644, 2710 .proc_handler = proc_dointvec, 2711 }, 2712 { 2713 .procname = "schedule_icmp", 2714 .maxlen = sizeof(int), 2715 .mode = 0644, 2716 .proc_handler = proc_dointvec, 2717 }, 2718 { 2719 .procname = "ignore_tunneled", 2720 .maxlen = sizeof(int), 2721 .mode = 0644, 2722 .proc_handler = proc_dointvec, 2723 }, 2724 { 2725 .procname = "run_estimation", 2726 .maxlen = sizeof(int), 2727 .mode = 0644, 2728 .proc_handler = ipvs_proc_run_estimation, 2729 }, 2730 { 2731 .procname = "est_cpulist", 2732 .maxlen = NR_CPUS, /* unused */ 2733 .mode = 0644, 2734 .proc_handler = ipvs_proc_est_cpulist, 2735 }, 2736 { 2737 .procname = "est_nice", 2738 .maxlen = sizeof(int), 2739 .mode = 0644, 2740 .proc_handler = ipvs_proc_est_nice, 2741 }, 2742 { 2743 .procname = "conn_lfactor", 2744 .maxlen = sizeof(int), 2745 .mode = 0644, 2746 .proc_handler = ipvs_proc_conn_lfactor, 2747 }, 2748 { 2749 .procname = "svc_lfactor", 2750 .maxlen = sizeof(int), 2751 .mode = 0644, 2752 .proc_handler = ipvs_proc_svc_lfactor, 2753 }, 2754 #ifdef CONFIG_IP_VS_DEBUG 2755 { 2756 .procname = "debug_level", 2757 .data = &sysctl_ip_vs_debug_level, 2758 .maxlen = sizeof(int), 2759 .mode = 0644, 2760 .proc_handler = proc_dointvec, 2761 }, 2762 #endif 2763 }; 2764 2765 #endif 2766 2767 #ifdef CONFIG_PROC_FS 2768 2769 struct ip_vs_iter { 2770 struct seq_net_private p; /* Do not move this, netns depends upon it*/ 2771 struct ip_vs_rht *t; 2772 u32 bucket; 2773 }; 2774 2775 /* 2776 * Write the contents of the VS rule table to a PROCfs file. 2777 * (It is kept just for backward compatibility) 2778 */ 2779 static inline const char *ip_vs_fwd_name(unsigned int flags) 2780 { 2781 switch (flags & IP_VS_CONN_F_FWD_MASK) { 2782 case IP_VS_CONN_F_LOCALNODE: 2783 return "Local"; 2784 case IP_VS_CONN_F_TUNNEL: 2785 return "Tunnel"; 2786 case IP_VS_CONN_F_DROUTE: 2787 return "Route"; 2788 default: 2789 return "Masq"; 2790 } 2791 } 2792 2793 /* Do not expect consistent view during add, del and move(table resize). 2794 * We may miss entries and even show duplicates. 
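 * Unlike the sockopt get path, this reader does not block the resizer.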
2795 */ 2796 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) 2797 { 2798 struct ip_vs_iter *iter = seq->private; 2799 struct ip_vs_rht *t = iter->t; 2800 struct ip_vs_service *svc; 2801 struct hlist_bl_node *e; 2802 int idx; 2803 2804 if (!t) 2805 return NULL; 2806 for (idx = 0; idx < t->size; idx++) { 2807 hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[idx], s_list) { 2808 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2809 break; 2810 if (pos-- == 0) { 2811 iter->bucket = idx; 2812 return svc; 2813 } 2814 } 2815 } 2816 return NULL; 2817 } 2818 2819 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) 2820 __acquires(RCU) 2821 { 2822 struct ip_vs_iter *iter = seq->private; 2823 struct net *net = seq_file_net(seq); 2824 struct netns_ipvs *ipvs = net_ipvs(net); 2825 2826 rcu_read_lock(); 2827 iter->t = rcu_dereference(ipvs->svc_table); 2828 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; 2829 } 2830 2831 2832 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2833 { 2834 struct ip_vs_service *svc; 2835 struct ip_vs_iter *iter; 2836 struct hlist_bl_node *e; 2837 struct ip_vs_rht *t; 2838 2839 ++*pos; 2840 if (v == SEQ_START_TOKEN) 2841 return ip_vs_info_array(seq,0); 2842 2843 svc = v; 2844 iter = seq->private; 2845 t = iter->t; 2846 if (!t) 2847 return NULL; 2848 2849 hlist_bl_for_each_entry_continue_rcu(svc, e, s_list) { 2850 /* Our cursor was moved to new table ? */ 2851 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2852 break; 2853 return svc; 2854 } 2855 2856 while (++iter->bucket < t->size) { 2857 hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[iter->bucket], 2858 s_list) { 2859 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2860 break; 2861 return svc; 2862 } 2863 } 2864 return NULL; 2865 } 2866 2867 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) 2868 __releases(RCU) 2869 { 2870 rcu_read_unlock(); 2871 } 2872 2873 2874 static int ip_vs_info_seq_show(struct seq_file *seq, void *v) 2875 { 2876 struct net *net = seq_file_net(seq); 2877 struct netns_ipvs *ipvs = net_ipvs(net); 2878 2879 if (v == SEQ_START_TOKEN) { 2880 seq_printf(seq, 2881 "IP Virtual Server version %d.%d.%d (size=%d)\n", 2882 NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs)); 2883 seq_puts(seq, 2884 "Prot LocalAddress:Port Scheduler Flags\n"); 2885 seq_puts(seq, 2886 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); 2887 } else { 2888 const struct ip_vs_service *svc = v; 2889 const struct ip_vs_dest *dest; 2890 struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); 2891 char *sched_name = sched ? 
sched->name : "none"; 2892 2893 if (!svc->fwmark) { 2894 #ifdef CONFIG_IP_VS_IPV6 2895 if (svc->af == AF_INET6) 2896 seq_printf(seq, "%s [%pI6]:%04X %s ", 2897 ip_vs_proto_name(svc->protocol), 2898 &svc->addr.in6, 2899 ntohs(svc->port), 2900 sched_name); 2901 else 2902 #endif 2903 seq_printf(seq, "%s %08X:%04X %s %s ", 2904 ip_vs_proto_name(svc->protocol), 2905 ntohl(svc->addr.ip), 2906 ntohs(svc->port), 2907 sched_name, 2908 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); 2909 } else { 2910 seq_printf(seq, "FWM %08X %s %s", 2911 svc->fwmark, sched_name, 2912 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); 2913 } 2914 2915 if (svc->flags & IP_VS_SVC_F_PERSISTENT) 2916 seq_printf(seq, "persistent %d %08X\n", 2917 svc->timeout, 2918 ntohl(svc->netmask)); 2919 else 2920 seq_putc(seq, '\n'); 2921 2922 list_for_each_entry_rcu(dest, &svc->destinations, n_list) { 2923 #ifdef CONFIG_IP_VS_IPV6 2924 if (dest->af == AF_INET6) 2925 seq_printf(seq, 2926 " -> [%pI6]:%04X" 2927 " %-7s %-6d %-10d %-10d\n", 2928 &dest->addr.in6, 2929 ntohs(dest->port), 2930 ip_vs_fwd_name(atomic_read(&dest->conn_flags)), 2931 atomic_read(&dest->weight), 2932 atomic_read(&dest->activeconns), 2933 atomic_read(&dest->inactconns)); 2934 else 2935 #endif 2936 seq_printf(seq, 2937 " -> %08X:%04X " 2938 "%-7s %-6d %-10d %-10d\n", 2939 ntohl(dest->addr.ip), 2940 ntohs(dest->port), 2941 ip_vs_fwd_name(atomic_read(&dest->conn_flags)), 2942 atomic_read(&dest->weight), 2943 atomic_read(&dest->activeconns), 2944 atomic_read(&dest->inactconns)); 2945 2946 } 2947 } 2948 return 0; 2949 } 2950 2951 static const struct seq_operations ip_vs_info_seq_ops = { 2952 .start = ip_vs_info_seq_start, 2953 .next = ip_vs_info_seq_next, 2954 .stop = ip_vs_info_seq_stop, 2955 .show = ip_vs_info_seq_show, 2956 }; 2957 2958 static int ip_vs_stats_show(struct seq_file *seq, void *v) 2959 { 2960 struct net *net = seq_file_single_net(seq); 2961 struct ip_vs_kstats show; 2962 2963 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 2964 seq_puts(seq, 2965 " Total Incoming Outgoing Incoming Outgoing\n"); 2966 seq_puts(seq, 2967 " Conns Packets Packets Bytes Bytes\n"); 2968 2969 ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s); 2970 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", 2971 (unsigned long long)show.conns, 2972 (unsigned long long)show.inpkts, 2973 (unsigned long long)show.outpkts, 2974 (unsigned long long)show.inbytes, 2975 (unsigned long long)show.outbytes); 2976 2977 /* 01234567 01234567 01234567 0123456701234567 0123456701234567*/ 2978 seq_puts(seq, 2979 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); 2980 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n", 2981 (unsigned long long)show.cps, 2982 (unsigned long long)show.inpps, 2983 (unsigned long long)show.outpps, 2984 (unsigned long long)show.inbps, 2985 (unsigned long long)show.outbps); 2986 2987 return 0; 2988 } 2989 2990 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) 2991 { 2992 struct net *net = seq_file_single_net(seq); 2993 struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s; 2994 struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; 2995 struct ip_vs_kstats kstats; 2996 int i; 2997 2998 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 2999 seq_puts(seq, 3000 " Total Incoming Outgoing Incoming Outgoing\n"); 3001 seq_puts(seq, 3002 "CPU Conns Packets Packets Bytes Bytes\n"); 3003 3004 for_each_possible_cpu(i) { 3005 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); 3006 unsigned int start; 3007 u64 conns, inpkts, 
outpkts, inbytes, outbytes; 3008 3009 do { 3010 start = u64_stats_fetch_begin(&u->syncp); 3011 conns = u64_stats_read(&u->cnt.conns); 3012 inpkts = u64_stats_read(&u->cnt.inpkts); 3013 outpkts = u64_stats_read(&u->cnt.outpkts); 3014 inbytes = u64_stats_read(&u->cnt.inbytes); 3015 outbytes = u64_stats_read(&u->cnt.outbytes); 3016 } while (u64_stats_fetch_retry(&u->syncp, start)); 3017 3018 seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", 3019 i, (u64)conns, (u64)inpkts, 3020 (u64)outpkts, (u64)inbytes, 3021 (u64)outbytes); 3022 } 3023 3024 ip_vs_copy_stats(&kstats, tot_stats); 3025 3026 seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n", 3027 (unsigned long long)kstats.conns, 3028 (unsigned long long)kstats.inpkts, 3029 (unsigned long long)kstats.outpkts, 3030 (unsigned long long)kstats.inbytes, 3031 (unsigned long long)kstats.outbytes); 3032 3033 /* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 3034 seq_puts(seq, 3035 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); 3036 seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n", 3037 kstats.cps, 3038 kstats.inpps, 3039 kstats.outpps, 3040 kstats.inbps, 3041 kstats.outbps); 3042 3043 return 0; 3044 } 3045 3046 static int ip_vs_status_show(struct seq_file *seq, void *v) 3047 { 3048 struct net *net = seq_file_single_net(seq); 3049 struct netns_ipvs *ipvs = net_ipvs(net); 3050 unsigned int resched_score = 0; 3051 struct ip_vs_conn_hnode *hn; 3052 struct hlist_bl_head *head; 3053 struct ip_vs_service *svc; 3054 struct ip_vs_rht *t, *pt; 3055 struct hlist_bl_node *e; 3056 int old_gen, new_gen; 3057 u32 counts[8]; 3058 u32 bucket; 3059 u32 count; 3060 int loops; 3061 u32 sum1; 3062 u32 sum; 3063 int i; 3064 3065 rcu_read_lock(); 3066 3067 t = rcu_dereference(ipvs->conn_tab); 3068 3069 seq_printf(seq, "Conns:\t%d\n", atomic_read(&ipvs->conn_count)); 3070 seq_printf(seq, "Conn buckets:\t%d (%d bits, lfactor %d)\n", 3071 t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0); 3072 3073 if (!atomic_read(&ipvs->conn_count)) 3074 goto after_conns; 3075 old_gen = atomic_read(&ipvs->conn_tab_changes); 3076 loops = 0; 3077 3078 repeat_conn: 3079 smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ 3080 memset(counts, 0, sizeof(counts)); 3081 ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { 3082 for (bucket = 0; bucket < t->size; bucket++) { 3083 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 3084 3085 count = 0; 3086 resched_score++; 3087 ip_vs_rht_walk_bucket_rcu(t, bucket, head) { 3088 count = 0; 3089 hlist_bl_for_each_entry_rcu(hn, e, head, node) { 3090 count++; 3091 if (count >= ARRAY_SIZE(counts) - 1) 3092 break; 3093 } 3094 } 3095 resched_score += count; 3096 if (resched_score >= 100) { 3097 resched_score = 0; 3098 cond_resched_rcu(); 3099 new_gen = atomic_read(&ipvs->conn_tab_changes); 3100 /* New table installed ? */ 3101 if (old_gen != new_gen) { 3102 /* Too many changes? 
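 * Limit the restarts and report what was gathered so far.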
*/ 3103 if (++loops >= 5) 3104 goto after_conns; 3105 old_gen = new_gen; 3106 goto repeat_conn; 3107 } 3108 } 3109 counts[count]++; 3110 } 3111 } 3112 for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) 3113 sum += counts[i]; 3114 sum1 = sum - counts[0]; 3115 seq_printf(seq, "Conn buckets empty:\t%u (%llu%%)\n", 3116 counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U))); 3117 for (i = 1; i < ARRAY_SIZE(counts); i++) { 3118 if (!counts[i]) 3119 continue; 3120 seq_printf(seq, "Conn buckets len-%d:\t%u (%llu%%)\n", 3121 i, counts[i], 3122 div_u64((u64)counts[i] * 100U, max(sum1, 1U))); 3123 } 3124 3125 after_conns: 3126 t = rcu_dereference(ipvs->svc_table); 3127 3128 count = ip_vs_get_num_services(ipvs); 3129 seq_printf(seq, "Services:\t%u\n", count); 3130 seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n", 3131 t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0); 3132 3133 if (!count) 3134 goto after_svc; 3135 old_gen = atomic_read(&ipvs->svc_table_changes); 3136 loops = 0; 3137 3138 repeat_svc: 3139 smp_rmb(); /* ipvs->svc_table and svc_table_changes */ 3140 memset(counts, 0, sizeof(counts)); 3141 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) { 3142 for (bucket = 0; bucket < t->size; bucket++) { 3143 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 3144 3145 count = 0; 3146 resched_score++; 3147 ip_vs_rht_walk_bucket_rcu(t, bucket, head) { 3148 count = 0; 3149 hlist_bl_for_each_entry_rcu(svc, e, head, 3150 s_list) { 3151 count++; 3152 if (count >= ARRAY_SIZE(counts) - 1) 3153 break; 3154 } 3155 } 3156 resched_score += count; 3157 if (resched_score >= 100) { 3158 resched_score = 0; 3159 cond_resched_rcu(); 3160 new_gen = atomic_read(&ipvs->svc_table_changes); 3161 /* New table installed ? */ 3162 if (old_gen != new_gen) { 3163 /* Too many changes? */ 3164 if (++loops >= 5) 3165 goto after_svc; 3166 old_gen = new_gen; 3167 goto repeat_svc; 3168 } 3169 } 3170 counts[count]++; 3171 } 3172 } 3173 for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) 3174 sum += counts[i]; 3175 sum1 = sum - counts[0]; 3176 seq_printf(seq, "Service buckets empty:\t%u (%llu%%)\n", 3177 counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U))); 3178 for (i = 1; i < ARRAY_SIZE(counts); i++) { 3179 if (!counts[i]) 3180 continue; 3181 seq_printf(seq, "Service buckets len-%d:\t%u (%llu%%)\n", 3182 i, counts[i], 3183 div_u64((u64)counts[i] * 100U, max(sum1, 1U))); 3184 } 3185 3186 after_svc: 3187 seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n", 3188 ipvs->est_kt_count, ipvs->est_max_threads); 3189 seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max); 3190 seq_printf(seq, "Stats thread ests:\t%d\n", 3191 ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR * 3192 IPVS_EST_NTICKS); 3193 3194 rcu_read_unlock(); 3195 return 0; 3196 } 3197 3198 #endif 3199 3200 /* 3201 * Set timeout values for tcp tcpfin udp in the timeout_table. 
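 * A zero value leaves that timeout unchanged, e.g. {0, 120, 0}
 * updates only the TCP FIN_WAIT timeout.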
3202 */ 3203 static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) 3204 { 3205 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) 3206 struct ip_vs_proto_data *pd; 3207 #endif 3208 3209 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", 3210 u->tcp_timeout, 3211 u->tcp_fin_timeout, 3212 u->udp_timeout); 3213 3214 #ifdef CONFIG_IP_VS_PROTO_TCP 3215 if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) || 3216 u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) { 3217 return -EINVAL; 3218 } 3219 #endif 3220 3221 #ifdef CONFIG_IP_VS_PROTO_UDP 3222 if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ)) 3223 return -EINVAL; 3224 #endif 3225 3226 #ifdef CONFIG_IP_VS_PROTO_TCP 3227 if (u->tcp_timeout) { 3228 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); 3229 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] 3230 = u->tcp_timeout * HZ; 3231 } 3232 3233 if (u->tcp_fin_timeout) { 3234 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); 3235 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] 3236 = u->tcp_fin_timeout * HZ; 3237 } 3238 #endif 3239 3240 #ifdef CONFIG_IP_VS_PROTO_UDP 3241 if (u->udp_timeout) { 3242 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); 3243 pd->timeout_table[IP_VS_UDP_S_NORMAL] 3244 = u->udp_timeout * HZ; 3245 } 3246 #endif 3247 return 0; 3248 } 3249 3250 #define CMDID(cmd) (cmd - IP_VS_BASE_CTL) 3251 3252 struct ip_vs_svcdest_user { 3253 struct ip_vs_service_user s; 3254 struct ip_vs_dest_user d; 3255 }; 3256 3257 static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = { 3258 [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user), 3259 [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user), 3260 [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user), 3261 [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user), 3262 [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user), 3263 [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user), 3264 [CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), 3265 [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user), 3266 [CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user), 3267 [CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user), 3268 }; 3269 3270 union ip_vs_set_arglen { 3271 struct ip_vs_service_user field_IP_VS_SO_SET_ADD; 3272 struct ip_vs_service_user field_IP_VS_SO_SET_EDIT; 3273 struct ip_vs_service_user field_IP_VS_SO_SET_DEL; 3274 struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST; 3275 struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST; 3276 struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST; 3277 struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT; 3278 struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON; 3279 struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON; 3280 struct ip_vs_service_user field_IP_VS_SO_SET_ZERO; 3281 }; 3282 3283 #define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen) 3284 3285 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, 3286 struct ip_vs_service_user *usvc_compat) 3287 { 3288 memset(usvc, 0, sizeof(*usvc)); 3289 3290 usvc->af = AF_INET; 3291 usvc->protocol = usvc_compat->protocol; 3292 usvc->addr.ip = usvc_compat->addr; 3293 usvc->port = usvc_compat->port; 3294 usvc->fwmark = usvc_compat->fwmark; 3295 3296 /* Deep copy of sched_name is not needed here */ 3297 usvc->sched_name = usvc_compat->sched_name; 3298 3299 usvc->flags = usvc_compat->flags; 3300 usvc->timeout = usvc_compat->timeout; 3301 
usvc->netmask = usvc_compat->netmask; 3302 } 3303 3304 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, 3305 struct ip_vs_dest_user *udest_compat) 3306 { 3307 memset(udest, 0, sizeof(*udest)); 3308 3309 udest->addr.ip = udest_compat->addr; 3310 udest->port = udest_compat->port; 3311 udest->conn_flags = udest_compat->conn_flags; 3312 udest->weight = udest_compat->weight; 3313 udest->u_threshold = udest_compat->u_threshold; 3314 udest->l_threshold = udest_compat->l_threshold; 3315 udest->af = AF_INET; 3316 udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP; 3317 } 3318 3319 static int 3320 do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) 3321 { 3322 struct net *net = sock_net(sk); 3323 int ret; 3324 unsigned char arg[MAX_SET_ARGLEN]; 3325 struct ip_vs_service_user *usvc_compat; 3326 struct ip_vs_service_user_kern usvc; 3327 struct ip_vs_service *svc; 3328 struct ip_vs_dest_user *udest_compat; 3329 struct ip_vs_dest_user_kern udest; 3330 struct netns_ipvs *ipvs = net_ipvs(net); 3331 3332 BUILD_BUG_ON(sizeof(arg) > 255); 3333 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3334 return -EPERM; 3335 3336 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) 3337 return -EINVAL; 3338 if (len != set_arglen[CMDID(cmd)]) { 3339 IP_VS_DBG(1, "set_ctl: len %u != %u\n", 3340 len, set_arglen[CMDID(cmd)]); 3341 return -EINVAL; 3342 } 3343 3344 if (copy_from_sockptr(arg, ptr, len) != 0) 3345 return -EFAULT; 3346 3347 /* Handle daemons since they have another lock */ 3348 if (cmd == IP_VS_SO_SET_STARTDAEMON || 3349 cmd == IP_VS_SO_SET_STOPDAEMON) { 3350 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; 3351 3352 if (cmd == IP_VS_SO_SET_STARTDAEMON) { 3353 struct ipvs_sync_daemon_cfg cfg; 3354 3355 memset(&cfg, 0, sizeof(cfg)); 3356 ret = -EINVAL; 3357 if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, 3358 sizeof(cfg.mcast_ifn)) <= 0) 3359 return ret; 3360 cfg.syncid = dm->syncid; 3361 ret = start_sync_thread(ipvs, &cfg, dm->state); 3362 } else { 3363 ret = stop_sync_thread(ipvs, dm->state); 3364 } 3365 return ret; 3366 } 3367 3368 mutex_lock(&ipvs->service_mutex); 3369 if (cmd == IP_VS_SO_SET_FLUSH) { 3370 /* Flush the virtual service */ 3371 ret = ip_vs_flush(ipvs, false); 3372 goto out_unlock; 3373 } else if (cmd == IP_VS_SO_SET_TIMEOUT) { 3374 /* Set timeout values for (tcp tcpfin udp) */ 3375 ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg); 3376 goto out_unlock; 3377 } else if (!len) { 3378 /* No more commands with len == 0 below */ 3379 ret = -EINVAL; 3380 goto out_unlock; 3381 } 3382 3383 usvc_compat = (struct ip_vs_service_user *)arg; 3384 udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); 3385 3386 /* We only use the new structs internally, so copy userspace compat 3387 * structs to extended internal versions */ 3388 ip_vs_copy_usvc_compat(&usvc, usvc_compat); 3389 ip_vs_copy_udest_compat(&udest, udest_compat); 3390 3391 if (cmd == IP_VS_SO_SET_ZERO) { 3392 /* if no service address is set, zero counters in all */ 3393 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { 3394 ret = ip_vs_zero_all(ipvs); 3395 goto out_unlock; 3396 } 3397 } 3398 3399 if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) && 3400 strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) == 3401 IP_VS_SCHEDNAME_MAXLEN) { 3402 ret = -EINVAL; 3403 goto out_unlock; 3404 } 3405 3406 /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */ 3407 if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP && 3408 usvc.protocol != 
IPPROTO_SCTP) { 3409 pr_err("set_ctl: invalid protocol: %d %pI4:%d\n", 3410 usvc.protocol, &usvc.addr.ip, 3411 ntohs(usvc.port)); 3412 ret = -EFAULT; 3413 goto out_unlock; 3414 } 3415 3416 /* Lookup the exact service by <protocol, addr, port> or fwmark */ 3417 rcu_read_lock(); 3418 if (usvc.fwmark == 0) 3419 svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol, 3420 &usvc.addr, usvc.port); 3421 else 3422 svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark); 3423 rcu_read_unlock(); 3424 3425 if (cmd != IP_VS_SO_SET_ADD 3426 && (svc == NULL || svc->protocol != usvc.protocol)) { 3427 ret = -ESRCH; 3428 goto out_unlock; 3429 } 3430 3431 switch (cmd) { 3432 case IP_VS_SO_SET_ADD: 3433 if (svc != NULL) 3434 ret = -EEXIST; 3435 else 3436 ret = ip_vs_add_service(ipvs, &usvc, &svc); 3437 break; 3438 case IP_VS_SO_SET_EDIT: 3439 ret = ip_vs_edit_service(svc, &usvc); 3440 break; 3441 case IP_VS_SO_SET_DEL: 3442 ret = ip_vs_del_service(svc); 3443 if (!ret) 3444 goto out_unlock; 3445 break; 3446 case IP_VS_SO_SET_ZERO: 3447 ret = ip_vs_zero_service(svc); 3448 break; 3449 case IP_VS_SO_SET_ADDDEST: 3450 ret = ip_vs_add_dest(svc, &udest); 3451 break; 3452 case IP_VS_SO_SET_EDITDEST: 3453 ret = ip_vs_edit_dest(svc, &udest); 3454 break; 3455 case IP_VS_SO_SET_DELDEST: 3456 ret = ip_vs_del_dest(svc, &udest); 3457 break; 3458 default: 3459 WARN_ON_ONCE(1); 3460 ret = -EINVAL; 3461 break; 3462 } 3463 3464 out_unlock: 3465 mutex_unlock(&ipvs->service_mutex); 3466 return ret; 3467 } 3468 3469 3470 static void 3471 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) 3472 { 3473 struct ip_vs_scheduler *sched; 3474 struct ip_vs_kstats kstats; 3475 char *sched_name; 3476 3477 sched = rcu_dereference_protected(src->scheduler, 1); 3478 sched_name = sched ? 
sched->name : "none"; 3479 dst->protocol = src->protocol; 3480 dst->addr = src->addr.ip; 3481 dst->port = src->port; 3482 dst->fwmark = src->fwmark; 3483 strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); 3484 dst->flags = src->flags; 3485 dst->timeout = src->timeout / HZ; 3486 dst->netmask = src->netmask; 3487 dst->num_dests = src->num_dests; 3488 ip_vs_copy_stats(&kstats, &src->stats); 3489 ip_vs_export_stats_user(&dst->stats, &kstats); 3490 } 3491 3492 static inline int 3493 __ip_vs_get_service_entries(struct netns_ipvs *ipvs, 3494 const struct ip_vs_get_services *get, 3495 struct ip_vs_get_services __user *uptr) 3496 { 3497 struct ip_vs_service_entry entry; 3498 DECLARE_IP_VS_RHT_WALK_BUCKETS(); 3499 struct hlist_bl_head *head; 3500 struct ip_vs_service *svc; 3501 struct hlist_bl_node *e; 3502 int count = 0; 3503 int ret = 0; 3504 3505 lockdep_assert_held(&ipvs->svc_resize_sem); 3506 /* All service modifications are disabled, go ahead */ 3507 ip_vs_rht_walk_buckets(ipvs->svc_table, head) { 3508 hlist_bl_for_each_entry(svc, e, head, s_list) { 3509 /* Only expose IPv4 entries to old interface */ 3510 if (svc->af != AF_INET) 3511 continue; 3512 3513 if (count >= get->num_services) 3514 goto out; 3515 memset(&entry, 0, sizeof(entry)); 3516 ip_vs_copy_service(&entry, svc); 3517 if (copy_to_user(&uptr->entrytable[count], 3518 &entry, sizeof(entry))) { 3519 ret = -EFAULT; 3520 goto out; 3521 } 3522 count++; 3523 } 3524 } 3525 3526 out: 3527 return ret; 3528 } 3529 3530 static inline int 3531 __ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get, 3532 struct ip_vs_get_dests __user *uptr) 3533 { 3534 struct ip_vs_service *svc; 3535 union nf_inet_addr addr = { .ip = get->addr }; 3536 int ret = 0; 3537 3538 rcu_read_lock(); 3539 if (get->fwmark) 3540 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark); 3541 else 3542 svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr, 3543 get->port); 3544 rcu_read_unlock(); 3545 3546 if (svc) { 3547 int count = 0; 3548 struct ip_vs_dest *dest; 3549 struct ip_vs_dest_entry entry; 3550 struct ip_vs_kstats kstats; 3551 3552 memset(&entry, 0, sizeof(entry)); 3553 list_for_each_entry(dest, &svc->destinations, n_list) { 3554 if (count >= get->num_dests) 3555 break; 3556 3557 /* Cannot expose heterogeneous members via sockopt 3558 * interface 3559 */ 3560 if (dest->af != svc->af) 3561 continue; 3562 3563 entry.addr = dest->addr.ip; 3564 entry.port = dest->port; 3565 entry.conn_flags = atomic_read(&dest->conn_flags); 3566 entry.weight = atomic_read(&dest->weight); 3567 entry.u_threshold = dest->u_threshold; 3568 entry.l_threshold = dest->l_threshold; 3569 entry.activeconns = atomic_read(&dest->activeconns); 3570 entry.inactconns = atomic_read(&dest->inactconns); 3571 entry.persistconns = atomic_read(&dest->persistconns); 3572 ip_vs_copy_stats(&kstats, &dest->stats); 3573 ip_vs_export_stats_user(&entry.stats, &kstats); 3574 if (copy_to_user(&uptr->entrytable[count], 3575 &entry, sizeof(entry))) { 3576 ret = -EFAULT; 3577 break; 3578 } 3579 count++; 3580 } 3581 } else 3582 ret = -ESRCH; 3583 return ret; 3584 } 3585 3586 static inline void 3587 __ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) 3588 { 3589 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) 3590 struct ip_vs_proto_data *pd; 3591 #endif 3592 3593 memset(u, 0, sizeof (*u)); 3594 3595 #ifdef CONFIG_IP_VS_PROTO_TCP 3596 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); 3597 u->tcp_timeout = 
pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; 3598 u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; 3599 #endif 3600 #ifdef CONFIG_IP_VS_PROTO_UDP 3601 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); 3602 u->udp_timeout = 3603 pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; 3604 #endif 3605 } 3606 3607 static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = { 3608 [CMDID(IP_VS_SO_GET_VERSION)] = 64, 3609 [CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo), 3610 [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services), 3611 [CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry), 3612 [CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests), 3613 [CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), 3614 [CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user), 3615 }; 3616 3617 union ip_vs_get_arglen { 3618 char field_IP_VS_SO_GET_VERSION[64]; 3619 struct ip_vs_getinfo field_IP_VS_SO_GET_INFO; 3620 struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES; 3621 struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE; 3622 struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS; 3623 struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT; 3624 struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2]; 3625 }; 3626 3627 #define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen) 3628 3629 static int 3630 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) 3631 { 3632 unsigned char arg[MAX_GET_ARGLEN]; 3633 int ret = 0; 3634 unsigned int copylen; 3635 struct net *net = sock_net(sk); 3636 struct netns_ipvs *ipvs = net_ipvs(net); 3637 3638 BUG_ON(!net); 3639 BUILD_BUG_ON(sizeof(arg) > 255); 3640 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3641 return -EPERM; 3642 3643 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) 3644 return -EINVAL; 3645 3646 copylen = get_arglen[CMDID(cmd)]; 3647 if (*len < (int) copylen) { 3648 IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen); 3649 return -EINVAL; 3650 } 3651 3652 if (copy_from_user(arg, user, copylen) != 0) 3653 return -EFAULT; 3654 /* 3655 * Handle daemons first since it has its own locking 3656 */ 3657 if (cmd == IP_VS_SO_GET_DAEMON) { 3658 struct ip_vs_daemon_user d[2]; 3659 3660 memset(&d, 0, sizeof(d)); 3661 mutex_lock(&ipvs->sync_mutex); 3662 if (ipvs->sync_state & IP_VS_STATE_MASTER) { 3663 d[0].state = IP_VS_STATE_MASTER; 3664 strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, 3665 sizeof(d[0].mcast_ifn)); 3666 d[0].syncid = ipvs->mcfg.syncid; 3667 } 3668 if (ipvs->sync_state & IP_VS_STATE_BACKUP) { 3669 d[1].state = IP_VS_STATE_BACKUP; 3670 strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, 3671 sizeof(d[1].mcast_ifn)); 3672 d[1].syncid = ipvs->bcfg.syncid; 3673 } 3674 if (copy_to_user(user, &d, sizeof(d)) != 0) 3675 ret = -EFAULT; 3676 mutex_unlock(&ipvs->sync_mutex); 3677 return ret; 3678 } 3679 3680 if (cmd == IP_VS_SO_GET_SERVICES) { 3681 struct ip_vs_get_services *get; 3682 size_t size; 3683 3684 get = (struct ip_vs_get_services *)arg; 3685 size = struct_size(get, entrytable, get->num_services); 3686 if (*len != size) { 3687 pr_err("length: %u != %zu\n", *len, size); 3688 return -EINVAL; 3689 } 3690 /* Protect against table resizer moving the entries. 3691 * Try reverse locking, so that we do not hold the mutex 3692 * while waiting for semaphore. 
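 * On contention, drop the semaphore, reschedule and retry.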
3693 */ 3694 while (1) { 3695 ret = down_read_killable(&ipvs->svc_resize_sem); 3696 if (ret < 0) 3697 return ret; 3698 if (mutex_trylock(&ipvs->service_mutex)) 3699 break; 3700 up_read(&ipvs->svc_resize_sem); 3701 cond_resched(); 3702 } 3703 ret = __ip_vs_get_service_entries(ipvs, get, user); 3704 up_read(&ipvs->svc_resize_sem); 3705 mutex_unlock(&ipvs->service_mutex); 3706 return ret; 3707 } 3708 3709 mutex_lock(&ipvs->service_mutex); 3710 switch (cmd) { 3711 case IP_VS_SO_GET_VERSION: 3712 { 3713 char buf[64]; 3714 3715 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", 3716 NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs)); 3717 if (copy_to_user(user, buf, strlen(buf)+1) != 0) { 3718 ret = -EFAULT; 3719 goto out; 3720 } 3721 *len = strlen(buf)+1; 3722 } 3723 break; 3724 3725 case IP_VS_SO_GET_INFO: 3726 { 3727 struct ip_vs_getinfo info; 3728 3729 info.version = IP_VS_VERSION_CODE; 3730 info.size = get_conn_tab_size(ipvs); 3731 info.num_services = 3732 atomic_read(&ipvs->num_services[IP_VS_AF_INET]); 3733 if (copy_to_user(user, &info, sizeof(info)) != 0) 3734 ret = -EFAULT; 3735 } 3736 break; 3737 3738 case IP_VS_SO_GET_SERVICE: 3739 { 3740 struct ip_vs_service_entry *entry; 3741 struct ip_vs_service *svc; 3742 union nf_inet_addr addr; 3743 3744 entry = (struct ip_vs_service_entry *)arg; 3745 addr.ip = entry->addr; 3746 rcu_read_lock(); 3747 if (entry->fwmark) 3748 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark); 3749 else 3750 svc = __ip_vs_service_find(ipvs, AF_INET, 3751 entry->protocol, &addr, 3752 entry->port); 3753 rcu_read_unlock(); 3754 if (svc) { 3755 ip_vs_copy_service(entry, svc); 3756 if (copy_to_user(user, entry, sizeof(*entry)) != 0) 3757 ret = -EFAULT; 3758 } else 3759 ret = -ESRCH; 3760 } 3761 break; 3762 3763 case IP_VS_SO_GET_DESTS: 3764 { 3765 struct ip_vs_get_dests *get; 3766 size_t size; 3767 3768 get = (struct ip_vs_get_dests *)arg; 3769 size = struct_size(get, entrytable, get->num_dests); 3770 if (*len != size) { 3771 pr_err("length: %u != %zu\n", *len, size); 3772 ret = -EINVAL; 3773 goto out; 3774 } 3775 ret = __ip_vs_get_dest_entries(ipvs, get, user); 3776 } 3777 break; 3778 3779 case IP_VS_SO_GET_TIMEOUT: 3780 { 3781 struct ip_vs_timeout_user t; 3782 3783 __ip_vs_get_timeouts(ipvs, &t); 3784 if (copy_to_user(user, &t, sizeof(t)) != 0) 3785 ret = -EFAULT; 3786 } 3787 break; 3788 3789 default: 3790 ret = -EINVAL; 3791 } 3792 3793 out: 3794 mutex_unlock(&ipvs->service_mutex); 3795 return ret; 3796 } 3797 3798 3799 static struct nf_sockopt_ops ip_vs_sockopts = { 3800 .pf = PF_INET, 3801 .set_optmin = IP_VS_BASE_CTL, 3802 .set_optmax = IP_VS_SO_SET_MAX+1, 3803 .set = do_ip_vs_set_ctl, 3804 .get_optmin = IP_VS_BASE_CTL, 3805 .get_optmax = IP_VS_SO_GET_MAX+1, 3806 .get = do_ip_vs_get_ctl, 3807 .owner = THIS_MODULE, 3808 }; 3809 3810 /* 3811 * Generic Netlink interface 3812 */ 3813 3814 /* IPVS genetlink family */ 3815 static struct genl_family ip_vs_genl_family; 3816 3817 /* Policy used for first-level command attributes */ 3818 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { 3819 [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, 3820 [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, 3821 [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, 3822 [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, 3823 [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, 3824 [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, 3825 }; 3826 3827 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ 3828 static const struct 
nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { 3829 [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, 3830 [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, 3831 .len = IP_VS_IFNAME_MAXLEN - 1 }, 3832 [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, 3833 [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, 3834 [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, 3835 [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) }, 3836 [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 }, 3837 [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 }, 3838 }; 3839 3840 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ 3841 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { 3842 [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, 3843 [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, 3844 [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, 3845 .len = sizeof(union nf_inet_addr) }, 3846 [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, 3847 [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, 3848 [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, 3849 .len = IP_VS_SCHEDNAME_MAXLEN - 1 }, 3850 [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, 3851 .len = IP_VS_PENAME_MAXLEN }, 3852 [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, 3853 .len = sizeof(struct ip_vs_flags) }, 3854 [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, 3855 [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, 3856 [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, 3857 }; 3858 3859 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ 3860 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { 3861 [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, 3862 .len = sizeof(union nf_inet_addr) }, 3863 [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, 3864 [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, 3865 [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, 3866 [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, 3867 [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, 3868 [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, 3869 [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, 3870 [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, 3871 [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, 3872 [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, 3873 [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, 3874 [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, 3875 [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, 3876 }; 3877 3878 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, 3879 struct ip_vs_kstats *kstats) 3880 { 3881 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); 3882 3883 if (!nl_stats) 3884 return -EMSGSIZE; 3885 3886 if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || 3887 nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || 3888 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || 3889 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, 3890 IPVS_STATS_ATTR_PAD) || 3891 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, 3892 IPVS_STATS_ATTR_PAD) || 3893 nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || 3894 nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || 3895 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || 3896 nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) || 3897 nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps)) 3898 goto nla_put_failure; 3899 nla_nest_end(skb, nl_stats); 3900 3901 return 
0; 3902 3903 nla_put_failure: 3904 nla_nest_cancel(skb, nl_stats); 3905 return -EMSGSIZE; 3906 } 3907 3908 static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, 3909 struct ip_vs_kstats *kstats) 3910 { 3911 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); 3912 3913 if (!nl_stats) 3914 return -EMSGSIZE; 3915 3916 if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns, 3917 IPVS_STATS_ATTR_PAD) || 3918 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts, 3919 IPVS_STATS_ATTR_PAD) || 3920 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts, 3921 IPVS_STATS_ATTR_PAD) || 3922 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, 3923 IPVS_STATS_ATTR_PAD) || 3924 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, 3925 IPVS_STATS_ATTR_PAD) || 3926 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps, 3927 IPVS_STATS_ATTR_PAD) || 3928 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps, 3929 IPVS_STATS_ATTR_PAD) || 3930 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps, 3931 IPVS_STATS_ATTR_PAD) || 3932 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps, 3933 IPVS_STATS_ATTR_PAD) || 3934 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps, 3935 IPVS_STATS_ATTR_PAD)) 3936 goto nla_put_failure; 3937 nla_nest_end(skb, nl_stats); 3938 3939 return 0; 3940 3941 nla_put_failure: 3942 nla_nest_cancel(skb, nl_stats); 3943 return -EMSGSIZE; 3944 } 3945 3946 static int ip_vs_genl_fill_service(struct sk_buff *skb, 3947 struct ip_vs_service *svc) 3948 { 3949 struct ip_vs_scheduler *sched; 3950 struct ip_vs_pe *pe; 3951 struct nlattr *nl_service; 3952 struct ip_vs_flags flags = { .flags = svc->flags, 3953 .mask = ~0 }; 3954 struct ip_vs_kstats kstats; 3955 char *sched_name; 3956 3957 nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE); 3958 if (!nl_service) 3959 return -EMSGSIZE; 3960 3961 if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) 3962 goto nla_put_failure; 3963 if (svc->fwmark) { 3964 if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) 3965 goto nla_put_failure; 3966 } else { 3967 if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || 3968 nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || 3969 nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) 3970 goto nla_put_failure; 3971 } 3972 3973 sched = rcu_dereference(svc->scheduler); 3974 sched_name = sched ? 
sched->name : "none"; 3975 pe = rcu_dereference(svc->pe); 3976 if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) || 3977 (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || 3978 nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || 3979 nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || 3980 nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) 3981 goto nla_put_failure; 3982 ip_vs_copy_stats(&kstats, &svc->stats); 3983 if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats)) 3984 goto nla_put_failure; 3985 if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats)) 3986 goto nla_put_failure; 3987 3988 nla_nest_end(skb, nl_service); 3989 3990 return 0; 3991 3992 nla_put_failure: 3993 nla_nest_cancel(skb, nl_service); 3994 return -EMSGSIZE; 3995 } 3996 3997 static int ip_vs_genl_dump_service(struct sk_buff *skb, 3998 struct ip_vs_service *svc, 3999 struct netlink_callback *cb) 4000 { 4001 void *hdr; 4002 4003 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 4004 &ip_vs_genl_family, NLM_F_MULTI, 4005 IPVS_CMD_NEW_SERVICE); 4006 if (!hdr) 4007 return -EMSGSIZE; 4008 4009 if (ip_vs_genl_fill_service(skb, svc) < 0) 4010 goto nla_put_failure; 4011 4012 genlmsg_end(skb, hdr); 4013 return 0; 4014 4015 nla_put_failure: 4016 genlmsg_cancel(skb, hdr); 4017 return -EMSGSIZE; 4018 } 4019 4020 static int ip_vs_genl_dump_services(struct sk_buff *skb, 4021 struct netlink_callback *cb) 4022 { 4023 DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); 4024 struct net *net = sock_net(skb->sk); 4025 struct netns_ipvs *ipvs = net_ipvs(net); 4026 struct hlist_bl_head *head; 4027 struct ip_vs_service *svc; 4028 struct hlist_bl_node *e; 4029 int start = cb->args[0]; 4030 int idx = 0; 4031 4032 down_read(&ipvs->svc_resize_sem); 4033 rcu_read_lock(); 4034 ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) { 4035 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 4036 if (++idx <= start) 4037 continue; 4038 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { 4039 idx--; 4040 goto nla_put_failure; 4041 } 4042 } 4043 } 4044 4045 nla_put_failure: 4046 rcu_read_unlock(); 4047 up_read(&ipvs->svc_resize_sem); 4048 cb->args[0] = idx; 4049 4050 return skb->len; 4051 } 4052 4053 static bool ip_vs_is_af_valid(int af) 4054 { 4055 if (af == AF_INET) 4056 return true; 4057 #ifdef CONFIG_IP_VS_IPV6 4058 if (af == AF_INET6 && ipv6_mod_enabled()) 4059 return true; 4060 #endif 4061 return false; 4062 } 4063 4064 static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, 4065 struct ip_vs_service_user_kern *usvc, 4066 struct nlattr *nla, bool full_entry, 4067 struct ip_vs_service **ret_svc) 4068 { 4069 struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; 4070 struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; 4071 struct ip_vs_service *svc; 4072 4073 /* Parse mandatory identifying service fields first */ 4074 if (nla == NULL || 4075 nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL)) 4076 return -EINVAL; 4077 4078 nla_af = attrs[IPVS_SVC_ATTR_AF]; 4079 nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; 4080 nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; 4081 nla_port = attrs[IPVS_SVC_ATTR_PORT]; 4082 nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; 4083 4084 if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) 4085 return -EINVAL; 4086 4087 memset(usvc, 0, sizeof(*usvc)); 4088 4089 usvc->af = nla_get_u16(nla_af); 4090 if (!ip_vs_is_af_valid(usvc->af)) 4091 return -EAFNOSUPPORT; 4092 4093 if (nla_fwmark) { 4094 
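/* No protocol attribute for fwmark services; use TCP as a
 * placeholder protocol value.
 */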
static bool ip_vs_is_af_valid(int af)
{
	if (af == AF_INET)
		return true;
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6 && ipv6_mod_enabled())
		return true;
#endif
	return false;
}

static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
				    struct ip_vs_service_user_kern *usvc,
				    struct nlattr *nla, bool full_entry,
				    struct ip_vs_service **ret_svc)
{
	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
	struct ip_vs_service *svc;

	/* Parse mandatory identifying service fields first */
	if (nla == NULL ||
	    nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL))
		return -EINVAL;

	nla_af = attrs[IPVS_SVC_ATTR_AF];
	nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL];
	nla_addr = attrs[IPVS_SVC_ATTR_ADDR];
	nla_port = attrs[IPVS_SVC_ATTR_PORT];
	nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK];

	if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
		return -EINVAL;

	memset(usvc, 0, sizeof(*usvc));

	usvc->af = nla_get_u16(nla_af);
	if (!ip_vs_is_af_valid(usvc->af))
		return -EAFNOSUPPORT;

	if (nla_fwmark) {
		usvc->protocol = IPPROTO_TCP;
		usvc->fwmark = nla_get_u32(nla_fwmark);
	} else {
		usvc->protocol = nla_get_u16(nla_protocol);
		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
		usvc->port = nla_get_be16(nla_port);
		usvc->fwmark = 0;
	}

	if (usvc->fwmark)
		svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark);
	else
		svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol,
					   &usvc->addr, usvc->port);
	*ret_svc = svc;

	/* If a full entry was requested, check for the additional fields */
	if (full_entry) {
		struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
			      *nla_netmask;
		struct ip_vs_flags flags;

		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
		nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];

		if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
			return -EINVAL;

		nla_memcpy(&flags, nla_flags, sizeof(flags));

		/* prefill flags from service if it already exists */
		if (svc)
			usvc->flags = svc->flags;

		/* set new flags from userland */
		usvc->flags = (usvc->flags & ~flags.mask) |
			      (flags.flags & flags.mask);
		usvc->sched_name = nla_data(nla_sched);
		usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
		usvc->timeout = nla_get_u32(nla_timeout);
		usvc->netmask = nla_get_be32(nla_netmask);
	}

	return 0;
}

static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
						     struct nlattr *nla)
{
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_service *svc;
	int ret;

	ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc);
	return ret ? ERR_PTR(ret) : svc;
}
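
/* ip_vs_genl_find_service() has three possible outcomes: an ERR_PTR()
 * for a malformed request, NULL for a well-formed request that matches
 * no service, and a valid pointer otherwise.  Callers therefore check
 * the two failure cases separately (illustrative sketch):
 *
 *	svc = ip_vs_genl_find_service(ipvs, nla);
 *	if (IS_ERR(svc))
 *		return PTR_ERR(svc);	// bad request
 *	if (!svc)
 *		return -ESRCH;		// no such service
 *
 * Note also the flags merge in the full-entry path above: userspace
 * sends a value/mask pair, so only the flag bits named in flags.mask
 * are changed and all other bits keep the service's current setting.
 */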
static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
{
	struct nlattr *nl_dest;
	struct ip_vs_kstats kstats;

	nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST);
	if (!nl_dest)
		return -EMSGSIZE;

	if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
	    nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
			(atomic_read(&dest->conn_flags) &
			 IP_VS_CONN_F_FWD_MASK)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
			atomic_read(&dest->weight)) ||
	    nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
		       dest->tun_type) ||
	    nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
			 dest->tun_port) ||
	    nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS,
			dest->tun_flags) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
			atomic_read(&dest->activeconns)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
			atomic_read(&dest->inactconns)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
			atomic_read(&dest->persistconns)) ||
	    nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af))
		goto nla_put_failure;
	ip_vs_copy_stats(&kstats, &dest->stats);
	if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats))
		goto nla_put_failure;
	if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats))
		goto nla_put_failure;

	nla_nest_end(skb, nl_dest);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_dest);
	return -EMSGSIZE;
}

static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
				struct netlink_callback *cb)
{
	void *hdr;

	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &ip_vs_genl_family, NLM_F_MULTI,
			  IPVS_CMD_NEW_DEST);
	if (!hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_dest(skb, dest) < 0)
		goto nla_put_failure;

	genlmsg_end(skb, hdr);
	return 0;

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}

static int ip_vs_genl_dump_dests(struct sk_buff *skb,
				 struct netlink_callback *cb)
{
	int idx = 0;
	int start = cb->args[0];
	struct ip_vs_service *svc;
	struct ip_vs_dest *dest;
	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	rcu_read_lock();

	/* Try to find the service for which to dump destinations */
	if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack))
		goto out_err;

	svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
	if (IS_ERR_OR_NULL(svc))
		goto out_err;

	/* Dump the destinations */
	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
		if (++idx <= start)
			continue;
		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
			idx--;
			goto nla_put_failure;
		}
	}

nla_put_failure:
	cb->args[0] = idx;

out_err:
	rcu_read_unlock();

	return skb->len;
}
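
/* A GET_DEST dump request carries the service reference in the same
 * nested IPVS_CMD_ATTR_SERVICE attribute used elsewhere.  Sketch of
 * the message layout the lookup above expects (illustrative only):
 *
 *	IPVS_CMD_ATTR_SERVICE (nested)
 *		IPVS_SVC_ATTR_AF	(u16)
 *		IPVS_SVC_ATTR_PROTOCOL	(u16)
 *		IPVS_SVC_ATTR_ADDR	(binary)
 *		IPVS_SVC_ATTR_PORT	(be16)
 *	  or	IPVS_SVC_ATTR_FWMARK	(u32)
 */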
static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
				 struct nlattr *nla, bool full_entry)
{
	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
	struct nlattr *nla_addr, *nla_port;
	struct nlattr *nla_addr_family;

	/* Parse mandatory identifying destination fields first */
	if (nla == NULL ||
	    nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL))
		return -EINVAL;

	nla_addr = attrs[IPVS_DEST_ATTR_ADDR];
	nla_port = attrs[IPVS_DEST_ATTR_PORT];
	nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY];

	if (!(nla_addr && nla_port))
		return -EINVAL;

	memset(udest, 0, sizeof(*udest));

	nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
	udest->port = nla_get_be16(nla_port);

	udest->af = nla_get_u16_default(nla_addr_family, 0);

	/* If a full entry was requested, check for the additional fields */
	if (full_entry) {
		struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
			      *nla_l_thresh, *nla_tun_type, *nla_tun_port,
			      *nla_tun_flags;

		nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
		nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
		nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
		nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
		nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE];
		nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT];
		nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS];

		if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
			return -EINVAL;

		udest->conn_flags = nla_get_u32(nla_fwd)
				    & IP_VS_CONN_F_FWD_MASK;
		udest->weight = nla_get_u32(nla_weight);
		udest->u_threshold = nla_get_u32(nla_u_thresh);
		udest->l_threshold = nla_get_u32(nla_l_thresh);

		if (nla_tun_type)
			udest->tun_type = nla_get_u8(nla_tun_type);

		if (nla_tun_port)
			udest->tun_port = nla_get_be16(nla_tun_port);

		if (nla_tun_flags)
			udest->tun_flags = nla_get_u16(nla_tun_flags);
	}

	return 0;
}

static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
				  struct ipvs_sync_daemon_cfg *c)
{
	struct nlattr *nl_daemon;

	nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON);
	if (!nl_daemon)
		return -EMSGSIZE;

	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
	    nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) ||
	    nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) ||
	    nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) ||
	    nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) ||
	    nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl))
		goto nla_put_failure;
#ifdef CONFIG_IP_VS_IPV6
	if (c->mcast_af == AF_INET6) {
		if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6,
				     &c->mcast_group.in6))
			goto nla_put_failure;
	} else
#endif
		if (c->mcast_af == AF_INET &&
		    nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP,
				    c->mcast_group.ip))
			goto nla_put_failure;
	nla_nest_end(skb, nl_daemon);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_daemon);
	return -EMSGSIZE;
}
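
/* IPVS_DEST_ATTR_ADDR_FAMILY is optional for backwards compatibility:
 * nla_get_u16_default() leaves udest->af at 0 when the attribute is
 * absent, and the command handler later substitutes the service's own
 * family.  Sketch of the fallback as applied there (illustrative only):
 *
 *	if (udest.af == 0)
 *		udest.af = svc->af;
 */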
static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
				  struct ipvs_sync_daemon_cfg *c,
				  struct netlink_callback *cb)
{
	void *hdr;

	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &ip_vs_genl_family, NLM_F_MULTI,
			  IPVS_CMD_NEW_DAEMON);
	if (!hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_daemon(skb, state, c))
		goto nla_put_failure;

	genlmsg_end(skb, hdr);
	return 0;

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}

static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
				   struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	mutex_lock(&ipvs->sync_mutex);
	if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
					   &ipvs->mcfg, cb) < 0)
			goto nla_put_failure;

		cb->args[0] = 1;
	}

	if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
					   &ipvs->bcfg, cb) < 0)
			goto nla_put_failure;

		cb->args[1] = 1;
	}

nla_put_failure:
	mutex_unlock(&ipvs->sync_mutex);

	return skb->len;
}

static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
	struct ipvs_sync_daemon_cfg c;
	struct nlattr *a;
	int ret;

	memset(&c, 0, sizeof(c));
	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
		return -EINVAL;
	strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
		sizeof(c.mcast_ifn));
	c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);

	a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN];
	if (a)
		c.sync_maxlen = nla_get_u16(a);

	a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP];
	if (a) {
		c.mcast_af = AF_INET;
		c.mcast_group.ip = nla_get_in_addr(a);
		if (!ipv4_is_multicast(c.mcast_group.ip))
			return -EINVAL;
	} else {
		a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6];
		if (a) {
#ifdef CONFIG_IP_VS_IPV6
			int addr_type;

			c.mcast_af = AF_INET6;
			c.mcast_group.in6 = nla_get_in6_addr(a);
			addr_type = ipv6_addr_type(&c.mcast_group.in6);
			if (!(addr_type & IPV6_ADDR_MULTICAST))
				return -EINVAL;
#else
			return -EAFNOSUPPORT;
#endif
		}
	}

	a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT];
	if (a)
		c.mcast_port = nla_get_u16(a);

	a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL];
	if (a)
		c.mcast_ttl = nla_get_u8(a);

	/* The synchronization protocol is incompatible with mixed family
	 * services
	 */
	if (ipvs->mixed_address_family_dests > 0)
		return -EINVAL;

	ret = start_sync_thread(ipvs, &c,
				nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
	return ret;
}

static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
	int ret;

	if (!attrs[IPVS_DAEMON_ATTR_STATE])
		return -EINVAL;

	ret = stop_sync_thread(ipvs,
			       nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
	return ret;
}

static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
	struct ip_vs_timeout_user t;

	__ip_vs_get_timeouts(ipvs, &t);

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
		t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
		t.tcp_fin_timeout =
			nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
		t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);

	return ip_vs_set_timeout(ipvs, &t);
}
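
/* ip_vs_genl_set_config() prefills the ip_vs_timeout_user structure
 * with the current timeouts before applying the attributes, so any
 * timeout the caller omits is left unchanged.  A request that only
 * changes the UDP timeout would carry a single attribute
 * (illustrative layout, not part of this file):
 *
 *	IPVS_CMD_SET_CONFIG
 *		IPVS_CMD_ATTR_TIMEOUT_UDP = 300
 */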
static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
{
	int ret = -EINVAL, cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
		struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];

		if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
		    nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack))
			goto out;

		if (cmd == IPVS_CMD_NEW_DAEMON)
			ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs);
		else
			ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs);
	}

out:
	return ret;
}

static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
{
	bool need_full_svc = false, need_full_dest = false;
	struct ip_vs_service *svc = NULL;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_dest_user_kern udest;
	int ret = 0, cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	mutex_lock(&ipvs->service_mutex);

	if (cmd == IPVS_CMD_FLUSH) {
		ret = ip_vs_flush(ipvs, false);
		goto out;
	} else if (cmd == IPVS_CMD_SET_CONFIG) {
		ret = ip_vs_genl_set_config(ipvs, info->attrs);
		goto out;
	} else if (cmd == IPVS_CMD_ZERO &&
		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
		ret = ip_vs_zero_all(ipvs);
		goto out;
	}

	/* All following commands require a service argument, so check if we
	 * received a valid one. We need a full service specification when
	 * adding / editing a service. Only identifying members otherwise.
	 */
	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
		need_full_svc = true;

	/* We use a function that requires RCU lock (hlist_bl) */
	rcu_read_lock();
	ret = ip_vs_genl_parse_service(ipvs, &usvc,
				       info->attrs[IPVS_CMD_ATTR_SERVICE],
				       need_full_svc, &svc);
	rcu_read_unlock();
	if (ret)
		goto out;

	/* Unless we're adding a new service, the service must already exist */
	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
		ret = -ESRCH;
		goto out;
	}

	/* Destination commands require a valid destination argument. For
	 * adding / editing a destination, we need a full destination
	 * specification.
	 */
	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
	    cmd == IPVS_CMD_DEL_DEST) {
		if (cmd != IPVS_CMD_DEL_DEST)
			need_full_dest = true;

		ret = ip_vs_genl_parse_dest(&udest,
					    info->attrs[IPVS_CMD_ATTR_DEST],
					    need_full_dest);
		if (ret)
			goto out;

		/* Old protocols did not allow the user to specify address
		 * family, so we set it to zero instead. We also didn't
		 * allow heterogeneous pools in the old code, so it's safe
		 * to assume that this will have the same address family as
		 * the service.
		 */
		if (udest.af == 0)
			udest.af = svc->af;

		if (!ip_vs_is_af_valid(udest.af)) {
			ret = -EAFNOSUPPORT;
			goto out;
		}

		if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) {
			/* The synchronization protocol is incompatible
			 * with mixed family services
			 */
			if (ipvs->sync_state) {
				ret = -EINVAL;
				goto out;
			}

			/* Which connection types do we support? */
			switch (udest.conn_flags) {
			case IP_VS_CONN_F_TUNNEL:
				/* We are able to forward this */
				break;
			default:
				ret = -EINVAL;
				goto out;
			}
		}
	}
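
	/* A destination of a different family than its service is only
	 * reachable by encapsulating the original packet, so anything
	 * other than tunnelling is rejected above.  A heterogeneous pool
	 * would therefore be configured with the tunnel forwarding method,
	 * roughly like this (illustrative only, addresses hypothetical):
	 *
	 *	ipvsadm -a -t 10.0.0.1:80 -r [2001:db8::1]:80 -i
	 */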
	switch (cmd) {
	case IPVS_CMD_NEW_SERVICE:
		if (svc == NULL)
			ret = ip_vs_add_service(ipvs, &usvc, &svc);
		else
			ret = -EEXIST;
		break;
	case IPVS_CMD_SET_SERVICE:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IPVS_CMD_DEL_SERVICE:
		ret = ip_vs_del_service(svc);
		/* do not use svc, it can be freed */
		break;
	case IPVS_CMD_NEW_DEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IPVS_CMD_SET_DEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IPVS_CMD_DEL_DEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	case IPVS_CMD_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	default:
		ret = -EINVAL;
	}

out:
	mutex_unlock(&ipvs->service_mutex);

	return ret;
}

static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
	struct sk_buff *msg;
	void *reply;
	int ret, cmd, reply_cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	if (cmd == IPVS_CMD_GET_SERVICE)
		reply_cmd = IPVS_CMD_NEW_SERVICE;
	else if (cmd == IPVS_CMD_GET_INFO)
		reply_cmd = IPVS_CMD_SET_INFO;
	else if (cmd == IPVS_CMD_GET_CONFIG)
		reply_cmd = IPVS_CMD_SET_CONFIG;
	else {
		pr_err("unknown Generic Netlink command\n");
		return -EINVAL;
	}

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	rcu_read_lock();

	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
	if (reply == NULL)
		goto nla_put_failure;

	switch (cmd) {
	case IPVS_CMD_GET_SERVICE:
	{
		struct ip_vs_service *svc;

		svc = ip_vs_genl_find_service(ipvs,
					      info->attrs[IPVS_CMD_ATTR_SERVICE]);
		if (IS_ERR(svc)) {
			ret = PTR_ERR(svc);
			goto out_err;
		} else if (svc) {
			ret = ip_vs_genl_fill_service(msg, svc);
			if (ret)
				goto nla_put_failure;
		} else {
			ret = -ESRCH;
			goto out_err;
		}

		break;
	}

	case IPVS_CMD_GET_CONFIG:
	{
		struct ip_vs_timeout_user t;

		__ip_vs_get_timeouts(ipvs, &t);
#ifdef CONFIG_IP_VS_PROTO_TCP
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
				t.tcp_timeout) ||
		    nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
				t.tcp_fin_timeout))
			goto nla_put_failure;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
			goto nla_put_failure;
#endif

		break;
	}

	case IPVS_CMD_GET_INFO:
		if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
				IP_VS_VERSION_CODE) ||
		    nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
				get_conn_tab_size(ipvs)))
			goto nla_put_failure;
		break;
	}

	genlmsg_end(msg, reply);
	ret = genlmsg_reply(msg, info);
	goto out;

nla_put_failure:
	pr_err("not enough space in Netlink message\n");
	ret = -EMSGSIZE;

out_err:
	nlmsg_free(msg);
out:
	rcu_read_unlock();

	return ret;
}
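
/* Each GET command replies with the matching NEW/SET command number so
 * existing attribute parsers can be reused for replies.  The table
 * below wires every command to either a doit handler (single
 * request/reply) or a dumpit handler (multi-part dump); a minimal
 * entry looks like this (illustrative only):
 *
 *	{
 *		.cmd	= IPVS_CMD_GET_DEST,
 *		.flags	= GENL_ADMIN_PERM,
 *		.dumpit	= ip_vs_genl_dump_dests,
 *	},
 */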
static const struct genl_small_ops ip_vs_genl_ops[] = {
	{
		.cmd = IPVS_CMD_NEW_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.doit = ip_vs_genl_set_cmd,
	},
	{
		.cmd = IPVS_CMD_SET_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.doit = ip_vs_genl_set_cmd,
	},
	{
		.cmd = IPVS_CMD_DEL_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.doit = ip_vs_genl_set_cmd,
	},
	{
		.cmd = IPVS_CMD_GET_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.doit = ip_vs_genl_get_cmd,
		.dumpit = ip_vs_genl_dump_services,
	},
	{
		.cmd = IPVS_CMD_NEW_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.doit = ip_vs_genl_set_cmd,
	},
	{
		.cmd = IPVS_CMD_SET_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.doit = ip_vs_genl_set_cmd,
	},
	{
		.cmd = IPVS_CMD_DEL_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.doit = ip_vs_genl_set_cmd,
	},
	{
		.cmd = IPVS_CMD_GET_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.dumpit = ip_vs_genl_dump_dests,
	},
	{
		.cmd = IPVS_CMD_NEW_DAEMON,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.doit = ip_vs_genl_set_daemon,
	},
	{
		.cmd = IPVS_CMD_DEL_DAEMON,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.doit = ip_vs_genl_set_daemon,
	},
	{
		.cmd = IPVS_CMD_GET_DAEMON,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.dumpit = ip_vs_genl_dump_daemons,
	},
	{
		.cmd = IPVS_CMD_SET_CONFIG,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.doit = ip_vs_genl_set_cmd,
	},
	{
		.cmd = IPVS_CMD_GET_CONFIG,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.doit = ip_vs_genl_get_cmd,
	},
	{
		.cmd = IPVS_CMD_GET_INFO,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.doit = ip_vs_genl_get_cmd,
	},
	{
		.cmd = IPVS_CMD_ZERO,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.doit = ip_vs_genl_set_cmd,
	},
	{
		.cmd = IPVS_CMD_FLUSH,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags = GENL_ADMIN_PERM,
		.doit = ip_vs_genl_set_cmd,
	},
};

static struct genl_family ip_vs_genl_family __ro_after_init = {
	.hdrsize	= 0,
	.name		= IPVS_GENL_NAME,
	.version	= IPVS_GENL_VERSION,
	.maxattr	= IPVS_CMD_ATTR_MAX,
	.policy		= ip_vs_cmd_policy,
	.netnsok	= true,		/* Make ipvsadm work on netns */
	.module		= THIS_MODULE,
	.small_ops	= ip_vs_genl_ops,
	.n_small_ops	= ARRAY_SIZE(ip_vs_genl_ops),
	.resv_start_op	= IPVS_CMD_FLUSH + 1,
	.parallel_ops	= 1,
};

static int __init ip_vs_genl_register(void)
{
	return genl_register_family(&ip_vs_genl_family);
}

static void ip_vs_genl_unregister(void)
{
	genl_unregister_family(&ip_vs_genl_family);
}
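
/* With .parallel_ops set, Generic Netlink does not serialize these
 * handlers behind a per-family mutex; the handlers above take the
 * locks they need themselves (service_mutex for configuration changes,
 * sync_mutex for the sync daemon state, RCU for lookups).
 */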
/* End of Generic Netlink interface definitions */

/*
 * per netns init/exit func.
 */
#ifdef CONFIG_SYSCTL
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
	struct net *net = ipvs->net;
	struct ctl_table *tbl;
	int idx, ret;
	size_t ctl_table_size = ARRAY_SIZE(vs_vars);
	bool unpriv = net->user_ns != &init_user_ns;

	atomic_set(&ipvs->dropentry, 0);
	spin_lock_init(&ipvs->dropentry_lock);
	spin_lock_init(&ipvs->droppacket_lock);
	spin_lock_init(&ipvs->securetcp_lock);
	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
	INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
			  expire_nodest_conn_handler);
	ipvs->est_stopped = 0;

	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
		if (tbl == NULL)
			return -ENOMEM;
	} else
		tbl = vs_vars;
	/* Initialize sysctl defaults */
	for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) {
		if (tbl[idx].proc_handler == proc_do_defense_mode)
			tbl[idx].extra2 = ipvs;
	}
	idx = 0;
	ipvs->sysctl_amemthresh = 1024;
	tbl[idx++].data = &ipvs->sysctl_amemthresh;
	ipvs->sysctl_am_droprate = 10;
	tbl[idx++].data = &ipvs->sysctl_am_droprate;
	tbl[idx++].data = &ipvs->sysctl_drop_entry;
	tbl[idx++].data = &ipvs->sysctl_drop_packet;
#ifdef CONFIG_IP_VS_NFCT
	tbl[idx++].data = &ipvs->sysctl_conntrack;
#endif
	tbl[idx++].data = &ipvs->sysctl_secure_tcp;
	ipvs->sysctl_snat_reroute = 1;
	tbl[idx++].data = &ipvs->sysctl_snat_reroute;
	ipvs->sysctl_sync_ver = 1;
	tbl[idx++].data = &ipvs->sysctl_sync_ver;
	ipvs->sysctl_sync_ports = 1;
	tbl[idx++].data = &ipvs->sysctl_sync_ports;
	tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;

	ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;

	ipvs->sysctl_sync_sock_size = 0;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx++].data = &ipvs->sysctl_sync_sock_size;

	tbl[idx++].data = &ipvs->sysctl_cache_bypass;
	tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
	tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
	tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
	tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
	tbl[idx].data = &ipvs->sysctl_sync_threshold;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
	ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
	tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
	ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
	tbl[idx++].data = &ipvs->sysctl_sync_retries;
	tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
	ipvs->sysctl_pmtu_disc = 1;
	tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
	tbl[idx++].data = &ipvs->sysctl_backup_only;
	ipvs->sysctl_conn_reuse_mode = 1;
	tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
	tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
	tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;

	ipvs->sysctl_run_estimation = 1;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_run_estimation;
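
	/* Sysctls that can affect resources shared beyond this netns
	 * (sync queue length, socket sizes, estimator settings) are
	 * exported read-only when the netns belongs to an unprivileged
	 * user namespace: "unpriv" only downgrades tbl[idx].mode to
	 * 0444, the kernel defaults initialized here still apply.
	 */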
	ipvs->est_cpulist_valid = 0;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_est_cpulist;

	ipvs->sysctl_est_nice = IPVS_EST_NICE;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_est_nice;

	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_conn_lfactor;

	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_svc_lfactor;

#ifdef CONFIG_IP_VS_DEBUG
	/* Global sysctls must be ro in non-init netns */
	if (!net_eq(net, &init_net))
		tbl[idx++].mode = 0444;
#endif

	ret = -ENOMEM;
	ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl,
						  ctl_table_size);
	if (!ipvs->sysctl_hdr)
		goto err;
	ipvs->sysctl_tbl = tbl;

	ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
	if (ret < 0)
		goto err;

	/* Schedule defense work */
	queue_delayed_work(system_long_wq, &ipvs->defense_work,
			   DEFENSE_TIMER_PERIOD);

	return 0;

err:
	unregister_net_sysctl_table(ipvs->sysctl_hdr);
	if (!net_eq(net, &init_net))
		kfree(tbl);
	return ret;
}
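
/* Both failure paths above share the err label: to the best of my
 * reading, unregister_net_sysctl_table() tolerates the NULL header
 * left behind when register_net_sysctl_sz() itself failed, so no
 * separate label is needed before the kfree().
 */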
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
{
	struct net *net = ipvs->net;

	cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work);
	cancel_delayed_work_sync(&ipvs->defense_work);
	cancel_work_sync(&ipvs->defense_work.work);
	unregister_net_sysctl_table(ipvs->sysctl_hdr);
	if (ipvs->tot_stats->s.est.ktid != -2) {
		/* Not stopped yet? This happens only on netns init error
		 * and we do not even need to lock the service_mutex for
		 * this case.
		 */
		mutex_lock(&ipvs->service_mutex);
		ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
		mutex_unlock(&ipvs->service_mutex);
	}

	if (ipvs->est_cpulist_valid)
		free_cpumask_var(ipvs->sysctl_est_cpulist);

	if (!net_eq(net, &init_net))
		kfree(ipvs->sysctl_tbl);
}

#else

static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; }
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { }

#endif

static struct notifier_block ip_vs_dst_notifier = {
	.notifier_call = ip_vs_dst_event,
#ifdef CONFIG_IP_VS_IPV6
	.priority = ADDRCONF_NOTIFY_PRIORITY + 5,
#endif
};

int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
	int ret = -ENOMEM;
	int idx;

	/* Initialize service_mutex, svc_table per netns */
	__mutex_init(&ipvs->service_mutex, "ipvs->service_mutex",
		     &__ipvs_service_key);
	init_rwsem(&ipvs->svc_resize_sem);
	INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler);
	atomic_set(&ipvs->svc_table_changes, 0);
	RCU_INIT_POINTER(ipvs->svc_table, NULL);

	/* Initialize rs_table */
	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
		INIT_HLIST_HEAD(&ipvs->rs_table[idx]);

	INIT_LIST_HEAD(&ipvs->dest_trash);
	spin_lock_init(&ipvs->dest_trash_lock);
	timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0);
	for (idx = 0; idx < IP_VS_AF_MAX; idx++) {
		atomic_set(&ipvs->num_services[idx], 0);
		atomic_set(&ipvs->fwm_services[idx], 0);
		atomic_set(&ipvs->nonfwm_services[idx], 0);
		atomic_set(&ipvs->ftpsvc_counter[idx], 0);
		atomic_set(&ipvs->nullsvc_counter[idx], 0);
		atomic_set(&ipvs->conn_out_counter[idx], 0);
	}

	INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
	ipvs->sysctl_svc_lfactor = ip_vs_svc_default_load_factor(ipvs);

	/* procfs stats */
	ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL);
	if (!ipvs->tot_stats)
		goto out;
	if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0)
		goto err_tot_stats;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
			     &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)))
		goto err_vs;
	if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
				    ip_vs_stats_show, NULL))
		goto err_stats;
	if (!proc_create_net_single("ip_vs_stats_percpu", 0,
				    ipvs->net->proc_net,
				    ip_vs_stats_percpu_show, NULL))
		goto err_percpu;
	if (!proc_create_net_single("ip_vs_status", 0440, ipvs->net->proc_net,
				    ip_vs_status_show, NULL))
		goto err_status;
#endif

	ret = ip_vs_control_net_init_sysctl(ipvs);
	if (ret < 0)
		goto err;

	return 0;

err:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_vs_status", ipvs->net->proc_net);

err_status:
	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);

err_percpu:
	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);

err_stats:
	remove_proc_entry("ip_vs", ipvs->net->proc_net);

err_vs:
#endif
	ip_vs_stats_release(&ipvs->tot_stats->s);

err_tot_stats:
	kfree(ipvs->tot_stats);

out:
	return ret;
}
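
/* The error unwind above releases resources in the reverse order of
 * their creation, with one label per proc entry, so a failure at any
 * step removes exactly the entries created before it.  The netns exit
 * path below repeats the same removals unconditionally.
 */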
void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
	ip_vs_trash_cleanup(ipvs);
	ip_vs_control_net_cleanup_sysctl(ipvs);
	cancel_delayed_work_sync(&ipvs->est_reload_work);
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_vs_status", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
	remove_proc_entry("ip_vs", ipvs->net->proc_net);
#endif
	call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free);
}

int __init ip_vs_register_nl_ioctl(void)
{
	int ret;

	ret = nf_register_sockopt(&ip_vs_sockopts);
	if (ret) {
		pr_err("cannot register sockopt.\n");
		goto err_sock;
	}

	ret = ip_vs_genl_register();
	if (ret) {
		pr_err("cannot register Generic Netlink interface.\n");
		goto err_genl;
	}
	return 0;

err_genl:
	nf_unregister_sockopt(&ip_vs_sockopts);
err_sock:
	return ret;
}

void ip_vs_unregister_nl_ioctl(void)
{
	ip_vs_genl_unregister();
	nf_unregister_sockopt(&ip_vs_sockopts);
}

int __init ip_vs_control_init(void)
{
	int ret;

	ret = register_netdevice_notifier(&ip_vs_dst_notifier);
	if (ret < 0)
		return ret;

	return 0;
}

void ip_vs_control_cleanup(void)
{
	unregister_netdevice_notifier(&ip_vs_dst_notifier);
	/* relying on common rcu_barrier() in ip_vs_cleanup() */
}