1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * IPVS An implementation of the IP virtual server support for the
4 * LINUX operating system. IPVS is now implemented as a module
5 * over the NetFilter framework. IPVS can be used to build a
6 * high-performance and highly available server based on a
7 * cluster of servers.
8 *
9 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
10 * Peter Kese <peter.kese@ijs.si>
11 * Julian Anastasov <ja@ssi.bg>
12 *
13 * Changes:
14 */
15
16 #define pr_fmt(fmt) "IPVS: " fmt
17
18 #include <linux/module.h>
19 #include <linux/init.h>
20 #include <linux/types.h>
21 #include <linux/capability.h>
22 #include <linux/fs.h>
23 #include <linux/sysctl.h>
24 #include <linux/proc_fs.h>
25 #include <linux/workqueue.h>
26 #include <linux/seq_file.h>
27 #include <linux/slab.h>
28
29 #include <linux/netfilter.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/mutex.h>
32 #include <linux/rcupdate_wait.h>
33
34 #include <net/net_namespace.h>
35 #include <linux/nsproxy.h>
36 #include <net/ip.h>
37 #ifdef CONFIG_IP_VS_IPV6
38 #include <net/ipv6.h>
39 #include <net/ip6_route.h>
40 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
41 #endif
42 #include <net/route.h>
43 #include <net/sock.h>
44 #include <net/genetlink.h>
45
46 #include <linux/uaccess.h>
47
48 #include <net/ip_vs.h>
49
/* Module alias keyed by the IPVS generic netlink family name */
MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME);

/* NOTE(review): lock class key, presumably used to give service locks their
 * own lockdep class; its users are not visible in this part of the file -
 * confirm.
 */
static struct lock_class_key __ipvs_service_key;
53
54 /* sysctl variables */
55
56 #ifdef CONFIG_IP_VS_DEBUG
/* Debug verbosity of IPVS; static storage already starts out as zero */
static int sysctl_ip_vs_debug_level;

/* Report the debug level that is currently configured */
int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
63 #endif
64
65
66 /* Protos */
67 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
68
69
70 #ifdef CONFIG_IP_VS_IPV6
71 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
__ip_vs_addr_is_local_v6(struct net * net,const struct in6_addr * addr)72 static bool __ip_vs_addr_is_local_v6(struct net *net,
73 const struct in6_addr *addr)
74 {
75 struct flowi6 fl6 = {
76 .daddr = *addr,
77 };
78 struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
79 bool is_local;
80
81 is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
82
83 dst_release(dst);
84 return is_local;
85 }
86 #endif
87
88 #ifdef CONFIG_SYSCTL
89 /*
90 * update_defense_level is called from keventd and from sysctl,
91 * so it needs to protect itself from softirqs
92 */
update_defense_level(struct netns_ipvs * ipvs)93 static void update_defense_level(struct netns_ipvs *ipvs)
94 {
95 struct sysinfo i;
96 int availmem;
97 int amemthresh;
98 int nomem;
99 int to_change = -1;
100
101 /* we only count free and buffered memory (in pages) */
102 si_meminfo(&i);
103 availmem = i.freeram + i.bufferram;
104 /* however in linux 2.5 the i.bufferram is total page cache size,
105 we need adjust it */
106 /* si_swapinfo(&i); */
107 /* availmem = availmem - (i.totalswap - i.freeswap); */
108
109 amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0);
110 nomem = (availmem < amemthresh);
111
112 local_bh_disable();
113
114 /* drop_entry */
115 spin_lock(&ipvs->dropentry_lock);
116 switch (ipvs->sysctl_drop_entry) {
117 case 0:
118 atomic_set(&ipvs->dropentry, 0);
119 break;
120 case 1:
121 if (nomem) {
122 atomic_set(&ipvs->dropentry, 1);
123 ipvs->sysctl_drop_entry = 2;
124 } else {
125 atomic_set(&ipvs->dropentry, 0);
126 }
127 break;
128 case 2:
129 if (nomem) {
130 atomic_set(&ipvs->dropentry, 1);
131 } else {
132 atomic_set(&ipvs->dropentry, 0);
133 ipvs->sysctl_drop_entry = 1;
134 }
135 break;
136 case 3:
137 atomic_set(&ipvs->dropentry, 1);
138 break;
139 }
140 spin_unlock(&ipvs->dropentry_lock);
141
142 /* drop_packet */
143 spin_lock(&ipvs->droppacket_lock);
144 switch (ipvs->sysctl_drop_packet) {
145 case 0:
146 ipvs->drop_rate = 0;
147 break;
148 case 1:
149 if (nomem) {
150 ipvs->drop_counter = amemthresh / (amemthresh - availmem);
151 ipvs->drop_rate = ipvs->drop_counter;
152 ipvs->sysctl_drop_packet = 2;
153 } else {
154 ipvs->drop_rate = 0;
155 }
156 break;
157 case 2:
158 if (nomem) {
159 ipvs->drop_counter = amemthresh / (amemthresh - availmem);
160 ipvs->drop_rate = ipvs->drop_counter;
161 } else {
162 ipvs->drop_rate = 0;
163 ipvs->sysctl_drop_packet = 1;
164 }
165 break;
166 case 3:
167 ipvs->drop_rate = ipvs->sysctl_am_droprate;
168 break;
169 }
170 spin_unlock(&ipvs->droppacket_lock);
171
172 /* secure_tcp */
173 spin_lock(&ipvs->securetcp_lock);
174 switch (ipvs->sysctl_secure_tcp) {
175 case 0:
176 if (ipvs->old_secure_tcp >= 2)
177 to_change = 0;
178 break;
179 case 1:
180 if (nomem) {
181 if (ipvs->old_secure_tcp < 2)
182 to_change = 1;
183 ipvs->sysctl_secure_tcp = 2;
184 } else {
185 if (ipvs->old_secure_tcp >= 2)
186 to_change = 0;
187 }
188 break;
189 case 2:
190 if (nomem) {
191 if (ipvs->old_secure_tcp < 2)
192 to_change = 1;
193 } else {
194 if (ipvs->old_secure_tcp >= 2)
195 to_change = 0;
196 ipvs->sysctl_secure_tcp = 1;
197 }
198 break;
199 case 3:
200 if (ipvs->old_secure_tcp < 2)
201 to_change = 1;
202 break;
203 }
204 ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp;
205 if (to_change >= 0)
206 ip_vs_protocol_timeout_change(ipvs,
207 ipvs->sysctl_secure_tcp > 1);
208 spin_unlock(&ipvs->securetcp_lock);
209
210 local_bh_enable();
211 }
212
213 /* Handler for delayed work for expiring no
214 * destination connections
215 */
expire_nodest_conn_handler(struct work_struct * work)216 static void expire_nodest_conn_handler(struct work_struct *work)
217 {
218 struct netns_ipvs *ipvs;
219
220 ipvs = container_of(work, struct netns_ipvs,
221 expire_nodest_conn_work.work);
222 ip_vs_expire_nodest_conn_flush(ipvs);
223 }
224
/*
 *	Timer for checking the defense: period of one second, in jiffies.
 *	The expansion is parenthesized so that it stays a single operand
 *	when the macro is used inside a larger expression.
 */
#define DEFENSE_TIMER_PERIOD	(1 * HZ)
229
defense_work_handler(struct work_struct * work)230 static void defense_work_handler(struct work_struct *work)
231 {
232 struct netns_ipvs *ipvs =
233 container_of(work, struct netns_ipvs, defense_work.work);
234
235 update_defense_level(ipvs);
236 if (atomic_read(&ipvs->dropentry))
237 ip_vs_random_dropentry(ipvs);
238 queue_delayed_work(system_long_wq, &ipvs->defense_work,
239 DEFENSE_TIMER_PERIOD);
240 }
241 #endif
242
/* Delayed work that reloads the estimator kthreads.
 *
 * Walks est_kt_arr[] under est_mutex. When est_genid differs from the value
 * of est_genid_done sampled on entry, the kthread behind every present slot
 * is stopped first. Slots without a task are then started unless estimation
 * is stopped. If a start fails, the work queues itself again after a short
 * delay. As soon as ipvs->enable is seen cleared, the walk is abandoned
 * without updating est_genid_done.
 */
static void est_reload_work_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs =
		container_of(work, struct netns_ipvs, est_reload_work.work);
	/* Sampled before taking est_mutex */
	int genid_done = atomic_read(&ipvs->est_genid_done);
	unsigned long delay = HZ / 10;	/* repeat startups after failure */
	bool repeat = false;
	int genid;
	int id;

	mutex_lock(&ipvs->est_mutex);
	genid = atomic_read(&ipvs->est_genid);
	for (id = 0; id < ipvs->est_kt_count; id++) {
		struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];

		/* netns clean up started, abort delayed work */
		if (!READ_ONCE(ipvs->enable))
			goto unlock;
		/* Empty slot, nothing to stop or start */
		if (!kd)
			continue;
		/* New config ? Stop kthread tasks */
		if (genid != genid_done) {
			if (!id) {
				/* Only we can stop kt 0 but not under mutex */
				mutex_unlock(&ipvs->est_mutex);
				ip_vs_est_kthread_stop(kd);
				mutex_lock(&ipvs->est_mutex);
				/* Recheck, the mutex was dropped meanwhile */
				if (!READ_ONCE(ipvs->enable))
					goto unlock;
				/* kd for kt 0 is never destroyed */
			} else {
				ip_vs_est_kthread_stop(kd);
			}
		}
		if (!kd->task && !ip_vs_est_stopped(ipvs)) {
			bool start;

			/* Do not start kthreads above 0 in calc phase */
			if (id)
				start = !ipvs->est_calc_phase;
			else
				start = kd->needed;
			/* A failed start is retried by a later run */
			if (start && ip_vs_est_kthread_start(ipvs, kd) < 0)
				repeat = true;
		}
	}

	/* Remember the generation that was just processed */
	atomic_set(&ipvs->est_genid_done, genid);

	if (repeat)
		queue_delayed_work(system_long_wq, &ipvs->est_reload_work,
				   delay);

unlock:
	mutex_unlock(&ipvs->est_mutex);
}
299
get_conn_tab_size(struct netns_ipvs * ipvs)300 static int get_conn_tab_size(struct netns_ipvs *ipvs)
301 {
302 const struct ip_vs_rht *t;
303 int size = 0;
304
305 rcu_read_lock();
306 t = rcu_dereference(ipvs->conn_tab);
307 if (t)
308 size = t->size;
309 rcu_read_unlock();
310
311 return size;
312 }
313
314 int
ip_vs_use_count_inc(void)315 ip_vs_use_count_inc(void)
316 {
317 return try_module_get(THIS_MODULE);
318 }
319
320 void
ip_vs_use_count_dec(void)321 ip_vs_use_count_dec(void)
322 {
323 module_put(THIS_MODULE);
324 }
325
326
327 /* Service hashing:
328 * Operation Locking order
329 * ---------------------------------------------------------------------------
330 * add first table service_mutex
331 * attach new table service_mutex
332 * add/del service service_mutex, RCU, bit lock
333 * move between tables (rehash) svc_resize_sem(W), seqcount_t(W), bit lock
334 * replace old with attached svc_resize_sem(W), svc_replace_sem(W)
335 * find service RCU, seqcount_t(R)
336 * walk services(blocking) service_mutex, svc_resize_sem(R)
337 * walk services(non-blocking) RCU, seqcount_t(R)
338 * walk services(non-blocking) svc_resize_sem(R), RCU, seqcount_t(R)
339 * walk services(non-blocking) svc_replace_sem(R), RCU, seqcount_t(R)
340 * del table service_mutex after stopped work
341 *
342 * - new table is attached on resizing under service_mutex and all operations
343 * can run in parallel in 2 tables until the new table is registered as current
344 * one
345 * - two contexts can modify buckets: config and table resize (work), both in
346 * process context
347 * - only table resizer can move entries, so we do not protect t->seqc[]
348 * items with t->lock[]
349 * - lookups occur under RCU lock and seqcount reader lock to detect if
350 * services are moved to new table
351 * - move operations may disturb readers: find operation will not miss entries
352 * but walkers may see same entry twice if they are forced to retry chains
353 * or to walk the newly attached second table
354 * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to check
355 * svc_table_changes and repeat the RCU read section if new table is installed
356 * - walkers may serialize with the whole resizing process (svc_resize_sem)
357 * to prevent seeing same service twice or just with the svc_table
358 * replace (svc_replace_sem) when we can see entries twice but we
359 * prefer to run concurrently with the rehashing.
360 */
361
362 /*
363 * Returns hash value for virtual service
364 */
365 static inline u32
ip_vs_svc_hashval(struct ip_vs_rht * t,int af,unsigned int proto,const union nf_inet_addr * addr,__be16 port)366 ip_vs_svc_hashval(struct ip_vs_rht *t, int af, unsigned int proto,
367 const union nf_inet_addr *addr, __be16 port)
368 {
369 return ip_vs_rht_hash_linfo(t, af, addr, ntohs(port), proto);
370 }
371
372 /*
373 * Returns hash value of fwmark for virtual service lookup
374 */
ip_vs_svc_fwm_hashval(struct ip_vs_rht * t,int af,__u32 fwmark)375 static inline u32 ip_vs_svc_fwm_hashval(struct ip_vs_rht *t, int af,
376 __u32 fwmark)
377 {
378 return jhash_2words(fwmark, af, (u32)t->hash_key.key[0]);
379 }
380
381 /* Hashes a service in the svc_table by <proto,addr,port> or by fwmark */
ip_vs_svc_hash(struct ip_vs_service * svc)382 static int ip_vs_svc_hash(struct ip_vs_service *svc)
383 {
384 struct netns_ipvs *ipvs = svc->ipvs;
385 struct hlist_bl_head *head;
386 struct ip_vs_rht *t;
387 u32 hash;
388
389 if (svc->flags & IP_VS_SVC_F_HASHED) {
390 pr_err("%s(): request for already hashed, called from %pS\n",
391 __func__, __builtin_return_address(0));
392 return 0;
393 }
394
395 /* increase its refcnt because it is referenced by the svc table */
396 atomic_inc(&svc->refcnt);
397
398 /* We know if new table is attached under service_mutex but rely on
399 * RCU to hold the old table to be freed in resizer
400 */
401 rcu_read_lock();
402
403 /* This can be the old or the new table */
404 t = rcu_dereference(ipvs->svc_table);
405
406 /* New entries go into recent table */
407 t = rcu_dereference(t->new_tbl);
408
409 if (svc->fwmark == 0) {
410 /*
411 * Hash it by <protocol,addr,port>
412 */
413 hash = ip_vs_svc_hashval(t, svc->af, svc->protocol,
414 &svc->addr, svc->port);
415 } else {
416 /*
417 * Hash it by fwmark
418 */
419 hash = ip_vs_svc_fwm_hashval(t, svc->af, svc->fwmark);
420 }
421 head = t->buckets + (hash & t->mask);
422 hlist_bl_lock(head);
423 WRITE_ONCE(svc->hash_key, ip_vs_rht_build_hash_key(t, hash));
424 svc->flags |= IP_VS_SVC_F_HASHED;
425 hlist_bl_add_head_rcu(&svc->s_list, head);
426 hlist_bl_unlock(head);
427
428 rcu_read_unlock();
429
430 return 1;
431 }
432
433
/*
 *	Unhashes a service from svc_table.
 *	Should be called with locked tables.
 *
 *	The service may be moved to a newly attached table while we run, so
 *	the bucket to lock is derived from svc->hash_key and the key is read
 *	again once the bucket lock is held.
 *
 *	Returns 1 when the service was removed (its table reference is
 *	dropped), 0 (after logging an error) when it is not flagged as hashed.
 */
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
{
	struct netns_ipvs *ipvs = svc->ipvs;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t;
	u32 hash_key2;
	u32 hash_key;

	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
		pr_err("%s(): request for unhash flagged, called from %pS\n",
		       __func__, __builtin_return_address(0));
		return 0;
	}

	/* We know if new table is attached under service_mutex but rely on
	 * RCU to hold the old table to be freed in resizer
	 */
	rcu_read_lock();

	/* This can be the old or the new table */
	t = rcu_dereference(ipvs->svc_table);
	hash_key = READ_ONCE(svc->hash_key);
	/* We need to lock the bucket in the right table */
	if (ip_vs_rht_same_table(t, hash_key)) {
		head = t->buckets + (hash_key & t->mask);
		hlist_bl_lock(head);
		/* Ensure hash_key is read under lock */
		hash_key2 = READ_ONCE(svc->hash_key);
		/* Moved to new table ? */
		if (hash_key != hash_key2) {
			/* Switch to the bucket of the new table instead */
			hlist_bl_unlock(head);
			t = rcu_dereference(t->new_tbl);
			head = t->buckets + (hash_key2 & t->mask);
			hlist_bl_lock(head);
		}
	} else {
		/* It is already moved to new table */
		t = rcu_dereference(t->new_tbl);
		head = t->buckets + (hash_key & t->mask);
		hlist_bl_lock(head);
	}
	/* Remove it from svc_table */
	hlist_bl_del_rcu(&svc->s_list);

	svc->flags &= ~IP_VS_SVC_F_HASHED;
	/* Drop the reference that was taken when the service was hashed */
	atomic_dec(&svc->refcnt);
	hlist_bl_unlock(head);

	rcu_read_unlock();
	return 1;
}
489
490
491 /*
492 * Get service by {netns, proto,addr,port} in the service table.
493 */
494 static inline struct ip_vs_service *
__ip_vs_service_find(struct netns_ipvs * ipvs,int af,__u16 protocol,const union nf_inet_addr * vaddr,__be16 vport)495 __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol,
496 const union nf_inet_addr *vaddr, __be16 vport)
497 {
498 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
499 struct hlist_bl_head *head;
500 struct ip_vs_service *svc;
501 struct ip_vs_rht *t, *p;
502 struct hlist_bl_node *e;
503 u32 hash, hash_key;
504
505 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
506 /* Check for "full" addressed entries */
507 hash = ip_vs_svc_hashval(t, af, protocol, vaddr, vport);
508
509 hash_key = ip_vs_rht_build_hash_key(t, hash);
510 ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
511 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
512 if (READ_ONCE(svc->hash_key) == hash_key &&
513 svc->af == af &&
514 ip_vs_addr_equal(af, &svc->addr, vaddr) &&
515 svc->port == vport &&
516 svc->protocol == protocol && !svc->fwmark) {
517 /* HIT */
518 return svc;
519 }
520 }
521 }
522 }
523
524 return NULL;
525 }
526
527
528 /*
529 * Get service by {fwmark} in the service table.
530 */
531 static inline struct ip_vs_service *
__ip_vs_svc_fwm_find(struct netns_ipvs * ipvs,int af,__u32 fwmark)532 __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark)
533 {
534 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
535 struct hlist_bl_head *head;
536 struct ip_vs_service *svc;
537 struct ip_vs_rht *t, *p;
538 struct hlist_bl_node *e;
539 u32 hash, hash_key;
540
541 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
542 /* Check for fwmark addressed entries */
543 hash = ip_vs_svc_fwm_hashval(t, af, fwmark);
544
545 hash_key = ip_vs_rht_build_hash_key(t, hash);
546 ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
547 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
548 if (READ_ONCE(svc->hash_key) == hash_key &&
549 svc->fwmark == fwmark && svc->af == af) {
550 /* HIT */
551 return svc;
552 }
553 }
554 }
555 }
556
557 return NULL;
558 }
559
560 /* Find service, called under RCU lock */
561 struct ip_vs_service *
ip_vs_service_find(struct netns_ipvs * ipvs,int af,__u32 fwmark,__u16 protocol,const union nf_inet_addr * vaddr,__be16 vport)562 ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol,
563 const union nf_inet_addr *vaddr, __be16 vport)
564 {
565 struct ip_vs_service *svc = NULL;
566 int af_id = ip_vs_af_index(af);
567
568 /*
569 * Check the table hashed by fwmark first
570 */
571 if (fwmark && atomic_read(&ipvs->fwm_services[af_id])) {
572 svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark);
573 if (svc)
574 goto out;
575 }
576
577 if (!atomic_read(&ipvs->nonfwm_services[af_id]))
578 goto out;
579
580 /*
581 * Check the table hashed by <protocol,addr,port>
582 * for "full" addressed entries
583 */
584 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
585 if (svc)
586 goto out;
587
588 if (protocol == IPPROTO_TCP &&
589 atomic_read(&ipvs->ftpsvc_counter[af_id]) &&
590 (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) {
591 /*
592 * Check if ftp service entry exists, the packet
593 * might belong to FTP data connections.
594 */
595 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT);
596 if (svc)
597 goto out;
598 }
599
600 if (atomic_read(&ipvs->nullsvc_counter[af_id])) {
601 /*
602 * Check if the catch-all port (port zero) exists
603 */
604 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0);
605 }
606
607 out:
608 IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
609 fwmark, ip_vs_proto_name(protocol),
610 IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
611 svc ? "hit" : "not hit");
612
613 return svc;
614 }
615
616 /* Return the number of registered services */
ip_vs_get_num_services(struct netns_ipvs * ipvs)617 static int ip_vs_get_num_services(struct netns_ipvs *ipvs)
618 {
619 int ns = 0, ni = IP_VS_AF_MAX;
620
621 while (--ni >= 0)
622 ns += atomic_read(&ipvs->num_services[ni]);
623 return ns;
624 }
625
626 /* Get default load factor to map num_services/u_thresh to t->size */
ip_vs_svc_default_load_factor(struct netns_ipvs * ipvs)627 static int ip_vs_svc_default_load_factor(struct netns_ipvs *ipvs)
628 {
629 int factor;
630
631 if (net_eq(ipvs->net, &init_net))
632 factor = -3; /* grow if load is above 12.5% */
633 else
634 factor = -2; /* grow if load is above 25% */
635 return factor;
636 }
637
638 /* Get the desired svc_table size */
ip_vs_svc_desired_size(struct netns_ipvs * ipvs,struct ip_vs_rht * t,int lfactor)639 static int ip_vs_svc_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t,
640 int lfactor)
641 {
642 return ip_vs_rht_desired_size(ipvs, t, ip_vs_get_num_services(ipvs),
643 lfactor, IP_VS_SVC_TAB_MIN_BITS,
644 IP_VS_SVC_TAB_MAX_BITS);
645 }
646
647 /* Allocate svc_table */
ip_vs_svc_table_alloc(struct netns_ipvs * ipvs,int buckets,int lfactor)648 static struct ip_vs_rht *ip_vs_svc_table_alloc(struct netns_ipvs *ipvs,
649 int buckets, int lfactor)
650 {
651 struct ip_vs_rht *t;
652 int scounts, locks;
653
654 /* No frequent lookups to race with resizing, so use max of 64
655 * seqcounts. Only resizer moves entries, so use 0 locks.
656 */
657 scounts = clamp(buckets >> 4, 1, 64);
658 locks = 0;
659
660 t = ip_vs_rht_alloc(buckets, scounts, locks);
661 if (!t)
662 return NULL;
663 t->lfactor = lfactor;
664 ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_SVC_TAB_MIN_BITS,
665 IP_VS_SVC_TAB_MAX_BITS);
666 return t;
667 }
668
/* svc_table resizer work
 *
 * Runs only if both svc_resize_sem (for write) and service_mutex can be
 * taken without blocking; otherwise the work queues itself again.
 * When the desired size or the load factor differs from the current table,
 * a new table is allocated and attached as t->new_tbl, service_mutex is
 * released and the services are moved bucket by bucket into the new table.
 * Finally, under svc_replace_sem, the new table is installed as
 * ipvs->svc_table and the old table is freed after synchronize_rcu().
 * Moving and installing are abandoned if IP_VS_WORK_SVC_NORESIZE is set.
 */
static void svc_resize_work_handler(struct work_struct *work)
{
	struct hlist_bl_head *head, *head2;
	struct ip_vs_rht *t_free = NULL;
	unsigned int resched_score = 0;
	struct hlist_bl_node *cn, *nn;
	struct ip_vs_rht *t, *t_new;
	struct ip_vs_service *svc;
	struct netns_ipvs *ipvs;
	/* Stays true until both locks are taken, so that a failed trylock
	 * leads to another run
	 */
	bool more_work = true;
	seqcount_t *sc;
	int limit = 0;
	int new_size;
	int lfactor;
	u32 bucket;

	ipvs = container_of(work, struct netns_ipvs, svc_resize_work.work);

	if (!down_write_trylock(&ipvs->svc_resize_sem))
		goto out;
	if (!mutex_trylock(&ipvs->service_mutex))
		goto unlock_sem;
	more_work = false;
	clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags);
	if (!READ_ONCE(ipvs->enable))
		goto unlock_m;
	t = rcu_dereference_protected(ipvs->svc_table, 1);
	/* Do nothing if table is removed */
	if (!t)
		goto unlock_m;
	/* New table already attached? BUG! */
	if (t != rcu_access_pointer(t->new_tbl))
		goto unlock_m;

	lfactor = sysctl_svc_lfactor(ipvs);
	/* Should we resize ? */
	new_size = ip_vs_svc_desired_size(ipvs, t, lfactor);
	if (new_size == t->size && lfactor == t->lfactor)
		goto unlock_m;

	t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
	if (!t_new) {
		/* Allocation failed, try again in a later run */
		more_work = true;
		goto unlock_m;
	}
	/* Flip the table_id */
	t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK;

	/* Attach new table */
	rcu_assign_pointer(t->new_tbl, t_new);
	/* Allow add/del to new_tbl while moving from old table */
	mutex_unlock(&ipvs->service_mutex);

	ip_vs_rht_for_each_bucket(t, bucket, head) {
same_bucket:
		/* limit counts visited buckets and moved entries; every
		 * time it reaches 16 we look for a stop request and
		 * possibly reschedule
		 */
		if (++limit >= 16) {
			/* Check if work is stopped */
			if (test_bit(IP_VS_WORK_SVC_NORESIZE,
				     &ipvs->work_flags))
				goto unlock_sem;
			/* Reschedule only after a long run of empty buckets */
			if (resched_score >= 100) {
				resched_score = 0;
				cond_resched();
			}
			limit = 0;
		}
		if (hlist_bl_empty(head)) {
			resched_score++;
			continue;
		}
		/* Preemption calls ahead... */
		resched_score = 0;

		sc = &t->seqc[bucket & t->seqc_mask];
		/* seqcount_t usage considering PREEMPT_RT rules:
		 * - we are the only writer => preemption can be allowed
		 * - readers (SoftIRQ) => disable BHs
		 * - readers (processes) => preemption should be disabled
		 */
		local_bh_disable();
		preempt_disable_nested();
		write_seqcount_begin(sc);
		hlist_bl_lock(head);

		hlist_bl_for_each_entry_safe(svc, cn, nn, head, s_list) {
			u32 hash;

			/* New hash for the new table */
			if (svc->fwmark == 0) {
				/*  Hash it by <protocol,addr,port> */
				hash = ip_vs_svc_hashval(t_new, svc->af,
							 svc->protocol,
							 &svc->addr, svc->port);
			} else {
				/* Hash it by fwmark */
				hash = ip_vs_svc_fwm_hashval(t_new, svc->af,
							     svc->fwmark);
			}
			hlist_bl_del_rcu(&svc->s_list);
			head2 = t_new->buckets + (hash & t_new->mask);

			hlist_bl_lock(head2);
			WRITE_ONCE(svc->hash_key,
				   ip_vs_rht_build_hash_key(t_new, hash));
			/* t_new->seqc are not used at this stage, we race
			 * only with add/del, so only lock the bucket.
			 */
			hlist_bl_add_head_rcu(&svc->s_list, head2);
			hlist_bl_unlock(head2);
			/* Too long chain? Do it in steps */
			if (++limit >= 64)
				break;
		}

		hlist_bl_unlock(head);
		write_seqcount_end(sc);
		preempt_enable_nested();
		local_bh_enable();
		/* Chain was cut short, continue with the same bucket */
		if (limit >= 64)
			goto same_bucket;
	}

	/* Serialize with readers that don't like svc_table changes */
	down_write(&ipvs->svc_replace_sem);

	/* Check if work is stopped to avoid synchronize_rcu() */
	if (test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		goto unlock_repl;

	rcu_assign_pointer(ipvs->svc_table, t_new);
	/* Inform readers that new table is installed */
	smp_mb__before_atomic();
	atomic_inc(&ipvs->svc_table_changes);
	t_free = t;

unlock_repl:
	up_write(&ipvs->svc_replace_sem);

unlock_sem:
	up_write(&ipvs->svc_resize_sem);

	if (t_free) {
		/* RCU readers should not see more than two tables in chain.
		 * To prevent new table to be attached wait here instead of
		 * freeing the old table in RCU callback.
		 */
		synchronize_rcu();
		ip_vs_rht_free(t_free);
	}

out:
	/* Run again only if work is left, the netns is still enabled and
	 * resizing is not stopped
	 */
	if (!READ_ONCE(ipvs->enable) || !more_work ||
	    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		return;
	queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1);
	return;

unlock_m:
	mutex_unlock(&ipvs->service_mutex);
	goto unlock_sem;
}
831
832 static inline void
__ip_vs_bind_svc(struct ip_vs_dest * dest,struct ip_vs_service * svc)833 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
834 {
835 atomic_inc(&svc->refcnt);
836 rcu_assign_pointer(dest->svc, svc);
837 }
838
/* Release the stats of a service and free the service itself */
static void ip_vs_service_free(struct ip_vs_service *svc)
{
	ip_vs_stats_release(&svc->stats);
	kfree(svc);
}
844
ip_vs_service_rcu_free(struct rcu_head * head)845 static void ip_vs_service_rcu_free(struct rcu_head *head)
846 {
847 struct ip_vs_service *svc;
848
849 svc = container_of(head, struct ip_vs_service, rcu_head);
850 ip_vs_service_free(svc);
851 }
852
__ip_vs_svc_put(struct ip_vs_service * svc)853 static void __ip_vs_svc_put(struct ip_vs_service *svc)
854 {
855 if (atomic_dec_and_test(&svc->refcnt)) {
856 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
857 svc->fwmark,
858 IP_VS_DBG_ADDR(svc->af, &svc->addr),
859 ntohs(svc->port));
860 call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
861 }
862 }
863
864
865 /*
866 * Returns hash value for real service
867 */
ip_vs_rs_hashkey(int af,const union nf_inet_addr * addr,__be16 port)868 static inline unsigned int ip_vs_rs_hashkey(int af,
869 const union nf_inet_addr *addr,
870 __be16 port)
871 {
872 unsigned int porth = ntohs(port);
873 __be32 addr_fold = addr->ip;
874
875 #ifdef CONFIG_IP_VS_IPV6
876 if (af == AF_INET6)
877 addr_fold = addr->ip6[0]^addr->ip6[1]^
878 addr->ip6[2]^addr->ip6[3];
879 #endif
880
881 return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
882 & IP_VS_RTAB_MASK;
883 }
884
885 /* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
ip_vs_rs_hash(struct netns_ipvs * ipvs,struct ip_vs_dest * dest)886 static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
887 {
888 unsigned int hash;
889 __be16 port;
890
891 if (dest->in_rs_table)
892 return;
893
894 switch (IP_VS_DFWD_METHOD(dest)) {
895 case IP_VS_CONN_F_MASQ:
896 port = dest->port;
897 break;
898 case IP_VS_CONN_F_TUNNEL:
899 switch (dest->tun_type) {
900 case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
901 port = dest->tun_port;
902 break;
903 case IP_VS_CONN_F_TUNNEL_TYPE_IPIP:
904 case IP_VS_CONN_F_TUNNEL_TYPE_GRE:
905 port = 0;
906 break;
907 default:
908 return;
909 }
910 break;
911 default:
912 return;
913 }
914
915 /*
916 * Hash by proto,addr,port,
917 * which are the parameters of the real service.
918 */
919 hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port);
920
921 hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
922 dest->in_rs_table = 1;
923 }
924
925 /* Unhash ip_vs_dest from rs_table. */
ip_vs_rs_unhash(struct ip_vs_dest * dest)926 static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
927 {
928 /*
929 * Remove it from the rs_table table.
930 */
931 if (dest->in_rs_table) {
932 hlist_del_rcu(&dest->d_list);
933 dest->in_rs_table = 0;
934 }
935 }
936
937 /* Check if real service by <proto,addr,port> is present */
ip_vs_has_real_service(struct netns_ipvs * ipvs,int af,__u16 protocol,const union nf_inet_addr * daddr,__be16 dport)938 bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
939 const union nf_inet_addr *daddr, __be16 dport)
940 {
941 unsigned int hash;
942 struct ip_vs_dest *dest;
943
944 /* Check for "full" addressed entries */
945 hash = ip_vs_rs_hashkey(af, daddr, dport);
946
947 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
948 if (dest->port == dport &&
949 dest->af == af &&
950 ip_vs_addr_equal(af, &dest->addr, daddr) &&
951 (dest->protocol == protocol || dest->vfwmark) &&
952 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
953 /* HIT */
954 return true;
955 }
956 }
957
958 return false;
959 }
960
961 /* Find real service record by <proto,addr,port>.
962 * In case of multiple records with the same <proto,addr,port>, only
963 * the first found record is returned.
964 *
965 * To be called under RCU lock.
966 */
ip_vs_find_real_service(struct netns_ipvs * ipvs,int af,__u16 protocol,const union nf_inet_addr * daddr,__be16 dport)967 struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
968 __u16 protocol,
969 const union nf_inet_addr *daddr,
970 __be16 dport)
971 {
972 unsigned int hash;
973 struct ip_vs_dest *dest;
974
975 /* Check for "full" addressed entries */
976 hash = ip_vs_rs_hashkey(af, daddr, dport);
977
978 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
979 if (dest->port == dport &&
980 dest->af == af &&
981 ip_vs_addr_equal(af, &dest->addr, daddr) &&
982 (dest->protocol == protocol || dest->vfwmark) &&
983 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
984 /* HIT */
985 return dest;
986 }
987 }
988
989 return NULL;
990 }
991
992 /* Find real service record by <af,addr,tun_port>.
993 * In case of multiple records with the same <af,addr,tun_port>, only
994 * the first found record is returned.
995 *
996 * To be called under RCU lock.
997 */
ip_vs_find_tunnel(struct netns_ipvs * ipvs,int af,const union nf_inet_addr * daddr,__be16 tun_port)998 struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af,
999 const union nf_inet_addr *daddr,
1000 __be16 tun_port)
1001 {
1002 struct ip_vs_dest *dest;
1003 unsigned int hash;
1004
1005 /* Check for "full" addressed entries */
1006 hash = ip_vs_rs_hashkey(af, daddr, tun_port);
1007
1008 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
1009 if (dest->tun_port == tun_port &&
1010 dest->af == af &&
1011 ip_vs_addr_equal(af, &dest->addr, daddr) &&
1012 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) {
1013 /* HIT */
1014 return dest;
1015 }
1016 }
1017
1018 return NULL;
1019 }
1020
1021 /* Lookup destination by {addr,port} in the given service
1022 * Called under RCU lock.
1023 */
1024 static struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service * svc,int dest_af,const union nf_inet_addr * daddr,__be16 dport)1025 ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
1026 const union nf_inet_addr *daddr, __be16 dport)
1027 {
1028 struct ip_vs_dest *dest;
1029
1030 /*
1031 * Find the destination for the given service
1032 */
1033 list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
1034 if ((dest->af == dest_af) &&
1035 ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
1036 (dest->port == dport)) {
1037 /* HIT */
1038 return dest;
1039 }
1040 }
1041
1042 return NULL;
1043 }
1044
1045 /*
1046 * Find destination by {daddr,dport,vaddr,protocol}
1047 * Created to be used in ip_vs_process_message() in
1048 * the backup synchronization daemon. It finds the
1049 * destination to be bound to the received connection
1050 * on the backup.
1051 * Called under RCU lock, no refcnt is returned.
1052 */
ip_vs_find_dest(struct netns_ipvs * ipvs,int svc_af,int dest_af,const union nf_inet_addr * daddr,__be16 dport,const union nf_inet_addr * vaddr,__be16 vport,__u16 protocol,__u32 fwmark,__u32 flags)1053 struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af,
1054 const union nf_inet_addr *daddr,
1055 __be16 dport,
1056 const union nf_inet_addr *vaddr,
1057 __be16 vport, __u16 protocol, __u32 fwmark,
1058 __u32 flags)
1059 {
1060 struct ip_vs_dest *dest;
1061 struct ip_vs_service *svc;
1062 __be16 port = dport;
1063
1064 svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport);
1065 if (!svc)
1066 return NULL;
1067 if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
1068 port = 0;
1069 dest = ip_vs_lookup_dest(svc, dest_af, daddr, port);
1070 if (!dest)
1071 dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport);
1072 return dest;
1073 }
1074
ip_vs_dest_dst_rcu_free(struct rcu_head * head)1075 void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
1076 {
1077 struct ip_vs_dest_dst *dest_dst = container_of(head,
1078 struct ip_vs_dest_dst,
1079 rcu_head);
1080
1081 dst_release(dest_dst->dst_cache);
1082 kfree(dest_dst);
1083 }
1084
1085 /* Release dest_dst and dst_cache for dest in user context */
__ip_vs_dst_cache_reset(struct ip_vs_dest * dest)1086 static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
1087 {
1088 struct ip_vs_dest_dst *old;
1089
1090 old = rcu_dereference_protected(dest->dest_dst, 1);
1091 if (old) {
1092 RCU_INIT_POINTER(dest->dest_dst, NULL);
1093 call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
1094 }
1095 }
1096
1097 /*
1098 * Lookup dest by {svc,addr,port} in the destination trash.
1099 * The destination trash is used to hold the destinations that are removed
1100 * from the service table but are still referenced by some conn entries.
1101 * The reason to add the destination trash is when the dest is temporary
1102 * down (either by administrator or by monitor program), the dest can be
1103 * picked back from the trash, the remaining connections to the dest can
1104 * continue, and the counting information of the dest is also useful for
1105 * scheduling.
1106 */
1107 static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service * svc,int dest_af,const union nf_inet_addr * daddr,__be16 dport)1108 ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
1109 const union nf_inet_addr *daddr, __be16 dport)
1110 {
1111 struct ip_vs_dest *dest;
1112 struct netns_ipvs *ipvs = svc->ipvs;
1113
1114 /*
1115 * Find the destination in trash
1116 */
1117 spin_lock_bh(&ipvs->dest_trash_lock);
1118 list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
1119 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
1120 "dest->refcnt=%d\n",
1121 dest->vfwmark,
1122 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1123 ntohs(dest->port),
1124 refcount_read(&dest->refcnt));
1125 if (dest->af == dest_af &&
1126 ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
1127 dest->port == dport &&
1128 dest->vfwmark == svc->fwmark &&
1129 dest->protocol == svc->protocol &&
1130 (svc->fwmark ||
1131 (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
1132 dest->vport == svc->port))) {
1133 /* HIT */
1134 list_del(&dest->t_list);
1135 goto out;
1136 }
1137 }
1138
1139 dest = NULL;
1140
1141 out:
1142 spin_unlock_bh(&ipvs->dest_trash_lock);
1143
1144 return dest;
1145 }
1146
1147 /* Put destination in trash */
ip_vs_trash_put_dest(struct netns_ipvs * ipvs,struct ip_vs_dest * dest,unsigned long istart,bool cleanup)1148 static void ip_vs_trash_put_dest(struct netns_ipvs *ipvs,
1149 struct ip_vs_dest *dest, unsigned long istart,
1150 bool cleanup)
1151 {
1152 spin_lock_bh(&ipvs->dest_trash_lock);
1153 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
1154 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
1155 refcount_read(&dest->refcnt));
1156 if (list_empty(&ipvs->dest_trash) && !cleanup)
1157 mod_timer(&ipvs->dest_trash_timer,
1158 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1159 /* dest lives in trash with reference */
1160 list_add(&dest->t_list, &ipvs->dest_trash);
1161 dest->idle_start = istart;
1162 spin_unlock_bh(&ipvs->dest_trash_lock);
1163 }
1164
ip_vs_dest_rcu_free(struct rcu_head * head)1165 static void ip_vs_dest_rcu_free(struct rcu_head *head)
1166 {
1167 struct ip_vs_dest *dest;
1168
1169 dest = container_of(head, struct ip_vs_dest, rcu_head);
1170 ip_vs_stats_release(&dest->stats);
1171 ip_vs_dest_put_and_free(dest);
1172 }
1173
ip_vs_dest_free(struct ip_vs_dest * dest)1174 static void ip_vs_dest_free(struct ip_vs_dest *dest)
1175 {
1176 struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
1177
1178 __ip_vs_svc_put(svc);
1179 call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free);
1180 }
1181
1182 /*
1183 * Clean up all the destinations in the trash
1184 * Called by the ip_vs_control_cleanup()
1185 *
1186 * When the ip_vs_control_clearup is activated by ipvs module exit,
1187 * the service tables must have been flushed and all the connections
1188 * are expired, and the refcnt of each destination in the trash must
1189 * be 1, so we simply release them here.
1190 */
ip_vs_trash_cleanup(struct netns_ipvs * ipvs)1191 static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
1192 {
1193 struct ip_vs_dest *dest, *nxt;
1194
1195 timer_delete_sync(&ipvs->dest_trash_timer);
1196 /* No need to use dest_trash_lock */
1197 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
1198 list_del(&dest->t_list);
1199 ip_vs_dest_free(dest);
1200 }
1201 }
1202
ip_vs_stats_rcu_free(struct rcu_head * head)1203 static void ip_vs_stats_rcu_free(struct rcu_head *head)
1204 {
1205 struct ip_vs_stats_rcu *rs = container_of(head,
1206 struct ip_vs_stats_rcu,
1207 rcu_head);
1208
1209 ip_vs_stats_release(&rs->s);
1210 kfree(rs);
1211 }
1212
/* Take a snapshot of @src into @dst.
 *
 * Every counter is reported relative to the zero point saved in
 * src->kstats0 (see ip_vs_zero_stats()). ip_vs_read_estimator() then fills
 * in further fields of @dst (presumably the rate estimates - confirm in
 * the estimator code). All of it is done under src->lock.
 */
static void
ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
{
/* dst->c = current value of counter c minus its saved zero point */
#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c

	spin_lock(&src->lock);

	IP_VS_SHOW_STATS_COUNTER(conns);
	IP_VS_SHOW_STATS_COUNTER(inpkts);
	IP_VS_SHOW_STATS_COUNTER(outpkts);
	IP_VS_SHOW_STATS_COUNTER(inbytes);
	IP_VS_SHOW_STATS_COUNTER(outbytes);

	ip_vs_read_estimator(dst, src);

	spin_unlock(&src->lock);
}
1230
1231 static void
ip_vs_export_stats_user(struct ip_vs_stats_user * dst,struct ip_vs_kstats * src)1232 ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
1233 {
1234 dst->conns = (u32)src->conns;
1235 dst->inpkts = (u32)src->inpkts;
1236 dst->outpkts = (u32)src->outpkts;
1237 dst->inbytes = src->inbytes;
1238 dst->outbytes = src->outbytes;
1239 dst->cps = (u32)src->cps;
1240 dst->inpps = (u32)src->inpps;
1241 dst->outpps = (u32)src->outpps;
1242 dst->inbps = (u32)src->inbps;
1243 dst->outbps = (u32)src->outbps;
1244 }
1245
/* "Zero" the statistics in @stats without touching the live counters.
 *
 * The current counter values are recorded in stats->kstats0 and become the
 * new zero point that ip_vs_copy_stats() subtracts; the estimator state is
 * reset by ip_vs_zero_estimator(). Done under stats->lock.
 */
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
	spin_lock(&stats->lock);

	/* get current counters as zero point, rates are zeroed */

/* kstats0.c = kstats.c: remember the current value of counter c */
#define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c

	IP_VS_ZERO_STATS_COUNTER(conns);
	IP_VS_ZERO_STATS_COUNTER(inpkts);
	IP_VS_ZERO_STATS_COUNTER(outpkts);
	IP_VS_ZERO_STATS_COUNTER(inbytes);
	IP_VS_ZERO_STATS_COUNTER(outbytes);

	ip_vs_zero_estimator(stats);

	spin_unlock(&stats->lock);
}
1265
1266 /* Allocate fields after kzalloc */
ip_vs_stats_init_alloc(struct ip_vs_stats * s)1267 int ip_vs_stats_init_alloc(struct ip_vs_stats *s)
1268 {
1269 int i;
1270
1271 spin_lock_init(&s->lock);
1272 s->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1273 if (!s->cpustats)
1274 return -ENOMEM;
1275
1276 for_each_possible_cpu(i) {
1277 struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i);
1278
1279 u64_stats_init(&cs->syncp);
1280 }
1281 return 0;
1282 }
1283
ip_vs_stats_alloc(void)1284 struct ip_vs_stats *ip_vs_stats_alloc(void)
1285 {
1286 struct ip_vs_stats *s = kzalloc_obj(*s);
1287
1288 if (s && ip_vs_stats_init_alloc(s) >= 0)
1289 return s;
1290 kfree(s);
1291 return NULL;
1292 }
1293
/* Release the per-cpu counters allocated by ip_vs_stats_init_alloc().
 * @stats itself is not freed here; see ip_vs_stats_free() for that.
 */
void ip_vs_stats_release(struct ip_vs_stats *stats)
{
	free_percpu(stats->cpustats);
}
1298
/* Release the per-cpu counters of @stats and free the structure.
 * A NULL @stats is accepted and ignored.
 */
void ip_vs_stats_free(struct ip_vs_stats *stats)
{
	if (!stats)
		return;

	ip_vs_stats_release(stats);
	kfree(stats);
}
1306
/*
 *	Update a destination in the given service
 *
 *	Applies the settings from @udest to @dest and binds @dest to @svc.
 *	@add is non-zero when @dest is being linked into svc->destinations
 *	(a new dest or one taken back from the trash) and zero when an
 *	already linked dest is edited.
 */
static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
		    struct ip_vs_dest_user_kern *udest, int add)
{
	struct netns_ipvs *ipvs = svc->ipvs;
	struct ip_vs_service *old_svc;
	struct ip_vs_scheduler *sched;
	int conn_flags;

	/* We cannot modify an address and change the address family */
	BUG_ON(!add && udest->af != dest->af);

	/* Count dests whose family differs from the service's family;
	 * the matching decrement is in __ip_vs_unlink_dest().
	 */
	if (add && udest->af != svc->af)
		ipvs->mixed_address_family_dests++;

	/* keep the last_weight with latest non-0 weight */
	if (add || udest->weight != 0)
		atomic_set(&dest->last_weight, udest->weight);

	/* set the weight and the flags */
	atomic_set(&dest->weight, udest->weight);
	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
	conn_flags |= IP_VS_CONN_F_INACTIVE;

	/* Need to rehash? Unhash while the old forwarding method and tunnel
	 * settings are still in place; ip_vs_rs_hash() below adds the dest
	 * again once the new values are set.
	 */
	if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) !=
	    IP_VS_DFWD_METHOD(dest) ||
	    udest->tun_type != dest->tun_type ||
	    udest->tun_port != dest->tun_port)
		ip_vs_rs_unhash(dest);

	/* set the tunnel info */
	dest->tun_type = udest->tun_type;
	dest->tun_port = udest->tun_port;
	dest->tun_flags = udest->tun_flags;

	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
	} else {
		/* FTP-NAT requires conntrack for mangling */
		if (svc->port == FTPPORT)
			ip_vs_register_conntrack(svc);
	}
	atomic_set(&dest->conn_flags, conn_flags);
	/* Put the real service in rs_table if not present. */
	ip_vs_rs_hash(ipvs, dest);

	/* bind the service */
	old_svc = rcu_dereference_protected(dest->svc, 1);
	if (!old_svc) {
		__ip_vs_bind_svc(dest, svc);
	} else {
		/* The dest was bound to a different service object before:
		 * restart its stats from zero, rebind it and drop the
		 * reference that was held on the old service.
		 */
		if (old_svc != svc) {
			ip_vs_zero_stats(&dest->stats);
			__ip_vs_bind_svc(dest, svc);
			__ip_vs_svc_put(old_svc);
		}
	}

	/* set the dest status flags */
	dest->flags |= IP_VS_DEST_F_AVAILABLE;

	/* Clear the overload state when the upper threshold is removed (0)
	 * or raised above the previous one.
	 */
	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	dest->u_threshold = udest->u_threshold;
	dest->l_threshold = udest->l_threshold;

	dest->af = udest->af;

	if (add) {
		/* Publish the dest to RCU readers and tell the scheduler */
		list_add_rcu(&dest->n_list, &svc->destinations);
		svc->num_dests++;
		sched = rcu_dereference_protected(svc->scheduler, 1);
		if (sched && sched->add_dest)
			sched->add_dest(svc, dest);
	} else {
		/* Settings of a linked dest changed: forget the cached route */
		spin_lock_bh(&dest->dst_lock);
		__ip_vs_dst_cache_reset(dest);
		spin_unlock_bh(&dest->dst_lock);

		sched = rcu_dereference_protected(svc->scheduler, 1);
		if (sched && sched->upd_dest)
			sched->upd_dest(svc, dest);
	}
}
1396
1397
1398 /*
1399 * Create a destination for the given service
1400 */
1401 static int
ip_vs_new_dest(struct ip_vs_service * svc,struct ip_vs_dest_user_kern * udest)1402 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1403 {
1404 struct ip_vs_dest *dest;
1405 unsigned int atype;
1406 int ret;
1407
1408 #ifdef CONFIG_IP_VS_IPV6
1409 if (udest->af == AF_INET6) {
1410 atype = ipv6_addr_type(&udest->addr.in6);
1411 if ((!(atype & IPV6_ADDR_UNICAST) ||
1412 atype & IPV6_ADDR_LINKLOCAL) &&
1413 !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6))
1414 return -EINVAL;
1415
1416 ret = nf_defrag_ipv6_enable(svc->ipvs->net);
1417 if (ret)
1418 return ret;
1419 } else
1420 #endif
1421 {
1422 atype = inet_addr_type(svc->ipvs->net, udest->addr.ip);
1423 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
1424 return -EINVAL;
1425 }
1426
1427 dest = kzalloc_obj(struct ip_vs_dest);
1428 if (dest == NULL)
1429 return -ENOMEM;
1430
1431 ret = ip_vs_stats_init_alloc(&dest->stats);
1432 if (ret < 0)
1433 goto err_alloc;
1434
1435 ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
1436 if (ret < 0)
1437 goto err_stats;
1438
1439 dest->af = udest->af;
1440 dest->protocol = svc->protocol;
1441 dest->vaddr = svc->addr;
1442 dest->vport = svc->port;
1443 dest->vfwmark = svc->fwmark;
1444 ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr);
1445 dest->port = udest->port;
1446
1447 atomic_set(&dest->activeconns, 0);
1448 atomic_set(&dest->inactconns, 0);
1449 atomic_set(&dest->persistconns, 0);
1450 refcount_set(&dest->refcnt, 1);
1451
1452 INIT_HLIST_NODE(&dest->d_list);
1453 spin_lock_init(&dest->dst_lock);
1454 __ip_vs_update_dest(svc, dest, udest, 1);
1455
1456 return 0;
1457
1458 err_stats:
1459 ip_vs_stats_release(&dest->stats);
1460
1461 err_alloc:
1462 kfree(dest);
1463 return ret;
1464 }
1465
1466
1467 /*
1468 * Add a destination into an existing service
1469 */
1470 static int
ip_vs_add_dest(struct ip_vs_service * svc,struct ip_vs_dest_user_kern * udest)1471 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1472 {
1473 struct ip_vs_dest *dest;
1474 union nf_inet_addr daddr;
1475 __be16 dport = udest->port;
1476 int ret;
1477
1478 if (udest->weight < 0) {
1479 pr_err("%s(): server weight less than zero\n", __func__);
1480 return -ERANGE;
1481 }
1482
1483 if (udest->l_threshold > udest->u_threshold) {
1484 pr_err("%s(): lower threshold is higher than upper threshold\n",
1485 __func__);
1486 return -ERANGE;
1487 }
1488
1489 if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1490 if (udest->tun_port == 0) {
1491 pr_err("%s(): tunnel port is zero\n", __func__);
1492 return -EINVAL;
1493 }
1494 }
1495
1496 ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
1497
1498 /* We use function that requires RCU lock */
1499 rcu_read_lock();
1500 dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
1501 rcu_read_unlock();
1502
1503 if (dest != NULL) {
1504 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
1505 return -EEXIST;
1506 }
1507
1508 /*
1509 * Check if the dest already exists in the trash and
1510 * is from the same service
1511 */
1512 dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);
1513
1514 if (dest != NULL) {
1515 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
1516 "dest->refcnt=%d, service %u/%s:%u\n",
1517 IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
1518 refcount_read(&dest->refcnt),
1519 dest->vfwmark,
1520 IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
1521 ntohs(dest->vport));
1522
1523 ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
1524 /* On error put back dest into the trash */
1525 if (ret < 0)
1526 ip_vs_trash_put_dest(svc->ipvs, dest, dest->idle_start,
1527 false);
1528 else
1529 __ip_vs_update_dest(svc, dest, udest, 1);
1530 } else {
1531 /*
1532 * Allocate and initialize the dest structure
1533 */
1534 ret = ip_vs_new_dest(svc, udest);
1535 }
1536
1537 return ret;
1538 }
1539
1540
1541 /*
1542 * Edit a destination in the given service
1543 */
1544 static int
ip_vs_edit_dest(struct ip_vs_service * svc,struct ip_vs_dest_user_kern * udest)1545 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1546 {
1547 struct ip_vs_dest *dest;
1548 union nf_inet_addr daddr;
1549 __be16 dport = udest->port;
1550
1551 if (udest->weight < 0) {
1552 pr_err("%s(): server weight less than zero\n", __func__);
1553 return -ERANGE;
1554 }
1555
1556 if (udest->l_threshold > udest->u_threshold) {
1557 pr_err("%s(): lower threshold is higher than upper threshold\n",
1558 __func__);
1559 return -ERANGE;
1560 }
1561
1562 if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1563 if (udest->tun_port == 0) {
1564 pr_err("%s(): tunnel port is zero\n", __func__);
1565 return -EINVAL;
1566 }
1567 }
1568
1569 ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
1570
1571 /* We use function that requires RCU lock */
1572 rcu_read_lock();
1573 dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
1574 rcu_read_unlock();
1575
1576 if (dest == NULL) {
1577 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
1578 return -ENOENT;
1579 }
1580
1581 __ip_vs_update_dest(svc, dest, udest, 0);
1582
1583 return 0;
1584 }
1585
1586 /*
1587 * Delete a destination (must be already unlinked from the service)
1588 */
__ip_vs_del_dest(struct netns_ipvs * ipvs,struct ip_vs_dest * dest,bool cleanup)1589 static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
1590 bool cleanup)
1591 {
1592 ip_vs_stop_estimator(ipvs, &dest->stats);
1593
1594 /*
1595 * Remove it from the d-linked list with the real services.
1596 */
1597 ip_vs_rs_unhash(dest);
1598
1599 ip_vs_trash_put_dest(ipvs, dest, 0, cleanup);
1600
1601 /* Queue up delayed work to expire all no destination connections.
1602 * No-op when CONFIG_SYSCTL is disabled.
1603 */
1604 if (!cleanup)
1605 ip_vs_enqueue_expire_nodest_conns(ipvs);
1606 }
1607
1608
1609 /*
1610 * Unlink a destination from the given service
1611 */
__ip_vs_unlink_dest(struct ip_vs_service * svc,struct ip_vs_dest * dest,int svcupd)1612 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1613 struct ip_vs_dest *dest,
1614 int svcupd)
1615 {
1616 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1617
1618 spin_lock_bh(&dest->dst_lock);
1619 __ip_vs_dst_cache_reset(dest);
1620 spin_unlock_bh(&dest->dst_lock);
1621
1622 /*
1623 * Remove it from the d-linked destination list.
1624 */
1625 list_del_rcu(&dest->n_list);
1626 svc->num_dests--;
1627
1628 if (dest->af != svc->af)
1629 svc->ipvs->mixed_address_family_dests--;
1630
1631 if (svcupd) {
1632 struct ip_vs_scheduler *sched;
1633
1634 sched = rcu_dereference_protected(svc->scheduler, 1);
1635 if (sched && sched->del_dest)
1636 sched->del_dest(svc, dest);
1637 }
1638 }
1639
1640
1641 /*
1642 * Delete a destination server in the given service
1643 */
1644 static int
ip_vs_del_dest(struct ip_vs_service * svc,struct ip_vs_dest_user_kern * udest)1645 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1646 {
1647 struct ip_vs_dest *dest;
1648 __be16 dport = udest->port;
1649
1650 /* We use function that requires RCU lock */
1651 rcu_read_lock();
1652 dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
1653 rcu_read_unlock();
1654
1655 if (dest == NULL) {
1656 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1657 return -ENOENT;
1658 }
1659
1660 /*
1661 * Unlink dest from the service
1662 */
1663 __ip_vs_unlink_dest(svc, dest, 1);
1664
1665 /*
1666 * Delete the destination
1667 */
1668 __ip_vs_del_dest(svc->ipvs, dest, false);
1669
1670 return 0;
1671 }
1672
ip_vs_dest_trash_expire(struct timer_list * t)1673 static void ip_vs_dest_trash_expire(struct timer_list *t)
1674 {
1675 struct netns_ipvs *ipvs = timer_container_of(ipvs, t,
1676 dest_trash_timer);
1677 struct ip_vs_dest *dest, *next;
1678 unsigned long now = jiffies;
1679
1680 spin_lock(&ipvs->dest_trash_lock);
1681 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
1682 if (refcount_read(&dest->refcnt) > 1)
1683 continue;
1684 if (dest->idle_start) {
1685 if (time_before(now, dest->idle_start +
1686 IP_VS_DEST_TRASH_PERIOD))
1687 continue;
1688 } else {
1689 dest->idle_start = max(1UL, now);
1690 continue;
1691 }
1692 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
1693 dest->vfwmark,
1694 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1695 ntohs(dest->port));
1696 list_del(&dest->t_list);
1697 ip_vs_dest_free(dest);
1698 }
1699 if (!list_empty(&ipvs->dest_trash))
1700 mod_timer(&ipvs->dest_trash_timer,
1701 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1702 spin_unlock(&ipvs->dest_trash_lock);
1703 }
1704
/*
 *	Add a service into the service hash table
 *
 *	Resolves the scheduler and persistence engine named in @u, allocates
 *	the service table and connection table if they do not exist yet,
 *	registers the netfilter hooks for the first service of the address
 *	family, then allocates, initializes and hashes the new service.
 *
 *	On success 0 is returned and the new service is stored in *@svc_p.
 *	On failure a negative errno is returned and everything acquired so
 *	far (tables, hooks, service, scheduler/pe references, module use
 *	count) is released again at out_err.
 */
static int
ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
		  struct ip_vs_service **svc_p)
{
	struct ip_vs_scheduler *sched = NULL;
	struct ip_vs_rht *tc_new = NULL;
	struct ip_vs_rht *t, *t_new = NULL;
	int af_id = ip_vs_af_index(u->af);
	struct ip_vs_service *svc = NULL;
	struct ip_vs_pe *pe = NULL;
	/* >= 0 only if this call registered the hooks (undo on error) */
	int ret_hooks = -1;
	int ret = 0;
	bool grow;

	/* increase the module use count */
	if (!ip_vs_use_count_inc())
		return -ENOPROTOOPT;

	/* Lookup the scheduler by 'u->sched_name' */
	if (strcmp(u->sched_name, "none")) {
		sched = ip_vs_scheduler_get(u->sched_name);
		if (!sched) {
			pr_info("Scheduler module ip_vs_%s not found\n",
				u->sched_name);
			ret = -ENOENT;
			goto out_err;
		}
	}

	/* Lookup the persistence engine, if one was requested */
	if (u->pe_name && *u->pe_name) {
		pe = ip_vs_pe_getbyname(u->pe_name);
		if (pe == NULL) {
			pr_info("persistence engine module ip_vs_pe_%s "
				"not found\n", u->pe_name);
			ret = -ENOENT;
			goto out_err;
		}
	}

#ifdef CONFIG_IP_VS_IPV6
	if (u->af == AF_INET6) {
		/* For IPv6 the netmask field carries a prefix length */
		__u32 plen = (__force __u32) u->netmask;

		if (plen < 1 || plen > 128) {
			ret = -EINVAL;
			goto out_err;
		}

		ret = nf_defrag_ipv6_enable(ipvs->net);
		if (ret)
			goto out_err;
	}
#endif

	/* The old table can be freed, protect it with RCU */
	rcu_read_lock();
	t = rcu_dereference(ipvs->svc_table);
	if (!t) {
		int lfactor = sysctl_svc_lfactor(ipvs);
		int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor);

		/* Leave the RCU section before allocating the first table */
		rcu_read_unlock();
		t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
		if (!t_new) {
			ret = -ENOMEM;
			goto out_err;
		}
		grow = false;
	} else {
		/* Even the currently attached new table may need to grow */
		t = rcu_dereference(t->new_tbl);
		grow = ip_vs_get_num_services(ipvs) + 1 > t->u_thresh;
		rcu_read_unlock();
	}

	/* Allocate the connection table when there is none yet */
	if (!rcu_dereference_protected(ipvs->conn_tab, 1)) {
		int lfactor = sysctl_conn_lfactor(ipvs);
		int new_size = ip_vs_conn_desired_size(ipvs, NULL, lfactor);

		tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor);
		if (!tc_new) {
			ret = -ENOMEM;
			goto out_err;
		}
	}

	/* First service of this address family: register the hooks */
	if (!atomic_read(&ipvs->num_services[af_id])) {
		ret = ip_vs_register_hooks(ipvs, u->af);
		if (ret < 0)
			goto out_err;
		ret_hooks = ret;
	}

	svc = kzalloc_obj(struct ip_vs_service);
	if (svc == NULL) {
		IP_VS_DBG(1, "%s(): no memory\n", __func__);
		ret = -ENOMEM;
		goto out_err;
	}
	ret = ip_vs_stats_init_alloc(&svc->stats);
	if (ret < 0)
		goto out_err;

	/* I'm the first user of the service */
	atomic_set(&svc->refcnt, 0);

	svc->af = u->af;
	svc->protocol = u->protocol;
	ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
	svc->port = u->port;
	svc->fwmark = u->fwmark;
	/* The HASHED flag is never taken from the request */
	svc->flags = u->flags & ~IP_VS_SVC_F_HASHED;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;
	svc->ipvs = ipvs;

	INIT_LIST_HEAD(&svc->destinations);
	spin_lock_init(&svc->sched_lock);

	/* Bind the scheduler */
	if (sched) {
		ret = ip_vs_bind_scheduler(svc, sched);
		if (ret)
			goto out_err;
	}

	ret = ip_vs_start_estimator(ipvs, &svc->stats);
	if (ret < 0)
		goto out_err;

	/* No failure is possible from here on: publish the new tables.
	 * The local pointers are cleared so that out_err would not free
	 * what has been published.
	 */
	if (t_new) {
		/* Add table for first time */
		clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags);
		rcu_assign_pointer(ipvs->svc_table, t_new);
		t_new = NULL;
	}
	if (tc_new) {
		rcu_assign_pointer(ipvs->conn_tab, tc_new);
		tc_new = NULL;
	}

	/* Update the virtual service counters */
	if (svc->port == FTPPORT)
		atomic_inc(&ipvs->ftpsvc_counter[af_id]);
	else if (!svc->port && !svc->fwmark)
		atomic_inc(&ipvs->nullsvc_counter[af_id]);
	if (pe && pe->conn_out)
		atomic_inc(&ipvs->conn_out_counter[af_id]);

	/* Bind the ct retriever */
	RCU_INIT_POINTER(svc->pe, pe);
	pe = NULL;

	if (svc->fwmark)
		atomic_inc(&ipvs->fwm_services[af_id]);
	else
		atomic_inc(&ipvs->nonfwm_services[af_id]);
	atomic_inc(&ipvs->num_services[af_id]);

	/* Hash the service into the service table */
	ip_vs_svc_hash(svc);

	/* Schedule resize work */
	if (grow && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags))
		queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work,
				   1);

	*svc_p = svc;

	if (!READ_ONCE(ipvs->enable)) {
		mutex_lock(&ipvs->est_mutex);

		/* Now there is a service - full throttle */
		WRITE_ONCE(ipvs->enable, 1);

		ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);

		/* Start estimation for first time */
		ip_vs_est_reload_start(ipvs, true);
		mutex_unlock(&ipvs->est_mutex);
	}

	return 0;


out_err:
	/* Undo only what this call set up and has not published */
	if (tc_new)
		ip_vs_rht_free(tc_new);
	if (t_new)
		ip_vs_rht_free(t_new);
	if (ret_hooks >= 0)
		ip_vs_unregister_hooks(ipvs, u->af);
	if (svc != NULL) {
		ip_vs_unbind_scheduler(svc, sched);
		ip_vs_service_free(svc);
	}
	ip_vs_scheduler_put(sched);
	ip_vs_pe_put(pe);

	/* decrease the module use count */
	ip_vs_use_count_dec();

	return ret;
}
1912
1913
/*
 *	Edit a service and bind it with a new scheduler
 *
 *	The scheduler and persistence engine named in @u replace the ones
 *	currently bound to @svc; flags, timeout and netmask are taken from
 *	@u as well.
 *
 *	old_sched and old_pe always name the references that have to be
 *	dropped at "out": initially the newly looked up objects (so they
 *	are released if validation fails) and, once the service state is
 *	read, the objects that were bound before.
 *
 *	Returns 0 on success, -ENOENT when the scheduler or persistence
 *	engine is not found, -EINVAL for a bad IPv6 prefix length, or the
 *	error from ip_vs_bind_scheduler().
 */
static int
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
{
	struct ip_vs_scheduler *sched = NULL, *old_sched;
	struct ip_vs_pe *pe = NULL, *old_pe = NULL;
	int ret = 0;
	bool new_pe_conn_out, old_pe_conn_out;
	struct netns_ipvs *ipvs = svc->ipvs;
	int af_id = ip_vs_af_index(svc->af);

	/*
	 * Lookup the scheduler, by 'u->sched_name'
	 */
	if (strcmp(u->sched_name, "none")) {
		sched = ip_vs_scheduler_get(u->sched_name);
		if (!sched) {
			pr_info("Scheduler module ip_vs_%s not found\n",
				u->sched_name);
			return -ENOENT;
		}
	}
	/* Until the bound scheduler is read, "out" must drop the new one */
	old_sched = sched;

	if (u->pe_name && *u->pe_name) {
		pe = ip_vs_pe_getbyname(u->pe_name);
		if (pe == NULL) {
			pr_info("persistence engine module ip_vs_pe_%s "
				"not found\n", u->pe_name);
			ret = -ENOENT;
			goto out;
		}
		/* Same idea as for old_sched above */
		old_pe = pe;
	}

#ifdef CONFIG_IP_VS_IPV6
	if (u->af == AF_INET6) {
		/* For IPv6 the netmask field carries a prefix length */
		__u32 plen = (__force __u32) u->netmask;

		if (plen < 1 || plen > 128) {
			ret = -EINVAL;
			goto out;
		}
	}
#endif

	old_sched = rcu_dereference_protected(svc->scheduler, 1);
	if (sched != old_sched) {
		if (old_sched) {
			ip_vs_unbind_scheduler(svc, old_sched);
			RCU_INIT_POINTER(svc->scheduler, NULL);
			/* Wait all svc->sched_data users */
			synchronize_rcu();
		}
		/* Bind the new scheduler */
		if (sched) {
			ret = ip_vs_bind_scheduler(svc, sched);
			if (ret) {
				/* The new scheduler was not installed: drop
				 * it here, "out" drops the old one.
				 */
				ip_vs_scheduler_put(sched);
				goto out;
			}
		}
	}

	/*
	 * Set the flags and timeout value
	 */
	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;

	old_pe = rcu_dereference_protected(svc->pe, 1);
	if (pe != old_pe) {
		rcu_assign_pointer(svc->pe, pe);
		/* check for optional methods in new pe */
		new_pe_conn_out = (pe && pe->conn_out) ? true : false;
		old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false;
		/* Keep conn_out_counter in sync with the bound engine */
		if (new_pe_conn_out && !old_pe_conn_out)
			atomic_inc(&ipvs->conn_out_counter[af_id]);
		if (old_pe_conn_out && !new_pe_conn_out)
			atomic_dec(&ipvs->conn_out_counter[af_id]);
	}

out:
	ip_vs_scheduler_put(old_sched);
	ip_vs_pe_put(old_pe);
	return ret;
}
2004
/*
 *	Delete a service from the service list
 *	- The service must be unlinked, unlocked and not referenced!
 *	- We are called under _bh lock
 *	  NOTE(review): the only caller visible here is
 *	  ip_vs_unlink_service(); confirm whether the _bh remark above
 *	  still describes the current locking.
 *
 *	@cleanup is passed on to __ip_vs_del_dest() for every destination
 *	of the service.
 */
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
{
	struct ip_vs_dest *dest, *nxt;
	struct ip_vs_scheduler *old_sched;
	struct ip_vs_pe *old_pe;
	struct netns_ipvs *ipvs = svc->ipvs;
	int af_id = ip_vs_af_index(svc->af);

	/* Last service of this address family: unregister the hooks */
	atomic_dec(&ipvs->num_services[af_id]);
	if (!atomic_read(&ipvs->num_services[af_id]))
		ip_vs_unregister_hooks(ipvs, svc->af);
	if (svc->fwmark)
		atomic_dec(&ipvs->fwm_services[af_id]);
	else
		atomic_dec(&ipvs->nonfwm_services[af_id]);

	ip_vs_stop_estimator(svc->ipvs, &svc->stats);

	/* Unbind scheduler */
	old_sched = rcu_dereference_protected(svc->scheduler, 1);
	ip_vs_unbind_scheduler(svc, old_sched);
	ip_vs_scheduler_put(old_sched);

	/* Unbind persistence engine, keep svc->pe */
	old_pe = rcu_dereference_protected(svc->pe, 1);
	if (old_pe && old_pe->conn_out)
		atomic_dec(&ipvs->conn_out_counter[af_id]);
	ip_vs_pe_put(old_pe);

	/*
	 *    Unlink the whole destination list
	 *    (svcupd is 0: the scheduler was already unbound above)
	 */
	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
		__ip_vs_unlink_dest(svc, dest, 0);
		__ip_vs_del_dest(svc->ipvs, dest, cleanup);
	}

	/*
	 *    Update the virtual service counters
	 */
	if (svc->port == FTPPORT)
		atomic_dec(&ipvs->ftpsvc_counter[af_id]);
	else if (!svc->port && !svc->fwmark)
		atomic_dec(&ipvs->nullsvc_counter[af_id]);

	/*
	 *    Free the service if nobody refers to it
	 */
	__ip_vs_svc_put(svc);

	/* decrease the module use count */
	ip_vs_use_count_dec();
}
2063
/*
 *	Unlink a service from list and try to delete it if its refcnt reached 0
 *
 *	@cleanup is passed on unchanged to __ip_vs_del_service().
 */
static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
{
	ip_vs_unregister_conntrack(svc);
	/* Hold svc to avoid double release from dest_trash */
	atomic_inc(&svc->refcnt);
	/*
	 * Unhash it from the service table
	 */
	ip_vs_svc_unhash(svc);

	/* Tears the service down and drops the reference taken above */
	__ip_vs_del_service(svc, cleanup);
}
2079
2080 /*
2081 * Delete a service from the service list
2082 */
ip_vs_del_service(struct ip_vs_service * svc)2083 static int ip_vs_del_service(struct ip_vs_service *svc)
2084 {
2085 struct netns_ipvs *ipvs;
2086 struct ip_vs_rht *t, *p;
2087 int ns;
2088
2089 if (svc == NULL)
2090 return -EEXIST;
2091 ipvs = svc->ipvs;
2092 ip_vs_unlink_service(svc, false);
2093
2094 /* Drop the table if no more services */
2095 ns = ip_vs_get_num_services(ipvs);
2096 if (!ns) {
2097 /* Stop the resizer and drop the tables */
2098 set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags);
2099 cancel_delayed_work_sync(&ipvs->svc_resize_work);
2100 t = rcu_dereference_protected(ipvs->svc_table, 1);
2101 if (t) {
2102 rcu_assign_pointer(ipvs->svc_table, NULL);
2103 /* Inform readers that table is removed */
2104 smp_mb__before_atomic();
2105 atomic_inc(&ipvs->svc_table_changes);
2106 while (1) {
2107 p = rcu_dereference_protected(t->new_tbl, 1);
2108 call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
2109 if (p == t)
2110 break;
2111 t = p;
2112 }
2113 }
2114 } else {
2115 bool shrink;
2116
2117 rcu_read_lock();
2118 t = rcu_dereference(ipvs->svc_table);
2119 /* Even the currently attached new table may need to shrink */
2120 t = rcu_dereference(t->new_tbl);
2121 shrink = ns <= t->l_thresh;
2122 rcu_read_unlock();
2123 if (shrink && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE,
2124 &ipvs->work_flags))
2125 queue_delayed_work(system_unbound_wq,
2126 &ipvs->svc_resize_work, 1);
2127 }
2128 return 0;
2129 }
2130
2131
/*
 *	Flush all the virtual services
 *
 *	Stops the service table resizer, unlinks every service found in the
 *	table and then releases the table chain after an RCU grace period.
 *	With @cleanup set, the tot_stats estimator is stopped as well and
 *	@cleanup is passed on to ip_vs_unlink_service().
 *
 *	Always returns 0.
 */
static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
{
	DECLARE_IP_VS_RHT_WALK_BUCKETS();
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct hlist_bl_node *ne;
	struct hlist_bl_node *e;
	struct ip_vs_rht *t, *p;

	/* Stop the resizer and drop the tables */
	if (!test_and_set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		cancel_delayed_work_sync(&ipvs->svc_resize_work);
	/* No resizer, so now we have exclusive write access */

	if (ip_vs_get_num_services(ipvs)) {
		ip_vs_rht_walk_buckets(ipvs->svc_table, head) {
			/* _safe walk: unlinking unhashes the current entry */
			hlist_bl_for_each_entry_safe(svc, e, ne, head, s_list)
				ip_vs_unlink_service(svc, cleanup);
		}
	}

	/* Unregister the hash table and release it after RCU grace period */
	t = rcu_dereference_protected(ipvs->svc_table, 1);
	if (t) {
		rcu_assign_pointer(ipvs->svc_table, NULL);
		/* Inform readers that table is removed */
		smp_mb__before_atomic();
		atomic_inc(&ipvs->svc_table_changes);
		/* Free every table of the new_tbl chain; it ends at the
		 * table whose new_tbl points to itself.
		 */
		while (1) {
			p = rcu_dereference_protected(t->new_tbl, 1);
			call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
			if (p == t)
				break;
			t = p;
		}
	}
	/* Stop the tot_stats estimator early under service_mutex
	 * to avoid locking it again later.
	 */
	if (cleanup)
		ip_vs_stop_estimator_tot_stats(ipvs);
	return 0;
}
2178
2179 /*
2180 * Delete service by {netns} in the service table.
2181 * Called by __ip_vs_batch_cleanup()
2182 */
ip_vs_service_nets_cleanup(struct list_head * net_list)2183 void ip_vs_service_nets_cleanup(struct list_head *net_list)
2184 {
2185 struct netns_ipvs *ipvs;
2186 struct net *net;
2187
2188 /* Check for "full" addressed entries */
2189 list_for_each_entry(net, net_list, exit_list) {
2190 ipvs = net_ipvs(net);
2191 mutex_lock(&ipvs->service_mutex);
2192 ip_vs_flush(ipvs, true);
2193 mutex_unlock(&ipvs->service_mutex);
2194 }
2195 }
2196
2197 /* Put all references for device (dst_cache) */
2198 static inline void
ip_vs_forget_dev(struct ip_vs_dest * dest,struct net_device * dev)2199 ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
2200 {
2201 struct ip_vs_dest_dst *dest_dst;
2202
2203 spin_lock_bh(&dest->dst_lock);
2204 dest_dst = rcu_dereference_protected(dest->dest_dst, 1);
2205 if (dest_dst && dest_dst->dst_cache->dev == dev) {
2206 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
2207 dev->name,
2208 IP_VS_DBG_ADDR(dest->af, &dest->addr),
2209 ntohs(dest->port),
2210 refcount_read(&dest->refcnt));
2211 __ip_vs_dst_cache_reset(dest);
2212 }
2213 spin_unlock_bh(&dest->dst_lock);
2214
2215 }
/* Netdev event receiver
 * Currently only NETDEV_DOWN is handled to release refs to cached dsts
 *
 * Walks every service in the netns of the device and calls
 * ip_vs_forget_dev() for each of its destinations.
 * Events other than NETDEV_DOWN, or a netns without IPVS data, are ignored.
 * Always returns NOTIFY_DONE.
 */
static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
			   void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct netns_ipvs *ipvs = net_ipvs(net);
	DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU();
	unsigned int resched_score = 0;
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct hlist_bl_node *e;
	struct ip_vs_dest *dest;
	int old_gen;

	if (event != NETDEV_DOWN || !ipvs)
		return NOTIFY_DONE;
	IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);

	/* Allow concurrent rehashing on resize but to avoid loop
	 * serialize with installing the new table.
	 */
	down_read(&ipvs->svc_replace_sem);

	/* Snapshot the generation to detect a flush while we reschedule */
	old_gen = atomic_read(&ipvs->svc_table_changes);

	rcu_read_lock();

	smp_rmb(); /* ipvs->svc_table and svc_table_changes */
	ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) {
		hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
			list_for_each_entry_rcu(dest, &svc->destinations,
						n_list) {
				ip_vs_forget_dev(dest, dev);
				resched_score += 10;
			}
			resched_score++;
		}
		/* Score: 10 per dest, 1 per service, 1 per bucket.
		 * Offer to reschedule once it reaches 100.
		 */
		resched_score++;
		if (resched_score >= 100) {
			cond_resched_rcu();
			/* Flushed? So no more dev refs */
			if (atomic_read(&ipvs->svc_table_changes) != old_gen)
				goto done;
			resched_score = 0;
		}
	}

done:
	rcu_read_unlock();
	up_read(&ipvs->svc_replace_sem);

	return NOTIFY_DONE;
}
2272
2273 /*
2274 * Zero counters in a service or all services
2275 */
ip_vs_zero_service(struct ip_vs_service * svc)2276 static int ip_vs_zero_service(struct ip_vs_service *svc)
2277 {
2278 struct ip_vs_dest *dest;
2279
2280 list_for_each_entry(dest, &svc->destinations, n_list) {
2281 ip_vs_zero_stats(&dest->stats);
2282 }
2283 ip_vs_zero_stats(&svc->stats);
2284 return 0;
2285 }
2286
ip_vs_zero_all(struct netns_ipvs * ipvs)2287 static int ip_vs_zero_all(struct netns_ipvs *ipvs)
2288 {
2289 DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU();
2290 unsigned int resched_score = 0;
2291 struct hlist_bl_head *head;
2292 struct ip_vs_service *svc;
2293 struct hlist_bl_node *e;
2294
2295 /* svc_table can not be replaced (svc_replace_sem) or
2296 * removed (service_mutex)
2297 */
2298 down_read(&ipvs->svc_replace_sem);
2299 rcu_read_lock();
2300
2301 ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) {
2302 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
2303 ip_vs_zero_service(svc);
2304 resched_score += 10;
2305 }
2306 resched_score++;
2307 if (resched_score >= 100) {
2308 resched_score = 0;
2309 cond_resched_rcu();
2310 }
2311 }
2312
2313 rcu_read_unlock();
2314 up_read(&ipvs->svc_replace_sem);
2315
2316 ip_vs_zero_stats(&ipvs->tot_stats->s);
2317 return 0;
2318 }
2319
2320 #ifdef CONFIG_SYSCTL
2321
2322 static int
proc_do_defense_mode(const struct ctl_table * table,int write,void * buffer,size_t * lenp,loff_t * ppos)2323 proc_do_defense_mode(const struct ctl_table *table, int write,
2324 void *buffer, size_t *lenp, loff_t *ppos)
2325 {
2326 struct netns_ipvs *ipvs = table->extra2;
2327 int *valp = table->data;
2328 int val = *valp;
2329 int rc;
2330
2331 struct ctl_table tmp = {
2332 .data = &val,
2333 .maxlen = sizeof(int),
2334 .mode = table->mode,
2335 };
2336
2337 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
2338 if (write && (*valp != val)) {
2339 if (val < 0 || val > 3) {
2340 rc = -EINVAL;
2341 } else {
2342 *valp = val;
2343 update_defense_level(ipvs);
2344 }
2345 }
2346 return rc;
2347 }
2348
2349 static int
proc_do_sync_threshold(const struct ctl_table * table,int write,void * buffer,size_t * lenp,loff_t * ppos)2350 proc_do_sync_threshold(const struct ctl_table *table, int write,
2351 void *buffer, size_t *lenp, loff_t *ppos)
2352 {
2353 struct netns_ipvs *ipvs = table->extra2;
2354 int *valp = table->data;
2355 int val[2];
2356 int rc;
2357 struct ctl_table tmp = {
2358 .data = &val,
2359 .maxlen = table->maxlen,
2360 .mode = table->mode,
2361 };
2362
2363 mutex_lock(&ipvs->sync_mutex);
2364 memcpy(val, valp, sizeof(val));
2365 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
2366 if (write) {
2367 if (val[0] < 0 || val[1] < 0 ||
2368 (val[0] >= val[1] && val[1]))
2369 rc = -EINVAL;
2370 else
2371 memcpy(valp, val, sizeof(val));
2372 }
2373 mutex_unlock(&ipvs->sync_mutex);
2374 return rc;
2375 }
2376
2377 static int
proc_do_sync_ports(const struct ctl_table * table,int write,void * buffer,size_t * lenp,loff_t * ppos)2378 proc_do_sync_ports(const struct ctl_table *table, int write,
2379 void *buffer, size_t *lenp, loff_t *ppos)
2380 {
2381 int *valp = table->data;
2382 int val = *valp;
2383 int rc;
2384
2385 struct ctl_table tmp = {
2386 .data = &val,
2387 .maxlen = sizeof(int),
2388 .mode = table->mode,
2389 };
2390
2391 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
2392 if (write && (*valp != val)) {
2393 if (val < 1 || !is_power_of_2(val))
2394 rc = -EINVAL;
2395 else
2396 *valp = val;
2397 }
2398 return rc;
2399 }
2400
ipvs_proc_est_cpumask_set(const struct ctl_table * table,void * buffer)2401 static int ipvs_proc_est_cpumask_set(const struct ctl_table *table,
2402 void *buffer)
2403 {
2404 struct netns_ipvs *ipvs = table->extra2;
2405 cpumask_var_t *valp = table->data;
2406 cpumask_var_t newmask;
2407 int ret;
2408
2409 if (!zalloc_cpumask_var(&newmask, GFP_KERNEL))
2410 return -ENOMEM;
2411
2412 ret = cpulist_parse(buffer, newmask);
2413 if (ret)
2414 goto out;
2415
2416 mutex_lock(&ipvs->est_mutex);
2417
2418 if (!ipvs->est_cpulist_valid) {
2419 if (!zalloc_cpumask_var(valp, GFP_KERNEL)) {
2420 ret = -ENOMEM;
2421 goto unlock;
2422 }
2423 ipvs->est_cpulist_valid = 1;
2424 }
2425 cpumask_and(newmask, newmask, ¤t->cpus_mask);
2426 cpumask_copy(*valp, newmask);
2427 /* est_max_threads may depend on cpulist size */
2428 ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
2429 ipvs->est_calc_phase = 1;
2430 ip_vs_est_reload_start(ipvs, true);
2431
2432 unlock:
2433 mutex_unlock(&ipvs->est_mutex);
2434
2435 out:
2436 free_cpumask_var(newmask);
2437 return ret;
2438 }
2439
/* Format the estimator CPU list into @buffer (at most @size bytes).
 *
 * Prints the stored mask when one was configured, otherwise the
 * HK_TYPE_KTHREAD housekeeping mask. Runs under ipvs->est_mutex.
 * Returns the scnprintf() result.
 */
static int ipvs_proc_est_cpumask_get(const struct ctl_table *table,
				     void *buffer, size_t size)
{
	struct netns_ipvs *ipvs = table->extra2;
	cpumask_var_t *stored = table->data;
	struct cpumask *src;
	int len;

	mutex_lock(&ipvs->est_mutex);

	/* HK_TYPE_KTHREAD cpumask needs RCU protection */
	scoped_guard(rcu) {
		src = ipvs->est_cpulist_valid ? *stored :
		      (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
		len = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(src));
	}

	mutex_unlock(&ipvs->est_mutex);

	return len;
}
2463
/* sysctl handler for est_cpulist.
 *
 * Only accesses at offset 0 with a non-zero length are served; anything
 * else reports zero bytes and succeeds. Writes are handed to
 * ipvs_proc_est_cpumask_set(), reads to ipvs_proc_est_cpumask_get().
 * On success *ppos is advanced by the number of bytes handled.
 */
static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write,
				 void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	/* Ignore both read and write(append) if *ppos not 0 */
	if (*ppos || !*lenp) {
		*lenp = 0;
		return 0;
	}

	if (!write) {
		/* proc_sys_call_handler() allocates 1 byte for terminator */
		ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1);
		if (ret < 0)
			return ret;
		*lenp = ret;
		*ppos += *lenp;
		return 0;
	}

	/* proc_sys_call_handler() appends terminator */
	ret = ipvs_proc_est_cpumask_set(table, buffer);
	if (ret >= 0)
		*ppos += *lenp;
	return ret;
}
2490
/* sysctl handler for est_nice.
 *
 * A successfully parsed write must lie in MIN_NICE..MAX_NICE, otherwise
 * -EINVAL is returned. When the value changes it is stored and an
 * estimator reload is started, both under ipvs->est_mutex.
 */
static int ipvs_proc_est_nice(const struct ctl_table *table, int write,
			      void *buffer, size_t *lenp, loff_t *ppos)
{
	struct netns_ipvs *ipvs = table->extra2;
	int *stored = table->data;
	int nice = *stored;
	struct ctl_table shadow = {
		.data = &nice,
		.maxlen = sizeof(int),
		.mode = table->mode,
	};
	int ret = proc_dointvec(&shadow, write, buffer, lenp, ppos);

	if (!write || ret < 0)
		return ret;

	if (nice < MIN_NICE || nice > MAX_NICE)
		return -EINVAL;

	mutex_lock(&ipvs->est_mutex);
	if (*stored != nice) {
		*stored = nice;
		ip_vs_est_reload_start(ipvs, true);
	}
	mutex_unlock(&ipvs->est_mutex);

	return ret;
}
2520
/* sysctl handler for run_estimation.
 *
 * After a successfully parsed write, a changed value is stored and an
 * estimator reload is started, both under ipvs->est_mutex.
 */
static int ipvs_proc_run_estimation(const struct ctl_table *table, int write,
				    void *buffer, size_t *lenp, loff_t *ppos)
{
	struct netns_ipvs *ipvs = table->extra2;
	int *stored = table->data;
	int req = *stored;
	struct ctl_table shadow = {
		.data = &req,
		.maxlen = sizeof(int),
		.mode = table->mode,
	};
	int ret = proc_dointvec(&shadow, write, buffer, lenp, ppos);

	if (!write || ret < 0)
		return ret;

	mutex_lock(&ipvs->est_mutex);
	if (*stored != req) {
		*stored = req;
		ip_vs_est_reload_start(ipvs, true);
	}
	mutex_unlock(&ipvs->est_mutex);

	return ret;
}
2546
/* sysctl handler for conn_lfactor.
 *
 * A successfully parsed write must lie in -8..8 (-EINVAL otherwise). The
 * value is stored and, if a connection table exists, the connection resize
 * work is scheduled to run immediately.
 */
static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write,
				  void *buffer, size_t *lenp, loff_t *ppos)
{
	struct netns_ipvs *ipvs = table->extra2;
	int *stored = table->data;
	int factor = *stored;
	struct ctl_table shadow = {
		.data = &factor,
		.maxlen = sizeof(int),
	};
	int ret = proc_dointvec(&shadow, write, buffer, lenp, ppos);

	if (!write || ret < 0)
		return ret;

	if (factor < -8 || factor > 8)
		return -EINVAL;

	WRITE_ONCE(*stored, factor);
	if (rcu_access_pointer(ipvs->conn_tab))
		mod_delayed_work(system_unbound_wq,
				 &ipvs->conn_resize_work, 0);
	return ret;
}
2573
/* sysctl handler for svc_lfactor.
 *
 * A successfully parsed write must lie in -8..8 (-EINVAL otherwise). Under
 * ipvs->service_mutex the value is stored and the service resize work is
 * scheduled to run immediately, provided a service table exists, IPVS is
 * enabled and resizing has not been disabled.
 */
static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write,
				 void *buffer, size_t *lenp, loff_t *ppos)
{
	struct netns_ipvs *ipvs = table->extra2;
	int *stored = table->data;
	int factor = *stored;
	struct ctl_table shadow = {
		.data = &factor,
		.maxlen = sizeof(int),
	};
	int ret = proc_dointvec(&shadow, write, buffer, lenp, ppos);

	if (!write || ret < 0)
		return ret;

	if (factor < -8 || factor > 8)
		return -EINVAL;

	mutex_lock(&ipvs->service_mutex);
	WRITE_ONCE(*stored, factor);
	/* Make sure the services are present */
	if (rcu_access_pointer(ipvs->svc_table) && READ_ONCE(ipvs->enable) &&
	    !test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		mod_delayed_work(system_unbound_wq,
				 &ipvs->svc_resize_work, 0);
	mutex_unlock(&ipvs->service_mutex);

	return ret;
}
2606
/*
 *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
 *      Do not change order or insert new entries without
 *      align with netns init in ip_vs_control_net_init()
 *
 *      Only debug_level sets .data here; the other entries leave .data
 *      (and, for the custom handlers, .extra2) unset in this template,
 *      presumably filled in per netns by ip_vs_control_net_init() -
 *      TODO confirm.
 */

static struct ctl_table vs_vars[] = {
	{
		.procname	= "amemthresh",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "am_droprate",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	/* drop_entry, drop_packet and secure_tcp share the defense mode
	 * handler, which accepts values 0..3
	 */
	{
		.procname	= "drop_entry",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_defense_mode,
	},
	{
		.procname	= "drop_packet",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_defense_mode,
	},
#ifdef CONFIG_IP_VS_NFCT
	{
		.procname	= "conntrack",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
#endif
	{
		.procname	= "secure_tcp",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_defense_mode,
	},
	{
		.procname	= "snat_reroute",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		/* limited to 0..1 */
		.procname	= "sync_version",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		/* handler accepts only powers of two >= 1 */
		.procname	= "sync_ports",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_sync_ports,
	},
	{
		.procname	= "sync_persist_mode",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* the only unsigned long entry in the table */
		.procname	= "sync_qlen_max",
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{
		.procname	= "sync_sock_size",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "cache_bypass",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "expire_nodest_conn",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "sloppy_tcp",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "sloppy_sctp",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "expire_quiescent_template",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* sized after the sysctl_sync_threshold field; the handler
		 * treats it as a pair of ints
		 */
		.procname	= "sync_threshold",
		.maxlen		=
			sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
		.mode		= 0644,
		.proc_handler	= proc_do_sync_threshold,
	},
	{
		.procname	= "sync_refresh_period",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* limited to 0..3 */
		.procname	= "sync_retries",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_THREE,
	},
	{
		.procname	= "nat_icmp_send",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "pmtu_disc",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "backup_only",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "conn_reuse_mode",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "schedule_icmp",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "ignore_tunneled",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	/* The estimator knobs below trigger an estimator reload on change */
	{
		.procname	= "run_estimation",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= ipvs_proc_run_estimation,
	},
	{
		.procname	= "est_cpulist",
		.maxlen		= NR_CPUS,	/* unused */
		.mode		= 0644,
		.proc_handler	= ipvs_proc_est_cpulist,
	},
	{
		.procname	= "est_nice",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= ipvs_proc_est_nice,
	},
	/* Load factors: handlers accept -8..8 and kick the table resizer */
	{
		.procname	= "conn_lfactor",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= ipvs_proc_conn_lfactor,
	},
	{
		.procname	= "svc_lfactor",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= ipvs_proc_svc_lfactor,
	},
#ifdef CONFIG_IP_VS_DEBUG
	{
		/* backed by the file-scope sysctl_ip_vs_debug_level */
		.procname	= "debug_level",
		.data		= &sysctl_ip_vs_debug_level,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
#endif
};
2817
2818 #endif
2819
2820 #ifdef CONFIG_PROC_FS
2821
/* Private iterator state of the service listing seq_file */
struct ip_vs_iter {
	struct seq_net_private p;  /* Do not move this, netns depends upon it*/
	struct ip_vs_rht *t;	/* svc_table sampled in ip_vs_info_seq_start() */
	u32 bucket;		/* bucket of the last returned service */
};
2827
2828 /*
2829 * Write the contents of the VS rule table to a PROCfs file.
2830 * (It is kept just for backward compatibility)
2831 */
ip_vs_fwd_name(unsigned int flags)2832 static inline const char *ip_vs_fwd_name(unsigned int flags)
2833 {
2834 switch (flags & IP_VS_CONN_F_FWD_MASK) {
2835 case IP_VS_CONN_F_LOCALNODE:
2836 return "Local";
2837 case IP_VS_CONN_F_TUNNEL:
2838 return "Tunnel";
2839 case IP_VS_CONN_F_DROUTE:
2840 return "Route";
2841 default:
2842 return "Masq";
2843 }
2844 }
2845
2846 /* Do not expect consistent view during add, del and move(table resize).
2847 * We may miss entries and even show duplicates.
2848 */
ip_vs_info_array(struct seq_file * seq,loff_t pos)2849 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
2850 {
2851 struct ip_vs_iter *iter = seq->private;
2852 struct ip_vs_rht *t = iter->t;
2853 struct ip_vs_service *svc;
2854 struct hlist_bl_node *e;
2855 int idx;
2856
2857 if (!t)
2858 return NULL;
2859 for (idx = 0; idx < t->size; idx++) {
2860 hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[idx], s_list) {
2861 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
2862 break;
2863 if (pos-- == 0) {
2864 iter->bucket = idx;
2865 return svc;
2866 }
2867 }
2868 }
2869 return NULL;
2870 }
2871
/* seq_file ->start(): take the RCU read lock, sample the service table
 * and position the iterator. Position 0 is the header token; position N
 * maps to service N - 1.
 */
static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq));
	struct ip_vs_iter *iter = seq->private;

	rcu_read_lock();
	iter->t = rcu_dereference(ipvs->svc_table);
	if (!*pos)
		return SEQ_START_TOKEN;
	return ip_vs_info_array(seq, *pos - 1);
}
2883
2884
/* seq_file ->next(): advance *pos and return the service following @v.
 *
 * After the header token the first service of the table is returned.
 * Otherwise the rest of the current bucket is tried first, then the
 * following buckets. Returns NULL when no table was sampled or no further
 * service is found.
 */
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ip_vs_service *svc;
	struct ip_vs_iter *iter;
	struct hlist_bl_node *e;
	struct ip_vs_rht *t;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip_vs_info_array(seq,0);

	svc = v;
	iter = seq->private;
	t = iter->t;
	if (!t)
		return NULL;

	/* Continue in the bucket of the previously returned service */
	hlist_bl_for_each_entry_continue_rcu(svc, e, s_list) {
		/* Our cursor was moved to new table ? */
		if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
			break;
		return svc;
	}

	/* Bucket exhausted: take the first entry of the next used bucket */
	while (++iter->bucket < t->size) {
		hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[iter->bucket],
					    s_list) {
			if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
				break;
			return svc;
		}
	}
	return NULL;
}
2919
/* seq_file ->stop(): drop the RCU read lock taken in ip_vs_info_seq_start() */
static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2925
2926
/* seq_file ->show(): print the header for the start token, otherwise one
 * service line followed by one line per destination of that service.
 * Always returns 0.
 */
static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = seq_file_net(seq);
	struct netns_ipvs *ipvs = net_ipvs(net);

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq,
			"IP Virtual Server version %d.%d.%d (size=%d)\n",
			NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs));
		seq_puts(seq,
			 "Prot LocalAddress:Port Scheduler Flags\n");
		seq_puts(seq,
			 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
	} else {
		const struct ip_vs_service *svc = v;
		const struct ip_vs_dest *dest;
		struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
		/* A service without scheduler is shown as "none" */
		char *sched_name = sched ? sched->name : "none";

		/* Address based services first, fwmark based ones below */
		if (!svc->fwmark) {
#ifdef CONFIG_IP_VS_IPV6
			/* Note: the IPv6 line has no "ops " flag column */
			if (svc->af == AF_INET6)
				seq_printf(seq, "%s [%pI6]:%04X %s ",
					   ip_vs_proto_name(svc->protocol),
					   &svc->addr.in6,
					   ntohs(svc->port),
					   sched_name);
			else
#endif
				seq_printf(seq, "%s %08X:%04X %s %s ",
					   ip_vs_proto_name(svc->protocol),
					   ntohl(svc->addr.ip),
					   ntohs(svc->port),
					   sched_name,
					   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
		} else {
			seq_printf(seq, "FWM %08X %s %s",
				   svc->fwmark, sched_name,
				   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
		}

		/* Persistence info also terminates the service line */
		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
			seq_printf(seq, "persistent %d %08X\n",
				svc->timeout,
				ntohl(svc->netmask));
		else
			seq_putc(seq, '\n');

		list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
#ifdef CONFIG_IP_VS_IPV6
			if (dest->af == AF_INET6)
				seq_printf(seq,
					   " -> [%pI6]:%04X"
					   " %-7s %-6d %-10d %-10d\n",
					   &dest->addr.in6,
					   ntohs(dest->port),
					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
					   atomic_read(&dest->weight),
					   atomic_read(&dest->activeconns),
					   atomic_read(&dest->inactconns));
			else
#endif
				seq_printf(seq,
					   " -> %08X:%04X "
					   "%-7s %-6d %-10d %-10d\n",
					   ntohl(dest->addr.ip),
					   ntohs(dest->port),
					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
					   atomic_read(&dest->weight),
					   atomic_read(&dest->activeconns),
					   atomic_read(&dest->inactconns));

		}
	}
	return 0;
}
3003
/* seq_file callbacks of the service listing; ->start takes and ->stop
 * releases the RCU read lock.
 */
static const struct seq_operations ip_vs_info_seq_ops = {
	.start = ip_vs_info_seq_start,
	.next  = ip_vs_info_seq_next,
	.stop  = ip_vs_info_seq_stop,
	.show  = ip_vs_info_seq_show,
};
3010
ip_vs_stats_show(struct seq_file * seq,void * v)3011 static int ip_vs_stats_show(struct seq_file *seq, void *v)
3012 {
3013 struct net *net = seq_file_single_net(seq);
3014 struct ip_vs_kstats show;
3015
3016 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
3017 seq_puts(seq,
3018 " Total Incoming Outgoing Incoming Outgoing\n");
3019 seq_puts(seq,
3020 " Conns Packets Packets Bytes Bytes\n");
3021
3022 ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s);
3023 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
3024 (unsigned long long)show.conns,
3025 (unsigned long long)show.inpkts,
3026 (unsigned long long)show.outpkts,
3027 (unsigned long long)show.inbytes,
3028 (unsigned long long)show.outbytes);
3029
3030 /* 01234567 01234567 01234567 0123456701234567 0123456701234567*/
3031 seq_puts(seq,
3032 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
3033 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n",
3034 (unsigned long long)show.cps,
3035 (unsigned long long)show.inpps,
3036 (unsigned long long)show.outpps,
3037 (unsigned long long)show.inbps,
3038 (unsigned long long)show.outbps);
3039
3040 return 0;
3041 }
3042
ip_vs_stats_percpu_show(struct seq_file * seq,void * v)3043 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
3044 {
3045 struct net *net = seq_file_single_net(seq);
3046 struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s;
3047 struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
3048 struct ip_vs_kstats kstats;
3049 int i;
3050
3051 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
3052 seq_puts(seq,
3053 " Total Incoming Outgoing Incoming Outgoing\n");
3054 seq_puts(seq,
3055 "CPU Conns Packets Packets Bytes Bytes\n");
3056
3057 for_each_possible_cpu(i) {
3058 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
3059 unsigned int start;
3060 u64 conns, inpkts, outpkts, inbytes, outbytes;
3061
3062 do {
3063 start = u64_stats_fetch_begin(&u->syncp);
3064 conns = u64_stats_read(&u->cnt.conns);
3065 inpkts = u64_stats_read(&u->cnt.inpkts);
3066 outpkts = u64_stats_read(&u->cnt.outpkts);
3067 inbytes = u64_stats_read(&u->cnt.inbytes);
3068 outbytes = u64_stats_read(&u->cnt.outbytes);
3069 } while (u64_stats_fetch_retry(&u->syncp, start));
3070
3071 seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
3072 i, (u64)conns, (u64)inpkts,
3073 (u64)outpkts, (u64)inbytes,
3074 (u64)outbytes);
3075 }
3076
3077 ip_vs_copy_stats(&kstats, tot_stats);
3078
3079 seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n",
3080 (unsigned long long)kstats.conns,
3081 (unsigned long long)kstats.inpkts,
3082 (unsigned long long)kstats.outpkts,
3083 (unsigned long long)kstats.inbytes,
3084 (unsigned long long)kstats.outbytes);
3085
3086 /* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */
3087 seq_puts(seq,
3088 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
3089 seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n",
3090 kstats.cps,
3091 kstats.inpps,
3092 kstats.outpps,
3093 kstats.inbps,
3094 kstats.outbps);
3095
3096 return 0;
3097 }
3098
/* Single-show seq_file handler reporting table and estimator status.
 *
 * Prints, in order: connection count and connection table geometry, a
 * histogram of connection bucket chain lengths, the same for the service
 * table, and the estimator kthread limits.
 *
 * Chain lengths are capped at ARRAY_SIZE(counts) - 1, so the last
 * histogram slot also collects all longer chains. The "empty" percentage
 * is relative to all buckets, the "len-N" percentages to the non-empty
 * ones. Always returns 0.
 */
static int ip_vs_status_show(struct seq_file *seq, void *v)
{
	struct net *net = seq_file_single_net(seq);
	struct netns_ipvs *ipvs = net_ipvs(net);
	unsigned int resched_score = 0;
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct ip_vs_rht *t, *pt;
	struct hlist_bl_node *e;
	int old_gen, new_gen;
	u32 counts[8];
	u32 bucket;
	u32 count;
	int loops;
	u32 sum1;
	u32 sum;
	int i;

	/* Info for conns */
	rcu_read_lock();

	t = rcu_dereference(ipvs->conn_tab);

	seq_printf(seq, "Conns:\t%d\n", atomic_read(&ipvs->conn_count));
	seq_printf(seq, "Conn buckets:\t%d (%d bits, lfactor %d)\n",
		   t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0);

	/* No connections: skip the histogram */
	if (!atomic_read(&ipvs->conn_count))
		goto after_conns;
	old_gen = atomic_read(&ipvs->conn_tab_changes);
	loops = 0;

repeat_conn:
	smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */
	memset(counts, 0, sizeof(counts));
	ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) {
		for (bucket = 0; bucket < t->size; bucket++) {
			DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();

			count = 0;
			resched_score++;
			ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
				count = 0;
				hlist_bl_for_each_entry_rcu(hn, e, head, node) {
					count++;
					/* Cap so count stays a valid index */
					if (count >= ARRAY_SIZE(counts) - 1)
						break;
				}
			}
			resched_score += count;
			if (resched_score >= 100) {
				resched_score = 0;
				cond_resched_rcu();
				new_gen = atomic_read(&ipvs->conn_tab_changes);
				/* New table installed ? */
				if (old_gen != new_gen) {
					/* Too many changes? */
					if (++loops >= 5)
						goto after_conns;
					/* Restart the histogram from scratch */
					old_gen = new_gen;
					goto repeat_conn;
				}
			}
			counts[count]++;
		}
	}
	for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
		sum += counts[i];
	/* sum1: number of non-empty buckets */
	sum1 = sum - counts[0];
	seq_printf(seq, "Conn buckets empty:\t%u (%llu%%)\n",
		   counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U)));
	for (i = 1; i < ARRAY_SIZE(counts); i++) {
		if (!counts[i])
			continue;
		seq_printf(seq, "Conn buckets len-%d:\t%u (%llu%%)\n",
			   i, counts[i],
			   div_u64((u64)counts[i] * 100U, max(sum1, 1U)));
	}

after_conns:
	rcu_read_unlock();

	/* Info for services */
	down_read(&ipvs->svc_replace_sem);
	rcu_read_lock();

	t = rcu_dereference(ipvs->svc_table);

	count = ip_vs_get_num_services(ipvs);
	seq_printf(seq, "Services:\t%u\n", count);
	seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n",
		   t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0);

	/* No services: skip the histogram */
	if (!count)
		goto after_svc;
	old_gen = atomic_read(&ipvs->svc_table_changes);

	smp_rmb(); /* ipvs->svc_table and svc_table_changes */
	memset(counts, 0, sizeof(counts));
	ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) {
		for (bucket = 0; bucket < t->size; bucket++) {
			DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();

			count = 0;
			resched_score++;
			ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
				count = 0;
				hlist_bl_for_each_entry_rcu(svc, e, head,
							    s_list) {
					count++;
					/* Cap so count stays a valid index */
					if (count >= ARRAY_SIZE(counts) - 1)
						break;
				}
			}
			resched_score += count;
			if (resched_score >= 100) {
				resched_score = 0;
				cond_resched_rcu();
				/* Flushed? */
				if (atomic_read(&ipvs->svc_table_changes) !=
				    old_gen)
					goto after_svc;
			}
			counts[count]++;
		}
	}
	for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
		sum += counts[i];
	/* sum1: number of non-empty buckets */
	sum1 = sum - counts[0];
	seq_printf(seq, "Service buckets empty:\t%u (%llu%%)\n",
		   counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U)));
	for (i = 1; i < ARRAY_SIZE(counts); i++) {
		if (!counts[i])
			continue;
		seq_printf(seq, "Service buckets len-%d:\t%u (%llu%%)\n",
			   i, counts[i],
			   div_u64((u64)counts[i] * 100U, max(sum1, 1U)));
	}

after_svc:
	rcu_read_unlock();
	up_read(&ipvs->svc_replace_sem);

	/* Info for the estimators */
	seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n",
		   ipvs->est_kt_count, ipvs->est_max_threads);
	seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max);
	seq_printf(seq, "Stats thread ests:\t%d\n",
		   ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR *
		   IPVS_EST_NTICKS);

	return 0;
}
3252
3253 #endif
3254
3255 /*
3256 * Set timeout values for tcp tcpfin udp in the timeout_table.
3257 */
ip_vs_set_timeout(struct netns_ipvs * ipvs,struct ip_vs_timeout_user * u)3258 static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
3259 {
3260 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
3261 struct ip_vs_proto_data *pd;
3262 #endif
3263
3264 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
3265 u->tcp_timeout,
3266 u->tcp_fin_timeout,
3267 u->udp_timeout);
3268
3269 #ifdef CONFIG_IP_VS_PROTO_TCP
3270 if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) ||
3271 u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) {
3272 return -EINVAL;
3273 }
3274 #endif
3275
3276 #ifdef CONFIG_IP_VS_PROTO_UDP
3277 if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ))
3278 return -EINVAL;
3279 #endif
3280
3281 #ifdef CONFIG_IP_VS_PROTO_TCP
3282 if (u->tcp_timeout) {
3283 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
3284 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
3285 = u->tcp_timeout * HZ;
3286 }
3287
3288 if (u->tcp_fin_timeout) {
3289 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
3290 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
3291 = u->tcp_fin_timeout * HZ;
3292 }
3293 #endif
3294
3295 #ifdef CONFIG_IP_VS_PROTO_UDP
3296 if (u->udp_timeout) {
3297 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
3298 pd->timeout_table[IP_VS_UDP_S_NORMAL]
3299 = u->udp_timeout * HZ;
3300 }
3301 #endif
3302 return 0;
3303 }
3304
/* Zero-based index of a sockopt command, relative to IP_VS_BASE_CTL.
 * The argument is parenthesized so that compound expressions expand safely.
 */
#define CMDID(cmd)	((cmd) - IP_VS_BASE_CTL)
3306
/* Argument layout of the *DEST set commands (see set_arglen): a service
 * description immediately followed by a destination description.
 */
struct ip_vs_svcdest_user {
	struct ip_vs_service_user s;	/* service the command refers to */
	struct ip_vs_dest_user d;	/* destination to add/edit/delete */
};
3311
/* Exact argument length expected for each set command, indexed by CMDID().
 * Commands without an entry here get length 0. do_ip_vs_set_ctl() rejects
 * a request whose length differs from the value in this table.
 * Entries are unsigned char, hence the BUILD_BUG_ON(sizeof(arg) > 255)
 * in do_ip_vs_set_ctl().
 */
static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = {
	[CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user),
	[CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user),
	[CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user),
	[CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user),
};
3324
/* Never instantiated: this union only exists so that its size is the
 * largest argument size of any set command. It has one member per entry
 * of set_arglen, with the same type.
 */
union ip_vs_set_arglen {
	struct ip_vs_service_user field_IP_VS_SO_SET_ADD;
	struct ip_vs_service_user field_IP_VS_SO_SET_EDIT;
	struct ip_vs_service_user field_IP_VS_SO_SET_DEL;
	struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST;
	struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST;
	struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST;
	struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT;
	struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON;
	struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON;
	struct ip_vs_service_user field_IP_VS_SO_SET_ZERO;
};

/* Size of the on-stack argument buffer used by do_ip_vs_set_ctl() */
#define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen)
3339
ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern * usvc,struct ip_vs_service_user * usvc_compat)3340 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
3341 struct ip_vs_service_user *usvc_compat)
3342 {
3343 memset(usvc, 0, sizeof(*usvc));
3344
3345 usvc->af = AF_INET;
3346 usvc->protocol = usvc_compat->protocol;
3347 usvc->addr.ip = usvc_compat->addr;
3348 usvc->port = usvc_compat->port;
3349 usvc->fwmark = usvc_compat->fwmark;
3350
3351 /* Deep copy of sched_name is not needed here */
3352 usvc->sched_name = usvc_compat->sched_name;
3353
3354 usvc->flags = usvc_compat->flags;
3355 usvc->timeout = usvc_compat->timeout;
3356 usvc->netmask = usvc_compat->netmask;
3357 }
3358
ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern * udest,struct ip_vs_dest_user * udest_compat)3359 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
3360 struct ip_vs_dest_user *udest_compat)
3361 {
3362 memset(udest, 0, sizeof(*udest));
3363
3364 udest->addr.ip = udest_compat->addr;
3365 udest->port = udest_compat->port;
3366 udest->conn_flags = udest_compat->conn_flags;
3367 udest->weight = udest_compat->weight;
3368 udest->u_threshold = udest_compat->u_threshold;
3369 udest->l_threshold = udest_compat->l_threshold;
3370 udest->af = AF_INET;
3371 udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP;
3372 }
3373
3374 static int
do_ip_vs_set_ctl(struct sock * sk,int cmd,sockptr_t ptr,unsigned int len)3375 do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len)
3376 {
3377 struct net *net = sock_net(sk);
3378 int ret;
3379 unsigned char arg[MAX_SET_ARGLEN];
3380 struct ip_vs_service_user *usvc_compat;
3381 struct ip_vs_service_user_kern usvc;
3382 struct ip_vs_service *svc;
3383 struct ip_vs_dest_user *udest_compat;
3384 struct ip_vs_dest_user_kern udest;
3385 struct netns_ipvs *ipvs = net_ipvs(net);
3386
3387 BUILD_BUG_ON(sizeof(arg) > 255);
3388 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3389 return -EPERM;
3390
3391 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
3392 return -EINVAL;
3393 if (len != set_arglen[CMDID(cmd)]) {
3394 IP_VS_DBG(1, "set_ctl: len %u != %u\n",
3395 len, set_arglen[CMDID(cmd)]);
3396 return -EINVAL;
3397 }
3398
3399 if (copy_from_sockptr(arg, ptr, len) != 0)
3400 return -EFAULT;
3401
3402 /* Handle daemons since they have another lock */
3403 if (cmd == IP_VS_SO_SET_STARTDAEMON ||
3404 cmd == IP_VS_SO_SET_STOPDAEMON) {
3405 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
3406
3407 if (cmd == IP_VS_SO_SET_STARTDAEMON) {
3408 struct ipvs_sync_daemon_cfg cfg;
3409
3410 memset(&cfg, 0, sizeof(cfg));
3411 ret = -EINVAL;
3412 if (strscpy(cfg.mcast_ifn, dm->mcast_ifn,
3413 sizeof(cfg.mcast_ifn)) <= 0)
3414 return ret;
3415 cfg.syncid = dm->syncid;
3416 ret = start_sync_thread(ipvs, &cfg, dm->state);
3417 } else {
3418 ret = stop_sync_thread(ipvs, dm->state);
3419 }
3420 return ret;
3421 }
3422
3423 mutex_lock(&ipvs->service_mutex);
3424 if (cmd == IP_VS_SO_SET_FLUSH) {
3425 /* Flush the virtual service */
3426 ret = ip_vs_flush(ipvs, false);
3427 goto out_unlock;
3428 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
3429 /* Set timeout values for (tcp tcpfin udp) */
3430 ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
3431 goto out_unlock;
3432 } else if (!len) {
3433 /* No more commands with len == 0 below */
3434 ret = -EINVAL;
3435 goto out_unlock;
3436 }
3437
3438 usvc_compat = (struct ip_vs_service_user *)arg;
3439 udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
3440
3441 /* We only use the new structs internally, so copy userspace compat
3442 * structs to extended internal versions */
3443 ip_vs_copy_usvc_compat(&usvc, usvc_compat);
3444 ip_vs_copy_udest_compat(&udest, udest_compat);
3445
3446 if (cmd == IP_VS_SO_SET_ZERO) {
3447 /* if no service address is set, zero counters in all */
3448 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
3449 ret = ip_vs_zero_all(ipvs);
3450 goto out_unlock;
3451 }
3452 }
3453
3454 if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) &&
3455 strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) ==
3456 IP_VS_SCHEDNAME_MAXLEN) {
3457 ret = -EINVAL;
3458 goto out_unlock;
3459 }
3460
3461 /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
3462 if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
3463 usvc.protocol != IPPROTO_SCTP) {
3464 pr_err("set_ctl: invalid protocol: %d %pI4:%d\n",
3465 usvc.protocol, &usvc.addr.ip,
3466 ntohs(usvc.port));
3467 ret = -EFAULT;
3468 goto out_unlock;
3469 }
3470
3471 /* Lookup the exact service by <protocol, addr, port> or fwmark */
3472 rcu_read_lock();
3473 if (usvc.fwmark == 0)
3474 svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol,
3475 &usvc.addr, usvc.port);
3476 else
3477 svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark);
3478 rcu_read_unlock();
3479
3480 if (cmd != IP_VS_SO_SET_ADD
3481 && (svc == NULL || svc->protocol != usvc.protocol)) {
3482 ret = -ESRCH;
3483 goto out_unlock;
3484 }
3485
3486 switch (cmd) {
3487 case IP_VS_SO_SET_ADD:
3488 if (svc != NULL)
3489 ret = -EEXIST;
3490 else
3491 ret = ip_vs_add_service(ipvs, &usvc, &svc);
3492 break;
3493 case IP_VS_SO_SET_EDIT:
3494 ret = ip_vs_edit_service(svc, &usvc);
3495 break;
3496 case IP_VS_SO_SET_DEL:
3497 ret = ip_vs_del_service(svc);
3498 if (!ret)
3499 goto out_unlock;
3500 break;
3501 case IP_VS_SO_SET_ZERO:
3502 ret = ip_vs_zero_service(svc);
3503 break;
3504 case IP_VS_SO_SET_ADDDEST:
3505 ret = ip_vs_add_dest(svc, &udest);
3506 break;
3507 case IP_VS_SO_SET_EDITDEST:
3508 ret = ip_vs_edit_dest(svc, &udest);
3509 break;
3510 case IP_VS_SO_SET_DELDEST:
3511 ret = ip_vs_del_dest(svc, &udest);
3512 break;
3513 default:
3514 WARN_ON_ONCE(1);
3515 ret = -EINVAL;
3516 break;
3517 }
3518
3519 out_unlock:
3520 mutex_unlock(&ipvs->service_mutex);
3521 return ret;
3522 }
3523
3524
3525 static void
ip_vs_copy_service(struct ip_vs_service_entry * dst,struct ip_vs_service * src)3526 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
3527 {
3528 struct ip_vs_scheduler *sched;
3529 struct ip_vs_kstats kstats;
3530 char *sched_name;
3531
3532 sched = rcu_dereference_protected(src->scheduler, 1);
3533 sched_name = sched ? sched->name : "none";
3534 dst->protocol = src->protocol;
3535 dst->addr = src->addr.ip;
3536 dst->port = src->port;
3537 dst->fwmark = src->fwmark;
3538 strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
3539 dst->flags = src->flags;
3540 dst->timeout = src->timeout / HZ;
3541 dst->netmask = src->netmask;
3542 dst->num_dests = src->num_dests;
3543 ip_vs_copy_stats(&kstats, &src->stats);
3544 ip_vs_export_stats_user(&dst->stats, &kstats);
3545 }
3546
3547 static inline int
__ip_vs_get_service_entries(struct netns_ipvs * ipvs,const struct ip_vs_get_services * get,struct ip_vs_get_services __user * uptr)3548 __ip_vs_get_service_entries(struct netns_ipvs *ipvs,
3549 const struct ip_vs_get_services *get,
3550 struct ip_vs_get_services __user *uptr)
3551 {
3552 struct ip_vs_service_entry entry;
3553 DECLARE_IP_VS_RHT_WALK_BUCKETS();
3554 struct hlist_bl_head *head;
3555 struct ip_vs_service *svc;
3556 struct hlist_bl_node *e;
3557 int count = 0;
3558 int ret = 0;
3559
3560 lockdep_assert_held(&ipvs->svc_resize_sem);
3561 /* All svc_table modifications are disabled, go ahead */
3562 ip_vs_rht_walk_buckets(ipvs->svc_table, head) {
3563 hlist_bl_for_each_entry(svc, e, head, s_list) {
3564 /* Only expose IPv4 entries to old interface */
3565 if (svc->af != AF_INET)
3566 continue;
3567
3568 if (count >= get->num_services)
3569 goto out;
3570 memset(&entry, 0, sizeof(entry));
3571 ip_vs_copy_service(&entry, svc);
3572 if (copy_to_user(&uptr->entrytable[count],
3573 &entry, sizeof(entry))) {
3574 ret = -EFAULT;
3575 goto out;
3576 }
3577 count++;
3578 }
3579 }
3580
3581 out:
3582 return ret;
3583 }
3584
3585 static inline int
__ip_vs_get_dest_entries(struct netns_ipvs * ipvs,const struct ip_vs_get_dests * get,struct ip_vs_get_dests __user * uptr)3586 __ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get,
3587 struct ip_vs_get_dests __user *uptr)
3588 {
3589 struct ip_vs_service *svc;
3590 union nf_inet_addr addr = { .ip = get->addr };
3591 int ret = 0;
3592
3593 rcu_read_lock();
3594 if (get->fwmark)
3595 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark);
3596 else
3597 svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr,
3598 get->port);
3599 rcu_read_unlock();
3600
3601 if (svc) {
3602 int count = 0;
3603 struct ip_vs_dest *dest;
3604 struct ip_vs_dest_entry entry;
3605 struct ip_vs_kstats kstats;
3606
3607 memset(&entry, 0, sizeof(entry));
3608 list_for_each_entry(dest, &svc->destinations, n_list) {
3609 if (count >= get->num_dests)
3610 break;
3611
3612 /* Cannot expose heterogeneous members via sockopt
3613 * interface
3614 */
3615 if (dest->af != svc->af)
3616 continue;
3617
3618 entry.addr = dest->addr.ip;
3619 entry.port = dest->port;
3620 entry.conn_flags = atomic_read(&dest->conn_flags);
3621 entry.weight = atomic_read(&dest->weight);
3622 entry.u_threshold = dest->u_threshold;
3623 entry.l_threshold = dest->l_threshold;
3624 entry.activeconns = atomic_read(&dest->activeconns);
3625 entry.inactconns = atomic_read(&dest->inactconns);
3626 entry.persistconns = atomic_read(&dest->persistconns);
3627 ip_vs_copy_stats(&kstats, &dest->stats);
3628 ip_vs_export_stats_user(&entry.stats, &kstats);
3629 if (copy_to_user(&uptr->entrytable[count],
3630 &entry, sizeof(entry))) {
3631 ret = -EFAULT;
3632 break;
3633 }
3634 count++;
3635 }
3636 } else
3637 ret = -ESRCH;
3638 return ret;
3639 }
3640
3641 static inline void
__ip_vs_get_timeouts(struct netns_ipvs * ipvs,struct ip_vs_timeout_user * u)3642 __ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
3643 {
3644 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
3645 struct ip_vs_proto_data *pd;
3646 #endif
3647
3648 memset(u, 0, sizeof (*u));
3649
3650 #ifdef CONFIG_IP_VS_PROTO_TCP
3651 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
3652 u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
3653 u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
3654 #endif
3655 #ifdef CONFIG_IP_VS_PROTO_UDP
3656 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
3657 u->udp_timeout =
3658 pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
3659 #endif
3660 }
3661
/* Minimum user buffer length for each get command, indexed by CMDID().
 * do_ip_vs_get_ctl() copies exactly this many bytes in from userspace and
 * rejects a shorter buffer. The SERVICES and DESTS commands additionally
 * require room for their variable-length entry table, which is checked
 * separately in do_ip_vs_get_ctl().
 */
static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = {
	[CMDID(IP_VS_SO_GET_VERSION)] = 64,	/* size of the version string buffer */
	[CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo),
	[CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services),
	[CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry),
	[CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests),
	[CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user),
	/* one slot for the master state, one for the backup state */
	[CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user),
};
3671
/* Never instantiated: this union only exists so that its size is the
 * largest fixed argument size of any get command. It has one member per
 * entry of get_arglen, with the matching type/size.
 */
union ip_vs_get_arglen {
	char field_IP_VS_SO_GET_VERSION[64];
	struct ip_vs_getinfo field_IP_VS_SO_GET_INFO;
	struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES;
	struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE;
	struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS;
	struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT;
	struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2];
};

/* Size of the on-stack argument buffer used by do_ip_vs_get_ctl() */
#define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen)
3683
3684 static int
do_ip_vs_get_ctl(struct sock * sk,int cmd,void __user * user,int * len)3685 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
3686 {
3687 unsigned char arg[MAX_GET_ARGLEN];
3688 int ret = 0;
3689 unsigned int copylen;
3690 struct net *net = sock_net(sk);
3691 struct netns_ipvs *ipvs = net_ipvs(net);
3692
3693 BUG_ON(!net);
3694 BUILD_BUG_ON(sizeof(arg) > 255);
3695 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3696 return -EPERM;
3697
3698 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
3699 return -EINVAL;
3700
3701 copylen = get_arglen[CMDID(cmd)];
3702 if (*len < (int) copylen) {
3703 IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen);
3704 return -EINVAL;
3705 }
3706
3707 if (copy_from_user(arg, user, copylen) != 0)
3708 return -EFAULT;
3709 /*
3710 * Handle daemons first since it has its own locking
3711 */
3712 if (cmd == IP_VS_SO_GET_DAEMON) {
3713 struct ip_vs_daemon_user d[2];
3714
3715 memset(&d, 0, sizeof(d));
3716 mutex_lock(&ipvs->sync_mutex);
3717 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
3718 d[0].state = IP_VS_STATE_MASTER;
3719 strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
3720 sizeof(d[0].mcast_ifn));
3721 d[0].syncid = ipvs->mcfg.syncid;
3722 }
3723 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
3724 d[1].state = IP_VS_STATE_BACKUP;
3725 strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
3726 sizeof(d[1].mcast_ifn));
3727 d[1].syncid = ipvs->bcfg.syncid;
3728 }
3729 if (copy_to_user(user, &d, sizeof(d)) != 0)
3730 ret = -EFAULT;
3731 mutex_unlock(&ipvs->sync_mutex);
3732 return ret;
3733 }
3734
3735 if (cmd == IP_VS_SO_GET_SERVICES) {
3736 struct ip_vs_get_services *get;
3737 size_t size;
3738
3739 get = (struct ip_vs_get_services *)arg;
3740 size = struct_size(get, entrytable, get->num_services);
3741 if (*len != size) {
3742 pr_err("length: %u != %zu\n", *len, size);
3743 return -EINVAL;
3744 }
3745 /* Prevent modifications to the list with services.
3746 * Try reverse locking, so that we do not hold the mutex
3747 * while waiting for semaphore.
3748 */
3749 while (1) {
3750 ret = down_read_killable(&ipvs->svc_resize_sem);
3751 if (ret < 0)
3752 return ret;
3753 if (mutex_trylock(&ipvs->service_mutex))
3754 break;
3755 up_read(&ipvs->svc_resize_sem);
3756 cond_resched();
3757 }
3758 ret = __ip_vs_get_service_entries(ipvs, get, user);
3759 up_read(&ipvs->svc_resize_sem);
3760 mutex_unlock(&ipvs->service_mutex);
3761 return ret;
3762 }
3763
3764 mutex_lock(&ipvs->service_mutex);
3765 switch (cmd) {
3766 case IP_VS_SO_GET_VERSION:
3767 {
3768 char buf[64];
3769
3770 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
3771 NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs));
3772 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
3773 ret = -EFAULT;
3774 goto out;
3775 }
3776 *len = strlen(buf)+1;
3777 }
3778 break;
3779
3780 case IP_VS_SO_GET_INFO:
3781 {
3782 struct ip_vs_getinfo info;
3783
3784 info.version = IP_VS_VERSION_CODE;
3785 info.size = get_conn_tab_size(ipvs);
3786 info.num_services =
3787 atomic_read(&ipvs->num_services[IP_VS_AF_INET]);
3788 if (copy_to_user(user, &info, sizeof(info)) != 0)
3789 ret = -EFAULT;
3790 }
3791 break;
3792
3793 case IP_VS_SO_GET_SERVICE:
3794 {
3795 struct ip_vs_service_entry *entry;
3796 struct ip_vs_service *svc;
3797 union nf_inet_addr addr;
3798
3799 entry = (struct ip_vs_service_entry *)arg;
3800 addr.ip = entry->addr;
3801 rcu_read_lock();
3802 if (entry->fwmark)
3803 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark);
3804 else
3805 svc = __ip_vs_service_find(ipvs, AF_INET,
3806 entry->protocol, &addr,
3807 entry->port);
3808 rcu_read_unlock();
3809 if (svc) {
3810 ip_vs_copy_service(entry, svc);
3811 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
3812 ret = -EFAULT;
3813 } else
3814 ret = -ESRCH;
3815 }
3816 break;
3817
3818 case IP_VS_SO_GET_DESTS:
3819 {
3820 struct ip_vs_get_dests *get;
3821 size_t size;
3822
3823 get = (struct ip_vs_get_dests *)arg;
3824 size = struct_size(get, entrytable, get->num_dests);
3825 if (*len != size) {
3826 pr_err("length: %u != %zu\n", *len, size);
3827 ret = -EINVAL;
3828 goto out;
3829 }
3830 ret = __ip_vs_get_dest_entries(ipvs, get, user);
3831 }
3832 break;
3833
3834 case IP_VS_SO_GET_TIMEOUT:
3835 {
3836 struct ip_vs_timeout_user t;
3837
3838 __ip_vs_get_timeouts(ipvs, &t);
3839 if (copy_to_user(user, &t, sizeof(t)) != 0)
3840 ret = -EFAULT;
3841 }
3842 break;
3843
3844 default:
3845 ret = -EINVAL;
3846 }
3847
3848 out:
3849 mutex_unlock(&ipvs->service_mutex);
3850 return ret;
3851 }
3852
3853
/* Netfilter sockopt registration data for the PF_INET IPVS commands.
 * Both ranges start at IP_VS_BASE_CTL; the optmax values are set one past
 * the highest command number (IP_VS_SO_SET_MAX / IP_VS_SO_GET_MAX).
 */
static struct nf_sockopt_ops ip_vs_sockopts = {
	.pf = PF_INET,
	.set_optmin = IP_VS_BASE_CTL,
	.set_optmax = IP_VS_SO_SET_MAX+1,
	.set = do_ip_vs_set_ctl,
	.get_optmin = IP_VS_BASE_CTL,
	.get_optmax = IP_VS_SO_GET_MAX+1,
	.get = do_ip_vs_get_ctl,
	.owner = THIS_MODULE,
};
3864
3865 /*
3866 * Generic Netlink interface
3867 */
3868
3869 /* IPVS genetlink family */
3870 static struct genl_family ip_vs_genl_family;
3871
/* Policy used for first-level command attributes:
 * three nested containers (service, destination, daemon) and the three
 * timeout values, each carried as a u32.
 */
static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
	[IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 },
	[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
	[IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 },
};
3881
/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
	[IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 },
	/* NUL-terminated interface name, limited to IP_VS_IFNAME_MAXLEN - 1 */
	[IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING,
					  .len = IP_VS_IFNAME_MAXLEN - 1 },
	[IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 },
	[IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 },
	[IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 },
	/* no .type given here, only the size of an IPv6 address as .len */
	[IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) },
	[IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 },
	[IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 },
};
3894
/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
	[IPVS_SVC_ATTR_AF] = { .type = NLA_U16 },
	[IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 },
	/* binary blob limited to the size of union nf_inet_addr */
	[IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY,
				 .len = sizeof(union nf_inet_addr) },
	[IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 },
	[IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 },
	[IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING,
				       .len = IP_VS_SCHEDNAME_MAXLEN - 1 },
	/* NOTE(review): unlike SCHED_NAME, the limit here has no "- 1" --
	 * confirm this difference is intended
	 */
	[IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING,
				    .len = IP_VS_PENAME_MAXLEN },
	/* binary blob limited to the size of struct ip_vs_flags */
	[IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY,
				  .len = sizeof(struct ip_vs_flags) },
	[IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 },
	[IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 },
	[IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED },
};
3913
/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
	/* binary blob limited to the size of union nf_inet_addr */
	[IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY,
				  .len = sizeof(union nf_inet_addr) },
	[IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 },
	[IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 },
	[IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 },
	[IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 },
	[IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 },
	[IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 },
	[IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 },
	[IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
	[IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
	[IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 },
	/* tunnel parameters */
	[IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 },
	[IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 },
	[IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 },
};
3932
ip_vs_genl_fill_stats(struct sk_buff * skb,int container_type,struct ip_vs_kstats * kstats)3933 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
3934 struct ip_vs_kstats *kstats)
3935 {
3936 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type);
3937
3938 if (!nl_stats)
3939 return -EMSGSIZE;
3940
3941 if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) ||
3942 nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) ||
3943 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) ||
3944 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
3945 IPVS_STATS_ATTR_PAD) ||
3946 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
3947 IPVS_STATS_ATTR_PAD) ||
3948 nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) ||
3949 nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) ||
3950 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) ||
3951 nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) ||
3952 nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps))
3953 goto nla_put_failure;
3954 nla_nest_end(skb, nl_stats);
3955
3956 return 0;
3957
3958 nla_put_failure:
3959 nla_nest_cancel(skb, nl_stats);
3960 return -EMSGSIZE;
3961 }
3962
ip_vs_genl_fill_stats64(struct sk_buff * skb,int container_type,struct ip_vs_kstats * kstats)3963 static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
3964 struct ip_vs_kstats *kstats)
3965 {
3966 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type);
3967
3968 if (!nl_stats)
3969 return -EMSGSIZE;
3970
3971 if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns,
3972 IPVS_STATS_ATTR_PAD) ||
3973 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts,
3974 IPVS_STATS_ATTR_PAD) ||
3975 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts,
3976 IPVS_STATS_ATTR_PAD) ||
3977 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
3978 IPVS_STATS_ATTR_PAD) ||
3979 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
3980 IPVS_STATS_ATTR_PAD) ||
3981 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps,
3982 IPVS_STATS_ATTR_PAD) ||
3983 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps,
3984 IPVS_STATS_ATTR_PAD) ||
3985 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps,
3986 IPVS_STATS_ATTR_PAD) ||
3987 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps,
3988 IPVS_STATS_ATTR_PAD) ||
3989 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps,
3990 IPVS_STATS_ATTR_PAD))
3991 goto nla_put_failure;
3992 nla_nest_end(skb, nl_stats);
3993
3994 return 0;
3995
3996 nla_put_failure:
3997 nla_nest_cancel(skb, nl_stats);
3998 return -EMSGSIZE;
3999 }
4000
ip_vs_genl_fill_service(struct sk_buff * skb,struct ip_vs_service * svc)4001 static int ip_vs_genl_fill_service(struct sk_buff *skb,
4002 struct ip_vs_service *svc)
4003 {
4004 struct ip_vs_scheduler *sched;
4005 struct ip_vs_pe *pe;
4006 struct nlattr *nl_service;
4007 struct ip_vs_flags flags = { .flags = svc->flags,
4008 .mask = ~0 };
4009 struct ip_vs_kstats kstats;
4010 char *sched_name;
4011
4012 nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
4013 if (!nl_service)
4014 return -EMSGSIZE;
4015
4016 if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
4017 goto nla_put_failure;
4018 if (svc->fwmark) {
4019 if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
4020 goto nla_put_failure;
4021 } else {
4022 if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
4023 nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
4024 nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
4025 goto nla_put_failure;
4026 }
4027
4028 sched = rcu_dereference(svc->scheduler);
4029 sched_name = sched ? sched->name : "none";
4030 pe = rcu_dereference(svc->pe);
4031 if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) ||
4032 (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
4033 nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
4034 nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
4035 nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
4036 goto nla_put_failure;
4037 ip_vs_copy_stats(&kstats, &svc->stats);
4038 if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
4039 goto nla_put_failure;
4040 if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
4041 goto nla_put_failure;
4042
4043 nla_nest_end(skb, nl_service);
4044
4045 return 0;
4046
4047 nla_put_failure:
4048 nla_nest_cancel(skb, nl_service);
4049 return -EMSGSIZE;
4050 }
4051
ip_vs_genl_dump_service(struct sk_buff * skb,struct ip_vs_service * svc,struct netlink_callback * cb)4052 static int ip_vs_genl_dump_service(struct sk_buff *skb,
4053 struct ip_vs_service *svc,
4054 struct netlink_callback *cb)
4055 {
4056 void *hdr;
4057
4058 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
4059 &ip_vs_genl_family, NLM_F_MULTI,
4060 IPVS_CMD_NEW_SERVICE);
4061 if (!hdr)
4062 return -EMSGSIZE;
4063
4064 if (ip_vs_genl_fill_service(skb, svc) < 0)
4065 goto nla_put_failure;
4066
4067 genlmsg_end(skb, hdr);
4068 return 0;
4069
4070 nla_put_failure:
4071 genlmsg_cancel(skb, hdr);
4072 return -EMSGSIZE;
4073 }
4074
ip_vs_genl_dump_services(struct sk_buff * skb,struct netlink_callback * cb)4075 static int ip_vs_genl_dump_services(struct sk_buff *skb,
4076 struct netlink_callback *cb)
4077 {
4078 DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU();
4079 struct net *net = sock_net(skb->sk);
4080 struct netns_ipvs *ipvs = net_ipvs(net);
4081 struct hlist_bl_head *head;
4082 struct ip_vs_service *svc;
4083 struct hlist_bl_node *e;
4084 int start = cb->args[0];
4085 int idx = 0;
4086
4087 /* Make sure we do not see same service twice during resize */
4088 down_read(&ipvs->svc_resize_sem);
4089 rcu_read_lock();
4090 ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) {
4091 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
4092 if (++idx <= start)
4093 continue;
4094 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
4095 idx--;
4096 goto nla_put_failure;
4097 }
4098 }
4099 }
4100
4101 nla_put_failure:
4102 rcu_read_unlock();
4103 up_read(&ipvs->svc_resize_sem);
4104 cb->args[0] = idx;
4105
4106 return skb->len;
4107 }
4108
/*
 *	Tell whether @af is an address family services may use: AF_INET
 *	always, AF_INET6 only when CONFIG_IP_VS_IPV6 is set and
 *	ipv6_mod_enabled() reports true.
 */
static bool ip_vs_is_af_valid(int af)
{
	switch (af) {
	case AF_INET:
		return true;
#ifdef CONFIG_IP_VS_IPV6
	case AF_INET6:
		return ipv6_mod_enabled();
#endif
	default:
		return false;
	}
}
4119
/*
 *	Parse a nested IPVS_CMD_ATTR_SERVICE attribute into @usvc and look up
 *	the service it identifies.
 *
 *	@nla:        the nested service attribute, may be NULL (-EINVAL)
 *	@full_entry: when true, scheduler name, flags, timeout and netmask
 *	             are required as well and copied into @usvc
 *	@ret_svc:    receives the matching service or NULL; it is written
 *	             only after the identifying fields were accepted
 *
 *	The address family is mandatory. Beyond that a service is identified
 *	either by a fwmark attribute or by protocol + address + port. When a
 *	fwmark attribute is present, the protocol in @usvc is set to
 *	IPPROTO_TCP.
 *
 *	usvc->sched_name and usvc->pe_name point into the attribute payload,
 *	they are not copied.
 *
 *	Returns 0 on success, -EINVAL for missing or malformed attributes and
 *	-EAFNOSUPPORT for an address family ip_vs_is_af_valid() rejects.
 */
static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
				    struct ip_vs_service_user_kern *usvc,
				    struct nlattr *nla, bool full_entry,
				    struct ip_vs_service **ret_svc)
{
	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
	struct ip_vs_service *svc;

	/* Parse mandatory identifying service fields first */
	if (nla == NULL ||
	    nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL))
		return -EINVAL;

	nla_af		= attrs[IPVS_SVC_ATTR_AF];
	nla_protocol	= attrs[IPVS_SVC_ATTR_PROTOCOL];
	nla_addr	= attrs[IPVS_SVC_ATTR_ADDR];
	nla_port	= attrs[IPVS_SVC_ATTR_PORT];
	nla_fwmark	= attrs[IPVS_SVC_ATTR_FWMARK];

	if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
		return -EINVAL;

	memset(usvc, 0, sizeof(*usvc));

	usvc->af = nla_get_u16(nla_af);
	if (!ip_vs_is_af_valid(usvc->af))
		return -EAFNOSUPPORT;

	if (nla_fwmark) {
		usvc->protocol = IPPROTO_TCP;
		usvc->fwmark = nla_get_u32(nla_fwmark);
	} else {
		usvc->protocol = nla_get_u16(nla_protocol);
		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
		usvc->port = nla_get_be16(nla_port);
		usvc->fwmark = 0;
	}

	/* Do the lookup inside an RCU read-side section, the same way the
	 * sockopt paths (do_ip_vs_set_ctl(), do_ip_vs_get_ctl()) call these
	 * lookup helpers.
	 */
	rcu_read_lock();
	if (usvc->fwmark)
		svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark);
	else
		svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol,
					   &usvc->addr, usvc->port);
	rcu_read_unlock();
	*ret_svc = svc;

	/* If a full entry was requested, check for the additional fields */
	if (full_entry) {
		struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
			      *nla_netmask;
		struct ip_vs_flags flags;

		nla_sched	= attrs[IPVS_SVC_ATTR_SCHED_NAME];
		nla_pe		= attrs[IPVS_SVC_ATTR_PE_NAME];
		nla_flags	= attrs[IPVS_SVC_ATTR_FLAGS];
		nla_timeout	= attrs[IPVS_SVC_ATTR_TIMEOUT];
		nla_netmask	= attrs[IPVS_SVC_ATTR_NETMASK];

		/* the PE name is the only optional one */
		if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
			return -EINVAL;

		nla_memcpy(&flags, nla_flags, sizeof(flags));

		/* prefill flags from service if it already exists */
		if (svc)
			usvc->flags = svc->flags;

		/* set new flags from userland: only bits selected by the
		 * mask are replaced
		 */
		usvc->flags = (usvc->flags & ~flags.mask) |
			      (flags.flags & flags.mask);
		usvc->sched_name = nla_data(nla_sched);
		usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
		usvc->timeout = nla_get_u32(nla_timeout);
		usvc->netmask = nla_get_be32(nla_netmask);
	}

	return 0;
}
4198
ip_vs_genl_find_service(struct netns_ipvs * ipvs,struct nlattr * nla)4199 static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
4200 struct nlattr *nla)
4201 {
4202 struct ip_vs_service_user_kern usvc;
4203 struct ip_vs_service *svc;
4204 int ret;
4205
4206 ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc);
4207 return ret ? ERR_PTR(ret) : svc;
4208 }
4209
ip_vs_genl_fill_dest(struct sk_buff * skb,struct ip_vs_dest * dest)4210 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
4211 {
4212 struct nlattr *nl_dest;
4213 struct ip_vs_kstats kstats;
4214
4215 nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST);
4216 if (!nl_dest)
4217 return -EMSGSIZE;
4218
4219 if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
4220 nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
4221 nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
4222 (atomic_read(&dest->conn_flags) &
4223 IP_VS_CONN_F_FWD_MASK)) ||
4224 nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
4225 atomic_read(&dest->weight)) ||
4226 nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
4227 dest->tun_type) ||
4228 nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
4229 dest->tun_port) ||
4230 nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS,
4231 dest->tun_flags) ||
4232 nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
4233 nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
4234 nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
4235 atomic_read(&dest->activeconns)) ||
4236 nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
4237 atomic_read(&dest->inactconns)) ||
4238 nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
4239 atomic_read(&dest->persistconns)) ||
4240 nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af))
4241 goto nla_put_failure;
4242 ip_vs_copy_stats(&kstats, &dest->stats);
4243 if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats))
4244 goto nla_put_failure;
4245 if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats))
4246 goto nla_put_failure;
4247
4248 nla_nest_end(skb, nl_dest);
4249
4250 return 0;
4251
4252 nla_put_failure:
4253 nla_nest_cancel(skb, nl_dest);
4254 return -EMSGSIZE;
4255 }
4256
ip_vs_genl_dump_dest(struct sk_buff * skb,struct ip_vs_dest * dest,struct netlink_callback * cb)4257 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
4258 struct netlink_callback *cb)
4259 {
4260 void *hdr;
4261
4262 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
4263 &ip_vs_genl_family, NLM_F_MULTI,
4264 IPVS_CMD_NEW_DEST);
4265 if (!hdr)
4266 return -EMSGSIZE;
4267
4268 if (ip_vs_genl_fill_dest(skb, dest) < 0)
4269 goto nla_put_failure;
4270
4271 genlmsg_end(skb, hdr);
4272 return 0;
4273
4274 nla_put_failure:
4275 genlmsg_cancel(skb, hdr);
4276 return -EMSGSIZE;
4277 }
4278
ip_vs_genl_dump_dests(struct sk_buff * skb,struct netlink_callback * cb)4279 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
4280 struct netlink_callback *cb)
4281 {
4282 int idx = 0;
4283 int start = cb->args[0];
4284 struct ip_vs_service *svc;
4285 struct ip_vs_dest *dest;
4286 struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
4287 struct net *net = sock_net(skb->sk);
4288 struct netns_ipvs *ipvs = net_ipvs(net);
4289
4290 rcu_read_lock();
4291
4292 /* Try to find the service for which to dump destinations */
4293 if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack))
4294 goto out_err;
4295
4296
4297 svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
4298 if (IS_ERR_OR_NULL(svc))
4299 goto out_err;
4300
4301 /* Dump the destinations */
4302 list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
4303 if (++idx <= start)
4304 continue;
4305 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
4306 idx--;
4307 goto nla_put_failure;
4308 }
4309 }
4310
4311 nla_put_failure:
4312 cb->args[0] = idx;
4313
4314 out_err:
4315 rcu_read_unlock();
4316
4317 return skb->len;
4318 }
4319
/* Parse a nested destination attribute into @udest.
 *
 * Address and port are mandatory. The address family is optional and is
 * stored as 0 when absent. When @full_entry is true, the forwarding
 * method, weight and both thresholds are mandatory as well, while the
 * tunnel type, port and flags are copied only if present.
 *
 * @udest is zeroed once address and port are known to be present, so
 * fields that are not supplied stay 0. On the earlier failures (@nla
 * missing, nested parse error, address or port absent) @udest is not
 * touched.
 *
 * Returns 0 on success or -EINVAL on any missing/invalid attribute.
 */
static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
				 struct nlattr *nla, bool full_entry)
{
	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
	struct nlattr *attr;

	if (!nla ||
	    nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla,
					ip_vs_dest_policy, NULL))
		return -EINVAL;

	/* Identifying fields */
	if (!attrs[IPVS_DEST_ATTR_ADDR] || !attrs[IPVS_DEST_ATTR_PORT])
		return -EINVAL;

	memset(udest, 0, sizeof(*udest));

	nla_memcpy(&udest->addr, attrs[IPVS_DEST_ATTR_ADDR],
		   sizeof(udest->addr));
	udest->port = nla_get_be16(attrs[IPVS_DEST_ATTR_PORT]);
	udest->af = nla_get_u16_default(attrs[IPVS_DEST_ATTR_ADDR_FAMILY], 0);

	if (!full_entry)
		return 0;

	/* A full entry additionally needs the scheduling parameters */
	if (!attrs[IPVS_DEST_ATTR_FWD_METHOD] ||
	    !attrs[IPVS_DEST_ATTR_WEIGHT] ||
	    !attrs[IPVS_DEST_ATTR_U_THRESH] ||
	    !attrs[IPVS_DEST_ATTR_L_THRESH])
		return -EINVAL;

	udest->conn_flags = nla_get_u32(attrs[IPVS_DEST_ATTR_FWD_METHOD]) &
			    IP_VS_CONN_F_FWD_MASK;
	udest->weight = nla_get_u32(attrs[IPVS_DEST_ATTR_WEIGHT]);
	udest->u_threshold = nla_get_u32(attrs[IPVS_DEST_ATTR_U_THRESH]);
	udest->l_threshold = nla_get_u32(attrs[IPVS_DEST_ATTR_L_THRESH]);

	/* Optional tunnel settings */
	attr = attrs[IPVS_DEST_ATTR_TUN_TYPE];
	if (attr)
		udest->tun_type = nla_get_u8(attr);

	attr = attrs[IPVS_DEST_ATTR_TUN_PORT];
	if (attr)
		udest->tun_port = nla_get_be16(attr);

	attr = attrs[IPVS_DEST_ATTR_TUN_FLAGS];
	if (attr)
		udest->tun_flags = nla_get_u16(attr);

	return 0;
}
4381
/* Append a sync daemon description as a nested IPVS_CMD_ATTR_DAEMON.
 *
 * Reports @state together with the interface name, sync id, maximum
 * sync message length, multicast port and TTL taken from @c. The
 * multicast group is added as an IPv6 attribute when the configuration
 * is AF_INET6 (only with CONFIG_IP_VS_IPV6), or as an IPv4 attribute
 * when it is AF_INET; for any other family no group attribute is
 * emitted.
 *
 * Returns 0 on success, -EMSGSIZE if the nest cannot be started or an
 * attribute does not fit (a started nest is cancelled).
 */
static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
				  struct ipvs_sync_daemon_cfg *c)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON);
	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
	    nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) ||
	    nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid))
		goto cancel;

	if (nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) ||
	    nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) ||
	    nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl))
		goto cancel;

#ifdef CONFIG_IP_VS_IPV6
	if (c->mcast_af == AF_INET6) {
		if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6,
				     &c->mcast_group.in6))
			goto cancel;
	} else
#endif
	if (c->mcast_af == AF_INET) {
		if (nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP,
				    c->mcast_group.ip))
			goto cancel;
	}

	nla_nest_end(skb, nest);
	return 0;

cancel:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}
4417
/* Emit one multipart IPVS_CMD_NEW_DAEMON message for a sync daemon.
 *
 * The header uses the port id and sequence number of the dump request
 * in @cb. Returns 0 on success, or -EMSGSIZE when the header or the
 * daemon attributes do not fit; a started message is cancelled first.
 */
static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
				  struct ipvs_sync_daemon_cfg *c,
				  struct netlink_callback *cb)
{
	void *msg_hdr;

	msg_hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
			      cb->nlh->nlmsg_seq, &ip_vs_genl_family,
			      NLM_F_MULTI, IPVS_CMD_NEW_DAEMON);
	if (!msg_hdr)
		return -EMSGSIZE;

	if (!ip_vs_genl_fill_daemon(skb, state, c)) {
		genlmsg_end(skb, msg_hdr);
		return 0;
	}

	genlmsg_cancel(skb, msg_hdr);
	return -EMSGSIZE;
}
4439
ip_vs_genl_dump_daemons(struct sk_buff * skb,struct netlink_callback * cb)4440 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
4441 struct netlink_callback *cb)
4442 {
4443 struct net *net = sock_net(skb->sk);
4444 struct netns_ipvs *ipvs = net_ipvs(net);
4445
4446 mutex_lock(&ipvs->sync_mutex);
4447 if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
4448 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
4449 &ipvs->mcfg, cb) < 0)
4450 goto nla_put_failure;
4451
4452 cb->args[0] = 1;
4453 }
4454
4455 if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
4456 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
4457 &ipvs->bcfg, cb) < 0)
4458 goto nla_put_failure;
4459
4460 cb->args[1] = 1;
4461 }
4462
4463 nla_put_failure:
4464 mutex_unlock(&ipvs->sync_mutex);
4465
4466 return skb->len;
4467 }
4468
ip_vs_genl_new_daemon(struct netns_ipvs * ipvs,struct nlattr ** attrs)4469 static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
4470 {
4471 struct ipvs_sync_daemon_cfg c;
4472 struct nlattr *a;
4473 int ret;
4474
4475 memset(&c, 0, sizeof(c));
4476 if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
4477 attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
4478 attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
4479 return -EINVAL;
4480 strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
4481 sizeof(c.mcast_ifn));
4482 c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);
4483
4484 a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN];
4485 if (a)
4486 c.sync_maxlen = nla_get_u16(a);
4487
4488 a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP];
4489 if (a) {
4490 c.mcast_af = AF_INET;
4491 c.mcast_group.ip = nla_get_in_addr(a);
4492 if (!ipv4_is_multicast(c.mcast_group.ip))
4493 return -EINVAL;
4494 } else {
4495 a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6];
4496 if (a) {
4497 #ifdef CONFIG_IP_VS_IPV6
4498 int addr_type;
4499
4500 c.mcast_af = AF_INET6;
4501 c.mcast_group.in6 = nla_get_in6_addr(a);
4502 addr_type = ipv6_addr_type(&c.mcast_group.in6);
4503 if (!(addr_type & IPV6_ADDR_MULTICAST))
4504 return -EINVAL;
4505 #else
4506 return -EAFNOSUPPORT;
4507 #endif
4508 }
4509 }
4510
4511 a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT];
4512 if (a)
4513 c.mcast_port = nla_get_u16(a);
4514
4515 a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL];
4516 if (a)
4517 c.mcast_ttl = nla_get_u8(a);
4518
4519 /* The synchronization protocol is incompatible with mixed family
4520 * services
4521 */
4522 if (ipvs->mixed_address_family_dests > 0)
4523 return -EINVAL;
4524
4525 ret = start_sync_thread(ipvs, &c,
4526 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
4527 return ret;
4528 }
4529
ip_vs_genl_del_daemon(struct netns_ipvs * ipvs,struct nlattr ** attrs)4530 static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
4531 {
4532 int ret;
4533
4534 if (!attrs[IPVS_DAEMON_ATTR_STATE])
4535 return -EINVAL;
4536
4537 ret = stop_sync_thread(ipvs,
4538 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
4539 return ret;
4540 }
4541
ip_vs_genl_set_config(struct netns_ipvs * ipvs,struct nlattr ** attrs)4542 static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs)
4543 {
4544 struct ip_vs_timeout_user t;
4545
4546 __ip_vs_get_timeouts(ipvs, &t);
4547
4548 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
4549 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
4550
4551 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
4552 t.tcp_fin_timeout =
4553 nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
4554
4555 if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
4556 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
4557
4558 return ip_vs_set_timeout(ipvs, &t);
4559 }
4560
ip_vs_genl_set_daemon(struct sk_buff * skb,struct genl_info * info)4561 static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
4562 {
4563 int ret = -EINVAL, cmd;
4564 struct net *net = sock_net(skb->sk);
4565 struct netns_ipvs *ipvs = net_ipvs(net);
4566
4567 cmd = info->genlhdr->cmd;
4568
4569 if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
4570 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
4571
4572 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
4573 nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack))
4574 goto out;
4575
4576 if (cmd == IPVS_CMD_NEW_DAEMON)
4577 ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs);
4578 else
4579 ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs);
4580 }
4581
4582 out:
4583 return ret;
4584 }
4585
ip_vs_genl_set_cmd(struct sk_buff * skb,struct genl_info * info)4586 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
4587 {
4588 bool need_full_svc = false, need_full_dest = false;
4589 struct ip_vs_service *svc = NULL;
4590 struct ip_vs_service_user_kern usvc;
4591 struct ip_vs_dest_user_kern udest;
4592 int ret = 0, cmd;
4593 struct net *net = sock_net(skb->sk);
4594 struct netns_ipvs *ipvs = net_ipvs(net);
4595
4596 cmd = info->genlhdr->cmd;
4597
4598 mutex_lock(&ipvs->service_mutex);
4599
4600 if (cmd == IPVS_CMD_FLUSH) {
4601 ret = ip_vs_flush(ipvs, false);
4602 goto out;
4603 } else if (cmd == IPVS_CMD_SET_CONFIG) {
4604 ret = ip_vs_genl_set_config(ipvs, info->attrs);
4605 goto out;
4606 } else if (cmd == IPVS_CMD_ZERO &&
4607 !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
4608 ret = ip_vs_zero_all(ipvs);
4609 goto out;
4610 }
4611
4612 /* All following commands require a service argument, so check if we
4613 * received a valid one. We need a full service specification when
4614 * adding / editing a service. Only identifying members otherwise. */
4615 if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
4616 need_full_svc = true;
4617
4618 /* We use function that requires RCU lock (hlist_bl) */
4619 rcu_read_lock();
4620 ret = ip_vs_genl_parse_service(ipvs, &usvc,
4621 info->attrs[IPVS_CMD_ATTR_SERVICE],
4622 need_full_svc, &svc);
4623 rcu_read_unlock();
4624 if (ret)
4625 goto out;
4626
4627 /* Unless we're adding a new service, the service must already exist */
4628 if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
4629 ret = -ESRCH;
4630 goto out;
4631 }
4632
4633 /* Destination commands require a valid destination argument. For
4634 * adding / editing a destination, we need a full destination
4635 * specification. */
4636 if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
4637 cmd == IPVS_CMD_DEL_DEST) {
4638 if (cmd != IPVS_CMD_DEL_DEST)
4639 need_full_dest = true;
4640
4641 ret = ip_vs_genl_parse_dest(&udest,
4642 info->attrs[IPVS_CMD_ATTR_DEST],
4643 need_full_dest);
4644 if (ret)
4645 goto out;
4646
4647 /* Old protocols did not allow the user to specify address
4648 * family, so we set it to zero instead. We also didn't
4649 * allow heterogeneous pools in the old code, so it's safe
4650 * to assume that this will have the same address family as
4651 * the service.
4652 */
4653 if (udest.af == 0)
4654 udest.af = svc->af;
4655
4656 if (!ip_vs_is_af_valid(udest.af)) {
4657 ret = -EAFNOSUPPORT;
4658 goto out;
4659 }
4660
4661 if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) {
4662 /* The synchronization protocol is incompatible
4663 * with mixed family services
4664 */
4665 if (ipvs->sync_state) {
4666 ret = -EINVAL;
4667 goto out;
4668 }
4669
4670 /* Which connection types do we support? */
4671 switch (udest.conn_flags) {
4672 case IP_VS_CONN_F_TUNNEL:
4673 /* We are able to forward this */
4674 break;
4675 default:
4676 ret = -EINVAL;
4677 goto out;
4678 }
4679 }
4680 }
4681
4682 switch (cmd) {
4683 case IPVS_CMD_NEW_SERVICE:
4684 if (svc == NULL)
4685 ret = ip_vs_add_service(ipvs, &usvc, &svc);
4686 else
4687 ret = -EEXIST;
4688 break;
4689 case IPVS_CMD_SET_SERVICE:
4690 ret = ip_vs_edit_service(svc, &usvc);
4691 break;
4692 case IPVS_CMD_DEL_SERVICE:
4693 ret = ip_vs_del_service(svc);
4694 /* do not use svc, it can be freed */
4695 break;
4696 case IPVS_CMD_NEW_DEST:
4697 ret = ip_vs_add_dest(svc, &udest);
4698 break;
4699 case IPVS_CMD_SET_DEST:
4700 ret = ip_vs_edit_dest(svc, &udest);
4701 break;
4702 case IPVS_CMD_DEL_DEST:
4703 ret = ip_vs_del_dest(svc, &udest);
4704 break;
4705 case IPVS_CMD_ZERO:
4706 ret = ip_vs_zero_service(svc);
4707 break;
4708 default:
4709 ret = -EINVAL;
4710 }
4711
4712 out:
4713 mutex_unlock(&ipvs->service_mutex);
4714
4715 return ret;
4716 }
4717
ip_vs_genl_get_cmd(struct sk_buff * skb,struct genl_info * info)4718 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
4719 {
4720 struct sk_buff *msg;
4721 void *reply;
4722 int ret, cmd, reply_cmd;
4723 struct net *net = sock_net(skb->sk);
4724 struct netns_ipvs *ipvs = net_ipvs(net);
4725
4726 cmd = info->genlhdr->cmd;
4727
4728 if (cmd == IPVS_CMD_GET_SERVICE)
4729 reply_cmd = IPVS_CMD_NEW_SERVICE;
4730 else if (cmd == IPVS_CMD_GET_INFO)
4731 reply_cmd = IPVS_CMD_SET_INFO;
4732 else if (cmd == IPVS_CMD_GET_CONFIG)
4733 reply_cmd = IPVS_CMD_SET_CONFIG;
4734 else {
4735 pr_err("unknown Generic Netlink command\n");
4736 return -EINVAL;
4737 }
4738
4739 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
4740 if (!msg)
4741 return -ENOMEM;
4742
4743 rcu_read_lock();
4744
4745 reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
4746 if (reply == NULL)
4747 goto nla_put_failure;
4748
4749 switch (cmd) {
4750 case IPVS_CMD_GET_SERVICE:
4751 {
4752 struct ip_vs_service *svc;
4753
4754 svc = ip_vs_genl_find_service(ipvs,
4755 info->attrs[IPVS_CMD_ATTR_SERVICE]);
4756 if (IS_ERR(svc)) {
4757 ret = PTR_ERR(svc);
4758 goto out_err;
4759 } else if (svc) {
4760 ret = ip_vs_genl_fill_service(msg, svc);
4761 if (ret)
4762 goto nla_put_failure;
4763 } else {
4764 ret = -ESRCH;
4765 goto out_err;
4766 }
4767
4768 break;
4769 }
4770
4771 case IPVS_CMD_GET_CONFIG:
4772 {
4773 struct ip_vs_timeout_user t;
4774
4775 __ip_vs_get_timeouts(ipvs, &t);
4776 #ifdef CONFIG_IP_VS_PROTO_TCP
4777 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
4778 t.tcp_timeout) ||
4779 nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
4780 t.tcp_fin_timeout))
4781 goto nla_put_failure;
4782 #endif
4783 #ifdef CONFIG_IP_VS_PROTO_UDP
4784 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
4785 goto nla_put_failure;
4786 #endif
4787
4788 break;
4789 }
4790
4791 case IPVS_CMD_GET_INFO:
4792 if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
4793 IP_VS_VERSION_CODE) ||
4794 nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
4795 get_conn_tab_size(ipvs)))
4796 goto nla_put_failure;
4797 break;
4798 }
4799
4800 genlmsg_end(msg, reply);
4801 ret = genlmsg_reply(msg, info);
4802 goto out;
4803
4804 nla_put_failure:
4805 pr_err("not enough space in Netlink message\n");
4806 ret = -EMSGSIZE;
4807
4808 out_err:
4809 nlmsg_free(msg);
4810 out:
4811 rcu_read_unlock();
4812
4813 return ret;
4814 }
4815
4816
4817 static const struct genl_small_ops ip_vs_genl_ops[] = {
4818 {
4819 .cmd = IPVS_CMD_NEW_SERVICE,
4820 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4821 .flags = GENL_ADMIN_PERM,
4822 .doit = ip_vs_genl_set_cmd,
4823 },
4824 {
4825 .cmd = IPVS_CMD_SET_SERVICE,
4826 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4827 .flags = GENL_ADMIN_PERM,
4828 .doit = ip_vs_genl_set_cmd,
4829 },
4830 {
4831 .cmd = IPVS_CMD_DEL_SERVICE,
4832 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4833 .flags = GENL_ADMIN_PERM,
4834 .doit = ip_vs_genl_set_cmd,
4835 },
4836 {
4837 .cmd = IPVS_CMD_GET_SERVICE,
4838 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4839 .flags = GENL_ADMIN_PERM,
4840 .doit = ip_vs_genl_get_cmd,
4841 .dumpit = ip_vs_genl_dump_services,
4842 },
4843 {
4844 .cmd = IPVS_CMD_NEW_DEST,
4845 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4846 .flags = GENL_ADMIN_PERM,
4847 .doit = ip_vs_genl_set_cmd,
4848 },
4849 {
4850 .cmd = IPVS_CMD_SET_DEST,
4851 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4852 .flags = GENL_ADMIN_PERM,
4853 .doit = ip_vs_genl_set_cmd,
4854 },
4855 {
4856 .cmd = IPVS_CMD_DEL_DEST,
4857 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4858 .flags = GENL_ADMIN_PERM,
4859 .doit = ip_vs_genl_set_cmd,
4860 },
4861 {
4862 .cmd = IPVS_CMD_GET_DEST,
4863 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4864 .flags = GENL_ADMIN_PERM,
4865 .dumpit = ip_vs_genl_dump_dests,
4866 },
4867 {
4868 .cmd = IPVS_CMD_NEW_DAEMON,
4869 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4870 .flags = GENL_ADMIN_PERM,
4871 .doit = ip_vs_genl_set_daemon,
4872 },
4873 {
4874 .cmd = IPVS_CMD_DEL_DAEMON,
4875 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4876 .flags = GENL_ADMIN_PERM,
4877 .doit = ip_vs_genl_set_daemon,
4878 },
4879 {
4880 .cmd = IPVS_CMD_GET_DAEMON,
4881 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4882 .flags = GENL_ADMIN_PERM,
4883 .dumpit = ip_vs_genl_dump_daemons,
4884 },
4885 {
4886 .cmd = IPVS_CMD_SET_CONFIG,
4887 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4888 .flags = GENL_ADMIN_PERM,
4889 .doit = ip_vs_genl_set_cmd,
4890 },
4891 {
4892 .cmd = IPVS_CMD_GET_CONFIG,
4893 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4894 .flags = GENL_ADMIN_PERM,
4895 .doit = ip_vs_genl_get_cmd,
4896 },
4897 {
4898 .cmd = IPVS_CMD_GET_INFO,
4899 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4900 .flags = GENL_ADMIN_PERM,
4901 .doit = ip_vs_genl_get_cmd,
4902 },
4903 {
4904 .cmd = IPVS_CMD_ZERO,
4905 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4906 .flags = GENL_ADMIN_PERM,
4907 .doit = ip_vs_genl_set_cmd,
4908 },
4909 {
4910 .cmd = IPVS_CMD_FLUSH,
4911 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4912 .flags = GENL_ADMIN_PERM,
4913 .doit = ip_vs_genl_set_cmd,
4914 },
4915 };
4916
4917 static struct genl_family ip_vs_genl_family __ro_after_init = {
4918 .hdrsize = 0,
4919 .name = IPVS_GENL_NAME,
4920 .version = IPVS_GENL_VERSION,
4921 .maxattr = IPVS_CMD_ATTR_MAX,
4922 .policy = ip_vs_cmd_policy,
4923 .netnsok = true, /* Make ipvsadm to work on netns */
4924 .module = THIS_MODULE,
4925 .small_ops = ip_vs_genl_ops,
4926 .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops),
4927 .resv_start_op = IPVS_CMD_FLUSH + 1,
4928 .parallel_ops = 1,
4929 };
4930
ip_vs_genl_register(void)4931 static int __init ip_vs_genl_register(void)
4932 {
4933 return genl_register_family(&ip_vs_genl_family);
4934 }
4935
ip_vs_genl_unregister(void)4936 static void ip_vs_genl_unregister(void)
4937 {
4938 genl_unregister_family(&ip_vs_genl_family);
4939 }
4940
4941 /* End of Generic Netlink interface definitions */
4942
/*
 *	per netns init/exit func.
 */
4946 #ifdef CONFIG_SYSCTL
ip_vs_control_net_init_sysctl(struct netns_ipvs * ipvs)4947 static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
4948 {
4949 struct net *net = ipvs->net;
4950 struct ctl_table *tbl;
4951 int idx, ret;
4952 size_t ctl_table_size = ARRAY_SIZE(vs_vars);
4953 bool unpriv = net->user_ns != &init_user_ns;
4954
4955 atomic_set(&ipvs->dropentry, 0);
4956 spin_lock_init(&ipvs->dropentry_lock);
4957 spin_lock_init(&ipvs->droppacket_lock);
4958 spin_lock_init(&ipvs->securetcp_lock);
4959 INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
4960 INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
4961 expire_nodest_conn_handler);
4962 ipvs->est_stopped = 0;
4963
4964 if (!net_eq(net, &init_net)) {
4965 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
4966 if (tbl == NULL)
4967 return -ENOMEM;
4968 } else
4969 tbl = vs_vars;
4970 /* Initialize sysctl defaults */
4971 for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) {
4972 if (tbl[idx].proc_handler == proc_do_defense_mode)
4973 tbl[idx].extra2 = ipvs;
4974 }
4975 idx = 0;
4976 ipvs->sysctl_amemthresh = 1024;
4977 tbl[idx++].data = &ipvs->sysctl_amemthresh;
4978 ipvs->sysctl_am_droprate = 10;
4979 tbl[idx++].data = &ipvs->sysctl_am_droprate;
4980 tbl[idx++].data = &ipvs->sysctl_drop_entry;
4981 tbl[idx++].data = &ipvs->sysctl_drop_packet;
4982 #ifdef CONFIG_IP_VS_NFCT
4983 tbl[idx++].data = &ipvs->sysctl_conntrack;
4984 #endif
4985 tbl[idx++].data = &ipvs->sysctl_secure_tcp;
4986 ipvs->sysctl_snat_reroute = 1;
4987 tbl[idx++].data = &ipvs->sysctl_snat_reroute;
4988 ipvs->sysctl_sync_ver = 1;
4989 tbl[idx++].data = &ipvs->sysctl_sync_ver;
4990 ipvs->sysctl_sync_ports = 1;
4991 tbl[idx++].data = &ipvs->sysctl_sync_ports;
4992 tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
4993
4994 ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
4995 if (unpriv)
4996 tbl[idx].mode = 0444;
4997 tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
4998
4999 ipvs->sysctl_sync_sock_size = 0;
5000 if (unpriv)
5001 tbl[idx].mode = 0444;
5002 tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
5003
5004 tbl[idx++].data = &ipvs->sysctl_cache_bypass;
5005 tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
5006 tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
5007 tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
5008 tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
5009 ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
5010 ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
5011 tbl[idx].data = &ipvs->sysctl_sync_threshold;
5012 tbl[idx].extra2 = ipvs;
5013 tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
5014 ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
5015 tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
5016 ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
5017 tbl[idx++].data = &ipvs->sysctl_sync_retries;
5018 tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
5019 ipvs->sysctl_pmtu_disc = 1;
5020 tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
5021 tbl[idx++].data = &ipvs->sysctl_backup_only;
5022 ipvs->sysctl_conn_reuse_mode = 1;
5023 tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
5024 tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
5025 tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
5026
5027 ipvs->sysctl_run_estimation = 1;
5028 if (unpriv)
5029 tbl[idx].mode = 0444;
5030 tbl[idx].extra2 = ipvs;
5031 tbl[idx++].data = &ipvs->sysctl_run_estimation;
5032
5033 ipvs->est_cpulist_valid = 0;
5034 if (unpriv)
5035 tbl[idx].mode = 0444;
5036 tbl[idx].extra2 = ipvs;
5037 tbl[idx++].data = &ipvs->sysctl_est_cpulist;
5038
5039 ipvs->sysctl_est_nice = IPVS_EST_NICE;
5040 if (unpriv)
5041 tbl[idx].mode = 0444;
5042 tbl[idx].extra2 = ipvs;
5043 tbl[idx++].data = &ipvs->sysctl_est_nice;
5044
5045 if (unpriv)
5046 tbl[idx].mode = 0444;
5047 tbl[idx].extra2 = ipvs;
5048 tbl[idx++].data = &ipvs->sysctl_conn_lfactor;
5049
5050 if (unpriv)
5051 tbl[idx].mode = 0444;
5052 tbl[idx].extra2 = ipvs;
5053 tbl[idx++].data = &ipvs->sysctl_svc_lfactor;
5054
5055 #ifdef CONFIG_IP_VS_DEBUG
5056 /* Global sysctls must be ro in non-init netns */
5057 if (!net_eq(net, &init_net))
5058 tbl[idx++].mode = 0444;
5059 #endif
5060
5061 ret = -ENOMEM;
5062 ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl,
5063 ctl_table_size);
5064 if (!ipvs->sysctl_hdr)
5065 goto err;
5066 ipvs->sysctl_tbl = tbl;
5067
5068 ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
5069 if (ret < 0)
5070 goto err;
5071
5072 /* Schedule defense work */
5073 queue_delayed_work(system_long_wq, &ipvs->defense_work,
5074 DEFENSE_TIMER_PERIOD);
5075
5076 return 0;
5077
5078 err:
5079 unregister_net_sysctl_table(ipvs->sysctl_hdr);
5080 if (!net_eq(net, &init_net))
5081 kfree(tbl);
5082 return ret;
5083 }
5084
ip_vs_control_net_cleanup_sysctl(struct netns_ipvs * ipvs)5085 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
5086 {
5087 struct net *net = ipvs->net;
5088
5089 cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work);
5090 cancel_delayed_work_sync(&ipvs->defense_work);
5091 cancel_work_sync(&ipvs->defense_work.work);
5092 unregister_net_sysctl_table(ipvs->sysctl_hdr);
5093 if (ipvs->tot_stats->s.est.ktid != -2) {
5094 /* Not stopped yet? This happens only on netns init error and
5095 * we even do not need to lock the service_mutex for this case.
5096 */
5097 mutex_lock(&ipvs->service_mutex);
5098 ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
5099 mutex_unlock(&ipvs->service_mutex);
5100 }
5101
5102 if (ipvs->est_cpulist_valid)
5103 free_cpumask_var(ipvs->sysctl_est_cpulist);
5104
5105 if (!net_eq(net, &init_net))
5106 kfree(ipvs->sysctl_tbl);
5107 }
5108
5109 #else
5110
ip_vs_control_net_init_sysctl(struct netns_ipvs * ipvs)5111 static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; }
ip_vs_control_net_cleanup_sysctl(struct netns_ipvs * ipvs)5112 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { }
5113
5114 #endif
5115
5116 static struct notifier_block ip_vs_dst_notifier = {
5117 .notifier_call = ip_vs_dst_event,
5118 #ifdef CONFIG_IP_VS_IPV6
5119 .priority = ADDRCONF_NOTIFY_PRIORITY + 5,
5120 #endif
5121 };
5122
ip_vs_control_net_init(struct netns_ipvs * ipvs)5123 int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
5124 {
5125 int ret = -ENOMEM;
5126 int idx;
5127
5128 /* Initialize service_mutex, svc_table per netns */
5129 __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key);
5130 init_rwsem(&ipvs->svc_resize_sem);
5131 init_rwsem(&ipvs->svc_replace_sem);
5132 INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler);
5133 atomic_set(&ipvs->svc_table_changes, 0);
5134 RCU_INIT_POINTER(ipvs->svc_table, NULL);
5135
5136 /* Initialize rs_table */
5137 for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
5138 INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
5139
5140 INIT_LIST_HEAD(&ipvs->dest_trash);
5141 spin_lock_init(&ipvs->dest_trash_lock);
5142 timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0);
5143 for (idx = 0; idx < IP_VS_AF_MAX; idx++) {
5144 atomic_set(&ipvs->num_services[idx], 0);
5145 atomic_set(&ipvs->fwm_services[idx], 0);
5146 atomic_set(&ipvs->nonfwm_services[idx], 0);
5147 atomic_set(&ipvs->ftpsvc_counter[idx], 0);
5148 atomic_set(&ipvs->nullsvc_counter[idx], 0);
5149 atomic_set(&ipvs->conn_out_counter[idx], 0);
5150 }
5151
5152 INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
5153 ipvs->sysctl_svc_lfactor = ip_vs_svc_default_load_factor(ipvs);
5154
5155 /* procfs stats */
5156 ipvs->tot_stats = kzalloc_obj(*ipvs->tot_stats);
5157 if (!ipvs->tot_stats)
5158 goto out;
5159 if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0)
5160 goto err_tot_stats;
5161
5162 #ifdef CONFIG_PROC_FS
5163 if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
5164 &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)))
5165 goto err_vs;
5166 if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
5167 ip_vs_stats_show, NULL))
5168 goto err_stats;
5169 if (!proc_create_net_single("ip_vs_stats_percpu", 0,
5170 ipvs->net->proc_net,
5171 ip_vs_stats_percpu_show, NULL))
5172 goto err_percpu;
5173 if (!proc_create_net_single("ip_vs_status", 0440, ipvs->net->proc_net,
5174 ip_vs_status_show, NULL))
5175 goto err_status;
5176 #endif
5177
5178 ret = ip_vs_control_net_init_sysctl(ipvs);
5179 if (ret < 0)
5180 goto err;
5181
5182 return 0;
5183
5184 err:
5185 #ifdef CONFIG_PROC_FS
5186 remove_proc_entry("ip_vs_status", ipvs->net->proc_net);
5187
5188 err_status:
5189 remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
5190
5191 err_percpu:
5192 remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
5193
5194 err_stats:
5195 remove_proc_entry("ip_vs", ipvs->net->proc_net);
5196
5197 err_vs:
5198 #endif
5199 ip_vs_stats_release(&ipvs->tot_stats->s);
5200
5201 err_tot_stats:
5202 kfree(ipvs->tot_stats);
5203
5204 out:
5205 return ret;
5206 }
5207
ip_vs_control_net_cleanup(struct netns_ipvs * ipvs)5208 void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
5209 {
5210 ip_vs_trash_cleanup(ipvs);
5211 ip_vs_control_net_cleanup_sysctl(ipvs);
5212 cancel_delayed_work_sync(&ipvs->est_reload_work);
5213 #ifdef CONFIG_PROC_FS
5214 remove_proc_entry("ip_vs_status", ipvs->net->proc_net);
5215 remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
5216 remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
5217 remove_proc_entry("ip_vs", ipvs->net->proc_net);
5218 #endif
5219 call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free);
5220 }
5221
ip_vs_register_nl_ioctl(void)5222 int __init ip_vs_register_nl_ioctl(void)
5223 {
5224 int ret;
5225
5226 ret = nf_register_sockopt(&ip_vs_sockopts);
5227 if (ret) {
5228 pr_err("cannot register sockopt.\n");
5229 goto err_sock;
5230 }
5231
5232 ret = ip_vs_genl_register();
5233 if (ret) {
5234 pr_err("cannot register Generic Netlink interface.\n");
5235 goto err_genl;
5236 }
5237 return 0;
5238
5239 err_genl:
5240 nf_unregister_sockopt(&ip_vs_sockopts);
5241 err_sock:
5242 return ret;
5243 }
5244
ip_vs_unregister_nl_ioctl(void)5245 void ip_vs_unregister_nl_ioctl(void)
5246 {
5247 ip_vs_genl_unregister();
5248 nf_unregister_sockopt(&ip_vs_sockopts);
5249 }
5250
ip_vs_control_init(void)5251 int __init ip_vs_control_init(void)
5252 {
5253 int ret;
5254
5255 ret = register_netdevice_notifier(&ip_vs_dst_notifier);
5256 if (ret < 0)
5257 return ret;
5258
5259 return 0;
5260 }
5261
5262
ip_vs_control_cleanup(void)5263 void ip_vs_control_cleanup(void)
5264 {
5265 unregister_netdevice_notifier(&ip_vs_dst_notifier);
5266 /* relying on common rcu_barrier() in ip_vs_cleanup() */
5267 }
5268