xref: /linux/net/core/sysctl_net_core.c (revision 45d8b572fac3aa8b49d53c946b3685eaf78a2824)
1 // SPDX-License-Identifier: GPL-2.0
2 /* -*- linux-c -*-
3  * sysctl_net_core.c: sysctl interface to net core subsystem.
4  *
5  * Begun April 1, 1996, Mike Shaver.
6  * Added /proc/sys/net/core directory entry (empty =) ). [MS]
7  */
8 
9 #include <linux/filter.h>
10 #include <linux/mm.h>
11 #include <linux/sysctl.h>
12 #include <linux/module.h>
13 #include <linux/socket.h>
14 #include <linux/netdevice.h>
15 #include <linux/ratelimit.h>
16 #include <linux/vmalloc.h>
17 #include <linux/init.h>
18 #include <linux/slab.h>
19 #include <linux/sched/isolation.h>
20 
21 #include <net/ip.h>
22 #include <net/sock.h>
23 #include <net/net_ratelimit.h>
24 #include <net/busy_poll.h>
25 #include <net/pkt_sched.h>
26 #include <net/hotdata.h>
27 #include <net/rps.h>
28 
29 #include "dev.h"
30 
/* Bounds referenced through .extra1/.extra2 by entries of net_core_table. */
static int int_3600 = 3600;	/* upper bound of netdev_unregister_timeout_secs */
static int min_sndbuf = SOCK_MIN_SNDBUF;	/* floor for wmem_max / wmem_default */
static int min_rcvbuf = SOCK_MIN_RCVBUF;	/* floor for rmem_max / rmem_default */
static int max_skb_frags = MAX_SKB_FRAGS;	/* ceiling for max_skb_frags */
static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE;	/* floor for mem_pcpu_rsv */

static int net_msg_warn;	/* Unused, but still a sysctl */

/* 0 unless overridden: the "fb_tunnels=" boot parameter sets it to 1
 * ("initns") or 2 ("none"); the sysctl of the same name accepts 0..2.
 */
int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);

/* Accepted values of the devconf_inherit_init_net sysctl:
 * 0 - Keep current behavior:
 *     IPv4: inherit all current settings from init_net
 *     IPv6: reset all settings to default
 * 1 - Both inherit all current settings from init_net
 * 2 - Both reset all settings to default
 * 3 - Both inherit all settings from current netns
 */
int sysctl_devconf_inherit_init_net __read_mostly;
EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
51 
52 #if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS)
53 static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos,
54 			 struct cpumask *mask)
55 {
56 	char kbuf[128];
57 	int len;
58 
59 	if (*ppos || !*lenp) {
60 		*lenp = 0;
61 		return;
62 	}
63 
64 	len = min(sizeof(kbuf) - 1, *lenp);
65 	len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
66 	if (!len) {
67 		*lenp = 0;
68 		return;
69 	}
70 
71 	if (len < *lenp)
72 		kbuf[len++] = '\n';
73 	memcpy(buffer, kbuf, len);
74 	*lenp = len;
75 	*ppos += len;
76 }
77 #endif
78 
79 #ifdef CONFIG_RPS
80 
81 static struct cpumask *rps_default_mask_cow_alloc(struct net *net)
82 {
83 	struct cpumask *rps_default_mask;
84 
85 	if (net->core.rps_default_mask)
86 		return net->core.rps_default_mask;
87 
88 	rps_default_mask = kzalloc(cpumask_size(), GFP_KERNEL);
89 	if (!rps_default_mask)
90 		return NULL;
91 
92 	/* pairs with READ_ONCE in rx_queue_default_mask() */
93 	WRITE_ONCE(net->core.rps_default_mask, rps_default_mask);
94 	return rps_default_mask;
95 }
96 
97 static int rps_default_mask_sysctl(struct ctl_table *table, int write,
98 				   void *buffer, size_t *lenp, loff_t *ppos)
99 {
100 	struct net *net = (struct net *)table->data;
101 	int err = 0;
102 
103 	rtnl_lock();
104 	if (write) {
105 		struct cpumask *rps_default_mask = rps_default_mask_cow_alloc(net);
106 
107 		err = -ENOMEM;
108 		if (!rps_default_mask)
109 			goto done;
110 
111 		err = cpumask_parse(buffer, rps_default_mask);
112 		if (err)
113 			goto done;
114 
115 		err = rps_cpumask_housekeeping(rps_default_mask);
116 		if (err)
117 			goto done;
118 	} else {
119 		dump_cpumask(buffer, lenp, ppos,
120 			     net->core.rps_default_mask ? : cpu_none_mask);
121 	}
122 
123 done:
124 	rtnl_unlock();
125 	return err;
126 }
127 
128 static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
129 				void *buffer, size_t *lenp, loff_t *ppos)
130 {
131 	unsigned int orig_size, size;
132 	int ret, i;
133 	struct ctl_table tmp = {
134 		.data = &size,
135 		.maxlen = sizeof(size),
136 		.mode = table->mode
137 	};
138 	struct rps_sock_flow_table *orig_sock_table, *sock_table;
139 	static DEFINE_MUTEX(sock_flow_mutex);
140 
141 	mutex_lock(&sock_flow_mutex);
142 
143 	orig_sock_table = rcu_dereference_protected(
144 					net_hotdata.rps_sock_flow_table,
145 					lockdep_is_held(&sock_flow_mutex));
146 	size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
147 
148 	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
149 
150 	if (write) {
151 		if (size) {
152 			if (size > 1<<29) {
153 				/* Enforce limit to prevent overflow */
154 				mutex_unlock(&sock_flow_mutex);
155 				return -EINVAL;
156 			}
157 			size = roundup_pow_of_two(size);
158 			if (size != orig_size) {
159 				sock_table =
160 				    vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
161 				if (!sock_table) {
162 					mutex_unlock(&sock_flow_mutex);
163 					return -ENOMEM;
164 				}
165 				net_hotdata.rps_cpu_mask =
166 					roundup_pow_of_two(nr_cpu_ids) - 1;
167 				sock_table->mask = size - 1;
168 			} else
169 				sock_table = orig_sock_table;
170 
171 			for (i = 0; i < size; i++)
172 				sock_table->ents[i] = RPS_NO_CPU;
173 		} else
174 			sock_table = NULL;
175 
176 		if (sock_table != orig_sock_table) {
177 			rcu_assign_pointer(net_hotdata.rps_sock_flow_table,
178 					   sock_table);
179 			if (sock_table) {
180 				static_branch_inc(&rps_needed);
181 				static_branch_inc(&rfs_needed);
182 			}
183 			if (orig_sock_table) {
184 				static_branch_dec(&rps_needed);
185 				static_branch_dec(&rfs_needed);
186 				kvfree_rcu_mightsleep(orig_sock_table);
187 			}
188 		}
189 	}
190 
191 	mutex_unlock(&sock_flow_mutex);
192 
193 	return ret;
194 }
195 #endif /* CONFIG_RPS */
196 
197 #ifdef CONFIG_NET_FLOW_LIMIT
198 static DEFINE_MUTEX(flow_limit_update_mutex);
199 
200 static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
201 				 void *buffer, size_t *lenp, loff_t *ppos)
202 {
203 	struct sd_flow_limit *cur;
204 	struct softnet_data *sd;
205 	cpumask_var_t mask;
206 	int i, len, ret = 0;
207 
208 	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
209 		return -ENOMEM;
210 
211 	if (write) {
212 		ret = cpumask_parse(buffer, mask);
213 		if (ret)
214 			goto done;
215 
216 		mutex_lock(&flow_limit_update_mutex);
217 		len = sizeof(*cur) + netdev_flow_limit_table_len;
218 		for_each_possible_cpu(i) {
219 			sd = &per_cpu(softnet_data, i);
220 			cur = rcu_dereference_protected(sd->flow_limit,
221 				     lockdep_is_held(&flow_limit_update_mutex));
222 			if (cur && !cpumask_test_cpu(i, mask)) {
223 				RCU_INIT_POINTER(sd->flow_limit, NULL);
224 				kfree_rcu_mightsleep(cur);
225 			} else if (!cur && cpumask_test_cpu(i, mask)) {
226 				cur = kzalloc_node(len, GFP_KERNEL,
227 						   cpu_to_node(i));
228 				if (!cur) {
229 					/* not unwinding previous changes */
230 					ret = -ENOMEM;
231 					goto write_unlock;
232 				}
233 				cur->num_buckets = netdev_flow_limit_table_len;
234 				rcu_assign_pointer(sd->flow_limit, cur);
235 			}
236 		}
237 write_unlock:
238 		mutex_unlock(&flow_limit_update_mutex);
239 	} else {
240 		cpumask_clear(mask);
241 		rcu_read_lock();
242 		for_each_possible_cpu(i) {
243 			sd = &per_cpu(softnet_data, i);
244 			if (rcu_dereference(sd->flow_limit))
245 				cpumask_set_cpu(i, mask);
246 		}
247 		rcu_read_unlock();
248 
249 		dump_cpumask(buffer, lenp, ppos, mask);
250 	}
251 
252 done:
253 	free_cpumask_var(mask);
254 	return ret;
255 }
256 
257 static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
258 				       void *buffer, size_t *lenp, loff_t *ppos)
259 {
260 	unsigned int old, *ptr;
261 	int ret;
262 
263 	mutex_lock(&flow_limit_update_mutex);
264 
265 	ptr = table->data;
266 	old = *ptr;
267 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
268 	if (!ret && write && !is_power_of_2(*ptr)) {
269 		*ptr = old;
270 		ret = -EINVAL;
271 	}
272 
273 	mutex_unlock(&flow_limit_update_mutex);
274 	return ret;
275 }
276 #endif /* CONFIG_NET_FLOW_LIMIT */
277 
278 #ifdef CONFIG_NET_SCHED
279 static int set_default_qdisc(struct ctl_table *table, int write,
280 			     void *buffer, size_t *lenp, loff_t *ppos)
281 {
282 	char id[IFNAMSIZ];
283 	struct ctl_table tbl = {
284 		.data = id,
285 		.maxlen = IFNAMSIZ,
286 	};
287 	int ret;
288 
289 	qdisc_get_default(id, IFNAMSIZ);
290 
291 	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
292 	if (write && ret == 0)
293 		ret = qdisc_set_default(id);
294 	return ret;
295 }
296 #endif
297 
298 static int proc_do_dev_weight(struct ctl_table *table, int write,
299 			   void *buffer, size_t *lenp, loff_t *ppos)
300 {
301 	static DEFINE_MUTEX(dev_weight_mutex);
302 	int ret, weight;
303 
304 	mutex_lock(&dev_weight_mutex);
305 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
306 	if (!ret && write) {
307 		weight = READ_ONCE(weight_p);
308 		WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias);
309 		WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias);
310 	}
311 	mutex_unlock(&dev_weight_mutex);
312 
313 	return ret;
314 }
315 
316 static int proc_do_rss_key(struct ctl_table *table, int write,
317 			   void *buffer, size_t *lenp, loff_t *ppos)
318 {
319 	struct ctl_table fake_table;
320 	char buf[NETDEV_RSS_KEY_LEN * 3];
321 
322 	snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key);
323 	fake_table.data = buf;
324 	fake_table.maxlen = sizeof(buf);
325 	return proc_dostring(&fake_table, write, buffer, lenp, ppos);
326 }
327 
328 #ifdef CONFIG_BPF_JIT
329 static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
330 					   void *buffer, size_t *lenp,
331 					   loff_t *ppos)
332 {
333 	int ret, jit_enable = *(int *)table->data;
334 	int min = *(int *)table->extra1;
335 	int max = *(int *)table->extra2;
336 	struct ctl_table tmp = *table;
337 
338 	if (write && !capable(CAP_SYS_ADMIN))
339 		return -EPERM;
340 
341 	tmp.data = &jit_enable;
342 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
343 	if (write && !ret) {
344 		if (jit_enable < 2 ||
345 		    (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) {
346 			*(int *)table->data = jit_enable;
347 			if (jit_enable == 2)
348 				pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n");
349 		} else {
350 			ret = -EPERM;
351 		}
352 	}
353 
354 	if (write && ret && min == max)
355 		pr_info_once("CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1.\n");
356 
357 	return ret;
358 }
359 
360 # ifdef CONFIG_HAVE_EBPF_JIT
361 static int
362 proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
363 				    void *buffer, size_t *lenp, loff_t *ppos)
364 {
365 	if (!capable(CAP_SYS_ADMIN))
366 		return -EPERM;
367 
368 	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
369 }
370 # endif /* CONFIG_HAVE_EBPF_JIT */
371 
372 static int
373 proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
374 				     void *buffer, size_t *lenp, loff_t *ppos)
375 {
376 	if (!capable(CAP_SYS_ADMIN))
377 		return -EPERM;
378 
379 	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
380 }
381 #endif
382 
/*
 * Sysctls registered under "net/core" for init_net by sysctl_core_init().
 * The array is terminated by an empty sentinel entry.
 */
static struct ctl_table net_core_table[] = {
	/* Socket buffer limits/defaults; lower bound comes from
	 * SOCK_MIN_SNDBUF / SOCK_MIN_RCVBUF via min_sndbuf / min_rcvbuf.
	 */
	{
		.procname	= "wmem_max",
		.data		= &sysctl_wmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_sndbuf,
	},
	{
		.procname	= "rmem_max",
		.data		= &sysctl_rmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_rcvbuf,
	},
	{
		.procname	= "wmem_default",
		.data		= &sysctl_wmem_default,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_sndbuf,
	},
	{
		.procname	= "rmem_default",
		.data		= &sysctl_rmem_default,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_rcvbuf,
	},
	{
		.procname	= "mem_pcpu_rsv",
		.data		= &sysctl_mem_pcpu_rsv,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_mem_pcpu_rsv,
	},
	/* The three dev_weight* knobs share proc_do_dev_weight(), which
	 * recomputes net_hotdata.dev_rx_weight/dev_tx_weight after a write.
	 */
	{
		.procname	= "dev_weight",
		.data		= &weight_p,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_dev_weight,
	},
	{
		.procname	= "dev_weight_rx_bias",
		.data		= &dev_weight_rx_bias,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_dev_weight,
	},
	{
		.procname	= "dev_weight_tx_bias",
		.data		= &dev_weight_tx_bias,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_dev_weight,
	},
	{
		.procname	= "netdev_max_backlog",
		.data		= &net_hotdata.max_backlog,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	/* Read-only (0444); proc_do_rss_key() formats the key itself and
	 * builds its own temporary table rather than using this entry's
	 * .data/.maxlen.
	 */
	{
		.procname	= "netdev_rss_key",
		.data		= &netdev_rss_key,
		.maxlen		= sizeof(int),
		.mode		= 0444,
		.proc_handler	= proc_do_rss_key,
	},
#ifdef CONFIG_BPF_JIT
	/* With CONFIG_BPF_JIT_ALWAYS_ON both bounds are 1, so the only
	 * accepted value is 1; otherwise the range is 0..2.
	 */
	{
		.procname	= "bpf_jit_enable",
		.data		= &bpf_jit_enable,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax_bpf_enable,
# ifdef CONFIG_BPF_JIT_ALWAYS_ON
		.extra1		= SYSCTL_ONE,
		.extra2		= SYSCTL_ONE,
# else
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
# endif
	},
# ifdef CONFIG_HAVE_EBPF_JIT
	/* Mode 0600; the handlers additionally require CAP_SYS_ADMIN. */
	{
		.procname	= "bpf_jit_harden",
		.data		= &bpf_jit_harden,
		.maxlen		= sizeof(int),
		.mode		= 0600,
		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "bpf_jit_kallsyms",
		.data		= &bpf_jit_kallsyms,
		.maxlen		= sizeof(int),
		.mode		= 0600,
		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
# endif
	/* long-sized value, bounded above by bpf_jit_limit_max */
	{
		.procname	= "bpf_jit_limit",
		.data		= &bpf_jit_limit,
		.maxlen		= sizeof(long),
		.mode		= 0600,
		.proc_handler	= proc_dolongvec_minmax_bpf_restricted,
		.extra1		= SYSCTL_LONG_ONE,
		.extra2		= &bpf_jit_limit_max,
	},
#endif
	{
		.procname	= "netdev_tstamp_prequeue",
		.data		= &net_hotdata.tstamp_prequeue,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	/* Backed by the fields of net_ratelimit_state; the interval goes
	 * through proc_dointvec_jiffies.
	 */
	{
		.procname	= "message_cost",
		.data		= &net_ratelimit_state.interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "message_burst",
		.data		= &net_ratelimit_state.burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "tstamp_allow_data",
		.data		= &sysctl_tstamp_allow_data,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE
	},
#ifdef CONFIG_RPS
	/* No .data: rps_sock_flow_sysctl() derives the value from the
	 * currently installed flow table.
	 */
	{
		.procname	= "rps_sock_flow_entries",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= rps_sock_flow_sysctl
	},
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
	/* No .data: the bitmap is computed from per-CPU softnet_data. */
	{
		.procname	= "flow_limit_cpu_bitmap",
		.mode		= 0644,
		.proc_handler	= flow_limit_cpu_sysctl
	},
	/* Writes must be a power of two (see flow_limit_table_len_sysctl). */
	{
		.procname	= "flow_limit_table_len",
		.data		= &netdev_flow_limit_table_len,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= flow_limit_table_len_sysctl
	},
#endif /* CONFIG_NET_FLOW_LIMIT */
#ifdef CONFIG_NET_RX_BUSY_POLL
	{
		.procname	= "busy_poll",
		.data		= &sysctl_net_busy_poll,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
	{
		.procname	= "busy_read",
		.data		= &sysctl_net_busy_read,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
#endif
#ifdef CONFIG_NET_SCHED
	/* No .data: set_default_qdisc() works on a local name buffer. */
	{
		.procname	= "default_qdisc",
		.mode		= 0644,
		.maxlen		= IFNAMSIZ,
		.proc_handler	= set_default_qdisc
	},
#endif
	{
		.procname	= "netdev_budget",
		.data		= &net_hotdata.netdev_budget,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	/* Backing variable net_msg_warn is not used anywhere in this file. */
	{
		.procname	= "warnings",
		.data		= &net_msg_warn,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	/* Range 1..MAX_SKB_FRAGS (via max_skb_frags). */
	{
		.procname	= "max_skb_frags",
		.data		= &sysctl_max_skb_frags,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
		.extra2		= &max_skb_frags,
	},
	{
		.procname	= "netdev_budget_usecs",
		.data		= &net_hotdata.netdev_budget_usecs,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
	/* Range 0..2; may also be preset by the "fb_tunnels=" boot option. */
	{
		.procname	= "fb_tunnels_only_for_init_net",
		.data		= &sysctl_fb_tunnels_only_for_init_net,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	/* Range 0..3 */
	{
		.procname	= "devconf_inherit_init_net",
		.data		= &sysctl_devconf_inherit_init_net,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_THREE,
	},
	/* Exposes a static key through proc_do_static_key. */
	{
		.procname	= "high_order_alloc_disable",
		.data		= &net_high_order_alloc_disable_key.key,
		.maxlen         = sizeof(net_high_order_alloc_disable_key),
		.mode		= 0644,
		.proc_handler	= proc_do_static_key,
	},
	{
		.procname	= "gro_normal_batch",
		.data		= &net_hotdata.gro_normal_batch,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
	},
	/* Range 1..3600 (via int_3600). */
	{
		.procname	= "netdev_unregister_timeout_secs",
		.data		= &netdev_unregister_timeout_secs,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
		.extra2		= &int_3600,
	},
	{
		.procname	= "skb_defer_max",
		.data		= &sysctl_skb_defer_max,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
	{ }
};
665 
/*
 * Per-network-namespace "net/core" sysctls.
 *
 * This array is registered as-is for init_net.  For every other netns,
 * sysctl_core_net_init() duplicates it and shifts each .data pointer by
 * the distance between that netns and init_net, so every .data here must
 * point at init_net or at one of its members.
 */
static struct ctl_table netns_core_table[] = {
#if IS_ENABLED(CONFIG_RPS)
	/* .data is the struct net itself; see rps_default_mask_sysctl() */
	{
		.procname	= "rps_default_mask",
		.data		= &init_net,
		.mode		= 0644,
		.proc_handler	= rps_default_mask_sysctl
	},
#endif
	{
		.procname	= "somaxconn",
		.data		= &init_net.core.sysctl_somaxconn,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.extra1		= SYSCTL_ZERO,
		.proc_handler	= proc_dointvec_minmax
	},
	{
		.procname	= "optmem_max",
		.data		= &init_net.core.sysctl_optmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.extra1		= SYSCTL_ZERO,
		.proc_handler	= proc_dointvec_minmax
	},
	/* u8-sized value, range 0..1 */
	{
		.procname	= "txrehash",
		.data		= &init_net.core.sysctl_txrehash,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
		.proc_handler	= proc_dou8vec_minmax,
	},
	{ }
};
702 
703 static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str)
704 {
705 	/* fallback tunnels for initns only */
706 	if (!strncmp(str, "initns", 6))
707 		sysctl_fb_tunnels_only_for_init_net = 1;
708 	/* no fallback tunnels anywhere */
709 	else if (!strncmp(str, "none", 4))
710 		sysctl_fb_tunnels_only_for_init_net = 2;
711 
712 	return 1;
713 }
714 __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);
715 
716 static __net_init int sysctl_core_net_init(struct net *net)
717 {
718 	struct ctl_table *tbl, *tmp;
719 
720 	tbl = netns_core_table;
721 	if (!net_eq(net, &init_net)) {
722 		tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
723 		if (tbl == NULL)
724 			goto err_dup;
725 
726 		for (tmp = tbl; tmp->procname; tmp++)
727 			tmp->data += (char *)net - (char *)&init_net;
728 	}
729 
730 	net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl,
731 						      ARRAY_SIZE(netns_core_table));
732 	if (net->core.sysctl_hdr == NULL)
733 		goto err_reg;
734 
735 	return 0;
736 
737 err_reg:
738 	if (tbl != netns_core_table)
739 		kfree(tbl);
740 err_dup:
741 	return -ENOMEM;
742 }
743 
744 static __net_exit void sysctl_core_net_exit(struct net *net)
745 {
746 	struct ctl_table *tbl;
747 
748 	tbl = net->core.sysctl_hdr->ctl_table_arg;
749 	unregister_net_sysctl_table(net->core.sysctl_hdr);
750 	BUG_ON(tbl == netns_core_table);
751 #if IS_ENABLED(CONFIG_RPS)
752 	kfree(net->core.rps_default_mask);
753 #endif
754 	kfree(tbl);
755 }
756 
/* Per-netns hooks that register/unregister the netns_core_table sysctls;
 * handed to register_pernet_subsys() by sysctl_core_init().
 */
static __net_initdata struct pernet_operations sysctl_core_ops = {
	.init = sysctl_core_net_init,
	.exit = sysctl_core_net_exit,
};
761 
762 static __init int sysctl_core_init(void)
763 {
764 	register_net_sysctl(&init_net, "net/core", net_core_table);
765 	return register_pernet_subsys(&sysctl_core_ops);
766 }
767 
768 fs_initcall(sysctl_core_init);
769