xref: /linux/net/core/sock.c (revision 8eecf1c9929aef24e9e75280a39ed1ba3c64fb71)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 #include <linux/ethtool.h>
143 
144 #include "dev.h"
145 
/* Protocol list head and the mutex declared alongside it. */
static LIST_HEAD(proto_list);
static DEFINE_MUTEX(proto_list_mutex);

/* Forward declarations of file-local write-space callbacks. */
static void sock_def_write_space(struct sock *sk);
static void sock_def_write_space_wfree(struct sock *sk);
151 
152 /**
153  * sk_ns_capable - General socket capability test
154  * @sk: Socket to use a capability on or through
155  * @user_ns: The user namespace of the capability to use
156  * @cap: The capability to use
157  *
158  * Test to see if the opener of the socket had when the socket was
159  * created and the current process has the capability @cap in the user
160  * namespace @user_ns.
161  */
162 bool sk_ns_capable(const struct sock *sk,
163 		   struct user_namespace *user_ns, int cap)
164 {
165 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
166 		ns_capable(user_ns, cap);
167 }
168 EXPORT_SYMBOL(sk_ns_capable);
169 
170 /**
171  * sk_capable - Socket global capability test
172  * @sk: Socket to use a capability on or through
173  * @cap: The global capability to use
174  *
175  * Test to see if the opener of the socket had when the socket was
176  * created and the current process has the capability @cap in all user
177  * namespaces.
178  */
179 bool sk_capable(const struct sock *sk, int cap)
180 {
181 	return sk_ns_capable(sk, &init_user_ns, cap);
182 }
183 EXPORT_SYMBOL(sk_capable);
184 
185 /**
186  * sk_net_capable - Network namespace socket capability test
187  * @sk: Socket to use a capability on or through
188  * @cap: The capability to use
189  *
190  * Test to see if the opener of the socket had when the socket was created
191  * and the current process has the capability @cap over the network namespace
192  * the socket is a member of.
193  */
194 bool sk_net_capable(const struct sock *sk, int cap)
195 {
196 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
197 }
198 EXPORT_SYMBOL(sk_net_capable);
199 
200 /*
201  * Each address family might have different locking rules, so we have
202  * one slock key per address family and separate keys for internal and
203  * userspace sockets.
204  */
205 static struct lock_class_key af_family_keys[AF_MAX];
206 static struct lock_class_key af_family_kern_keys[AF_MAX];
207 static struct lock_class_key af_family_slock_keys[AF_MAX];
208 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
209 
/*
 * Human-readable lock names for the lock validator.  The strings are
 * assembled at build time (prefix @x pasted onto each family name) so
 * that initializing socket locks at run time stays cheap.
 *
 * NOTE(review): entries are presumably in address-family number order;
 * slots "27" and "28" carry a bare number instead of a name.
 */

#define _sock_locks(x)		\
	x "AF_UNSPEC",		\
	x "AF_UNIX",		\
	x "AF_INET",		\
	x "AF_AX25",		\
	x "AF_IPX",		\
	x "AF_APPLETALK",	\
	x "AF_NETROM",		\
	x "AF_BRIDGE",		\
	x "AF_ATMPVC",		\
	x "AF_X25",		\
	x "AF_INET6",		\
	x "AF_ROSE",		\
	x "AF_DECnet",		\
	x "AF_NETBEUI",		\
	x "AF_SECURITY",	\
	x "AF_KEY",		\
	x "AF_NETLINK",		\
	x "AF_PACKET",		\
	x "AF_ASH",		\
	x "AF_ECONET",		\
	x "AF_ATMSVC",		\
	x "AF_RDS",		\
	x "AF_SNA",		\
	x "AF_IRDA",		\
	x "AF_PPPOX",		\
	x "AF_WANPIPE",		\
	x "AF_LLC",		\
	x "27",			\
	x "28",			\
	x "AF_CAN",		\
	x "AF_TIPC",		\
	x "AF_BLUETOOTH",	\
	x "IUCV",		\
	x "AF_RXRPC",		\
	x "AF_ISDN",		\
	x "AF_PHONET",		\
	x "AF_IEEE802154",	\
	x "AF_CAIF",		\
	x "AF_ALG",		\
	x "AF_NFC",		\
	x "AF_VSOCK",		\
	x "AF_KCM",		\
	x "AF_QIPCRTR",		\
	x "AF_SMC",		\
	x "AF_XDP",		\
	x "AF_MCTP",		\
	x "AF_MAX"
234 
235 static const char *const af_family_key_strings[AF_MAX+1] = {
236 	_sock_locks("sk_lock-")
237 };
238 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
239 	_sock_locks("slock-")
240 };
241 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
242 	_sock_locks("clock-")
243 };
244 
245 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
246 	_sock_locks("k-sk_lock-")
247 };
248 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
249 	_sock_locks("k-slock-")
250 };
251 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
252 	_sock_locks("k-clock-")
253 };
254 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
255 	_sock_locks("rlock-")
256 };
257 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
258 	_sock_locks("wlock-")
259 };
260 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
261 	_sock_locks("elock-")
262 };
263 
264 /*
265  * sk_callback_lock and sk queues locking rules are per-address-family,
266  * so split the lock classes by using a per-AF key:
267  */
268 static struct lock_class_key af_callback_keys[AF_MAX];
269 static struct lock_class_key af_rlock_keys[AF_MAX];
270 static struct lock_class_key af_wlock_keys[AF_MAX];
271 static struct lock_class_key af_elock_keys[AF_MAX];
272 static struct lock_class_key af_kern_callback_keys[AF_MAX];
273 
274 /* Run time adjustable parameters. */
275 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
276 EXPORT_SYMBOL(sysctl_wmem_max);
277 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
278 EXPORT_SYMBOL(sysctl_rmem_max);
279 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
280 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
281 
282 /* Maximal space eaten by iovec or ancillary data plus some space */
283 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
284 EXPORT_SYMBOL(sysctl_optmem_max);
285 
286 int sysctl_tstamp_allow_data __read_mostly = 1;
287 
288 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
289 EXPORT_SYMBOL_GPL(memalloc_socks_key);
290 
291 /**
292  * sk_set_memalloc - sets %SOCK_MEMALLOC
293  * @sk: socket to set it on
294  *
295  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
296  * It's the responsibility of the admin to adjust min_free_kbytes
297  * to meet the requirements
298  */
299 void sk_set_memalloc(struct sock *sk)
300 {
301 	sock_set_flag(sk, SOCK_MEMALLOC);
302 	sk->sk_allocation |= __GFP_MEMALLOC;
303 	static_branch_inc(&memalloc_socks_key);
304 }
305 EXPORT_SYMBOL_GPL(sk_set_memalloc);
306 
307 void sk_clear_memalloc(struct sock *sk)
308 {
309 	sock_reset_flag(sk, SOCK_MEMALLOC);
310 	sk->sk_allocation &= ~__GFP_MEMALLOC;
311 	static_branch_dec(&memalloc_socks_key);
312 
313 	/*
314 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
315 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
316 	 * it has rmem allocations due to the last swapfile being deactivated
317 	 * but there is a risk that the socket is unusable due to exceeding
318 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
319 	 */
320 	sk_mem_reclaim(sk);
321 }
322 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
323 
324 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
325 {
326 	int ret;
327 	unsigned int noreclaim_flag;
328 
329 	/* these should have been dropped before queueing */
330 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
331 
332 	noreclaim_flag = memalloc_noreclaim_save();
333 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
334 				 tcp_v6_do_rcv,
335 				 tcp_v4_do_rcv,
336 				 sk, skb);
337 	memalloc_noreclaim_restore(noreclaim_flag);
338 
339 	return ret;
340 }
341 EXPORT_SYMBOL(__sk_backlog_rcv);
342 
343 void sk_error_report(struct sock *sk)
344 {
345 	sk->sk_error_report(sk);
346 
347 	switch (sk->sk_family) {
348 	case AF_INET:
349 		fallthrough;
350 	case AF_INET6:
351 		trace_inet_sk_error_report(sk);
352 		break;
353 	default:
354 		break;
355 	}
356 }
357 EXPORT_SYMBOL(sk_error_report);
358 
359 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
360 {
361 	struct __kernel_sock_timeval tv;
362 
363 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
364 		tv.tv_sec = 0;
365 		tv.tv_usec = 0;
366 	} else {
367 		tv.tv_sec = timeo / HZ;
368 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
369 	}
370 
371 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
372 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
373 		*(struct old_timeval32 *)optval = tv32;
374 		return sizeof(tv32);
375 	}
376 
377 	if (old_timeval) {
378 		struct __kernel_old_timeval old_tv;
379 		old_tv.tv_sec = tv.tv_sec;
380 		old_tv.tv_usec = tv.tv_usec;
381 		*(struct __kernel_old_timeval *)optval = old_tv;
382 		return sizeof(old_tv);
383 	}
384 
385 	*(struct __kernel_sock_timeval *)optval = tv;
386 	return sizeof(tv);
387 }
388 EXPORT_SYMBOL(sock_get_timeout);
389 
390 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
391 			   sockptr_t optval, int optlen, bool old_timeval)
392 {
393 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
394 		struct old_timeval32 tv32;
395 
396 		if (optlen < sizeof(tv32))
397 			return -EINVAL;
398 
399 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
400 			return -EFAULT;
401 		tv->tv_sec = tv32.tv_sec;
402 		tv->tv_usec = tv32.tv_usec;
403 	} else if (old_timeval) {
404 		struct __kernel_old_timeval old_tv;
405 
406 		if (optlen < sizeof(old_tv))
407 			return -EINVAL;
408 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
409 			return -EFAULT;
410 		tv->tv_sec = old_tv.tv_sec;
411 		tv->tv_usec = old_tv.tv_usec;
412 	} else {
413 		if (optlen < sizeof(*tv))
414 			return -EINVAL;
415 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
416 			return -EFAULT;
417 	}
418 
419 	return 0;
420 }
421 EXPORT_SYMBOL(sock_copy_user_timeval);
422 
423 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
424 			    bool old_timeval)
425 {
426 	struct __kernel_sock_timeval tv;
427 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
428 
429 	if (err)
430 		return err;
431 
432 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
433 		return -EDOM;
434 
435 	if (tv.tv_sec < 0) {
436 		static int warned __read_mostly;
437 
438 		*timeo_p = 0;
439 		if (warned < 10 && net_ratelimit()) {
440 			warned++;
441 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
442 				__func__, current->comm, task_pid_nr(current));
443 		}
444 		return 0;
445 	}
446 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
447 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
448 		return 0;
449 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
450 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
451 	return 0;
452 }
453 
454 static bool sock_needs_netstamp(const struct sock *sk)
455 {
456 	switch (sk->sk_family) {
457 	case AF_UNSPEC:
458 	case AF_UNIX:
459 		return false;
460 	default:
461 		return true;
462 	}
463 }
464 
465 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
466 {
467 	if (sk->sk_flags & flags) {
468 		sk->sk_flags &= ~flags;
469 		if (sock_needs_netstamp(sk) &&
470 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
471 			net_disable_timestamp();
472 	}
473 }
474 
475 
476 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
477 {
478 	unsigned long flags;
479 	struct sk_buff_head *list = &sk->sk_receive_queue;
480 
481 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
482 		atomic_inc(&sk->sk_drops);
483 		trace_sock_rcvqueue_full(sk, skb);
484 		return -ENOMEM;
485 	}
486 
487 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
488 		atomic_inc(&sk->sk_drops);
489 		return -ENOBUFS;
490 	}
491 
492 	skb->dev = NULL;
493 	skb_set_owner_r(skb, sk);
494 
495 	/* we escape from rcu protected region, make sure we dont leak
496 	 * a norefcounted dst
497 	 */
498 	skb_dst_force(skb);
499 
500 	spin_lock_irqsave(&list->lock, flags);
501 	sock_skb_set_dropcount(sk, skb);
502 	__skb_queue_tail(list, skb);
503 	spin_unlock_irqrestore(&list->lock, flags);
504 
505 	if (!sock_flag(sk, SOCK_DEAD))
506 		sk->sk_data_ready(sk);
507 	return 0;
508 }
509 EXPORT_SYMBOL(__sock_queue_rcv_skb);
510 
511 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
512 			      enum skb_drop_reason *reason)
513 {
514 	enum skb_drop_reason drop_reason;
515 	int err;
516 
517 	err = sk_filter(sk, skb);
518 	if (err) {
519 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
520 		goto out;
521 	}
522 	err = __sock_queue_rcv_skb(sk, skb);
523 	switch (err) {
524 	case -ENOMEM:
525 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
526 		break;
527 	case -ENOBUFS:
528 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
529 		break;
530 	default:
531 		drop_reason = SKB_NOT_DROPPED_YET;
532 		break;
533 	}
534 out:
535 	if (reason)
536 		*reason = drop_reason;
537 	return err;
538 }
539 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
540 
541 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
542 		     const int nested, unsigned int trim_cap, bool refcounted)
543 {
544 	int rc = NET_RX_SUCCESS;
545 
546 	if (sk_filter_trim_cap(sk, skb, trim_cap))
547 		goto discard_and_relse;
548 
549 	skb->dev = NULL;
550 
551 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
552 		atomic_inc(&sk->sk_drops);
553 		goto discard_and_relse;
554 	}
555 	if (nested)
556 		bh_lock_sock_nested(sk);
557 	else
558 		bh_lock_sock(sk);
559 	if (!sock_owned_by_user(sk)) {
560 		/*
561 		 * trylock + unlock semantics:
562 		 */
563 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
564 
565 		rc = sk_backlog_rcv(sk, skb);
566 
567 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
568 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
569 		bh_unlock_sock(sk);
570 		atomic_inc(&sk->sk_drops);
571 		goto discard_and_relse;
572 	}
573 
574 	bh_unlock_sock(sk);
575 out:
576 	if (refcounted)
577 		sock_put(sk);
578 	return rc;
579 discard_and_relse:
580 	kfree_skb(skb);
581 	goto out;
582 }
583 EXPORT_SYMBOL(__sk_receive_skb);
584 
585 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
586 							  u32));
587 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
588 							   u32));
589 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
590 {
591 	struct dst_entry *dst = __sk_dst_get(sk);
592 
593 	if (dst && dst->obsolete &&
594 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
595 			       dst, cookie) == NULL) {
596 		sk_tx_queue_clear(sk);
597 		sk->sk_dst_pending_confirm = 0;
598 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
599 		dst_release(dst);
600 		return NULL;
601 	}
602 
603 	return dst;
604 }
605 EXPORT_SYMBOL(__sk_dst_check);
606 
607 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
608 {
609 	struct dst_entry *dst = sk_dst_get(sk);
610 
611 	if (dst && dst->obsolete &&
612 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
613 			       dst, cookie) == NULL) {
614 		sk_dst_reset(sk);
615 		dst_release(dst);
616 		return NULL;
617 	}
618 
619 	return dst;
620 }
621 EXPORT_SYMBOL(sk_dst_check);
622 
623 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
624 {
625 	int ret = -ENOPROTOOPT;
626 #ifdef CONFIG_NETDEVICES
627 	struct net *net = sock_net(sk);
628 
629 	/* Sorry... */
630 	ret = -EPERM;
631 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
632 		goto out;
633 
634 	ret = -EINVAL;
635 	if (ifindex < 0)
636 		goto out;
637 
638 	/* Paired with all READ_ONCE() done locklessly. */
639 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
640 
641 	if (sk->sk_prot->rehash)
642 		sk->sk_prot->rehash(sk);
643 	sk_dst_reset(sk);
644 
645 	ret = 0;
646 
647 out:
648 #endif
649 
650 	return ret;
651 }
652 
653 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
654 {
655 	int ret;
656 
657 	if (lock_sk)
658 		lock_sock(sk);
659 	ret = sock_bindtoindex_locked(sk, ifindex);
660 	if (lock_sk)
661 		release_sock(sk);
662 
663 	return ret;
664 }
665 EXPORT_SYMBOL(sock_bindtoindex);
666 
667 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
668 {
669 	int ret = -ENOPROTOOPT;
670 #ifdef CONFIG_NETDEVICES
671 	struct net *net = sock_net(sk);
672 	char devname[IFNAMSIZ];
673 	int index;
674 
675 	ret = -EINVAL;
676 	if (optlen < 0)
677 		goto out;
678 
679 	/* Bind this socket to a particular device like "eth0",
680 	 * as specified in the passed interface name. If the
681 	 * name is "" or the option length is zero the socket
682 	 * is not bound.
683 	 */
684 	if (optlen > IFNAMSIZ - 1)
685 		optlen = IFNAMSIZ - 1;
686 	memset(devname, 0, sizeof(devname));
687 
688 	ret = -EFAULT;
689 	if (copy_from_sockptr(devname, optval, optlen))
690 		goto out;
691 
692 	index = 0;
693 	if (devname[0] != '\0') {
694 		struct net_device *dev;
695 
696 		rcu_read_lock();
697 		dev = dev_get_by_name_rcu(net, devname);
698 		if (dev)
699 			index = dev->ifindex;
700 		rcu_read_unlock();
701 		ret = -ENODEV;
702 		if (!dev)
703 			goto out;
704 	}
705 
706 	return sock_bindtoindex(sk, index, true);
707 out:
708 #endif
709 
710 	return ret;
711 }
712 
713 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
714 				int __user *optlen, int len)
715 {
716 	int ret = -ENOPROTOOPT;
717 #ifdef CONFIG_NETDEVICES
718 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
719 	struct net *net = sock_net(sk);
720 	char devname[IFNAMSIZ];
721 
722 	if (bound_dev_if == 0) {
723 		len = 0;
724 		goto zero;
725 	}
726 
727 	ret = -EINVAL;
728 	if (len < IFNAMSIZ)
729 		goto out;
730 
731 	ret = netdev_get_name(net, devname, bound_dev_if);
732 	if (ret)
733 		goto out;
734 
735 	len = strlen(devname) + 1;
736 
737 	ret = -EFAULT;
738 	if (copy_to_user(optval, devname, len))
739 		goto out;
740 
741 zero:
742 	ret = -EFAULT;
743 	if (put_user(len, optlen))
744 		goto out;
745 
746 	ret = 0;
747 
748 out:
749 #endif
750 
751 	return ret;
752 }
753 
754 bool sk_mc_loop(struct sock *sk)
755 {
756 	if (dev_recursion_level())
757 		return false;
758 	if (!sk)
759 		return true;
760 	switch (sk->sk_family) {
761 	case AF_INET:
762 		return inet_sk(sk)->mc_loop;
763 #if IS_ENABLED(CONFIG_IPV6)
764 	case AF_INET6:
765 		return inet6_sk(sk)->mc_loop;
766 #endif
767 	}
768 	WARN_ON_ONCE(1);
769 	return true;
770 }
771 EXPORT_SYMBOL(sk_mc_loop);
772 
773 void sock_set_reuseaddr(struct sock *sk)
774 {
775 	lock_sock(sk);
776 	sk->sk_reuse = SK_CAN_REUSE;
777 	release_sock(sk);
778 }
779 EXPORT_SYMBOL(sock_set_reuseaddr);
780 
781 void sock_set_reuseport(struct sock *sk)
782 {
783 	lock_sock(sk);
784 	sk->sk_reuseport = true;
785 	release_sock(sk);
786 }
787 EXPORT_SYMBOL(sock_set_reuseport);
788 
789 void sock_no_linger(struct sock *sk)
790 {
791 	lock_sock(sk);
792 	sk->sk_lingertime = 0;
793 	sock_set_flag(sk, SOCK_LINGER);
794 	release_sock(sk);
795 }
796 EXPORT_SYMBOL(sock_no_linger);
797 
798 void sock_set_priority(struct sock *sk, u32 priority)
799 {
800 	lock_sock(sk);
801 	sk->sk_priority = priority;
802 	release_sock(sk);
803 }
804 EXPORT_SYMBOL(sock_set_priority);
805 
806 void sock_set_sndtimeo(struct sock *sk, s64 secs)
807 {
808 	lock_sock(sk);
809 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
810 		sk->sk_sndtimeo = secs * HZ;
811 	else
812 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
813 	release_sock(sk);
814 }
815 EXPORT_SYMBOL(sock_set_sndtimeo);
816 
817 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
818 {
819 	if (val)  {
820 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
821 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
822 		sock_set_flag(sk, SOCK_RCVTSTAMP);
823 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
824 	} else {
825 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
826 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
827 	}
828 }
829 
830 void sock_enable_timestamps(struct sock *sk)
831 {
832 	lock_sock(sk);
833 	__sock_set_timestamps(sk, true, false, true);
834 	release_sock(sk);
835 }
836 EXPORT_SYMBOL(sock_enable_timestamps);
837 
838 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
839 {
840 	switch (optname) {
841 	case SO_TIMESTAMP_OLD:
842 		__sock_set_timestamps(sk, valbool, false, false);
843 		break;
844 	case SO_TIMESTAMP_NEW:
845 		__sock_set_timestamps(sk, valbool, true, false);
846 		break;
847 	case SO_TIMESTAMPNS_OLD:
848 		__sock_set_timestamps(sk, valbool, false, true);
849 		break;
850 	case SO_TIMESTAMPNS_NEW:
851 		__sock_set_timestamps(sk, valbool, true, true);
852 		break;
853 	}
854 }
855 
856 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
857 {
858 	struct net *net = sock_net(sk);
859 	struct net_device *dev = NULL;
860 	bool match = false;
861 	int *vclock_index;
862 	int i, num;
863 
864 	if (sk->sk_bound_dev_if)
865 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
866 
867 	if (!dev) {
868 		pr_err("%s: sock not bind to device\n", __func__);
869 		return -EOPNOTSUPP;
870 	}
871 
872 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
873 	dev_put(dev);
874 
875 	for (i = 0; i < num; i++) {
876 		if (*(vclock_index + i) == phc_index) {
877 			match = true;
878 			break;
879 		}
880 	}
881 
882 	if (num > 0)
883 		kfree(vclock_index);
884 
885 	if (!match)
886 		return -EINVAL;
887 
888 	sk->sk_bind_phc = phc_index;
889 
890 	return 0;
891 }
892 
893 int sock_set_timestamping(struct sock *sk, int optname,
894 			  struct so_timestamping timestamping)
895 {
896 	int val = timestamping.flags;
897 	int ret;
898 
899 	if (val & ~SOF_TIMESTAMPING_MASK)
900 		return -EINVAL;
901 
902 	if (val & SOF_TIMESTAMPING_OPT_ID &&
903 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
904 		if (sk_is_tcp(sk)) {
905 			if ((1 << sk->sk_state) &
906 			    (TCPF_CLOSE | TCPF_LISTEN))
907 				return -EINVAL;
908 			atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
909 		} else {
910 			atomic_set(&sk->sk_tskey, 0);
911 		}
912 	}
913 
914 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
915 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
916 		return -EINVAL;
917 
918 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
919 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
920 		if (ret)
921 			return ret;
922 	}
923 
924 	sk->sk_tsflags = val;
925 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
926 
927 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
928 		sock_enable_timestamp(sk,
929 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
930 	else
931 		sock_disable_timestamp(sk,
932 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
933 	return 0;
934 }
935 
936 void sock_set_keepalive(struct sock *sk)
937 {
938 	lock_sock(sk);
939 	if (sk->sk_prot->keepalive)
940 		sk->sk_prot->keepalive(sk, true);
941 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
942 	release_sock(sk);
943 }
944 EXPORT_SYMBOL(sock_set_keepalive);
945 
946 static void __sock_set_rcvbuf(struct sock *sk, int val)
947 {
948 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
949 	 * as a negative value.
950 	 */
951 	val = min_t(int, val, INT_MAX / 2);
952 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
953 
954 	/* We double it on the way in to account for "struct sk_buff" etc.
955 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
956 	 * will allow that much actual data to be received on that socket.
957 	 *
958 	 * Applications are unaware that "struct sk_buff" and other overheads
959 	 * allocate from the receive buffer during socket buffer allocation.
960 	 *
961 	 * And after considering the possible alternatives, returning the value
962 	 * we actually used in getsockopt is the most desirable behavior.
963 	 */
964 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
965 }
966 
/**
 * sock_set_rcvbuf - set the receive buffer size
 * @sk: socket to update
 * @val: requested size, handed to __sock_set_rcvbuf()
 *
 * The update is done between lock_sock() and release_sock().
 */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);

	__sock_set_rcvbuf(sk, val);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
974 
975 static void __sock_set_mark(struct sock *sk, u32 val)
976 {
977 	if (val != sk->sk_mark) {
978 		sk->sk_mark = val;
979 		sk_dst_reset(sk);
980 	}
981 }
982 
983 void sock_set_mark(struct sock *sk, u32 val)
984 {
985 	lock_sock(sk);
986 	__sock_set_mark(sk, val);
987 	release_sock(sk);
988 }
989 EXPORT_SYMBOL(sock_set_mark);
990 
991 static void sock_release_reserved_memory(struct sock *sk, int bytes)
992 {
993 	/* Round down bytes to multiple of pages */
994 	bytes &= ~(SK_MEM_QUANTUM - 1);
995 
996 	WARN_ON(bytes > sk->sk_reserved_mem);
997 	sk->sk_reserved_mem -= bytes;
998 	sk_mem_reclaim(sk);
999 }
1000 
1001 static int sock_reserve_memory(struct sock *sk, int bytes)
1002 {
1003 	long allocated;
1004 	bool charged;
1005 	int pages;
1006 
1007 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1008 		return -EOPNOTSUPP;
1009 
1010 	if (!bytes)
1011 		return 0;
1012 
1013 	pages = sk_mem_pages(bytes);
1014 
1015 	/* pre-charge to memcg */
1016 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1017 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1018 	if (!charged)
1019 		return -ENOMEM;
1020 
1021 	/* pre-charge to forward_alloc */
1022 	allocated = sk_memory_allocated_add(sk, pages);
1023 	/* If the system goes into memory pressure with this
1024 	 * precharge, give up and return error.
1025 	 */
1026 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1027 		sk_memory_allocated_sub(sk, pages);
1028 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1029 		return -ENOMEM;
1030 	}
1031 	sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
1032 
1033 	sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
1034 
1035 	return 0;
1036 }
1037 
1038 /*
1039  *	This is meant for all protocols to use and covers goings on
1040  *	at the socket level. Everything here is generic.
1041  */
1042 
1043 int sock_setsockopt(struct socket *sock, int level, int optname,
1044 		    sockptr_t optval, unsigned int optlen)
1045 {
1046 	struct so_timestamping timestamping;
1047 	struct sock_txtime sk_txtime;
1048 	struct sock *sk = sock->sk;
1049 	int val;
1050 	int valbool;
1051 	struct linger ling;
1052 	int ret = 0;
1053 
1054 	/*
1055 	 *	Options without arguments
1056 	 */
1057 
1058 	if (optname == SO_BINDTODEVICE)
1059 		return sock_setbindtodevice(sk, optval, optlen);
1060 
1061 	if (optlen < sizeof(int))
1062 		return -EINVAL;
1063 
1064 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1065 		return -EFAULT;
1066 
1067 	valbool = val ? 1 : 0;
1068 
1069 	lock_sock(sk);
1070 
1071 	switch (optname) {
1072 	case SO_DEBUG:
1073 		if (val && !capable(CAP_NET_ADMIN))
1074 			ret = -EACCES;
1075 		else
1076 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1077 		break;
1078 	case SO_REUSEADDR:
1079 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1080 		break;
1081 	case SO_REUSEPORT:
1082 		sk->sk_reuseport = valbool;
1083 		break;
1084 	case SO_TYPE:
1085 	case SO_PROTOCOL:
1086 	case SO_DOMAIN:
1087 	case SO_ERROR:
1088 		ret = -ENOPROTOOPT;
1089 		break;
1090 	case SO_DONTROUTE:
1091 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1092 		sk_dst_reset(sk);
1093 		break;
1094 	case SO_BROADCAST:
1095 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1096 		break;
1097 	case SO_SNDBUF:
1098 		/* Don't error on this BSD doesn't and if you think
1099 		 * about it this is right. Otherwise apps have to
1100 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1101 		 * are treated in BSD as hints
1102 		 */
1103 		val = min_t(u32, val, sysctl_wmem_max);
1104 set_sndbuf:
1105 		/* Ensure val * 2 fits into an int, to prevent max_t()
1106 		 * from treating it as a negative value.
1107 		 */
1108 		val = min_t(int, val, INT_MAX / 2);
1109 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1110 		WRITE_ONCE(sk->sk_sndbuf,
1111 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1112 		/* Wake up sending tasks if we upped the value. */
1113 		sk->sk_write_space(sk);
1114 		break;
1115 
1116 	case SO_SNDBUFFORCE:
1117 		if (!capable(CAP_NET_ADMIN)) {
1118 			ret = -EPERM;
1119 			break;
1120 		}
1121 
1122 		/* No negative values (to prevent underflow, as val will be
1123 		 * multiplied by 2).
1124 		 */
1125 		if (val < 0)
1126 			val = 0;
1127 		goto set_sndbuf;
1128 
1129 	case SO_RCVBUF:
1130 		/* Don't error on this BSD doesn't and if you think
1131 		 * about it this is right. Otherwise apps have to
1132 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1133 		 * are treated in BSD as hints
1134 		 */
1135 		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1136 		break;
1137 
1138 	case SO_RCVBUFFORCE:
1139 		if (!capable(CAP_NET_ADMIN)) {
1140 			ret = -EPERM;
1141 			break;
1142 		}
1143 
1144 		/* No negative values (to prevent underflow, as val will be
1145 		 * multiplied by 2).
1146 		 */
1147 		__sock_set_rcvbuf(sk, max(val, 0));
1148 		break;
1149 
1150 	case SO_KEEPALIVE:
1151 		if (sk->sk_prot->keepalive)
1152 			sk->sk_prot->keepalive(sk, valbool);
1153 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1154 		break;
1155 
1156 	case SO_OOBINLINE:
1157 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1158 		break;
1159 
1160 	case SO_NO_CHECK:
1161 		sk->sk_no_check_tx = valbool;
1162 		break;
1163 
1164 	case SO_PRIORITY:
1165 		if ((val >= 0 && val <= 6) ||
1166 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1167 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1168 			sk->sk_priority = val;
1169 		else
1170 			ret = -EPERM;
1171 		break;
1172 
1173 	case SO_LINGER:
1174 		if (optlen < sizeof(ling)) {
1175 			ret = -EINVAL;	/* 1003.1g */
1176 			break;
1177 		}
1178 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1179 			ret = -EFAULT;
1180 			break;
1181 		}
1182 		if (!ling.l_onoff)
1183 			sock_reset_flag(sk, SOCK_LINGER);
1184 		else {
1185 #if (BITS_PER_LONG == 32)
1186 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1187 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1188 			else
1189 #endif
1190 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1191 			sock_set_flag(sk, SOCK_LINGER);
1192 		}
1193 		break;
1194 
1195 	case SO_BSDCOMPAT:
1196 		break;
1197 
1198 	case SO_PASSCRED:
1199 		if (valbool)
1200 			set_bit(SOCK_PASSCRED, &sock->flags);
1201 		else
1202 			clear_bit(SOCK_PASSCRED, &sock->flags);
1203 		break;
1204 
1205 	case SO_TIMESTAMP_OLD:
1206 	case SO_TIMESTAMP_NEW:
1207 	case SO_TIMESTAMPNS_OLD:
1208 	case SO_TIMESTAMPNS_NEW:
1209 		sock_set_timestamp(sk, optname, valbool);
1210 		break;
1211 
1212 	case SO_TIMESTAMPING_NEW:
1213 	case SO_TIMESTAMPING_OLD:
1214 		if (optlen == sizeof(timestamping)) {
1215 			if (copy_from_sockptr(&timestamping, optval,
1216 					      sizeof(timestamping))) {
1217 				ret = -EFAULT;
1218 				break;
1219 			}
1220 		} else {
1221 			memset(&timestamping, 0, sizeof(timestamping));
1222 			timestamping.flags = val;
1223 		}
1224 		ret = sock_set_timestamping(sk, optname, timestamping);
1225 		break;
1226 
1227 	case SO_RCVLOWAT:
1228 		if (val < 0)
1229 			val = INT_MAX;
1230 		if (sock->ops->set_rcvlowat)
1231 			ret = sock->ops->set_rcvlowat(sk, val);
1232 		else
1233 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1234 		break;
1235 
1236 	case SO_RCVTIMEO_OLD:
1237 	case SO_RCVTIMEO_NEW:
1238 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1239 				       optlen, optname == SO_RCVTIMEO_OLD);
1240 		break;
1241 
1242 	case SO_SNDTIMEO_OLD:
1243 	case SO_SNDTIMEO_NEW:
1244 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1245 				       optlen, optname == SO_SNDTIMEO_OLD);
1246 		break;
1247 
1248 	case SO_ATTACH_FILTER: {
1249 		struct sock_fprog fprog;
1250 
1251 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1252 		if (!ret)
1253 			ret = sk_attach_filter(&fprog, sk);
1254 		break;
1255 	}
1256 	case SO_ATTACH_BPF:
1257 		ret = -EINVAL;
1258 		if (optlen == sizeof(u32)) {
1259 			u32 ufd;
1260 
1261 			ret = -EFAULT;
1262 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1263 				break;
1264 
1265 			ret = sk_attach_bpf(ufd, sk);
1266 		}
1267 		break;
1268 
1269 	case SO_ATTACH_REUSEPORT_CBPF: {
1270 		struct sock_fprog fprog;
1271 
1272 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1273 		if (!ret)
1274 			ret = sk_reuseport_attach_filter(&fprog, sk);
1275 		break;
1276 	}
1277 	case SO_ATTACH_REUSEPORT_EBPF:
1278 		ret = -EINVAL;
1279 		if (optlen == sizeof(u32)) {
1280 			u32 ufd;
1281 
1282 			ret = -EFAULT;
1283 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1284 				break;
1285 
1286 			ret = sk_reuseport_attach_bpf(ufd, sk);
1287 		}
1288 		break;
1289 
1290 	case SO_DETACH_REUSEPORT_BPF:
1291 		ret = reuseport_detach_prog(sk);
1292 		break;
1293 
1294 	case SO_DETACH_FILTER:
1295 		ret = sk_detach_filter(sk);
1296 		break;
1297 
1298 	case SO_LOCK_FILTER:
1299 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1300 			ret = -EPERM;
1301 		else
1302 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1303 		break;
1304 
1305 	case SO_PASSSEC:
1306 		if (valbool)
1307 			set_bit(SOCK_PASSSEC, &sock->flags);
1308 		else
1309 			clear_bit(SOCK_PASSSEC, &sock->flags);
1310 		break;
1311 	case SO_MARK:
1312 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1313 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1314 			ret = -EPERM;
1315 			break;
1316 		}
1317 
1318 		__sock_set_mark(sk, val);
1319 		break;
1320 	case SO_RCVMARK:
1321 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1322 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1323 			ret = -EPERM;
1324 			break;
1325 		}
1326 
1327 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1328 		break;
1329 
1330 	case SO_RXQ_OVFL:
1331 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1332 		break;
1333 
1334 	case SO_WIFI_STATUS:
1335 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1336 		break;
1337 
1338 	case SO_PEEK_OFF:
1339 		if (sock->ops->set_peek_off)
1340 			ret = sock->ops->set_peek_off(sk, val);
1341 		else
1342 			ret = -EOPNOTSUPP;
1343 		break;
1344 
1345 	case SO_NOFCS:
1346 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1347 		break;
1348 
1349 	case SO_SELECT_ERR_QUEUE:
1350 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1351 		break;
1352 
1353 #ifdef CONFIG_NET_RX_BUSY_POLL
1354 	case SO_BUSY_POLL:
1355 		/* allow unprivileged users to decrease the value */
1356 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1357 			ret = -EPERM;
1358 		else {
1359 			if (val < 0)
1360 				ret = -EINVAL;
1361 			else
1362 				WRITE_ONCE(sk->sk_ll_usec, val);
1363 		}
1364 		break;
1365 	case SO_PREFER_BUSY_POLL:
1366 		if (valbool && !capable(CAP_NET_ADMIN))
1367 			ret = -EPERM;
1368 		else
1369 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1370 		break;
1371 	case SO_BUSY_POLL_BUDGET:
1372 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1373 			ret = -EPERM;
1374 		} else {
1375 			if (val < 0 || val > U16_MAX)
1376 				ret = -EINVAL;
1377 			else
1378 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1379 		}
1380 		break;
1381 #endif
1382 
1383 	case SO_MAX_PACING_RATE:
1384 		{
1385 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1386 
1387 		if (sizeof(ulval) != sizeof(val) &&
1388 		    optlen >= sizeof(ulval) &&
1389 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1390 			ret = -EFAULT;
1391 			break;
1392 		}
1393 		if (ulval != ~0UL)
1394 			cmpxchg(&sk->sk_pacing_status,
1395 				SK_PACING_NONE,
1396 				SK_PACING_NEEDED);
1397 		sk->sk_max_pacing_rate = ulval;
1398 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1399 		break;
1400 		}
1401 	case SO_INCOMING_CPU:
1402 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1403 		break;
1404 
1405 	case SO_CNX_ADVICE:
1406 		if (val == 1)
1407 			dst_negative_advice(sk);
1408 		break;
1409 
1410 	case SO_ZEROCOPY:
1411 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1412 			if (!(sk_is_tcp(sk) ||
1413 			      (sk->sk_type == SOCK_DGRAM &&
1414 			       sk->sk_protocol == IPPROTO_UDP)))
1415 				ret = -EOPNOTSUPP;
1416 		} else if (sk->sk_family != PF_RDS) {
1417 			ret = -EOPNOTSUPP;
1418 		}
1419 		if (!ret) {
1420 			if (val < 0 || val > 1)
1421 				ret = -EINVAL;
1422 			else
1423 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1424 		}
1425 		break;
1426 
1427 	case SO_TXTIME:
1428 		if (optlen != sizeof(struct sock_txtime)) {
1429 			ret = -EINVAL;
1430 			break;
1431 		} else if (copy_from_sockptr(&sk_txtime, optval,
1432 			   sizeof(struct sock_txtime))) {
1433 			ret = -EFAULT;
1434 			break;
1435 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1436 			ret = -EINVAL;
1437 			break;
1438 		}
1439 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1440 		 * scheduler has enough safe guards.
1441 		 */
1442 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1443 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1444 			ret = -EPERM;
1445 			break;
1446 		}
1447 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1448 		sk->sk_clockid = sk_txtime.clockid;
1449 		sk->sk_txtime_deadline_mode =
1450 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1451 		sk->sk_txtime_report_errors =
1452 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1453 		break;
1454 
1455 	case SO_BINDTOIFINDEX:
1456 		ret = sock_bindtoindex_locked(sk, val);
1457 		break;
1458 
1459 	case SO_BUF_LOCK:
1460 		if (val & ~SOCK_BUF_LOCK_MASK) {
1461 			ret = -EINVAL;
1462 			break;
1463 		}
1464 		sk->sk_userlocks = val | (sk->sk_userlocks &
1465 					  ~SOCK_BUF_LOCK_MASK);
1466 		break;
1467 
1468 	case SO_RESERVE_MEM:
1469 	{
1470 		int delta;
1471 
1472 		if (val < 0) {
1473 			ret = -EINVAL;
1474 			break;
1475 		}
1476 
1477 		delta = val - sk->sk_reserved_mem;
1478 		if (delta < 0)
1479 			sock_release_reserved_memory(sk, -delta);
1480 		else
1481 			ret = sock_reserve_memory(sk, delta);
1482 		break;
1483 	}
1484 
1485 	case SO_TXREHASH:
1486 		if (val < -1 || val > 1) {
1487 			ret = -EINVAL;
1488 			break;
1489 		}
1490 		/* Paired with READ_ONCE() in tcp_rtx_synack() */
1491 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1492 		break;
1493 
1494 	default:
1495 		ret = -ENOPROTOOPT;
1496 		break;
1497 	}
1498 	release_sock(sk);
1499 	return ret;
1500 }
1501 EXPORT_SYMBOL(sock_setsockopt);
1502 
1503 static const struct cred *sk_get_peer_cred(struct sock *sk)
1504 {
1505 	const struct cred *cred;
1506 
1507 	spin_lock(&sk->sk_peer_lock);
1508 	cred = get_cred(sk->sk_peer_cred);
1509 	spin_unlock(&sk->sk_peer_lock);
1510 
1511 	return cred;
1512 }
1513 
1514 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1515 			  struct ucred *ucred)
1516 {
1517 	ucred->pid = pid_vnr(pid);
1518 	ucred->uid = ucred->gid = -1;
1519 	if (cred) {
1520 		struct user_namespace *current_ns = current_user_ns();
1521 
1522 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1523 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1524 	}
1525 }
1526 
1527 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1528 {
1529 	struct user_namespace *user_ns = current_user_ns();
1530 	int i;
1531 
1532 	for (i = 0; i < src->ngroups; i++)
1533 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1534 			return -EFAULT;
1535 
1536 	return 0;
1537 }
1538 
1539 int sock_getsockopt(struct socket *sock, int level, int optname,
1540 		    char __user *optval, int __user *optlen)
1541 {
1542 	struct sock *sk = sock->sk;
1543 
1544 	union {
1545 		int val;
1546 		u64 val64;
1547 		unsigned long ulval;
1548 		struct linger ling;
1549 		struct old_timeval32 tm32;
1550 		struct __kernel_old_timeval tm;
1551 		struct  __kernel_sock_timeval stm;
1552 		struct sock_txtime txtime;
1553 		struct so_timestamping timestamping;
1554 	} v;
1555 
1556 	int lv = sizeof(int);
1557 	int len;
1558 
1559 	if (get_user(len, optlen))
1560 		return -EFAULT;
1561 	if (len < 0)
1562 		return -EINVAL;
1563 
1564 	memset(&v, 0, sizeof(v));
1565 
1566 	switch (optname) {
1567 	case SO_DEBUG:
1568 		v.val = sock_flag(sk, SOCK_DBG);
1569 		break;
1570 
1571 	case SO_DONTROUTE:
1572 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1573 		break;
1574 
1575 	case SO_BROADCAST:
1576 		v.val = sock_flag(sk, SOCK_BROADCAST);
1577 		break;
1578 
1579 	case SO_SNDBUF:
1580 		v.val = sk->sk_sndbuf;
1581 		break;
1582 
1583 	case SO_RCVBUF:
1584 		v.val = sk->sk_rcvbuf;
1585 		break;
1586 
1587 	case SO_REUSEADDR:
1588 		v.val = sk->sk_reuse;
1589 		break;
1590 
1591 	case SO_REUSEPORT:
1592 		v.val = sk->sk_reuseport;
1593 		break;
1594 
1595 	case SO_KEEPALIVE:
1596 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1597 		break;
1598 
1599 	case SO_TYPE:
1600 		v.val = sk->sk_type;
1601 		break;
1602 
1603 	case SO_PROTOCOL:
1604 		v.val = sk->sk_protocol;
1605 		break;
1606 
1607 	case SO_DOMAIN:
1608 		v.val = sk->sk_family;
1609 		break;
1610 
1611 	case SO_ERROR:
1612 		v.val = -sock_error(sk);
1613 		if (v.val == 0)
1614 			v.val = xchg(&sk->sk_err_soft, 0);
1615 		break;
1616 
1617 	case SO_OOBINLINE:
1618 		v.val = sock_flag(sk, SOCK_URGINLINE);
1619 		break;
1620 
1621 	case SO_NO_CHECK:
1622 		v.val = sk->sk_no_check_tx;
1623 		break;
1624 
1625 	case SO_PRIORITY:
1626 		v.val = sk->sk_priority;
1627 		break;
1628 
1629 	case SO_LINGER:
1630 		lv		= sizeof(v.ling);
1631 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1632 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1633 		break;
1634 
1635 	case SO_BSDCOMPAT:
1636 		break;
1637 
1638 	case SO_TIMESTAMP_OLD:
1639 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1640 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1641 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1642 		break;
1643 
1644 	case SO_TIMESTAMPNS_OLD:
1645 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1646 		break;
1647 
1648 	case SO_TIMESTAMP_NEW:
1649 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1650 		break;
1651 
1652 	case SO_TIMESTAMPNS_NEW:
1653 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1654 		break;
1655 
1656 	case SO_TIMESTAMPING_OLD:
1657 		lv = sizeof(v.timestamping);
1658 		v.timestamping.flags = sk->sk_tsflags;
1659 		v.timestamping.bind_phc = sk->sk_bind_phc;
1660 		break;
1661 
1662 	case SO_RCVTIMEO_OLD:
1663 	case SO_RCVTIMEO_NEW:
1664 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1665 		break;
1666 
1667 	case SO_SNDTIMEO_OLD:
1668 	case SO_SNDTIMEO_NEW:
1669 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1670 		break;
1671 
1672 	case SO_RCVLOWAT:
1673 		v.val = sk->sk_rcvlowat;
1674 		break;
1675 
1676 	case SO_SNDLOWAT:
1677 		v.val = 1;
1678 		break;
1679 
1680 	case SO_PASSCRED:
1681 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1682 		break;
1683 
1684 	case SO_PEERCRED:
1685 	{
1686 		struct ucred peercred;
1687 		if (len > sizeof(peercred))
1688 			len = sizeof(peercred);
1689 
1690 		spin_lock(&sk->sk_peer_lock);
1691 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1692 		spin_unlock(&sk->sk_peer_lock);
1693 
1694 		if (copy_to_user(optval, &peercred, len))
1695 			return -EFAULT;
1696 		goto lenout;
1697 	}
1698 
1699 	case SO_PEERGROUPS:
1700 	{
1701 		const struct cred *cred;
1702 		int ret, n;
1703 
1704 		cred = sk_get_peer_cred(sk);
1705 		if (!cred)
1706 			return -ENODATA;
1707 
1708 		n = cred->group_info->ngroups;
1709 		if (len < n * sizeof(gid_t)) {
1710 			len = n * sizeof(gid_t);
1711 			put_cred(cred);
1712 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1713 		}
1714 		len = n * sizeof(gid_t);
1715 
1716 		ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1717 		put_cred(cred);
1718 		if (ret)
1719 			return ret;
1720 		goto lenout;
1721 	}
1722 
1723 	case SO_PEERNAME:
1724 	{
1725 		char address[128];
1726 
1727 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1728 		if (lv < 0)
1729 			return -ENOTCONN;
1730 		if (lv < len)
1731 			return -EINVAL;
1732 		if (copy_to_user(optval, address, len))
1733 			return -EFAULT;
1734 		goto lenout;
1735 	}
1736 
1737 	/* Dubious BSD thing... Probably nobody even uses it, but
1738 	 * the UNIX standard wants it for whatever reason... -DaveM
1739 	 */
1740 	case SO_ACCEPTCONN:
1741 		v.val = sk->sk_state == TCP_LISTEN;
1742 		break;
1743 
1744 	case SO_PASSSEC:
1745 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1746 		break;
1747 
1748 	case SO_PEERSEC:
1749 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1750 
1751 	case SO_MARK:
1752 		v.val = sk->sk_mark;
1753 		break;
1754 
1755 	case SO_RCVMARK:
1756 		v.val = sock_flag(sk, SOCK_RCVMARK);
1757 		break;
1758 
1759 	case SO_RXQ_OVFL:
1760 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1761 		break;
1762 
1763 	case SO_WIFI_STATUS:
1764 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1765 		break;
1766 
1767 	case SO_PEEK_OFF:
1768 		if (!sock->ops->set_peek_off)
1769 			return -EOPNOTSUPP;
1770 
1771 		v.val = sk->sk_peek_off;
1772 		break;
1773 	case SO_NOFCS:
1774 		v.val = sock_flag(sk, SOCK_NOFCS);
1775 		break;
1776 
1777 	case SO_BINDTODEVICE:
1778 		return sock_getbindtodevice(sk, optval, optlen, len);
1779 
1780 	case SO_GET_FILTER:
1781 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1782 		if (len < 0)
1783 			return len;
1784 
1785 		goto lenout;
1786 
1787 	case SO_LOCK_FILTER:
1788 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1789 		break;
1790 
1791 	case SO_BPF_EXTENSIONS:
1792 		v.val = bpf_tell_extensions();
1793 		break;
1794 
1795 	case SO_SELECT_ERR_QUEUE:
1796 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1797 		break;
1798 
1799 #ifdef CONFIG_NET_RX_BUSY_POLL
1800 	case SO_BUSY_POLL:
1801 		v.val = sk->sk_ll_usec;
1802 		break;
1803 	case SO_PREFER_BUSY_POLL:
1804 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1805 		break;
1806 #endif
1807 
1808 	case SO_MAX_PACING_RATE:
1809 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1810 			lv = sizeof(v.ulval);
1811 			v.ulval = sk->sk_max_pacing_rate;
1812 		} else {
1813 			/* 32bit version */
1814 			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1815 		}
1816 		break;
1817 
1818 	case SO_INCOMING_CPU:
1819 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1820 		break;
1821 
1822 	case SO_MEMINFO:
1823 	{
1824 		u32 meminfo[SK_MEMINFO_VARS];
1825 
1826 		sk_get_meminfo(sk, meminfo);
1827 
1828 		len = min_t(unsigned int, len, sizeof(meminfo));
1829 		if (copy_to_user(optval, &meminfo, len))
1830 			return -EFAULT;
1831 
1832 		goto lenout;
1833 	}
1834 
1835 #ifdef CONFIG_NET_RX_BUSY_POLL
1836 	case SO_INCOMING_NAPI_ID:
1837 		v.val = READ_ONCE(sk->sk_napi_id);
1838 
1839 		/* aggregate non-NAPI IDs down to 0 */
1840 		if (v.val < MIN_NAPI_ID)
1841 			v.val = 0;
1842 
1843 		break;
1844 #endif
1845 
1846 	case SO_COOKIE:
1847 		lv = sizeof(u64);
1848 		if (len < lv)
1849 			return -EINVAL;
1850 		v.val64 = sock_gen_cookie(sk);
1851 		break;
1852 
1853 	case SO_ZEROCOPY:
1854 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1855 		break;
1856 
1857 	case SO_TXTIME:
1858 		lv = sizeof(v.txtime);
1859 		v.txtime.clockid = sk->sk_clockid;
1860 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1861 				  SOF_TXTIME_DEADLINE_MODE : 0;
1862 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1863 				  SOF_TXTIME_REPORT_ERRORS : 0;
1864 		break;
1865 
1866 	case SO_BINDTOIFINDEX:
1867 		v.val = READ_ONCE(sk->sk_bound_dev_if);
1868 		break;
1869 
1870 	case SO_NETNS_COOKIE:
1871 		lv = sizeof(u64);
1872 		if (len != lv)
1873 			return -EINVAL;
1874 		v.val64 = sock_net(sk)->net_cookie;
1875 		break;
1876 
1877 	case SO_BUF_LOCK:
1878 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1879 		break;
1880 
1881 	case SO_RESERVE_MEM:
1882 		v.val = sk->sk_reserved_mem;
1883 		break;
1884 
1885 	case SO_TXREHASH:
1886 		v.val = sk->sk_txrehash;
1887 		break;
1888 
1889 	default:
1890 		/* We implement the SO_SNDLOWAT etc to not be settable
1891 		 * (1003.1g 7).
1892 		 */
1893 		return -ENOPROTOOPT;
1894 	}
1895 
1896 	if (len > lv)
1897 		len = lv;
1898 	if (copy_to_user(optval, &v, len))
1899 		return -EFAULT;
1900 lenout:
1901 	if (put_user(len, optlen))
1902 		return -EFAULT;
1903 	return 0;
1904 }
1905 
1906 /*
1907  * Initialize an sk_lock.
1908  *
1909  * (We also register the sk_lock with the lock validator.)
1910  */
1911 static inline void sock_lock_init(struct sock *sk)
1912 {
1913 	if (sk->sk_kern_sock)
1914 		sock_lock_init_class_and_name(
1915 			sk,
1916 			af_family_kern_slock_key_strings[sk->sk_family],
1917 			af_family_kern_slock_keys + sk->sk_family,
1918 			af_family_kern_key_strings[sk->sk_family],
1919 			af_family_kern_keys + sk->sk_family);
1920 	else
1921 		sock_lock_init_class_and_name(
1922 			sk,
1923 			af_family_slock_key_strings[sk->sk_family],
1924 			af_family_slock_keys + sk->sk_family,
1925 			af_family_key_strings[sk->sk_family],
1926 			af_family_keys + sk->sk_family);
1927 }
1928 
1929 /*
1930  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1931  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1932  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1933  */
1934 static void sock_copy(struct sock *nsk, const struct sock *osk)
1935 {
1936 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1937 #ifdef CONFIG_SECURITY_NETWORK
1938 	void *sptr = nsk->sk_security;
1939 #endif
1940 
1941 	/* If we move sk_tx_queue_mapping out of the private section,
1942 	 * we must check if sk_tx_queue_clear() is called after
1943 	 * sock_copy() in sk_clone_lock().
1944 	 */
1945 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1946 		     offsetof(struct sock, sk_dontcopy_begin) ||
1947 		     offsetof(struct sock, sk_tx_queue_mapping) >=
1948 		     offsetof(struct sock, sk_dontcopy_end));
1949 
1950 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1951 
1952 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1953 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1954 
1955 #ifdef CONFIG_SECURITY_NETWORK
1956 	nsk->sk_security = sptr;
1957 	security_sk_clone(osk, nsk);
1958 #endif
1959 }
1960 
1961 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1962 		int family)
1963 {
1964 	struct sock *sk;
1965 	struct kmem_cache *slab;
1966 
1967 	slab = prot->slab;
1968 	if (slab != NULL) {
1969 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1970 		if (!sk)
1971 			return sk;
1972 		if (want_init_on_alloc(priority))
1973 			sk_prot_clear_nulls(sk, prot->obj_size);
1974 	} else
1975 		sk = kmalloc(prot->obj_size, priority);
1976 
1977 	if (sk != NULL) {
1978 		if (security_sk_alloc(sk, family, priority))
1979 			goto out_free;
1980 
1981 		if (!try_module_get(prot->owner))
1982 			goto out_free_sec;
1983 	}
1984 
1985 	return sk;
1986 
1987 out_free_sec:
1988 	security_sk_free(sk);
1989 out_free:
1990 	if (slab != NULL)
1991 		kmem_cache_free(slab, sk);
1992 	else
1993 		kfree(sk);
1994 	return NULL;
1995 }
1996 
1997 static void sk_prot_free(struct proto *prot, struct sock *sk)
1998 {
1999 	struct kmem_cache *slab;
2000 	struct module *owner;
2001 
2002 	owner = prot->owner;
2003 	slab = prot->slab;
2004 
2005 	cgroup_sk_free(&sk->sk_cgrp_data);
2006 	mem_cgroup_sk_free(sk);
2007 	security_sk_free(sk);
2008 	if (slab != NULL)
2009 		kmem_cache_free(slab, sk);
2010 	else
2011 		kfree(sk);
2012 	module_put(owner);
2013 }
2014 
2015 /**
2016  *	sk_alloc - All socket objects are allocated here
2017  *	@net: the applicable net namespace
2018  *	@family: protocol family
2019  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2020  *	@prot: struct proto associated with this new sock instance
2021  *	@kern: is this to be a kernel socket?
2022  */
2023 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2024 		      struct proto *prot, int kern)
2025 {
2026 	struct sock *sk;
2027 
2028 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2029 	if (sk) {
2030 		sk->sk_family = family;
2031 		/*
2032 		 * See comment in struct sock definition to understand
2033 		 * why we need sk_prot_creator -acme
2034 		 */
2035 		sk->sk_prot = sk->sk_prot_creator = prot;
2036 		sk->sk_kern_sock = kern;
2037 		sock_lock_init(sk);
2038 		sk->sk_net_refcnt = kern ? 0 : 1;
2039 		if (likely(sk->sk_net_refcnt)) {
2040 			get_net_track(net, &sk->ns_tracker, priority);
2041 			sock_inuse_add(net, 1);
2042 		}
2043 
2044 		sock_net_set(sk, net);
2045 		refcount_set(&sk->sk_wmem_alloc, 1);
2046 
2047 		mem_cgroup_sk_alloc(sk);
2048 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2049 		sock_update_classid(&sk->sk_cgrp_data);
2050 		sock_update_netprioidx(&sk->sk_cgrp_data);
2051 		sk_tx_queue_clear(sk);
2052 	}
2053 
2054 	return sk;
2055 }
2056 EXPORT_SYMBOL(sk_alloc);
2057 
2058 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2059  * grace period. This is the case for UDP sockets and TCP listeners.
2060  */
2061 static void __sk_destruct(struct rcu_head *head)
2062 {
2063 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2064 	struct sk_filter *filter;
2065 
2066 	if (sk->sk_destruct)
2067 		sk->sk_destruct(sk);
2068 
2069 	filter = rcu_dereference_check(sk->sk_filter,
2070 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2071 	if (filter) {
2072 		sk_filter_uncharge(sk, filter);
2073 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2074 	}
2075 
2076 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2077 
2078 #ifdef CONFIG_BPF_SYSCALL
2079 	bpf_sk_storage_free(sk);
2080 #endif
2081 
2082 	if (atomic_read(&sk->sk_omem_alloc))
2083 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2084 			 __func__, atomic_read(&sk->sk_omem_alloc));
2085 
2086 	if (sk->sk_frag.page) {
2087 		put_page(sk->sk_frag.page);
2088 		sk->sk_frag.page = NULL;
2089 	}
2090 
2091 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2092 	put_cred(sk->sk_peer_cred);
2093 	put_pid(sk->sk_peer_pid);
2094 
2095 	if (likely(sk->sk_net_refcnt))
2096 		put_net_track(sock_net(sk), &sk->ns_tracker);
2097 	sk_prot_free(sk->sk_prot_creator, sk);
2098 }
2099 
2100 void sk_destruct(struct sock *sk)
2101 {
2102 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2103 
2104 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2105 		reuseport_detach_sock(sk);
2106 		use_call_rcu = true;
2107 	}
2108 
2109 	if (use_call_rcu)
2110 		call_rcu(&sk->sk_rcu, __sk_destruct);
2111 	else
2112 		__sk_destruct(&sk->sk_rcu);
2113 }
2114 
2115 static void __sk_free(struct sock *sk)
2116 {
2117 	if (likely(sk->sk_net_refcnt))
2118 		sock_inuse_add(sock_net(sk), -1);
2119 
2120 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2121 		sock_diag_broadcast_destroy(sk);
2122 	else
2123 		sk_destruct(sk);
2124 }
2125 
2126 void sk_free(struct sock *sk)
2127 {
2128 	/*
2129 	 * We subtract one from sk_wmem_alloc and can know if
2130 	 * some packets are still in some tx queue.
2131 	 * If not null, sock_wfree() will call __sk_free(sk) later
2132 	 */
2133 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2134 		__sk_free(sk);
2135 }
2136 EXPORT_SYMBOL(sk_free);
2137 
2138 static void sk_init_common(struct sock *sk)
2139 {
2140 	skb_queue_head_init(&sk->sk_receive_queue);
2141 	skb_queue_head_init(&sk->sk_write_queue);
2142 	skb_queue_head_init(&sk->sk_error_queue);
2143 
2144 	rwlock_init(&sk->sk_callback_lock);
2145 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2146 			af_rlock_keys + sk->sk_family,
2147 			af_family_rlock_key_strings[sk->sk_family]);
2148 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2149 			af_wlock_keys + sk->sk_family,
2150 			af_family_wlock_key_strings[sk->sk_family]);
2151 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2152 			af_elock_keys + sk->sk_family,
2153 			af_family_elock_key_strings[sk->sk_family]);
2154 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2155 			af_callback_keys + sk->sk_family,
2156 			af_family_clock_key_strings[sk->sk_family]);
2157 }
2158 
2159 /**
2160  *	sk_clone_lock - clone a socket, and lock its clone
2161  *	@sk: the socket to clone
2162  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2163  *
2164  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2165  */
2166 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2167 {
2168 	struct proto *prot = READ_ONCE(sk->sk_prot);
2169 	struct sk_filter *filter;
2170 	bool is_charged = true;
2171 	struct sock *newsk;
2172 
2173 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2174 	if (!newsk)
2175 		goto out;
2176 
2177 	sock_copy(newsk, sk);
2178 
2179 	newsk->sk_prot_creator = prot;
2180 
2181 	/* SANITY */
2182 	if (likely(newsk->sk_net_refcnt)) {
2183 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2184 		sock_inuse_add(sock_net(newsk), 1);
2185 	}
2186 	sk_node_init(&newsk->sk_node);
2187 	sock_lock_init(newsk);
2188 	bh_lock_sock(newsk);
2189 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2190 	newsk->sk_backlog.len = 0;
2191 
2192 	atomic_set(&newsk->sk_rmem_alloc, 0);
2193 
2194 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2195 	refcount_set(&newsk->sk_wmem_alloc, 1);
2196 
2197 	atomic_set(&newsk->sk_omem_alloc, 0);
2198 	sk_init_common(newsk);
2199 
2200 	newsk->sk_dst_cache	= NULL;
2201 	newsk->sk_dst_pending_confirm = 0;
2202 	newsk->sk_wmem_queued	= 0;
2203 	newsk->sk_forward_alloc = 0;
2204 	newsk->sk_reserved_mem  = 0;
2205 	atomic_set(&newsk->sk_drops, 0);
2206 	newsk->sk_send_head	= NULL;
2207 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2208 	atomic_set(&newsk->sk_zckey, 0);
2209 
2210 	sock_reset_flag(newsk, SOCK_DONE);
2211 
2212 	/* sk->sk_memcg will be populated at accept() time */
2213 	newsk->sk_memcg = NULL;
2214 
2215 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2216 
2217 	rcu_read_lock();
2218 	filter = rcu_dereference(sk->sk_filter);
2219 	if (filter != NULL)
2220 		/* though it's an empty new sock, the charging may fail
2221 		 * if sysctl_optmem_max was changed between creation of
2222 		 * original socket and cloning
2223 		 */
2224 		is_charged = sk_filter_charge(newsk, filter);
2225 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2226 	rcu_read_unlock();
2227 
2228 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2229 		/* We need to make sure that we don't uncharge the new
2230 		 * socket if we couldn't charge it in the first place
2231 		 * as otherwise we uncharge the parent's filter.
2232 		 */
2233 		if (!is_charged)
2234 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2235 		sk_free_unlock_clone(newsk);
2236 		newsk = NULL;
2237 		goto out;
2238 	}
2239 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2240 
2241 	if (bpf_sk_storage_clone(sk, newsk)) {
2242 		sk_free_unlock_clone(newsk);
2243 		newsk = NULL;
2244 		goto out;
2245 	}
2246 
2247 	/* Clear sk_user_data if parent had the pointer tagged
2248 	 * as not suitable for copying when cloning.
2249 	 */
2250 	if (sk_user_data_is_nocopy(newsk))
2251 		newsk->sk_user_data = NULL;
2252 
2253 	newsk->sk_err	   = 0;
2254 	newsk->sk_err_soft = 0;
2255 	newsk->sk_priority = 0;
2256 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2257 
2258 	/* Before updating sk_refcnt, we must commit prior changes to memory
2259 	 * (Documentation/RCU/rculist_nulls.rst for details)
2260 	 */
2261 	smp_wmb();
2262 	refcount_set(&newsk->sk_refcnt, 2);
2263 
2264 	/* Increment the counter in the same struct proto as the master
2265 	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2266 	 * is the same as sk->sk_prot->socks, as this field was copied
2267 	 * with memcpy).
2268 	 *
2269 	 * This _changes_ the previous behaviour, where
2270 	 * tcp_create_openreq_child always was incrementing the
2271 	 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
2272 	 * to be taken into account in all callers. -acme
2273 	 */
2274 	sk_refcnt_debug_inc(newsk);
2275 	sk_set_socket(newsk, NULL);
2276 	sk_tx_queue_clear(newsk);
2277 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2278 
2279 	if (newsk->sk_prot->sockets_allocated)
2280 		sk_sockets_allocated_inc(newsk);
2281 
2282 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2283 		net_enable_timestamp();
2284 out:
2285 	return newsk;
2286 }
2287 EXPORT_SYMBOL_GPL(sk_clone_lock);
2288 
2289 void sk_free_unlock_clone(struct sock *sk)
2290 {
2291 	/* It is still raw copy of parent, so invalidate
2292 	 * destructor and make plain sk_free() */
2293 	sk->sk_destruct = NULL;
2294 	bh_unlock_sock(sk);
2295 	sk_free(sk);
2296 }
2297 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2298 
2299 static void sk_trim_gso_size(struct sock *sk)
2300 {
2301 	if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
2302 		return;
2303 #if IS_ENABLED(CONFIG_IPV6)
2304 	if (sk->sk_family == AF_INET6 &&
2305 	    sk_is_tcp(sk) &&
2306 	    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
2307 		return;
2308 #endif
2309 	sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
2310 }
2311 
2312 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2313 {
2314 	u32 max_segs = 1;
2315 
2316 	sk_dst_set(sk, dst);
2317 	sk->sk_route_caps = dst->dev->features;
2318 	if (sk_is_tcp(sk))
2319 		sk->sk_route_caps |= NETIF_F_GSO;
2320 	if (sk->sk_route_caps & NETIF_F_GSO)
2321 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2322 	if (unlikely(sk->sk_gso_disabled))
2323 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2324 	if (sk_can_gso(sk)) {
2325 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2326 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2327 		} else {
2328 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2329 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2330 			sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2331 			sk_trim_gso_size(sk);
2332 			sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
2333 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2334 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2335 		}
2336 	}
2337 	sk->sk_gso_max_segs = max_segs;
2338 }
2339 EXPORT_SYMBOL_GPL(sk_setup_caps);
2340 
2341 /*
2342  *	Simple resource managers for sockets.
2343  */
2344 
2345 
2346 /*
2347  * Write buffer destructor automatically called from kfree_skb.
2348  */
2349 void sock_wfree(struct sk_buff *skb)
2350 {
2351 	struct sock *sk = skb->sk;
2352 	unsigned int len = skb->truesize;
2353 	bool free;
2354 
2355 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2356 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2357 		    sk->sk_write_space == sock_def_write_space) {
2358 			rcu_read_lock();
2359 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2360 			sock_def_write_space_wfree(sk);
2361 			rcu_read_unlock();
2362 			if (unlikely(free))
2363 				__sk_free(sk);
2364 			return;
2365 		}
2366 
2367 		/*
2368 		 * Keep a reference on sk_wmem_alloc, this will be released
2369 		 * after sk_write_space() call
2370 		 */
2371 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2372 		sk->sk_write_space(sk);
2373 		len = 1;
2374 	}
2375 	/*
2376 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2377 	 * could not do because of in-flight packets
2378 	 */
2379 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2380 		__sk_free(sk);
2381 }
2382 EXPORT_SYMBOL(sock_wfree);
2383 
2384 /* This variant of sock_wfree() is used by TCP,
2385  * since it sets SOCK_USE_WRITE_QUEUE.
2386  */
2387 void __sock_wfree(struct sk_buff *skb)
2388 {
2389 	struct sock *sk = skb->sk;
2390 
2391 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2392 		__sk_free(sk);
2393 }
2394 
2395 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2396 {
2397 	skb_orphan(skb);
2398 	skb->sk = sk;
2399 #ifdef CONFIG_INET
2400 	if (unlikely(!sk_fullsock(sk))) {
2401 		skb->destructor = sock_edemux;
2402 		sock_hold(sk);
2403 		return;
2404 	}
2405 #endif
2406 	skb->destructor = sock_wfree;
2407 	skb_set_hash_from_sk(skb, sk);
2408 	/*
2409 	 * We used to take a refcount on sk, but following operation
2410 	 * is enough to guarantee sk_free() wont free this sock until
2411 	 * all in-flight packets are completed
2412 	 */
2413 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2414 }
2415 EXPORT_SYMBOL(skb_set_owner_w);
2416 
2417 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2418 {
2419 #ifdef CONFIG_TLS_DEVICE
2420 	/* Drivers depend on in-order delivery for crypto offload,
2421 	 * partial orphan breaks out-of-order-OK logic.
2422 	 */
2423 	if (skb->decrypted)
2424 		return false;
2425 #endif
2426 	return (skb->destructor == sock_wfree ||
2427 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2428 }
2429 
2430 /* This helper is used by netem, as it can hold packets in its
2431  * delay queue. We want to allow the owner socket to send more
2432  * packets, as if they were already TX completed by a typical driver.
2433  * But we also want to keep skb->sk set because some packet schedulers
2434  * rely on it (sch_fq for example).
2435  */
2436 void skb_orphan_partial(struct sk_buff *skb)
2437 {
2438 	if (skb_is_tcp_pure_ack(skb))
2439 		return;
2440 
2441 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2442 		return;
2443 
2444 	skb_orphan(skb);
2445 }
2446 EXPORT_SYMBOL(skb_orphan_partial);
2447 
2448 /*
2449  * Read buffer destructor automatically called from kfree_skb.
2450  */
2451 void sock_rfree(struct sk_buff *skb)
2452 {
2453 	struct sock *sk = skb->sk;
2454 	unsigned int len = skb->truesize;
2455 
2456 	atomic_sub(len, &sk->sk_rmem_alloc);
2457 	sk_mem_uncharge(sk, len);
2458 }
2459 EXPORT_SYMBOL(sock_rfree);
2460 
2461 /*
2462  * Buffer destructor for skbs that are not used directly in read or write
2463  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2464  */
2465 void sock_efree(struct sk_buff *skb)
2466 {
2467 	sock_put(skb->sk);
2468 }
2469 EXPORT_SYMBOL(sock_efree);
2470 
2471 /* Buffer destructor for prefetch/receive path where reference count may
2472  * not be held, e.g. for listen sockets.
2473  */
2474 #ifdef CONFIG_INET
2475 void sock_pfree(struct sk_buff *skb)
2476 {
2477 	if (sk_is_refcounted(skb->sk))
2478 		sock_gen_put(skb->sk);
2479 }
2480 EXPORT_SYMBOL(sock_pfree);
2481 #endif /* CONFIG_INET */
2482 
2483 kuid_t sock_i_uid(struct sock *sk)
2484 {
2485 	kuid_t uid;
2486 
2487 	read_lock_bh(&sk->sk_callback_lock);
2488 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2489 	read_unlock_bh(&sk->sk_callback_lock);
2490 	return uid;
2491 }
2492 EXPORT_SYMBOL(sock_i_uid);
2493 
2494 unsigned long sock_i_ino(struct sock *sk)
2495 {
2496 	unsigned long ino;
2497 
2498 	read_lock_bh(&sk->sk_callback_lock);
2499 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2500 	read_unlock_bh(&sk->sk_callback_lock);
2501 	return ino;
2502 }
2503 EXPORT_SYMBOL(sock_i_ino);
2504 
2505 /*
2506  * Allocate a skb from the socket's send buffer.
2507  */
2508 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2509 			     gfp_t priority)
2510 {
2511 	if (force ||
2512 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2513 		struct sk_buff *skb = alloc_skb(size, priority);
2514 
2515 		if (skb) {
2516 			skb_set_owner_w(skb, sk);
2517 			return skb;
2518 		}
2519 	}
2520 	return NULL;
2521 }
2522 EXPORT_SYMBOL(sock_wmalloc);
2523 
2524 static void sock_ofree(struct sk_buff *skb)
2525 {
2526 	struct sock *sk = skb->sk;
2527 
2528 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2529 }
2530 
2531 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2532 			     gfp_t priority)
2533 {
2534 	struct sk_buff *skb;
2535 
2536 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2537 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2538 	    sysctl_optmem_max)
2539 		return NULL;
2540 
2541 	skb = alloc_skb(size, priority);
2542 	if (!skb)
2543 		return NULL;
2544 
2545 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2546 	skb->sk = sk;
2547 	skb->destructor = sock_ofree;
2548 	return skb;
2549 }
2550 
2551 /*
2552  * Allocate a memory block from the socket's option memory buffer.
2553  */
2554 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2555 {
2556 	if ((unsigned int)size <= sysctl_optmem_max &&
2557 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2558 		void *mem;
2559 		/* First do the add, to avoid the race if kmalloc
2560 		 * might sleep.
2561 		 */
2562 		atomic_add(size, &sk->sk_omem_alloc);
2563 		mem = kmalloc(size, priority);
2564 		if (mem)
2565 			return mem;
2566 		atomic_sub(size, &sk->sk_omem_alloc);
2567 	}
2568 	return NULL;
2569 }
2570 EXPORT_SYMBOL(sock_kmalloc);
2571 
2572 /* Free an option memory block. Note, we actually want the inline
2573  * here as this allows gcc to detect the nullify and fold away the
2574  * condition entirely.
2575  */
2576 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2577 				  const bool nullify)
2578 {
2579 	if (WARN_ON_ONCE(!mem))
2580 		return;
2581 	if (nullify)
2582 		kfree_sensitive(mem);
2583 	else
2584 		kfree(mem);
2585 	atomic_sub(size, &sk->sk_omem_alloc);
2586 }
2587 
2588 void sock_kfree_s(struct sock *sk, void *mem, int size)
2589 {
2590 	__sock_kfree_s(sk, mem, size, false);
2591 }
2592 EXPORT_SYMBOL(sock_kfree_s);
2593 
2594 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2595 {
2596 	__sock_kfree_s(sk, mem, size, true);
2597 }
2598 EXPORT_SYMBOL(sock_kzfree_s);
2599 
2600 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2601    I think, these locks should be removed for datagram sockets.
2602  */
2603 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2604 {
2605 	DEFINE_WAIT(wait);
2606 
2607 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2608 	for (;;) {
2609 		if (!timeo)
2610 			break;
2611 		if (signal_pending(current))
2612 			break;
2613 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2614 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2615 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2616 			break;
2617 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2618 			break;
2619 		if (sk->sk_err)
2620 			break;
2621 		timeo = schedule_timeout(timeo);
2622 	}
2623 	finish_wait(sk_sleep(sk), &wait);
2624 	return timeo;
2625 }
2626 
2627 
2628 /*
2629  *	Generic send/receive buffer handlers
2630  */
2631 
2632 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2633 				     unsigned long data_len, int noblock,
2634 				     int *errcode, int max_page_order)
2635 {
2636 	struct sk_buff *skb;
2637 	long timeo;
2638 	int err;
2639 
2640 	timeo = sock_sndtimeo(sk, noblock);
2641 	for (;;) {
2642 		err = sock_error(sk);
2643 		if (err != 0)
2644 			goto failure;
2645 
2646 		err = -EPIPE;
2647 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2648 			goto failure;
2649 
2650 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2651 			break;
2652 
2653 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2654 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2655 		err = -EAGAIN;
2656 		if (!timeo)
2657 			goto failure;
2658 		if (signal_pending(current))
2659 			goto interrupted;
2660 		timeo = sock_wait_for_wmem(sk, timeo);
2661 	}
2662 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2663 				   errcode, sk->sk_allocation);
2664 	if (skb)
2665 		skb_set_owner_w(skb, sk);
2666 	return skb;
2667 
2668 interrupted:
2669 	err = sock_intr_errno(timeo);
2670 failure:
2671 	*errcode = err;
2672 	return NULL;
2673 }
2674 EXPORT_SYMBOL(sock_alloc_send_pskb);
2675 
2676 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2677 		     struct sockcm_cookie *sockc)
2678 {
2679 	u32 tsflags;
2680 
2681 	switch (cmsg->cmsg_type) {
2682 	case SO_MARK:
2683 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2684 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2685 			return -EPERM;
2686 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2687 			return -EINVAL;
2688 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2689 		break;
2690 	case SO_TIMESTAMPING_OLD:
2691 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2692 			return -EINVAL;
2693 
2694 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2695 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2696 			return -EINVAL;
2697 
2698 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2699 		sockc->tsflags |= tsflags;
2700 		break;
2701 	case SCM_TXTIME:
2702 		if (!sock_flag(sk, SOCK_TXTIME))
2703 			return -EINVAL;
2704 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2705 			return -EINVAL;
2706 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2707 		break;
2708 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2709 	case SCM_RIGHTS:
2710 	case SCM_CREDENTIALS:
2711 		break;
2712 	default:
2713 		return -EINVAL;
2714 	}
2715 	return 0;
2716 }
2717 EXPORT_SYMBOL(__sock_cmsg_send);
2718 
2719 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2720 		   struct sockcm_cookie *sockc)
2721 {
2722 	struct cmsghdr *cmsg;
2723 	int ret;
2724 
2725 	for_each_cmsghdr(cmsg, msg) {
2726 		if (!CMSG_OK(msg, cmsg))
2727 			return -EINVAL;
2728 		if (cmsg->cmsg_level != SOL_SOCKET)
2729 			continue;
2730 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2731 		if (ret)
2732 			return ret;
2733 	}
2734 	return 0;
2735 }
2736 EXPORT_SYMBOL(sock_cmsg_send);
2737 
2738 static void sk_enter_memory_pressure(struct sock *sk)
2739 {
2740 	if (!sk->sk_prot->enter_memory_pressure)
2741 		return;
2742 
2743 	sk->sk_prot->enter_memory_pressure(sk);
2744 }
2745 
2746 static void sk_leave_memory_pressure(struct sock *sk)
2747 {
2748 	if (sk->sk_prot->leave_memory_pressure) {
2749 		sk->sk_prot->leave_memory_pressure(sk);
2750 	} else {
2751 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2752 
2753 		if (memory_pressure && READ_ONCE(*memory_pressure))
2754 			WRITE_ONCE(*memory_pressure, 0);
2755 	}
2756 }
2757 
2758 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2759 
2760 /**
2761  * skb_page_frag_refill - check that a page_frag contains enough room
2762  * @sz: minimum size of the fragment we want to get
2763  * @pfrag: pointer to page_frag
2764  * @gfp: priority for memory allocation
2765  *
2766  * Note: While this allocator tries to use high order pages, there is
2767  * no guarantee that allocations succeed. Therefore, @sz MUST be
2768  * less or equal than PAGE_SIZE.
2769  */
2770 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2771 {
2772 	if (pfrag->page) {
2773 		if (page_ref_count(pfrag->page) == 1) {
2774 			pfrag->offset = 0;
2775 			return true;
2776 		}
2777 		if (pfrag->offset + sz <= pfrag->size)
2778 			return true;
2779 		put_page(pfrag->page);
2780 	}
2781 
2782 	pfrag->offset = 0;
2783 	if (SKB_FRAG_PAGE_ORDER &&
2784 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2785 		/* Avoid direct reclaim but allow kswapd to wake */
2786 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2787 					  __GFP_COMP | __GFP_NOWARN |
2788 					  __GFP_NORETRY,
2789 					  SKB_FRAG_PAGE_ORDER);
2790 		if (likely(pfrag->page)) {
2791 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2792 			return true;
2793 		}
2794 	}
2795 	pfrag->page = alloc_page(gfp);
2796 	if (likely(pfrag->page)) {
2797 		pfrag->size = PAGE_SIZE;
2798 		return true;
2799 	}
2800 	return false;
2801 }
2802 EXPORT_SYMBOL(skb_page_frag_refill);
2803 
2804 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2805 {
2806 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2807 		return true;
2808 
2809 	sk_enter_memory_pressure(sk);
2810 	sk_stream_moderate_sndbuf(sk);
2811 	return false;
2812 }
2813 EXPORT_SYMBOL(sk_page_frag_refill);
2814 
2815 void __lock_sock(struct sock *sk)
2816 	__releases(&sk->sk_lock.slock)
2817 	__acquires(&sk->sk_lock.slock)
2818 {
2819 	DEFINE_WAIT(wait);
2820 
2821 	for (;;) {
2822 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2823 					TASK_UNINTERRUPTIBLE);
2824 		spin_unlock_bh(&sk->sk_lock.slock);
2825 		schedule();
2826 		spin_lock_bh(&sk->sk_lock.slock);
2827 		if (!sock_owned_by_user(sk))
2828 			break;
2829 	}
2830 	finish_wait(&sk->sk_lock.wq, &wait);
2831 }
2832 
2833 void __release_sock(struct sock *sk)
2834 	__releases(&sk->sk_lock.slock)
2835 	__acquires(&sk->sk_lock.slock)
2836 {
2837 	struct sk_buff *skb, *next;
2838 
2839 	while ((skb = sk->sk_backlog.head) != NULL) {
2840 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2841 
2842 		spin_unlock_bh(&sk->sk_lock.slock);
2843 
2844 		do {
2845 			next = skb->next;
2846 			prefetch(next);
2847 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2848 			skb_mark_not_on_list(skb);
2849 			sk_backlog_rcv(sk, skb);
2850 
2851 			cond_resched();
2852 
2853 			skb = next;
2854 		} while (skb != NULL);
2855 
2856 		spin_lock_bh(&sk->sk_lock.slock);
2857 	}
2858 
2859 	/*
2860 	 * Doing the zeroing here guarantee we can not loop forever
2861 	 * while a wild producer attempts to flood us.
2862 	 */
2863 	sk->sk_backlog.len = 0;
2864 }
2865 
2866 void __sk_flush_backlog(struct sock *sk)
2867 {
2868 	spin_lock_bh(&sk->sk_lock.slock);
2869 	__release_sock(sk);
2870 	spin_unlock_bh(&sk->sk_lock.slock);
2871 }
2872 
2873 /**
2874  * sk_wait_data - wait for data to arrive at sk_receive_queue
2875  * @sk:    sock to wait on
2876  * @timeo: for how long
2877  * @skb:   last skb seen on sk_receive_queue
2878  *
2879  * Now socket state including sk->sk_err is changed only under lock,
2880  * hence we may omit checks after joining wait queue.
2881  * We check receive queue before schedule() only as optimization;
2882  * it is very likely that release_sock() added new data.
2883  */
2884 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2885 {
2886 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2887 	int rc;
2888 
2889 	add_wait_queue(sk_sleep(sk), &wait);
2890 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2891 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2892 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2893 	remove_wait_queue(sk_sleep(sk), &wait);
2894 	return rc;
2895 }
2896 EXPORT_SYMBOL(sk_wait_data);
2897 
2898 /**
2899  *	__sk_mem_raise_allocated - increase memory_allocated
2900  *	@sk: socket
2901  *	@size: memory size to allocate
2902  *	@amt: pages to allocate
2903  *	@kind: allocation type
2904  *
2905  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2906  */
2907 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2908 {
2909 	struct proto *prot = sk->sk_prot;
2910 	long allocated = sk_memory_allocated_add(sk, amt);
2911 	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2912 	bool charged = true;
2913 
2914 	if (memcg_charge &&
2915 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2916 						gfp_memcg_charge())))
2917 		goto suppress_allocation;
2918 
2919 	/* Under limit. */
2920 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2921 		sk_leave_memory_pressure(sk);
2922 		return 1;
2923 	}
2924 
2925 	/* Under pressure. */
2926 	if (allocated > sk_prot_mem_limits(sk, 1))
2927 		sk_enter_memory_pressure(sk);
2928 
2929 	/* Over hard limit. */
2930 	if (allocated > sk_prot_mem_limits(sk, 2))
2931 		goto suppress_allocation;
2932 
2933 	/* guarantee minimum buffer size under pressure */
2934 	if (kind == SK_MEM_RECV) {
2935 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2936 			return 1;
2937 
2938 	} else { /* SK_MEM_SEND */
2939 		int wmem0 = sk_get_wmem0(sk, prot);
2940 
2941 		if (sk->sk_type == SOCK_STREAM) {
2942 			if (sk->sk_wmem_queued < wmem0)
2943 				return 1;
2944 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2945 				return 1;
2946 		}
2947 	}
2948 
2949 	if (sk_has_memory_pressure(sk)) {
2950 		u64 alloc;
2951 
2952 		if (!sk_under_memory_pressure(sk))
2953 			return 1;
2954 		alloc = sk_sockets_allocated_read_positive(sk);
2955 		if (sk_prot_mem_limits(sk, 2) > alloc *
2956 		    sk_mem_pages(sk->sk_wmem_queued +
2957 				 atomic_read(&sk->sk_rmem_alloc) +
2958 				 sk->sk_forward_alloc))
2959 			return 1;
2960 	}
2961 
2962 suppress_allocation:
2963 
2964 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2965 		sk_stream_moderate_sndbuf(sk);
2966 
2967 		/* Fail only if socket is _under_ its sndbuf.
2968 		 * In this case we cannot block, so that we have to fail.
2969 		 */
2970 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2971 			/* Force charge with __GFP_NOFAIL */
2972 			if (memcg_charge && !charged) {
2973 				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2974 					gfp_memcg_charge() | __GFP_NOFAIL);
2975 			}
2976 			return 1;
2977 		}
2978 	}
2979 
2980 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2981 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2982 
2983 	sk_memory_allocated_sub(sk, amt);
2984 
2985 	if (memcg_charge && charged)
2986 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2987 
2988 	return 0;
2989 }
2990 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2991 
2992 /**
2993  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2994  *	@sk: socket
2995  *	@size: memory size to allocate
2996  *	@kind: allocation type
2997  *
2998  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2999  *	rmem allocation. This function assumes that protocols which have
3000  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3001  */
3002 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3003 {
3004 	int ret, amt = sk_mem_pages(size);
3005 
3006 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
3007 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3008 	if (!ret)
3009 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
3010 	return ret;
3011 }
3012 EXPORT_SYMBOL(__sk_mem_schedule);
3013 
3014 /**
3015  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3016  *	@sk: socket
3017  *	@amount: number of quanta
3018  *
3019  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3020  */
3021 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3022 {
3023 	sk_memory_allocated_sub(sk, amount);
3024 
3025 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3026 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3027 
3028 	if (sk_under_memory_pressure(sk) &&
3029 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3030 		sk_leave_memory_pressure(sk);
3031 }
3032 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
3033 
3034 /**
3035  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3036  *	@sk: socket
3037  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
3038  */
3039 void __sk_mem_reclaim(struct sock *sk, int amount)
3040 {
3041 	amount >>= SK_MEM_QUANTUM_SHIFT;
3042 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
3043 	__sk_mem_reduce_allocated(sk, amount);
3044 }
3045 EXPORT_SYMBOL(__sk_mem_reclaim);
3046 
3047 int sk_set_peek_off(struct sock *sk, int val)
3048 {
3049 	sk->sk_peek_off = val;
3050 	return 0;
3051 }
3052 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3053 
3054 /*
3055  * Set of default routines for initialising struct proto_ops when
3056  * the protocol does not support a particular function. In certain
3057  * cases where it makes no sense for a protocol to have a "do nothing"
3058  * function, some default processing is provided.
3059  */
3060 
3061 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3062 {
3063 	return -EOPNOTSUPP;
3064 }
3065 EXPORT_SYMBOL(sock_no_bind);
3066 
3067 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3068 		    int len, int flags)
3069 {
3070 	return -EOPNOTSUPP;
3071 }
3072 EXPORT_SYMBOL(sock_no_connect);
3073 
3074 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3075 {
3076 	return -EOPNOTSUPP;
3077 }
3078 EXPORT_SYMBOL(sock_no_socketpair);
3079 
3080 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3081 		   bool kern)
3082 {
3083 	return -EOPNOTSUPP;
3084 }
3085 EXPORT_SYMBOL(sock_no_accept);
3086 
3087 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3088 		    int peer)
3089 {
3090 	return -EOPNOTSUPP;
3091 }
3092 EXPORT_SYMBOL(sock_no_getname);
3093 
3094 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3095 {
3096 	return -EOPNOTSUPP;
3097 }
3098 EXPORT_SYMBOL(sock_no_ioctl);
3099 
3100 int sock_no_listen(struct socket *sock, int backlog)
3101 {
3102 	return -EOPNOTSUPP;
3103 }
3104 EXPORT_SYMBOL(sock_no_listen);
3105 
3106 int sock_no_shutdown(struct socket *sock, int how)
3107 {
3108 	return -EOPNOTSUPP;
3109 }
3110 EXPORT_SYMBOL(sock_no_shutdown);
3111 
3112 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3113 {
3114 	return -EOPNOTSUPP;
3115 }
3116 EXPORT_SYMBOL(sock_no_sendmsg);
3117 
3118 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3119 {
3120 	return -EOPNOTSUPP;
3121 }
3122 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3123 
3124 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3125 		    int flags)
3126 {
3127 	return -EOPNOTSUPP;
3128 }
3129 EXPORT_SYMBOL(sock_no_recvmsg);
3130 
3131 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3132 {
3133 	/* Mirror missing mmap method error code */
3134 	return -ENODEV;
3135 }
3136 EXPORT_SYMBOL(sock_no_mmap);
3137 
3138 /*
3139  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3140  * various sock-based usage counts.
3141  */
3142 void __receive_sock(struct file *file)
3143 {
3144 	struct socket *sock;
3145 
3146 	sock = sock_from_file(file);
3147 	if (sock) {
3148 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3149 		sock_update_classid(&sock->sk->sk_cgrp_data);
3150 	}
3151 }
3152 
3153 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3154 {
3155 	ssize_t res;
3156 	struct msghdr msg = {.msg_flags = flags};
3157 	struct kvec iov;
3158 	char *kaddr = kmap(page);
3159 	iov.iov_base = kaddr + offset;
3160 	iov.iov_len = size;
3161 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3162 	kunmap(page);
3163 	return res;
3164 }
3165 EXPORT_SYMBOL(sock_no_sendpage);
3166 
3167 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3168 				int offset, size_t size, int flags)
3169 {
3170 	ssize_t res;
3171 	struct msghdr msg = {.msg_flags = flags};
3172 	struct kvec iov;
3173 	char *kaddr = kmap(page);
3174 
3175 	iov.iov_base = kaddr + offset;
3176 	iov.iov_len = size;
3177 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3178 	kunmap(page);
3179 	return res;
3180 }
3181 EXPORT_SYMBOL(sock_no_sendpage_locked);
3182 
3183 /*
3184  *	Default Socket Callbacks
3185  */
3186 
3187 static void sock_def_wakeup(struct sock *sk)
3188 {
3189 	struct socket_wq *wq;
3190 
3191 	rcu_read_lock();
3192 	wq = rcu_dereference(sk->sk_wq);
3193 	if (skwq_has_sleeper(wq))
3194 		wake_up_interruptible_all(&wq->wait);
3195 	rcu_read_unlock();
3196 }
3197 
3198 static void sock_def_error_report(struct sock *sk)
3199 {
3200 	struct socket_wq *wq;
3201 
3202 	rcu_read_lock();
3203 	wq = rcu_dereference(sk->sk_wq);
3204 	if (skwq_has_sleeper(wq))
3205 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3206 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3207 	rcu_read_unlock();
3208 }
3209 
3210 void sock_def_readable(struct sock *sk)
3211 {
3212 	struct socket_wq *wq;
3213 
3214 	rcu_read_lock();
3215 	wq = rcu_dereference(sk->sk_wq);
3216 	if (skwq_has_sleeper(wq))
3217 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3218 						EPOLLRDNORM | EPOLLRDBAND);
3219 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3220 	rcu_read_unlock();
3221 }
3222 
3223 static void sock_def_write_space(struct sock *sk)
3224 {
3225 	struct socket_wq *wq;
3226 
3227 	rcu_read_lock();
3228 
3229 	/* Do not wake up a writer until he can make "significant"
3230 	 * progress.  --DaveM
3231 	 */
3232 	if (sock_writeable(sk)) {
3233 		wq = rcu_dereference(sk->sk_wq);
3234 		if (skwq_has_sleeper(wq))
3235 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3236 						EPOLLWRNORM | EPOLLWRBAND);
3237 
3238 		/* Should agree with poll, otherwise some programs break */
3239 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3240 	}
3241 
3242 	rcu_read_unlock();
3243 }
3244 
3245 /* An optimised version of sock_def_write_space(), should only be called
3246  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3247  * ->sk_wmem_alloc.
3248  */
3249 static void sock_def_write_space_wfree(struct sock *sk)
3250 {
3251 	/* Do not wake up a writer until he can make "significant"
3252 	 * progress.  --DaveM
3253 	 */
3254 	if (sock_writeable(sk)) {
3255 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3256 
3257 		/* rely on refcount_sub from sock_wfree() */
3258 		smp_mb__after_atomic();
3259 		if (wq && waitqueue_active(&wq->wait))
3260 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3261 						EPOLLWRNORM | EPOLLWRBAND);
3262 
3263 		/* Should agree with poll, otherwise some programs break */
3264 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3265 	}
3266 }
3267 
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
3271 
3272 void sk_send_sigurg(struct sock *sk)
3273 {
3274 	if (sk->sk_socket && sk->sk_socket->file)
3275 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3276 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3277 }
3278 EXPORT_SYMBOL(sk_send_sigurg);
3279 
/**
 * sk_reset_timer - (re)arm a socket timer
 * @sk: socket the timer belongs to
 * @timer: timer to modify
 * @expires: new expiry time
 *
 * A reference on @sk is taken when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	int was_pending = mod_timer(timer, expires);

	if (!was_pending)
		sock_hold(sk);
}
3286 EXPORT_SYMBOL(sk_reset_timer);
3287 
/**
 * sk_stop_timer - cancel a socket timer
 * @sk: socket the timer belongs to
 * @timer: timer to delete
 *
 * The reference on @sk is dropped with __sock_put() when del_timer()
 * returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	int was_pending = del_timer(timer);

	if (was_pending)
		__sock_put(sk);
}
3293 EXPORT_SYMBOL(sk_stop_timer);
3294 
/**
 * sk_stop_timer_sync - cancel a socket timer using del_timer_sync()
 * @sk: socket the timer belongs to
 * @timer: timer to delete
 *
 * The reference on @sk is dropped with __sock_put() when
 * del_timer_sync() returns non-zero.
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	int was_pending = del_timer_sync(timer);

	if (was_pending)
		__sock_put(sk);
}
3300 EXPORT_SYMBOL(sk_stop_timer_sync);
3301 
3302 void sock_init_data(struct socket *sock, struct sock *sk)
3303 {
3304 	sk_init_common(sk);
3305 	sk->sk_send_head	=	NULL;
3306 
3307 	timer_setup(&sk->sk_timer, NULL, 0);
3308 
3309 	sk->sk_allocation	=	GFP_KERNEL;
3310 	sk->sk_rcvbuf		=	sysctl_rmem_default;
3311 	sk->sk_sndbuf		=	sysctl_wmem_default;
3312 	sk->sk_state		=	TCP_CLOSE;
3313 	sk_set_socket(sk, sock);
3314 
3315 	sock_set_flag(sk, SOCK_ZAPPED);
3316 
3317 	if (sock) {
3318 		sk->sk_type	=	sock->type;
3319 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3320 		sock->sk	=	sk;
3321 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
3322 	} else {
3323 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3324 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
3325 	}
3326 
3327 	rwlock_init(&sk->sk_callback_lock);
3328 	if (sk->sk_kern_sock)
3329 		lockdep_set_class_and_name(
3330 			&sk->sk_callback_lock,
3331 			af_kern_callback_keys + sk->sk_family,
3332 			af_family_kern_clock_key_strings[sk->sk_family]);
3333 	else
3334 		lockdep_set_class_and_name(
3335 			&sk->sk_callback_lock,
3336 			af_callback_keys + sk->sk_family,
3337 			af_family_clock_key_strings[sk->sk_family]);
3338 
3339 	sk->sk_state_change	=	sock_def_wakeup;
3340 	sk->sk_data_ready	=	sock_def_readable;
3341 	sk->sk_write_space	=	sock_def_write_space;
3342 	sk->sk_error_report	=	sock_def_error_report;
3343 	sk->sk_destruct		=	sock_def_destruct;
3344 
3345 	sk->sk_frag.page	=	NULL;
3346 	sk->sk_frag.offset	=	0;
3347 	sk->sk_peek_off		=	-1;
3348 
3349 	sk->sk_peer_pid 	=	NULL;
3350 	sk->sk_peer_cred	=	NULL;
3351 	spin_lock_init(&sk->sk_peer_lock);
3352 
3353 	sk->sk_write_pending	=	0;
3354 	sk->sk_rcvlowat		=	1;
3355 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3356 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3357 
3358 	sk->sk_stamp = SK_DEFAULT_STAMP;
3359 #if BITS_PER_LONG==32
3360 	seqlock_init(&sk->sk_stamp_seq);
3361 #endif
3362 	atomic_set(&sk->sk_zckey, 0);
3363 
3364 #ifdef CONFIG_NET_RX_BUSY_POLL
3365 	sk->sk_napi_id		=	0;
3366 	sk->sk_ll_usec		=	sysctl_net_busy_read;
3367 #endif
3368 
3369 	sk->sk_max_pacing_rate = ~0UL;
3370 	sk->sk_pacing_rate = ~0UL;
3371 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3372 	sk->sk_incoming_cpu = -1;
3373 	sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
3374 
3375 	sk_rx_queue_clear(sk);
3376 	/*
3377 	 * Before updating sk_refcnt, we must commit prior changes to memory
3378 	 * (Documentation/RCU/rculist_nulls.rst for details)
3379 	 */
3380 	smp_wmb();
3381 	refcount_set(&sk->sk_refcnt, 1);
3382 	atomic_set(&sk->sk_drops, 0);
3383 }
3384 EXPORT_SYMBOL(sock_init_data);
3385 
3386 void lock_sock_nested(struct sock *sk, int subclass)
3387 {
3388 	/* The sk_lock has mutex_lock() semantics here. */
3389 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3390 
3391 	might_sleep();
3392 	spin_lock_bh(&sk->sk_lock.slock);
3393 	if (sock_owned_by_user_nocheck(sk))
3394 		__lock_sock(sk);
3395 	sk->sk_lock.owned = 1;
3396 	spin_unlock_bh(&sk->sk_lock.slock);
3397 }
3398 EXPORT_SYMBOL(lock_sock_nested);
3399 
3400 void release_sock(struct sock *sk)
3401 {
3402 	spin_lock_bh(&sk->sk_lock.slock);
3403 	if (sk->sk_backlog.tail)
3404 		__release_sock(sk);
3405 
3406 	/* Warning : release_cb() might need to release sk ownership,
3407 	 * ie call sock_release_ownership(sk) before us.
3408 	 */
3409 	if (sk->sk_prot->release_cb)
3410 		sk->sk_prot->release_cb(sk);
3411 
3412 	sock_release_ownership(sk);
3413 	if (waitqueue_active(&sk->sk_lock.wq))
3414 		wake_up(&sk->sk_lock.wq);
3415 	spin_unlock_bh(&sk->sk_lock.slock);
3416 }
3417 EXPORT_SYMBOL(release_sock);
3418 
3419 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3420 {
3421 	might_sleep();
3422 	spin_lock_bh(&sk->sk_lock.slock);
3423 
3424 	if (!sock_owned_by_user_nocheck(sk)) {
3425 		/*
3426 		 * Fast path return with bottom halves disabled and
3427 		 * sock::sk_lock.slock held.
3428 		 *
3429 		 * The 'mutex' is not contended and holding
3430 		 * sock::sk_lock.slock prevents all other lockers to
3431 		 * proceed so the corresponding unlock_sock_fast() can
3432 		 * avoid the slow path of release_sock() completely and
3433 		 * just release slock.
3434 		 *
3435 		 * From a semantical POV this is equivalent to 'acquiring'
3436 		 * the 'mutex', hence the corresponding lockdep
3437 		 * mutex_release() has to happen in the fast path of
3438 		 * unlock_sock_fast().
3439 		 */
3440 		return false;
3441 	}
3442 
3443 	__lock_sock(sk);
3444 	sk->sk_lock.owned = 1;
3445 	__acquire(&sk->sk_lock.slock);
3446 	spin_unlock_bh(&sk->sk_lock.slock);
3447 	return true;
3448 }
3449 EXPORT_SYMBOL(__lock_sock_fast);
3450 
3451 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3452 		   bool timeval, bool time32)
3453 {
3454 	struct sock *sk = sock->sk;
3455 	struct timespec64 ts;
3456 
3457 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3458 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3459 	if (ts.tv_sec == -1)
3460 		return -ENOENT;
3461 	if (ts.tv_sec == 0) {
3462 		ktime_t kt = ktime_get_real();
3463 		sock_write_timestamp(sk, kt);
3464 		ts = ktime_to_timespec64(kt);
3465 	}
3466 
3467 	if (timeval)
3468 		ts.tv_nsec /= 1000;
3469 
3470 #ifdef CONFIG_COMPAT_32BIT_TIME
3471 	if (time32)
3472 		return put_old_timespec32(&ts, userstamp);
3473 #endif
3474 #ifdef CONFIG_SPARC64
3475 	/* beware of padding in sparc64 timeval */
3476 	if (timeval && !in_compat_syscall()) {
3477 		struct __kernel_old_timeval __user tv = {
3478 			.tv_sec = ts.tv_sec,
3479 			.tv_usec = ts.tv_nsec,
3480 		};
3481 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3482 			return -EFAULT;
3483 		return 0;
3484 	}
3485 #endif
3486 	return put_timespec64(&ts, userstamp);
3487 }
3488 EXPORT_SYMBOL(sock_gettstamp);
3489 
3490 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3491 {
3492 	if (!sock_flag(sk, flag)) {
3493 		unsigned long previous_flags = sk->sk_flags;
3494 
3495 		sock_set_flag(sk, flag);
3496 		/*
3497 		 * we just set one of the two flags which require net
3498 		 * time stamping, but time stamping might have been on
3499 		 * already because of the other one
3500 		 */
3501 		if (sock_needs_netstamp(sk) &&
3502 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3503 			net_enable_timestamp();
3504 	}
3505 }
3506 
3507 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3508 		       int level, int type)
3509 {
3510 	struct sock_exterr_skb *serr;
3511 	struct sk_buff *skb;
3512 	int copied, err;
3513 
3514 	err = -EAGAIN;
3515 	skb = sock_dequeue_err_skb(sk);
3516 	if (skb == NULL)
3517 		goto out;
3518 
3519 	copied = skb->len;
3520 	if (copied > len) {
3521 		msg->msg_flags |= MSG_TRUNC;
3522 		copied = len;
3523 	}
3524 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3525 	if (err)
3526 		goto out_free_skb;
3527 
3528 	sock_recv_timestamp(msg, sk, skb);
3529 
3530 	serr = SKB_EXT_ERR(skb);
3531 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3532 
3533 	msg->msg_flags |= MSG_ERRQUEUE;
3534 	err = copied;
3535 
3536 out_free_skb:
3537 	kfree_skb(skb);
3538 out:
3539 	return err;
3540 }
3541 EXPORT_SYMBOL(sock_recv_errqueue);
3542 
3543 /*
3544  *	Get a socket option on an socket.
3545  *
3546  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3547  *	asynchronous errors should be reported by getsockopt. We assume
3548  *	this means if you specify SO_ERROR (otherwise whats the point of it).
3549  */
3550 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3551 			   char __user *optval, int __user *optlen)
3552 {
3553 	struct sock *sk = sock->sk;
3554 
3555 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3556 }
3557 EXPORT_SYMBOL(sock_common_getsockopt);
3558 
3559 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3560 			int flags)
3561 {
3562 	struct sock *sk = sock->sk;
3563 	int addr_len = 0;
3564 	int err;
3565 
3566 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3567 	if (err >= 0)
3568 		msg->msg_namelen = addr_len;
3569 	return err;
3570 }
3571 EXPORT_SYMBOL(sock_common_recvmsg);
3572 
3573 /*
3574  *	Set socket options on an inet socket.
3575  */
3576 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3577 			   sockptr_t optval, unsigned int optlen)
3578 {
3579 	struct sock *sk = sock->sk;
3580 
3581 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3582 }
3583 EXPORT_SYMBOL(sock_common_setsockopt);
3584 
3585 void sk_common_release(struct sock *sk)
3586 {
3587 	if (sk->sk_prot->destroy)
3588 		sk->sk_prot->destroy(sk);
3589 
3590 	/*
3591 	 * Observation: when sk_common_release is called, processes have
3592 	 * no access to socket. But net still has.
3593 	 * Step one, detach it from networking:
3594 	 *
3595 	 * A. Remove from hash tables.
3596 	 */
3597 
3598 	sk->sk_prot->unhash(sk);
3599 
3600 	/*
3601 	 * In this point socket cannot receive new packets, but it is possible
3602 	 * that some packets are in flight because some CPU runs receiver and
3603 	 * did hash table lookup before we unhashed socket. They will achieve
3604 	 * receive queue and will be purged by socket destructor.
3605 	 *
3606 	 * Also we still have packets pending on receive queue and probably,
3607 	 * our own packets waiting in device queues. sock_destroy will drain
3608 	 * receive queue, but transmitted packets will delay socket destruction
3609 	 * until the last reference will be released.
3610 	 */
3611 
3612 	sock_orphan(sk);
3613 
3614 	xfrm_sk_free_policy(sk);
3615 
3616 	sk_refcnt_debug_release(sk);
3617 
3618 	sock_put(sk);
3619 }
3620 EXPORT_SYMBOL(sk_common_release);
3621 
3622 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3623 {
3624 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3625 
3626 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3627 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3628 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3629 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3630 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3631 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3632 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3633 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3634 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3635 }
3636 
3637 #ifdef CONFIG_PROC_FS
/*
 * One bit per proto->inuse_idx slot that has been handed out; managed by
 * assign_proto_idx() and release_proto_idx().
 */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3639 
3640 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3641 {
3642 	int cpu, idx = prot->inuse_idx;
3643 	int res = 0;
3644 
3645 	for_each_possible_cpu(cpu)
3646 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3647 
3648 	return res >= 0 ? res : 0;
3649 }
3650 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3651 
3652 int sock_inuse_get(struct net *net)
3653 {
3654 	int cpu, res = 0;
3655 
3656 	for_each_possible_cpu(cpu)
3657 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3658 
3659 	return res;
3660 }
3661 
3662 EXPORT_SYMBOL_GPL(sock_inuse_get);
3663 
3664 static int __net_init sock_inuse_init_net(struct net *net)
3665 {
3666 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3667 	if (net->core.prot_inuse == NULL)
3668 		return -ENOMEM;
3669 	return 0;
3670 }
3671 
3672 static void __net_exit sock_inuse_exit_net(struct net *net)
3673 {
3674 	free_percpu(net->core.prot_inuse);
3675 }
3676 
3677 static struct pernet_operations net_inuse_ops = {
3678 	.init = sock_inuse_init_net,
3679 	.exit = sock_inuse_exit_net,
3680 };
3681 
3682 static __init int net_inuse_init(void)
3683 {
3684 	if (register_pernet_subsys(&net_inuse_ops))
3685 		panic("Cannot initialize net inuse counters");
3686 
3687 	return 0;
3688 }
3689 
3690 core_initcall(net_inuse_init);
3691 
3692 static int assign_proto_idx(struct proto *prot)
3693 {
3694 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3695 
3696 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3697 		pr_err("PROTO_INUSE_NR exhausted\n");
3698 		return -ENOSPC;
3699 	}
3700 
3701 	set_bit(prot->inuse_idx, proto_inuse_idx);
3702 	return 0;
3703 }
3704 
3705 static void release_proto_idx(struct proto *prot)
3706 {
3707 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3708 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3709 }
3710 #else
/* Without CONFIG_PROC_FS there is no index to assign; always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}
3715 
/* Without CONFIG_PROC_FS there is no index to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3719 
3720 #endif
3721 
3722 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3723 {
3724 	if (!twsk_prot)
3725 		return;
3726 	kfree(twsk_prot->twsk_slab_name);
3727 	twsk_prot->twsk_slab_name = NULL;
3728 	kmem_cache_destroy(twsk_prot->twsk_slab);
3729 	twsk_prot->twsk_slab = NULL;
3730 }
3731 
3732 static int tw_prot_init(const struct proto *prot)
3733 {
3734 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3735 
3736 	if (!twsk_prot)
3737 		return 0;
3738 
3739 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3740 					      prot->name);
3741 	if (!twsk_prot->twsk_slab_name)
3742 		return -ENOMEM;
3743 
3744 	twsk_prot->twsk_slab =
3745 		kmem_cache_create(twsk_prot->twsk_slab_name,
3746 				  twsk_prot->twsk_obj_size, 0,
3747 				  SLAB_ACCOUNT | prot->slab_flags,
3748 				  NULL);
3749 	if (!twsk_prot->twsk_slab) {
3750 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3751 			prot->name);
3752 		return -ENOMEM;
3753 	}
3754 
3755 	return 0;
3756 }
3757 
3758 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3759 {
3760 	if (!rsk_prot)
3761 		return;
3762 	kfree(rsk_prot->slab_name);
3763 	rsk_prot->slab_name = NULL;
3764 	kmem_cache_destroy(rsk_prot->slab);
3765 	rsk_prot->slab = NULL;
3766 }
3767 
3768 static int req_prot_init(const struct proto *prot)
3769 {
3770 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3771 
3772 	if (!rsk_prot)
3773 		return 0;
3774 
3775 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3776 					prot->name);
3777 	if (!rsk_prot->slab_name)
3778 		return -ENOMEM;
3779 
3780 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3781 					   rsk_prot->obj_size, 0,
3782 					   SLAB_ACCOUNT | prot->slab_flags,
3783 					   NULL);
3784 
3785 	if (!rsk_prot->slab) {
3786 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3787 			prot->name);
3788 		return -ENOMEM;
3789 	}
3790 	return 0;
3791 }
3792 
3793 int proto_register(struct proto *prot, int alloc_slab)
3794 {
3795 	int ret = -ENOBUFS;
3796 
3797 	if (prot->memory_allocated && !prot->sysctl_mem) {
3798 		pr_err("%s: missing sysctl_mem\n", prot->name);
3799 		return -EINVAL;
3800 	}
3801 	if (alloc_slab) {
3802 		prot->slab = kmem_cache_create_usercopy(prot->name,
3803 					prot->obj_size, 0,
3804 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3805 					prot->slab_flags,
3806 					prot->useroffset, prot->usersize,
3807 					NULL);
3808 
3809 		if (prot->slab == NULL) {
3810 			pr_crit("%s: Can't create sock SLAB cache!\n",
3811 				prot->name);
3812 			goto out;
3813 		}
3814 
3815 		if (req_prot_init(prot))
3816 			goto out_free_request_sock_slab;
3817 
3818 		if (tw_prot_init(prot))
3819 			goto out_free_timewait_sock_slab;
3820 	}
3821 
3822 	mutex_lock(&proto_list_mutex);
3823 	ret = assign_proto_idx(prot);
3824 	if (ret) {
3825 		mutex_unlock(&proto_list_mutex);
3826 		goto out_free_timewait_sock_slab;
3827 	}
3828 	list_add(&prot->node, &proto_list);
3829 	mutex_unlock(&proto_list_mutex);
3830 	return ret;
3831 
3832 out_free_timewait_sock_slab:
3833 	if (alloc_slab)
3834 		tw_prot_cleanup(prot->twsk_prot);
3835 out_free_request_sock_slab:
3836 	if (alloc_slab) {
3837 		req_prot_cleanup(prot->rsk_prot);
3838 
3839 		kmem_cache_destroy(prot->slab);
3840 		prot->slab = NULL;
3841 	}
3842 out:
3843 	return ret;
3844 }
3845 EXPORT_SYMBOL(proto_register);
3846 
3847 void proto_unregister(struct proto *prot)
3848 {
3849 	mutex_lock(&proto_list_mutex);
3850 	release_proto_idx(prot);
3851 	list_del(&prot->node);
3852 	mutex_unlock(&proto_list_mutex);
3853 
3854 	kmem_cache_destroy(prot->slab);
3855 	prot->slab = NULL;
3856 
3857 	req_prot_cleanup(prot->rsk_prot);
3858 	tw_prot_cleanup(prot->twsk_prot);
3859 }
3860 EXPORT_SYMBOL(proto_unregister);
3861 
3862 int sock_load_diag_module(int family, int protocol)
3863 {
3864 	if (!protocol) {
3865 		if (!sock_is_registered(family))
3866 			return -ENOENT;
3867 
3868 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3869 				      NETLINK_SOCK_DIAG, family);
3870 	}
3871 
3872 #ifdef CONFIG_INET
3873 	if (family == AF_INET &&
3874 	    protocol != IPPROTO_RAW &&
3875 	    protocol < MAX_INET_PROTOS &&
3876 	    !rcu_access_pointer(inet_protos[protocol]))
3877 		return -ENOENT;
3878 #endif
3879 
3880 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3881 			      NETLINK_SOCK_DIAG, family, protocol);
3882 }
3883 EXPORT_SYMBOL(sock_load_diag_module);
3884 
3885 #ifdef CONFIG_PROC_FS
3886 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3887 	__acquires(proto_list_mutex)
3888 {
3889 	mutex_lock(&proto_list_mutex);
3890 	return seq_list_start_head(&proto_list, *pos);
3891 }
3892 
3893 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3894 {
3895 	return seq_list_next(v, &proto_list, pos);
3896 }
3897 
3898 static void proto_seq_stop(struct seq_file *seq, void *v)
3899 	__releases(proto_list_mutex)
3900 {
3901 	mutex_unlock(&proto_list_mutex);
3902 }
3903 
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
3908 static long sock_prot_memory_allocated(struct proto *proto)
3909 {
3910 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3911 }
3912 
3913 static const char *sock_prot_memory_pressure(struct proto *proto)
3914 {
3915 	return proto->memory_pressure != NULL ?
3916 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3917 }
3918 
3919 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3920 {
3921 
3922 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3923 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3924 		   proto->name,
3925 		   proto->obj_size,
3926 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3927 		   sock_prot_memory_allocated(proto),
3928 		   sock_prot_memory_pressure(proto),
3929 		   proto->max_header,
3930 		   proto->slab == NULL ? "no" : "yes",
3931 		   module_name(proto->owner),
3932 		   proto_method_implemented(proto->close),
3933 		   proto_method_implemented(proto->connect),
3934 		   proto_method_implemented(proto->disconnect),
3935 		   proto_method_implemented(proto->accept),
3936 		   proto_method_implemented(proto->ioctl),
3937 		   proto_method_implemented(proto->init),
3938 		   proto_method_implemented(proto->destroy),
3939 		   proto_method_implemented(proto->shutdown),
3940 		   proto_method_implemented(proto->setsockopt),
3941 		   proto_method_implemented(proto->getsockopt),
3942 		   proto_method_implemented(proto->sendmsg),
3943 		   proto_method_implemented(proto->recvmsg),
3944 		   proto_method_implemented(proto->sendpage),
3945 		   proto_method_implemented(proto->bind),
3946 		   proto_method_implemented(proto->backlog_rcv),
3947 		   proto_method_implemented(proto->hash),
3948 		   proto_method_implemented(proto->unhash),
3949 		   proto_method_implemented(proto->get_port),
3950 		   proto_method_implemented(proto->enter_memory_pressure));
3951 }
3952 
3953 static int proto_seq_show(struct seq_file *seq, void *v)
3954 {
3955 	if (v == &proto_list)
3956 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3957 			   "protocol",
3958 			   "size",
3959 			   "sockets",
3960 			   "memory",
3961 			   "press",
3962 			   "maxhdr",
3963 			   "slab",
3964 			   "module",
3965 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3966 	else
3967 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3968 	return 0;
3969 }
3970 
3971 static const struct seq_operations proto_seq_ops = {
3972 	.start  = proto_seq_start,
3973 	.next   = proto_seq_next,
3974 	.stop   = proto_seq_stop,
3975 	.show   = proto_seq_show,
3976 };
3977 
3978 static __net_init int proto_init_net(struct net *net)
3979 {
3980 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3981 			sizeof(struct seq_net_private)))
3982 		return -ENOMEM;
3983 
3984 	return 0;
3985 }
3986 
3987 static __net_exit void proto_exit_net(struct net *net)
3988 {
3989 	remove_proc_entry("protocols", net->proc_net);
3990 }
3991 
3992 
3993 static __net_initdata struct pernet_operations proto_net_ops = {
3994 	.init = proto_init_net,
3995 	.exit = proto_exit_net,
3996 };
3997 
3998 static int __init proto_init(void)
3999 {
4000 	return register_pernet_subsys(&proto_net_ops);
4001 }
4002 
4003 subsys_initcall(proto_init);
4004 
4005 #endif /* PROC_FS */
4006 
4007 #ifdef CONFIG_NET_RX_BUSY_POLL
4008 bool sk_busy_loop_end(void *p, unsigned long start_time)
4009 {
4010 	struct sock *sk = p;
4011 
4012 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4013 	       sk_busy_loop_timeout(sk, start_time);
4014 }
4015 EXPORT_SYMBOL(sk_busy_loop_end);
4016 #endif /* CONFIG_NET_RX_BUSY_POLL */
4017 
4018 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4019 {
4020 	if (!sk->sk_prot->bind_add)
4021 		return -EOPNOTSUPP;
4022 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4023 }
4024 EXPORT_SYMBOL(sock_bind_add);
4025