xref: /linux/net/core/sock.c (revision c99ebb6132595b4b288a413981197eb076547c5a)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138 
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142 
143 #include <trace/events/sock.h>
144 
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148 
149 #include <linux/ethtool.h>
150 
151 #include <uapi/linux/pidfd.h>
152 
153 #include "dev.h"
154 
155 static DEFINE_MUTEX(proto_list_mutex);
156 static LIST_HEAD(proto_list);
157 
158 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc);
159 static void sock_def_write_space(struct sock *sk);
160 
161 /**
162  * sk_ns_capable - General socket capability test
163  * @sk: Socket to use a capability on or through
164  * @user_ns: The user namespace of the capability to use
165  * @cap: The capability to use
166  *
167  * Test to see if the opener of the socket had the capability @cap in the
168  * user namespace @user_ns when the socket was created, and that the
169  * current process has it as well.
170  */
171 bool sk_ns_capable(const struct sock *sk,
172 		   struct user_namespace *user_ns, int cap)
173 {
174 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
175 		ns_capable(user_ns, cap);
176 }
177 EXPORT_SYMBOL(sk_ns_capable);
178 
179 /**
180  * sk_capable - Socket global capability test
181  * @sk: Socket to use a capability on or through
182  * @cap: The global capability to use
183  *
184  * Test to see if the opener of the socket had the capability @cap in all
185  * user namespaces when the socket was created, and that the current
186  * process has it as well.
187  */
188 bool sk_capable(const struct sock *sk, int cap)
189 {
190 	return sk_ns_capable(sk, &init_user_ns, cap);
191 }
192 EXPORT_SYMBOL(sk_capable);
193 
194 /**
195  * sk_net_capable - Network namespace socket capability test
196  * @sk: Socket to use a capability on or through
197  * @cap: The capability to use
198  *
199  * Test to see if the opener of the socket had the capability @cap over the
200  * network namespace the socket is a member of when the socket was created,
201  * and that the current process has it as well.
202  */
203 bool sk_net_capable(const struct sock *sk, int cap)
204 {
205 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
206 }
207 EXPORT_SYMBOL(sk_net_capable);
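/* Illustrative usage (not part of the original file): a protocol handler that
 * gates a privileged socket option would typically combine one of the helpers
 * above with an early return, e.g.
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * The check passes only if both the socket's opener (at open time) and the
 * current task hold CAP_NET_ADMIN in the socket's network namespace.
 */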
208 
209 /*
210  * Each address family might have different locking rules, so we have
211  * one slock key per address family and separate keys for internal and
212  * userspace sockets.
213  */
214 static struct lock_class_key af_family_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_keys[AF_MAX];
216 static struct lock_class_key af_family_slock_keys[AF_MAX];
217 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
218 
219 /*
220  * Make lock validator output more readable. (We pre-construct these
221  * strings at build time, so that runtime initialization of socket
222  * locks is fast):
223  */
224 
225 #define _sock_locks(x)						  \
226   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
227   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
228   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
229   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
230   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
231   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
232   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
233   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
234   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
235   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
236   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
237   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
238   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
239   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
240   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
241   x "AF_MCTP"  , \
242   x "AF_MAX"
243 
244 static const char *const af_family_key_strings[AF_MAX+1] = {
245 	_sock_locks("sk_lock-")
246 };
247 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
248 	_sock_locks("slock-")
249 };
250 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
251 	_sock_locks("clock-")
252 };
253 
254 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
255 	_sock_locks("k-sk_lock-")
256 };
257 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
258 	_sock_locks("k-slock-")
259 };
260 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
261 	_sock_locks("k-clock-")
262 };
263 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
264 	_sock_locks("rlock-")
265 };
266 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
267 	_sock_locks("wlock-")
268 };
269 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
270 	_sock_locks("elock-")
271 };
272 
273 /*
274  * sk_callback_lock and sk queues locking rules are per-address-family,
275  * so split the lock classes by using a per-AF key:
276  */
277 static struct lock_class_key af_callback_keys[AF_MAX];
278 static struct lock_class_key af_rlock_keys[AF_MAX];
279 static struct lock_class_key af_wlock_keys[AF_MAX];
280 static struct lock_class_key af_elock_keys[AF_MAX];
281 static struct lock_class_key af_kern_callback_keys[AF_MAX];
282 
283 /* Run time adjustable parameters. */
284 __u32 sysctl_wmem_max __read_mostly = 4 << 20;
285 EXPORT_SYMBOL(sysctl_wmem_max);
286 __u32 sysctl_rmem_max __read_mostly = 4 << 20;
287 EXPORT_SYMBOL(sysctl_rmem_max);
288 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT;
289 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT;
290 
291 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
292 EXPORT_SYMBOL_GPL(memalloc_socks_key);
293 
294 /**
295  * sk_set_memalloc - sets %SOCK_MEMALLOC
296  * @sk: socket to set it on
297  *
298  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
299  * It's the responsibility of the admin to adjust min_free_kbytes
300  * to meet the requirements.
301  */
302 void sk_set_memalloc(struct sock *sk)
303 {
304 	sock_set_flag(sk, SOCK_MEMALLOC);
305 	sk->sk_allocation |= __GFP_MEMALLOC;
306 	static_branch_inc(&memalloc_socks_key);
307 }
308 EXPORT_SYMBOL_GPL(sk_set_memalloc);
309 
310 void sk_clear_memalloc(struct sock *sk)
311 {
312 	sock_reset_flag(sk, SOCK_MEMALLOC);
313 	sk->sk_allocation &= ~__GFP_MEMALLOC;
314 	static_branch_dec(&memalloc_socks_key);
315 
316 	/*
317 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
318 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
319 	 * it has rmem allocations due to the last swapfile being deactivated
320 	 * but there is a risk that the socket is unusable due to exceeding
321 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
322 	 */
323 	sk_mem_reclaim(sk);
324 }
325 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
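/* Sketch of the intended pairing (an assumption based on the comments above,
 * e.g. a swap-over-network transport): the transport marks its socket when it
 * starts backing swap and clears the mark when the last such swapfile goes
 * away:
 *
 *	sk_set_memalloc(sk);	// may now dip into emergency reserves
 *	...
 *	sk_clear_memalloc(sk);	// reclaim reserves, obey rmem limits again
 */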
326 
327 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
328 {
329 	int ret;
330 	unsigned int noreclaim_flag;
331 
332 	/* these should have been dropped before queueing */
333 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
334 
335 	noreclaim_flag = memalloc_noreclaim_save();
336 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
337 				 tcp_v6_do_rcv,
338 				 tcp_v4_do_rcv,
339 				 sk, skb);
340 	memalloc_noreclaim_restore(noreclaim_flag);
341 
342 	return ret;
343 }
344 EXPORT_SYMBOL(__sk_backlog_rcv);
345 
346 void sk_error_report(struct sock *sk)
347 {
348 	sk->sk_error_report(sk);
349 
350 	switch (sk->sk_family) {
351 	case AF_INET:
352 		fallthrough;
353 	case AF_INET6:
354 		trace_inet_sk_error_report(sk);
355 		break;
356 	default:
357 		break;
358 	}
359 }
360 EXPORT_SYMBOL(sk_error_report);
361 
362 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
363 {
364 	struct __kernel_sock_timeval tv;
365 
366 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
367 		tv.tv_sec = 0;
368 		tv.tv_usec = 0;
369 	} else {
370 		tv.tv_sec = timeo / HZ;
371 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
372 	}
373 
374 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
375 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
376 		*(struct old_timeval32 *)optval = tv32;
377 		return sizeof(tv32);
378 	}
379 
380 	if (old_timeval) {
381 		struct __kernel_old_timeval old_tv;
382 		old_tv.tv_sec = tv.tv_sec;
383 		old_tv.tv_usec = tv.tv_usec;
384 		*(struct __kernel_old_timeval *)optval = old_tv;
385 		return sizeof(old_tv);
386 	}
387 
388 	*(struct __kernel_sock_timeval *)optval = tv;
389 	return sizeof(tv);
390 }
391 EXPORT_SYMBOL(sock_get_timeout);
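/* Worked example (illustrative): with HZ == 1000, a timeout of 2500 jiffies is
 * reported back as tv_sec = 2500 / 1000 = 2 and
 * tv_usec = (2500 % 1000) * USEC_PER_SEC / 1000 = 500000, i.e. 2.5 seconds.
 * MAX_SCHEDULE_TIMEOUT is reported as {0, 0}, meaning "no timeout".
 */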
392 
393 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
394 			   sockptr_t optval, int optlen, bool old_timeval)
395 {
396 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
397 		struct old_timeval32 tv32;
398 
399 		if (optlen < sizeof(tv32))
400 			return -EINVAL;
401 
402 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
403 			return -EFAULT;
404 		tv->tv_sec = tv32.tv_sec;
405 		tv->tv_usec = tv32.tv_usec;
406 	} else if (old_timeval) {
407 		struct __kernel_old_timeval old_tv;
408 
409 		if (optlen < sizeof(old_tv))
410 			return -EINVAL;
411 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
412 			return -EFAULT;
413 		tv->tv_sec = old_tv.tv_sec;
414 		tv->tv_usec = old_tv.tv_usec;
415 	} else {
416 		if (optlen < sizeof(*tv))
417 			return -EINVAL;
418 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
419 			return -EFAULT;
420 	}
421 
422 	return 0;
423 }
424 EXPORT_SYMBOL(sock_copy_user_timeval);
425 
426 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
427 			    bool old_timeval)
428 {
429 	struct __kernel_sock_timeval tv;
430 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
431 	long val;
432 
433 	if (err)
434 		return err;
435 
436 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
437 		return -EDOM;
438 
439 	if (tv.tv_sec < 0) {
440 		static int warned __read_mostly;
441 
442 		WRITE_ONCE(*timeo_p, 0);
443 		if (warned < 10 && net_ratelimit()) {
444 			warned++;
445 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
446 				__func__, current->comm, task_pid_nr(current));
447 		}
448 		return 0;
449 	}
450 	val = MAX_SCHEDULE_TIMEOUT;
451 	if ((tv.tv_sec || tv.tv_usec) &&
452 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
453 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
454 						    USEC_PER_SEC / HZ);
455 	WRITE_ONCE(*timeo_p, val);
456 	return 0;
457 }
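/* Worked example (illustrative): setting SO_RCVTIMEO_NEW to
 * { .tv_sec = 2, .tv_usec = 500000 } on a HZ == 1000 kernel stores
 * 2 * 1000 + DIV_ROUND_UP(500000, 1000000 / 1000) = 2500 jiffies in *timeo_p,
 * while an all-zero timeval maps to MAX_SCHEDULE_TIMEOUT ("wait forever").
 */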
458 
459 static bool sk_set_prio_allowed(const struct sock *sk, int val)
460 {
461 	return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
462 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
463 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
464 }
465 
466 static bool sock_needs_netstamp(const struct sock *sk)
467 {
468 	switch (sk->sk_family) {
469 	case AF_UNSPEC:
470 	case AF_UNIX:
471 		return false;
472 	default:
473 		return true;
474 	}
475 }
476 
477 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
478 {
479 	if (sk->sk_flags & flags) {
480 		sk->sk_flags &= ~flags;
481 		if (sock_needs_netstamp(sk) &&
482 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
483 			net_disable_timestamp();
484 	}
485 }
486 
487 
488 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
489 {
490 	unsigned long flags;
491 	struct sk_buff_head *list = &sk->sk_receive_queue;
492 
493 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
494 		sk_drops_inc(sk);
495 		trace_sock_rcvqueue_full(sk, skb);
496 		return -ENOMEM;
497 	}
498 
499 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
500 		sk_drops_inc(sk);
501 		return -ENOBUFS;
502 	}
503 
504 	skb->dev = NULL;
505 	skb_set_owner_r(skb, sk);
506 
507 	/* We escape from the RCU protected region, so make sure we don't leak
508 	 * a non-refcounted dst.
509 	 */
510 	skb_dst_force(skb);
511 
512 	spin_lock_irqsave(&list->lock, flags);
513 	sock_skb_set_dropcount(sk, skb);
514 	__skb_queue_tail(list, skb);
515 	spin_unlock_irqrestore(&list->lock, flags);
516 
517 	if (!sock_flag(sk, SOCK_DEAD))
518 		sk->sk_data_ready(sk);
519 	return 0;
520 }
521 EXPORT_SYMBOL(__sock_queue_rcv_skb);
522 
523 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
524 			      enum skb_drop_reason *reason)
525 {
526 	enum skb_drop_reason drop_reason;
527 	int err;
528 
529 	err = sk_filter_reason(sk, skb, &drop_reason);
530 	if (err)
531 		goto out;
532 
533 	err = __sock_queue_rcv_skb(sk, skb);
534 	switch (err) {
535 	case -ENOMEM:
536 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
537 		break;
538 	case -ENOBUFS:
539 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
540 		break;
541 	default:
542 		drop_reason = SKB_NOT_DROPPED_YET;
543 		break;
544 	}
545 out:
546 	if (reason)
547 		*reason = drop_reason;
548 	return err;
549 }
550 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
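/* Illustrative caller (a sketch of a datagram protocol's receive hook, not
 * taken from this file):
 *
 *	enum skb_drop_reason reason;
 *
 *	if (sock_queue_rcv_skb_reason(sk, skb, &reason)) {
 *		kfree_skb_reason(skb, reason);
 *		return NET_RX_DROP;
 *	}
 *	return NET_RX_SUCCESS;
 *
 * On failure, @reason distinguishes a full receive buffer (-ENOMEM) from
 * protocol memory pressure (-ENOBUFS) for drop-reason tracing.
 */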
551 
552 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
553 		     const int nested, unsigned int trim_cap, bool refcounted)
554 {
555 	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
556 	int rc = NET_RX_SUCCESS;
557 	int err;
558 
559 	if (sk_filter_trim_cap(sk, skb, trim_cap, &reason))
560 		goto discard_and_relse;
561 
562 	skb->dev = NULL;
563 
564 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
565 		sk_drops_inc(sk);
566 		reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
567 		goto discard_and_relse;
568 	}
569 	if (nested)
570 		bh_lock_sock_nested(sk);
571 	else
572 		bh_lock_sock(sk);
573 	if (!sock_owned_by_user(sk)) {
574 		/*
575 		 * trylock + unlock semantics:
576 		 */
577 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
578 
579 		rc = sk_backlog_rcv(sk, skb);
580 
581 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
582 	} else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) {
583 		bh_unlock_sock(sk);
584 		if (err == -ENOMEM)
585 			reason = SKB_DROP_REASON_PFMEMALLOC;
586 		if (err == -ENOBUFS)
587 			reason = SKB_DROP_REASON_SOCKET_BACKLOG;
588 		sk_drops_inc(sk);
589 		goto discard_and_relse;
590 	}
591 
592 	bh_unlock_sock(sk);
593 out:
594 	if (refcounted)
595 		sock_put(sk);
596 	return rc;
597 discard_and_relse:
598 	sk_skb_reason_drop(sk, skb, reason);
599 	goto out;
600 }
601 EXPORT_SYMBOL(__sk_receive_skb);
602 
603 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
604 							  u32));
605 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
606 							   u32));
607 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
608 {
609 	struct dst_entry *dst = __sk_dst_get(sk);
610 
611 	if (dst && READ_ONCE(dst->obsolete) &&
612 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
613 			       dst, cookie) == NULL) {
614 		sk_tx_queue_clear(sk);
615 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
616 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
617 		dst_release(dst);
618 		return NULL;
619 	}
620 
621 	return dst;
622 }
623 EXPORT_SYMBOL(__sk_dst_check);
624 
625 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
626 {
627 	struct dst_entry *dst = sk_dst_get(sk);
628 
629 	if (dst && READ_ONCE(dst->obsolete) &&
630 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
631 			       dst, cookie) == NULL) {
632 		sk_dst_reset(sk);
633 		dst_release(dst);
634 		return NULL;
635 	}
636 
637 	return dst;
638 }
639 EXPORT_SYMBOL(sk_dst_check);
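/* Illustrative flow (a sketch, not from this file): transmit paths that cache
 * a route revalidate it against a cookie before each use:
 *
 *	dst = sk_dst_check(sk, cookie);
 *	if (!dst) {
 *		// perform a fresh route lookup, then sk_dst_set(sk, dst)
 *	}
 *
 * Returning NULL here is what forces the caller to re-route.
 */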
640 
641 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
642 {
643 	int ret = -ENOPROTOOPT;
644 #ifdef CONFIG_NETDEVICES
645 	struct net *net = sock_net(sk);
646 
647 	/* Sorry... */
648 	ret = -EPERM;
649 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
650 		goto out;
651 
652 	ret = -EINVAL;
653 	if (ifindex < 0)
654 		goto out;
655 
656 	/* Paired with all READ_ONCE() done locklessly. */
657 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
658 
659 	if (sk->sk_prot->rehash)
660 		sk->sk_prot->rehash(sk);
661 	sk_dst_reset(sk);
662 
663 	ret = 0;
664 
665 out:
666 #endif
667 
668 	return ret;
669 }
670 
671 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
672 {
673 	int ret;
674 
675 	if (lock_sk)
676 		lock_sock(sk);
677 	ret = sock_bindtoindex_locked(sk, ifindex);
678 	if (lock_sk)
679 		release_sock(sk);
680 
681 	return ret;
682 }
683 EXPORT_SYMBOL(sock_bindtoindex);
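/* Illustrative in-kernel usage (hypothetical caller): a kernel socket user
 * wanting interface isolation can bind by ifindex directly:
 *
 *	err = sock_bindtoindex(sk, dev->ifindex, true);
 *
 * This takes the socket lock, rehashes the socket if the protocol implements
 * ->rehash(), and drops any cached route so the next lookup honours the
 * binding.
 */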
684 
685 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
686 {
687 	int ret = -ENOPROTOOPT;
688 #ifdef CONFIG_NETDEVICES
689 	struct net *net = sock_net(sk);
690 	char devname[IFNAMSIZ];
691 	int index;
692 
693 	ret = -EINVAL;
694 	if (optlen < 0)
695 		goto out;
696 
697 	/* Bind this socket to a particular device like "eth0",
698 	 * as specified in the passed interface name. If the
699 	 * name is "" or the option length is zero the socket
700 	 * is not bound.
701 	 */
702 	if (optlen > IFNAMSIZ - 1)
703 		optlen = IFNAMSIZ - 1;
704 	memset(devname, 0, sizeof(devname));
705 
706 	ret = -EFAULT;
707 	if (copy_from_sockptr(devname, optval, optlen))
708 		goto out;
709 
710 	index = 0;
711 	if (devname[0] != '\0') {
712 		struct net_device *dev;
713 
714 		rcu_read_lock();
715 		dev = dev_get_by_name_rcu(net, devname);
716 		if (dev)
717 			index = dev->ifindex;
718 		rcu_read_unlock();
719 		ret = -ENODEV;
720 		if (!dev)
721 			goto out;
722 	}
723 
724 	sockopt_lock_sock(sk);
725 	ret = sock_bindtoindex_locked(sk, index);
726 	sockopt_release_sock(sk);
727 out:
728 #endif
729 
730 	return ret;
731 }
732 
733 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
734 				sockptr_t optlen, int len)
735 {
736 	int ret = -ENOPROTOOPT;
737 #ifdef CONFIG_NETDEVICES
738 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
739 	struct net *net = sock_net(sk);
740 	char devname[IFNAMSIZ];
741 
742 	if (bound_dev_if == 0) {
743 		len = 0;
744 		goto zero;
745 	}
746 
747 	ret = -EINVAL;
748 	if (len < IFNAMSIZ)
749 		goto out;
750 
751 	ret = netdev_get_name(net, devname, bound_dev_if);
752 	if (ret)
753 		goto out;
754 
755 	len = strlen(devname) + 1;
756 
757 	ret = -EFAULT;
758 	if (copy_to_sockptr(optval, devname, len))
759 		goto out;
760 
761 zero:
762 	ret = -EFAULT;
763 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
764 		goto out;
765 
766 	ret = 0;
767 
768 out:
769 #endif
770 
771 	return ret;
772 }
773 
774 bool sk_mc_loop(const struct sock *sk)
775 {
776 	if (dev_recursion_level())
777 		return false;
778 	if (!sk)
779 		return true;
780 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
781 	switch (READ_ONCE(sk->sk_family)) {
782 	case AF_INET:
783 		return inet_test_bit(MC_LOOP, sk);
784 #if IS_ENABLED(CONFIG_IPV6)
785 	case AF_INET6:
786 		return inet6_test_bit(MC6_LOOP, sk);
787 #endif
788 	}
789 	WARN_ON_ONCE(1);
790 	return true;
791 }
792 EXPORT_SYMBOL(sk_mc_loop);
793 
794 void sock_set_reuseaddr(struct sock *sk)
795 {
796 	lock_sock(sk);
797 	sk->sk_reuse = SK_CAN_REUSE;
798 	release_sock(sk);
799 }
800 EXPORT_SYMBOL(sock_set_reuseaddr);
801 
802 void sock_set_reuseport(struct sock *sk)
803 {
804 	lock_sock(sk);
805 	sk->sk_reuseport = true;
806 	release_sock(sk);
807 }
808 EXPORT_SYMBOL(sock_set_reuseport);
809 
810 void sock_no_linger(struct sock *sk)
811 {
812 	lock_sock(sk);
813 	WRITE_ONCE(sk->sk_lingertime, 0);
814 	sock_set_flag(sk, SOCK_LINGER);
815 	release_sock(sk);
816 }
817 EXPORT_SYMBOL(sock_no_linger);
818 
819 void sock_set_priority(struct sock *sk, u32 priority)
820 {
821 	WRITE_ONCE(sk->sk_priority, priority);
822 }
823 EXPORT_SYMBOL(sock_set_priority);
824 
825 void sock_set_sndtimeo(struct sock *sk, s64 secs)
826 {
827 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
828 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
829 	else
830 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
831 }
832 EXPORT_SYMBOL(sock_set_sndtimeo);
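/* Illustrative usage of the sock_set_*() helpers above (a sketch for an
 * in-kernel TCP client; the variable names are hypothetical):
 *
 *	sock_set_reuseaddr(listen_sock->sk);
 *	sock_no_linger(conn_sock->sk);
 *	sock_set_sndtimeo(conn_sock->sk, 30);	// 30 second send timeout
 *
 * These wrappers exist so in-kernel users do not have to go through
 * sock_setsockopt() with sockptr_t arguments.
 */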
833 
834 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
835 {
836 	sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
837 	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
838 	if (val)  {
839 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
840 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
841 	}
842 }
843 
844 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
845 {
846 	switch (optname) {
847 	case SO_TIMESTAMP_OLD:
848 		__sock_set_timestamps(sk, valbool, false, false);
849 		break;
850 	case SO_TIMESTAMP_NEW:
851 		__sock_set_timestamps(sk, valbool, true, false);
852 		break;
853 	case SO_TIMESTAMPNS_OLD:
854 		__sock_set_timestamps(sk, valbool, false, true);
855 		break;
856 	case SO_TIMESTAMPNS_NEW:
857 		__sock_set_timestamps(sk, valbool, true, true);
858 		break;
859 	}
860 }
861 
862 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
863 {
864 	struct net *net = sock_net(sk);
865 	struct net_device *dev = NULL;
866 	bool match = false;
867 	int *vclock_index;
868 	int i, num;
869 
870 	if (sk->sk_bound_dev_if)
871 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
872 
873 	if (!dev) {
874 		pr_err("%s: socket is not bound to a device\n", __func__);
875 		return -EOPNOTSUPP;
876 	}
877 
878 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
879 	dev_put(dev);
880 
881 	for (i = 0; i < num; i++) {
882 		if (*(vclock_index + i) == phc_index) {
883 			match = true;
884 			break;
885 		}
886 	}
887 
888 	if (num > 0)
889 		kfree(vclock_index);
890 
891 	if (!match)
892 		return -EINVAL;
893 
894 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
895 
896 	return 0;
897 }
898 
899 int sock_set_timestamping(struct sock *sk, int optname,
900 			  struct so_timestamping timestamping)
901 {
902 	int val = timestamping.flags;
903 	int ret;
904 
905 	if (val & ~SOF_TIMESTAMPING_MASK)
906 		return -EINVAL;
907 
908 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
909 	    !(val & SOF_TIMESTAMPING_OPT_ID))
910 		return -EINVAL;
911 
912 	if (val & SOF_TIMESTAMPING_OPT_ID &&
913 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
914 		if (sk_is_tcp(sk)) {
915 			if ((1 << sk->sk_state) &
916 			    (TCPF_CLOSE | TCPF_LISTEN))
917 				return -EINVAL;
918 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
919 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
920 			else
921 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
922 		} else {
923 			atomic_set(&sk->sk_tskey, 0);
924 		}
925 	}
926 
927 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
928 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
929 		return -EINVAL;
930 
931 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
932 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
933 		if (ret)
934 			return ret;
935 	}
936 
937 	WRITE_ONCE(sk->sk_tsflags, val);
938 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
939 	sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));
940 
941 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
942 		sock_enable_timestamp(sk,
943 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
944 	else
945 		sock_disable_timestamp(sk,
946 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
947 	return 0;
948 }
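/* Illustrative userspace counterpart (an assumption about a typical caller):
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_SOFTWARE |
 *			 SOF_TIMESTAMPING_OPT_ID |
 *			 SOF_TIMESTAMPING_OPT_TSONLY,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 *
 * The validation above rejects unknown flags, OPT_ID_TCP without OPT_ID, and
 * OPT_STATS without OPT_TSONLY.
 */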
949 
950 #if defined(CONFIG_CGROUP_BPF)
951 void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
952 {
953 	struct bpf_sock_ops_kern sock_ops;
954 
955 	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
956 	sock_ops.op = op;
957 	sock_ops.is_fullsock = 1;
958 	sock_ops.sk = sk;
959 	bpf_skops_init_skb(&sock_ops, skb, 0);
960 	__cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
961 }
962 #endif
963 
964 void sock_set_keepalive(struct sock *sk)
965 {
966 	lock_sock(sk);
967 	if (sk->sk_prot->keepalive)
968 		sk->sk_prot->keepalive(sk, true);
969 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
970 	release_sock(sk);
971 }
972 EXPORT_SYMBOL(sock_set_keepalive);
973 
974 static void __sock_set_rcvbuf(struct sock *sk, int val)
975 {
976 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
977 	 * as a negative value.
978 	 */
979 	val = min_t(int, val, INT_MAX / 2);
980 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
981 
982 	/* We double it on the way in to account for "struct sk_buff" etc.
983 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
984 	 * will allow that much actual data to be received on that socket.
985 	 *
986 	 * Applications are unaware that "struct sk_buff" and other overheads
987 	 * allocate from the receive buffer during socket buffer allocation.
988 	 *
989 	 * And after considering the possible alternatives, returning the value
990 	 * we actually used in getsockopt is the most desirable behavior.
991 	 */
992 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
993 }
994 
995 void sock_set_rcvbuf(struct sock *sk, int val)
996 {
997 	lock_sock(sk);
998 	__sock_set_rcvbuf(sk, val);
999 	release_sock(sk);
1000 }
1001 EXPORT_SYMBOL(sock_set_rcvbuf);
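/* Worked example (illustrative): an application that sets SO_RCVBUF to 65536
 * ends up with sk_rcvbuf = max(65536 * 2, SOCK_MIN_RCVBUF) = 131072, and a
 * later getsockopt(SO_RCVBUF) reports that doubled value back, as the comment
 * in __sock_set_rcvbuf() explains.
 */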
1002 
1003 static void __sock_set_mark(struct sock *sk, u32 val)
1004 {
1005 	if (val != sk->sk_mark) {
1006 		WRITE_ONCE(sk->sk_mark, val);
1007 		sk_dst_reset(sk);
1008 	}
1009 }
1010 
1011 void sock_set_mark(struct sock *sk, u32 val)
1012 {
1013 	lock_sock(sk);
1014 	__sock_set_mark(sk, val);
1015 	release_sock(sk);
1016 }
1017 EXPORT_SYMBOL(sock_set_mark);
1018 
1019 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1020 {
1021 	/* Round down bytes to multiple of pages */
1022 	bytes = round_down(bytes, PAGE_SIZE);
1023 
1024 	WARN_ON(bytes > sk->sk_reserved_mem);
1025 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1026 	sk_mem_reclaim(sk);
1027 }
1028 
1029 static int sock_reserve_memory(struct sock *sk, int bytes)
1030 {
1031 	long allocated;
1032 	bool charged;
1033 	int pages;
1034 
1035 	if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk))
1036 		return -EOPNOTSUPP;
1037 
1038 	if (!bytes)
1039 		return 0;
1040 
1041 	pages = sk_mem_pages(bytes);
1042 
1043 	/* pre-charge to memcg */
1044 	charged = mem_cgroup_sk_charge(sk, pages,
1045 				       GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1046 	if (!charged)
1047 		return -ENOMEM;
1048 
1049 	if (sk->sk_bypass_prot_mem)
1050 		goto success;
1051 
1052 	/* pre-charge to forward_alloc */
1053 	sk_memory_allocated_add(sk, pages);
1054 	allocated = sk_memory_allocated(sk);
1055 
1056 	/* If the system goes into memory pressure with this
1057 	 * precharge, give up and return an error.
1058 	 */
1059 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1060 		sk_memory_allocated_sub(sk, pages);
1061 		mem_cgroup_sk_uncharge(sk, pages);
1062 		return -ENOMEM;
1063 	}
1064 
1065 success:
1066 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1067 
1068 	WRITE_ONCE(sk->sk_reserved_mem,
1069 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1070 
1071 	return 0;
1072 }
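/* Worked example (illustrative, assuming 4 KiB pages): SO_RESERVE_MEM with
 * val = 100000 bytes charges sk_mem_pages(100000) = 25 pages to the memcg,
 * then (unless bypassed) to the protocol's global accounting, and on success
 * adds 25 << PAGE_SHIFT = 102400 bytes to both sk_forward_alloc and
 * sk_reserved_mem.
 */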
1073 
1074 #ifdef CONFIG_PAGE_POOL
1075 
1076 /* This is the maximum number of tokens and frags that the user can pass to
1077  * SO_DEVMEM_DONTNEED in one syscall. The limit bounds the amount of memory
1078  * the kernel allocates to copy these tokens and prevents looping over the
1079  * frags for too long.
1080  */
1081 #define MAX_DONTNEED_TOKENS 128
1082 #define MAX_DONTNEED_FRAGS 1024
1083 
1084 static noinline_for_stack int
1085 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1086 {
1087 	unsigned int num_tokens, i, j, k, netmem_num = 0;
1088 	struct dmabuf_token *tokens;
1089 	int ret = 0, num_frags = 0;
1090 	netmem_ref netmems[16];
1091 
1092 	if (!sk_is_tcp(sk))
1093 		return -EBADF;
1094 
1095 	if (optlen % sizeof(*tokens) ||
1096 	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1097 		return -EINVAL;
1098 
1099 	num_tokens = optlen / sizeof(*tokens);
1100 	tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1101 	if (!tokens)
1102 		return -ENOMEM;
1103 
1104 	if (copy_from_sockptr(tokens, optval, optlen)) {
1105 		kvfree(tokens);
1106 		return -EFAULT;
1107 	}
1108 
1109 	xa_lock_bh(&sk->sk_user_frags);
1110 	for (i = 0; i < num_tokens; i++) {
1111 		for (j = 0; j < tokens[i].token_count; j++) {
1112 			if (++num_frags > MAX_DONTNEED_FRAGS)
1113 				goto frag_limit_reached;
1114 
1115 			netmem_ref netmem = (__force netmem_ref)__xa_erase(
1116 				&sk->sk_user_frags, tokens[i].token_start + j);
1117 
1118 			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1119 				continue;
1120 
1121 			netmems[netmem_num++] = netmem;
1122 			if (netmem_num == ARRAY_SIZE(netmems)) {
1123 				xa_unlock_bh(&sk->sk_user_frags);
1124 				for (k = 0; k < netmem_num; k++)
1125 					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1126 				netmem_num = 0;
1127 				xa_lock_bh(&sk->sk_user_frags);
1128 			}
1129 			ret++;
1130 		}
1131 	}
1132 
1133 frag_limit_reached:
1134 	xa_unlock_bh(&sk->sk_user_frags);
1135 	for (k = 0; k < netmem_num; k++)
1136 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1137 
1138 	kvfree(tokens);
1139 	return ret;
1140 }
1141 #endif
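/* Illustrative userspace counterpart (a sketch; a devmem TCP receiver hands
 * back fragments it has finished with):
 *
 *	struct dmabuf_token tok = { .token_start = id, .token_count = 1 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED, &tok, sizeof(tok));
 *
 * sock_devmem_dontneed() returns the number of frags actually released,
 * bounded by MAX_DONTNEED_TOKENS tokens and MAX_DONTNEED_FRAGS frags per call.
 */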
1142 
1143 void sockopt_lock_sock(struct sock *sk)
1144 {
1145 	/* When current->bpf_ctx is set, setsockopt() is being called from
1146 	 * a BPF program, and BPF has already ensured that the sk lock is
1147 	 * held before calling setsockopt().
1148 	 */
1149 	if (has_current_bpf_ctx())
1150 		return;
1151 
1152 	lock_sock(sk);
1153 }
1154 EXPORT_SYMBOL(sockopt_lock_sock);
1155 
1156 void sockopt_release_sock(struct sock *sk)
1157 {
1158 	if (has_current_bpf_ctx())
1159 		return;
1160 
1161 	release_sock(sk);
1162 }
1163 EXPORT_SYMBOL(sockopt_release_sock);
1164 
1165 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1166 {
1167 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1168 }
1169 EXPORT_SYMBOL(sockopt_ns_capable);
1170 
1171 bool sockopt_capable(int cap)
1172 {
1173 	return has_current_bpf_ctx() || capable(cap);
1174 }
1175 EXPORT_SYMBOL(sockopt_capable);
1176 
1177 static int sockopt_validate_clockid(__kernel_clockid_t value)
1178 {
1179 	switch (value) {
1180 	case CLOCK_REALTIME:
1181 	case CLOCK_MONOTONIC:
1182 	case CLOCK_TAI:
1183 		return 0;
1184 	}
1185 	return -EINVAL;
1186 }
1187 
1188 /*
1189  *	This is meant for all protocols to use and covers goings on
1190  *	at the socket level. Everything here is generic.
1191  */
1192 
1193 int sk_setsockopt(struct sock *sk, int level, int optname,
1194 		  sockptr_t optval, unsigned int optlen)
1195 {
1196 	struct so_timestamping timestamping;
1197 	struct socket *sock = sk->sk_socket;
1198 	struct sock_txtime sk_txtime;
1199 	int val;
1200 	int valbool;
1201 	struct linger ling;
1202 	int ret = 0;
1203 
1204 	/*
1205 	 *	Options without arguments
1206 	 */
1207 
1208 	if (optname == SO_BINDTODEVICE)
1209 		return sock_setbindtodevice(sk, optval, optlen);
1210 
1211 	if (optlen < sizeof(int))
1212 		return -EINVAL;
1213 
1214 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1215 		return -EFAULT;
1216 
1217 	valbool = val ? 1 : 0;
1218 
1219 	/* handle options which do not require locking the socket. */
1220 	switch (optname) {
1221 	case SO_PRIORITY:
1222 		if (sk_set_prio_allowed(sk, val)) {
1223 			sock_set_priority(sk, val);
1224 			return 0;
1225 		}
1226 		return -EPERM;
1227 	case SO_TYPE:
1228 	case SO_PROTOCOL:
1229 	case SO_DOMAIN:
1230 	case SO_ERROR:
1231 		return -ENOPROTOOPT;
1232 #ifdef CONFIG_NET_RX_BUSY_POLL
1233 	case SO_BUSY_POLL:
1234 		if (val < 0)
1235 			return -EINVAL;
1236 		WRITE_ONCE(sk->sk_ll_usec, val);
1237 		return 0;
1238 	case SO_PREFER_BUSY_POLL:
1239 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1240 			return -EPERM;
1241 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1242 		return 0;
1243 	case SO_BUSY_POLL_BUDGET:
1244 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1245 		    !sockopt_capable(CAP_NET_ADMIN))
1246 			return -EPERM;
1247 		if (val < 0 || val > U16_MAX)
1248 			return -EINVAL;
1249 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1250 		return 0;
1251 #endif
1252 	case SO_MAX_PACING_RATE:
1253 		{
1254 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1255 		unsigned long pacing_rate;
1256 
1257 		if (sizeof(ulval) != sizeof(val) &&
1258 		    optlen >= sizeof(ulval) &&
1259 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1260 			return -EFAULT;
1261 		}
1262 		if (ulval != ~0UL)
1263 			cmpxchg(&sk->sk_pacing_status,
1264 				SK_PACING_NONE,
1265 				SK_PACING_NEEDED);
1266 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1267 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1268 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1269 		if (ulval < pacing_rate)
1270 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1271 		return 0;
1272 		}
1273 	case SO_TXREHASH:
1274 		if (!sk_is_tcp(sk))
1275 			return -EOPNOTSUPP;
1276 		if (val < -1 || val > 1)
1277 			return -EINVAL;
1278 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1279 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1280 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1281 		 * and sk_getsockopt().
1282 		 */
1283 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1284 		return 0;
1285 	case SO_PEEK_OFF:
1286 		{
1287 		int (*set_peek_off)(struct sock *sk, int val);
1288 
1289 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1290 		if (set_peek_off)
1291 			ret = set_peek_off(sk, val);
1292 		else
1293 			ret = -EOPNOTSUPP;
1294 		return ret;
1295 		}
1296 #ifdef CONFIG_PAGE_POOL
1297 	case SO_DEVMEM_DONTNEED:
1298 		return sock_devmem_dontneed(sk, optval, optlen);
1299 #endif
1300 	case SO_SNDTIMEO_OLD:
1301 	case SO_SNDTIMEO_NEW:
1302 		return sock_set_timeout(&sk->sk_sndtimeo, optval,
1303 					optlen, optname == SO_SNDTIMEO_OLD);
1304 	case SO_RCVTIMEO_OLD:
1305 	case SO_RCVTIMEO_NEW:
1306 		return sock_set_timeout(&sk->sk_rcvtimeo, optval,
1307 					optlen, optname == SO_RCVTIMEO_OLD);
1308 	}
1309 
1310 	sockopt_lock_sock(sk);
1311 
1312 	switch (optname) {
1313 	case SO_DEBUG:
1314 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1315 			ret = -EACCES;
1316 		else
1317 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1318 		break;
1319 	case SO_REUSEADDR:
1320 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1321 		break;
1322 	case SO_REUSEPORT:
1323 		if (valbool && !sk_is_inet(sk))
1324 			ret = -EOPNOTSUPP;
1325 		else
1326 			sk->sk_reuseport = valbool;
1327 		break;
1328 	case SO_DONTROUTE:
1329 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1330 		sk_dst_reset(sk);
1331 		break;
1332 	case SO_BROADCAST:
1333 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1334 		break;
1335 	case SO_SNDBUF:
1336 		/* Don't error on this; BSD doesn't, and if you think
1337 		 * about it this is right. Otherwise apps have to
1338 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1339 		 * are treated in BSD as hints.
1340 		 */
1341 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1342 set_sndbuf:
1343 		/* Ensure val * 2 fits into an int, to prevent max_t()
1344 		 * from treating it as a negative value.
1345 		 */
1346 		val = min_t(int, val, INT_MAX / 2);
1347 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1348 		WRITE_ONCE(sk->sk_sndbuf,
1349 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1350 		/* Wake up sending tasks if we upped the value. */
1351 		sk->sk_write_space(sk);
1352 		break;
1353 
1354 	case SO_SNDBUFFORCE:
1355 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1356 			ret = -EPERM;
1357 			break;
1358 		}
1359 
1360 		/* No negative values (to prevent underflow, as val will be
1361 		 * multiplied by 2).
1362 		 */
1363 		if (val < 0)
1364 			val = 0;
1365 		goto set_sndbuf;
1366 
1367 	case SO_RCVBUF:
1368 		/* Don't error on this; BSD doesn't, and if you think
1369 		 * about it this is right. Otherwise apps have to
1370 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1371 		 * are treated in BSD as hints.
1372 		 */
1373 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1374 		break;
1375 
1376 	case SO_RCVBUFFORCE:
1377 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1378 			ret = -EPERM;
1379 			break;
1380 		}
1381 
1382 		/* No negative values (to prevent underflow, as val will be
1383 		 * multiplied by 2).
1384 		 */
1385 		__sock_set_rcvbuf(sk, max(val, 0));
1386 		break;
1387 
1388 	case SO_KEEPALIVE:
1389 		if (sk->sk_prot->keepalive)
1390 			sk->sk_prot->keepalive(sk, valbool);
1391 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1392 		break;
1393 
1394 	case SO_OOBINLINE:
1395 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1396 		break;
1397 
1398 	case SO_NO_CHECK:
1399 		sk->sk_no_check_tx = valbool;
1400 		break;
1401 
1402 	case SO_LINGER:
1403 		if (optlen < sizeof(ling)) {
1404 			ret = -EINVAL;	/* 1003.1g */
1405 			break;
1406 		}
1407 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1408 			ret = -EFAULT;
1409 			break;
1410 		}
1411 		if (!ling.l_onoff) {
1412 			sock_reset_flag(sk, SOCK_LINGER);
1413 		} else {
1414 			unsigned long t_sec = ling.l_linger;
1415 
1416 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1417 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1418 			else
1419 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1420 			sock_set_flag(sk, SOCK_LINGER);
1421 		}
1422 		break;
1423 
1424 	case SO_BSDCOMPAT:
1425 		break;
1426 
1427 	case SO_TIMESTAMP_OLD:
1428 	case SO_TIMESTAMP_NEW:
1429 	case SO_TIMESTAMPNS_OLD:
1430 	case SO_TIMESTAMPNS_NEW:
1431 		sock_set_timestamp(sk, optname, valbool);
1432 		break;
1433 
1434 	case SO_TIMESTAMPING_NEW:
1435 	case SO_TIMESTAMPING_OLD:
1436 		if (optlen == sizeof(timestamping)) {
1437 			if (copy_from_sockptr(&timestamping, optval,
1438 					      sizeof(timestamping))) {
1439 				ret = -EFAULT;
1440 				break;
1441 			}
1442 		} else {
1443 			memset(&timestamping, 0, sizeof(timestamping));
1444 			timestamping.flags = val;
1445 		}
1446 		ret = sock_set_timestamping(sk, optname, timestamping);
1447 		break;
1448 
1449 	case SO_RCVLOWAT:
1450 		{
1451 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1452 
1453 		if (val < 0)
1454 			val = INT_MAX;
1455 		if (sock)
1456 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1457 		if (set_rcvlowat)
1458 			ret = set_rcvlowat(sk, val);
1459 		else
1460 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1461 		break;
1462 		}
1463 	case SO_ATTACH_FILTER: {
1464 		struct sock_fprog fprog;
1465 
1466 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1467 		if (!ret)
1468 			ret = sk_attach_filter(&fprog, sk);
1469 		break;
1470 	}
1471 	case SO_ATTACH_BPF:
1472 		ret = -EINVAL;
1473 		if (optlen == sizeof(u32)) {
1474 			u32 ufd;
1475 
1476 			ret = -EFAULT;
1477 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1478 				break;
1479 
1480 			ret = sk_attach_bpf(ufd, sk);
1481 		}
1482 		break;
1483 
1484 	case SO_ATTACH_REUSEPORT_CBPF: {
1485 		struct sock_fprog fprog;
1486 
1487 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1488 		if (!ret)
1489 			ret = sk_reuseport_attach_filter(&fprog, sk);
1490 		break;
1491 	}
1492 	case SO_ATTACH_REUSEPORT_EBPF:
1493 		ret = -EINVAL;
1494 		if (optlen == sizeof(u32)) {
1495 			u32 ufd;
1496 
1497 			ret = -EFAULT;
1498 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1499 				break;
1500 
1501 			ret = sk_reuseport_attach_bpf(ufd, sk);
1502 		}
1503 		break;
1504 
1505 	case SO_DETACH_REUSEPORT_BPF:
1506 		ret = reuseport_detach_prog(sk);
1507 		break;
1508 
1509 	case SO_DETACH_FILTER:
1510 		ret = sk_detach_filter(sk);
1511 		break;
1512 
1513 	case SO_LOCK_FILTER:
1514 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1515 			ret = -EPERM;
1516 		else
1517 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1518 		break;
1519 
1520 	case SO_MARK:
1521 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1522 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1523 			ret = -EPERM;
1524 			break;
1525 		}
1526 
1527 		__sock_set_mark(sk, val);
1528 		break;
1529 	case SO_RCVMARK:
1530 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1531 		break;
1532 
1533 	case SO_RCVPRIORITY:
1534 		sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
1535 		break;
1536 
1537 	case SO_RXQ_OVFL:
1538 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1539 		break;
1540 
1541 	case SO_WIFI_STATUS:
1542 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1543 		break;
1544 
1545 	case SO_NOFCS:
1546 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1547 		break;
1548 
1549 	case SO_SELECT_ERR_QUEUE:
1550 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1551 		break;
1552 
1553 	case SO_PASSCRED:
1554 		if (sk_may_scm_recv(sk))
1555 			sk->sk_scm_credentials = valbool;
1556 		else
1557 			ret = -EOPNOTSUPP;
1558 		break;
1559 
1560 	case SO_PASSSEC:
1561 		if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
1562 			sk->sk_scm_security = valbool;
1563 		else
1564 			ret = -EOPNOTSUPP;
1565 		break;
1566 
1567 	case SO_PASSPIDFD:
1568 		if (sk_is_unix(sk))
1569 			sk->sk_scm_pidfd = valbool;
1570 		else
1571 			ret = -EOPNOTSUPP;
1572 		break;
1573 
1574 	case SO_PASSRIGHTS:
1575 		if (sk_is_unix(sk))
1576 			sk->sk_scm_rights = valbool;
1577 		else
1578 			ret = -EOPNOTSUPP;
1579 		break;
1580 
1581 	case SO_INCOMING_CPU:
1582 		reuseport_update_incoming_cpu(sk, val);
1583 		break;
1584 
1585 	case SO_CNX_ADVICE:
1586 		if (val == 1)
1587 			dst_negative_advice(sk);
1588 		break;
1589 
1590 	case SO_ZEROCOPY:
1591 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1592 			if (!(sk_is_tcp(sk) ||
1593 			      (sk->sk_type == SOCK_DGRAM &&
1594 			       sk->sk_protocol == IPPROTO_UDP)))
1595 				ret = -EOPNOTSUPP;
1596 		} else if (sk->sk_family != PF_RDS) {
1597 			ret = -EOPNOTSUPP;
1598 		}
1599 		if (!ret) {
1600 			if (val < 0 || val > 1)
1601 				ret = -EINVAL;
1602 			else
1603 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1604 		}
1605 		break;
1606 
1607 	case SO_TXTIME:
1608 		if (optlen != sizeof(struct sock_txtime)) {
1609 			ret = -EINVAL;
1610 			break;
1611 		} else if (copy_from_sockptr(&sk_txtime, optval,
1612 			   sizeof(struct sock_txtime))) {
1613 			ret = -EFAULT;
1614 			break;
1615 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1616 			ret = -EINVAL;
1617 			break;
1618 		}
1619 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1620 		 * scheduler has enough safeguards.
1621 		 */
1622 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1623 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1624 			ret = -EPERM;
1625 			break;
1626 		}
1627 
1628 		ret = sockopt_validate_clockid(sk_txtime.clockid);
1629 		if (ret)
1630 			break;
1631 
1632 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1633 		sk->sk_clockid = sk_txtime.clockid;
1634 		sk->sk_txtime_deadline_mode =
1635 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1636 		sk->sk_txtime_report_errors =
1637 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1638 		break;
1639 
1640 	case SO_BINDTOIFINDEX:
1641 		ret = sock_bindtoindex_locked(sk, val);
1642 		break;
1643 
1644 	case SO_BUF_LOCK:
1645 		if (val & ~SOCK_BUF_LOCK_MASK) {
1646 			ret = -EINVAL;
1647 			break;
1648 		}
1649 		sk->sk_userlocks = val | (sk->sk_userlocks &
1650 					  ~SOCK_BUF_LOCK_MASK);
1651 		break;
1652 
1653 	case SO_RESERVE_MEM:
1654 	{
1655 		int delta;
1656 
1657 		if (val < 0) {
1658 			ret = -EINVAL;
1659 			break;
1660 		}
1661 
1662 		delta = val - sk->sk_reserved_mem;
1663 		if (delta < 0)
1664 			sock_release_reserved_memory(sk, -delta);
1665 		else
1666 			ret = sock_reserve_memory(sk, delta);
1667 		break;
1668 	}
1669 
1670 	default:
1671 		ret = -ENOPROTOOPT;
1672 		break;
1673 	}
1674 	sockopt_release_sock(sk);
1675 	return ret;
1676 }
1677 
1678 int sock_setsockopt(struct socket *sock, int level, int optname,
1679 		    sockptr_t optval, unsigned int optlen)
1680 {
1681 	return sk_setsockopt(sock->sk, level, optname,
1682 			     optval, optlen);
1683 }
1684 EXPORT_SYMBOL(sock_setsockopt);
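/* Illustrative call (a sketch): in-kernel users pass kernel pointers through
 * sockptr_t, e.g.
 *
 *	int one = 1;
 *
 *	sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
 *			KERNEL_SOCKPTR(&one), sizeof(one));
 *
 * while the setsockopt() syscall path wraps the user pointer with
 * USER_SOCKPTR() instead.
 */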
1685 
1686 static const struct cred *sk_get_peer_cred(struct sock *sk)
1687 {
1688 	const struct cred *cred;
1689 
1690 	spin_lock(&sk->sk_peer_lock);
1691 	cred = get_cred(sk->sk_peer_cred);
1692 	spin_unlock(&sk->sk_peer_lock);
1693 
1694 	return cred;
1695 }
1696 
1697 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1698 			  struct ucred *ucred)
1699 {
1700 	ucred->pid = pid_vnr(pid);
1701 	ucred->uid = ucred->gid = -1;
1702 	if (cred) {
1703 		struct user_namespace *current_ns = current_user_ns();
1704 
1705 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1706 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1707 	}
1708 }
1709 
1710 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1711 {
1712 	struct user_namespace *user_ns = current_user_ns();
1713 	int i;
1714 
1715 	for (i = 0; i < src->ngroups; i++) {
1716 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1717 
1718 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1719 			return -EFAULT;
1720 	}
1721 
1722 	return 0;
1723 }
1724 
1725 int sk_getsockopt(struct sock *sk, int level, int optname,
1726 		  sockptr_t optval, sockptr_t optlen)
1727 {
1728 	struct socket *sock = sk->sk_socket;
1729 
1730 	union {
1731 		int val;
1732 		u64 val64;
1733 		unsigned long ulval;
1734 		struct linger ling;
1735 		struct old_timeval32 tm32;
1736 		struct __kernel_old_timeval tm;
1737 		struct  __kernel_sock_timeval stm;
1738 		struct sock_txtime txtime;
1739 		struct so_timestamping timestamping;
1740 	} v;
1741 
1742 	int lv = sizeof(int);
1743 	int len;
1744 
1745 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1746 		return -EFAULT;
1747 	if (len < 0)
1748 		return -EINVAL;
1749 
1750 	memset(&v, 0, sizeof(v));
1751 
1752 	switch (optname) {
1753 	case SO_DEBUG:
1754 		v.val = sock_flag(sk, SOCK_DBG);
1755 		break;
1756 
1757 	case SO_DONTROUTE:
1758 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1759 		break;
1760 
1761 	case SO_BROADCAST:
1762 		v.val = sock_flag(sk, SOCK_BROADCAST);
1763 		break;
1764 
1765 	case SO_SNDBUF:
1766 		v.val = READ_ONCE(sk->sk_sndbuf);
1767 		break;
1768 
1769 	case SO_RCVBUF:
1770 		v.val = READ_ONCE(sk->sk_rcvbuf);
1771 		break;
1772 
1773 	case SO_REUSEADDR:
1774 		v.val = sk->sk_reuse;
1775 		break;
1776 
1777 	case SO_REUSEPORT:
1778 		v.val = sk->sk_reuseport;
1779 		break;
1780 
1781 	case SO_KEEPALIVE:
1782 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1783 		break;
1784 
1785 	case SO_TYPE:
1786 		v.val = sk->sk_type;
1787 		break;
1788 
1789 	case SO_PROTOCOL:
1790 		v.val = sk->sk_protocol;
1791 		break;
1792 
1793 	case SO_DOMAIN:
1794 		v.val = sk->sk_family;
1795 		break;
1796 
1797 	case SO_ERROR:
1798 		v.val = -sock_error(sk);
1799 		if (v.val == 0)
1800 			v.val = xchg(&sk->sk_err_soft, 0);
1801 		break;
1802 
1803 	case SO_OOBINLINE:
1804 		v.val = sock_flag(sk, SOCK_URGINLINE);
1805 		break;
1806 
1807 	case SO_NO_CHECK:
1808 		v.val = sk->sk_no_check_tx;
1809 		break;
1810 
1811 	case SO_PRIORITY:
1812 		v.val = READ_ONCE(sk->sk_priority);
1813 		break;
1814 
1815 	case SO_LINGER:
1816 		lv		= sizeof(v.ling);
1817 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1818 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1819 		break;
1820 
1821 	case SO_BSDCOMPAT:
1822 		break;
1823 
1824 	case SO_TIMESTAMP_OLD:
1825 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1826 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1827 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1828 		break;
1829 
1830 	case SO_TIMESTAMPNS_OLD:
1831 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1832 		break;
1833 
1834 	case SO_TIMESTAMP_NEW:
1835 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1836 		break;
1837 
1838 	case SO_TIMESTAMPNS_NEW:
1839 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1840 		break;
1841 
1842 	case SO_TIMESTAMPING_OLD:
1843 	case SO_TIMESTAMPING_NEW:
1844 		lv = sizeof(v.timestamping);
1845 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1846 		 * returning the flags when they were set through the same option.
1847 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1848 		 */
1849 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1850 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1851 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1852 		}
1853 		break;
1854 
1855 	case SO_RCVTIMEO_OLD:
1856 	case SO_RCVTIMEO_NEW:
1857 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1858 				      SO_RCVTIMEO_OLD == optname);
1859 		break;
1860 
1861 	case SO_SNDTIMEO_OLD:
1862 	case SO_SNDTIMEO_NEW:
1863 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1864 				      SO_SNDTIMEO_OLD == optname);
1865 		break;
1866 
1867 	case SO_RCVLOWAT:
1868 		v.val = READ_ONCE(sk->sk_rcvlowat);
1869 		break;
1870 
1871 	case SO_SNDLOWAT:
1872 		v.val = 1;
1873 		break;
1874 
1875 	case SO_PASSCRED:
1876 		if (!sk_may_scm_recv(sk))
1877 			return -EOPNOTSUPP;
1878 
1879 		v.val = sk->sk_scm_credentials;
1880 		break;
1881 
1882 	case SO_PASSPIDFD:
1883 		if (!sk_is_unix(sk))
1884 			return -EOPNOTSUPP;
1885 
1886 		v.val = sk->sk_scm_pidfd;
1887 		break;
1888 
1889 	case SO_PASSRIGHTS:
1890 		if (!sk_is_unix(sk))
1891 			return -EOPNOTSUPP;
1892 
1893 		v.val = sk->sk_scm_rights;
1894 		break;
1895 
1896 	case SO_PEERCRED:
1897 	{
1898 		struct ucred peercred;
1899 		if (len > sizeof(peercred))
1900 			len = sizeof(peercred);
1901 
1902 		spin_lock(&sk->sk_peer_lock);
1903 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1904 		spin_unlock(&sk->sk_peer_lock);
1905 
1906 		if (copy_to_sockptr(optval, &peercred, len))
1907 			return -EFAULT;
1908 		goto lenout;
1909 	}
1910 
1911 	case SO_PEERPIDFD:
1912 	{
1913 		struct pid *peer_pid;
1914 		struct file *pidfd_file = NULL;
1915 		unsigned int flags = 0;
1916 		int pidfd;
1917 
1918 		if (len > sizeof(pidfd))
1919 			len = sizeof(pidfd);
1920 
1921 		spin_lock(&sk->sk_peer_lock);
1922 		peer_pid = get_pid(sk->sk_peer_pid);
1923 		spin_unlock(&sk->sk_peer_lock);
1924 
1925 		if (!peer_pid)
1926 			return -ENODATA;
1927 
1928 		/* The use of PIDFD_STALE requires stashing the struct pid
1929 		 * on pidfs with pidfs_register_pid(), and only AF_UNIX
1930 		 * sockets are prepared for this.
1931 		 */
1932 		if (sk->sk_family == AF_UNIX)
1933 			flags = PIDFD_STALE;
1934 
1935 		pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
1936 		put_pid(peer_pid);
1937 		if (pidfd < 0)
1938 			return pidfd;
1939 
1940 		if (copy_to_sockptr(optval, &pidfd, len) ||
1941 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1942 			put_unused_fd(pidfd);
1943 			fput(pidfd_file);
1944 
1945 			return -EFAULT;
1946 		}
1947 
1948 		fd_install(pidfd, pidfd_file);
1949 		return 0;
1950 	}
1951 
1952 	case SO_PEERGROUPS:
1953 	{
1954 		const struct cred *cred;
1955 		int ret, n;
1956 
1957 		cred = sk_get_peer_cred(sk);
1958 		if (!cred)
1959 			return -ENODATA;
1960 
1961 		n = cred->group_info->ngroups;
1962 		if (len < n * sizeof(gid_t)) {
1963 			len = n * sizeof(gid_t);
1964 			put_cred(cred);
1965 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1966 		}
1967 		len = n * sizeof(gid_t);
1968 
1969 		ret = groups_to_user(optval, cred->group_info);
1970 		put_cred(cred);
1971 		if (ret)
1972 			return ret;
1973 		goto lenout;
1974 	}
1975 
1976 	case SO_PEERNAME:
1977 	{
1978 		struct sockaddr_storage address;
1979 
1980 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1981 		if (lv < 0)
1982 			return -ENOTCONN;
1983 		if (lv < len)
1984 			return -EINVAL;
1985 		if (copy_to_sockptr(optval, &address, len))
1986 			return -EFAULT;
1987 		goto lenout;
1988 	}
1989 
1990 	/* Dubious BSD thing... Probably nobody even uses it, but
1991 	 * the UNIX standard wants it for whatever reason... -DaveM
1992 	 */
1993 	case SO_ACCEPTCONN:
1994 		v.val = sk->sk_state == TCP_LISTEN;
1995 		break;
1996 
1997 	case SO_PASSSEC:
1998 		if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
1999 			return -EOPNOTSUPP;
2000 
2001 		v.val = sk->sk_scm_security;
2002 		break;
2003 
2004 	case SO_PEERSEC:
2005 		return security_socket_getpeersec_stream(sock,
2006 							 optval, optlen, len);
2007 
2008 	case SO_MARK:
2009 		v.val = READ_ONCE(sk->sk_mark);
2010 		break;
2011 
2012 	case SO_RCVMARK:
2013 		v.val = sock_flag(sk, SOCK_RCVMARK);
2014 		break;
2015 
2016 	case SO_RCVPRIORITY:
2017 		v.val = sock_flag(sk, SOCK_RCVPRIORITY);
2018 		break;
2019 
2020 	case SO_RXQ_OVFL:
2021 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
2022 		break;
2023 
2024 	case SO_WIFI_STATUS:
2025 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
2026 		break;
2027 
2028 	case SO_PEEK_OFF:
2029 		if (!READ_ONCE(sock->ops)->set_peek_off)
2030 			return -EOPNOTSUPP;
2031 
2032 		v.val = READ_ONCE(sk->sk_peek_off);
2033 		break;
2034 	case SO_NOFCS:
2035 		v.val = sock_flag(sk, SOCK_NOFCS);
2036 		break;
2037 
2038 	case SO_BINDTODEVICE:
2039 		return sock_getbindtodevice(sk, optval, optlen, len);
2040 
2041 	case SO_GET_FILTER:
2042 		len = sk_get_filter(sk, optval, len);
2043 		if (len < 0)
2044 			return len;
2045 
2046 		goto lenout;
2047 
2048 	case SO_LOCK_FILTER:
2049 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
2050 		break;
2051 
2052 	case SO_BPF_EXTENSIONS:
2053 		v.val = bpf_tell_extensions();
2054 		break;
2055 
2056 	case SO_SELECT_ERR_QUEUE:
2057 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
2058 		break;
2059 
2060 #ifdef CONFIG_NET_RX_BUSY_POLL
2061 	case SO_BUSY_POLL:
2062 		v.val = READ_ONCE(sk->sk_ll_usec);
2063 		break;
2064 	case SO_PREFER_BUSY_POLL:
2065 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
2066 		break;
2067 #endif
2068 
2069 	case SO_MAX_PACING_RATE:
2070 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
2071 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2072 			lv = sizeof(v.ulval);
2073 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2074 		} else {
2075 			/* 32bit version */
2076 			v.val = min_t(unsigned long, ~0U,
2077 				      READ_ONCE(sk->sk_max_pacing_rate));
2078 		}
2079 		break;
2080 
2081 	case SO_INCOMING_CPU:
2082 		v.val = READ_ONCE(sk->sk_incoming_cpu);
2083 		break;
2084 
2085 	case SO_MEMINFO:
2086 	{
2087 		u32 meminfo[SK_MEMINFO_VARS];
2088 
2089 		sk_get_meminfo(sk, meminfo);
2090 
2091 		len = min_t(unsigned int, len, sizeof(meminfo));
2092 		if (copy_to_sockptr(optval, &meminfo, len))
2093 			return -EFAULT;
2094 
2095 		goto lenout;
2096 	}
2097 
2098 #ifdef CONFIG_NET_RX_BUSY_POLL
2099 	case SO_INCOMING_NAPI_ID:
2100 		v.val = READ_ONCE(sk->sk_napi_id);
2101 
2102 		/* aggregate non-NAPI IDs down to 0 */
2103 		if (!napi_id_valid(v.val))
2104 			v.val = 0;
2105 
2106 		break;
2107 #endif
2108 
2109 	case SO_COOKIE:
2110 		lv = sizeof(u64);
2111 		if (len < lv)
2112 			return -EINVAL;
2113 		v.val64 = sock_gen_cookie(sk);
2114 		break;
2115 
2116 	case SO_ZEROCOPY:
2117 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
2118 		break;
2119 
2120 	case SO_TXTIME:
2121 		lv = sizeof(v.txtime);
2122 		v.txtime.clockid = sk->sk_clockid;
2123 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2124 				  SOF_TXTIME_DEADLINE_MODE : 0;
2125 		v.txtime.flags |= sk->sk_txtime_report_errors ?
2126 				  SOF_TXTIME_REPORT_ERRORS : 0;
2127 		break;
2128 
2129 	case SO_BINDTOIFINDEX:
2130 		v.val = READ_ONCE(sk->sk_bound_dev_if);
2131 		break;
2132 
2133 	case SO_NETNS_COOKIE:
2134 		lv = sizeof(u64);
2135 		if (len != lv)
2136 			return -EINVAL;
2137 		v.val64 = sock_net(sk)->net_cookie;
2138 		break;
2139 
2140 	case SO_BUF_LOCK:
2141 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2142 		break;
2143 
2144 	case SO_RESERVE_MEM:
2145 		v.val = READ_ONCE(sk->sk_reserved_mem);
2146 		break;
2147 
2148 	case SO_TXREHASH:
2149 		if (!sk_is_tcp(sk))
2150 			return -EOPNOTSUPP;
2151 
2152 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2153 		v.val = READ_ONCE(sk->sk_txrehash);
2154 		break;
2155 
2156 	default:
2157 		/* We implement the SO_SNDLOWAT etc to not be settable
2158 		 * (1003.1g 7).
2159 		 */
2160 		return -ENOPROTOOPT;
2161 	}
2162 
2163 	if (len > lv)
2164 		len = lv;
2165 	if (copy_to_sockptr(optval, &v, len))
2166 		return -EFAULT;
2167 lenout:
2168 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2169 		return -EFAULT;
2170 	return 0;
2171 }
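
/*
 * Illustrative userspace sketch (not part of this file, compiled out):
 * sk_getsockopt() clamps the caller-supplied length and writes the used
 * length back at "lenout", so callers may pass a buffer of any size.
 * Querying SO_MEMINFO shows the pattern; this assumes a toolchain that
 * exposes SO_MEMINFO and <linux/sock_diag.h>.
 */
#if 0
#include <stdio.h>
#include <sys/socket.h>
#include <linux/sock_diag.h>	/* SK_MEMINFO_* */

static void example_dump_meminfo(int fd)
{
	unsigned int mem[SK_MEMINFO_VARS] = {};
	socklen_t len = sizeof(mem);

	if (getsockopt(fd, SOL_SOCKET, SO_MEMINFO, mem, &len) == 0)
		printf("rmem_alloc=%u rcvbuf=%u (len=%u)\n",
		       mem[SK_MEMINFO_RMEM_ALLOC], mem[SK_MEMINFO_RCVBUF],
		       (unsigned int)len);
}
#endif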
2172 
2173 /*
2174  * Initialize an sk_lock.
2175  *
2176  * (We also register the sk_lock with the lock validator.)
2177  */
2178 static inline void sock_lock_init(struct sock *sk)
2179 {
2180 	sk_owner_clear(sk);
2181 
2182 	if (sk->sk_kern_sock)
2183 		sock_lock_init_class_and_name(
2184 			sk,
2185 			af_family_kern_slock_key_strings[sk->sk_family],
2186 			af_family_kern_slock_keys + sk->sk_family,
2187 			af_family_kern_key_strings[sk->sk_family],
2188 			af_family_kern_keys + sk->sk_family);
2189 	else
2190 		sock_lock_init_class_and_name(
2191 			sk,
2192 			af_family_slock_key_strings[sk->sk_family],
2193 			af_family_slock_keys + sk->sk_family,
2194 			af_family_key_strings[sk->sk_family],
2195 			af_family_keys + sk->sk_family);
2196 }
2197 
2198 /*
2199  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2200  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2201  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2202  */
2203 static void sock_copy(struct sock *nsk, const struct sock *osk)
2204 {
2205 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2206 #ifdef CONFIG_SECURITY_NETWORK
2207 	void *sptr = nsk->sk_security;
2208 #endif
2209 
2210 	/* If we move sk_tx_queue_mapping out of the private section,
2211 	 * we must check if sk_tx_queue_clear() is called after
2212 	 * sock_copy() in sk_clone_lock().
2213 	 */
2214 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2215 		     offsetof(struct sock, sk_dontcopy_begin) ||
2216 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2217 		     offsetof(struct sock, sk_dontcopy_end));
2218 
2219 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2220 
2221 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2222 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2223 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2224 
2225 #ifdef CONFIG_SECURITY_NETWORK
2226 	nsk->sk_security = sptr;
2227 	security_sk_clone(osk, nsk);
2228 #endif
2229 }
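
/*
 * Illustrative sketch (compiled out): how begin/end marker fields let a
 * copy routine skip a private region with two memcpy() calls, which is
 * what sock_copy() does around sk_dontcopy_begin/sk_dontcopy_end. The
 * struct and field names below are hypothetical.
 */
#if 0
struct example_obj {
	int		copied_head;	/* copied by the first memcpy() */
	int		dontcopy_begin[0];
	spinlock_t	private_lock;	/* never copied */
	void		*private_ptr;	/* never copied */
	int		dontcopy_end[0];
	int		copied_tail;	/* copied by the second memcpy() */
};

static void example_copy(struct example_obj *dst, const struct example_obj *src)
{
	memcpy(dst, src, offsetof(struct example_obj, dontcopy_begin));
	memcpy(&dst->dontcopy_end, &src->dontcopy_end,
	       sizeof(*src) - offsetof(struct example_obj, dontcopy_end));
}
#endif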
2230 
2231 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2232 		int family)
2233 {
2234 	struct sock *sk;
2235 	struct kmem_cache *slab;
2236 
2237 	slab = prot->slab;
2238 	if (slab != NULL) {
2239 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2240 		if (!sk)
2241 			return sk;
2242 		if (want_init_on_alloc(priority))
2243 			sk_prot_clear_nulls(sk, prot->obj_size);
2244 	} else
2245 		sk = kmalloc(prot->obj_size, priority);
2246 
2247 	if (sk != NULL) {
2248 		if (security_sk_alloc(sk, family, priority))
2249 			goto out_free;
2250 
2251 		if (!try_module_get(prot->owner))
2252 			goto out_free_sec;
2253 	}
2254 
2255 	return sk;
2256 
2257 out_free_sec:
2258 	security_sk_free(sk);
2259 out_free:
2260 	if (slab != NULL)
2261 		kmem_cache_free(slab, sk);
2262 	else
2263 		kfree(sk);
2264 	return NULL;
2265 }
2266 
2267 static void sk_prot_free(struct proto *prot, struct sock *sk)
2268 {
2269 	struct kmem_cache *slab;
2270 	struct module *owner;
2271 
2272 	owner = prot->owner;
2273 	slab = prot->slab;
2274 
2275 	cgroup_sk_free(&sk->sk_cgrp_data);
2276 	mem_cgroup_sk_free(sk);
2277 	security_sk_free(sk);
2278 
2279 	sk_owner_put(sk);
2280 
2281 	if (slab != NULL)
2282 		kmem_cache_free(slab, sk);
2283 	else
2284 		kfree(sk);
2285 	module_put(owner);
2286 }
2287 
2288 /**
2289  *	sk_alloc - All socket objects are allocated here
2290  *	@net: the applicable net namespace
2291  *	@family: protocol family
2292  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2293  *	@prot: struct proto associated with this new sock instance
2294  *	@kern: is this to be a kernel socket?
2295  */
2296 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2297 		      struct proto *prot, int kern)
2298 {
2299 	struct sock *sk;
2300 
2301 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2302 	if (sk) {
2303 		sk->sk_family = family;
2304 		/*
2305 		 * See comment in struct sock definition to understand
2306 		 * why we need sk_prot_creator -acme
2307 		 */
2308 		sk->sk_prot = sk->sk_prot_creator = prot;
2309 
2310 		if (READ_ONCE(net->core.sysctl_bypass_prot_mem))
2311 			sk->sk_bypass_prot_mem = 1;
2312 
2313 		sk->sk_kern_sock = kern;
2314 		sock_lock_init(sk);
2315 
2316 		sk->sk_net_refcnt = kern ? 0 : 1;
2317 		if (likely(sk->sk_net_refcnt)) {
2318 			get_net_track(net, &sk->ns_tracker, priority);
2319 			sock_inuse_add(net, 1);
2320 		} else {
2321 			net_passive_inc(net);
2322 			__netns_tracker_alloc(net, &sk->ns_tracker,
2323 					      false, priority);
2324 		}
2325 
2326 		sock_net_set(sk, net);
2327 		refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
2328 
2329 		mem_cgroup_sk_alloc(sk);
2330 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2331 		sock_update_classid(&sk->sk_cgrp_data);
2332 		sock_update_netprioidx(&sk->sk_cgrp_data);
2333 		sk_tx_queue_clear(sk);
2334 	}
2335 
2336 	return sk;
2337 }
2338 EXPORT_SYMBOL(sk_alloc);
2339 
2340 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2341  * grace period. This is the case for UDP sockets and TCP listeners.
2342  */
2343 static void __sk_destruct(struct rcu_head *head)
2344 {
2345 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2346 	struct net *net = sock_net(sk);
2347 	struct sk_filter *filter;
2348 
2349 	if (sk->sk_destruct)
2350 		sk->sk_destruct(sk);
2351 
2352 	filter = rcu_dereference_check(sk->sk_filter,
2353 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2354 	if (filter) {
2355 		sk_filter_uncharge(sk, filter);
2356 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2357 	}
2358 
2359 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2360 
2361 #ifdef CONFIG_BPF_SYSCALL
2362 	bpf_sk_storage_free(sk);
2363 #endif
2364 
2365 	if (atomic_read(&sk->sk_omem_alloc))
2366 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2367 			 __func__, atomic_read(&sk->sk_omem_alloc));
2368 
2369 	if (sk->sk_frag.page) {
2370 		put_page(sk->sk_frag.page);
2371 		sk->sk_frag.page = NULL;
2372 	}
2373 
2374 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2375 	put_cred(sk->sk_peer_cred);
2376 	put_pid(sk->sk_peer_pid);
2377 
2378 	if (likely(sk->sk_net_refcnt)) {
2379 		put_net_track(net, &sk->ns_tracker);
2380 	} else {
2381 		__netns_tracker_free(net, &sk->ns_tracker, false);
2382 		net_passive_dec(net);
2383 	}
2384 	sk_prot_free(sk->sk_prot_creator, sk);
2385 }
2386 
2387 void sk_net_refcnt_upgrade(struct sock *sk)
2388 {
2389 	struct net *net = sock_net(sk);
2390 
2391 	WARN_ON_ONCE(sk->sk_net_refcnt);
2392 	__netns_tracker_free(net, &sk->ns_tracker, false);
2393 	net_passive_dec(net);
2394 	sk->sk_net_refcnt = 1;
2395 	get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
2396 	sock_inuse_add(net, 1);
2397 }
2398 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
2399 
2400 void sk_destruct(struct sock *sk)
2401 {
2402 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2403 
2404 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2405 		reuseport_detach_sock(sk);
2406 		use_call_rcu = true;
2407 	}
2408 
2409 	if (use_call_rcu)
2410 		call_rcu(&sk->sk_rcu, __sk_destruct);
2411 	else
2412 		__sk_destruct(&sk->sk_rcu);
2413 }
2414 
2415 static void __sk_free(struct sock *sk)
2416 {
2417 	if (likely(sk->sk_net_refcnt))
2418 		sock_inuse_add(sock_net(sk), -1);
2419 
2420 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2421 		sock_diag_broadcast_destroy(sk);
2422 	else
2423 		sk_destruct(sk);
2424 }
2425 
2426 void sk_free(struct sock *sk)
2427 {
2428 	/*
2429 	 * We subtract one from sk_wmem_alloc, which tells us whether
2430 	 * some packets are still in some tx queue.
2431 	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
2432 	 */
2433 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2434 		__sk_free(sk);
2435 }
2436 EXPORT_SYMBOL(sk_free);
2437 
2438 static void sk_init_common(struct sock *sk)
2439 {
2440 	skb_queue_head_init(&sk->sk_receive_queue);
2441 	skb_queue_head_init(&sk->sk_write_queue);
2442 	skb_queue_head_init(&sk->sk_error_queue);
2443 
2444 	rwlock_init(&sk->sk_callback_lock);
2445 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2446 			af_rlock_keys + sk->sk_family,
2447 			af_family_rlock_key_strings[sk->sk_family]);
2448 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2449 			af_wlock_keys + sk->sk_family,
2450 			af_family_wlock_key_strings[sk->sk_family]);
2451 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2452 			af_elock_keys + sk->sk_family,
2453 			af_family_elock_key_strings[sk->sk_family]);
2454 	if (sk->sk_kern_sock)
2455 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2456 			af_kern_callback_keys + sk->sk_family,
2457 			af_family_kern_clock_key_strings[sk->sk_family]);
2458 	else
2459 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2460 			af_callback_keys + sk->sk_family,
2461 			af_family_clock_key_strings[sk->sk_family]);
2462 }
2463 
2464 /**
2465  * sk_clone - clone a socket
2466  * @sk: the socket to clone
2467  * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2468  * @lock: if true, lock the cloned sk
2469  *
2470  * If @lock is true, the clone is locked by bh_lock_sock(), and
2471  * caller must unlock socket even in error path by bh_unlock_sock().
2472  */
2473 struct sock *sk_clone(const struct sock *sk, const gfp_t priority,
2474 		      bool lock)
2475 {
2476 	struct proto *prot = READ_ONCE(sk->sk_prot);
2477 	struct sk_filter *filter;
2478 	bool is_charged = true;
2479 	struct sock *newsk;
2480 
2481 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2482 	if (!newsk)
2483 		goto out;
2484 
2485 	sock_copy(newsk, sk);
2486 
2487 	newsk->sk_prot_creator = prot;
2488 
2489 	/* SANITY */
2490 	if (likely(newsk->sk_net_refcnt)) {
2491 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2492 		sock_inuse_add(sock_net(newsk), 1);
2493 	} else {
2494 		/* Kernel sockets are not elevating the struct net refcount.
2495 		 * Instead, use a tracker to more easily detect if a layer
2496 		 * is not properly dismantling its kernel sockets at netns
2497 		 * destroy time.
2498 		 */
2499 		net_passive_inc(sock_net(newsk));
2500 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2501 				      false, priority);
2502 	}
2503 
2504 	sk_node_init(&newsk->sk_node);
2505 	sock_lock_init(newsk);
2506 
2507 	if (lock)
2508 		bh_lock_sock(newsk);
2509 
2510 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2511 	newsk->sk_backlog.len = 0;
2512 
2513 	atomic_set(&newsk->sk_rmem_alloc, 0);
2514 
2515 	refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
2516 
2517 	atomic_set(&newsk->sk_omem_alloc, 0);
2518 	sk_init_common(newsk);
2519 
2520 	newsk->sk_dst_cache	= NULL;
2521 	newsk->sk_dst_pending_confirm = 0;
2522 	newsk->sk_wmem_queued	= 0;
2523 	newsk->sk_forward_alloc = 0;
2524 	newsk->sk_reserved_mem  = 0;
2525 	DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters);
2526 	sk_drops_reset(newsk);
2527 	newsk->sk_send_head	= NULL;
2528 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2529 	atomic_set(&newsk->sk_zckey, 0);
2530 
2531 	sock_reset_flag(newsk, SOCK_DONE);
2532 
2533 #ifdef CONFIG_MEMCG
2534 	/* sk->sk_memcg will be populated at accept() time */
2535 	newsk->sk_memcg = NULL;
2536 #endif
2537 
2538 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2539 
2540 	rcu_read_lock();
2541 	filter = rcu_dereference(sk->sk_filter);
2542 	if (filter != NULL)
2543 		/* Though it's an empty new sock, the charging may fail if
2544 		 * sysctl_optmem_max was changed between the creation of the
2545 		 * original socket and the cloning.
2546 		 */
2547 		is_charged = sk_filter_charge(newsk, filter);
2548 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2549 	rcu_read_unlock();
2550 
2551 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2552 		/* We need to make sure that we don't uncharge the new
2553 		 * socket if we couldn't charge it in the first place
2554 		 * as otherwise we uncharge the parent's filter.
2555 		 */
2556 		if (!is_charged)
2557 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2558 
2559 		goto free;
2560 	}
2561 
2562 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2563 
2564 	if (bpf_sk_storage_clone(sk, newsk))
2565 		goto free;
2566 
2567 	/* Clear sk_user_data if parent had the pointer tagged
2568 	 * as not suitable for copying when cloning.
2569 	 */
2570 	if (sk_user_data_is_nocopy(newsk))
2571 		newsk->sk_user_data = NULL;
2572 
2573 	newsk->sk_err	   = 0;
2574 	newsk->sk_err_soft = 0;
2575 	newsk->sk_priority = 0;
2576 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2577 
2578 	/* Before updating sk_refcnt, we must commit prior changes to memory
2579 	 * (Documentation/RCU/rculist_nulls.rst for details)
2580 	 */
2581 	smp_wmb();
2582 	refcount_set(&newsk->sk_refcnt, 2);
2583 
2584 	sk_set_socket(newsk, NULL);
2585 	sk_tx_queue_clear(newsk);
2586 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2587 
2588 	if (newsk->sk_prot->sockets_allocated)
2589 		sk_sockets_allocated_inc(newsk);
2590 
2591 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2592 		net_enable_timestamp();
2593 out:
2594 	return newsk;
2595 free:
2596 	/* It is still a raw copy of the parent, so invalidate the
2597 	 * destructor and do a plain sk_free().
2598 	 */
2599 	newsk->sk_destruct = NULL;
2600 	if (lock)
2601 		bh_unlock_sock(newsk);
2602 	sk_free(newsk);
2603 	newsk = NULL;
2604 	goto out;
2605 }
2606 EXPORT_SYMBOL_GPL(sk_clone);
2607 
2608 static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev)
2609 {
2610 	bool is_ipv6 = false;
2611 	u32 max_size;
2612 
2613 #if IS_ENABLED(CONFIG_IPV6)
2614 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2615 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2616 #endif
2617 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2618 	max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) :
2619 			READ_ONCE(dev->gso_ipv4_max_size);
2620 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2621 		max_size = GSO_LEGACY_MAX_SIZE;
2622 
2623 	return max_size - (MAX_TCP_HEADER + 1);
2624 }
2625 
2626 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2627 {
2628 	const struct net_device *dev;
2629 	u32 max_segs = 1;
2630 
2631 	rcu_read_lock();
2632 	dev = dst_dev_rcu(dst);
2633 	sk->sk_route_caps = dev->features;
2634 	if (sk_is_tcp(sk)) {
2635 		struct inet_connection_sock *icsk = inet_csk(sk);
2636 
2637 		sk->sk_route_caps |= NETIF_F_GSO;
2638 		icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
2639 	}
2640 	if (sk->sk_route_caps & NETIF_F_GSO)
2641 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2642 	if (unlikely(sk->sk_gso_disabled))
2643 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2644 	if (sk_can_gso(sk)) {
2645 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2646 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2647 		} else {
2648 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2649 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev);
2650 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2651 			max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1);
2652 		}
2653 	}
2654 	sk->sk_gso_max_segs = max_segs;
2655 	sk_dst_set(sk, dst);
2656 	rcu_read_unlock();
2657 }
2658 EXPORT_SYMBOL_GPL(sk_setup_caps);
2659 
2660 /*
2661  *	Simple resource managers for sockets.
2662  */
2663 
2664 
2665 /*
2666  * Write buffer destructor automatically called from kfree_skb.
2667  */
2668 void sock_wfree(struct sk_buff *skb)
2669 {
2670 	unsigned int len = skb->truesize;
2671 	struct sock *sk = skb->sk;
2672 	bool free;
2673 	int old;
2674 
2675 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2676 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2677 		    sk->sk_write_space == sock_def_write_space) {
2678 			rcu_read_lock();
2679 			free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc,
2680 						       &old);
2681 			sock_def_write_space_wfree(sk, old - len);
2682 			rcu_read_unlock();
2683 			if (unlikely(free))
2684 				__sk_free(sk);
2685 			return;
2686 		}
2687 
2688 		/*
2689 		 * Keep a reference on sk_wmem_alloc; it will be released
2690 		 * after the sk_write_space() call.
2691 		 */
2692 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2693 		sk->sk_write_space(sk);
2694 		len = 1;
2695 	}
2696 	/*
2697 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2698 	 * could not do because of in-flight packets
2699 	 */
2700 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2701 		__sk_free(sk);
2702 }
2703 EXPORT_SYMBOL(sock_wfree);
2704 
2705 /* This variant of sock_wfree() is used by TCP,
2706  * since it sets SOCK_USE_WRITE_QUEUE.
2707  */
2708 void __sock_wfree(struct sk_buff *skb)
2709 {
2710 	struct sock *sk = skb->sk;
2711 
2712 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2713 		__sk_free(sk);
2714 }
2715 
2716 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2717 {
2718 	int old_wmem;
2719 
2720 	skb_orphan(skb);
2721 #ifdef CONFIG_INET
2722 	if (unlikely(!sk_fullsock(sk)))
2723 		return skb_set_owner_edemux(skb, sk);
2724 #endif
2725 	skb->sk = sk;
2726 	skb->destructor = sock_wfree;
2727 	skb_set_hash_from_sk(skb, sk);
2728 	/*
2729 	 * We used to take a refcount on sk, but the following operation
2730 	 * is enough to guarantee sk_free() won't free this sock until
2731 	 * all in-flight packets are completed
2732 	 */
2733 	__refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem);
2734 
2735 	/* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket
2736 	 * is in a host queue (qdisc, NIC queue).
2737 	 * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue
2738 	 * based on XPS for better performance.
2739 	 * Otherwise clear ooo_okay to not risk Out Of Order delivery.
2740 	 */
2741 	skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS);
2742 }
2743 EXPORT_SYMBOL(skb_set_owner_w);
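
/*
 * Illustrative sketch (compiled out): a transmit path charging an skb to
 * its socket. skb_set_owner_w() accounts skb->truesize in sk_wmem_alloc
 * and installs sock_wfree() as the destructor, which drops the charge
 * (and possibly the last write reference) once the skb is consumed.
 */
#if 0
static int example_xmit(struct sock *sk, struct sk_buff *skb)
{
	skb_set_owner_w(skb, sk);	/* charge sk->sk_wmem_alloc */
	return dev_queue_xmit(skb);	/* sock_wfree() runs when the skb is freed */
}
#endif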
2744 
2745 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2746 {
2747 	/* Drivers depend on in-order delivery for crypto offload;
2748 	 * a partial orphan breaks the out-of-order-OK logic.
2749 	 */
2750 	if (skb_is_decrypted(skb))
2751 		return false;
2752 
2753 	return (skb->destructor == sock_wfree ||
2754 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2755 }
2756 
2757 /* This helper is used by netem, as it can hold packets in its
2758  * delay queue. We want to allow the owner socket to send more
2759  * packets, as if they were already TX completed by a typical driver.
2760  * But we also want to keep skb->sk set because some packet schedulers
2761  * rely on it (sch_fq for example).
2762  */
2763 void skb_orphan_partial(struct sk_buff *skb)
2764 {
2765 	if (skb_is_tcp_pure_ack(skb))
2766 		return;
2767 
2768 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2769 		return;
2770 
2771 	skb_orphan(skb);
2772 }
2773 EXPORT_SYMBOL(skb_orphan_partial);
2774 
2775 /*
2776  * Read buffer destructor automatically called from kfree_skb.
2777  */
2778 void sock_rfree(struct sk_buff *skb)
2779 {
2780 	struct sock *sk = skb->sk;
2781 	unsigned int len = skb->truesize;
2782 
2783 	atomic_sub(len, &sk->sk_rmem_alloc);
2784 	sk_mem_uncharge(sk, len);
2785 }
2786 EXPORT_SYMBOL(sock_rfree);
2787 
2788 /*
2789  * Buffer destructor for skbs that are not used directly in read or write
2790  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2791  */
2792 void sock_efree(struct sk_buff *skb)
2793 {
2794 	sock_put(skb->sk);
2795 }
2796 EXPORT_SYMBOL(sock_efree);
2797 
2798 /* Buffer destructor for prefetch/receive path where reference count may
2799  * not be held, e.g. for listen sockets.
2800  */
2801 #ifdef CONFIG_INET
2802 void sock_pfree(struct sk_buff *skb)
2803 {
2804 	struct sock *sk = skb->sk;
2805 
2806 	if (!sk_is_refcounted(sk))
2807 		return;
2808 
2809 	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2810 		inet_reqsk(sk)->rsk_listener = NULL;
2811 		reqsk_free(inet_reqsk(sk));
2812 		return;
2813 	}
2814 
2815 	sock_gen_put(sk);
2816 }
2817 EXPORT_SYMBOL(sock_pfree);
2818 #endif /* CONFIG_INET */
2819 
2820 /*
2821  * Allocate a skb from the socket's send buffer.
2822  */
2823 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2824 			     gfp_t priority)
2825 {
2826 	if (force ||
2827 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2828 		struct sk_buff *skb = alloc_skb(size, priority);
2829 
2830 		if (skb) {
2831 			skb_set_owner_w(skb, sk);
2832 			return skb;
2833 		}
2834 	}
2835 	return NULL;
2836 }
2837 EXPORT_SYMBOL(sock_wmalloc);
2838 
2839 static void sock_ofree(struct sk_buff *skb)
2840 {
2841 	struct sock *sk = skb->sk;
2842 
2843 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2844 }
2845 
2846 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2847 			     gfp_t priority)
2848 {
2849 	struct sk_buff *skb;
2850 
2851 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2852 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2853 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2854 		return NULL;
2855 
2856 	skb = alloc_skb(size, priority);
2857 	if (!skb)
2858 		return NULL;
2859 
2860 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2861 	skb->sk = sk;
2862 	skb->destructor = sock_ofree;
2863 	return skb;
2864 }
2865 
2866 /*
2867  * Allocate a memory block from the socket's option memory buffer.
2868  */
2869 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2870 {
2871 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2872 
2873 	if ((unsigned int)size <= optmem_max &&
2874 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2875 		void *mem;
2876 		/* First do the add, to avoid the race if kmalloc
2877 		 * might sleep.
2878 		 */
2879 		atomic_add(size, &sk->sk_omem_alloc);
2880 		mem = kmalloc(size, priority);
2881 		if (mem)
2882 			return mem;
2883 		atomic_sub(size, &sk->sk_omem_alloc);
2884 	}
2885 	return NULL;
2886 }
2887 EXPORT_SYMBOL(sock_kmalloc);
2888 
2889 /*
2890  * Duplicate the input "src" memory block using the socket's
2891  * option memory buffer.
2892  */
2893 void *sock_kmemdup(struct sock *sk, const void *src,
2894 		   int size, gfp_t priority)
2895 {
2896 	void *mem;
2897 
2898 	mem = sock_kmalloc(sk, size, priority);
2899 	if (mem)
2900 		memcpy(mem, src, size);
2901 	return mem;
2902 }
2903 EXPORT_SYMBOL(sock_kmemdup);
2904 
2905 /* Free an option memory block. Note, we actually want the inline
2906  * here as this allows gcc to detect the nullify and fold away the
2907  * condition entirely.
2908  */
2909 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2910 				  const bool nullify)
2911 {
2912 	if (WARN_ON_ONCE(!mem))
2913 		return;
2914 	if (nullify)
2915 		kfree_sensitive(mem);
2916 	else
2917 		kfree(mem);
2918 	atomic_sub(size, &sk->sk_omem_alloc);
2919 }
2920 
2921 void sock_kfree_s(struct sock *sk, void *mem, int size)
2922 {
2923 	__sock_kfree_s(sk, mem, size, false);
2924 }
2925 EXPORT_SYMBOL(sock_kfree_s);
2926 
2927 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2928 {
2929 	__sock_kfree_s(sk, mem, size, true);
2930 }
2931 EXPORT_SYMBOL(sock_kzfree_s);
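
/*
 * Illustrative sketch (compiled out): a typical setsockopt-style user of
 * the option memory helpers, bounded by sysctl_optmem_max. The function
 * name and the parsing step are hypothetical.
 */
#if 0
static int example_set_option(struct sock *sk, sockptr_t optval,
			      unsigned int optlen)
{
	void *buf;

	buf = sock_kmalloc(sk, optlen, GFP_KERNEL);
	if (!buf)
		return -ENOBUFS;

	if (copy_from_sockptr(buf, optval, optlen)) {
		sock_kfree_s(sk, buf, optlen);
		return -EFAULT;
	}

	/* ... parse and apply the option held in buf ... */

	sock_kfree_s(sk, buf, optlen);
	return 0;
}
#endif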
2932 
2933 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2934    I think these locks should be removed for datagram sockets.
2935  */
2936 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2937 {
2938 	DEFINE_WAIT(wait);
2939 
2940 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2941 	for (;;) {
2942 		if (!timeo)
2943 			break;
2944 		if (signal_pending(current))
2945 			break;
2946 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2947 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2948 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2949 			break;
2950 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2951 			break;
2952 		if (READ_ONCE(sk->sk_err))
2953 			break;
2954 		timeo = schedule_timeout(timeo);
2955 	}
2956 	finish_wait(sk_sleep(sk), &wait);
2957 	return timeo;
2958 }
2959 
2960 
2961 /*
2962  *	Generic send/receive buffer handlers
2963  */
2964 
2965 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2966 				     unsigned long data_len, int noblock,
2967 				     int *errcode, int max_page_order)
2968 {
2969 	struct sk_buff *skb;
2970 	long timeo;
2971 	int err;
2972 
2973 	timeo = sock_sndtimeo(sk, noblock);
2974 	for (;;) {
2975 		err = sock_error(sk);
2976 		if (err != 0)
2977 			goto failure;
2978 
2979 		err = -EPIPE;
2980 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2981 			goto failure;
2982 
2983 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2984 			break;
2985 
2986 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2987 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2988 		err = -EAGAIN;
2989 		if (!timeo)
2990 			goto failure;
2991 		if (signal_pending(current))
2992 			goto interrupted;
2993 		timeo = sock_wait_for_wmem(sk, timeo);
2994 	}
2995 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2996 				   errcode, sk->sk_allocation);
2997 	if (skb)
2998 		skb_set_owner_w(skb, sk);
2999 	return skb;
3000 
3001 interrupted:
3002 	err = sock_intr_errno(timeo);
3003 failure:
3004 	*errcode = err;
3005 	return NULL;
3006 }
3007 EXPORT_SYMBOL(sock_alloc_send_pskb);
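
/*
 * Illustrative sketch (compiled out): a datagram send path reserving
 * headroom with sock_alloc_send_pskb(), which blocks on sndbuf space via
 * sock_wait_for_wmem() unless noblock is set. The headroom size and the
 * payload fill step are placeholders.
 */
#if 0
static struct sk_buff *example_alloc_packet(struct sock *sk, size_t len,
					    int noblock, int *err)
{
	struct sk_buff *skb;

	skb = sock_alloc_send_pskb(sk, MAX_HEADER + len, 0, noblock, err, 0);
	if (!skb)
		return NULL;		/* *err holds the reason */

	skb_reserve(skb, MAX_HEADER);	/* leave room for lower-layer headers */
	/* ... copy len bytes of payload into skb ... */
	return skb;
}
#endif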
3008 
3009 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
3010 		     struct sockcm_cookie *sockc)
3011 {
3012 	u32 tsflags;
3013 
3014 	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
3015 
3016 	switch (cmsg->cmsg_type) {
3017 	case SO_MARK:
3018 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
3019 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3020 			return -EPERM;
3021 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3022 			return -EINVAL;
3023 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
3024 		break;
3025 	case SO_TIMESTAMPING_OLD:
3026 	case SO_TIMESTAMPING_NEW:
3027 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3028 			return -EINVAL;
3029 
3030 		tsflags = *(u32 *)CMSG_DATA(cmsg);
3031 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
3032 			return -EINVAL;
3033 
3034 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
3035 		sockc->tsflags |= tsflags;
3036 		break;
3037 	case SCM_TXTIME:
3038 		if (!sock_flag(sk, SOCK_TXTIME))
3039 			return -EINVAL;
3040 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
3041 			return -EINVAL;
3042 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
3043 		break;
3044 	case SCM_TS_OPT_ID:
3045 		if (sk_is_tcp(sk))
3046 			return -EINVAL;
3047 		tsflags = READ_ONCE(sk->sk_tsflags);
3048 		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
3049 			return -EINVAL;
3050 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3051 			return -EINVAL;
3052 		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
3053 		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
3054 		break;
3055 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
3056 	case SCM_RIGHTS:
3057 	case SCM_CREDENTIALS:
3058 		break;
3059 	case SO_PRIORITY:
3060 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3061 			return -EINVAL;
3062 		if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
3063 			return -EPERM;
3064 		sockc->priority = *(u32 *)CMSG_DATA(cmsg);
3065 		break;
3066 	case SCM_DEVMEM_DMABUF:
3067 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3068 			return -EINVAL;
3069 		sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
3070 		break;
3071 	default:
3072 		return -EINVAL;
3073 	}
3074 	return 0;
3075 }
3076 EXPORT_SYMBOL(__sock_cmsg_send);
3077 
3078 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
3079 		   struct sockcm_cookie *sockc)
3080 {
3081 	struct cmsghdr *cmsg;
3082 	int ret;
3083 
3084 	for_each_cmsghdr(cmsg, msg) {
3085 		if (!CMSG_OK(msg, cmsg))
3086 			return -EINVAL;
3087 		if (cmsg->cmsg_level != SOL_SOCKET)
3088 			continue;
3089 		ret = __sock_cmsg_send(sk, cmsg, sockc);
3090 		if (ret)
3091 			return ret;
3092 	}
3093 	return 0;
3094 }
3095 EXPORT_SYMBOL(sock_cmsg_send);
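
/*
 * Illustrative sketch (compiled out): a sendmsg handler folding the
 * SOL_SOCKET control messages handled above into a sockcm_cookie before
 * building packets. example_xmit_with() is hypothetical.
 */
#if 0
static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sockcm_cookie sockc;
	int err;

	sockcm_init(&sockc, sk);	/* seed mark/priority/tsflags from sk */
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (err)
			return err;
	}

	return example_xmit_with(sk, msg, len, &sockc);
}
#endif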
3096 
3097 static void sk_enter_memory_pressure(struct sock *sk)
3098 {
3099 	if (!sk->sk_prot->enter_memory_pressure)
3100 		return;
3101 
3102 	sk->sk_prot->enter_memory_pressure(sk);
3103 }
3104 
3105 static void sk_leave_memory_pressure(struct sock *sk)
3106 {
3107 	if (sk->sk_prot->leave_memory_pressure) {
3108 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
3109 				     tcp_leave_memory_pressure, sk);
3110 	} else {
3111 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
3112 
3113 		if (memory_pressure && READ_ONCE(*memory_pressure))
3114 			WRITE_ONCE(*memory_pressure, 0);
3115 	}
3116 }
3117 
3118 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
3119 
3120 /**
3121  * skb_page_frag_refill - check that a page_frag contains enough room
3122  * @sz: minimum size of the fragment we want to get
3123  * @pfrag: pointer to page_frag
3124  * @gfp: priority for memory allocation
3125  *
3126  * Note: While this allocator tries to use high order pages, there is
3127  * no guarantee that allocations succeed. Therefore, @sz MUST be
3128  * less than or equal to PAGE_SIZE.
3129  */
3130 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3131 {
3132 	if (pfrag->page) {
3133 		if (page_ref_count(pfrag->page) == 1) {
3134 			pfrag->offset = 0;
3135 			return true;
3136 		}
3137 		if (pfrag->offset + sz <= pfrag->size)
3138 			return true;
3139 		put_page(pfrag->page);
3140 	}
3141 
3142 	pfrag->offset = 0;
3143 	if (SKB_FRAG_PAGE_ORDER &&
3144 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3145 		/* Avoid direct reclaim but allow kswapd to wake */
3146 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3147 					  __GFP_COMP | __GFP_NOWARN |
3148 					  __GFP_NORETRY,
3149 					  SKB_FRAG_PAGE_ORDER);
3150 		if (likely(pfrag->page)) {
3151 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3152 			return true;
3153 		}
3154 	}
3155 	pfrag->page = alloc_page(gfp);
3156 	if (likely(pfrag->page)) {
3157 		pfrag->size = PAGE_SIZE;
3158 		return true;
3159 	}
3160 	return false;
3161 }
3162 EXPORT_SYMBOL(skb_page_frag_refill);
3163 
3164 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3165 {
3166 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3167 		return true;
3168 
3169 	if (!sk->sk_bypass_prot_mem)
3170 		sk_enter_memory_pressure(sk);
3171 
3172 	sk_stream_moderate_sndbuf(sk);
3173 
3174 	return false;
3175 }
3176 EXPORT_SYMBOL(sk_page_frag_refill);
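
/*
 * Illustrative sketch (compiled out): copying user data into the
 * per-socket page fragment, the common consumer of sk_page_frag_refill().
 * Error handling is reduced to the essentials.
 */
#if 0
static int example_copy_to_frag(struct sock *sk, struct msghdr *msg, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;		/* caller should wait for memory */

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_from_iter(page_address(pfrag->page) + pfrag->offset,
			   copy, &msg->msg_iter) != copy)
		return -EFAULT;

	pfrag->offset += copy;
	return copy;
}
#endif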
3177 
3178 void __lock_sock(struct sock *sk)
3179 	__releases(&sk->sk_lock.slock)
3180 	__acquires(&sk->sk_lock.slock)
3181 {
3182 	DEFINE_WAIT(wait);
3183 
3184 	for (;;) {
3185 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3186 					TASK_UNINTERRUPTIBLE);
3187 		spin_unlock_bh(&sk->sk_lock.slock);
3188 		schedule();
3189 		spin_lock_bh(&sk->sk_lock.slock);
3190 		if (!sock_owned_by_user(sk))
3191 			break;
3192 	}
3193 	finish_wait(&sk->sk_lock.wq, &wait);
3194 }
3195 
3196 void __release_sock(struct sock *sk)
3197 	__releases(&sk->sk_lock.slock)
3198 	__acquires(&sk->sk_lock.slock)
3199 {
3200 	struct sk_buff *skb, *next;
3201 	int nb = 0;
3202 
3203 	while ((skb = sk->sk_backlog.head) != NULL) {
3204 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3205 
3206 		spin_unlock_bh(&sk->sk_lock.slock);
3207 
3208 		while (1) {
3209 			next = skb->next;
3210 			prefetch(next);
3211 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3212 			skb_mark_not_on_list(skb);
3213 			sk_backlog_rcv(sk, skb);
3214 
3215 			skb = next;
3216 			if (!skb)
3217 				break;
3218 
3219 			if (!(++nb & 15))
3220 				cond_resched();
3221 		}
3222 
3223 		spin_lock_bh(&sk->sk_lock.slock);
3224 	}
3225 
3226 	/*
3227 	 * Doing the zeroing here guarantees we cannot loop forever
3228 	 * while a wild producer attempts to flood us.
3229 	 */
3230 	sk->sk_backlog.len = 0;
3231 }
3232 
3233 void __sk_flush_backlog(struct sock *sk)
3234 {
3235 	spin_lock_bh(&sk->sk_lock.slock);
3236 	__release_sock(sk);
3237 
3238 	if (sk->sk_prot->release_cb)
3239 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3240 				     tcp_release_cb, sk);
3241 
3242 	spin_unlock_bh(&sk->sk_lock.slock);
3243 }
3244 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
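
/*
 * Illustrative sketch (compiled out): the receive-side pattern that feeds
 * the backlog drained by __release_sock(). When the socket is owned by a
 * process context, the packet is queued and later handled through
 * sk_backlog_rcv(). example_do_rcv() is hypothetical.
 */
#if 0
static int example_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret = 0;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk))
		ret = example_do_rcv(sk, skb);		/* process directly */
	else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))
		ret = -ENOBUFS;				/* backlog full, drop */
	bh_unlock_sock(sk);

	return ret;
}
#endif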
3245 
3246 /**
3247  * sk_wait_data - wait for data to arrive at sk_receive_queue
3248  * @sk:    sock to wait on
3249  * @timeo: for how long
3250  * @skb:   last skb seen on sk_receive_queue
3251  *
3252  * Now socket state including sk->sk_err is changed only under lock,
3253  * hence we may omit checks after joining the wait queue.
3254  * We check the receive queue before schedule() only as an optimization;
3255  * it is very likely that release_sock() added new data.
3256  */
3257 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3258 {
3259 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3260 	int rc;
3261 
3262 	add_wait_queue(sk_sleep(sk), &wait);
3263 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3264 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3265 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3266 	remove_wait_queue(sk_sleep(sk), &wait);
3267 	return rc;
3268 }
3269 EXPORT_SYMBOL(sk_wait_data);
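
/*
 * Illustrative sketch (compiled out): a blocking receive path built on
 * sk_wait_data(). The socket lock is assumed to be held; sk_wait_data()
 * releases and re-acquires it around the sleep via sk_wait_event().
 */
#if 0
static int example_wait_for_packet(struct sock *sk, int noblock)
{
	long timeo = sock_rcvtimeo(sk, noblock);

	while (skb_queue_empty(&sk->sk_receive_queue)) {
		if (READ_ONCE(sk->sk_err))
			return sock_error(sk);
		if (!timeo)
			return -EAGAIN;
		if (signal_pending(current))
			return sock_intr_errno(timeo);

		sk_wait_data(sk, &timeo, NULL);
	}

	return 0;
}
#endif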
3270 
3271 /**
3272  *	__sk_mem_raise_allocated - increase memory_allocated
3273  *	@sk: socket
3274  *	@size: memory size to allocate
3275  *	@amt: pages to allocate
3276  *	@kind: allocation type
3277  *
3278  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3279  *
3280  *	Unlike the globally shared limits among the sockets under the same protocol,
3281  *	consuming the budget of a memcg won't have a direct effect on the others.
3282  *	So be optimistic about memcg's tolerance, and leave the callers to decide
3283  *	whether or not to raise allocated through sk_under_memory_pressure() or
3284  *	its variants.
3285  */
3286 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3287 {
3288 	bool memcg_enabled = false, charged = false;
3289 	struct proto *prot = sk->sk_prot;
3290 	long allocated = 0;
3291 
3292 	if (!sk->sk_bypass_prot_mem) {
3293 		sk_memory_allocated_add(sk, amt);
3294 		allocated = sk_memory_allocated(sk);
3295 	}
3296 
3297 	if (mem_cgroup_sk_enabled(sk)) {
3298 		memcg_enabled = true;
3299 		charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
3300 		if (!charged)
3301 			goto suppress_allocation;
3302 	}
3303 
3304 	if (!allocated)
3305 		return 1;
3306 
3307 	/* Under limit. */
3308 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3309 		sk_leave_memory_pressure(sk);
3310 		return 1;
3311 	}
3312 
3313 	/* Under pressure. */
3314 	if (allocated > sk_prot_mem_limits(sk, 1))
3315 		sk_enter_memory_pressure(sk);
3316 
3317 	/* Over hard limit. */
3318 	if (allocated > sk_prot_mem_limits(sk, 2))
3319 		goto suppress_allocation;
3320 
3321 	/* Guarantee minimum buffer size under pressure (either global
3322 	 * or memcg) to make sure features described in RFC 7323 (TCP
3323 	 * Extensions for High Performance) work properly.
3324 	 *
3325 	 * This rule does NOT stand when usage exceeds the global or memcg hard
3326 	 * limit, or else a DoS attack could take place by spawning
3327 	 * lots of sockets whose usage is under the minimum buffer size.
3328 	 */
3329 	if (kind == SK_MEM_RECV) {
3330 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3331 			return 1;
3332 
3333 	} else { /* SK_MEM_SEND */
3334 		int wmem0 = sk_get_wmem0(sk, prot);
3335 
3336 		if (sk->sk_type == SOCK_STREAM) {
3337 			if (sk->sk_wmem_queued < wmem0)
3338 				return 1;
3339 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3340 			return 1;
3341 		}
3342 	}
3343 
3344 	if (sk_has_memory_pressure(sk)) {
3345 		u64 alloc;
3346 
3347 		/* The following 'average' heuristic is within the
3348 		 * scope of global accounting, so it only makes
3349 		 * sense for global memory pressure.
3350 		 */
3351 		if (!sk_under_global_memory_pressure(sk))
3352 			return 1;
3353 
3354 		/* Try to be fair among all the sockets under global
3355 		 * pressure by allowing the ones whose usage is below
3356 		 * average to grow.
3357 		 */
3358 		alloc = sk_sockets_allocated_read_positive(sk);
3359 		if (sk_prot_mem_limits(sk, 2) > alloc *
3360 		    sk_mem_pages(sk->sk_wmem_queued +
3361 				 atomic_read(&sk->sk_rmem_alloc) +
3362 				 sk->sk_forward_alloc))
3363 			return 1;
3364 	}
3365 
3366 suppress_allocation:
3367 
3368 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3369 		sk_stream_moderate_sndbuf(sk);
3370 
3371 		/* Fail only if socket is _under_ its sndbuf.
3372 		 * In this case we cannot block, so we have to fail.
3373 		 */
3374 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3375 			/* Force charge with __GFP_NOFAIL */
3376 			if (memcg_enabled && !charged)
3377 				mem_cgroup_sk_charge(sk, amt,
3378 						     gfp_memcg_charge() | __GFP_NOFAIL);
3379 			return 1;
3380 		}
3381 	}
3382 
3383 	trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3384 
3385 	if (allocated)
3386 		sk_memory_allocated_sub(sk, amt);
3387 
3388 	if (charged)
3389 		mem_cgroup_sk_uncharge(sk, amt);
3390 
3391 	return 0;
3392 }
3393 
3394 /**
3395  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3396  *	@sk: socket
3397  *	@size: memory size to allocate
3398  *	@kind: allocation type
3399  *
3400  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3401  *	rmem allocation. This function assumes that protocols which have
3402  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3403  */
3404 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3405 {
3406 	int ret, amt = sk_mem_pages(size);
3407 
3408 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3409 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3410 	if (!ret)
3411 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3412 	return ret;
3413 }
3414 EXPORT_SYMBOL(__sk_mem_schedule);
3415 
3416 /**
3417  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3418  *	@sk: socket
3419  *	@amount: number of quanta
3420  *
3421  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3422  */
3423 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3424 {
3425 	if (mem_cgroup_sk_enabled(sk))
3426 		mem_cgroup_sk_uncharge(sk, amount);
3427 
3428 	if (sk->sk_bypass_prot_mem)
3429 		return;
3430 
3431 	sk_memory_allocated_sub(sk, amount);
3432 
3433 	if (sk_under_global_memory_pressure(sk) &&
3434 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3435 		sk_leave_memory_pressure(sk);
3436 }
3437 
3438 /**
3439  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3440  *	@sk: socket
3441  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3442  */
3443 void __sk_mem_reclaim(struct sock *sk, int amount)
3444 {
3445 	amount >>= PAGE_SHIFT;
3446 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3447 	__sk_mem_reduce_allocated(sk, amount);
3448 }
3449 EXPORT_SYMBOL(__sk_mem_reclaim);
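
/*
 * Illustrative sketch (compiled out): a receive path charging an skb
 * against the accounting maintained by the functions above, using the
 * sock.h helpers layered on __sk_mem_schedule()/__sk_mem_reclaim().
 */
#if 0
static int example_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;	/* over the memory limits */

	skb_set_owner_r(skb, sk);	/* charge sk_rmem_alloc, uncharge on free */
	skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk);

	return 0;
}
#endif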
3450 
3451 int sk_set_peek_off(struct sock *sk, int val)
3452 {
3453 	WRITE_ONCE(sk->sk_peek_off, val);
3454 	return 0;
3455 }
3456 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3457 
3458 /*
3459  * Set of default routines for initialising struct proto_ops when
3460  * the protocol does not support a particular function. In certain
3461  * cases where it makes no sense for a protocol to have a "do nothing"
3462  * function, some default processing is provided.
3463  */
3464 
3465 int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len)
3466 {
3467 	return -EOPNOTSUPP;
3468 }
3469 EXPORT_SYMBOL(sock_no_bind);
3470 
3471 int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr,
3472 		    int len, int flags)
3473 {
3474 	return -EOPNOTSUPP;
3475 }
3476 EXPORT_SYMBOL(sock_no_connect);
3477 
3478 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3479 {
3480 	return -EOPNOTSUPP;
3481 }
3482 EXPORT_SYMBOL(sock_no_socketpair);
3483 
3484 int sock_no_accept(struct socket *sock, struct socket *newsock,
3485 		   struct proto_accept_arg *arg)
3486 {
3487 	return -EOPNOTSUPP;
3488 }
3489 EXPORT_SYMBOL(sock_no_accept);
3490 
3491 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3492 		    int peer)
3493 {
3494 	return -EOPNOTSUPP;
3495 }
3496 EXPORT_SYMBOL(sock_no_getname);
3497 
3498 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3499 {
3500 	return -EOPNOTSUPP;
3501 }
3502 EXPORT_SYMBOL(sock_no_ioctl);
3503 
3504 int sock_no_listen(struct socket *sock, int backlog)
3505 {
3506 	return -EOPNOTSUPP;
3507 }
3508 EXPORT_SYMBOL(sock_no_listen);
3509 
3510 int sock_no_shutdown(struct socket *sock, int how)
3511 {
3512 	return -EOPNOTSUPP;
3513 }
3514 EXPORT_SYMBOL(sock_no_shutdown);
3515 
3516 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3517 {
3518 	return -EOPNOTSUPP;
3519 }
3520 EXPORT_SYMBOL(sock_no_sendmsg);
3521 
3522 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3523 {
3524 	return -EOPNOTSUPP;
3525 }
3526 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3527 
3528 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3529 		    int flags)
3530 {
3531 	return -EOPNOTSUPP;
3532 }
3533 EXPORT_SYMBOL(sock_no_recvmsg);
3534 
3535 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3536 {
3537 	/* Mirror missing mmap method error code */
3538 	return -ENODEV;
3539 }
3540 EXPORT_SYMBOL(sock_no_mmap);
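
/*
 * Illustrative sketch (compiled out): wiring the sock_no_*() stubs above
 * into a proto_ops for operations a protocol does not support. All
 * example_* names and PF_EXAMPLE are hypothetical.
 */
#if 0
static const struct proto_ops example_dgram_ops = {
	.family		= PF_EXAMPLE,
	.owner		= THIS_MODULE,
	.release	= example_release,
	.bind		= example_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= example_getname,
	.poll		= datagram_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.sendmsg	= example_sendmsg,
	.recvmsg	= example_recvmsg,
	.mmap		= sock_no_mmap,
};
#endif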
3541 
3542 /*
3543  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3544  * various sock-based usage counts.
3545  */
3546 void __receive_sock(struct file *file)
3547 {
3548 	struct socket *sock;
3549 
3550 	sock = sock_from_file(file);
3551 	if (sock) {
3552 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3553 		sock_update_classid(&sock->sk->sk_cgrp_data);
3554 	}
3555 }
3556 
3557 /*
3558  *	Default Socket Callbacks
3559  */
3560 
3561 static void sock_def_wakeup(struct sock *sk)
3562 {
3563 	struct socket_wq *wq;
3564 
3565 	rcu_read_lock();
3566 	wq = rcu_dereference(sk->sk_wq);
3567 	if (skwq_has_sleeper(wq))
3568 		wake_up_interruptible_all(&wq->wait);
3569 	rcu_read_unlock();
3570 }
3571 
3572 static void sock_def_error_report(struct sock *sk)
3573 {
3574 	struct socket_wq *wq;
3575 
3576 	rcu_read_lock();
3577 	wq = rcu_dereference(sk->sk_wq);
3578 	if (skwq_has_sleeper(wq))
3579 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3580 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3581 	rcu_read_unlock();
3582 }
3583 
3584 void sock_def_readable(struct sock *sk)
3585 {
3586 	struct socket_wq *wq;
3587 
3588 	trace_sk_data_ready(sk);
3589 
3590 	rcu_read_lock();
3591 	wq = rcu_dereference(sk->sk_wq);
3592 	if (skwq_has_sleeper(wq))
3593 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3594 						EPOLLRDNORM | EPOLLRDBAND);
3595 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3596 	rcu_read_unlock();
3597 }
3598 
3599 static void sock_def_write_space(struct sock *sk)
3600 {
3601 	struct socket_wq *wq;
3602 
3603 	rcu_read_lock();
3604 
3605 	/* Do not wake up a writer until he can make "significant"
3606 	 * progress.  --DaveM
3607 	 */
3608 	if (sock_writeable(sk)) {
3609 		wq = rcu_dereference(sk->sk_wq);
3610 		if (skwq_has_sleeper(wq))
3611 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3612 						EPOLLWRNORM | EPOLLWRBAND);
3613 
3614 		/* Should agree with poll, otherwise some programs break */
3615 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3616 	}
3617 
3618 	rcu_read_unlock();
3619 }
3620 
3621 /* An optimised version of sock_def_write_space(), should only be called
3622  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3623  * ->sk_wmem_alloc.
3624  */
3625 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc)
3626 {
3627 	/* Do not wake up a writer until he can make "significant"
3628 	 * progress.  --DaveM
3629 	 */
3630 	if (__sock_writeable(sk, wmem_alloc)) {
3631 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3632 
3633 		/* rely on refcount_sub from sock_wfree() */
3634 		smp_mb__after_atomic();
3635 		if (wq && waitqueue_active(&wq->wait))
3636 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3637 						EPOLLWRNORM | EPOLLWRBAND);
3638 
3639 		/* Should agree with poll, otherwise some programs break */
3640 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3641 	}
3642 }
3643 
3644 static void sock_def_destruct(struct sock *sk)
3645 {
3646 }
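
/*
 * Illustrative sketch (compiled out): an in-kernel user replacing the
 * default sk_data_ready callback under sk_callback_lock, the usual
 * pattern for consumers such as sunrpc or ceph. struct example_conn and
 * its fields are hypothetical.
 */
#if 0
struct example_conn {
	struct work_struct	work;
	void			(*saved_data_ready)(struct sock *sk);
};

static void example_data_ready(struct sock *sk)
{
	struct example_conn *conn;

	read_lock_bh(&sk->sk_callback_lock);
	conn = sk->sk_user_data;
	if (conn)
		queue_work(system_wq, &conn->work);
	read_unlock_bh(&sk->sk_callback_lock);
}

static void example_install_callbacks(struct sock *sk, struct example_conn *conn)
{
	write_lock_bh(&sk->sk_callback_lock);
	sk->sk_user_data = conn;
	conn->saved_data_ready = sk->sk_data_ready;	/* restore on teardown */
	sk->sk_data_ready = example_data_ready;
	write_unlock_bh(&sk->sk_callback_lock);
}
#endif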
3647 
3648 void sk_send_sigurg(struct sock *sk)
3649 {
3650 	if (sk->sk_socket && sk->sk_socket->file)
3651 		if (send_sigurg(sk->sk_socket->file))
3652 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3653 }
3654 EXPORT_SYMBOL(sk_send_sigurg);
3655 
3656 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3657 		    unsigned long expires)
3658 {
3659 	if (!mod_timer(timer, expires))
3660 		sock_hold(sk);
3661 }
3662 EXPORT_SYMBOL(sk_reset_timer);
3663 
3664 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3665 {
3666 	if (timer_delete(timer))
3667 		__sock_put(sk);
3668 }
3669 EXPORT_SYMBOL(sk_stop_timer);
3670 
3671 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3672 {
3673 	if (timer_delete_sync(timer))
3674 		__sock_put(sk);
3675 }
3676 EXPORT_SYMBOL(sk_stop_timer_sync);
3677 
3678 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3679 {
3680 	sk_init_common(sk);
3681 	sk->sk_send_head	=	NULL;
3682 
3683 	timer_setup(&sk->sk_timer, NULL, 0);
3684 
3685 	sk->sk_allocation	=	GFP_KERNEL;
3686 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3687 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3688 	sk->sk_state		=	TCP_CLOSE;
3689 	sk->sk_use_task_frag	=	true;
3690 	sk_set_socket(sk, sock);
3691 
3692 	sock_set_flag(sk, SOCK_ZAPPED);
3693 
3694 	if (sock) {
3695 		sk->sk_type	=	sock->type;
3696 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3697 		sock->sk	=	sk;
3698 	} else {
3699 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3700 	}
3701 	sk->sk_uid	=	uid;
3702 
3703 	sk->sk_state_change	=	sock_def_wakeup;
3704 	sk->sk_data_ready	=	sock_def_readable;
3705 	sk->sk_write_space	=	sock_def_write_space;
3706 	sk->sk_error_report	=	sock_def_error_report;
3707 	sk->sk_destruct		=	sock_def_destruct;
3708 
3709 	sk->sk_frag.page	=	NULL;
3710 	sk->sk_frag.offset	=	0;
3711 	sk->sk_peek_off		=	-1;
3712 
3713 	sk->sk_peer_pid 	=	NULL;
3714 	sk->sk_peer_cred	=	NULL;
3715 	spin_lock_init(&sk->sk_peer_lock);
3716 
3717 	sk->sk_write_pending	=	0;
3718 	sk->sk_rcvlowat		=	1;
3719 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3720 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3721 
3722 	sk->sk_stamp = SK_DEFAULT_STAMP;
3723 #if BITS_PER_LONG==32
3724 	seqlock_init(&sk->sk_stamp_seq);
3725 #endif
3726 	atomic_set(&sk->sk_zckey, 0);
3727 
3728 #ifdef CONFIG_NET_RX_BUSY_POLL
3729 	sk->sk_napi_id		=	0;
3730 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3731 #endif
3732 
3733 	sk->sk_max_pacing_rate = ~0UL;
3734 	sk->sk_pacing_rate = ~0UL;
3735 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3736 	sk->sk_incoming_cpu = -1;
3737 
3738 	sk_rx_queue_clear(sk);
3739 	/*
3740 	 * Before updating sk_refcnt, we must commit prior changes to memory
3741 	 * (Documentation/RCU/rculist_nulls.rst for details)
3742 	 */
3743 	smp_wmb();
3744 	refcount_set(&sk->sk_refcnt, 1);
3745 	sk_drops_reset(sk);
3746 }
3747 EXPORT_SYMBOL(sock_init_data_uid);
3748 
3749 void sock_init_data(struct socket *sock, struct sock *sk)
3750 {
3751 	kuid_t uid = sock ?
3752 		SOCK_INODE(sock)->i_uid :
3753 		make_kuid(sock_net(sk)->user_ns, 0);
3754 
3755 	sock_init_data_uid(sock, sk, uid);
3756 }
3757 EXPORT_SYMBOL(sock_init_data);
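
/*
 * Illustrative sketch (compiled out): a protocol family's ->create()
 * handler combining sk_alloc() and sock_init_data(). example_ops,
 * example_proto and PF_EXAMPLE are hypothetical.
 */
#if 0
static int example_create(struct net *net, struct socket *sock,
			  int protocol, int kern)
{
	struct sock *sk;

	sock->ops = &example_ops;

	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);	/* default queues, callbacks, timeouts */
	sk->sk_protocol = protocol;

	return 0;
}
#endif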
3758 
3759 void lock_sock_nested(struct sock *sk, int subclass)
3760 {
3761 	/* The sk_lock has mutex_lock() semantics here. */
3762 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3763 
3764 	might_sleep();
3765 	spin_lock_bh(&sk->sk_lock.slock);
3766 	if (sock_owned_by_user_nocheck(sk))
3767 		__lock_sock(sk);
3768 	sk->sk_lock.owned = 1;
3769 	spin_unlock_bh(&sk->sk_lock.slock);
3770 }
3771 EXPORT_SYMBOL(lock_sock_nested);
3772 
3773 void release_sock(struct sock *sk)
3774 {
3775 	spin_lock_bh(&sk->sk_lock.slock);
3776 	if (sk->sk_backlog.tail)
3777 		__release_sock(sk);
3778 
3779 	if (sk->sk_prot->release_cb)
3780 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3781 				     tcp_release_cb, sk);
3782 
3783 	sock_release_ownership(sk);
3784 	if (waitqueue_active(&sk->sk_lock.wq))
3785 		wake_up(&sk->sk_lock.wq);
3786 	spin_unlock_bh(&sk->sk_lock.slock);
3787 }
3788 EXPORT_SYMBOL(release_sock);
3789 
3790 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3791 {
3792 	might_sleep();
3793 	spin_lock_bh(&sk->sk_lock.slock);
3794 
3795 	if (!sock_owned_by_user_nocheck(sk)) {
3796 		/*
3797 		 * Fast path return with bottom halves disabled and
3798 		 * sock::sk_lock.slock held.
3799 		 *
3800 		 * The 'mutex' is not contended and holding
3801 		 * sock::sk_lock.slock prevents all other lockers to
3802 		 * proceed so the corresponding unlock_sock_fast() can
3803 		 * avoid the slow path of release_sock() completely and
3804 		 * just release slock.
3805 		 *
3806 		 * From a semantical POV this is equivalent to 'acquiring'
3807 		 * the 'mutex', hence the corresponding lockdep
3808 		 * mutex_release() has to happen in the fast path of
3809 		 * unlock_sock_fast().
3810 		 */
3811 		return false;
3812 	}
3813 
3814 	__lock_sock(sk);
3815 	sk->sk_lock.owned = 1;
3816 	__acquire(&sk->sk_lock.slock);
3817 	spin_unlock_bh(&sk->sk_lock.slock);
3818 	return true;
3819 }
3820 EXPORT_SYMBOL(__lock_sock_fast);
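
/*
 * Illustrative sketch (compiled out): the lock_sock_fast()/unlock_sock_fast()
 * pairing built on __lock_sock_fast() above, suitable for short sections
 * that rarely contend with a process-context owner.
 */
#if 0
static int example_read_state(struct sock *sk)
{
	bool slow;
	int state;

	slow = lock_sock_fast(sk);	/* spinlock only in the uncontended case */
	state = sk->sk_state;
	unlock_sock_fast(sk, slow);	/* full release_sock() path only if slow */

	return state;
}
#endif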
3821 
3822 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3823 		   bool timeval, bool time32)
3824 {
3825 	struct sock *sk = sock->sk;
3826 	struct timespec64 ts;
3827 
3828 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3829 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3830 	if (ts.tv_sec == -1)
3831 		return -ENOENT;
3832 	if (ts.tv_sec == 0) {
3833 		ktime_t kt = ktime_get_real();
3834 		sock_write_timestamp(sk, kt);
3835 		ts = ktime_to_timespec64(kt);
3836 	}
3837 
3838 	if (timeval)
3839 		ts.tv_nsec /= 1000;
3840 
3841 #ifdef CONFIG_COMPAT_32BIT_TIME
3842 	if (time32)
3843 		return put_old_timespec32(&ts, userstamp);
3844 #endif
3845 #ifdef CONFIG_SPARC64
3846 	/* beware of padding in sparc64 timeval */
3847 	if (timeval && !in_compat_syscall()) {
3848 		struct __kernel_old_timeval __user tv = {
3849 			.tv_sec = ts.tv_sec,
3850 			.tv_usec = ts.tv_nsec,
3851 		};
3852 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3853 			return -EFAULT;
3854 		return 0;
3855 	}
3856 #endif
3857 	return put_timespec64(&ts, userstamp);
3858 }
3859 EXPORT_SYMBOL(sock_gettstamp);
3860 
3861 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3862 {
3863 	if (!sock_flag(sk, flag)) {
3864 		unsigned long previous_flags = sk->sk_flags;
3865 
3866 		sock_set_flag(sk, flag);
3867 		/*
3868 		 * we just set one of the two flags which require net
3869 		 * time stamping, but time stamping might have been on
3870 		 * already because of the other one
3871 		 */
3872 		if (sock_needs_netstamp(sk) &&
3873 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3874 			net_enable_timestamp();
3875 	}
3876 }
3877 
3878 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3879 		       int level, int type)
3880 {
3881 	struct sock_exterr_skb *serr;
3882 	struct sk_buff *skb;
3883 	int copied, err;
3884 
3885 	err = -EAGAIN;
3886 	skb = sock_dequeue_err_skb(sk);
3887 	if (skb == NULL)
3888 		goto out;
3889 
3890 	copied = skb->len;
3891 	if (copied > len) {
3892 		msg->msg_flags |= MSG_TRUNC;
3893 		copied = len;
3894 	}
3895 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3896 	if (err)
3897 		goto out_free_skb;
3898 
3899 	sock_recv_timestamp(msg, sk, skb);
3900 
3901 	serr = SKB_EXT_ERR(skb);
3902 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3903 
3904 	msg->msg_flags |= MSG_ERRQUEUE;
3905 	err = copied;
3906 
3907 out_free_skb:
3908 	kfree_skb(skb);
3909 out:
3910 	return err;
3911 }
3912 EXPORT_SYMBOL(sock_recv_errqueue);
3913 
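/* Example (illustrative sketch): sock_recv_errqueue() is the kernel side of
 * MSG_ERRQUEUE reads, so the matching userspace pattern (e.g. with IP_RECVERR
 * enabled on the socket) looks roughly like this, sizes and error handling
 * elided:
 *
 *	char data[256], ctrl[512];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
 *	};
 *
 *	recvmsg(fd, &msg, MSG_ERRQUEUE);
 *	// then walk CMSG_FIRSTHDR()/CMSG_NXTHDR() for IPPROTO_IP/IP_RECVERR
 *	// and read the struct sock_extended_err payload
 */
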
3914 /*
3915  *	Get a socket option on a socket.
3916  *
3917  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3918  *	asynchronous errors should be reported by getsockopt. We assume
3919  *	this means they are reported when you specify SO_ERROR (otherwise what is the point of it).
3920  */
3921 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3922 			   char __user *optval, int __user *optlen)
3923 {
3924 	struct sock *sk = sock->sk;
3925 
3926 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3927 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3928 }
3929 EXPORT_SYMBOL(sock_common_getsockopt);
3930 
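/* Example (illustrative sketch): the SO_ERROR convention discussed in the
 * comment above is how userspace collects an asynchronous error, e.g. after
 * a non-blocking connect() has signalled writability:
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *	// err == 0 means the connect succeeded; otherwise it holds the
 *	// pending error (ECONNREFUSED, ETIMEDOUT, ...)
 */
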
3931 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3932 			int flags)
3933 {
3934 	struct sock *sk = sock->sk;
3935 	int addr_len = 0;
3936 	int err;
3937 
3938 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3939 	if (err >= 0)
3940 		msg->msg_namelen = addr_len;
3941 	return err;
3942 }
3943 EXPORT_SYMBOL(sock_common_recvmsg);
3944 
3945 /*
3946  *	Set socket options on an inet socket.
3947  */
3948 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3949 			   sockptr_t optval, unsigned int optlen)
3950 {
3951 	struct sock *sk = sock->sk;
3952 
3953 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3954 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3955 }
3956 EXPORT_SYMBOL(sock_common_setsockopt);
3957 
3958 void sk_common_release(struct sock *sk)
3959 {
3960 	if (sk->sk_prot->destroy)
3961 		sk->sk_prot->destroy(sk);
3962 
3963 	/*
3964 	 * Observation: when sk_common_release is called, processes have
3965 	 * no access to the socket, but the network stack still does.
3966 	 * Step one, detach it from networking:
3967 	 *
3968 	 * A. Remove from hash tables.
3969 	 */
3970 
3971 	sk->sk_prot->unhash(sk);
3972 
3973 	/*
3974 	 * At this point the socket cannot receive new packets, but some may
3975 	 * still be in flight because some CPU ran the receiver and did the
3976 	 * hash table lookup before we unhashed the socket. They will reach
3977 	 * the receive queue and be purged by the socket destructor.
3978 	 *
3979 	 * Also we may still have packets pending on the receive queue and,
3980 	 * probably, our own packets waiting in device queues. sock_destroy
3981 	 * will drain the receive queue, but transmitted packets will delay
3982 	 * socket destruction until the last reference is released.
3983 	 */
3984 
3985 	sock_orphan(sk);
3986 
3987 	xfrm_sk_free_policy(sk);
3988 
3989 	sock_put(sk);
3990 }
3991 EXPORT_SYMBOL(sk_common_release);
3992 
3993 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3994 {
3995 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3996 
3997 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3998 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3999 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
4000 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
4001 	mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
4002 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
4003 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
4004 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
4005 	mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
4006 }
4007 
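/* Note (illustrative): sk_get_meminfo() fills the meminfo attribute exposed
 * through sock_diag (e.g. INET_DIAG_SKMEMINFO), which iproute2's "ss -m"
 * prints in the same SK_MEMINFO_* order as above, roughly:
 *
 *	skmem:(r<rmem_alloc>,rb<rcvbuf>,t<wmem_alloc>,tb<sndbuf>,
 *	       f<fwd_alloc>,w<wmem_queued>,o<optmem>,bl<backlog>,d<drops>)
 */
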
4008 #ifdef CONFIG_PROC_FS
4009 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
4010 
4011 int sock_prot_inuse_get(struct net *net, struct proto *prot)
4012 {
4013 	int cpu, idx = prot->inuse_idx;
4014 	int res = 0;
4015 
4016 	for_each_possible_cpu(cpu)
4017 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
4018 
4019 	return res >= 0 ? res : 0;
4020 }
4021 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
4022 
4023 int sock_inuse_get(struct net *net)
4024 {
4025 	int cpu, res = 0;
4026 
4027 	for_each_possible_cpu(cpu)
4028 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
4029 
4030 	return res;
4031 }
4033 EXPORT_SYMBOL_GPL(sock_inuse_get);
4034 
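/* Note (illustrative): these counters feed the per-protocol "sockets" column
 * of /proc/net/protocols (see proto_seq_printf() below) and, via
 * sock_inuse_get(), the "sockets: used N" line of /proc/net/sockstat, so a
 * quick way to observe them is:
 *
 *	cat /proc/net/sockstat /proc/net/protocols
 */
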
4035 static int __net_init sock_inuse_init_net(struct net *net)
4036 {
4037 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
4038 	if (net->core.prot_inuse == NULL)
4039 		return -ENOMEM;
4040 	return 0;
4041 }
4042 
4043 static void __net_exit sock_inuse_exit_net(struct net *net)
4044 {
4045 	free_percpu(net->core.prot_inuse);
4046 }
4047 
4048 static struct pernet_operations net_inuse_ops = {
4049 	.init = sock_inuse_init_net,
4050 	.exit = sock_inuse_exit_net,
4051 };
4052 
4053 static __init int net_inuse_init(void)
4054 {
4055 	if (register_pernet_subsys(&net_inuse_ops))
4056 		panic("Cannot initialize net inuse counters");
4057 
4058 	return 0;
4059 }
4060 
4061 core_initcall(net_inuse_init);
4062 
4063 static int assign_proto_idx(struct proto *prot)
4064 {
4065 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
4066 
4067 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
4068 		pr_err("PROTO_INUSE_NR exhausted\n");
4069 		return -ENOSPC;
4070 	}
4071 
4072 	set_bit(prot->inuse_idx, proto_inuse_idx);
4073 	return 0;
4074 }
4075 
4076 static void release_proto_idx(struct proto *prot)
4077 {
4078 	if (prot->inuse_idx != PROTO_INUSE_NR)
4079 		clear_bit(prot->inuse_idx, proto_inuse_idx);
4080 }
4081 #else
4082 static inline int assign_proto_idx(struct proto *prot)
4083 {
4084 	return 0;
4085 }
4086 
4087 static inline void release_proto_idx(struct proto *prot)
4088 {
4089 }
4090 
4091 #endif
4092 
4093 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
4094 {
4095 	if (!twsk_prot)
4096 		return;
4097 	kfree(twsk_prot->twsk_slab_name);
4098 	twsk_prot->twsk_slab_name = NULL;
4099 	kmem_cache_destroy(twsk_prot->twsk_slab);
4100 	twsk_prot->twsk_slab = NULL;
4101 }
4102 
4103 static int tw_prot_init(const struct proto *prot)
4104 {
4105 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
4106 
4107 	if (!twsk_prot)
4108 		return 0;
4109 
4110 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
4111 					      prot->name);
4112 	if (!twsk_prot->twsk_slab_name)
4113 		return -ENOMEM;
4114 
4115 	twsk_prot->twsk_slab =
4116 		kmem_cache_create(twsk_prot->twsk_slab_name,
4117 				  twsk_prot->twsk_obj_size, 0,
4118 				  SLAB_ACCOUNT | prot->slab_flags,
4119 				  NULL);
4120 	if (!twsk_prot->twsk_slab) {
4121 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
4122 			prot->name);
4123 		return -ENOMEM;
4124 	}
4125 
4126 	return 0;
4127 }
4128 
4129 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
4130 {
4131 	if (!rsk_prot)
4132 		return;
4133 	kfree(rsk_prot->slab_name);
4134 	rsk_prot->slab_name = NULL;
4135 	kmem_cache_destroy(rsk_prot->slab);
4136 	rsk_prot->slab = NULL;
4137 }
4138 
4139 static int req_prot_init(const struct proto *prot)
4140 {
4141 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
4142 
4143 	if (!rsk_prot)
4144 		return 0;
4145 
4146 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4147 					prot->name);
4148 	if (!rsk_prot->slab_name)
4149 		return -ENOMEM;
4150 
4151 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4152 					   rsk_prot->obj_size, 0,
4153 					   SLAB_ACCOUNT | prot->slab_flags,
4154 					   NULL);
4155 
4156 	if (!rsk_prot->slab) {
4157 		pr_crit("%s: Can't create request sock SLAB cache!\n",
4158 			prot->name);
4159 		return -ENOMEM;
4160 	}
4161 	return 0;
4162 }
4163 
4164 int proto_register(struct proto *prot, int alloc_slab)
4165 {
4166 	int ret = -ENOBUFS;
4167 
4168 	if (prot->memory_allocated && !prot->sysctl_mem) {
4169 		pr_err("%s: missing sysctl_mem\n", prot->name);
4170 		return -EINVAL;
4171 	}
4172 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4173 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4174 		return -EINVAL;
4175 	}
4176 	if (alloc_slab) {
4177 		prot->slab = kmem_cache_create_usercopy(prot->name,
4178 					prot->obj_size, 0,
4179 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4180 					prot->slab_flags,
4181 					prot->useroffset, prot->usersize,
4182 					NULL);
4183 
4184 		if (prot->slab == NULL) {
4185 			pr_crit("%s: Can't create sock SLAB cache!\n",
4186 				prot->name);
4187 			goto out;
4188 		}
4189 
4190 		if (req_prot_init(prot))
4191 			goto out_free_request_sock_slab;
4192 
4193 		if (tw_prot_init(prot))
4194 			goto out_free_timewait_sock_slab;
4195 	}
4196 
4197 	mutex_lock(&proto_list_mutex);
4198 	ret = assign_proto_idx(prot);
4199 	if (ret) {
4200 		mutex_unlock(&proto_list_mutex);
4201 		goto out_free_timewait_sock_slab;
4202 	}
4203 	list_add(&prot->node, &proto_list);
4204 	mutex_unlock(&proto_list_mutex);
4205 	return ret;
4206 
4207 out_free_timewait_sock_slab:
4208 	if (alloc_slab)
4209 		tw_prot_cleanup(prot->twsk_prot);
4210 out_free_request_sock_slab:
4211 	if (alloc_slab) {
4212 		req_prot_cleanup(prot->rsk_prot);
4213 
4214 		kmem_cache_destroy(prot->slab);
4215 		prot->slab = NULL;
4216 	}
4217 out:
4218 	return ret;
4219 }
4220 EXPORT_SYMBOL(proto_register);
4221 
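/* Example (illustrative sketch): the minimal shape of a proto_register()
 * caller; the names and field values below are placeholders, and real
 * protocols set many more struct proto operations.
 *
 *	static struct proto my_proto = {
 *		.name		= "MYPROTO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct my_sock),
 *	};
 *
 *	err = proto_register(&my_proto, 1);	// 1: also allocate a slab cache
 *	...
 *	proto_unregister(&my_proto);		// on module unload
 */
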
4222 void proto_unregister(struct proto *prot)
4223 {
4224 	mutex_lock(&proto_list_mutex);
4225 	release_proto_idx(prot);
4226 	list_del(&prot->node);
4227 	mutex_unlock(&proto_list_mutex);
4228 
4229 	kmem_cache_destroy(prot->slab);
4230 	prot->slab = NULL;
4231 
4232 	req_prot_cleanup(prot->rsk_prot);
4233 	tw_prot_cleanup(prot->twsk_prot);
4234 }
4235 EXPORT_SYMBOL(proto_unregister);
4236 
4237 int sock_load_diag_module(int family, int protocol)
4238 {
4239 	if (!protocol) {
4240 		if (!sock_is_registered(family))
4241 			return -ENOENT;
4242 
4243 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4244 				      NETLINK_SOCK_DIAG, family);
4245 	}
4246 
4247 #ifdef CONFIG_INET
4248 	if (family == AF_INET &&
4249 	    protocol != IPPROTO_RAW &&
4250 	    protocol < MAX_INET_PROTOS &&
4251 	    !rcu_access_pointer(inet_protos[protocol]))
4252 		return -ENOENT;
4253 #endif
4254 
4255 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4256 			      NETLINK_SOCK_DIAG, family, protocol);
4257 }
4258 EXPORT_SYMBOL(sock_load_diag_module);
4259 
4260 #ifdef CONFIG_PROC_FS
4261 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4262 	__acquires(proto_list_mutex)
4263 {
4264 	mutex_lock(&proto_list_mutex);
4265 	return seq_list_start_head(&proto_list, *pos);
4266 }
4267 
4268 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4269 {
4270 	return seq_list_next(v, &proto_list, pos);
4271 }
4272 
4273 static void proto_seq_stop(struct seq_file *seq, void *v)
4274 	__releases(proto_list_mutex)
4275 {
4276 	mutex_unlock(&proto_list_mutex);
4277 }
4278 
4279 static char proto_method_implemented(const void *method)
4280 {
4281 	return method == NULL ? 'n' : 'y';
4282 }
4283 static long sock_prot_memory_allocated(struct proto *proto)
4284 {
4285 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4286 }
4287 
4288 static const char *sock_prot_memory_pressure(struct proto *proto)
4289 {
4290 	return proto->memory_pressure != NULL ?
4291 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4292 }
4293 
4294 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4295 {
4297 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4298 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4299 		   proto->name,
4300 		   proto->obj_size,
4301 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4302 		   sock_prot_memory_allocated(proto),
4303 		   sock_prot_memory_pressure(proto),
4304 		   proto->max_header,
4305 		   proto->slab == NULL ? "no" : "yes",
4306 		   module_name(proto->owner),
4307 		   proto_method_implemented(proto->close),
4308 		   proto_method_implemented(proto->connect),
4309 		   proto_method_implemented(proto->disconnect),
4310 		   proto_method_implemented(proto->accept),
4311 		   proto_method_implemented(proto->ioctl),
4312 		   proto_method_implemented(proto->init),
4313 		   proto_method_implemented(proto->destroy),
4314 		   proto_method_implemented(proto->shutdown),
4315 		   proto_method_implemented(proto->setsockopt),
4316 		   proto_method_implemented(proto->getsockopt),
4317 		   proto_method_implemented(proto->sendmsg),
4318 		   proto_method_implemented(proto->recvmsg),
4319 		   proto_method_implemented(proto->bind),
4320 		   proto_method_implemented(proto->backlog_rcv),
4321 		   proto_method_implemented(proto->hash),
4322 		   proto_method_implemented(proto->unhash),
4323 		   proto_method_implemented(proto->get_port),
4324 		   proto_method_implemented(proto->enter_memory_pressure));
4325 }
4326 
4327 static int proto_seq_show(struct seq_file *seq, void *v)
4328 {
4329 	if (v == &proto_list)
4330 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4331 			   "protocol",
4332 			   "size",
4333 			   "sockets",
4334 			   "memory",
4335 			   "press",
4336 			   "maxhdr",
4337 			   "slab",
4338 			   "module",
4339 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4340 	else
4341 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4342 	return 0;
4343 }
4344 
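/* Schematic /proc/net/protocols row as produced by the two functions above
 * (values are placeholders, the trailing y/n flags follow the
 * "cl co di ... em" legend in the header):
 *
 *	<name>  <size> <sockets> <memory> <press> <maxhdr> <slab> <module> y n ...
 */
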
4345 static const struct seq_operations proto_seq_ops = {
4346 	.start  = proto_seq_start,
4347 	.next   = proto_seq_next,
4348 	.stop   = proto_seq_stop,
4349 	.show   = proto_seq_show,
4350 };
4351 
4352 static __net_init int proto_init_net(struct net *net)
4353 {
4354 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4355 			sizeof(struct seq_net_private)))
4356 		return -ENOMEM;
4357 
4358 	return 0;
4359 }
4360 
4361 static __net_exit void proto_exit_net(struct net *net)
4362 {
4363 	remove_proc_entry("protocols", net->proc_net);
4364 }
4365 
4367 static __net_initdata struct pernet_operations proto_net_ops = {
4368 	.init = proto_init_net,
4369 	.exit = proto_exit_net,
4370 };
4371 
4372 static int __init proto_init(void)
4373 {
4374 	return register_pernet_subsys(&proto_net_ops);
4375 }
4376 
4377 subsys_initcall(proto_init);
4378 
4379 #endif /* PROC_FS */
4380 
4381 #ifdef CONFIG_NET_RX_BUSY_POLL
4382 bool sk_busy_loop_end(void *p, unsigned long start_time)
4383 {
4384 	struct sock *sk = p;
4385 
4386 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4387 		return true;
4388 
4389 	if (sk_is_udp(sk) &&
4390 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4391 		return true;
4392 
4393 	return sk_busy_loop_timeout(sk, start_time);
4394 }
4395 EXPORT_SYMBOL(sk_busy_loop_end);
4396 #endif /* CONFIG_NET_RX_BUSY_POLL */
4397 
4398 int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len)
4399 {
4400 	if (!sk->sk_prot->bind_add)
4401 		return -EOPNOTSUPP;
4402 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4403 }
4404 EXPORT_SYMBOL(sock_bind_add);
4405 
4406 /* Copy 'size' bytes from userspace, run the ioctl and copy 'size' bytes back to userspace */
4407 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4408 		     void __user *arg, void *karg, size_t size)
4409 {
4410 	int ret;
4411 
4412 	if (copy_from_user(karg, arg, size))
4413 		return -EFAULT;
4414 
4415 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4416 	if (ret)
4417 		return ret;
4418 
4419 	if (copy_to_user(arg, karg, size))
4420 		return -EFAULT;
4421 
4422 	return 0;
4423 }
4424 EXPORT_SYMBOL(sock_ioctl_inout);
4425 
4426 /* This is the most common ioctl prep function, where the result (4 bytes) is
4427  * copied back to userspace if the ioctl() returns successfully. No input is
4428  * copied from userspace.
4429  */
4430 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4431 {
4432 	int ret, karg = 0;
4433 
4434 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4435 	if (ret)
4436 		return ret;
4437 
4438 	return put_user(karg, (int __user *)arg);
4439 }
4440 
4441 /* A wrapper around sock ioctls, which copies the data from userspace
4442  * (depending on the protocol/ioctl), and copies back the result to userspace.
4443  * The main motivation for this function is to pass kernel memory to the
4444  * protocol ioctl callbacks, instead of userspace memory.
4445  */
4446 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4447 {
4448 	int rc = 1;
4449 
4450 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4451 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4452 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4453 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4454 	else if (sk_is_phonet(sk))
4455 		rc = phonet_sk_ioctl(sk, cmd, arg);
4456 
4457 	/* If the ioctl was processed, return its value */
4458 	if (rc <= 0)
4459 		return rc;
4460 
4461 	/* Otherwise call the default handler */
4462 	return sock_ioctl_out(sk, cmd, arg);
4463 }
4464 EXPORT_SYMBOL(sk_ioctl);
4465 
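/* Example (illustrative sketch): for families without a special case above,
 * sk_ioctl() falls through to sock_ioctl_out(), which is how e.g. a plain
 *
 *	int pending;
 *
 *	ioctl(fd, SIOCINQ, &pending);
 *
 * on a TCP or UDP socket typically gets its 4-byte result copied back.
 */
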
4466 static int __init sock_struct_check(void)
4467 {
4468 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4469 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4470 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4471 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4472 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4473 
4474 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4475 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4476 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4477 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4478 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4479 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4480 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4481 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4482 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4483 
4484 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4485 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4486 #ifdef CONFIG_MEMCG
4487 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4488 #endif
4489 
4490 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4491 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4492 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4493 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4494 
4495 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4497 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft);
4498 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4499 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4500 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4501 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4502 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4503 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4504 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4505 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4506 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4507 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4508 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4509 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4510 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4511 
4512 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4513 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4514 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4515 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4516 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid);
4517 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol);
4518 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4519 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4520 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4521 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4522 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4523 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4524 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf);
4525 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4526 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4527 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4528 	return 0;
4529 }
4530 
4531 core_initcall(sock_struct_check);
4532