xref: /linux/net/core/sock.c (revision 8f7aa3d3c7323f4ca2768a9e74ebbe359c4f8f88)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138 
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142 
143 #include <trace/events/sock.h>
144 
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148 
149 #include <linux/ethtool.h>
150 
151 #include <uapi/linux/pidfd.h>
152 
153 #include "dev.h"
154 
155 static DEFINE_MUTEX(proto_list_mutex);
156 static LIST_HEAD(proto_list);
157 
158 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc);
159 static void sock_def_write_space(struct sock *sk);
160 
161 /**
162  * sk_ns_capable - General socket capability test
163  * @sk: Socket to use a capability on or through
164  * @user_ns: The user namespace of the capability to use
165  * @cap: The capability to use
166  *
167  * Test to see if the opener of the socket had the capability @cap when
168  * the socket was created and the current process has the capability
169  * @cap in the user namespace @user_ns.
170  */
171 bool sk_ns_capable(const struct sock *sk,
172 		   struct user_namespace *user_ns, int cap)
173 {
174 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
175 		ns_capable(user_ns, cap);
176 }
177 EXPORT_SYMBOL(sk_ns_capable);
178 
179 /**
180  * sk_capable - Socket global capability test
181  * @sk: Socket to use a capability on or through
182  * @cap: The global capability to use
183  *
184  * Test to see if the opener of the socket had the capability @cap when
185  * the socket was created and the current process has the capability
186  * @cap in all user namespaces.
187  */
188 bool sk_capable(const struct sock *sk, int cap)
189 {
190 	return sk_ns_capable(sk, &init_user_ns, cap);
191 }
192 EXPORT_SYMBOL(sk_capable);
193 
194 /**
195  * sk_net_capable - Network namespace socket capability test
196  * @sk: Socket to use a capability on or through
197  * @cap: The capability to use
198  *
199  * Test to see if the opener of the socket had the capability @cap when
200  * the socket was created and the current process has the capability @cap
201  * over the network namespace the socket is a member of.
202  */
203 bool sk_net_capable(const struct sock *sk, int cap)
204 {
205 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
206 }
207 EXPORT_SYMBOL(sk_net_capable);
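
/*
 * Illustrative sketch (not taken from an in-tree user): a protocol's
 * setsockopt() handler could gate a privileged, hypothetical option on the
 * helpers above.  "MYPROTO_SO_PRIV" and myproto_enable_priv() are made-up
 * names used only for illustration.
 *
 *	case MYPROTO_SO_PRIV:
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		return myproto_enable_priv(sk, val);
 *
 * sk_capable() checks against &init_user_ns, sk_ns_capable() against an
 * explicit user namespace, and sk_net_capable() against the socket's own
 * network namespace; most protocol code wants the latter.
 */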
208 
209 /*
210  * Each address family might have different locking rules, so we have
211  * one slock key per address family and separate keys for internal and
212  * userspace sockets.
213  */
214 static struct lock_class_key af_family_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_keys[AF_MAX];
216 static struct lock_class_key af_family_slock_keys[AF_MAX];
217 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
218 
219 /*
220  * Make lock validator output more readable. (We pre-construct these
221  * strings at build time, so that runtime initialization of socket
222  * locks is fast):
223  */
224 
225 #define _sock_locks(x)						  \
226   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
227   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
228   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
229   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
230   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
231   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
232   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
233   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
234   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
235   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
236   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
237   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
238   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
239   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
240   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
241   x "AF_MCTP"  , \
242   x "AF_MAX"
243 
244 static const char *const af_family_key_strings[AF_MAX+1] = {
245 	_sock_locks("sk_lock-")
246 };
247 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
248 	_sock_locks("slock-")
249 };
250 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
251 	_sock_locks("clock-")
252 };
253 
254 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
255 	_sock_locks("k-sk_lock-")
256 };
257 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
258 	_sock_locks("k-slock-")
259 };
260 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
261 	_sock_locks("k-clock-")
262 };
263 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
264 	_sock_locks("rlock-")
265 };
266 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
267 	_sock_locks("wlock-")
268 };
269 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
270 	_sock_locks("elock-")
271 };
272 
273 /*
274  * sk_callback_lock and sk queues locking rules are per-address-family,
275  * so split the lock classes by using a per-AF key:
276  */
277 static struct lock_class_key af_callback_keys[AF_MAX];
278 static struct lock_class_key af_rlock_keys[AF_MAX];
279 static struct lock_class_key af_wlock_keys[AF_MAX];
280 static struct lock_class_key af_elock_keys[AF_MAX];
281 static struct lock_class_key af_kern_callback_keys[AF_MAX];
282 
283 /* Run time adjustable parameters. */
284 __u32 sysctl_wmem_max __read_mostly = 4 << 20;
285 EXPORT_SYMBOL(sysctl_wmem_max);
286 __u32 sysctl_rmem_max __read_mostly = 4 << 20;
287 EXPORT_SYMBOL(sysctl_rmem_max);
288 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT;
289 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT;
290 
291 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
292 EXPORT_SYMBOL_GPL(memalloc_socks_key);
293 
294 /**
295  * sk_set_memalloc - sets %SOCK_MEMALLOC
296  * @sk: socket to set it on
297  *
298  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
299  * It's the responsibility of the admin to adjust min_free_kbytes
300  * to meet the requirements.
301  */
302 void sk_set_memalloc(struct sock *sk)
303 {
304 	sock_set_flag(sk, SOCK_MEMALLOC);
305 	sk->sk_allocation |= __GFP_MEMALLOC;
306 	static_branch_inc(&memalloc_socks_key);
307 }
308 EXPORT_SYMBOL_GPL(sk_set_memalloc);
309 
310 void sk_clear_memalloc(struct sock *sk)
311 {
312 	sock_reset_flag(sk, SOCK_MEMALLOC);
313 	sk->sk_allocation &= ~__GFP_MEMALLOC;
314 	static_branch_dec(&memalloc_socks_key);
315 
316 	/*
317 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
318 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
319 	 * it has rmem allocations due to the last swapfile being deactivated
320 	 * but there is a risk that the socket is unusable due to exceeding
321 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
322 	 */
323 	sk_mem_reclaim(sk);
324 }
325 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
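
/*
 * Illustrative sketch (assumption-laden, not taken from an in-tree user):
 * a swap-over-network style backend would typically mark its transport
 * socket as %SOCK_MEMALLOC for as long as it may be needed to make forward
 * progress under memory pressure, and clear it again on teardown:
 *
 *	sk_set_memalloc(conn->sock->sk);
 *	... socket is used to write out pages under memory pressure ...
 *	sk_clear_memalloc(conn->sock->sk);
 *
 * "conn" is a hypothetical per-connection structure; the real users of
 * these helpers live in drivers and filesystems, not in this file.
 */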
326 
327 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
328 {
329 	int ret;
330 	unsigned int noreclaim_flag;
331 
332 	/* these should have been dropped before queueing */
333 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
334 
335 	noreclaim_flag = memalloc_noreclaim_save();
336 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
337 				 tcp_v6_do_rcv,
338 				 tcp_v4_do_rcv,
339 				 sk, skb);
340 	memalloc_noreclaim_restore(noreclaim_flag);
341 
342 	return ret;
343 }
344 EXPORT_SYMBOL(__sk_backlog_rcv);
345 
346 void sk_error_report(struct sock *sk)
347 {
348 	sk->sk_error_report(sk);
349 
350 	switch (sk->sk_family) {
351 	case AF_INET:
352 		fallthrough;
353 	case AF_INET6:
354 		trace_inet_sk_error_report(sk);
355 		break;
356 	default:
357 		break;
358 	}
359 }
360 EXPORT_SYMBOL(sk_error_report);
361 
362 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
363 {
364 	struct __kernel_sock_timeval tv;
365 
366 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
367 		tv.tv_sec = 0;
368 		tv.tv_usec = 0;
369 	} else {
370 		tv.tv_sec = timeo / HZ;
371 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
372 	}
373 
374 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
375 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
376 		*(struct old_timeval32 *)optval = tv32;
377 		return sizeof(tv32);
378 	}
379 
380 	if (old_timeval) {
381 		struct __kernel_old_timeval old_tv;
382 		old_tv.tv_sec = tv.tv_sec;
383 		old_tv.tv_usec = tv.tv_usec;
384 		*(struct __kernel_old_timeval *)optval = old_tv;
385 		return sizeof(old_tv);
386 	}
387 
388 	*(struct __kernel_sock_timeval *)optval = tv;
389 	return sizeof(tv);
390 }
391 EXPORT_SYMBOL(sock_get_timeout);
392 
393 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
394 			   sockptr_t optval, int optlen, bool old_timeval)
395 {
396 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
397 		struct old_timeval32 tv32;
398 
399 		if (optlen < sizeof(tv32))
400 			return -EINVAL;
401 
402 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
403 			return -EFAULT;
404 		tv->tv_sec = tv32.tv_sec;
405 		tv->tv_usec = tv32.tv_usec;
406 	} else if (old_timeval) {
407 		struct __kernel_old_timeval old_tv;
408 
409 		if (optlen < sizeof(old_tv))
410 			return -EINVAL;
411 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
412 			return -EFAULT;
413 		tv->tv_sec = old_tv.tv_sec;
414 		tv->tv_usec = old_tv.tv_usec;
415 	} else {
416 		if (optlen < sizeof(*tv))
417 			return -EINVAL;
418 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
419 			return -EFAULT;
420 	}
421 
422 	return 0;
423 }
424 EXPORT_SYMBOL(sock_copy_user_timeval);
425 
426 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
427 			    bool old_timeval)
428 {
429 	struct __kernel_sock_timeval tv;
430 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
431 	long val;
432 
433 	if (err)
434 		return err;
435 
436 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
437 		return -EDOM;
438 
439 	if (tv.tv_sec < 0) {
440 		static int warned __read_mostly;
441 
442 		WRITE_ONCE(*timeo_p, 0);
443 		if (warned < 10 && net_ratelimit()) {
444 			warned++;
445 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
446 				__func__, current->comm, task_pid_nr(current));
447 		}
448 		return 0;
449 	}
450 	val = MAX_SCHEDULE_TIMEOUT;
451 	if ((tv.tv_sec || tv.tv_usec) &&
452 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
453 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
454 						    USEC_PER_SEC / HZ);
455 	WRITE_ONCE(*timeo_p, val);
456 	return 0;
457 }
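
/*
 * Userspace view of the conversion above (a minimal sketch, assuming the
 * libc maps SO_RCVTIMEO to one of the *_OLD/*_NEW variants handled here,
 * and that "fd" is an already-created socket): the timeval is converted to
 * jiffies, a {0, 0} timeout means "never time out", and a negative tv_sec
 * is accepted but silently treated as an immediate (zero) timeout with a
 * rate-limited warning.
 *
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 *
 * recv() on "fd" now fails with EAGAIN/EWOULDBLOCK after roughly 2.5 s.
 */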
458 
459 static bool sk_set_prio_allowed(const struct sock *sk, int val)
460 {
461 	return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
462 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
463 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
464 }
465 
466 static bool sock_needs_netstamp(const struct sock *sk)
467 {
468 	switch (sk->sk_family) {
469 	case AF_UNSPEC:
470 	case AF_UNIX:
471 		return false;
472 	default:
473 		return true;
474 	}
475 }
476 
477 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
478 {
479 	if (sk->sk_flags & flags) {
480 		sk->sk_flags &= ~flags;
481 		if (sock_needs_netstamp(sk) &&
482 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
483 			net_disable_timestamp();
484 	}
485 }
486 
487 
488 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
489 {
490 	unsigned long flags;
491 	struct sk_buff_head *list = &sk->sk_receive_queue;
492 
493 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
494 		sk_drops_inc(sk);
495 		trace_sock_rcvqueue_full(sk, skb);
496 		return -ENOMEM;
497 	}
498 
499 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
500 		sk_drops_inc(sk);
501 		return -ENOBUFS;
502 	}
503 
504 	skb->dev = NULL;
505 	skb_set_owner_r(skb, sk);
506 
507 	/* We escape from the RCU-protected region here, so make sure we
508 	 * don't leak a dst that is not refcounted.
509 	 */
510 	skb_dst_force(skb);
511 
512 	spin_lock_irqsave(&list->lock, flags);
513 	sock_skb_set_dropcount(sk, skb);
514 	__skb_queue_tail(list, skb);
515 	spin_unlock_irqrestore(&list->lock, flags);
516 
517 	if (!sock_flag(sk, SOCK_DEAD))
518 		sk->sk_data_ready(sk);
519 	return 0;
520 }
521 EXPORT_SYMBOL(__sock_queue_rcv_skb);
522 
523 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
524 			      enum skb_drop_reason *reason)
525 {
526 	enum skb_drop_reason drop_reason;
527 	int err;
528 
529 	err = sk_filter_reason(sk, skb, &drop_reason);
530 	if (err)
531 		goto out;
532 
533 	err = __sock_queue_rcv_skb(sk, skb);
534 	switch (err) {
535 	case -ENOMEM:
536 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
537 		break;
538 	case -ENOBUFS:
539 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
540 		break;
541 	default:
542 		drop_reason = SKB_NOT_DROPPED_YET;
543 		break;
544 	}
545 out:
546 	if (reason)
547 		*reason = drop_reason;
548 	return err;
549 }
550 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
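
/*
 * Illustrative sketch of a caller (a hypothetical protocol, not in-tree):
 * queue an skb to the owning socket and report a precise drop reason when
 * the receive queue or protocol memory limits reject it.
 *
 *	static int myproto_queue(struct sock *sk, struct sk_buff *skb)
 *	{
 *		enum skb_drop_reason reason;
 *
 *		if (sock_queue_rcv_skb_reason(sk, skb, &reason)) {
 *			kfree_skb_reason(skb, reason);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */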
551 
552 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
553 		     const int nested, unsigned int trim_cap, bool refcounted)
554 {
555 	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
556 	int rc = NET_RX_SUCCESS;
557 	int err;
558 
559 	if (sk_filter_trim_cap(sk, skb, trim_cap, &reason))
560 		goto discard_and_relse;
561 
562 	skb->dev = NULL;
563 
564 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
565 		sk_drops_inc(sk);
566 		reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
567 		goto discard_and_relse;
568 	}
569 	if (nested)
570 		bh_lock_sock_nested(sk);
571 	else
572 		bh_lock_sock(sk);
573 	if (!sock_owned_by_user(sk)) {
574 		/*
575 		 * trylock + unlock semantics:
576 		 */
577 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
578 
579 		rc = sk_backlog_rcv(sk, skb);
580 
581 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
582 	} else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) {
583 		bh_unlock_sock(sk);
584 		if (err == -ENOMEM)
585 			reason = SKB_DROP_REASON_PFMEMALLOC;
586 		if (err == -ENOBUFS)
587 			reason = SKB_DROP_REASON_SOCKET_BACKLOG;
588 		sk_drops_inc(sk);
589 		goto discard_and_relse;
590 	}
591 
592 	bh_unlock_sock(sk);
593 out:
594 	if (refcounted)
595 		sock_put(sk);
596 	return rc;
597 discard_and_relse:
598 	sk_skb_reason_drop(sk, skb, reason);
599 	goto out;
600 }
601 EXPORT_SYMBOL(__sk_receive_skb);
602 
603 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
604 							  u32));
605 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
606 							   u32));
607 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
608 {
609 	struct dst_entry *dst = __sk_dst_get(sk);
610 
611 	if (dst && READ_ONCE(dst->obsolete) &&
612 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
613 			       dst, cookie) == NULL) {
614 		sk_tx_queue_clear(sk);
615 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
616 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
617 		dst_release(dst);
618 		return NULL;
619 	}
620 
621 	return dst;
622 }
623 EXPORT_SYMBOL(__sk_dst_check);
624 
625 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
626 {
627 	struct dst_entry *dst = sk_dst_get(sk);
628 
629 	if (dst && READ_ONCE(dst->obsolete) &&
630 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
631 			       dst, cookie) == NULL) {
632 		sk_dst_reset(sk);
633 		dst_release(dst);
634 		return NULL;
635 	}
636 
637 	return dst;
638 }
639 EXPORT_SYMBOL(sk_dst_check);
640 
641 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
642 {
643 	int ret = -ENOPROTOOPT;
644 #ifdef CONFIG_NETDEVICES
645 	struct net *net = sock_net(sk);
646 
647 	/* Sorry... */
648 	ret = -EPERM;
649 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
650 		goto out;
651 
652 	ret = -EINVAL;
653 	if (ifindex < 0)
654 		goto out;
655 
656 	/* Paired with all READ_ONCE() done locklessly. */
657 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
658 
659 	if (sk->sk_prot->rehash)
660 		sk->sk_prot->rehash(sk);
661 	sk_dst_reset(sk);
662 
663 	ret = 0;
664 
665 out:
666 #endif
667 
668 	return ret;
669 }
670 
671 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
672 {
673 	int ret;
674 
675 	if (lock_sk)
676 		lock_sock(sk);
677 	ret = sock_bindtoindex_locked(sk, ifindex);
678 	if (lock_sk)
679 		release_sock(sk);
680 
681 	return ret;
682 }
683 EXPORT_SYMBOL(sock_bindtoindex);
684 
685 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
686 {
687 	int ret = -ENOPROTOOPT;
688 #ifdef CONFIG_NETDEVICES
689 	struct net *net = sock_net(sk);
690 	char devname[IFNAMSIZ];
691 	int index;
692 
693 	ret = -EINVAL;
694 	if (optlen < 0)
695 		goto out;
696 
697 	/* Bind this socket to a particular device like "eth0",
698 	 * as specified in the passed interface name. If the
699 	 * name is "" or the option length is zero, the socket
700 	 * is not bound.
701 	 */
702 	if (optlen > IFNAMSIZ - 1)
703 		optlen = IFNAMSIZ - 1;
704 	memset(devname, 0, sizeof(devname));
705 
706 	ret = -EFAULT;
707 	if (copy_from_sockptr(devname, optval, optlen))
708 		goto out;
709 
710 	index = 0;
711 	if (devname[0] != '\0') {
712 		struct net_device *dev;
713 
714 		rcu_read_lock();
715 		dev = dev_get_by_name_rcu(net, devname);
716 		if (dev)
717 			index = dev->ifindex;
718 		rcu_read_unlock();
719 		ret = -ENODEV;
720 		if (!dev)
721 			goto out;
722 	}
723 
724 	sockopt_lock_sock(sk);
725 	ret = sock_bindtoindex_locked(sk, index);
726 	sockopt_release_sock(sk);
727 out:
728 #endif
729 
730 	return ret;
731 }
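
/*
 * Userspace view (a minimal sketch, "fd" being an existing socket): bind
 * all traffic of a socket to one interface, then remove the binding again
 * by passing an empty name.  Re-binding or un-binding a socket that is
 * already bound to a device requires CAP_NET_RAW in the socket's network
 * namespace, as enforced by sock_bindtoindex_locked() above.
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *	...
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */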
732 
733 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
734 				sockptr_t optlen, int len)
735 {
736 	int ret = -ENOPROTOOPT;
737 #ifdef CONFIG_NETDEVICES
738 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
739 	struct net *net = sock_net(sk);
740 	char devname[IFNAMSIZ];
741 
742 	if (bound_dev_if == 0) {
743 		len = 0;
744 		goto zero;
745 	}
746 
747 	ret = -EINVAL;
748 	if (len < IFNAMSIZ)
749 		goto out;
750 
751 	ret = netdev_get_name(net, devname, bound_dev_if);
752 	if (ret)
753 		goto out;
754 
755 	len = strlen(devname) + 1;
756 
757 	ret = -EFAULT;
758 	if (copy_to_sockptr(optval, devname, len))
759 		goto out;
760 
761 zero:
762 	ret = -EFAULT;
763 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
764 		goto out;
765 
766 	ret = 0;
767 
768 out:
769 #endif
770 
771 	return ret;
772 }
773 
774 bool sk_mc_loop(const struct sock *sk)
775 {
776 	if (dev_recursion_level())
777 		return false;
778 	if (!sk)
779 		return true;
780 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
781 	switch (READ_ONCE(sk->sk_family)) {
782 	case AF_INET:
783 		return inet_test_bit(MC_LOOP, sk);
784 #if IS_ENABLED(CONFIG_IPV6)
785 	case AF_INET6:
786 		return inet6_test_bit(MC6_LOOP, sk);
787 #endif
788 	}
789 	WARN_ON_ONCE(1);
790 	return true;
791 }
792 EXPORT_SYMBOL(sk_mc_loop);
793 
794 void sock_set_reuseaddr(struct sock *sk)
795 {
796 	lock_sock(sk);
797 	sk->sk_reuse = SK_CAN_REUSE;
798 	release_sock(sk);
799 }
800 EXPORT_SYMBOL(sock_set_reuseaddr);
801 
802 void sock_set_reuseport(struct sock *sk)
803 {
804 	lock_sock(sk);
805 	sk->sk_reuseport = true;
806 	release_sock(sk);
807 }
808 EXPORT_SYMBOL(sock_set_reuseport);
809 
810 void sock_no_linger(struct sock *sk)
811 {
812 	lock_sock(sk);
813 	WRITE_ONCE(sk->sk_lingertime, 0);
814 	sock_set_flag(sk, SOCK_LINGER);
815 	release_sock(sk);
816 }
817 EXPORT_SYMBOL(sock_no_linger);
818 
819 void sock_set_priority(struct sock *sk, u32 priority)
820 {
821 	WRITE_ONCE(sk->sk_priority, priority);
822 }
823 EXPORT_SYMBOL(sock_set_priority);
824 
825 void sock_set_sndtimeo(struct sock *sk, s64 secs)
826 {
827 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
828 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
829 	else
830 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
831 }
832 EXPORT_SYMBOL(sock_set_sndtimeo);
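
/*
 * Illustrative sketch of how the in-kernel setters above are meant to be
 * used (hypothetical caller; real users are drivers and filesystems that
 * create their own kernel sockets).  "net" is an assumed struct net:
 *
 *	struct socket *sock;
 *	int err;
 *
 *	err = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
 *	if (err)
 *		return err;
 *	sock_set_reuseaddr(sock->sk);
 *	sock_no_linger(sock->sk);
 *	sock_set_priority(sock->sk, TC_PRIO_CONTROL);
 *	sock_set_sndtimeo(sock->sk, 5);		(5 second send timeout)
 */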
833 
834 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
835 {
836 	sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
837 	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
838 	if (val)  {
839 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
840 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
841 	}
842 }
843 
844 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
845 {
846 	switch (optname) {
847 	case SO_TIMESTAMP_OLD:
848 		__sock_set_timestamps(sk, valbool, false, false);
849 		break;
850 	case SO_TIMESTAMP_NEW:
851 		__sock_set_timestamps(sk, valbool, true, false);
852 		break;
853 	case SO_TIMESTAMPNS_OLD:
854 		__sock_set_timestamps(sk, valbool, false, true);
855 		break;
856 	case SO_TIMESTAMPNS_NEW:
857 		__sock_set_timestamps(sk, valbool, true, true);
858 		break;
859 	}
860 }
861 
862 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
863 {
864 	struct net *net = sock_net(sk);
865 	struct net_device *dev = NULL;
866 	bool match = false;
867 	int *vclock_index;
868 	int i, num;
869 
870 	if (sk->sk_bound_dev_if)
871 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
872 
873 	if (!dev) {
874 		pr_err("%s: socket is not bound to a device\n", __func__);
875 		return -EOPNOTSUPP;
876 	}
877 
878 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
879 	dev_put(dev);
880 
881 	for (i = 0; i < num; i++) {
882 		if (*(vclock_index + i) == phc_index) {
883 			match = true;
884 			break;
885 		}
886 	}
887 
888 	if (num > 0)
889 		kfree(vclock_index);
890 
891 	if (!match)
892 		return -EINVAL;
893 
894 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
895 
896 	return 0;
897 }
898 
899 int sock_set_timestamping(struct sock *sk, int optname,
900 			  struct so_timestamping timestamping)
901 {
902 	int val = timestamping.flags;
903 	int ret;
904 
905 	if (val & ~SOF_TIMESTAMPING_MASK)
906 		return -EINVAL;
907 
908 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
909 	    !(val & SOF_TIMESTAMPING_OPT_ID))
910 		return -EINVAL;
911 
912 	if (val & SOF_TIMESTAMPING_OPT_ID &&
913 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
914 		if (sk_is_tcp(sk)) {
915 			if ((1 << sk->sk_state) &
916 			    (TCPF_CLOSE | TCPF_LISTEN))
917 				return -EINVAL;
918 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
919 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
920 			else
921 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
922 		} else {
923 			atomic_set(&sk->sk_tskey, 0);
924 		}
925 	}
926 
927 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
928 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
929 		return -EINVAL;
930 
931 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
932 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
933 		if (ret)
934 			return ret;
935 	}
936 
937 	WRITE_ONCE(sk->sk_tsflags, val);
938 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
939 	sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));
940 
941 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
942 		sock_enable_timestamp(sk,
943 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
944 	else
945 		sock_disable_timestamp(sk,
946 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
947 	return 0;
948 }
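
/*
 * Userspace view (a minimal sketch, "fd" being an existing socket): enable
 * software RX/TX timestamps.  Binding timestamping to a PHC vclock via
 * SOF_TIMESTAMPING_BIND_PHC additionally requires the socket to be bound
 * to a device, as enforced by sock_timestamping_bind_phc() above.
 *
 *	#include <linux/net_tstamp.h>
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_RX_SOFTWARE |
 *			 SOF_TIMESTAMPING_TX_SOFTWARE |
 *			 SOF_TIMESTAMPING_SOFTWARE,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 *
 * Passing only an int carrying the flags (the pre-PHC ABI) is still
 * accepted; see the optlen check in sk_setsockopt().
 */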
949 
950 #if defined(CONFIG_CGROUP_BPF)
951 void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
952 {
953 	struct bpf_sock_ops_kern sock_ops;
954 
955 	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
956 	sock_ops.op = op;
957 	sock_ops.is_fullsock = 1;
958 	sock_ops.sk = sk;
959 	bpf_skops_init_skb(&sock_ops, skb, 0);
960 	__cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
961 }
962 #endif
963 
964 void sock_set_keepalive(struct sock *sk)
965 {
966 	lock_sock(sk);
967 	if (sk->sk_prot->keepalive)
968 		sk->sk_prot->keepalive(sk, true);
969 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
970 	release_sock(sk);
971 }
972 EXPORT_SYMBOL(sock_set_keepalive);
973 
974 static void __sock_set_rcvbuf(struct sock *sk, int val)
975 {
976 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
977 	 * as a negative value.
978 	 */
979 	val = min_t(int, val, INT_MAX / 2);
980 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
981 
982 	/* We double it on the way in to account for "struct sk_buff" etc.
983 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
984 	 * will allow that much actual data to be received on that socket.
985 	 *
986 	 * Applications are unaware that "struct sk_buff" and other overheads
987 	 * allocate from the receive buffer during socket buffer allocation.
988 	 *
989 	 * And after considering the possible alternatives, returning the value
990 	 * we actually used in getsockopt is the most desirable behavior.
991 	 */
992 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
993 }
994 
995 void sock_set_rcvbuf(struct sock *sk, int val)
996 {
997 	lock_sock(sk);
998 	__sock_set_rcvbuf(sk, val);
999 	release_sock(sk);
1000 }
1001 EXPORT_SYMBOL(sock_set_rcvbuf);
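
/*
 * Userspace view of the doubling described above (a minimal sketch; the
 * exact numbers depend on net.core.rmem_max and SOCK_MIN_RCVBUF, and "fd"
 * is an assumed socket descriptor):
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *
 * "out" is now typically 131072: the requested value doubled to cover
 * sk_buff and bookkeeping overhead.
 */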
1002 
1003 static void __sock_set_mark(struct sock *sk, u32 val)
1004 {
1005 	if (val != sk->sk_mark) {
1006 		WRITE_ONCE(sk->sk_mark, val);
1007 		sk_dst_reset(sk);
1008 	}
1009 }
1010 
1011 void sock_set_mark(struct sock *sk, u32 val)
1012 {
1013 	lock_sock(sk);
1014 	__sock_set_mark(sk, val);
1015 	release_sock(sk);
1016 }
1017 EXPORT_SYMBOL(sock_set_mark);
1018 
1019 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1020 {
1021 	/* Round down bytes to multiple of pages */
1022 	bytes = round_down(bytes, PAGE_SIZE);
1023 
1024 	WARN_ON(bytes > sk->sk_reserved_mem);
1025 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1026 	sk_mem_reclaim(sk);
1027 }
1028 
1029 static int sock_reserve_memory(struct sock *sk, int bytes)
1030 {
1031 	long allocated;
1032 	bool charged;
1033 	int pages;
1034 
1035 	if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk))
1036 		return -EOPNOTSUPP;
1037 
1038 	if (!bytes)
1039 		return 0;
1040 
1041 	pages = sk_mem_pages(bytes);
1042 
1043 	/* pre-charge to memcg */
1044 	charged = mem_cgroup_sk_charge(sk, pages,
1045 				       GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1046 	if (!charged)
1047 		return -ENOMEM;
1048 
1049 	if (sk->sk_bypass_prot_mem)
1050 		goto success;
1051 
1052 	/* pre-charge to forward_alloc */
1053 	sk_memory_allocated_add(sk, pages);
1054 	allocated = sk_memory_allocated(sk);
1055 
1056 	/* If the system goes into memory pressure with this
1057 	 * precharge, give up and return error.
1058 	 */
1059 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1060 		sk_memory_allocated_sub(sk, pages);
1061 		mem_cgroup_sk_uncharge(sk, pages);
1062 		return -ENOMEM;
1063 	}
1064 
1065 success:
1066 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1067 
1068 	WRITE_ONCE(sk->sk_reserved_mem,
1069 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1070 
1071 	return 0;
1072 }
1073 
1074 #ifdef CONFIG_PAGE_POOL
1075 
1076 /* This is the number of tokens and frags that the user can pass to
1077  * SO_DEVMEM_DONTNEED in one syscall. The limit exists to bound the amount of
1078  * memory the kernel allocates to copy these tokens, and to prevent looping
1079  * over the frags for too long.
1080  */
1081 #define MAX_DONTNEED_TOKENS 128
1082 #define MAX_DONTNEED_FRAGS 1024
1083 
1084 static noinline_for_stack int
1085 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1086 {
1087 	unsigned int num_tokens, i, j, k, netmem_num = 0;
1088 	struct dmabuf_token *tokens;
1089 	int ret = 0, num_frags = 0;
1090 	netmem_ref netmems[16];
1091 
1092 	if (!sk_is_tcp(sk))
1093 		return -EBADF;
1094 
1095 	if (optlen % sizeof(*tokens) ||
1096 	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1097 		return -EINVAL;
1098 
1099 	num_tokens = optlen / sizeof(*tokens);
1100 	tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1101 	if (!tokens)
1102 		return -ENOMEM;
1103 
1104 	if (copy_from_sockptr(tokens, optval, optlen)) {
1105 		kvfree(tokens);
1106 		return -EFAULT;
1107 	}
1108 
1109 	xa_lock_bh(&sk->sk_user_frags);
1110 	for (i = 0; i < num_tokens; i++) {
1111 		for (j = 0; j < tokens[i].token_count; j++) {
1112 			if (++num_frags > MAX_DONTNEED_FRAGS)
1113 				goto frag_limit_reached;
1114 
1115 			netmem_ref netmem = (__force netmem_ref)__xa_erase(
1116 				&sk->sk_user_frags, tokens[i].token_start + j);
1117 
1118 			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1119 				continue;
1120 
1121 			netmems[netmem_num++] = netmem;
1122 			if (netmem_num == ARRAY_SIZE(netmems)) {
1123 				xa_unlock_bh(&sk->sk_user_frags);
1124 				for (k = 0; k < netmem_num; k++)
1125 					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1126 				netmem_num = 0;
1127 				xa_lock_bh(&sk->sk_user_frags);
1128 			}
1129 			ret++;
1130 		}
1131 	}
1132 
1133 frag_limit_reached:
1134 	xa_unlock_bh(&sk->sk_user_frags);
1135 	for (k = 0; k < netmem_num; k++)
1136 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1137 
1138 	kvfree(tokens);
1139 	return ret;
1140 }
1141 #endif
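
/*
 * Userspace view (a sketch, assuming the devmem TCP receive flow described
 * in the kernel's devmem documentation, with struct dmabuf_token coming
 * from <linux/uio.h>; "fd", "first_frag_token" and "nr_frags" are assumed
 * to exist): once the application is done with fragments received into a
 * dma-buf, it returns them in batches.
 *
 *	struct dmabuf_token tok = {
 *		.token_start = first_frag_token,
 *		.token_count = nr_frags,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED, &tok, sizeof(tok));
 *
 * The call returns the number of fragments actually released, which may be
 * smaller than requested once MAX_DONTNEED_FRAGS is reached.
 */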
1142 
1143 void sockopt_lock_sock(struct sock *sk)
1144 {
1145 	/* When current->bpf_ctx is set, setsockopt() is being called from
1146 	 * a BPF program, which has already ensured that the sk lock is
1147 	 * held before calling setsockopt().
1148 	 */
1149 	if (has_current_bpf_ctx())
1150 		return;
1151 
1152 	lock_sock(sk);
1153 }
1154 EXPORT_SYMBOL(sockopt_lock_sock);
1155 
1156 void sockopt_release_sock(struct sock *sk)
1157 {
1158 	if (has_current_bpf_ctx())
1159 		return;
1160 
1161 	release_sock(sk);
1162 }
1163 EXPORT_SYMBOL(sockopt_release_sock);
1164 
1165 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1166 {
1167 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1168 }
1169 EXPORT_SYMBOL(sockopt_ns_capable);
1170 
1171 bool sockopt_capable(int cap)
1172 {
1173 	return has_current_bpf_ctx() || capable(cap);
1174 }
1175 EXPORT_SYMBOL(sockopt_capable);
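
/*
 * Illustrative pattern (a sketch of a hypothetical protocol-level option
 * handler; "myproto_setsockopt" is a made-up name): by using the
 * sockopt_*() wrappers instead of lock_sock()/capable() directly, the same
 * handler can be reached both from the setsockopt() syscall and from a BPF
 * program that already holds the socket lock.
 *
 *	static int myproto_setsockopt(struct sock *sk, int optname, int val)
 *	{
 *		int ret = 0;
 *
 *		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 *			return -EPERM;
 *
 *		sockopt_lock_sock(sk);
 *		... modify protocol state protected by the socket lock ...
 *		sockopt_release_sock(sk);
 *		return ret;
 *	}
 */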
1176 
1177 static int sockopt_validate_clockid(__kernel_clockid_t value)
1178 {
1179 	switch (value) {
1180 	case CLOCK_REALTIME:
1181 	case CLOCK_MONOTONIC:
1182 	case CLOCK_TAI:
1183 		return 0;
1184 	}
1185 	return -EINVAL;
1186 }
1187 
1188 /*
1189  *	This is meant for all protocols to use and covers goings on
1190  *	at the socket level. Everything here is generic.
1191  */
1192 
1193 int sk_setsockopt(struct sock *sk, int level, int optname,
1194 		  sockptr_t optval, unsigned int optlen)
1195 {
1196 	struct so_timestamping timestamping;
1197 	struct socket *sock = sk->sk_socket;
1198 	struct sock_txtime sk_txtime;
1199 	int val;
1200 	int valbool;
1201 	struct linger ling;
1202 	int ret = 0;
1203 
1204 	/*
1205 	 *	Options without arguments
1206 	 */
1207 
1208 	if (optname == SO_BINDTODEVICE)
1209 		return sock_setbindtodevice(sk, optval, optlen);
1210 
1211 	if (optlen < sizeof(int))
1212 		return -EINVAL;
1213 
1214 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1215 		return -EFAULT;
1216 
1217 	valbool = val ? 1 : 0;
1218 
1219 	/* handle options which do not require locking the socket. */
1220 	switch (optname) {
1221 	case SO_PRIORITY:
1222 		if (sk_set_prio_allowed(sk, val)) {
1223 			sock_set_priority(sk, val);
1224 			return 0;
1225 		}
1226 		return -EPERM;
1227 	case SO_TYPE:
1228 	case SO_PROTOCOL:
1229 	case SO_DOMAIN:
1230 	case SO_ERROR:
1231 		return -ENOPROTOOPT;
1232 #ifdef CONFIG_NET_RX_BUSY_POLL
1233 	case SO_BUSY_POLL:
1234 		if (val < 0)
1235 			return -EINVAL;
1236 		WRITE_ONCE(sk->sk_ll_usec, val);
1237 		return 0;
1238 	case SO_PREFER_BUSY_POLL:
1239 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1240 			return -EPERM;
1241 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1242 		return 0;
1243 	case SO_BUSY_POLL_BUDGET:
1244 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1245 		    !sockopt_capable(CAP_NET_ADMIN))
1246 			return -EPERM;
1247 		if (val < 0 || val > U16_MAX)
1248 			return -EINVAL;
1249 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1250 		return 0;
1251 #endif
1252 	case SO_MAX_PACING_RATE:
1253 		{
1254 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1255 		unsigned long pacing_rate;
1256 
1257 		if (sizeof(ulval) != sizeof(val) &&
1258 		    optlen >= sizeof(ulval) &&
1259 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1260 			return -EFAULT;
1261 		}
1262 		if (ulval != ~0UL)
1263 			cmpxchg(&sk->sk_pacing_status,
1264 				SK_PACING_NONE,
1265 				SK_PACING_NEEDED);
1266 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1267 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1268 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1269 		if (ulval < pacing_rate)
1270 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1271 		return 0;
1272 		}
1273 	case SO_TXREHASH:
1274 		if (!sk_is_tcp(sk))
1275 			return -EOPNOTSUPP;
1276 		if (val < -1 || val > 1)
1277 			return -EINVAL;
1278 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1279 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1280 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1281 		 * and sk_getsockopt().
1282 		 */
1283 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1284 		return 0;
1285 	case SO_PEEK_OFF:
1286 		{
1287 		int (*set_peek_off)(struct sock *sk, int val);
1288 
1289 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1290 		if (set_peek_off)
1291 			ret = set_peek_off(sk, val);
1292 		else
1293 			ret = -EOPNOTSUPP;
1294 		return ret;
1295 		}
1296 #ifdef CONFIG_PAGE_POOL
1297 	case SO_DEVMEM_DONTNEED:
1298 		return sock_devmem_dontneed(sk, optval, optlen);
1299 #endif
1300 	case SO_SNDTIMEO_OLD:
1301 	case SO_SNDTIMEO_NEW:
1302 		return sock_set_timeout(&sk->sk_sndtimeo, optval,
1303 					optlen, optname == SO_SNDTIMEO_OLD);
1304 	case SO_RCVTIMEO_OLD:
1305 	case SO_RCVTIMEO_NEW:
1306 		return sock_set_timeout(&sk->sk_rcvtimeo, optval,
1307 					optlen, optname == SO_RCVTIMEO_OLD);
1308 	}
1309 
1310 	sockopt_lock_sock(sk);
1311 
1312 	switch (optname) {
1313 	case SO_DEBUG:
1314 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1315 			ret = -EACCES;
1316 		else
1317 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1318 		break;
1319 	case SO_REUSEADDR:
1320 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1321 		break;
1322 	case SO_REUSEPORT:
1323 		if (valbool && !sk_is_inet(sk))
1324 			ret = -EOPNOTSUPP;
1325 		else
1326 			sk->sk_reuseport = valbool;
1327 		break;
1328 	case SO_DONTROUTE:
1329 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1330 		sk_dst_reset(sk);
1331 		break;
1332 	case SO_BROADCAST:
1333 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1334 		break;
1335 	case SO_SNDBUF:
1336 		/* Don't error on this; BSD doesn't, and if you think
1337 		 * about it this is right. Otherwise apps have to
1338 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1339 		 * are treated in BSD as hints.
1340 		 */
1341 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1342 set_sndbuf:
1343 		/* Ensure val * 2 fits into an int, to prevent max_t()
1344 		 * from treating it as a negative value.
1345 		 */
1346 		val = min_t(int, val, INT_MAX / 2);
1347 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1348 		WRITE_ONCE(sk->sk_sndbuf,
1349 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1350 		/* Wake up sending tasks if we upped the value. */
1351 		sk->sk_write_space(sk);
1352 		break;
1353 
1354 	case SO_SNDBUFFORCE:
1355 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1356 			ret = -EPERM;
1357 			break;
1358 		}
1359 
1360 		/* No negative values (to prevent underflow, as val will be
1361 		 * multiplied by 2).
1362 		 */
1363 		if (val < 0)
1364 			val = 0;
1365 		goto set_sndbuf;
1366 
1367 	case SO_RCVBUF:
1368 		/* Don't error on this; BSD doesn't, and if you think
1369 		 * about it this is right. Otherwise apps have to
1370 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1371 		 * are treated in BSD as hints.
1372 		 */
1373 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1374 		break;
1375 
1376 	case SO_RCVBUFFORCE:
1377 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1378 			ret = -EPERM;
1379 			break;
1380 		}
1381 
1382 		/* No negative values (to prevent underflow, as val will be
1383 		 * multiplied by 2).
1384 		 */
1385 		__sock_set_rcvbuf(sk, max(val, 0));
1386 		break;
1387 
1388 	case SO_KEEPALIVE:
1389 		if (sk->sk_prot->keepalive)
1390 			sk->sk_prot->keepalive(sk, valbool);
1391 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1392 		break;
1393 
1394 	case SO_OOBINLINE:
1395 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1396 		break;
1397 
1398 	case SO_NO_CHECK:
1399 		sk->sk_no_check_tx = valbool;
1400 		break;
1401 
1402 	case SO_LINGER:
1403 		if (optlen < sizeof(ling)) {
1404 			ret = -EINVAL;	/* 1003.1g */
1405 			break;
1406 		}
1407 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1408 			ret = -EFAULT;
1409 			break;
1410 		}
1411 		if (!ling.l_onoff) {
1412 			sock_reset_flag(sk, SOCK_LINGER);
1413 		} else {
1414 			unsigned long t_sec = ling.l_linger;
1415 
1416 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1417 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1418 			else
1419 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1420 			sock_set_flag(sk, SOCK_LINGER);
1421 		}
1422 		break;
1423 
1424 	case SO_BSDCOMPAT:
1425 		break;
1426 
1427 	case SO_TIMESTAMP_OLD:
1428 	case SO_TIMESTAMP_NEW:
1429 	case SO_TIMESTAMPNS_OLD:
1430 	case SO_TIMESTAMPNS_NEW:
1431 		sock_set_timestamp(sk, optname, valbool);
1432 		break;
1433 
1434 	case SO_TIMESTAMPING_NEW:
1435 	case SO_TIMESTAMPING_OLD:
1436 		if (optlen == sizeof(timestamping)) {
1437 			if (copy_from_sockptr(&timestamping, optval,
1438 					      sizeof(timestamping))) {
1439 				ret = -EFAULT;
1440 				break;
1441 			}
1442 		} else {
1443 			memset(&timestamping, 0, sizeof(timestamping));
1444 			timestamping.flags = val;
1445 		}
1446 		ret = sock_set_timestamping(sk, optname, timestamping);
1447 		break;
1448 
1449 	case SO_RCVLOWAT:
1450 		{
1451 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1452 
1453 		if (val < 0)
1454 			val = INT_MAX;
1455 		if (sock)
1456 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1457 		if (set_rcvlowat)
1458 			ret = set_rcvlowat(sk, val);
1459 		else
1460 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1461 		break;
1462 		}
1463 	case SO_ATTACH_FILTER: {
1464 		struct sock_fprog fprog;
1465 
1466 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1467 		if (!ret)
1468 			ret = sk_attach_filter(&fprog, sk);
1469 		break;
1470 	}
1471 	case SO_ATTACH_BPF:
1472 		ret = -EINVAL;
1473 		if (optlen == sizeof(u32)) {
1474 			u32 ufd;
1475 
1476 			ret = -EFAULT;
1477 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1478 				break;
1479 
1480 			ret = sk_attach_bpf(ufd, sk);
1481 		}
1482 		break;
1483 
1484 	case SO_ATTACH_REUSEPORT_CBPF: {
1485 		struct sock_fprog fprog;
1486 
1487 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1488 		if (!ret)
1489 			ret = sk_reuseport_attach_filter(&fprog, sk);
1490 		break;
1491 	}
1492 	case SO_ATTACH_REUSEPORT_EBPF:
1493 		ret = -EINVAL;
1494 		if (optlen == sizeof(u32)) {
1495 			u32 ufd;
1496 
1497 			ret = -EFAULT;
1498 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1499 				break;
1500 
1501 			ret = sk_reuseport_attach_bpf(ufd, sk);
1502 		}
1503 		break;
1504 
1505 	case SO_DETACH_REUSEPORT_BPF:
1506 		ret = reuseport_detach_prog(sk);
1507 		break;
1508 
1509 	case SO_DETACH_FILTER:
1510 		ret = sk_detach_filter(sk);
1511 		break;
1512 
1513 	case SO_LOCK_FILTER:
1514 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1515 			ret = -EPERM;
1516 		else
1517 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1518 		break;
1519 
1520 	case SO_MARK:
1521 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1522 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1523 			ret = -EPERM;
1524 			break;
1525 		}
1526 
1527 		__sock_set_mark(sk, val);
1528 		break;
1529 	case SO_RCVMARK:
1530 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1531 		break;
1532 
1533 	case SO_RCVPRIORITY:
1534 		sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
1535 		break;
1536 
1537 	case SO_RXQ_OVFL:
1538 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1539 		break;
1540 
1541 	case SO_WIFI_STATUS:
1542 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1543 		break;
1544 
1545 	case SO_NOFCS:
1546 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1547 		break;
1548 
1549 	case SO_SELECT_ERR_QUEUE:
1550 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1551 		break;
1552 
1553 	case SO_PASSCRED:
1554 		if (sk_may_scm_recv(sk))
1555 			sk->sk_scm_credentials = valbool;
1556 		else
1557 			ret = -EOPNOTSUPP;
1558 		break;
1559 
1560 	case SO_PASSSEC:
1561 		if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
1562 			sk->sk_scm_security = valbool;
1563 		else
1564 			ret = -EOPNOTSUPP;
1565 		break;
1566 
1567 	case SO_PASSPIDFD:
1568 		if (sk_is_unix(sk))
1569 			sk->sk_scm_pidfd = valbool;
1570 		else
1571 			ret = -EOPNOTSUPP;
1572 		break;
1573 
1574 	case SO_PASSRIGHTS:
1575 		if (sk_is_unix(sk))
1576 			sk->sk_scm_rights = valbool;
1577 		else
1578 			ret = -EOPNOTSUPP;
1579 		break;
1580 
1581 	case SO_INCOMING_CPU:
1582 		reuseport_update_incoming_cpu(sk, val);
1583 		break;
1584 
1585 	case SO_CNX_ADVICE:
1586 		if (val == 1)
1587 			dst_negative_advice(sk);
1588 		break;
1589 
1590 	case SO_ZEROCOPY:
1591 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1592 			if (!(sk_is_tcp(sk) ||
1593 			      (sk->sk_type == SOCK_DGRAM &&
1594 			       sk->sk_protocol == IPPROTO_UDP)))
1595 				ret = -EOPNOTSUPP;
1596 		} else if (sk->sk_family != PF_RDS) {
1597 			ret = -EOPNOTSUPP;
1598 		}
1599 		if (!ret) {
1600 			if (val < 0 || val > 1)
1601 				ret = -EINVAL;
1602 			else
1603 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1604 		}
1605 		break;
1606 
1607 	case SO_TXTIME:
1608 		if (optlen != sizeof(struct sock_txtime)) {
1609 			ret = -EINVAL;
1610 			break;
1611 		} else if (copy_from_sockptr(&sk_txtime, optval,
1612 			   sizeof(struct sock_txtime))) {
1613 			ret = -EFAULT;
1614 			break;
1615 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1616 			ret = -EINVAL;
1617 			break;
1618 		}
1619 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1620 		 * scheduler has enough safeguards.
1621 		 */
1622 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1623 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1624 			ret = -EPERM;
1625 			break;
1626 		}
1627 
1628 		ret = sockopt_validate_clockid(sk_txtime.clockid);
1629 		if (ret)
1630 			break;
1631 
1632 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1633 		sk->sk_clockid = sk_txtime.clockid;
1634 		sk->sk_txtime_deadline_mode =
1635 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1636 		sk->sk_txtime_report_errors =
1637 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1638 		break;
1639 
1640 	case SO_BINDTOIFINDEX:
1641 		ret = sock_bindtoindex_locked(sk, val);
1642 		break;
1643 
1644 	case SO_BUF_LOCK:
1645 		if (val & ~SOCK_BUF_LOCK_MASK) {
1646 			ret = -EINVAL;
1647 			break;
1648 		}
1649 		sk->sk_userlocks = val | (sk->sk_userlocks &
1650 					  ~SOCK_BUF_LOCK_MASK);
1651 		break;
1652 
1653 	case SO_RESERVE_MEM:
1654 	{
1655 		int delta;
1656 
1657 		if (val < 0) {
1658 			ret = -EINVAL;
1659 			break;
1660 		}
1661 
1662 		delta = val - sk->sk_reserved_mem;
1663 		if (delta < 0)
1664 			sock_release_reserved_memory(sk, -delta);
1665 		else
1666 			ret = sock_reserve_memory(sk, delta);
1667 		break;
1668 	}
1669 
1670 	default:
1671 		ret = -ENOPROTOOPT;
1672 		break;
1673 	}
1674 	sockopt_release_sock(sk);
1675 	return ret;
1676 }
1677 
1678 int sock_setsockopt(struct socket *sock, int level, int optname,
1679 		    sockptr_t optval, unsigned int optlen)
1680 {
1681 	return sk_setsockopt(sock->sk, level, optname,
1682 			     optval, optlen);
1683 }
1684 EXPORT_SYMBOL(sock_setsockopt);
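
/*
 * Userspace view of the SO_LINGER handling above (a minimal sketch, "fd"
 * being an existing connected socket): l_onoff enables lingering and
 * l_linger is converted from seconds to jiffies, capped at
 * MAX_SCHEDULE_TIMEOUT.
 *
 *	#include <sys/socket.h>
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 10 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *
 * close(fd) may now block for up to 10 seconds while unsent data is
 * flushed, protocol permitting.
 */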
1685 
1686 static const struct cred *sk_get_peer_cred(struct sock *sk)
1687 {
1688 	const struct cred *cred;
1689 
1690 	spin_lock(&sk->sk_peer_lock);
1691 	cred = get_cred(sk->sk_peer_cred);
1692 	spin_unlock(&sk->sk_peer_lock);
1693 
1694 	return cred;
1695 }
1696 
1697 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1698 			  struct ucred *ucred)
1699 {
1700 	ucred->pid = pid_vnr(pid);
1701 	ucred->uid = ucred->gid = -1;
1702 	if (cred) {
1703 		struct user_namespace *current_ns = current_user_ns();
1704 
1705 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1706 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1707 	}
1708 }
1709 
1710 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1711 {
1712 	struct user_namespace *user_ns = current_user_ns();
1713 	int i;
1714 
1715 	for (i = 0; i < src->ngroups; i++) {
1716 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1717 
1718 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1719 			return -EFAULT;
1720 	}
1721 
1722 	return 0;
1723 }
1724 
1725 int sk_getsockopt(struct sock *sk, int level, int optname,
1726 		  sockptr_t optval, sockptr_t optlen)
1727 {
1728 	struct socket *sock = sk->sk_socket;
1729 
1730 	union {
1731 		int val;
1732 		u64 val64;
1733 		unsigned long ulval;
1734 		struct linger ling;
1735 		struct old_timeval32 tm32;
1736 		struct __kernel_old_timeval tm;
1737 		struct  __kernel_sock_timeval stm;
1738 		struct sock_txtime txtime;
1739 		struct so_timestamping timestamping;
1740 	} v;
1741 
1742 	int lv = sizeof(int);
1743 	int len;
1744 
1745 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1746 		return -EFAULT;
1747 	if (len < 0)
1748 		return -EINVAL;
1749 
1750 	memset(&v, 0, sizeof(v));
1751 
1752 	switch (optname) {
1753 	case SO_DEBUG:
1754 		v.val = sock_flag(sk, SOCK_DBG);
1755 		break;
1756 
1757 	case SO_DONTROUTE:
1758 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1759 		break;
1760 
1761 	case SO_BROADCAST:
1762 		v.val = sock_flag(sk, SOCK_BROADCAST);
1763 		break;
1764 
1765 	case SO_SNDBUF:
1766 		v.val = READ_ONCE(sk->sk_sndbuf);
1767 		break;
1768 
1769 	case SO_RCVBUF:
1770 		v.val = READ_ONCE(sk->sk_rcvbuf);
1771 		break;
1772 
1773 	case SO_REUSEADDR:
1774 		v.val = sk->sk_reuse;
1775 		break;
1776 
1777 	case SO_REUSEPORT:
1778 		v.val = sk->sk_reuseport;
1779 		break;
1780 
1781 	case SO_KEEPALIVE:
1782 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1783 		break;
1784 
1785 	case SO_TYPE:
1786 		v.val = sk->sk_type;
1787 		break;
1788 
1789 	case SO_PROTOCOL:
1790 		v.val = sk->sk_protocol;
1791 		break;
1792 
1793 	case SO_DOMAIN:
1794 		v.val = sk->sk_family;
1795 		break;
1796 
1797 	case SO_ERROR:
1798 		v.val = -sock_error(sk);
1799 		if (v.val == 0)
1800 			v.val = xchg(&sk->sk_err_soft, 0);
1801 		break;
1802 
1803 	case SO_OOBINLINE:
1804 		v.val = sock_flag(sk, SOCK_URGINLINE);
1805 		break;
1806 
1807 	case SO_NO_CHECK:
1808 		v.val = sk->sk_no_check_tx;
1809 		break;
1810 
1811 	case SO_PRIORITY:
1812 		v.val = READ_ONCE(sk->sk_priority);
1813 		break;
1814 
1815 	case SO_LINGER:
1816 		lv		= sizeof(v.ling);
1817 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1818 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1819 		break;
1820 
1821 	case SO_BSDCOMPAT:
1822 		break;
1823 
1824 	case SO_TIMESTAMP_OLD:
1825 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1826 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1827 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1828 		break;
1829 
1830 	case SO_TIMESTAMPNS_OLD:
1831 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1832 		break;
1833 
1834 	case SO_TIMESTAMP_NEW:
1835 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1836 		break;
1837 
1838 	case SO_TIMESTAMPNS_NEW:
1839 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1840 		break;
1841 
1842 	case SO_TIMESTAMPING_OLD:
1843 	case SO_TIMESTAMPING_NEW:
1844 		lv = sizeof(v.timestamping);
1845 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1846 		 * returning the flags when they were set through the same option.
1847 		 * Don't change the beviour for the old case SO_TIMESTAMPING_OLD.
1848 		 */
1849 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1850 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1851 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1852 		}
1853 		break;
1854 
1855 	case SO_RCVTIMEO_OLD:
1856 	case SO_RCVTIMEO_NEW:
1857 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1858 				      SO_RCVTIMEO_OLD == optname);
1859 		break;
1860 
1861 	case SO_SNDTIMEO_OLD:
1862 	case SO_SNDTIMEO_NEW:
1863 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1864 				      SO_SNDTIMEO_OLD == optname);
1865 		break;
1866 
1867 	case SO_RCVLOWAT:
1868 		v.val = READ_ONCE(sk->sk_rcvlowat);
1869 		break;
1870 
1871 	case SO_SNDLOWAT:
1872 		v.val = 1;
1873 		break;
1874 
1875 	case SO_PASSCRED:
1876 		if (!sk_may_scm_recv(sk))
1877 			return -EOPNOTSUPP;
1878 
1879 		v.val = sk->sk_scm_credentials;
1880 		break;
1881 
1882 	case SO_PASSPIDFD:
1883 		if (!sk_is_unix(sk))
1884 			return -EOPNOTSUPP;
1885 
1886 		v.val = sk->sk_scm_pidfd;
1887 		break;
1888 
1889 	case SO_PASSRIGHTS:
1890 		if (!sk_is_unix(sk))
1891 			return -EOPNOTSUPP;
1892 
1893 		v.val = sk->sk_scm_rights;
1894 		break;
1895 
1896 	case SO_PEERCRED:
1897 	{
1898 		struct ucred peercred;
1899 		if (len > sizeof(peercred))
1900 			len = sizeof(peercred);
1901 
1902 		spin_lock(&sk->sk_peer_lock);
1903 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1904 		spin_unlock(&sk->sk_peer_lock);
1905 
1906 		if (copy_to_sockptr(optval, &peercred, len))
1907 			return -EFAULT;
1908 		goto lenout;
1909 	}
1910 
1911 	case SO_PEERPIDFD:
1912 	{
1913 		struct pid *peer_pid;
1914 		struct file *pidfd_file = NULL;
1915 		unsigned int flags = 0;
1916 		int pidfd;
1917 
1918 		if (len > sizeof(pidfd))
1919 			len = sizeof(pidfd);
1920 
1921 		spin_lock(&sk->sk_peer_lock);
1922 		peer_pid = get_pid(sk->sk_peer_pid);
1923 		spin_unlock(&sk->sk_peer_lock);
1924 
1925 		if (!peer_pid)
1926 			return -ENODATA;
1927 
1928 		/* The use of PIDFD_STALE requires stashing of struct pid
1929 		 * on pidfs with pidfs_register_pid(), and only AF_UNIX
1930 		 * sockets are prepared for this.
1931 		 */
1932 		if (sk->sk_family == AF_UNIX)
1933 			flags = PIDFD_STALE;
1934 
1935 		pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
1936 		put_pid(peer_pid);
1937 		if (pidfd < 0)
1938 			return pidfd;
1939 
1940 		if (copy_to_sockptr(optval, &pidfd, len) ||
1941 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1942 			put_unused_fd(pidfd);
1943 			fput(pidfd_file);
1944 
1945 			return -EFAULT;
1946 		}
1947 
1948 		fd_install(pidfd, pidfd_file);
1949 		return 0;
1950 	}
1951 
1952 	case SO_PEERGROUPS:
1953 	{
1954 		const struct cred *cred;
1955 		int ret, n;
1956 
1957 		cred = sk_get_peer_cred(sk);
1958 		if (!cred)
1959 			return -ENODATA;
1960 
1961 		n = cred->group_info->ngroups;
1962 		if (len < n * sizeof(gid_t)) {
1963 			len = n * sizeof(gid_t);
1964 			put_cred(cred);
1965 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1966 		}
1967 		len = n * sizeof(gid_t);
1968 
1969 		ret = groups_to_user(optval, cred->group_info);
1970 		put_cred(cred);
1971 		if (ret)
1972 			return ret;
1973 		goto lenout;
1974 	}
1975 
1976 	case SO_PEERNAME:
1977 	{
1978 		struct sockaddr_storage address;
1979 
1980 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1981 		if (lv < 0)
1982 			return -ENOTCONN;
1983 		if (lv < len)
1984 			return -EINVAL;
1985 		if (copy_to_sockptr(optval, &address, len))
1986 			return -EFAULT;
1987 		goto lenout;
1988 	}
1989 
1990 	/* Dubious BSD thing... Probably nobody even uses it, but
1991 	 * the UNIX standard wants it for whatever reason... -DaveM
1992 	 */
1993 	case SO_ACCEPTCONN:
1994 		v.val = sk->sk_state == TCP_LISTEN;
1995 		break;
1996 
1997 	case SO_PASSSEC:
1998 		if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
1999 			return -EOPNOTSUPP;
2000 
2001 		v.val = sk->sk_scm_security;
2002 		break;
2003 
2004 	case SO_PEERSEC:
2005 		return security_socket_getpeersec_stream(sock,
2006 							 optval, optlen, len);
2007 
2008 	case SO_MARK:
2009 		v.val = READ_ONCE(sk->sk_mark);
2010 		break;
2011 
2012 	case SO_RCVMARK:
2013 		v.val = sock_flag(sk, SOCK_RCVMARK);
2014 		break;
2015 
2016 	case SO_RCVPRIORITY:
2017 		v.val = sock_flag(sk, SOCK_RCVPRIORITY);
2018 		break;
2019 
2020 	case SO_RXQ_OVFL:
2021 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
2022 		break;
2023 
2024 	case SO_WIFI_STATUS:
2025 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
2026 		break;
2027 
2028 	case SO_PEEK_OFF:
2029 		if (!READ_ONCE(sock->ops)->set_peek_off)
2030 			return -EOPNOTSUPP;
2031 
2032 		v.val = READ_ONCE(sk->sk_peek_off);
2033 		break;
2034 	case SO_NOFCS:
2035 		v.val = sock_flag(sk, SOCK_NOFCS);
2036 		break;
2037 
2038 	case SO_BINDTODEVICE:
2039 		return sock_getbindtodevice(sk, optval, optlen, len);
2040 
2041 	case SO_GET_FILTER:
2042 		len = sk_get_filter(sk, optval, len);
2043 		if (len < 0)
2044 			return len;
2045 
2046 		goto lenout;
2047 
2048 	case SO_LOCK_FILTER:
2049 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
2050 		break;
2051 
2052 	case SO_BPF_EXTENSIONS:
2053 		v.val = bpf_tell_extensions();
2054 		break;
2055 
2056 	case SO_SELECT_ERR_QUEUE:
2057 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
2058 		break;
2059 
2060 #ifdef CONFIG_NET_RX_BUSY_POLL
2061 	case SO_BUSY_POLL:
2062 		v.val = READ_ONCE(sk->sk_ll_usec);
2063 		break;
2064 	case SO_PREFER_BUSY_POLL:
2065 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
2066 		break;
2067 #endif
2068 
2069 	case SO_MAX_PACING_RATE:
2070 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
2071 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2072 			lv = sizeof(v.ulval);
2073 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2074 		} else {
2075 			/* 32bit version */
2076 			v.val = min_t(unsigned long, ~0U,
2077 				      READ_ONCE(sk->sk_max_pacing_rate));
2078 		}
2079 		break;
2080 
2081 	case SO_INCOMING_CPU:
2082 		v.val = READ_ONCE(sk->sk_incoming_cpu);
2083 		break;
2084 
2085 	case SO_MEMINFO:
2086 	{
2087 		u32 meminfo[SK_MEMINFO_VARS];
2088 
2089 		sk_get_meminfo(sk, meminfo);
2090 
2091 		len = min_t(unsigned int, len, sizeof(meminfo));
2092 		if (copy_to_sockptr(optval, &meminfo, len))
2093 			return -EFAULT;
2094 
2095 		goto lenout;
2096 	}
2097 
2098 #ifdef CONFIG_NET_RX_BUSY_POLL
2099 	case SO_INCOMING_NAPI_ID:
2100 		v.val = READ_ONCE(sk->sk_napi_id);
2101 
2102 		/* aggregate non-NAPI IDs down to 0 */
2103 		if (!napi_id_valid(v.val))
2104 			v.val = 0;
2105 
2106 		break;
2107 #endif
2108 
2109 	case SO_COOKIE:
2110 		lv = sizeof(u64);
2111 		if (len < lv)
2112 			return -EINVAL;
2113 		v.val64 = sock_gen_cookie(sk);
2114 		break;
2115 
2116 	case SO_ZEROCOPY:
2117 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
2118 		break;
2119 
2120 	case SO_TXTIME:
2121 		lv = sizeof(v.txtime);
2122 		v.txtime.clockid = sk->sk_clockid;
2123 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2124 				  SOF_TXTIME_DEADLINE_MODE : 0;
2125 		v.txtime.flags |= sk->sk_txtime_report_errors ?
2126 				  SOF_TXTIME_REPORT_ERRORS : 0;
2127 		break;
2128 
2129 	case SO_BINDTOIFINDEX:
2130 		v.val = READ_ONCE(sk->sk_bound_dev_if);
2131 		break;
2132 
2133 	case SO_NETNS_COOKIE:
2134 		lv = sizeof(u64);
2135 		if (len != lv)
2136 			return -EINVAL;
2137 		v.val64 = sock_net(sk)->net_cookie;
2138 		break;
2139 
2140 	case SO_BUF_LOCK:
2141 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2142 		break;
2143 
2144 	case SO_RESERVE_MEM:
2145 		v.val = READ_ONCE(sk->sk_reserved_mem);
2146 		break;
2147 
2148 	case SO_TXREHASH:
2149 		if (!sk_is_tcp(sk))
2150 			return -EOPNOTSUPP;
2151 
2152 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2153 		v.val = READ_ONCE(sk->sk_txrehash);
2154 		break;
2155 
2156 	default:
2157 		/* We implement the SO_SNDLOWAT etc to not be settable
2158 		 * (1003.1g 7).
2159 		 */
2160 		return -ENOPROTOOPT;
2161 	}
2162 
2163 	if (len > lv)
2164 		len = lv;
2165 	if (copy_to_sockptr(optval, &v, len))
2166 		return -EFAULT;
2167 lenout:
2168 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2169 		return -EFAULT;
2170 	return 0;
2171 }
2172 
2173 /*
2174  * Initialize an sk_lock.
2175  *
2176  * (We also register the sk_lock with the lock validator.)
2177  */
2178 static inline void sock_lock_init(struct sock *sk)
2179 {
2180 	sk_owner_clear(sk);
2181 
2182 	if (sk->sk_kern_sock)
2183 		sock_lock_init_class_and_name(
2184 			sk,
2185 			af_family_kern_slock_key_strings[sk->sk_family],
2186 			af_family_kern_slock_keys + sk->sk_family,
2187 			af_family_kern_key_strings[sk->sk_family],
2188 			af_family_kern_keys + sk->sk_family);
2189 	else
2190 		sock_lock_init_class_and_name(
2191 			sk,
2192 			af_family_slock_key_strings[sk->sk_family],
2193 			af_family_slock_keys + sk->sk_family,
2194 			af_family_key_strings[sk->sk_family],
2195 			af_family_keys + sk->sk_family);
2196 }
2197 
2198 /*
2199  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2200  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2201  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2202  */
2203 static void sock_copy(struct sock *nsk, const struct sock *osk)
2204 {
2205 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2206 #ifdef CONFIG_SECURITY_NETWORK
2207 	void *sptr = nsk->sk_security;
2208 #endif
2209 
2210 	/* If we move sk_tx_queue_mapping out of the private section,
2211 	 * we must check if sk_tx_queue_clear() is called after
2212 	 * sock_copy() in sk_clone_lock().
2213 	 */
2214 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2215 		     offsetof(struct sock, sk_dontcopy_begin) ||
2216 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2217 		     offsetof(struct sock, sk_dontcopy_end));
2218 
2219 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2220 
2221 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2222 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2223 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2224 
2225 #ifdef CONFIG_SECURITY_NETWORK
2226 	nsk->sk_security = sptr;
2227 	security_sk_clone(osk, nsk);
2228 #endif
2229 }
2230 
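/* Allocate the struct sock itself: from the protocol's dedicated slab cache
 * when one exists, otherwise via kmalloc(). On success the LSM hook
 * security_sk_alloc() has run and a reference on the owning protocol module
 * is held; both are undone on the error paths below.
 */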
2231 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2232 		int family)
2233 {
2234 	struct sock *sk;
2235 	struct kmem_cache *slab;
2236 
2237 	slab = prot->slab;
2238 	if (slab != NULL) {
2239 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2240 		if (!sk)
2241 			return sk;
2242 		if (want_init_on_alloc(priority))
2243 			sk_prot_clear_nulls(sk, prot->obj_size);
2244 	} else
2245 		sk = kmalloc(prot->obj_size, priority);
2246 
2247 	if (sk != NULL) {
2248 		if (security_sk_alloc(sk, family, priority))
2249 			goto out_free;
2250 
2251 		if (!try_module_get(prot->owner))
2252 			goto out_free_sec;
2253 	}
2254 
2255 	return sk;
2256 
2257 out_free_sec:
2258 	security_sk_free(sk);
2259 out_free:
2260 	if (slab != NULL)
2261 		kmem_cache_free(slab, sk);
2262 	else
2263 		kfree(sk);
2264 	return NULL;
2265 }
2266 
2267 static void sk_prot_free(struct proto *prot, struct sock *sk)
2268 {
2269 	struct kmem_cache *slab;
2270 	struct module *owner;
2271 
2272 	owner = prot->owner;
2273 	slab = prot->slab;
2274 
2275 	cgroup_sk_free(&sk->sk_cgrp_data);
2276 	mem_cgroup_sk_free(sk);
2277 	security_sk_free(sk);
2278 
2279 	sk_owner_put(sk);
2280 
2281 	if (slab != NULL)
2282 		kmem_cache_free(slab, sk);
2283 	else
2284 		kfree(sk);
2285 	module_put(owner);
2286 }
2287 
2288 /**
2289  *	sk_alloc - All socket objects are allocated here
2290  *	@net: the applicable net namespace
2291  *	@family: protocol family
2292  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2293  *	@prot: struct proto associated with this new sock instance
2294  *	@kern: is this to be a kernel socket?
2295  */
2296 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2297 		      struct proto *prot, int kern)
2298 {
2299 	struct sock *sk;
2300 
2301 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2302 	if (sk) {
2303 		sk->sk_family = family;
2304 		/*
2305 		 * See comment in struct sock definition to understand
2306 		 * why we need sk_prot_creator -acme
2307 		 */
2308 		sk->sk_prot = sk->sk_prot_creator = prot;
2309 
2310 		if (READ_ONCE(net->core.sysctl_bypass_prot_mem))
2311 			sk->sk_bypass_prot_mem = 1;
2312 
2313 		sk->sk_kern_sock = kern;
2314 		sock_lock_init(sk);
2315 
2316 		sk->sk_net_refcnt = kern ? 0 : 1;
2317 		if (likely(sk->sk_net_refcnt)) {
2318 			get_net_track(net, &sk->ns_tracker, priority);
2319 			sock_inuse_add(net, 1);
2320 		} else {
2321 			net_passive_inc(net);
2322 			__netns_tracker_alloc(net, &sk->ns_tracker,
2323 					      false, priority);
2324 		}
2325 
2326 		sock_net_set(sk, net);
2327 		refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
2328 
2329 		mem_cgroup_sk_alloc(sk);
2330 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2331 		sock_update_classid(&sk->sk_cgrp_data);
2332 		sock_update_netprioidx(&sk->sk_cgrp_data);
2333 		sk_tx_queue_clear(sk);
2334 	}
2335 
2336 	return sk;
2337 }
2338 EXPORT_SYMBOL(sk_alloc);
2339 
2340 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2341  * grace period. This is the case for UDP sockets and TCP listeners.
2342  */
2343 static void __sk_destruct(struct rcu_head *head)
2344 {
2345 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2346 	struct net *net = sock_net(sk);
2347 	struct sk_filter *filter;
2348 
2349 	if (sk->sk_destruct)
2350 		sk->sk_destruct(sk);
2351 
2352 	filter = rcu_dereference_check(sk->sk_filter,
2353 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2354 	if (filter) {
2355 		sk_filter_uncharge(sk, filter);
2356 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2357 	}
2358 
2359 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2360 
2361 #ifdef CONFIG_BPF_SYSCALL
2362 	bpf_sk_storage_free(sk);
2363 #endif
2364 
2365 	if (atomic_read(&sk->sk_omem_alloc))
2366 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2367 			 __func__, atomic_read(&sk->sk_omem_alloc));
2368 
2369 	if (sk->sk_frag.page) {
2370 		put_page(sk->sk_frag.page);
2371 		sk->sk_frag.page = NULL;
2372 	}
2373 
2374 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2375 	put_cred(sk->sk_peer_cred);
2376 	put_pid(sk->sk_peer_pid);
2377 
2378 	if (likely(sk->sk_net_refcnt)) {
2379 		put_net_track(net, &sk->ns_tracker);
2380 	} else {
2381 		__netns_tracker_free(net, &sk->ns_tracker, false);
2382 		net_passive_dec(net);
2383 	}
2384 	sk_prot_free(sk->sk_prot_creator, sk);
2385 }
2386 
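/* Switch a socket that was created as a kernel socket (passive netns
 * reference, see the !sk_net_refcnt branch in sk_alloc()) over to a full,
 * tracked netns reference and add it to the protocol's in-use accounting.
 */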
2387 void sk_net_refcnt_upgrade(struct sock *sk)
2388 {
2389 	struct net *net = sock_net(sk);
2390 
2391 	WARN_ON_ONCE(sk->sk_net_refcnt);
2392 	__netns_tracker_free(net, &sk->ns_tracker, false);
2393 	net_passive_dec(net);
2394 	sk->sk_net_refcnt = 1;
2395 	get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
2396 	sock_inuse_add(net, 1);
2397 }
2398 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
2399 
2400 void sk_destruct(struct sock *sk)
2401 {
2402 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2403 
2404 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2405 		reuseport_detach_sock(sk);
2406 		use_call_rcu = true;
2407 	}
2408 
2409 	if (use_call_rcu)
2410 		call_rcu(&sk->sk_rcu, __sk_destruct);
2411 	else
2412 		__sk_destruct(&sk->sk_rcu);
2413 }
2414 
2415 static void __sk_free(struct sock *sk)
2416 {
2417 	if (likely(sk->sk_net_refcnt))
2418 		sock_inuse_add(sock_net(sk), -1);
2419 
2420 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2421 		sock_diag_broadcast_destroy(sk);
2422 	else
2423 		sk_destruct(sk);
2424 }
2425 
2426 void sk_free(struct sock *sk)
2427 {
2428 	/*
2429 	 * We subtract one from sk_wmem_alloc and can tell whether
2430 	 * some packets are still in some tx queue.
2431 	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later
2432 	 */
2433 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2434 		__sk_free(sk);
2435 }
2436 EXPORT_SYMBOL(sk_free);
2437 
2438 static void sk_init_common(struct sock *sk)
2439 {
2440 	skb_queue_head_init(&sk->sk_receive_queue);
2441 	skb_queue_head_init(&sk->sk_write_queue);
2442 	skb_queue_head_init(&sk->sk_error_queue);
2443 
2444 	rwlock_init(&sk->sk_callback_lock);
2445 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2446 			af_rlock_keys + sk->sk_family,
2447 			af_family_rlock_key_strings[sk->sk_family]);
2448 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2449 			af_wlock_keys + sk->sk_family,
2450 			af_family_wlock_key_strings[sk->sk_family]);
2451 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2452 			af_elock_keys + sk->sk_family,
2453 			af_family_elock_key_strings[sk->sk_family]);
2454 	if (sk->sk_kern_sock)
2455 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2456 			af_kern_callback_keys + sk->sk_family,
2457 			af_family_kern_clock_key_strings[sk->sk_family]);
2458 	else
2459 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2460 			af_callback_keys + sk->sk_family,
2461 			af_family_clock_key_strings[sk->sk_family]);
2462 }
2463 
2464 /**
2465  * sk_clone - clone a socket
2466  * @sk: the socket to clone
2467  * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2468  * @lock: if true, lock the cloned sk
2469  *
2470  * If @lock is true, the clone is locked by bh_lock_sock(), and
2471  * caller must unlock socket even in error path by bh_unlock_sock().
2472  */
2473 struct sock *sk_clone(const struct sock *sk, const gfp_t priority,
2474 		      bool lock)
2475 {
2476 	struct proto *prot = READ_ONCE(sk->sk_prot);
2477 	struct sk_filter *filter;
2478 	bool is_charged = true;
2479 	struct sock *newsk;
2480 
2481 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2482 	if (!newsk)
2483 		goto out;
2484 
2485 	sock_copy(newsk, sk);
2486 
2487 	newsk->sk_prot_creator = prot;
2488 
2489 	/* SANITY */
2490 	if (likely(newsk->sk_net_refcnt)) {
2491 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2492 		sock_inuse_add(sock_net(newsk), 1);
2493 	} else {
2494 		/* Kernel sockets do not elevate the struct net refcount.
2495 		 * Instead, use a tracker to more easily detect if a layer
2496 		 * is not properly dismantling its kernel sockets at netns
2497 		 * destroy time.
2498 		 */
2499 		net_passive_inc(sock_net(newsk));
2500 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2501 				      false, priority);
2502 	}
2503 
2504 	sk_node_init(&newsk->sk_node);
2505 	sock_lock_init(newsk);
2506 
2507 	if (lock)
2508 		bh_lock_sock(newsk);
2509 
2510 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2511 	newsk->sk_backlog.len = 0;
2512 
2513 	atomic_set(&newsk->sk_rmem_alloc, 0);
2514 
2515 	refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
2516 
2517 	atomic_set(&newsk->sk_omem_alloc, 0);
2518 	sk_init_common(newsk);
2519 
2520 	newsk->sk_dst_cache	= NULL;
2521 	newsk->sk_dst_pending_confirm = 0;
2522 	newsk->sk_wmem_queued	= 0;
2523 	newsk->sk_forward_alloc = 0;
2524 	newsk->sk_reserved_mem  = 0;
2525 	DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters);
2526 	sk_drops_reset(newsk);
2527 	newsk->sk_send_head	= NULL;
2528 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2529 	atomic_set(&newsk->sk_zckey, 0);
2530 
2531 	sock_reset_flag(newsk, SOCK_DONE);
2532 
2533 #ifdef CONFIG_MEMCG
2534 	/* sk->sk_memcg will be populated at accept() time */
2535 	newsk->sk_memcg = NULL;
2536 #endif
2537 
2538 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2539 
2540 	rcu_read_lock();
2541 	filter = rcu_dereference(sk->sk_filter);
2542 	if (filter != NULL)
2543 		/* though it's an empty new sock, the charging may fail
2544 		 * if sysctl_optmem_max was changed between the creation of
2545 		 * the original socket and this clone
2546 		 */
2547 		is_charged = sk_filter_charge(newsk, filter);
2548 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2549 	rcu_read_unlock();
2550 
2551 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2552 		/* We need to make sure that we don't uncharge the new
2553 		 * socket if we couldn't charge it in the first place
2554 		 * as otherwise we uncharge the parent's filter.
2555 		 */
2556 		if (!is_charged)
2557 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2558 
2559 		goto free;
2560 	}
2561 
2562 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2563 
2564 	if (bpf_sk_storage_clone(sk, newsk))
2565 		goto free;
2566 
2567 	/* Clear sk_user_data if parent had the pointer tagged
2568 	 * as not suitable for copying when cloning.
2569 	 */
2570 	if (sk_user_data_is_nocopy(newsk))
2571 		newsk->sk_user_data = NULL;
2572 
2573 	newsk->sk_err	   = 0;
2574 	newsk->sk_err_soft = 0;
2575 	newsk->sk_priority = 0;
2576 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2577 
2578 	/* Before updating sk_refcnt, we must commit prior changes to memory
2579 	 * (Documentation/RCU/rculist_nulls.rst for details)
2580 	 */
2581 	smp_wmb();
2582 	refcount_set(&newsk->sk_refcnt, 2);
2583 
2584 	sk_set_socket(newsk, NULL);
2585 	sk_tx_queue_clear(newsk);
2586 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2587 
2588 	if (newsk->sk_prot->sockets_allocated)
2589 		sk_sockets_allocated_inc(newsk);
2590 
2591 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2592 		net_enable_timestamp();
2593 out:
2594 	return newsk;
2595 free:
2596 	/* It is still a raw copy of the parent, so invalidate
2597 	 * the destructor and do a plain sk_free()
2598 	 */
2599 	newsk->sk_destruct = NULL;
2600 	if (lock)
2601 		bh_unlock_sock(newsk);
2602 	sk_free(newsk);
2603 	newsk = NULL;
2604 	goto out;
2605 }
2606 EXPORT_SYMBOL_GPL(sk_clone);
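
/* A minimal caller sketch (names hypothetical), illustrating the locking
 * contract documented above: when @lock is true the clone returns with
 * bh_lock_sock() held and the caller must drop it even on its own error path.
 *
 *	newsk = sk_clone(sk, GFP_ATOMIC, true);
 *	if (newsk) {
 *		... protocol specific setup ...
 *		bh_unlock_sock(newsk);
 *	}
 */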
2607 
2608 static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev)
2609 {
2610 	bool is_ipv6 = false;
2611 	u32 max_size;
2612 
2613 #if IS_ENABLED(CONFIG_IPV6)
2614 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2615 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2616 #endif
2617 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2618 	max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) :
2619 			READ_ONCE(dev->gso_ipv4_max_size);
2620 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2621 		max_size = GSO_LEGACY_MAX_SIZE;
2622 
2623 	return max_size - (MAX_TCP_HEADER + 1);
2624 }
2625 
2626 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2627 {
2628 	const struct net_device *dev;
2629 	u32 max_segs = 1;
2630 
2631 	rcu_read_lock();
2632 	dev = dst_dev_rcu(dst);
2633 	sk->sk_route_caps = dev->features;
2634 	if (sk_is_tcp(sk)) {
2635 		struct inet_connection_sock *icsk = inet_csk(sk);
2636 
2637 		sk->sk_route_caps |= NETIF_F_GSO;
2638 		icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
2639 	}
2640 	if (sk->sk_route_caps & NETIF_F_GSO)
2641 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2642 	if (unlikely(sk->sk_gso_disabled))
2643 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2644 	if (sk_can_gso(sk)) {
2645 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2646 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2647 		} else {
2648 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2649 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev);
2650 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2651 			max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1);
2652 		}
2653 	}
2654 	sk->sk_gso_max_segs = max_segs;
2655 	sk_dst_set(sk, dst);
2656 	rcu_read_unlock();
2657 }
2658 EXPORT_SYMBOL_GPL(sk_setup_caps);
2659 
2660 /*
2661  *	Simple resource managers for sockets.
2662  */
2663 
2664 
2665 /*
2666  * Write buffer destructor automatically called from kfree_skb.
2667  */
2668 void sock_wfree(struct sk_buff *skb)
2669 {
2670 	unsigned int len = skb->truesize;
2671 	struct sock *sk = skb->sk;
2672 	bool free;
2673 	int old;
2674 
2675 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2676 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2677 		    sk->sk_write_space == sock_def_write_space) {
2678 			rcu_read_lock();
2679 			free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc,
2680 						       &old);
2681 			sock_def_write_space_wfree(sk, old - len);
2682 			rcu_read_unlock();
2683 			if (unlikely(free))
2684 				__sk_free(sk);
2685 			return;
2686 		}
2687 
2688 		/*
2689 		 * Keep a reference on sk_wmem_alloc; it will be released
2690 		 * after the sk_write_space() call
2691 		 */
2692 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2693 		sk->sk_write_space(sk);
2694 		len = 1;
2695 	}
2696 	/*
2697 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2698 	 * could not do because of in-flight packets
2699 	 */
2700 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2701 		__sk_free(sk);
2702 }
2703 EXPORT_SYMBOL(sock_wfree);
2704 
2705 /* This variant of sock_wfree() is used by TCP,
2706  * since it sets SOCK_USE_WRITE_QUEUE.
2707  */
2708 void __sock_wfree(struct sk_buff *skb)
2709 {
2710 	struct sock *sk = skb->sk;
2711 
2712 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2713 		__sk_free(sk);
2714 }
2715 
2716 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2717 {
2718 	int old_wmem;
2719 
2720 	skb_orphan(skb);
2721 #ifdef CONFIG_INET
2722 	if (unlikely(!sk_fullsock(sk)))
2723 		return skb_set_owner_edemux(skb, sk);
2724 #endif
2725 	skb->sk = sk;
2726 	skb->destructor = sock_wfree;
2727 	skb_set_hash_from_sk(skb, sk);
2728 	/*
2729 	 * We used to take a refcount on sk, but the following operation
2730 	 * is enough to guarantee sk_free() won't free this sock until
2731 	 * all in-flight packets are completed
2732 	 */
2733 	__refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem);
2734 
2735 	/* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket
2736 	 * is in a host queue (qdisc, NIC queue).
2737 	 * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue
2738 	 * based on XPS for better performance.
2739 	 * Otherwise clear ooo_okay to not risk Out Of Order delivery.
2740 	 */
2741 	skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS);
2742 }
2743 EXPORT_SYMBOL(skb_set_owner_w);
2744 
2745 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2746 {
2747 	/* Drivers depend on in-order delivery for crypto offload;
2748 	 * partial orphan breaks out-of-order-OK logic.
2749 	 */
2750 	if (skb_is_decrypted(skb))
2751 		return false;
2752 
2753 	return (skb->destructor == sock_wfree ||
2754 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2755 }
2756 
2757 /* This helper is used by netem, as it can hold packets in its
2758  * delay queue. We want to allow the owner socket to send more
2759  * packets, as if they were already TX completed by a typical driver.
2760  * But we also want to keep skb->sk set because some packet schedulers
2761  * rely on it (sch_fq for example).
2762  */
2763 void skb_orphan_partial(struct sk_buff *skb)
2764 {
2765 	if (skb_is_tcp_pure_ack(skb))
2766 		return;
2767 
2768 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2769 		return;
2770 
2771 	skb_orphan(skb);
2772 }
2773 EXPORT_SYMBOL(skb_orphan_partial);
2774 
2775 /*
2776  * Read buffer destructor automatically called from kfree_skb.
2777  */
2778 void sock_rfree(struct sk_buff *skb)
2779 {
2780 	struct sock *sk = skb->sk;
2781 	unsigned int len = skb->truesize;
2782 
2783 	atomic_sub(len, &sk->sk_rmem_alloc);
2784 	sk_mem_uncharge(sk, len);
2785 }
2786 EXPORT_SYMBOL(sock_rfree);
2787 
2788 /*
2789  * Buffer destructor for skbs that are not used directly in read or write
2790  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2791  */
2792 void sock_efree(struct sk_buff *skb)
2793 {
2794 	sock_put(skb->sk);
2795 }
2796 EXPORT_SYMBOL(sock_efree);
2797 
2798 /* Buffer destructor for prefetch/receive path where reference count may
2799  * not be held, e.g. for listen sockets.
2800  */
2801 #ifdef CONFIG_INET
2802 void sock_pfree(struct sk_buff *skb)
2803 {
2804 	struct sock *sk = skb->sk;
2805 
2806 	if (!sk_is_refcounted(sk))
2807 		return;
2808 
2809 	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2810 		inet_reqsk(sk)->rsk_listener = NULL;
2811 		reqsk_free(inet_reqsk(sk));
2812 		return;
2813 	}
2814 
2815 	sock_gen_put(sk);
2816 }
2817 EXPORT_SYMBOL(sock_pfree);
2818 #endif /* CONFIG_INET */
2819 
2820 /*
2821  * Allocate a skb from the socket's send buffer.
2822  */
2823 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2824 			     gfp_t priority)
2825 {
2826 	if (force ||
2827 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2828 		struct sk_buff *skb = alloc_skb(size, priority);
2829 
2830 		if (skb) {
2831 			skb_set_owner_w(skb, sk);
2832 			return skb;
2833 		}
2834 	}
2835 	return NULL;
2836 }
2837 EXPORT_SYMBOL(sock_wmalloc);
2838 
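/* sock_omalloc()/sock_ofree() charge skbs against sk_omem_alloc, the
 * per-socket option-memory budget bounded by the net.core.optmem_max sysctl,
 * rather than against the regular send or receive buffer accounting.
 */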
2839 static void sock_ofree(struct sk_buff *skb)
2840 {
2841 	struct sock *sk = skb->sk;
2842 
2843 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2844 }
2845 
2846 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2847 			     gfp_t priority)
2848 {
2849 	struct sk_buff *skb;
2850 
2851 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2852 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2853 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2854 		return NULL;
2855 
2856 	skb = alloc_skb(size, priority);
2857 	if (!skb)
2858 		return NULL;
2859 
2860 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2861 	skb->sk = sk;
2862 	skb->destructor = sock_ofree;
2863 	return skb;
2864 }
2865 
2866 /*
2867  * Allocate a memory block from the socket's option memory buffer.
2868  */
2869 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2870 {
2871 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2872 
2873 	if ((unsigned int)size <= optmem_max &&
2874 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2875 		void *mem;
2876 		/* First do the add, to avoid the race if kmalloc
2877 		 * might sleep.
2878 		 */
2879 		atomic_add(size, &sk->sk_omem_alloc);
2880 		mem = kmalloc(size, priority);
2881 		if (mem)
2882 			return mem;
2883 		atomic_sub(size, &sk->sk_omem_alloc);
2884 	}
2885 	return NULL;
2886 }
2887 EXPORT_SYMBOL(sock_kmalloc);
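
/* A small usage sketch (buffer name hypothetical): option memory must be
 * released with sock_kfree_s() or sock_kzfree_s() using the same size that
 * was allocated, so that sk_omem_alloc stays balanced.
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 */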
2888 
2889 /*
2890  * Duplicate the input "src" memory block using the socket's
2891  * option memory buffer.
2892  */
2893 void *sock_kmemdup(struct sock *sk, const void *src,
2894 		   int size, gfp_t priority)
2895 {
2896 	void *mem;
2897 
2898 	mem = sock_kmalloc(sk, size, priority);
2899 	if (mem)
2900 		memcpy(mem, src, size);
2901 	return mem;
2902 }
2903 EXPORT_SYMBOL(sock_kmemdup);
2904 
2905 /* Free an option memory block. Note, we actually want the inline
2906  * here as this allows gcc to detect the nullify and fold away the
2907  * condition entirely.
2908  */
2909 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2910 				  const bool nullify)
2911 {
2912 	if (WARN_ON_ONCE(!mem))
2913 		return;
2914 	if (nullify)
2915 		kfree_sensitive(mem);
2916 	else
2917 		kfree(mem);
2918 	atomic_sub(size, &sk->sk_omem_alloc);
2919 }
2920 
2921 void sock_kfree_s(struct sock *sk, void *mem, int size)
2922 {
2923 	__sock_kfree_s(sk, mem, size, false);
2924 }
2925 EXPORT_SYMBOL(sock_kfree_s);
2926 
2927 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2928 {
2929 	__sock_kfree_s(sk, mem, size, true);
2930 }
2931 EXPORT_SYMBOL(sock_kzfree_s);
2932 
2933 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2934    I think these locks should be removed for datagram sockets.
2935  */
2936 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2937 {
2938 	DEFINE_WAIT(wait);
2939 
2940 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2941 	for (;;) {
2942 		if (!timeo)
2943 			break;
2944 		if (signal_pending(current))
2945 			break;
2946 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2947 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2948 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2949 			break;
2950 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2951 			break;
2952 		if (READ_ONCE(sk->sk_err))
2953 			break;
2954 		timeo = schedule_timeout(timeo);
2955 	}
2956 	finish_wait(sk_sleep(sk), &wait);
2957 	return timeo;
2958 }
2959 
2960 
2961 /*
2962  *	Generic send/receive buffer handlers
2963  */
2964 
2965 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2966 				     unsigned long data_len, int noblock,
2967 				     int *errcode, int max_page_order)
2968 {
2969 	struct sk_buff *skb;
2970 	long timeo;
2971 	int err;
2972 
2973 	timeo = sock_sndtimeo(sk, noblock);
2974 	for (;;) {
2975 		err = sock_error(sk);
2976 		if (err != 0)
2977 			goto failure;
2978 
2979 		err = -EPIPE;
2980 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2981 			goto failure;
2982 
2983 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2984 			break;
2985 
2986 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2987 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2988 		err = -EAGAIN;
2989 		if (!timeo)
2990 			goto failure;
2991 		if (signal_pending(current))
2992 			goto interrupted;
2993 		timeo = sock_wait_for_wmem(sk, timeo);
2994 	}
2995 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2996 				   errcode, sk->sk_allocation);
2997 	if (skb)
2998 		skb_set_owner_w(skb, sk);
2999 	return skb;
3000 
3001 interrupted:
3002 	err = sock_intr_errno(timeo);
3003 failure:
3004 	*errcode = err;
3005 	return NULL;
3006 }
3007 EXPORT_SYMBOL(sock_alloc_send_pskb);
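
/* On failure sock_alloc_send_pskb() returns NULL with *errcode set: a pending
 * socket error, -EPIPE after SEND_SHUTDOWN, -EAGAIN when the send buffer is
 * full and the timeout is zero, or sock_intr_errno(timeo) if a signal
 * interrupted the wait.
 */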
3008 
3009 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
3010 		     struct sockcm_cookie *sockc)
3011 {
3012 	u32 tsflags;
3013 
3014 	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
3015 
3016 	switch (cmsg->cmsg_type) {
3017 	case SO_MARK:
3018 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
3019 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3020 			return -EPERM;
3021 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3022 			return -EINVAL;
3023 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
3024 		break;
3025 	case SO_TIMESTAMPING_OLD:
3026 	case SO_TIMESTAMPING_NEW:
3027 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3028 			return -EINVAL;
3029 
3030 		tsflags = *(u32 *)CMSG_DATA(cmsg);
3031 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
3032 			return -EINVAL;
3033 
3034 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
3035 		sockc->tsflags |= tsflags;
3036 		break;
3037 	case SCM_TXTIME:
3038 		if (!sock_flag(sk, SOCK_TXTIME))
3039 			return -EINVAL;
3040 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
3041 			return -EINVAL;
3042 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
3043 		break;
3044 	case SCM_TS_OPT_ID:
3045 		if (sk_is_tcp(sk))
3046 			return -EINVAL;
3047 		tsflags = READ_ONCE(sk->sk_tsflags);
3048 		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
3049 			return -EINVAL;
3050 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3051 			return -EINVAL;
3052 		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
3053 		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
3054 		break;
3055 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
3056 	case SCM_RIGHTS:
3057 	case SCM_CREDENTIALS:
3058 		break;
3059 	case SO_PRIORITY:
3060 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3061 			return -EINVAL;
3062 		if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
3063 			return -EPERM;
3064 		sockc->priority = *(u32 *)CMSG_DATA(cmsg);
3065 		break;
3066 	case SCM_DEVMEM_DMABUF:
3067 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3068 			return -EINVAL;
3069 		sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
3070 		break;
3071 	default:
3072 		return -EINVAL;
3073 	}
3074 	return 0;
3075 }
3076 EXPORT_SYMBOL(__sock_cmsg_send);
3077 
3078 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
3079 		   struct sockcm_cookie *sockc)
3080 {
3081 	struct cmsghdr *cmsg;
3082 	int ret;
3083 
3084 	for_each_cmsghdr(cmsg, msg) {
3085 		if (!CMSG_OK(msg, cmsg))
3086 			return -EINVAL;
3087 		if (cmsg->cmsg_level != SOL_SOCKET)
3088 			continue;
3089 		ret = __sock_cmsg_send(sk, cmsg, sockc);
3090 		if (ret)
3091 			return ret;
3092 	}
3093 	return 0;
3094 }
3095 EXPORT_SYMBOL(sock_cmsg_send);
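
/* Note that sock_cmsg_send() only consumes SOL_SOCKET level control messages;
 * cmsgs at other levels are skipped here and left for the protocol to parse.
 */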
3096 
3097 static void sk_enter_memory_pressure(struct sock *sk)
3098 {
3099 	if (!sk->sk_prot->enter_memory_pressure)
3100 		return;
3101 
3102 	sk->sk_prot->enter_memory_pressure(sk);
3103 }
3104 
3105 static void sk_leave_memory_pressure(struct sock *sk)
3106 {
3107 	if (sk->sk_prot->leave_memory_pressure) {
3108 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
3109 				     tcp_leave_memory_pressure, sk);
3110 	} else {
3111 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
3112 
3113 		if (memory_pressure && READ_ONCE(*memory_pressure))
3114 			WRITE_ONCE(*memory_pressure, 0);
3115 	}
3116 }
3117 
3118 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
3119 
3120 /**
3121  * skb_page_frag_refill - check that a page_frag contains enough room
3122  * @sz: minimum size of the fragment we want to get
3123  * @pfrag: pointer to page_frag
3124  * @gfp: priority for memory allocation
3125  *
3126  * Note: While this allocator tries to use high order pages, there is
3127  * no guarantee that allocations succeed. Therefore, @sz MUST be
3128  * less than or equal to PAGE_SIZE.
3129  */
3130 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3131 {
3132 	if (pfrag->page) {
3133 		if (page_ref_count(pfrag->page) == 1) {
3134 			pfrag->offset = 0;
3135 			return true;
3136 		}
3137 		if (pfrag->offset + sz <= pfrag->size)
3138 			return true;
3139 		put_page(pfrag->page);
3140 	}
3141 
3142 	pfrag->offset = 0;
3143 	if (SKB_FRAG_PAGE_ORDER &&
3144 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3145 		/* Avoid direct reclaim but allow kswapd to wake */
3146 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3147 					  __GFP_COMP | __GFP_NOWARN |
3148 					  __GFP_NORETRY,
3149 					  SKB_FRAG_PAGE_ORDER);
3150 		if (likely(pfrag->page)) {
3151 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3152 			return true;
3153 		}
3154 	}
3155 	pfrag->page = alloc_page(gfp);
3156 	if (likely(pfrag->page)) {
3157 		pfrag->size = PAGE_SIZE;
3158 		return true;
3159 	}
3160 	return false;
3161 }
3162 EXPORT_SYMBOL(skb_page_frag_refill);
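
/* The refill strategy above: reuse the current page when we are its only
 * user or it still has @sz bytes of room, otherwise try a high-order page
 * (SKB_FRAG_PAGE_ORDER) without direct reclaim and fall back to a single
 * page allocation.
 */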
3163 
3164 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3165 {
3166 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3167 		return true;
3168 
3169 	if (!sk->sk_bypass_prot_mem)
3170 		sk_enter_memory_pressure(sk);
3171 
3172 	sk_stream_moderate_sndbuf(sk);
3173 
3174 	return false;
3175 }
3176 EXPORT_SYMBOL(sk_page_frag_refill);
3177 
3178 void __lock_sock(struct sock *sk)
3179 	__releases(&sk->sk_lock.slock)
3180 	__acquires(&sk->sk_lock.slock)
3181 {
3182 	DEFINE_WAIT(wait);
3183 
3184 	for (;;) {
3185 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3186 					TASK_UNINTERRUPTIBLE);
3187 		spin_unlock_bh(&sk->sk_lock.slock);
3188 		schedule();
3189 		spin_lock_bh(&sk->sk_lock.slock);
3190 		if (!sock_owned_by_user(sk))
3191 			break;
3192 	}
3193 	finish_wait(&sk->sk_lock.wq, &wait);
3194 }
3195 
3196 void __release_sock(struct sock *sk)
3197 	__releases(&sk->sk_lock.slock)
3198 	__acquires(&sk->sk_lock.slock)
3199 {
3200 	struct sk_buff *skb, *next;
3201 	int nb = 0;
3202 
3203 	while ((skb = sk->sk_backlog.head) != NULL) {
3204 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3205 
3206 		spin_unlock_bh(&sk->sk_lock.slock);
3207 
3208 		while (1) {
3209 			next = skb->next;
3210 			prefetch(next);
3211 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3212 			skb_mark_not_on_list(skb);
3213 			sk_backlog_rcv(sk, skb);
3214 
3215 			skb = next;
3216 			if (!skb)
3217 				break;
3218 
3219 			if (!(++nb & 15))
3220 				cond_resched();
3221 		}
3222 
3223 		spin_lock_bh(&sk->sk_lock.slock);
3224 	}
3225 
3226 	/*
3227 	 * Doing the zeroing here guarantees we cannot loop forever
3228 	 * while a wild producer attempts to flood us.
3229 	 */
3230 	sk->sk_backlog.len = 0;
3231 }
3232 
3233 void __sk_flush_backlog(struct sock *sk)
3234 {
3235 	spin_lock_bh(&sk->sk_lock.slock);
3236 	__release_sock(sk);
3237 
3238 	if (sk->sk_prot->release_cb)
3239 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3240 				     tcp_release_cb, sk);
3241 
3242 	spin_unlock_bh(&sk->sk_lock.slock);
3243 }
3244 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3245 
3246 /**
3247  * sk_wait_data - wait for data to arrive at sk_receive_queue
3248  * @sk:    sock to wait on
3249  * @timeo: for how long
3250  * @skb:   last skb seen on sk_receive_queue
3251  *
3252  * Now the socket state, including sk->sk_err, is changed only under the lock,
3253  * hence we may omit checks after joining the wait queue.
3254  * We check the receive queue before schedule() only as an optimization;
3255  * it is very likely that release_sock() added new data.
3256  */
3257 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3258 {
3259 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3260 	int rc;
3261 
3262 	add_wait_queue(sk_sleep(sk), &wait);
3263 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3264 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3265 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3266 	remove_wait_queue(sk_sleep(sk), &wait);
3267 	return rc;
3268 }
3269 EXPORT_SYMBOL(sk_wait_data);
3270 
3271 /**
3272  *	__sk_mem_raise_allocated - increase memory_allocated
3273  *	@sk: socket
3274  *	@size: memory size to allocate
3275  *	@amt: pages to allocate
3276  *	@kind: allocation type
3277  *
3278  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3279  *
3280  *	Unlike the globally shared limits among the sockets under the same protocol,
3281  *	consuming the budget of a memcg won't have a direct effect on other ones.
3282  *	So be optimistic about memcg's tolerance, and leave the callers to decide
3283  *	whether or not to raise allocated through sk_under_memory_pressure() or
3284  *	its variants.
3285  */
3286 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3287 {
3288 	bool memcg_enabled = false, charged = false;
3289 	struct proto *prot = sk->sk_prot;
3290 	long allocated = 0;
3291 
3292 	if (!sk->sk_bypass_prot_mem) {
3293 		sk_memory_allocated_add(sk, amt);
3294 		allocated = sk_memory_allocated(sk);
3295 	}
3296 
3297 	if (mem_cgroup_sk_enabled(sk)) {
3298 		memcg_enabled = true;
3299 		charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
3300 		if (!charged)
3301 			goto suppress_allocation;
3302 	}
3303 
3304 	if (!allocated)
3305 		return 1;
3306 
3307 	/* Under limit. */
3308 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3309 		sk_leave_memory_pressure(sk);
3310 		return 1;
3311 	}
3312 
3313 	/* Under pressure. */
3314 	if (allocated > sk_prot_mem_limits(sk, 1))
3315 		sk_enter_memory_pressure(sk);
3316 
3317 	/* Over hard limit. */
3318 	if (allocated > sk_prot_mem_limits(sk, 2))
3319 		goto suppress_allocation;
3320 
3321 	/* Guarantee minimum buffer size under pressure (either global
3322 	 * or memcg) to make sure features described in RFC 7323 (TCP
3323 	 * Extensions for High Performance) work properly.
3324 	 *
3325 	 * This rule does NOT apply when the allocation exceeds the global or
3326 	 * memcg hard limit, or else a DoS attack could take place by spawning
3327 	 * lots of sockets whose usage stays under the minimum buffer size.
3328 	 */
3329 	if (kind == SK_MEM_RECV) {
3330 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3331 			return 1;
3332 
3333 	} else { /* SK_MEM_SEND */
3334 		int wmem0 = sk_get_wmem0(sk, prot);
3335 
3336 		if (sk->sk_type == SOCK_STREAM) {
3337 			if (sk->sk_wmem_queued < wmem0)
3338 				return 1;
3339 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3340 			return 1;
3341 		}
3342 	}
3343 
3344 	if (sk_has_memory_pressure(sk)) {
3345 		u64 alloc;
3346 
3347 		/* The following 'average' heuristic is within the
3348 		 * scope of global accounting, so it only makes
3349 		 * sense for global memory pressure.
3350 		 */
3351 		if (!sk_under_global_memory_pressure(sk))
3352 			return 1;
3353 
3354 		/* Try to be fair among all the sockets under global
3355 		 * pressure by allowing the ones whose usage is below
3356 		 * average to raise their allocation.
3357 		 */
3358 		alloc = sk_sockets_allocated_read_positive(sk);
3359 		if (sk_prot_mem_limits(sk, 2) > alloc *
3360 		    sk_mem_pages(sk->sk_wmem_queued +
3361 				 atomic_read(&sk->sk_rmem_alloc) +
3362 				 sk->sk_forward_alloc))
3363 			return 1;
3364 	}
3365 
3366 suppress_allocation:
3367 
3368 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3369 		sk_stream_moderate_sndbuf(sk);
3370 
3371 		/* Fail only if socket is _under_ its sndbuf.
3372 		 * In this case we cannot block, so we have to fail.
3373 		 */
3374 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3375 			/* Force charge with __GFP_NOFAIL */
3376 			if (memcg_enabled && !charged)
3377 				mem_cgroup_sk_charge(sk, amt,
3378 						     gfp_memcg_charge() | __GFP_NOFAIL);
3379 			return 1;
3380 		}
3381 	}
3382 
3383 	trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3384 
3385 	if (allocated)
3386 		sk_memory_allocated_sub(sk, amt);
3387 
3388 	if (charged)
3389 		mem_cgroup_sk_uncharge(sk, amt);
3390 
3391 	return 0;
3392 }
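
/* Note: sk_prot_mem_limits(sk, 0/1/2) above map to the protocol's three page
 * thresholds (e.g. the min/pressure/max values of the tcp_mem sysctl), while
 * the memcg charge is accounted independently of those global limits.
 */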
3393 
3394 /**
3395  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3396  *	@sk: socket
3397  *	@size: memory size to allocate
3398  *	@kind: allocation type
3399  *
3400  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3401  *	rmem allocation. This function assumes that protocols which have
3402  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3403  */
3404 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3405 {
3406 	int ret, amt = sk_mem_pages(size);
3407 
3408 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3409 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3410 	if (!ret)
3411 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3412 	return ret;
3413 }
3414 EXPORT_SYMBOL(__sk_mem_schedule);
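
/* A worked example, assuming 4 KiB pages: __sk_mem_schedule(sk, 100, SK_MEM_SEND)
 * computes sk_mem_pages(100) = 1, optimistically adds 4096 bytes to
 * sk_forward_alloc, and rolls that back only if __sk_mem_raise_allocated()
 * refuses the extra page.
 */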
3415 
3416 /**
3417  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3418  *	@sk: socket
3419  *	@amount: number of quanta
3420  *
3421  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3422  */
3423 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3424 {
3425 	if (mem_cgroup_sk_enabled(sk))
3426 		mem_cgroup_sk_uncharge(sk, amount);
3427 
3428 	if (sk->sk_bypass_prot_mem)
3429 		return;
3430 
3431 	sk_memory_allocated_sub(sk, amount);
3432 
3433 	if (sk_under_global_memory_pressure(sk) &&
3434 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3435 		sk_leave_memory_pressure(sk);
3436 }
3437 
3438 /**
3439  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3440  *	@sk: socket
3441  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3442  */
3443 void __sk_mem_reclaim(struct sock *sk, int amount)
3444 {
3445 	amount >>= PAGE_SHIFT;
3446 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3447 	__sk_mem_reduce_allocated(sk, amount);
3448 }
3449 EXPORT_SYMBOL(__sk_mem_reclaim);
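
/* Example, again assuming 4 KiB pages: __sk_mem_reclaim(sk, 10000) releases
 * 10000 >> PAGE_SHIFT = 2 pages (8192 bytes) from sk_forward_alloc and the
 * global/memcg accounting; the sub-page remainder stays in sk_forward_alloc.
 */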
3450 
3451 void __sk_charge(struct sock *sk, gfp_t gfp)
3452 {
3453 	int amt;
3454 
3455 	gfp |= __GFP_NOFAIL;
3456 	if (mem_cgroup_from_sk(sk)) {
3457 		/* The socket has not been accepted yet, no need
3458 		 * to look at newsk->sk_wmem_queued.
3459 		 */
3460 		amt = sk_mem_pages(sk->sk_forward_alloc +
3461 				   atomic_read(&sk->sk_rmem_alloc));
3462 		if (amt)
3463 			mem_cgroup_sk_charge(sk, amt, gfp);
3464 	}
3465 
3466 	kmem_cache_charge(sk, gfp);
3467 }
3468 
3469 int sk_set_peek_off(struct sock *sk, int val)
3470 {
3471 	WRITE_ONCE(sk->sk_peek_off, val);
3472 	return 0;
3473 }
3474 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3475 
3476 /*
3477  * Set of default routines for initialising struct proto_ops when
3478  * the protocol does not support a particular function. In certain
3479  * cases where it makes no sense for a protocol to have a "do nothing"
3480  * function, some default processing is provided.
3481  */
3482 
3483 int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len)
3484 {
3485 	return -EOPNOTSUPP;
3486 }
3487 EXPORT_SYMBOL(sock_no_bind);
3488 
3489 int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr,
3490 		    int len, int flags)
3491 {
3492 	return -EOPNOTSUPP;
3493 }
3494 EXPORT_SYMBOL(sock_no_connect);
3495 
3496 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3497 {
3498 	return -EOPNOTSUPP;
3499 }
3500 EXPORT_SYMBOL(sock_no_socketpair);
3501 
3502 int sock_no_accept(struct socket *sock, struct socket *newsock,
3503 		   struct proto_accept_arg *arg)
3504 {
3505 	return -EOPNOTSUPP;
3506 }
3507 EXPORT_SYMBOL(sock_no_accept);
3508 
3509 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3510 		    int peer)
3511 {
3512 	return -EOPNOTSUPP;
3513 }
3514 EXPORT_SYMBOL(sock_no_getname);
3515 
3516 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3517 {
3518 	return -EOPNOTSUPP;
3519 }
3520 EXPORT_SYMBOL(sock_no_ioctl);
3521 
3522 int sock_no_listen(struct socket *sock, int backlog)
3523 {
3524 	return -EOPNOTSUPP;
3525 }
3526 EXPORT_SYMBOL(sock_no_listen);
3527 
3528 int sock_no_shutdown(struct socket *sock, int how)
3529 {
3530 	return -EOPNOTSUPP;
3531 }
3532 EXPORT_SYMBOL(sock_no_shutdown);
3533 
3534 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3535 {
3536 	return -EOPNOTSUPP;
3537 }
3538 EXPORT_SYMBOL(sock_no_sendmsg);
3539 
3540 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3541 {
3542 	return -EOPNOTSUPP;
3543 }
3544 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3545 
3546 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3547 		    int flags)
3548 {
3549 	return -EOPNOTSUPP;
3550 }
3551 EXPORT_SYMBOL(sock_no_recvmsg);
3552 
3553 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3554 {
3555 	/* Mirror missing mmap method error code */
3556 	return -ENODEV;
3557 }
3558 EXPORT_SYMBOL(sock_no_mmap);
3559 
3560 /*
3561  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3562  * various sock-based usage counts.
3563  */
3564 void __receive_sock(struct file *file)
3565 {
3566 	struct socket *sock;
3567 
3568 	sock = sock_from_file(file);
3569 	if (sock) {
3570 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3571 		sock_update_classid(&sock->sk->sk_cgrp_data);
3572 	}
3573 }
3574 
3575 /*
3576  *	Default Socket Callbacks
3577  */
3578 
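/* The sock_def_*() callbacks below are the defaults installed by
 * sock_init_data_uid(); protocols are free to override individual ones
 * (sk->sk_data_ready, sk->sk_write_space, ...) after the socket is set up.
 */
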
3579 static void sock_def_wakeup(struct sock *sk)
3580 {
3581 	struct socket_wq *wq;
3582 
3583 	rcu_read_lock();
3584 	wq = rcu_dereference(sk->sk_wq);
3585 	if (skwq_has_sleeper(wq))
3586 		wake_up_interruptible_all(&wq->wait);
3587 	rcu_read_unlock();
3588 }
3589 
3590 static void sock_def_error_report(struct sock *sk)
3591 {
3592 	struct socket_wq *wq;
3593 
3594 	rcu_read_lock();
3595 	wq = rcu_dereference(sk->sk_wq);
3596 	if (skwq_has_sleeper(wq))
3597 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3598 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3599 	rcu_read_unlock();
3600 }
3601 
3602 void sock_def_readable(struct sock *sk)
3603 {
3604 	struct socket_wq *wq;
3605 
3606 	trace_sk_data_ready(sk);
3607 
3608 	rcu_read_lock();
3609 	wq = rcu_dereference(sk->sk_wq);
3610 	if (skwq_has_sleeper(wq))
3611 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3612 						EPOLLRDNORM | EPOLLRDBAND);
3613 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3614 	rcu_read_unlock();
3615 }
3616 
3617 static void sock_def_write_space(struct sock *sk)
3618 {
3619 	struct socket_wq *wq;
3620 
3621 	rcu_read_lock();
3622 
3623 	/* Do not wake up a writer until he can make "significant"
3624 	 * progress.  --DaveM
3625 	 */
3626 	if (sock_writeable(sk)) {
3627 		wq = rcu_dereference(sk->sk_wq);
3628 		if (skwq_has_sleeper(wq))
3629 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3630 						EPOLLWRNORM | EPOLLWRBAND);
3631 
3632 		/* Should agree with poll, otherwise some programs break */
3633 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3634 	}
3635 
3636 	rcu_read_unlock();
3637 }
3638 
3639 /* An optimised version of sock_def_write_space(); it should only be called
3640  * for SOCK_RCU_FREE sockets under an RCU read-side section and after putting
3641  * ->sk_wmem_alloc.
3642  */
3643 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc)
3644 {
3645 	/* Do not wake up a writer until he can make "significant"
3646 	 * progress.  --DaveM
3647 	 */
3648 	if (__sock_writeable(sk, wmem_alloc)) {
3649 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3650 
3651 		/* rely on refcount_sub from sock_wfree() */
3652 		smp_mb__after_atomic();
3653 		if (wq && waitqueue_active(&wq->wait))
3654 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3655 						EPOLLWRNORM | EPOLLWRBAND);
3656 
3657 		/* Should agree with poll, otherwise some programs break */
3658 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3659 	}
3660 }
3661 
3662 static void sock_def_destruct(struct sock *sk)
3663 {
3664 }
3665 
3666 void sk_send_sigurg(struct sock *sk)
3667 {
3668 	if (sk->sk_socket && sk->sk_socket->file)
3669 		if (send_sigurg(sk->sk_socket->file))
3670 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3671 }
3672 EXPORT_SYMBOL(sk_send_sigurg);
3673 
3674 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3675 		    unsigned long expires)
3676 {
3677 	if (!mod_timer(timer, expires))
3678 		sock_hold(sk);
3679 }
3680 EXPORT_SYMBOL(sk_reset_timer);
3681 
3682 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3683 {
3684 	if (timer_delete(timer))
3685 		__sock_put(sk);
3686 }
3687 EXPORT_SYMBOL(sk_stop_timer);
3688 
3689 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3690 {
3691 	if (timer_delete_sync(timer))
3692 		__sock_put(sk);
3693 }
3694 EXPORT_SYMBOL(sk_stop_timer_sync);
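
/* The three helpers above keep the socket refcount in step with the timer:
 * sk_reset_timer() takes a reference only when mod_timer() reports the timer
 * was not already pending, and the stop variants drop it via __sock_put()
 * only when a pending timer was actually deleted.
 */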
3695 
3696 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3697 {
3698 	sk_init_common(sk);
3699 	sk->sk_send_head	=	NULL;
3700 
3701 	timer_setup(&sk->sk_timer, NULL, 0);
3702 
3703 	sk->sk_allocation	=	GFP_KERNEL;
3704 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3705 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3706 	sk->sk_state		=	TCP_CLOSE;
3707 	sk->sk_use_task_frag	=	true;
3708 	sk_set_socket(sk, sock);
3709 
3710 	sock_set_flag(sk, SOCK_ZAPPED);
3711 
3712 	if (sock) {
3713 		sk->sk_type	=	sock->type;
3714 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3715 		sock->sk	=	sk;
3716 	} else {
3717 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3718 	}
3719 	sk->sk_uid	=	uid;
3720 
3721 	sk->sk_state_change	=	sock_def_wakeup;
3722 	sk->sk_data_ready	=	sock_def_readable;
3723 	sk->sk_write_space	=	sock_def_write_space;
3724 	sk->sk_error_report	=	sock_def_error_report;
3725 	sk->sk_destruct		=	sock_def_destruct;
3726 
3727 	sk->sk_frag.page	=	NULL;
3728 	sk->sk_frag.offset	=	0;
3729 	sk->sk_peek_off		=	-1;
3730 
3731 	sk->sk_peer_pid 	=	NULL;
3732 	sk->sk_peer_cred	=	NULL;
3733 	spin_lock_init(&sk->sk_peer_lock);
3734 
3735 	sk->sk_write_pending	=	0;
3736 	sk->sk_rcvlowat		=	1;
3737 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3738 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3739 
3740 	sk->sk_stamp = SK_DEFAULT_STAMP;
3741 #if BITS_PER_LONG==32
3742 	seqlock_init(&sk->sk_stamp_seq);
3743 #endif
3744 	atomic_set(&sk->sk_zckey, 0);
3745 
3746 #ifdef CONFIG_NET_RX_BUSY_POLL
3747 	sk->sk_napi_id		=	0;
3748 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3749 #endif
3750 
3751 	sk->sk_max_pacing_rate = ~0UL;
3752 	sk->sk_pacing_rate = ~0UL;
3753 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3754 	sk->sk_incoming_cpu = -1;
3755 
3756 	sk_rx_queue_clear(sk);
3757 	/*
3758 	 * Before updating sk_refcnt, we must commit prior changes to memory
3759 	 * (Documentation/RCU/rculist_nulls.rst for details)
3760 	 */
3761 	smp_wmb();
3762 	refcount_set(&sk->sk_refcnt, 1);
3763 	sk_drops_reset(sk);
3764 }
3765 EXPORT_SYMBOL(sock_init_data_uid);
3766 
3767 void sock_init_data(struct socket *sock, struct sock *sk)
3768 {
3769 	kuid_t uid = sock ?
3770 		SOCK_INODE(sock)->i_uid :
3771 		make_kuid(sock_net(sk)->user_ns, 0);
3772 
3773 	sock_init_data_uid(sock, sk, uid);
3774 }
3775 EXPORT_SYMBOL(sock_init_data);
3776 
3777 void lock_sock_nested(struct sock *sk, int subclass)
3778 {
3779 	/* The sk_lock has mutex_lock() semantics here. */
3780 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3781 
3782 	might_sleep();
3783 	spin_lock_bh(&sk->sk_lock.slock);
3784 	if (sock_owned_by_user_nocheck(sk))
3785 		__lock_sock(sk);
3786 	sk->sk_lock.owned = 1;
3787 	spin_unlock_bh(&sk->sk_lock.slock);
3788 }
3789 EXPORT_SYMBOL(lock_sock_nested);
3790 
3791 void release_sock(struct sock *sk)
3792 {
3793 	spin_lock_bh(&sk->sk_lock.slock);
3794 	if (sk->sk_backlog.tail)
3795 		__release_sock(sk);
3796 
3797 	if (sk->sk_prot->release_cb)
3798 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3799 				     tcp_release_cb, sk);
3800 
3801 	sock_release_ownership(sk);
3802 	if (waitqueue_active(&sk->sk_lock.wq))
3803 		wake_up(&sk->sk_lock.wq);
3804 	spin_unlock_bh(&sk->sk_lock.slock);
3805 }
3806 EXPORT_SYMBOL(release_sock);
3807 
3808 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3809 {
3810 	might_sleep();
3811 	spin_lock_bh(&sk->sk_lock.slock);
3812 
3813 	if (!sock_owned_by_user_nocheck(sk)) {
3814 		/*
3815 		 * Fast path return with bottom halves disabled and
3816 		 * sock::sk_lock.slock held.
3817 		 *
3818 		 * The 'mutex' is not contended and holding
3819 		 * sock::sk_lock.slock prevents all other lockers to
3820 		 * proceed so the corresponding unlock_sock_fast() can
3821 		 * avoid the slow path of release_sock() completely and
3822 		 * just release slock.
3823 		 *
3824 		 * From a semantical POV this is equivalent to 'acquiring'
3825 		 * the 'mutex', hence the corresponding lockdep
3826 		 * mutex_release() has to happen in the fast path of
3827 		 * unlock_sock_fast().
3828 		 */
3829 		return false;
3830 	}
3831 
3832 	__lock_sock(sk);
3833 	sk->sk_lock.owned = 1;
3834 	__acquire(&sk->sk_lock.slock);
3835 	spin_unlock_bh(&sk->sk_lock.slock);
3836 	return true;
3837 }
3838 EXPORT_SYMBOL(__lock_sock_fast);
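
/* A minimal usage sketch: the boolean returned through lock_sock_fast() must
 * be handed back to unlock_sock_fast() so it knows whether the slow path
 * (a fully owned sk_lock) was taken.
 *
 *	slow = lock_sock_fast(sk);
 *	...
 *	unlock_sock_fast(sk, slow);
 */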
3839 
3840 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3841 		   bool timeval, bool time32)
3842 {
3843 	struct sock *sk = sock->sk;
3844 	struct timespec64 ts;
3845 
3846 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3847 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3848 	if (ts.tv_sec == -1)
3849 		return -ENOENT;
3850 	if (ts.tv_sec == 0) {
3851 		ktime_t kt = ktime_get_real();
3852 		sock_write_timestamp(sk, kt);
3853 		ts = ktime_to_timespec64(kt);
3854 	}
3855 
3856 	if (timeval)
3857 		ts.tv_nsec /= 1000;
3858 
3859 #ifdef CONFIG_COMPAT_32BIT_TIME
3860 	if (time32)
3861 		return put_old_timespec32(&ts, userstamp);
3862 #endif
3863 #ifdef CONFIG_SPARC64
3864 	/* beware of padding in sparc64 timeval */
3865 	if (timeval && !in_compat_syscall()) {
3866 		struct __kernel_old_timeval __user tv = {
3867 			.tv_sec = ts.tv_sec,
3868 			.tv_usec = ts.tv_nsec,
3869 		};
3870 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3871 			return -EFAULT;
3872 		return 0;
3873 	}
3874 #endif
3875 	return put_timespec64(&ts, userstamp);
3876 }
3877 EXPORT_SYMBOL(sock_gettstamp);
3878 
3879 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3880 {
3881 	if (!sock_flag(sk, flag)) {
3882 		unsigned long previous_flags = sk->sk_flags;
3883 
3884 		sock_set_flag(sk, flag);
3885 		/*
3886 		 * we just set one of the two flags which require net
3887 		 * time stamping, but time stamping might have been on
3888 		 * already because of the other one
3889 		 */
3890 		if (sock_needs_netstamp(sk) &&
3891 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3892 			net_enable_timestamp();
3893 	}
3894 }
3895 
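/* Dequeue one skb from the socket error queue, copy its payload into @msg,
 * attach the extended error as a cmsg of @level/@type and set MSG_ERRQUEUE.
 * Returns the number of bytes copied, or -EAGAIN if the queue is empty.
 */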
3896 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3897 		       int level, int type)
3898 {
3899 	struct sock_exterr_skb *serr;
3900 	struct sk_buff *skb;
3901 	int copied, err;
3902 
3903 	err = -EAGAIN;
3904 	skb = sock_dequeue_err_skb(sk);
3905 	if (skb == NULL)
3906 		goto out;
3907 
3908 	copied = skb->len;
3909 	if (copied > len) {
3910 		msg->msg_flags |= MSG_TRUNC;
3911 		copied = len;
3912 	}
3913 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3914 	if (err)
3915 		goto out_free_skb;
3916 
3917 	sock_recv_timestamp(msg, sk, skb);
3918 
3919 	serr = SKB_EXT_ERR(skb);
3920 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3921 
3922 	msg->msg_flags |= MSG_ERRQUEUE;
3923 	err = copied;
3924 
3925 out_free_skb:
3926 	kfree_skb(skb);
3927 out:
3928 	return err;
3929 }
3930 EXPORT_SYMBOL(sock_recv_errqueue);
3931 
3932 /*
3933  *	Get a socket option on a socket.
3934  *
3935  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3936  *	asynchronous errors should be reported by getsockopt. We assume
3937  *	this means only if you specify SO_ERROR (otherwise what is the point of it?).
3938  */
3939 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3940 			   char __user *optval, int __user *optlen)
3941 {
3942 	struct sock *sk = sock->sk;
3943 
3944 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3945 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3946 }
3947 EXPORT_SYMBOL(sock_common_getsockopt);
3948 
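/* Generic recvmsg: call the protocol's recvmsg and, when the call succeeds,
 * propagate the reported source address length into msg_namelen.
 */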
3949 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3950 			int flags)
3951 {
3952 	struct sock *sk = sock->sk;
3953 	int addr_len = 0;
3954 	int err;
3955 
3956 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3957 	if (err >= 0)
3958 		msg->msg_namelen = addr_len;
3959 	return err;
3960 }
3961 EXPORT_SYMBOL(sock_common_recvmsg);
3962 
3963 /*
3964  *	Set socket options on an inet socket.
3965  */
3966 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3967 			   sockptr_t optval, unsigned int optlen)
3968 {
3969 	struct sock *sk = sock->sk;
3970 
3971 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3972 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3973 }
3974 EXPORT_SYMBOL(sock_common_setsockopt);
3975 
3976 void sk_common_release(struct sock *sk)
3977 {
3978 	if (sk->sk_prot->destroy)
3979 		sk->sk_prot->destroy(sk);
3980 
3981 	/*
3982 	 * Observation: when sk_common_release() is called, user processes
3983 	 * no longer have access to the socket, but the network stack still does.
3984 	 * Step one, detach it from networking:
3985 	 *
3986 	 * A. Remove it from the hash tables.
3987 	 */
3988 
3989 	sk->sk_prot->unhash(sk);
3990 
3991 	/*
3992 	 * At this point the socket cannot receive new packets, but some may
3993 	 * still be in flight because another CPU ran the receive path and did
3994 	 * its hash table lookup before we unhashed the socket. Those packets
3995 	 * will reach the receive queue and be purged by the socket destructor.
3996 	 *
3997 	 * We may also still have packets pending on the receive queue and,
3998 	 * probably, our own packets waiting in device queues. sock_destroy
3999 	 * will drain the receive queue, but transmitted packets will delay
4000 	 * socket destruction until the last reference is released.
4001 	 */
4002 
4003 	sock_orphan(sk);
4004 
4005 	xfrm_sk_free_policy(sk);
4006 
4007 	sock_put(sk);
4008 }
4009 EXPORT_SYMBOL(sk_common_release);
4010 
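/* Fill @mem (an array of SK_MEMINFO_VARS u32 values) with a snapshot of the
 * socket's memory accounting counters, indexed by the SK_MEMINFO_* constants.
 */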
4011 void sk_get_meminfo(const struct sock *sk, u32 *mem)
4012 {
4013 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
4014 
4015 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
4016 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
4017 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
4018 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
4019 	mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
4020 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
4021 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
4022 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
4023 	mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
4024 }
4025 
4026 #ifdef CONFIG_PROC_FS
4027 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
4028 
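/* Sum the per-cpu "sockets in use" counters for @prot in @net; transient
 * negative sums are clamped to zero.
 */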
4029 int sock_prot_inuse_get(struct net *net, struct proto *prot)
4030 {
4031 	int cpu, idx = prot->inuse_idx;
4032 	int res = 0;
4033 
4034 	for_each_possible_cpu(cpu)
4035 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
4036 
4037 	return res >= 0 ? res : 0;
4038 }
4039 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
4040 
4041 int sock_inuse_get(struct net *net)
4042 {
4043 	int cpu, res = 0;
4044 
4045 	for_each_possible_cpu(cpu)
4046 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
4047 
4048 	return res;
4049 }
4050 
4051 EXPORT_SYMBOL_GPL(sock_inuse_get);
4052 
4053 static int __net_init sock_inuse_init_net(struct net *net)
4054 {
4055 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
4056 	if (net->core.prot_inuse == NULL)
4057 		return -ENOMEM;
4058 	return 0;
4059 }
4060 
4061 static void __net_exit sock_inuse_exit_net(struct net *net)
4062 {
4063 	free_percpu(net->core.prot_inuse);
4064 }
4065 
4066 static struct pernet_operations net_inuse_ops = {
4067 	.init = sock_inuse_init_net,
4068 	.exit = sock_inuse_exit_net,
4069 };
4070 
4071 static __init int net_inuse_init(void)
4072 {
4073 	if (register_pernet_subsys(&net_inuse_ops))
4074 		panic("Cannot initialize net inuse counters");
4075 
4076 	return 0;
4077 }
4078 
4079 core_initcall(net_inuse_init);
4080 
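/* Reserve a free slot in proto_inuse_idx for this protocol's per-cpu in-use
 * counters (used by /proc/net/protocols); fails with -ENOSPC once all
 * PROTO_INUSE_NR slots are taken.
 */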
4081 static int assign_proto_idx(struct proto *prot)
4082 {
4083 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
4084 
4085 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
4086 		pr_err("PROTO_INUSE_NR exhausted\n");
4087 		return -ENOSPC;
4088 	}
4089 
4090 	set_bit(prot->inuse_idx, proto_inuse_idx);
4091 	return 0;
4092 }
4093 
4094 static void release_proto_idx(struct proto *prot)
4095 {
4096 	if (prot->inuse_idx != PROTO_INUSE_NR)
4097 		clear_bit(prot->inuse_idx, proto_inuse_idx);
4098 }
4099 #else
4100 static inline int assign_proto_idx(struct proto *prot)
4101 {
4102 	return 0;
4103 }
4104 
4105 static inline void release_proto_idx(struct proto *prot)
4106 {
4107 }
4108 
4109 #endif
4110 
4111 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
4112 {
4113 	if (!twsk_prot)
4114 		return;
4115 	kfree(twsk_prot->twsk_slab_name);
4116 	twsk_prot->twsk_slab_name = NULL;
4117 	kmem_cache_destroy(twsk_prot->twsk_slab);
4118 	twsk_prot->twsk_slab = NULL;
4119 }
4120 
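/* Create the "tw_sock_<proto>" slab cache used for this protocol's timewait
 * sockets, if the protocol defines twsk_prot.
 */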
4121 static int tw_prot_init(const struct proto *prot)
4122 {
4123 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
4124 
4125 	if (!twsk_prot)
4126 		return 0;
4127 
4128 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
4129 					      prot->name);
4130 	if (!twsk_prot->twsk_slab_name)
4131 		return -ENOMEM;
4132 
4133 	twsk_prot->twsk_slab =
4134 		kmem_cache_create(twsk_prot->twsk_slab_name,
4135 				  twsk_prot->twsk_obj_size, 0,
4136 				  SLAB_ACCOUNT | prot->slab_flags,
4137 				  NULL);
4138 	if (!twsk_prot->twsk_slab) {
4139 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
4140 			prot->name);
4141 		return -ENOMEM;
4142 	}
4143 
4144 	return 0;
4145 }
4146 
4147 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
4148 {
4149 	if (!rsk_prot)
4150 		return;
4151 	kfree(rsk_prot->slab_name);
4152 	rsk_prot->slab_name = NULL;
4153 	kmem_cache_destroy(rsk_prot->slab);
4154 	rsk_prot->slab = NULL;
4155 }
4156 
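/* Create the "request_sock_<proto>" slab cache used for this protocol's
 * connection request sockets, if the protocol defines rsk_prot.
 */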
4157 static int req_prot_init(const struct proto *prot)
4158 {
4159 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
4160 
4161 	if (!rsk_prot)
4162 		return 0;
4163 
4164 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4165 					prot->name);
4166 	if (!rsk_prot->slab_name)
4167 		return -ENOMEM;
4168 
4169 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4170 					   rsk_prot->obj_size, 0,
4171 					   SLAB_ACCOUNT | prot->slab_flags,
4172 					   NULL);
4173 
4174 	if (!rsk_prot->slab) {
4175 		pr_crit("%s: Can't create request sock SLAB cache!\n",
4176 			prot->name);
4177 		return -ENOMEM;
4178 	}
4179 	return 0;
4180 }
4181 
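/* Register a protocol with the socket layer: optionally create its sock,
 * request_sock and timewait slab caches, reserve a /proc inuse index and
 * add it to proto_list.
 */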
4182 int proto_register(struct proto *prot, int alloc_slab)
4183 {
4184 	int ret = -ENOBUFS;
4185 
4186 	if (prot->memory_allocated && !prot->sysctl_mem) {
4187 		pr_err("%s: missing sysctl_mem\n", prot->name);
4188 		return -EINVAL;
4189 	}
4190 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4191 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4192 		return -EINVAL;
4193 	}
4194 	if (alloc_slab) {
4195 		prot->slab = kmem_cache_create_usercopy(prot->name,
4196 					prot->obj_size, 0,
4197 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4198 					prot->slab_flags,
4199 					prot->useroffset, prot->usersize,
4200 					NULL);
4201 
4202 		if (prot->slab == NULL) {
4203 			pr_crit("%s: Can't create sock SLAB cache!\n",
4204 				prot->name);
4205 			goto out;
4206 		}
4207 
4208 		if (req_prot_init(prot))
4209 			goto out_free_request_sock_slab;
4210 
4211 		if (tw_prot_init(prot))
4212 			goto out_free_timewait_sock_slab;
4213 	}
4214 
4215 	mutex_lock(&proto_list_mutex);
4216 	ret = assign_proto_idx(prot);
4217 	if (ret) {
4218 		mutex_unlock(&proto_list_mutex);
4219 		goto out_free_timewait_sock_slab;
4220 	}
4221 	list_add(&prot->node, &proto_list);
4222 	mutex_unlock(&proto_list_mutex);
4223 	return ret;
4224 
4225 out_free_timewait_sock_slab:
4226 	if (alloc_slab)
4227 		tw_prot_cleanup(prot->twsk_prot);
4228 out_free_request_sock_slab:
4229 	if (alloc_slab) {
4230 		req_prot_cleanup(prot->rsk_prot);
4231 
4232 		kmem_cache_destroy(prot->slab);
4233 		prot->slab = NULL;
4234 	}
4235 out:
4236 	return ret;
4237 }
4238 EXPORT_SYMBOL(proto_register);
4239 
4240 void proto_unregister(struct proto *prot)
4241 {
4242 	mutex_lock(&proto_list_mutex);
4243 	release_proto_idx(prot);
4244 	list_del(&prot->node);
4245 	mutex_unlock(&proto_list_mutex);
4246 
4247 	kmem_cache_destroy(prot->slab);
4248 	prot->slab = NULL;
4249 
4250 	req_prot_cleanup(prot->rsk_prot);
4251 	tw_prot_cleanup(prot->twsk_prot);
4252 }
4253 EXPORT_SYMBOL(proto_unregister);
4254 
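/* Request the NETLINK_SOCK_DIAG handler module for @family (and @protocol,
 * if non-zero), after a sanity check that the family or inet protocol is
 * actually registered.
 */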
4255 int sock_load_diag_module(int family, int protocol)
4256 {
4257 	if (!protocol) {
4258 		if (!sock_is_registered(family))
4259 			return -ENOENT;
4260 
4261 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4262 				      NETLINK_SOCK_DIAG, family);
4263 	}
4264 
4265 #ifdef CONFIG_INET
4266 	if (family == AF_INET &&
4267 	    protocol != IPPROTO_RAW &&
4268 	    protocol < MAX_INET_PROTOS &&
4269 	    !rcu_access_pointer(inet_protos[protocol]))
4270 		return -ENOENT;
4271 #endif
4272 
4273 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4274 			      NETLINK_SOCK_DIAG, family, protocol);
4275 }
4276 EXPORT_SYMBOL(sock_load_diag_module);
4277 
4278 #ifdef CONFIG_PROC_FS
4279 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4280 	__acquires(proto_list_mutex)
4281 {
4282 	mutex_lock(&proto_list_mutex);
4283 	return seq_list_start_head(&proto_list, *pos);
4284 }
4285 
4286 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4287 {
4288 	return seq_list_next(v, &proto_list, pos);
4289 }
4290 
4291 static void proto_seq_stop(struct seq_file *seq, void *v)
4292 	__releases(proto_list_mutex)
4293 {
4294 	mutex_unlock(&proto_list_mutex);
4295 }
4296 
4297 static char proto_method_implemented(const void *method)
4298 {
4299 	return method == NULL ? 'n' : 'y';
4300 }
4301 static long sock_prot_memory_allocated(struct proto *proto)
4302 {
4303 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4304 }
4305 
4306 static const char *sock_prot_memory_pressure(struct proto *proto)
4307 {
4308 	return proto->memory_pressure != NULL ?
4309 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4310 }
4311 
4312 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4313 {
4314 
4315 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4316 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4317 		   proto->name,
4318 		   proto->obj_size,
4319 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4320 		   sock_prot_memory_allocated(proto),
4321 		   sock_prot_memory_pressure(proto),
4322 		   proto->max_header,
4323 		   proto->slab == NULL ? "no" : "yes",
4324 		   module_name(proto->owner),
4325 		   proto_method_implemented(proto->close),
4326 		   proto_method_implemented(proto->connect),
4327 		   proto_method_implemented(proto->disconnect),
4328 		   proto_method_implemented(proto->accept),
4329 		   proto_method_implemented(proto->ioctl),
4330 		   proto_method_implemented(proto->init),
4331 		   proto_method_implemented(proto->destroy),
4332 		   proto_method_implemented(proto->shutdown),
4333 		   proto_method_implemented(proto->setsockopt),
4334 		   proto_method_implemented(proto->getsockopt),
4335 		   proto_method_implemented(proto->sendmsg),
4336 		   proto_method_implemented(proto->recvmsg),
4337 		   proto_method_implemented(proto->bind),
4338 		   proto_method_implemented(proto->backlog_rcv),
4339 		   proto_method_implemented(proto->hash),
4340 		   proto_method_implemented(proto->unhash),
4341 		   proto_method_implemented(proto->get_port),
4342 		   proto_method_implemented(proto->enter_memory_pressure));
4343 }
4344 
4345 static int proto_seq_show(struct seq_file *seq, void *v)
4346 {
4347 	if (v == &proto_list)
4348 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4349 			   "protocol",
4350 			   "size",
4351 			   "sockets",
4352 			   "memory",
4353 			   "press",
4354 			   "maxhdr",
4355 			   "slab",
4356 			   "module",
4357 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4358 	else
4359 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4360 	return 0;
4361 }
4362 
4363 static const struct seq_operations proto_seq_ops = {
4364 	.start  = proto_seq_start,
4365 	.next   = proto_seq_next,
4366 	.stop   = proto_seq_stop,
4367 	.show   = proto_seq_show,
4368 };
4369 
4370 static __net_init int proto_init_net(struct net *net)
4371 {
4372 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4373 			sizeof(struct seq_net_private)))
4374 		return -ENOMEM;
4375 
4376 	return 0;
4377 }
4378 
4379 static __net_exit void proto_exit_net(struct net *net)
4380 {
4381 	remove_proc_entry("protocols", net->proc_net);
4382 }
4383 
4384 
4385 static __net_initdata struct pernet_operations proto_net_ops = {
4386 	.init = proto_init_net,
4387 	.exit = proto_exit_net,
4388 };
4389 
4390 static int __init proto_init(void)
4391 {
4392 	return register_pernet_subsys(&proto_net_ops);
4393 }
4394 
4395 subsys_initcall(proto_init);
4396 
4397 #endif /* PROC_FS */
4398 
4399 #ifdef CONFIG_NET_RX_BUSY_POLL
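/* Busy-poll termination check: stop looping as soon as packets are queued
 * on the receive queue (or on UDP's reader_queue), or once the socket's
 * busy-poll time limit has elapsed.
 */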
4400 bool sk_busy_loop_end(void *p, unsigned long start_time)
4401 {
4402 	struct sock *sk = p;
4403 
4404 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4405 		return true;
4406 
4407 	if (sk_is_udp(sk) &&
4408 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4409 		return true;
4410 
4411 	return sk_busy_loop_timeout(sk, start_time);
4412 }
4413 EXPORT_SYMBOL(sk_busy_loop_end);
4414 #endif /* CONFIG_NET_RX_BUSY_POLL */
4415 
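/* Bind an additional address to an already bound socket, if the protocol
 * provides a bind_add hook; otherwise return -EOPNOTSUPP.
 */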
4416 int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len)
4417 {
4418 	if (!sk->sk_prot->bind_add)
4419 		return -EOPNOTSUPP;
4420 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4421 }
4422 EXPORT_SYMBOL(sock_bind_add);
4423 
4424 /* Copy 'size' bytes in from userspace and, after a successful ioctl, copy 'size' bytes back out */
4425 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4426 		     void __user *arg, void *karg, size_t size)
4427 {
4428 	int ret;
4429 
4430 	if (copy_from_user(karg, arg, size))
4431 		return -EFAULT;
4432 
4433 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4434 	if (ret)
4435 		return ret;
4436 
4437 	if (copy_to_user(arg, karg, size))
4438 		return -EFAULT;
4439 
4440 	return 0;
4441 }
4442 EXPORT_SYMBOL(sock_ioctl_inout);
4443 
4444 /* This is the most common ioctl prep function, where the result (4 bytes) is
4445  * copied back to userspace if the ioctl() returns successfully. No input is
4446  * copied from userspace as input argument.
4447  */
4448 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4449 {
4450 	int ret, karg = 0;
4451 
4452 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4453 	if (ret)
4454 		return ret;
4455 
4456 	return put_user(karg, (int __user *)arg);
4457 }
4458 
4459 /* A wrapper around sock ioctls, which copies the data from userspace
4460  * (depending on the protocol/ioctl), and copies back the result to userspace.
4461  * The main motivation for this function is to pass kernel memory to the
4462  * protocol ioctl callbacks, instead of userspace memory.
4463  */
4464 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4465 {
4466 	int rc = 1;
4467 
4468 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4469 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4470 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4471 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4472 	else if (sk_is_phonet(sk))
4473 		rc = phonet_sk_ioctl(sk, cmd, arg);
4474 
4475 	/* If the ioctl was handled above, return its result */
4476 	if (rc <= 0)
4477 		return rc;
4478 
4479 	/* Otherwise call the default handler */
4480 	return sock_ioctl_out(sk, cmd, arg);
4481 }
4482 EXPORT_SYMBOL(sk_ioctl);
4483 
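/* Sanity checks that the listed struct sock fields live in the cacheline
 * groups (sock_write_rx, sock_read_rx, sock_write_tx, ...) they are
 * expected to be in.
 */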
4484 static int __init sock_struct_check(void)
4485 {
4486 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4487 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4488 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4489 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4490 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4491 
4492 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4493 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4494 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4495 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4496 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4497 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4498 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4499 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4500 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4501 
4502 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4503 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4504 #ifdef CONFIG_MEMCG
4505 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4506 #endif
4507 
4508 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4509 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4510 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4511 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4512 
4513 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4515 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft);
4516 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4517 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4518 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4519 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4520 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4521 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4522 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4523 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4524 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4525 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4526 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4527 
4528 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm);
4529 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status);
4530 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4531 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4532 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4533 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4534 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid);
4535 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol);
4536 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4537 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4538 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4539 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4540 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4541 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4542 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf);
4543 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4544 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4545 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4546 	return 0;
4547 }
4548 
4549 core_initcall(sock_struct_check);
4550