xref: /linux/net/core/sock.c (revision 95f68e06b41b9e88291796efa3969409d13fdd4c)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138 
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142 
143 #include <trace/events/sock.h>
144 
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148 
149 #include <linux/ethtool.h>
150 
151 #include "dev.h"
152 
153 static DEFINE_MUTEX(proto_list_mutex);
154 static LIST_HEAD(proto_list);
155 
156 static void sock_def_write_space_wfree(struct sock *sk);
157 static void sock_def_write_space(struct sock *sk);
158 
159 /**
160  * sk_ns_capable - General socket capability test
161  * @sk: Socket to use a capability on or through
162  * @user_ns: The user namespace of the capability to use
163  * @cap: The capability to use
164  *
165  * Test to see if the opener of the socket had when the socket was
166  * created and the current process has the capability @cap in the user
167  * namespace @user_ns.
168  */
169 bool sk_ns_capable(const struct sock *sk,
170 		   struct user_namespace *user_ns, int cap)
171 {
172 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
173 		ns_capable(user_ns, cap);
174 }
175 EXPORT_SYMBOL(sk_ns_capable);
176 
177 /**
178  * sk_capable - Socket global capability test
179  * @sk: Socket to use a capability on or through
180  * @cap: The global capability to use
181  *
182  * Test to see if the opener of the socket had when the socket was
183  * created and the current process has the capability @cap in all user
184  * namespaces.
185  */
186 bool sk_capable(const struct sock *sk, int cap)
187 {
188 	return sk_ns_capable(sk, &init_user_ns, cap);
189 }
190 EXPORT_SYMBOL(sk_capable);
191 
192 /**
193  * sk_net_capable - Network namespace socket capability test
194  * @sk: Socket to use a capability on or through
195  * @cap: The capability to use
196  *
197  * Test to see if the opener of the socket had when the socket was created
198  * and the current process has the capability @cap over the network namespace
199  * the socket is a member of.
200  */
201 bool sk_net_capable(const struct sock *sk, int cap)
202 {
203 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
204 }
205 EXPORT_SYMBOL(sk_net_capable);
206 
207 /*
208  * Each address family might have different locking rules, so we have
209  * one slock key per address family and separate keys for internal and
210  * userspace sockets.
211  */
212 static struct lock_class_key af_family_keys[AF_MAX];
213 static struct lock_class_key af_family_kern_keys[AF_MAX];
214 static struct lock_class_key af_family_slock_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
216 
217 /*
218  * Make lock validator output more readable. (we pre-construct these
219  * strings build-time, so that runtime initialization of socket
220  * locks is fast):
221  */
222 
223 #define _sock_locks(x)						  \
224   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
225   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
226   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
227   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
228   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
229   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
230   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
231   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
232   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
233   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
234   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
235   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
236   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
237   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
238   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
239   x "AF_MCTP"  , \
240   x "AF_MAX"
241 
242 static const char *const af_family_key_strings[AF_MAX+1] = {
243 	_sock_locks("sk_lock-")
244 };
245 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
246 	_sock_locks("slock-")
247 };
248 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
249 	_sock_locks("clock-")
250 };
251 
252 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
253 	_sock_locks("k-sk_lock-")
254 };
255 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
256 	_sock_locks("k-slock-")
257 };
258 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
259 	_sock_locks("k-clock-")
260 };
261 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
262 	_sock_locks("rlock-")
263 };
264 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
265 	_sock_locks("wlock-")
266 };
267 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
268 	_sock_locks("elock-")
269 };
270 
271 /*
272  * sk_callback_lock and sk queues locking rules are per-address-family,
273  * so split the lock classes by using a per-AF key:
274  */
275 static struct lock_class_key af_callback_keys[AF_MAX];
276 static struct lock_class_key af_rlock_keys[AF_MAX];
277 static struct lock_class_key af_wlock_keys[AF_MAX];
278 static struct lock_class_key af_elock_keys[AF_MAX];
279 static struct lock_class_key af_kern_callback_keys[AF_MAX];
280 
281 /* Run time adjustable parameters. */
282 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
283 EXPORT_SYMBOL(sysctl_wmem_max);
284 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
285 EXPORT_SYMBOL(sysctl_rmem_max);
286 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
287 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
288 
289 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
290 EXPORT_SYMBOL_GPL(memalloc_socks_key);
291 
292 /**
293  * sk_set_memalloc - sets %SOCK_MEMALLOC
294  * @sk: socket to set it on
295  *
296  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297  * It's the responsibility of the admin to adjust min_free_kbytes
298  * to meet the requirements
299  */
300 void sk_set_memalloc(struct sock *sk)
301 {
302 	sock_set_flag(sk, SOCK_MEMALLOC);
303 	sk->sk_allocation |= __GFP_MEMALLOC;
304 	static_branch_inc(&memalloc_socks_key);
305 }
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
307 
308 void sk_clear_memalloc(struct sock *sk)
309 {
310 	sock_reset_flag(sk, SOCK_MEMALLOC);
311 	sk->sk_allocation &= ~__GFP_MEMALLOC;
312 	static_branch_dec(&memalloc_socks_key);
313 
314 	/*
315 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 	 * it has rmem allocations due to the last swapfile being deactivated
318 	 * but there is a risk that the socket is unusable due to exceeding
319 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
320 	 */
321 	sk_mem_reclaim(sk);
322 }
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
324 
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
326 {
327 	int ret;
328 	unsigned int noreclaim_flag;
329 
330 	/* these should have been dropped before queueing */
331 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
332 
333 	noreclaim_flag = memalloc_noreclaim_save();
334 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
335 				 tcp_v6_do_rcv,
336 				 tcp_v4_do_rcv,
337 				 sk, skb);
338 	memalloc_noreclaim_restore(noreclaim_flag);
339 
340 	return ret;
341 }
342 EXPORT_SYMBOL(__sk_backlog_rcv);
343 
344 void sk_error_report(struct sock *sk)
345 {
346 	sk->sk_error_report(sk);
347 
348 	switch (sk->sk_family) {
349 	case AF_INET:
350 		fallthrough;
351 	case AF_INET6:
352 		trace_inet_sk_error_report(sk);
353 		break;
354 	default:
355 		break;
356 	}
357 }
358 EXPORT_SYMBOL(sk_error_report);
359 
360 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
361 {
362 	struct __kernel_sock_timeval tv;
363 
364 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
365 		tv.tv_sec = 0;
366 		tv.tv_usec = 0;
367 	} else {
368 		tv.tv_sec = timeo / HZ;
369 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
370 	}
371 
372 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
373 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
374 		*(struct old_timeval32 *)optval = tv32;
375 		return sizeof(tv32);
376 	}
377 
378 	if (old_timeval) {
379 		struct __kernel_old_timeval old_tv;
380 		old_tv.tv_sec = tv.tv_sec;
381 		old_tv.tv_usec = tv.tv_usec;
382 		*(struct __kernel_old_timeval *)optval = old_tv;
383 		return sizeof(old_tv);
384 	}
385 
386 	*(struct __kernel_sock_timeval *)optval = tv;
387 	return sizeof(tv);
388 }
389 EXPORT_SYMBOL(sock_get_timeout);
390 
391 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
392 			   sockptr_t optval, int optlen, bool old_timeval)
393 {
394 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
395 		struct old_timeval32 tv32;
396 
397 		if (optlen < sizeof(tv32))
398 			return -EINVAL;
399 
400 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
401 			return -EFAULT;
402 		tv->tv_sec = tv32.tv_sec;
403 		tv->tv_usec = tv32.tv_usec;
404 	} else if (old_timeval) {
405 		struct __kernel_old_timeval old_tv;
406 
407 		if (optlen < sizeof(old_tv))
408 			return -EINVAL;
409 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
410 			return -EFAULT;
411 		tv->tv_sec = old_tv.tv_sec;
412 		tv->tv_usec = old_tv.tv_usec;
413 	} else {
414 		if (optlen < sizeof(*tv))
415 			return -EINVAL;
416 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
417 			return -EFAULT;
418 	}
419 
420 	return 0;
421 }
422 EXPORT_SYMBOL(sock_copy_user_timeval);
423 
424 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
425 			    bool old_timeval)
426 {
427 	struct __kernel_sock_timeval tv;
428 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
429 	long val;
430 
431 	if (err)
432 		return err;
433 
434 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
435 		return -EDOM;
436 
437 	if (tv.tv_sec < 0) {
438 		static int warned __read_mostly;
439 
440 		WRITE_ONCE(*timeo_p, 0);
441 		if (warned < 10 && net_ratelimit()) {
442 			warned++;
443 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
444 				__func__, current->comm, task_pid_nr(current));
445 		}
446 		return 0;
447 	}
448 	val = MAX_SCHEDULE_TIMEOUT;
449 	if ((tv.tv_sec || tv.tv_usec) &&
450 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
451 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
452 						    USEC_PER_SEC / HZ);
453 	WRITE_ONCE(*timeo_p, val);
454 	return 0;
455 }
456 
457 static bool sk_set_prio_allowed(const struct sock *sk, int val)
458 {
459 	return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
460 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
461 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
462 }
463 
464 static bool sock_needs_netstamp(const struct sock *sk)
465 {
466 	switch (sk->sk_family) {
467 	case AF_UNSPEC:
468 	case AF_UNIX:
469 		return false;
470 	default:
471 		return true;
472 	}
473 }
474 
475 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
476 {
477 	if (sk->sk_flags & flags) {
478 		sk->sk_flags &= ~flags;
479 		if (sock_needs_netstamp(sk) &&
480 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
481 			net_disable_timestamp();
482 	}
483 }
484 
485 
486 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
487 {
488 	unsigned long flags;
489 	struct sk_buff_head *list = &sk->sk_receive_queue;
490 
491 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
492 		atomic_inc(&sk->sk_drops);
493 		trace_sock_rcvqueue_full(sk, skb);
494 		return -ENOMEM;
495 	}
496 
497 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
498 		atomic_inc(&sk->sk_drops);
499 		return -ENOBUFS;
500 	}
501 
502 	skb->dev = NULL;
503 	skb_set_owner_r(skb, sk);
504 
505 	/* we escape from rcu protected region, make sure we dont leak
506 	 * a norefcounted dst
507 	 */
508 	skb_dst_force(skb);
509 
510 	spin_lock_irqsave(&list->lock, flags);
511 	sock_skb_set_dropcount(sk, skb);
512 	__skb_queue_tail(list, skb);
513 	spin_unlock_irqrestore(&list->lock, flags);
514 
515 	if (!sock_flag(sk, SOCK_DEAD))
516 		sk->sk_data_ready(sk);
517 	return 0;
518 }
519 EXPORT_SYMBOL(__sock_queue_rcv_skb);
520 
521 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
522 			      enum skb_drop_reason *reason)
523 {
524 	enum skb_drop_reason drop_reason;
525 	int err;
526 
527 	err = sk_filter(sk, skb);
528 	if (err) {
529 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
530 		goto out;
531 	}
532 	err = __sock_queue_rcv_skb(sk, skb);
533 	switch (err) {
534 	case -ENOMEM:
535 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
536 		break;
537 	case -ENOBUFS:
538 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
539 		break;
540 	default:
541 		drop_reason = SKB_NOT_DROPPED_YET;
542 		break;
543 	}
544 out:
545 	if (reason)
546 		*reason = drop_reason;
547 	return err;
548 }
549 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
550 
551 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
552 		     const int nested, unsigned int trim_cap, bool refcounted)
553 {
554 	int rc = NET_RX_SUCCESS;
555 
556 	if (sk_filter_trim_cap(sk, skb, trim_cap))
557 		goto discard_and_relse;
558 
559 	skb->dev = NULL;
560 
561 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
562 		atomic_inc(&sk->sk_drops);
563 		goto discard_and_relse;
564 	}
565 	if (nested)
566 		bh_lock_sock_nested(sk);
567 	else
568 		bh_lock_sock(sk);
569 	if (!sock_owned_by_user(sk)) {
570 		/*
571 		 * trylock + unlock semantics:
572 		 */
573 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
574 
575 		rc = sk_backlog_rcv(sk, skb);
576 
577 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
578 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
579 		bh_unlock_sock(sk);
580 		atomic_inc(&sk->sk_drops);
581 		goto discard_and_relse;
582 	}
583 
584 	bh_unlock_sock(sk);
585 out:
586 	if (refcounted)
587 		sock_put(sk);
588 	return rc;
589 discard_and_relse:
590 	kfree_skb(skb);
591 	goto out;
592 }
593 EXPORT_SYMBOL(__sk_receive_skb);
594 
595 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
596 							  u32));
597 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
598 							   u32));
599 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
600 {
601 	struct dst_entry *dst = __sk_dst_get(sk);
602 
603 	if (dst && dst->obsolete &&
604 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
605 			       dst, cookie) == NULL) {
606 		sk_tx_queue_clear(sk);
607 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
608 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
609 		dst_release(dst);
610 		return NULL;
611 	}
612 
613 	return dst;
614 }
615 EXPORT_SYMBOL(__sk_dst_check);
616 
617 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
618 {
619 	struct dst_entry *dst = sk_dst_get(sk);
620 
621 	if (dst && dst->obsolete &&
622 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
623 			       dst, cookie) == NULL) {
624 		sk_dst_reset(sk);
625 		dst_release(dst);
626 		return NULL;
627 	}
628 
629 	return dst;
630 }
631 EXPORT_SYMBOL(sk_dst_check);
632 
633 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
634 {
635 	int ret = -ENOPROTOOPT;
636 #ifdef CONFIG_NETDEVICES
637 	struct net *net = sock_net(sk);
638 
639 	/* Sorry... */
640 	ret = -EPERM;
641 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
642 		goto out;
643 
644 	ret = -EINVAL;
645 	if (ifindex < 0)
646 		goto out;
647 
648 	/* Paired with all READ_ONCE() done locklessly. */
649 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
650 
651 	if (sk->sk_prot->rehash)
652 		sk->sk_prot->rehash(sk);
653 	sk_dst_reset(sk);
654 
655 	ret = 0;
656 
657 out:
658 #endif
659 
660 	return ret;
661 }
662 
663 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
664 {
665 	int ret;
666 
667 	if (lock_sk)
668 		lock_sock(sk);
669 	ret = sock_bindtoindex_locked(sk, ifindex);
670 	if (lock_sk)
671 		release_sock(sk);
672 
673 	return ret;
674 }
675 EXPORT_SYMBOL(sock_bindtoindex);
676 
677 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
678 {
679 	int ret = -ENOPROTOOPT;
680 #ifdef CONFIG_NETDEVICES
681 	struct net *net = sock_net(sk);
682 	char devname[IFNAMSIZ];
683 	int index;
684 
685 	ret = -EINVAL;
686 	if (optlen < 0)
687 		goto out;
688 
689 	/* Bind this socket to a particular device like "eth0",
690 	 * as specified in the passed interface name. If the
691 	 * name is "" or the option length is zero the socket
692 	 * is not bound.
693 	 */
694 	if (optlen > IFNAMSIZ - 1)
695 		optlen = IFNAMSIZ - 1;
696 	memset(devname, 0, sizeof(devname));
697 
698 	ret = -EFAULT;
699 	if (copy_from_sockptr(devname, optval, optlen))
700 		goto out;
701 
702 	index = 0;
703 	if (devname[0] != '\0') {
704 		struct net_device *dev;
705 
706 		rcu_read_lock();
707 		dev = dev_get_by_name_rcu(net, devname);
708 		if (dev)
709 			index = dev->ifindex;
710 		rcu_read_unlock();
711 		ret = -ENODEV;
712 		if (!dev)
713 			goto out;
714 	}
715 
716 	sockopt_lock_sock(sk);
717 	ret = sock_bindtoindex_locked(sk, index);
718 	sockopt_release_sock(sk);
719 out:
720 #endif
721 
722 	return ret;
723 }
724 
725 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
726 				sockptr_t optlen, int len)
727 {
728 	int ret = -ENOPROTOOPT;
729 #ifdef CONFIG_NETDEVICES
730 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
731 	struct net *net = sock_net(sk);
732 	char devname[IFNAMSIZ];
733 
734 	if (bound_dev_if == 0) {
735 		len = 0;
736 		goto zero;
737 	}
738 
739 	ret = -EINVAL;
740 	if (len < IFNAMSIZ)
741 		goto out;
742 
743 	ret = netdev_get_name(net, devname, bound_dev_if);
744 	if (ret)
745 		goto out;
746 
747 	len = strlen(devname) + 1;
748 
749 	ret = -EFAULT;
750 	if (copy_to_sockptr(optval, devname, len))
751 		goto out;
752 
753 zero:
754 	ret = -EFAULT;
755 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
756 		goto out;
757 
758 	ret = 0;
759 
760 out:
761 #endif
762 
763 	return ret;
764 }
765 
766 bool sk_mc_loop(const struct sock *sk)
767 {
768 	if (dev_recursion_level())
769 		return false;
770 	if (!sk)
771 		return true;
772 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
773 	switch (READ_ONCE(sk->sk_family)) {
774 	case AF_INET:
775 		return inet_test_bit(MC_LOOP, sk);
776 #if IS_ENABLED(CONFIG_IPV6)
777 	case AF_INET6:
778 		return inet6_test_bit(MC6_LOOP, sk);
779 #endif
780 	}
781 	WARN_ON_ONCE(1);
782 	return true;
783 }
784 EXPORT_SYMBOL(sk_mc_loop);
785 
786 void sock_set_reuseaddr(struct sock *sk)
787 {
788 	lock_sock(sk);
789 	sk->sk_reuse = SK_CAN_REUSE;
790 	release_sock(sk);
791 }
792 EXPORT_SYMBOL(sock_set_reuseaddr);
793 
794 void sock_set_reuseport(struct sock *sk)
795 {
796 	lock_sock(sk);
797 	sk->sk_reuseport = true;
798 	release_sock(sk);
799 }
800 EXPORT_SYMBOL(sock_set_reuseport);
801 
802 void sock_no_linger(struct sock *sk)
803 {
804 	lock_sock(sk);
805 	WRITE_ONCE(sk->sk_lingertime, 0);
806 	sock_set_flag(sk, SOCK_LINGER);
807 	release_sock(sk);
808 }
809 EXPORT_SYMBOL(sock_no_linger);
810 
811 void sock_set_priority(struct sock *sk, u32 priority)
812 {
813 	WRITE_ONCE(sk->sk_priority, priority);
814 }
815 EXPORT_SYMBOL(sock_set_priority);
816 
817 void sock_set_sndtimeo(struct sock *sk, s64 secs)
818 {
819 	lock_sock(sk);
820 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
821 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
822 	else
823 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
824 	release_sock(sk);
825 }
826 EXPORT_SYMBOL(sock_set_sndtimeo);
827 
828 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
829 {
830 	sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
831 	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
832 	if (val)  {
833 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
834 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
835 	}
836 }
837 
838 void sock_enable_timestamps(struct sock *sk)
839 {
840 	lock_sock(sk);
841 	__sock_set_timestamps(sk, true, false, true);
842 	release_sock(sk);
843 }
844 EXPORT_SYMBOL(sock_enable_timestamps);
845 
846 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
847 {
848 	switch (optname) {
849 	case SO_TIMESTAMP_OLD:
850 		__sock_set_timestamps(sk, valbool, false, false);
851 		break;
852 	case SO_TIMESTAMP_NEW:
853 		__sock_set_timestamps(sk, valbool, true, false);
854 		break;
855 	case SO_TIMESTAMPNS_OLD:
856 		__sock_set_timestamps(sk, valbool, false, true);
857 		break;
858 	case SO_TIMESTAMPNS_NEW:
859 		__sock_set_timestamps(sk, valbool, true, true);
860 		break;
861 	}
862 }
863 
864 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
865 {
866 	struct net *net = sock_net(sk);
867 	struct net_device *dev = NULL;
868 	bool match = false;
869 	int *vclock_index;
870 	int i, num;
871 
872 	if (sk->sk_bound_dev_if)
873 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
874 
875 	if (!dev) {
876 		pr_err("%s: sock not bind to device\n", __func__);
877 		return -EOPNOTSUPP;
878 	}
879 
880 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
881 	dev_put(dev);
882 
883 	for (i = 0; i < num; i++) {
884 		if (*(vclock_index + i) == phc_index) {
885 			match = true;
886 			break;
887 		}
888 	}
889 
890 	if (num > 0)
891 		kfree(vclock_index);
892 
893 	if (!match)
894 		return -EINVAL;
895 
896 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
897 
898 	return 0;
899 }
900 
901 int sock_set_timestamping(struct sock *sk, int optname,
902 			  struct so_timestamping timestamping)
903 {
904 	int val = timestamping.flags;
905 	int ret;
906 
907 	if (val & ~SOF_TIMESTAMPING_MASK)
908 		return -EINVAL;
909 
910 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
911 	    !(val & SOF_TIMESTAMPING_OPT_ID))
912 		return -EINVAL;
913 
914 	if (val & SOF_TIMESTAMPING_OPT_ID &&
915 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
916 		if (sk_is_tcp(sk)) {
917 			if ((1 << sk->sk_state) &
918 			    (TCPF_CLOSE | TCPF_LISTEN))
919 				return -EINVAL;
920 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
921 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
922 			else
923 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
924 		} else {
925 			atomic_set(&sk->sk_tskey, 0);
926 		}
927 	}
928 
929 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
930 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
931 		return -EINVAL;
932 
933 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
934 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
935 		if (ret)
936 			return ret;
937 	}
938 
939 	WRITE_ONCE(sk->sk_tsflags, val);
940 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
941 
942 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
943 		sock_enable_timestamp(sk,
944 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
945 	else
946 		sock_disable_timestamp(sk,
947 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
948 	return 0;
949 }
950 
951 void sock_set_keepalive(struct sock *sk)
952 {
953 	lock_sock(sk);
954 	if (sk->sk_prot->keepalive)
955 		sk->sk_prot->keepalive(sk, true);
956 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
957 	release_sock(sk);
958 }
959 EXPORT_SYMBOL(sock_set_keepalive);
960 
961 static void __sock_set_rcvbuf(struct sock *sk, int val)
962 {
963 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
964 	 * as a negative value.
965 	 */
966 	val = min_t(int, val, INT_MAX / 2);
967 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
968 
969 	/* We double it on the way in to account for "struct sk_buff" etc.
970 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
971 	 * will allow that much actual data to be received on that socket.
972 	 *
973 	 * Applications are unaware that "struct sk_buff" and other overheads
974 	 * allocate from the receive buffer during socket buffer allocation.
975 	 *
976 	 * And after considering the possible alternatives, returning the value
977 	 * we actually used in getsockopt is the most desirable behavior.
978 	 */
979 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
980 }
981 
982 void sock_set_rcvbuf(struct sock *sk, int val)
983 {
984 	lock_sock(sk);
985 	__sock_set_rcvbuf(sk, val);
986 	release_sock(sk);
987 }
988 EXPORT_SYMBOL(sock_set_rcvbuf);
989 
990 static void __sock_set_mark(struct sock *sk, u32 val)
991 {
992 	if (val != sk->sk_mark) {
993 		WRITE_ONCE(sk->sk_mark, val);
994 		sk_dst_reset(sk);
995 	}
996 }
997 
998 void sock_set_mark(struct sock *sk, u32 val)
999 {
1000 	lock_sock(sk);
1001 	__sock_set_mark(sk, val);
1002 	release_sock(sk);
1003 }
1004 EXPORT_SYMBOL(sock_set_mark);
1005 
1006 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1007 {
1008 	/* Round down bytes to multiple of pages */
1009 	bytes = round_down(bytes, PAGE_SIZE);
1010 
1011 	WARN_ON(bytes > sk->sk_reserved_mem);
1012 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1013 	sk_mem_reclaim(sk);
1014 }
1015 
1016 static int sock_reserve_memory(struct sock *sk, int bytes)
1017 {
1018 	long allocated;
1019 	bool charged;
1020 	int pages;
1021 
1022 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1023 		return -EOPNOTSUPP;
1024 
1025 	if (!bytes)
1026 		return 0;
1027 
1028 	pages = sk_mem_pages(bytes);
1029 
1030 	/* pre-charge to memcg */
1031 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1032 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1033 	if (!charged)
1034 		return -ENOMEM;
1035 
1036 	/* pre-charge to forward_alloc */
1037 	sk_memory_allocated_add(sk, pages);
1038 	allocated = sk_memory_allocated(sk);
1039 	/* If the system goes into memory pressure with this
1040 	 * precharge, give up and return error.
1041 	 */
1042 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1043 		sk_memory_allocated_sub(sk, pages);
1044 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1045 		return -ENOMEM;
1046 	}
1047 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1048 
1049 	WRITE_ONCE(sk->sk_reserved_mem,
1050 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1051 
1052 	return 0;
1053 }
1054 
1055 #ifdef CONFIG_PAGE_POOL
1056 
1057 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
1058  * in 1 syscall. The limit exists to limit the amount of memory the kernel
1059  * allocates to copy these tokens, and to prevent looping over the frags for
1060  * too long.
1061  */
1062 #define MAX_DONTNEED_TOKENS 128
1063 #define MAX_DONTNEED_FRAGS 1024
1064 
1065 static noinline_for_stack int
1066 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1067 {
1068 	unsigned int num_tokens, i, j, k, netmem_num = 0;
1069 	struct dmabuf_token *tokens;
1070 	int ret = 0, num_frags = 0;
1071 	netmem_ref netmems[16];
1072 
1073 	if (!sk_is_tcp(sk))
1074 		return -EBADF;
1075 
1076 	if (optlen % sizeof(*tokens) ||
1077 	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1078 		return -EINVAL;
1079 
1080 	num_tokens = optlen / sizeof(*tokens);
1081 	tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1082 	if (!tokens)
1083 		return -ENOMEM;
1084 
1085 	if (copy_from_sockptr(tokens, optval, optlen)) {
1086 		kvfree(tokens);
1087 		return -EFAULT;
1088 	}
1089 
1090 	xa_lock_bh(&sk->sk_user_frags);
1091 	for (i = 0; i < num_tokens; i++) {
1092 		for (j = 0; j < tokens[i].token_count; j++) {
1093 			if (++num_frags > MAX_DONTNEED_FRAGS)
1094 				goto frag_limit_reached;
1095 
1096 			netmem_ref netmem = (__force netmem_ref)__xa_erase(
1097 				&sk->sk_user_frags, tokens[i].token_start + j);
1098 
1099 			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1100 				continue;
1101 
1102 			netmems[netmem_num++] = netmem;
1103 			if (netmem_num == ARRAY_SIZE(netmems)) {
1104 				xa_unlock_bh(&sk->sk_user_frags);
1105 				for (k = 0; k < netmem_num; k++)
1106 					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1107 				netmem_num = 0;
1108 				xa_lock_bh(&sk->sk_user_frags);
1109 			}
1110 			ret++;
1111 		}
1112 	}
1113 
1114 frag_limit_reached:
1115 	xa_unlock_bh(&sk->sk_user_frags);
1116 	for (k = 0; k < netmem_num; k++)
1117 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1118 
1119 	kvfree(tokens);
1120 	return ret;
1121 }
1122 #endif
1123 
1124 void sockopt_lock_sock(struct sock *sk)
1125 {
1126 	/* When current->bpf_ctx is set, the setsockopt is called from
1127 	 * a bpf prog.  bpf has ensured the sk lock has been
1128 	 * acquired before calling setsockopt().
1129 	 */
1130 	if (has_current_bpf_ctx())
1131 		return;
1132 
1133 	lock_sock(sk);
1134 }
1135 EXPORT_SYMBOL(sockopt_lock_sock);
1136 
1137 void sockopt_release_sock(struct sock *sk)
1138 {
1139 	if (has_current_bpf_ctx())
1140 		return;
1141 
1142 	release_sock(sk);
1143 }
1144 EXPORT_SYMBOL(sockopt_release_sock);
1145 
1146 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1147 {
1148 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1149 }
1150 EXPORT_SYMBOL(sockopt_ns_capable);
1151 
1152 bool sockopt_capable(int cap)
1153 {
1154 	return has_current_bpf_ctx() || capable(cap);
1155 }
1156 EXPORT_SYMBOL(sockopt_capable);
1157 
1158 static int sockopt_validate_clockid(__kernel_clockid_t value)
1159 {
1160 	switch (value) {
1161 	case CLOCK_REALTIME:
1162 	case CLOCK_MONOTONIC:
1163 	case CLOCK_TAI:
1164 		return 0;
1165 	}
1166 	return -EINVAL;
1167 }
1168 
1169 /*
1170  *	This is meant for all protocols to use and covers goings on
1171  *	at the socket level. Everything here is generic.
1172  */
1173 
1174 int sk_setsockopt(struct sock *sk, int level, int optname,
1175 		  sockptr_t optval, unsigned int optlen)
1176 {
1177 	struct so_timestamping timestamping;
1178 	struct socket *sock = sk->sk_socket;
1179 	struct sock_txtime sk_txtime;
1180 	int val;
1181 	int valbool;
1182 	struct linger ling;
1183 	int ret = 0;
1184 
1185 	/*
1186 	 *	Options without arguments
1187 	 */
1188 
1189 	if (optname == SO_BINDTODEVICE)
1190 		return sock_setbindtodevice(sk, optval, optlen);
1191 
1192 	if (optlen < sizeof(int))
1193 		return -EINVAL;
1194 
1195 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1196 		return -EFAULT;
1197 
1198 	valbool = val ? 1 : 0;
1199 
1200 	/* handle options which do not require locking the socket. */
1201 	switch (optname) {
1202 	case SO_PRIORITY:
1203 		if (sk_set_prio_allowed(sk, val)) {
1204 			sock_set_priority(sk, val);
1205 			return 0;
1206 		}
1207 		return -EPERM;
1208 	case SO_PASSSEC:
1209 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1210 		return 0;
1211 	case SO_PASSCRED:
1212 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1213 		return 0;
1214 	case SO_PASSPIDFD:
1215 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1216 		return 0;
1217 	case SO_TYPE:
1218 	case SO_PROTOCOL:
1219 	case SO_DOMAIN:
1220 	case SO_ERROR:
1221 		return -ENOPROTOOPT;
1222 #ifdef CONFIG_NET_RX_BUSY_POLL
1223 	case SO_BUSY_POLL:
1224 		if (val < 0)
1225 			return -EINVAL;
1226 		WRITE_ONCE(sk->sk_ll_usec, val);
1227 		return 0;
1228 	case SO_PREFER_BUSY_POLL:
1229 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1230 			return -EPERM;
1231 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1232 		return 0;
1233 	case SO_BUSY_POLL_BUDGET:
1234 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1235 		    !sockopt_capable(CAP_NET_ADMIN))
1236 			return -EPERM;
1237 		if (val < 0 || val > U16_MAX)
1238 			return -EINVAL;
1239 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1240 		return 0;
1241 #endif
1242 	case SO_MAX_PACING_RATE:
1243 		{
1244 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1245 		unsigned long pacing_rate;
1246 
1247 		if (sizeof(ulval) != sizeof(val) &&
1248 		    optlen >= sizeof(ulval) &&
1249 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1250 			return -EFAULT;
1251 		}
1252 		if (ulval != ~0UL)
1253 			cmpxchg(&sk->sk_pacing_status,
1254 				SK_PACING_NONE,
1255 				SK_PACING_NEEDED);
1256 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1257 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1258 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1259 		if (ulval < pacing_rate)
1260 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1261 		return 0;
1262 		}
1263 	case SO_TXREHASH:
1264 		if (val < -1 || val > 1)
1265 			return -EINVAL;
1266 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1267 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1268 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1269 		 * and sk_getsockopt().
1270 		 */
1271 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1272 		return 0;
1273 	case SO_PEEK_OFF:
1274 		{
1275 		int (*set_peek_off)(struct sock *sk, int val);
1276 
1277 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1278 		if (set_peek_off)
1279 			ret = set_peek_off(sk, val);
1280 		else
1281 			ret = -EOPNOTSUPP;
1282 		return ret;
1283 		}
1284 #ifdef CONFIG_PAGE_POOL
1285 	case SO_DEVMEM_DONTNEED:
1286 		return sock_devmem_dontneed(sk, optval, optlen);
1287 #endif
1288 	}
1289 
1290 	sockopt_lock_sock(sk);
1291 
1292 	switch (optname) {
1293 	case SO_DEBUG:
1294 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1295 			ret = -EACCES;
1296 		else
1297 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1298 		break;
1299 	case SO_REUSEADDR:
1300 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1301 		break;
1302 	case SO_REUSEPORT:
1303 		sk->sk_reuseport = valbool;
1304 		break;
1305 	case SO_DONTROUTE:
1306 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1307 		sk_dst_reset(sk);
1308 		break;
1309 	case SO_BROADCAST:
1310 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1311 		break;
1312 	case SO_SNDBUF:
1313 		/* Don't error on this BSD doesn't and if you think
1314 		 * about it this is right. Otherwise apps have to
1315 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1316 		 * are treated in BSD as hints
1317 		 */
1318 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1319 set_sndbuf:
1320 		/* Ensure val * 2 fits into an int, to prevent max_t()
1321 		 * from treating it as a negative value.
1322 		 */
1323 		val = min_t(int, val, INT_MAX / 2);
1324 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1325 		WRITE_ONCE(sk->sk_sndbuf,
1326 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1327 		/* Wake up sending tasks if we upped the value. */
1328 		sk->sk_write_space(sk);
1329 		break;
1330 
1331 	case SO_SNDBUFFORCE:
1332 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1333 			ret = -EPERM;
1334 			break;
1335 		}
1336 
1337 		/* No negative values (to prevent underflow, as val will be
1338 		 * multiplied by 2).
1339 		 */
1340 		if (val < 0)
1341 			val = 0;
1342 		goto set_sndbuf;
1343 
1344 	case SO_RCVBUF:
1345 		/* Don't error on this BSD doesn't and if you think
1346 		 * about it this is right. Otherwise apps have to
1347 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1348 		 * are treated in BSD as hints
1349 		 */
1350 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1351 		break;
1352 
1353 	case SO_RCVBUFFORCE:
1354 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1355 			ret = -EPERM;
1356 			break;
1357 		}
1358 
1359 		/* No negative values (to prevent underflow, as val will be
1360 		 * multiplied by 2).
1361 		 */
1362 		__sock_set_rcvbuf(sk, max(val, 0));
1363 		break;
1364 
1365 	case SO_KEEPALIVE:
1366 		if (sk->sk_prot->keepalive)
1367 			sk->sk_prot->keepalive(sk, valbool);
1368 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1369 		break;
1370 
1371 	case SO_OOBINLINE:
1372 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1373 		break;
1374 
1375 	case SO_NO_CHECK:
1376 		sk->sk_no_check_tx = valbool;
1377 		break;
1378 
1379 	case SO_LINGER:
1380 		if (optlen < sizeof(ling)) {
1381 			ret = -EINVAL;	/* 1003.1g */
1382 			break;
1383 		}
1384 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1385 			ret = -EFAULT;
1386 			break;
1387 		}
1388 		if (!ling.l_onoff) {
1389 			sock_reset_flag(sk, SOCK_LINGER);
1390 		} else {
1391 			unsigned long t_sec = ling.l_linger;
1392 
1393 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1394 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1395 			else
1396 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1397 			sock_set_flag(sk, SOCK_LINGER);
1398 		}
1399 		break;
1400 
1401 	case SO_BSDCOMPAT:
1402 		break;
1403 
1404 	case SO_TIMESTAMP_OLD:
1405 	case SO_TIMESTAMP_NEW:
1406 	case SO_TIMESTAMPNS_OLD:
1407 	case SO_TIMESTAMPNS_NEW:
1408 		sock_set_timestamp(sk, optname, valbool);
1409 		break;
1410 
1411 	case SO_TIMESTAMPING_NEW:
1412 	case SO_TIMESTAMPING_OLD:
1413 		if (optlen == sizeof(timestamping)) {
1414 			if (copy_from_sockptr(&timestamping, optval,
1415 					      sizeof(timestamping))) {
1416 				ret = -EFAULT;
1417 				break;
1418 			}
1419 		} else {
1420 			memset(&timestamping, 0, sizeof(timestamping));
1421 			timestamping.flags = val;
1422 		}
1423 		ret = sock_set_timestamping(sk, optname, timestamping);
1424 		break;
1425 
1426 	case SO_RCVLOWAT:
1427 		{
1428 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1429 
1430 		if (val < 0)
1431 			val = INT_MAX;
1432 		if (sock)
1433 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1434 		if (set_rcvlowat)
1435 			ret = set_rcvlowat(sk, val);
1436 		else
1437 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1438 		break;
1439 		}
1440 	case SO_RCVTIMEO_OLD:
1441 	case SO_RCVTIMEO_NEW:
1442 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1443 				       optlen, optname == SO_RCVTIMEO_OLD);
1444 		break;
1445 
1446 	case SO_SNDTIMEO_OLD:
1447 	case SO_SNDTIMEO_NEW:
1448 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1449 				       optlen, optname == SO_SNDTIMEO_OLD);
1450 		break;
1451 
1452 	case SO_ATTACH_FILTER: {
1453 		struct sock_fprog fprog;
1454 
1455 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1456 		if (!ret)
1457 			ret = sk_attach_filter(&fprog, sk);
1458 		break;
1459 	}
1460 	case SO_ATTACH_BPF:
1461 		ret = -EINVAL;
1462 		if (optlen == sizeof(u32)) {
1463 			u32 ufd;
1464 
1465 			ret = -EFAULT;
1466 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1467 				break;
1468 
1469 			ret = sk_attach_bpf(ufd, sk);
1470 		}
1471 		break;
1472 
1473 	case SO_ATTACH_REUSEPORT_CBPF: {
1474 		struct sock_fprog fprog;
1475 
1476 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1477 		if (!ret)
1478 			ret = sk_reuseport_attach_filter(&fprog, sk);
1479 		break;
1480 	}
1481 	case SO_ATTACH_REUSEPORT_EBPF:
1482 		ret = -EINVAL;
1483 		if (optlen == sizeof(u32)) {
1484 			u32 ufd;
1485 
1486 			ret = -EFAULT;
1487 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1488 				break;
1489 
1490 			ret = sk_reuseport_attach_bpf(ufd, sk);
1491 		}
1492 		break;
1493 
1494 	case SO_DETACH_REUSEPORT_BPF:
1495 		ret = reuseport_detach_prog(sk);
1496 		break;
1497 
1498 	case SO_DETACH_FILTER:
1499 		ret = sk_detach_filter(sk);
1500 		break;
1501 
1502 	case SO_LOCK_FILTER:
1503 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1504 			ret = -EPERM;
1505 		else
1506 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1507 		break;
1508 
1509 	case SO_MARK:
1510 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1511 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1512 			ret = -EPERM;
1513 			break;
1514 		}
1515 
1516 		__sock_set_mark(sk, val);
1517 		break;
1518 	case SO_RCVMARK:
1519 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1520 		break;
1521 
1522 	case SO_RCVPRIORITY:
1523 		sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
1524 		break;
1525 
1526 	case SO_RXQ_OVFL:
1527 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1528 		break;
1529 
1530 	case SO_WIFI_STATUS:
1531 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1532 		break;
1533 
1534 	case SO_NOFCS:
1535 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1536 		break;
1537 
1538 	case SO_SELECT_ERR_QUEUE:
1539 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1540 		break;
1541 
1542 
1543 	case SO_INCOMING_CPU:
1544 		reuseport_update_incoming_cpu(sk, val);
1545 		break;
1546 
1547 	case SO_CNX_ADVICE:
1548 		if (val == 1)
1549 			dst_negative_advice(sk);
1550 		break;
1551 
1552 	case SO_ZEROCOPY:
1553 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1554 			if (!(sk_is_tcp(sk) ||
1555 			      (sk->sk_type == SOCK_DGRAM &&
1556 			       sk->sk_protocol == IPPROTO_UDP)))
1557 				ret = -EOPNOTSUPP;
1558 		} else if (sk->sk_family != PF_RDS) {
1559 			ret = -EOPNOTSUPP;
1560 		}
1561 		if (!ret) {
1562 			if (val < 0 || val > 1)
1563 				ret = -EINVAL;
1564 			else
1565 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1566 		}
1567 		break;
1568 
1569 	case SO_TXTIME:
1570 		if (optlen != sizeof(struct sock_txtime)) {
1571 			ret = -EINVAL;
1572 			break;
1573 		} else if (copy_from_sockptr(&sk_txtime, optval,
1574 			   sizeof(struct sock_txtime))) {
1575 			ret = -EFAULT;
1576 			break;
1577 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1578 			ret = -EINVAL;
1579 			break;
1580 		}
1581 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1582 		 * scheduler has enough safe guards.
1583 		 */
1584 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1585 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1586 			ret = -EPERM;
1587 			break;
1588 		}
1589 
1590 		ret = sockopt_validate_clockid(sk_txtime.clockid);
1591 		if (ret)
1592 			break;
1593 
1594 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1595 		sk->sk_clockid = sk_txtime.clockid;
1596 		sk->sk_txtime_deadline_mode =
1597 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1598 		sk->sk_txtime_report_errors =
1599 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1600 		break;
1601 
1602 	case SO_BINDTOIFINDEX:
1603 		ret = sock_bindtoindex_locked(sk, val);
1604 		break;
1605 
1606 	case SO_BUF_LOCK:
1607 		if (val & ~SOCK_BUF_LOCK_MASK) {
1608 			ret = -EINVAL;
1609 			break;
1610 		}
1611 		sk->sk_userlocks = val | (sk->sk_userlocks &
1612 					  ~SOCK_BUF_LOCK_MASK);
1613 		break;
1614 
1615 	case SO_RESERVE_MEM:
1616 	{
1617 		int delta;
1618 
1619 		if (val < 0) {
1620 			ret = -EINVAL;
1621 			break;
1622 		}
1623 
1624 		delta = val - sk->sk_reserved_mem;
1625 		if (delta < 0)
1626 			sock_release_reserved_memory(sk, -delta);
1627 		else
1628 			ret = sock_reserve_memory(sk, delta);
1629 		break;
1630 	}
1631 
1632 	default:
1633 		ret = -ENOPROTOOPT;
1634 		break;
1635 	}
1636 	sockopt_release_sock(sk);
1637 	return ret;
1638 }
1639 
1640 int sock_setsockopt(struct socket *sock, int level, int optname,
1641 		    sockptr_t optval, unsigned int optlen)
1642 {
1643 	return sk_setsockopt(sock->sk, level, optname,
1644 			     optval, optlen);
1645 }
1646 EXPORT_SYMBOL(sock_setsockopt);
1647 
1648 static const struct cred *sk_get_peer_cred(struct sock *sk)
1649 {
1650 	const struct cred *cred;
1651 
1652 	spin_lock(&sk->sk_peer_lock);
1653 	cred = get_cred(sk->sk_peer_cred);
1654 	spin_unlock(&sk->sk_peer_lock);
1655 
1656 	return cred;
1657 }
1658 
1659 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1660 			  struct ucred *ucred)
1661 {
1662 	ucred->pid = pid_vnr(pid);
1663 	ucred->uid = ucred->gid = -1;
1664 	if (cred) {
1665 		struct user_namespace *current_ns = current_user_ns();
1666 
1667 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1668 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1669 	}
1670 }
1671 
1672 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1673 {
1674 	struct user_namespace *user_ns = current_user_ns();
1675 	int i;
1676 
1677 	for (i = 0; i < src->ngroups; i++) {
1678 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1679 
1680 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1681 			return -EFAULT;
1682 	}
1683 
1684 	return 0;
1685 }
1686 
1687 int sk_getsockopt(struct sock *sk, int level, int optname,
1688 		  sockptr_t optval, sockptr_t optlen)
1689 {
1690 	struct socket *sock = sk->sk_socket;
1691 
1692 	union {
1693 		int val;
1694 		u64 val64;
1695 		unsigned long ulval;
1696 		struct linger ling;
1697 		struct old_timeval32 tm32;
1698 		struct __kernel_old_timeval tm;
1699 		struct  __kernel_sock_timeval stm;
1700 		struct sock_txtime txtime;
1701 		struct so_timestamping timestamping;
1702 	} v;
1703 
1704 	int lv = sizeof(int);
1705 	int len;
1706 
1707 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1708 		return -EFAULT;
1709 	if (len < 0)
1710 		return -EINVAL;
1711 
1712 	memset(&v, 0, sizeof(v));
1713 
1714 	switch (optname) {
1715 	case SO_DEBUG:
1716 		v.val = sock_flag(sk, SOCK_DBG);
1717 		break;
1718 
1719 	case SO_DONTROUTE:
1720 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1721 		break;
1722 
1723 	case SO_BROADCAST:
1724 		v.val = sock_flag(sk, SOCK_BROADCAST);
1725 		break;
1726 
1727 	case SO_SNDBUF:
1728 		v.val = READ_ONCE(sk->sk_sndbuf);
1729 		break;
1730 
1731 	case SO_RCVBUF:
1732 		v.val = READ_ONCE(sk->sk_rcvbuf);
1733 		break;
1734 
1735 	case SO_REUSEADDR:
1736 		v.val = sk->sk_reuse;
1737 		break;
1738 
1739 	case SO_REUSEPORT:
1740 		v.val = sk->sk_reuseport;
1741 		break;
1742 
1743 	case SO_KEEPALIVE:
1744 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1745 		break;
1746 
1747 	case SO_TYPE:
1748 		v.val = sk->sk_type;
1749 		break;
1750 
1751 	case SO_PROTOCOL:
1752 		v.val = sk->sk_protocol;
1753 		break;
1754 
1755 	case SO_DOMAIN:
1756 		v.val = sk->sk_family;
1757 		break;
1758 
1759 	case SO_ERROR:
1760 		v.val = -sock_error(sk);
1761 		if (v.val == 0)
1762 			v.val = xchg(&sk->sk_err_soft, 0);
1763 		break;
1764 
1765 	case SO_OOBINLINE:
1766 		v.val = sock_flag(sk, SOCK_URGINLINE);
1767 		break;
1768 
1769 	case SO_NO_CHECK:
1770 		v.val = sk->sk_no_check_tx;
1771 		break;
1772 
1773 	case SO_PRIORITY:
1774 		v.val = READ_ONCE(sk->sk_priority);
1775 		break;
1776 
1777 	case SO_LINGER:
1778 		lv		= sizeof(v.ling);
1779 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1780 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1781 		break;
1782 
1783 	case SO_BSDCOMPAT:
1784 		break;
1785 
1786 	case SO_TIMESTAMP_OLD:
1787 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1788 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1789 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1790 		break;
1791 
1792 	case SO_TIMESTAMPNS_OLD:
1793 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1794 		break;
1795 
1796 	case SO_TIMESTAMP_NEW:
1797 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1798 		break;
1799 
1800 	case SO_TIMESTAMPNS_NEW:
1801 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1802 		break;
1803 
1804 	case SO_TIMESTAMPING_OLD:
1805 	case SO_TIMESTAMPING_NEW:
1806 		lv = sizeof(v.timestamping);
1807 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1808 		 * returning the flags when they were set through the same option.
1809 		 * Don't change the beviour for the old case SO_TIMESTAMPING_OLD.
1810 		 */
1811 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1812 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1813 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1814 		}
1815 		break;
1816 
1817 	case SO_RCVTIMEO_OLD:
1818 	case SO_RCVTIMEO_NEW:
1819 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1820 				      SO_RCVTIMEO_OLD == optname);
1821 		break;
1822 
1823 	case SO_SNDTIMEO_OLD:
1824 	case SO_SNDTIMEO_NEW:
1825 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1826 				      SO_SNDTIMEO_OLD == optname);
1827 		break;
1828 
1829 	case SO_RCVLOWAT:
1830 		v.val = READ_ONCE(sk->sk_rcvlowat);
1831 		break;
1832 
1833 	case SO_SNDLOWAT:
1834 		v.val = 1;
1835 		break;
1836 
1837 	case SO_PASSCRED:
1838 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1839 		break;
1840 
1841 	case SO_PASSPIDFD:
1842 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1843 		break;
1844 
1845 	case SO_PEERCRED:
1846 	{
1847 		struct ucred peercred;
1848 		if (len > sizeof(peercred))
1849 			len = sizeof(peercred);
1850 
1851 		spin_lock(&sk->sk_peer_lock);
1852 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1853 		spin_unlock(&sk->sk_peer_lock);
1854 
1855 		if (copy_to_sockptr(optval, &peercred, len))
1856 			return -EFAULT;
1857 		goto lenout;
1858 	}
1859 
1860 	case SO_PEERPIDFD:
1861 	{
1862 		struct pid *peer_pid;
1863 		struct file *pidfd_file = NULL;
1864 		int pidfd;
1865 
1866 		if (len > sizeof(pidfd))
1867 			len = sizeof(pidfd);
1868 
1869 		spin_lock(&sk->sk_peer_lock);
1870 		peer_pid = get_pid(sk->sk_peer_pid);
1871 		spin_unlock(&sk->sk_peer_lock);
1872 
1873 		if (!peer_pid)
1874 			return -ENODATA;
1875 
1876 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1877 		put_pid(peer_pid);
1878 		if (pidfd < 0)
1879 			return pidfd;
1880 
1881 		if (copy_to_sockptr(optval, &pidfd, len) ||
1882 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1883 			put_unused_fd(pidfd);
1884 			fput(pidfd_file);
1885 
1886 			return -EFAULT;
1887 		}
1888 
1889 		fd_install(pidfd, pidfd_file);
1890 		return 0;
1891 	}
1892 
1893 	case SO_PEERGROUPS:
1894 	{
1895 		const struct cred *cred;
1896 		int ret, n;
1897 
1898 		cred = sk_get_peer_cred(sk);
1899 		if (!cred)
1900 			return -ENODATA;
1901 
1902 		n = cred->group_info->ngroups;
1903 		if (len < n * sizeof(gid_t)) {
1904 			len = n * sizeof(gid_t);
1905 			put_cred(cred);
1906 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1907 		}
1908 		len = n * sizeof(gid_t);
1909 
1910 		ret = groups_to_user(optval, cred->group_info);
1911 		put_cred(cred);
1912 		if (ret)
1913 			return ret;
1914 		goto lenout;
1915 	}
1916 
1917 	case SO_PEERNAME:
1918 	{
1919 		struct sockaddr_storage address;
1920 
1921 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1922 		if (lv < 0)
1923 			return -ENOTCONN;
1924 		if (lv < len)
1925 			return -EINVAL;
1926 		if (copy_to_sockptr(optval, &address, len))
1927 			return -EFAULT;
1928 		goto lenout;
1929 	}
1930 
1931 	/* Dubious BSD thing... Probably nobody even uses it, but
1932 	 * the UNIX standard wants it for whatever reason... -DaveM
1933 	 */
1934 	case SO_ACCEPTCONN:
1935 		v.val = sk->sk_state == TCP_LISTEN;
1936 		break;
1937 
1938 	case SO_PASSSEC:
1939 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1940 		break;
1941 
1942 	case SO_PEERSEC:
1943 		return security_socket_getpeersec_stream(sock,
1944 							 optval, optlen, len);
1945 
1946 	case SO_MARK:
1947 		v.val = READ_ONCE(sk->sk_mark);
1948 		break;
1949 
1950 	case SO_RCVMARK:
1951 		v.val = sock_flag(sk, SOCK_RCVMARK);
1952 		break;
1953 
1954 	case SO_RCVPRIORITY:
1955 		v.val = sock_flag(sk, SOCK_RCVPRIORITY);
1956 		break;
1957 
1958 	case SO_RXQ_OVFL:
1959 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1960 		break;
1961 
1962 	case SO_WIFI_STATUS:
1963 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1964 		break;
1965 
1966 	case SO_PEEK_OFF:
1967 		if (!READ_ONCE(sock->ops)->set_peek_off)
1968 			return -EOPNOTSUPP;
1969 
1970 		v.val = READ_ONCE(sk->sk_peek_off);
1971 		break;
1972 	case SO_NOFCS:
1973 		v.val = sock_flag(sk, SOCK_NOFCS);
1974 		break;
1975 
1976 	case SO_BINDTODEVICE:
1977 		return sock_getbindtodevice(sk, optval, optlen, len);
1978 
1979 	case SO_GET_FILTER:
1980 		len = sk_get_filter(sk, optval, len);
1981 		if (len < 0)
1982 			return len;
1983 
1984 		goto lenout;
1985 
1986 	case SO_LOCK_FILTER:
1987 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1988 		break;
1989 
1990 	case SO_BPF_EXTENSIONS:
1991 		v.val = bpf_tell_extensions();
1992 		break;
1993 
1994 	case SO_SELECT_ERR_QUEUE:
1995 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1996 		break;
1997 
1998 #ifdef CONFIG_NET_RX_BUSY_POLL
1999 	case SO_BUSY_POLL:
2000 		v.val = READ_ONCE(sk->sk_ll_usec);
2001 		break;
2002 	case SO_PREFER_BUSY_POLL:
2003 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
2004 		break;
2005 #endif
2006 
2007 	case SO_MAX_PACING_RATE:
2008 		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
2009 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2010 			lv = sizeof(v.ulval);
2011 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2012 		} else {
2013 			/* 32bit version */
2014 			v.val = min_t(unsigned long, ~0U,
2015 				      READ_ONCE(sk->sk_max_pacing_rate));
2016 		}
2017 		break;
2018 
2019 	case SO_INCOMING_CPU:
2020 		v.val = READ_ONCE(sk->sk_incoming_cpu);
2021 		break;
2022 
2023 	case SO_MEMINFO:
2024 	{
2025 		u32 meminfo[SK_MEMINFO_VARS];
2026 
2027 		sk_get_meminfo(sk, meminfo);
2028 
2029 		len = min_t(unsigned int, len, sizeof(meminfo));
2030 		if (copy_to_sockptr(optval, &meminfo, len))
2031 			return -EFAULT;
2032 
2033 		goto lenout;
2034 	}
2035 
2036 #ifdef CONFIG_NET_RX_BUSY_POLL
2037 	case SO_INCOMING_NAPI_ID:
2038 		v.val = READ_ONCE(sk->sk_napi_id);
2039 
2040 		/* aggregate non-NAPI IDs down to 0 */
2041 		if (v.val < MIN_NAPI_ID)
2042 			v.val = 0;
2043 
2044 		break;
2045 #endif
2046 
2047 	case SO_COOKIE:
2048 		lv = sizeof(u64);
2049 		if (len < lv)
2050 			return -EINVAL;
2051 		v.val64 = sock_gen_cookie(sk);
2052 		break;
2053 
2054 	case SO_ZEROCOPY:
2055 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
2056 		break;
2057 
2058 	case SO_TXTIME:
2059 		lv = sizeof(v.txtime);
2060 		v.txtime.clockid = sk->sk_clockid;
2061 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2062 				  SOF_TXTIME_DEADLINE_MODE : 0;
2063 		v.txtime.flags |= sk->sk_txtime_report_errors ?
2064 				  SOF_TXTIME_REPORT_ERRORS : 0;
2065 		break;
2066 
2067 	case SO_BINDTOIFINDEX:
2068 		v.val = READ_ONCE(sk->sk_bound_dev_if);
2069 		break;
2070 
2071 	case SO_NETNS_COOKIE:
2072 		lv = sizeof(u64);
2073 		if (len != lv)
2074 			return -EINVAL;
2075 		v.val64 = sock_net(sk)->net_cookie;
2076 		break;
2077 
2078 	case SO_BUF_LOCK:
2079 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2080 		break;
2081 
2082 	case SO_RESERVE_MEM:
2083 		v.val = READ_ONCE(sk->sk_reserved_mem);
2084 		break;
2085 
2086 	case SO_TXREHASH:
2087 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2088 		v.val = READ_ONCE(sk->sk_txrehash);
2089 		break;
2090 
2091 	default:
2092 		/* We implement the SO_SNDLOWAT etc to not be settable
2093 		 * (1003.1g 7).
2094 		 */
2095 		return -ENOPROTOOPT;
2096 	}
2097 
2098 	if (len > lv)
2099 		len = lv;
2100 	if (copy_to_sockptr(optval, &v, len))
2101 		return -EFAULT;
2102 lenout:
2103 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2104 		return -EFAULT;
2105 	return 0;
2106 }
2107 
2108 /*
2109  * Initialize an sk_lock.
2110  *
2111  * (We also register the sk_lock with the lock validator.)
2112  */
2113 static inline void sock_lock_init(struct sock *sk)
2114 {
2115 	if (sk->sk_kern_sock)
2116 		sock_lock_init_class_and_name(
2117 			sk,
2118 			af_family_kern_slock_key_strings[sk->sk_family],
2119 			af_family_kern_slock_keys + sk->sk_family,
2120 			af_family_kern_key_strings[sk->sk_family],
2121 			af_family_kern_keys + sk->sk_family);
2122 	else
2123 		sock_lock_init_class_and_name(
2124 			sk,
2125 			af_family_slock_key_strings[sk->sk_family],
2126 			af_family_slock_keys + sk->sk_family,
2127 			af_family_key_strings[sk->sk_family],
2128 			af_family_keys + sk->sk_family);
2129 }
2130 
2131 /*
2132  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2133  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2134  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2135  */
2136 static void sock_copy(struct sock *nsk, const struct sock *osk)
2137 {
2138 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2139 #ifdef CONFIG_SECURITY_NETWORK
2140 	void *sptr = nsk->sk_security;
2141 #endif
2142 
2143 	/* If we move sk_tx_queue_mapping out of the private section,
2144 	 * we must check if sk_tx_queue_clear() is called after
2145 	 * sock_copy() in sk_clone_lock().
2146 	 */
2147 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2148 		     offsetof(struct sock, sk_dontcopy_begin) ||
2149 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2150 		     offsetof(struct sock, sk_dontcopy_end));
2151 
2152 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2153 
2154 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2155 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2156 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2157 
2158 #ifdef CONFIG_SECURITY_NETWORK
2159 	nsk->sk_security = sptr;
2160 	security_sk_clone(osk, nsk);
2161 #endif
2162 }
2163 
2164 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2165 		int family)
2166 {
2167 	struct sock *sk;
2168 	struct kmem_cache *slab;
2169 
2170 	slab = prot->slab;
2171 	if (slab != NULL) {
2172 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2173 		if (!sk)
2174 			return sk;
2175 		if (want_init_on_alloc(priority))
2176 			sk_prot_clear_nulls(sk, prot->obj_size);
2177 	} else
2178 		sk = kmalloc(prot->obj_size, priority);
2179 
2180 	if (sk != NULL) {
2181 		if (security_sk_alloc(sk, family, priority))
2182 			goto out_free;
2183 
2184 		if (!try_module_get(prot->owner))
2185 			goto out_free_sec;
2186 	}
2187 
2188 	return sk;
2189 
2190 out_free_sec:
2191 	security_sk_free(sk);
2192 out_free:
2193 	if (slab != NULL)
2194 		kmem_cache_free(slab, sk);
2195 	else
2196 		kfree(sk);
2197 	return NULL;
2198 }
2199 
2200 static void sk_prot_free(struct proto *prot, struct sock *sk)
2201 {
2202 	struct kmem_cache *slab;
2203 	struct module *owner;
2204 
2205 	owner = prot->owner;
2206 	slab = prot->slab;
2207 
2208 	cgroup_sk_free(&sk->sk_cgrp_data);
2209 	mem_cgroup_sk_free(sk);
2210 	security_sk_free(sk);
2211 	if (slab != NULL)
2212 		kmem_cache_free(slab, sk);
2213 	else
2214 		kfree(sk);
2215 	module_put(owner);
2216 }
2217 
2218 /**
2219  *	sk_alloc - All socket objects are allocated here
2220  *	@net: the applicable net namespace
2221  *	@family: protocol family
2222  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2223  *	@prot: struct proto associated with this new sock instance
2224  *	@kern: is this to be a kernel socket?
2225  */
2226 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2227 		      struct proto *prot, int kern)
2228 {
2229 	struct sock *sk;
2230 
2231 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2232 	if (sk) {
2233 		sk->sk_family = family;
2234 		/*
2235 		 * See comment in struct sock definition to understand
2236 		 * why we need sk_prot_creator -acme
2237 		 */
2238 		sk->sk_prot = sk->sk_prot_creator = prot;
2239 		sk->sk_kern_sock = kern;
2240 		sock_lock_init(sk);
2241 		sk->sk_net_refcnt = kern ? 0 : 1;
2242 		if (likely(sk->sk_net_refcnt)) {
2243 			get_net_track(net, &sk->ns_tracker, priority);
2244 			sock_inuse_add(net, 1);
2245 		} else {
2246 			__netns_tracker_alloc(net, &sk->ns_tracker,
2247 					      false, priority);
2248 		}
2249 
2250 		sock_net_set(sk, net);
2251 		refcount_set(&sk->sk_wmem_alloc, 1);
2252 
2253 		mem_cgroup_sk_alloc(sk);
2254 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2255 		sock_update_classid(&sk->sk_cgrp_data);
2256 		sock_update_netprioidx(&sk->sk_cgrp_data);
2257 		sk_tx_queue_clear(sk);
2258 	}
2259 
2260 	return sk;
2261 }
2262 EXPORT_SYMBOL(sk_alloc);
2263 
2264 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2265  * grace period. This is the case for UDP sockets and TCP listeners.
2266  */
2267 static void __sk_destruct(struct rcu_head *head)
2268 {
2269 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2270 	struct sk_filter *filter;
2271 
2272 	if (sk->sk_destruct)
2273 		sk->sk_destruct(sk);
2274 
2275 	filter = rcu_dereference_check(sk->sk_filter,
2276 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2277 	if (filter) {
2278 		sk_filter_uncharge(sk, filter);
2279 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2280 	}
2281 
2282 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2283 
2284 #ifdef CONFIG_BPF_SYSCALL
2285 	bpf_sk_storage_free(sk);
2286 #endif
2287 
2288 	if (atomic_read(&sk->sk_omem_alloc))
2289 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2290 			 __func__, atomic_read(&sk->sk_omem_alloc));
2291 
2292 	if (sk->sk_frag.page) {
2293 		put_page(sk->sk_frag.page);
2294 		sk->sk_frag.page = NULL;
2295 	}
2296 
2297 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2298 	put_cred(sk->sk_peer_cred);
2299 	put_pid(sk->sk_peer_pid);
2300 
2301 	if (likely(sk->sk_net_refcnt))
2302 		put_net_track(sock_net(sk), &sk->ns_tracker);
2303 	else
2304 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2305 
2306 	sk_prot_free(sk->sk_prot_creator, sk);
2307 }
2308 
2309 void sk_destruct(struct sock *sk)
2310 {
2311 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2312 
2313 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2314 		reuseport_detach_sock(sk);
2315 		use_call_rcu = true;
2316 	}
2317 
2318 	if (use_call_rcu)
2319 		call_rcu(&sk->sk_rcu, __sk_destruct);
2320 	else
2321 		__sk_destruct(&sk->sk_rcu);
2322 }
2323 
2324 static void __sk_free(struct sock *sk)
2325 {
2326 	if (likely(sk->sk_net_refcnt))
2327 		sock_inuse_add(sock_net(sk), -1);
2328 
2329 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2330 		sock_diag_broadcast_destroy(sk);
2331 	else
2332 		sk_destruct(sk);
2333 }
2334 
2335 void sk_free(struct sock *sk)
2336 {
2337 	/*
2338 	 * We subtract one from sk_wmem_alloc and can know if
2339 	 * some packets are still in some tx queue.
2340 	 * If not null, sock_wfree() will call __sk_free(sk) later
2341 	 */
2342 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2343 		__sk_free(sk);
2344 }
2345 EXPORT_SYMBOL(sk_free);
2346 
2347 static void sk_init_common(struct sock *sk)
2348 {
2349 	skb_queue_head_init(&sk->sk_receive_queue);
2350 	skb_queue_head_init(&sk->sk_write_queue);
2351 	skb_queue_head_init(&sk->sk_error_queue);
2352 
2353 	rwlock_init(&sk->sk_callback_lock);
2354 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2355 			af_rlock_keys + sk->sk_family,
2356 			af_family_rlock_key_strings[sk->sk_family]);
2357 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2358 			af_wlock_keys + sk->sk_family,
2359 			af_family_wlock_key_strings[sk->sk_family]);
2360 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2361 			af_elock_keys + sk->sk_family,
2362 			af_family_elock_key_strings[sk->sk_family]);
2363 	if (sk->sk_kern_sock)
2364 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2365 			af_kern_callback_keys + sk->sk_family,
2366 			af_family_kern_clock_key_strings[sk->sk_family]);
2367 	else
2368 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2369 			af_callback_keys + sk->sk_family,
2370 			af_family_clock_key_strings[sk->sk_family]);
2371 }
2372 
2373 /**
2374  *	sk_clone_lock - clone a socket, and lock its clone
2375  *	@sk: the socket to clone
2376  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2377  *
2378  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2379  */
2380 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2381 {
2382 	struct proto *prot = READ_ONCE(sk->sk_prot);
2383 	struct sk_filter *filter;
2384 	bool is_charged = true;
2385 	struct sock *newsk;
2386 
2387 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2388 	if (!newsk)
2389 		goto out;
2390 
2391 	sock_copy(newsk, sk);
2392 
2393 	newsk->sk_prot_creator = prot;
2394 
2395 	/* SANITY */
2396 	if (likely(newsk->sk_net_refcnt)) {
2397 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2398 		sock_inuse_add(sock_net(newsk), 1);
2399 	} else {
2400 		/* Kernel sockets are not elevating the struct net refcount.
2401 		 * Instead, use a tracker to more easily detect if a layer
2402 		 * is not properly dismantling its kernel sockets at netns
2403 		 * destroy time.
2404 		 */
2405 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2406 				      false, priority);
2407 	}
2408 	sk_node_init(&newsk->sk_node);
2409 	sock_lock_init(newsk);
2410 	bh_lock_sock(newsk);
2411 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2412 	newsk->sk_backlog.len = 0;
2413 
2414 	atomic_set(&newsk->sk_rmem_alloc, 0);
2415 
2416 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2417 	refcount_set(&newsk->sk_wmem_alloc, 1);
2418 
2419 	atomic_set(&newsk->sk_omem_alloc, 0);
2420 	sk_init_common(newsk);
2421 
2422 	newsk->sk_dst_cache	= NULL;
2423 	newsk->sk_dst_pending_confirm = 0;
2424 	newsk->sk_wmem_queued	= 0;
2425 	newsk->sk_forward_alloc = 0;
2426 	newsk->sk_reserved_mem  = 0;
2427 	atomic_set(&newsk->sk_drops, 0);
2428 	newsk->sk_send_head	= NULL;
2429 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2430 	atomic_set(&newsk->sk_zckey, 0);
2431 
2432 	sock_reset_flag(newsk, SOCK_DONE);
2433 
2434 	/* sk->sk_memcg will be populated at accept() time */
2435 	newsk->sk_memcg = NULL;
2436 
2437 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2438 
2439 	rcu_read_lock();
2440 	filter = rcu_dereference(sk->sk_filter);
2441 	if (filter != NULL)
2442 		/* though it's an empty new sock, the charging may fail
2443 		 * if sysctl_optmem_max was changed between creation of
2444 		 * original socket and cloning
2445 		 */
2446 		is_charged = sk_filter_charge(newsk, filter);
2447 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2448 	rcu_read_unlock();
2449 
2450 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2451 		/* We need to make sure that we don't uncharge the new
2452 		 * socket if we couldn't charge it in the first place
2453 		 * as otherwise we uncharge the parent's filter.
2454 		 */
2455 		if (!is_charged)
2456 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2457 		sk_free_unlock_clone(newsk);
2458 		newsk = NULL;
2459 		goto out;
2460 	}
2461 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2462 
2463 	if (bpf_sk_storage_clone(sk, newsk)) {
2464 		sk_free_unlock_clone(newsk);
2465 		newsk = NULL;
2466 		goto out;
2467 	}
2468 
2469 	/* Clear sk_user_data if parent had the pointer tagged
2470 	 * as not suitable for copying when cloning.
2471 	 */
2472 	if (sk_user_data_is_nocopy(newsk))
2473 		newsk->sk_user_data = NULL;
2474 
2475 	newsk->sk_err	   = 0;
2476 	newsk->sk_err_soft = 0;
2477 	newsk->sk_priority = 0;
2478 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2479 
2480 	/* Before updating sk_refcnt, we must commit prior changes to memory
2481 	 * (Documentation/RCU/rculist_nulls.rst for details)
2482 	 */
2483 	smp_wmb();
2484 	refcount_set(&newsk->sk_refcnt, 2);
2485 
2486 	sk_set_socket(newsk, NULL);
2487 	sk_tx_queue_clear(newsk);
2488 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2489 
2490 	if (newsk->sk_prot->sockets_allocated)
2491 		sk_sockets_allocated_inc(newsk);
2492 
2493 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2494 		net_enable_timestamp();
2495 out:
2496 	return newsk;
2497 }
2498 EXPORT_SYMBOL_GPL(sk_clone_lock);
2499 
2500 void sk_free_unlock_clone(struct sock *sk)
2501 {
2502 	/* It is still raw copy of parent, so invalidate
2503 	 * destructor and make plain sk_free() */
2504 	sk->sk_destruct = NULL;
2505 	bh_unlock_sock(sk);
2506 	sk_free(sk);
2507 }
2508 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2509 
2510 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2511 {
2512 	bool is_ipv6 = false;
2513 	u32 max_size;
2514 
2515 #if IS_ENABLED(CONFIG_IPV6)
2516 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2517 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2518 #endif
2519 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2520 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2521 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2522 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2523 		max_size = GSO_LEGACY_MAX_SIZE;
2524 
2525 	return max_size - (MAX_TCP_HEADER + 1);
2526 }
2527 
2528 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2529 {
2530 	u32 max_segs = 1;
2531 
2532 	sk->sk_route_caps = dst->dev->features;
2533 	if (sk_is_tcp(sk))
2534 		sk->sk_route_caps |= NETIF_F_GSO;
2535 	if (sk->sk_route_caps & NETIF_F_GSO)
2536 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2537 	if (unlikely(sk->sk_gso_disabled))
2538 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2539 	if (sk_can_gso(sk)) {
2540 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2541 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2542 		} else {
2543 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2544 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2545 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2546 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2547 		}
2548 	}
2549 	sk->sk_gso_max_segs = max_segs;
2550 	sk_dst_set(sk, dst);
2551 }
2552 EXPORT_SYMBOL_GPL(sk_setup_caps);
2553 
2554 /*
2555  *	Simple resource managers for sockets.
2556  */
2557 
2558 
2559 /*
2560  * Write buffer destructor automatically called from kfree_skb.
2561  */
2562 void sock_wfree(struct sk_buff *skb)
2563 {
2564 	struct sock *sk = skb->sk;
2565 	unsigned int len = skb->truesize;
2566 	bool free;
2567 
2568 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2569 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2570 		    sk->sk_write_space == sock_def_write_space) {
2571 			rcu_read_lock();
2572 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2573 			sock_def_write_space_wfree(sk);
2574 			rcu_read_unlock();
2575 			if (unlikely(free))
2576 				__sk_free(sk);
2577 			return;
2578 		}
2579 
2580 		/*
2581 		 * Keep a reference on sk_wmem_alloc, this will be released
2582 		 * after sk_write_space() call
2583 		 */
2584 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2585 		sk->sk_write_space(sk);
2586 		len = 1;
2587 	}
2588 	/*
2589 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2590 	 * could not do because of in-flight packets
2591 	 */
2592 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2593 		__sk_free(sk);
2594 }
2595 EXPORT_SYMBOL(sock_wfree);
2596 
2597 /* This variant of sock_wfree() is used by TCP,
2598  * since it sets SOCK_USE_WRITE_QUEUE.
2599  */
2600 void __sock_wfree(struct sk_buff *skb)
2601 {
2602 	struct sock *sk = skb->sk;
2603 
2604 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2605 		__sk_free(sk);
2606 }
2607 
2608 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2609 {
2610 	skb_orphan(skb);
2611 #ifdef CONFIG_INET
2612 	if (unlikely(!sk_fullsock(sk)))
2613 		return skb_set_owner_edemux(skb, sk);
2614 #endif
2615 	skb->sk = sk;
2616 	skb->destructor = sock_wfree;
2617 	skb_set_hash_from_sk(skb, sk);
2618 	/*
2619 	 * We used to take a refcount on sk, but following operation
2620 	 * is enough to guarantee sk_free() won't free this sock until
2621 	 * all in-flight packets are completed
2622 	 */
2623 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2624 }
2625 EXPORT_SYMBOL(skb_set_owner_w);
2626 
2627 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2628 {
2629 	/* Drivers depend on in-order delivery for crypto offload,
2630 	 * partial orphan breaks out-of-order-OK logic.
2631 	 */
2632 	if (skb_is_decrypted(skb))
2633 		return false;
2634 
2635 	return (skb->destructor == sock_wfree ||
2636 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2637 }
2638 
2639 /* This helper is used by netem, as it can hold packets in its
2640  * delay queue. We want to allow the owner socket to send more
2641  * packets, as if they were already TX completed by a typical driver.
2642  * But we also want to keep skb->sk set because some packet schedulers
2643  * rely on it (sch_fq for example).
2644  */
2645 void skb_orphan_partial(struct sk_buff *skb)
2646 {
2647 	if (skb_is_tcp_pure_ack(skb))
2648 		return;
2649 
2650 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2651 		return;
2652 
2653 	skb_orphan(skb);
2654 }
2655 EXPORT_SYMBOL(skb_orphan_partial);
2656 
2657 /*
2658  * Read buffer destructor automatically called from kfree_skb.
2659  */
2660 void sock_rfree(struct sk_buff *skb)
2661 {
2662 	struct sock *sk = skb->sk;
2663 	unsigned int len = skb->truesize;
2664 
2665 	atomic_sub(len, &sk->sk_rmem_alloc);
2666 	sk_mem_uncharge(sk, len);
2667 }
2668 EXPORT_SYMBOL(sock_rfree);
2669 
2670 /*
2671  * Buffer destructor for skbs that are not used directly in read or write
2672  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2673  */
2674 void sock_efree(struct sk_buff *skb)
2675 {
2676 	sock_put(skb->sk);
2677 }
2678 EXPORT_SYMBOL(sock_efree);
2679 
2680 /* Buffer destructor for prefetch/receive path where reference count may
2681  * not be held, e.g. for listen sockets.
2682  */
2683 #ifdef CONFIG_INET
2684 void sock_pfree(struct sk_buff *skb)
2685 {
2686 	struct sock *sk = skb->sk;
2687 
2688 	if (!sk_is_refcounted(sk))
2689 		return;
2690 
2691 	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2692 		inet_reqsk(sk)->rsk_listener = NULL;
2693 		reqsk_free(inet_reqsk(sk));
2694 		return;
2695 	}
2696 
2697 	sock_gen_put(sk);
2698 }
2699 EXPORT_SYMBOL(sock_pfree);
2700 #endif /* CONFIG_INET */
2701 
2702 kuid_t sock_i_uid(struct sock *sk)
2703 {
2704 	kuid_t uid;
2705 
2706 	read_lock_bh(&sk->sk_callback_lock);
2707 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2708 	read_unlock_bh(&sk->sk_callback_lock);
2709 	return uid;
2710 }
2711 EXPORT_SYMBOL(sock_i_uid);
2712 
2713 unsigned long __sock_i_ino(struct sock *sk)
2714 {
2715 	unsigned long ino;
2716 
2717 	read_lock(&sk->sk_callback_lock);
2718 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2719 	read_unlock(&sk->sk_callback_lock);
2720 	return ino;
2721 }
2722 EXPORT_SYMBOL(__sock_i_ino);
2723 
2724 unsigned long sock_i_ino(struct sock *sk)
2725 {
2726 	unsigned long ino;
2727 
2728 	local_bh_disable();
2729 	ino = __sock_i_ino(sk);
2730 	local_bh_enable();
2731 	return ino;
2732 }
2733 EXPORT_SYMBOL(sock_i_ino);
2734 
2735 /*
2736  * Allocate a skb from the socket's send buffer.
2737  */
2738 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2739 			     gfp_t priority)
2740 {
2741 	if (force ||
2742 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2743 		struct sk_buff *skb = alloc_skb(size, priority);
2744 
2745 		if (skb) {
2746 			skb_set_owner_w(skb, sk);
2747 			return skb;
2748 		}
2749 	}
2750 	return NULL;
2751 }
2752 EXPORT_SYMBOL(sock_wmalloc);
2753 
2754 static void sock_ofree(struct sk_buff *skb)
2755 {
2756 	struct sock *sk = skb->sk;
2757 
2758 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2759 }
2760 
2761 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2762 			     gfp_t priority)
2763 {
2764 	struct sk_buff *skb;
2765 
2766 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2767 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2768 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2769 		return NULL;
2770 
2771 	skb = alloc_skb(size, priority);
2772 	if (!skb)
2773 		return NULL;
2774 
2775 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2776 	skb->sk = sk;
2777 	skb->destructor = sock_ofree;
2778 	return skb;
2779 }
2780 
2781 /*
2782  * Allocate a memory block from the socket's option memory buffer.
2783  */
2784 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2785 {
2786 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2787 
2788 	if ((unsigned int)size <= optmem_max &&
2789 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2790 		void *mem;
2791 		/* First do the add, to avoid the race if kmalloc
2792 		 * might sleep.
2793 		 */
2794 		atomic_add(size, &sk->sk_omem_alloc);
2795 		mem = kmalloc(size, priority);
2796 		if (mem)
2797 			return mem;
2798 		atomic_sub(size, &sk->sk_omem_alloc);
2799 	}
2800 	return NULL;
2801 }
2802 EXPORT_SYMBOL(sock_kmalloc);
2803 
2804 /* Free an option memory block. Note, we actually want the inline
2805  * here as this allows gcc to detect the nullify and fold away the
2806  * condition entirely.
2807  */
2808 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2809 				  const bool nullify)
2810 {
2811 	if (WARN_ON_ONCE(!mem))
2812 		return;
2813 	if (nullify)
2814 		kfree_sensitive(mem);
2815 	else
2816 		kfree(mem);
2817 	atomic_sub(size, &sk->sk_omem_alloc);
2818 }
2819 
2820 void sock_kfree_s(struct sock *sk, void *mem, int size)
2821 {
2822 	__sock_kfree_s(sk, mem, size, false);
2823 }
2824 EXPORT_SYMBOL(sock_kfree_s);
2825 
2826 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2827 {
2828 	__sock_kfree_s(sk, mem, size, true);
2829 }
2830 EXPORT_SYMBOL(sock_kzfree_s);
2831 
2832 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2833    I think, these locks should be removed for datagram sockets.
2834  */
2835 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2836 {
2837 	DEFINE_WAIT(wait);
2838 
2839 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2840 	for (;;) {
2841 		if (!timeo)
2842 			break;
2843 		if (signal_pending(current))
2844 			break;
2845 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2846 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2847 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2848 			break;
2849 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2850 			break;
2851 		if (READ_ONCE(sk->sk_err))
2852 			break;
2853 		timeo = schedule_timeout(timeo);
2854 	}
2855 	finish_wait(sk_sleep(sk), &wait);
2856 	return timeo;
2857 }
2858 
2859 
2860 /*
2861  *	Generic send/receive buffer handlers
2862  */
2863 
2864 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2865 				     unsigned long data_len, int noblock,
2866 				     int *errcode, int max_page_order)
2867 {
2868 	struct sk_buff *skb;
2869 	long timeo;
2870 	int err;
2871 
2872 	timeo = sock_sndtimeo(sk, noblock);
2873 	for (;;) {
2874 		err = sock_error(sk);
2875 		if (err != 0)
2876 			goto failure;
2877 
2878 		err = -EPIPE;
2879 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2880 			goto failure;
2881 
2882 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2883 			break;
2884 
2885 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2886 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2887 		err = -EAGAIN;
2888 		if (!timeo)
2889 			goto failure;
2890 		if (signal_pending(current))
2891 			goto interrupted;
2892 		timeo = sock_wait_for_wmem(sk, timeo);
2893 	}
2894 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2895 				   errcode, sk->sk_allocation);
2896 	if (skb)
2897 		skb_set_owner_w(skb, sk);
2898 	return skb;
2899 
2900 interrupted:
2901 	err = sock_intr_errno(timeo);
2902 failure:
2903 	*errcode = err;
2904 	return NULL;
2905 }
2906 EXPORT_SYMBOL(sock_alloc_send_pskb);
2907 
2908 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2909 		     struct sockcm_cookie *sockc)
2910 {
2911 	u32 tsflags;
2912 
2913 	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
2914 
2915 	switch (cmsg->cmsg_type) {
2916 	case SO_MARK:
2917 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2918 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2919 			return -EPERM;
2920 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2921 			return -EINVAL;
2922 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2923 		break;
2924 	case SO_TIMESTAMPING_OLD:
2925 	case SO_TIMESTAMPING_NEW:
2926 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2927 			return -EINVAL;
2928 
2929 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2930 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2931 			return -EINVAL;
2932 
2933 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2934 		sockc->tsflags |= tsflags;
2935 		break;
2936 	case SCM_TXTIME:
2937 		if (!sock_flag(sk, SOCK_TXTIME))
2938 			return -EINVAL;
2939 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2940 			return -EINVAL;
2941 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2942 		break;
2943 	case SCM_TS_OPT_ID:
2944 		if (sk_is_tcp(sk))
2945 			return -EINVAL;
2946 		tsflags = READ_ONCE(sk->sk_tsflags);
2947 		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
2948 			return -EINVAL;
2949 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2950 			return -EINVAL;
2951 		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
2952 		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
2953 		break;
2954 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2955 	case SCM_RIGHTS:
2956 	case SCM_CREDENTIALS:
2957 		break;
2958 	case SO_PRIORITY:
2959 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2960 			return -EINVAL;
2961 		if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
2962 			return -EPERM;
2963 		sockc->priority = *(u32 *)CMSG_DATA(cmsg);
2964 		break;
2965 	default:
2966 		return -EINVAL;
2967 	}
2968 	return 0;
2969 }
2970 EXPORT_SYMBOL(__sock_cmsg_send);
2971 
2972 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2973 		   struct sockcm_cookie *sockc)
2974 {
2975 	struct cmsghdr *cmsg;
2976 	int ret;
2977 
2978 	for_each_cmsghdr(cmsg, msg) {
2979 		if (!CMSG_OK(msg, cmsg))
2980 			return -EINVAL;
2981 		if (cmsg->cmsg_level != SOL_SOCKET)
2982 			continue;
2983 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2984 		if (ret)
2985 			return ret;
2986 	}
2987 	return 0;
2988 }
2989 EXPORT_SYMBOL(sock_cmsg_send);
2990 
2991 static void sk_enter_memory_pressure(struct sock *sk)
2992 {
2993 	if (!sk->sk_prot->enter_memory_pressure)
2994 		return;
2995 
2996 	sk->sk_prot->enter_memory_pressure(sk);
2997 }
2998 
2999 static void sk_leave_memory_pressure(struct sock *sk)
3000 {
3001 	if (sk->sk_prot->leave_memory_pressure) {
3002 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
3003 				     tcp_leave_memory_pressure, sk);
3004 	} else {
3005 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
3006 
3007 		if (memory_pressure && READ_ONCE(*memory_pressure))
3008 			WRITE_ONCE(*memory_pressure, 0);
3009 	}
3010 }
3011 
3012 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
3013 
3014 /**
3015  * skb_page_frag_refill - check that a page_frag contains enough room
3016  * @sz: minimum size of the fragment we want to get
3017  * @pfrag: pointer to page_frag
3018  * @gfp: priority for memory allocation
3019  *
3020  * Note: While this allocator tries to use high order pages, there is
3021  * no guarantee that allocations succeed. Therefore, @sz MUST be
3022  * less or equal than PAGE_SIZE.
3023  */
3024 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3025 {
3026 	if (pfrag->page) {
3027 		if (page_ref_count(pfrag->page) == 1) {
3028 			pfrag->offset = 0;
3029 			return true;
3030 		}
3031 		if (pfrag->offset + sz <= pfrag->size)
3032 			return true;
3033 		put_page(pfrag->page);
3034 	}
3035 
3036 	pfrag->offset = 0;
3037 	if (SKB_FRAG_PAGE_ORDER &&
3038 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3039 		/* Avoid direct reclaim but allow kswapd to wake */
3040 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3041 					  __GFP_COMP | __GFP_NOWARN |
3042 					  __GFP_NORETRY,
3043 					  SKB_FRAG_PAGE_ORDER);
3044 		if (likely(pfrag->page)) {
3045 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3046 			return true;
3047 		}
3048 	}
3049 	pfrag->page = alloc_page(gfp);
3050 	if (likely(pfrag->page)) {
3051 		pfrag->size = PAGE_SIZE;
3052 		return true;
3053 	}
3054 	return false;
3055 }
3056 EXPORT_SYMBOL(skb_page_frag_refill);
3057 
3058 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3059 {
3060 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3061 		return true;
3062 
3063 	sk_enter_memory_pressure(sk);
3064 	sk_stream_moderate_sndbuf(sk);
3065 	return false;
3066 }
3067 EXPORT_SYMBOL(sk_page_frag_refill);
3068 
3069 void __lock_sock(struct sock *sk)
3070 	__releases(&sk->sk_lock.slock)
3071 	__acquires(&sk->sk_lock.slock)
3072 {
3073 	DEFINE_WAIT(wait);
3074 
3075 	for (;;) {
3076 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3077 					TASK_UNINTERRUPTIBLE);
3078 		spin_unlock_bh(&sk->sk_lock.slock);
3079 		schedule();
3080 		spin_lock_bh(&sk->sk_lock.slock);
3081 		if (!sock_owned_by_user(sk))
3082 			break;
3083 	}
3084 	finish_wait(&sk->sk_lock.wq, &wait);
3085 }
3086 
3087 void __release_sock(struct sock *sk)
3088 	__releases(&sk->sk_lock.slock)
3089 	__acquires(&sk->sk_lock.slock)
3090 {
3091 	struct sk_buff *skb, *next;
3092 
3093 	while ((skb = sk->sk_backlog.head) != NULL) {
3094 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3095 
3096 		spin_unlock_bh(&sk->sk_lock.slock);
3097 
3098 		do {
3099 			next = skb->next;
3100 			prefetch(next);
3101 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3102 			skb_mark_not_on_list(skb);
3103 			sk_backlog_rcv(sk, skb);
3104 
3105 			cond_resched();
3106 
3107 			skb = next;
3108 		} while (skb != NULL);
3109 
3110 		spin_lock_bh(&sk->sk_lock.slock);
3111 	}
3112 
3113 	/*
3114 	 * Doing the zeroing here guarantee we can not loop forever
3115 	 * while a wild producer attempts to flood us.
3116 	 */
3117 	sk->sk_backlog.len = 0;
3118 }
3119 
3120 void __sk_flush_backlog(struct sock *sk)
3121 {
3122 	spin_lock_bh(&sk->sk_lock.slock);
3123 	__release_sock(sk);
3124 
3125 	if (sk->sk_prot->release_cb)
3126 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3127 				     tcp_release_cb, sk);
3128 
3129 	spin_unlock_bh(&sk->sk_lock.slock);
3130 }
3131 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3132 
3133 /**
3134  * sk_wait_data - wait for data to arrive at sk_receive_queue
3135  * @sk:    sock to wait on
3136  * @timeo: for how long
3137  * @skb:   last skb seen on sk_receive_queue
3138  *
3139  * Now socket state including sk->sk_err is changed only under lock,
3140  * hence we may omit checks after joining wait queue.
3141  * We check receive queue before schedule() only as optimization;
3142  * it is very likely that release_sock() added new data.
3143  */
3144 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3145 {
3146 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3147 	int rc;
3148 
3149 	add_wait_queue(sk_sleep(sk), &wait);
3150 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3151 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3152 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3153 	remove_wait_queue(sk_sleep(sk), &wait);
3154 	return rc;
3155 }
3156 EXPORT_SYMBOL(sk_wait_data);
3157 
3158 /**
3159  *	__sk_mem_raise_allocated - increase memory_allocated
3160  *	@sk: socket
3161  *	@size: memory size to allocate
3162  *	@amt: pages to allocate
3163  *	@kind: allocation type
3164  *
3165  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3166  *
3167  *	Unlike the globally shared limits among the sockets under same protocol,
3168  *	consuming the budget of a memcg won't have direct effect on other ones.
3169  *	So be optimistic about memcg's tolerance, and leave the callers to decide
3170  *	whether or not to raise allocated through sk_under_memory_pressure() or
3171  *	its variants.
3172  */
3173 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3174 {
3175 	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3176 	struct proto *prot = sk->sk_prot;
3177 	bool charged = false;
3178 	long allocated;
3179 
3180 	sk_memory_allocated_add(sk, amt);
3181 	allocated = sk_memory_allocated(sk);
3182 
3183 	if (memcg) {
3184 		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3185 			goto suppress_allocation;
3186 		charged = true;
3187 	}
3188 
3189 	/* Under limit. */
3190 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3191 		sk_leave_memory_pressure(sk);
3192 		return 1;
3193 	}
3194 
3195 	/* Under pressure. */
3196 	if (allocated > sk_prot_mem_limits(sk, 1))
3197 		sk_enter_memory_pressure(sk);
3198 
3199 	/* Over hard limit. */
3200 	if (allocated > sk_prot_mem_limits(sk, 2))
3201 		goto suppress_allocation;
3202 
3203 	/* Guarantee minimum buffer size under pressure (either global
3204 	 * or memcg) to make sure features described in RFC 7323 (TCP
3205 	 * Extensions for High Performance) work properly.
3206 	 *
3207 	 * This rule does NOT stand when exceeds global or memcg's hard
3208 	 * limit, or else a DoS attack can be taken place by spawning
3209 	 * lots of sockets whose usage are under minimum buffer size.
3210 	 */
3211 	if (kind == SK_MEM_RECV) {
3212 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3213 			return 1;
3214 
3215 	} else { /* SK_MEM_SEND */
3216 		int wmem0 = sk_get_wmem0(sk, prot);
3217 
3218 		if (sk->sk_type == SOCK_STREAM) {
3219 			if (sk->sk_wmem_queued < wmem0)
3220 				return 1;
3221 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3222 				return 1;
3223 		}
3224 	}
3225 
3226 	if (sk_has_memory_pressure(sk)) {
3227 		u64 alloc;
3228 
3229 		/* The following 'average' heuristic is within the
3230 		 * scope of global accounting, so it only makes
3231 		 * sense for global memory pressure.
3232 		 */
3233 		if (!sk_under_global_memory_pressure(sk))
3234 			return 1;
3235 
3236 		/* Try to be fair among all the sockets under global
3237 		 * pressure by allowing the ones that below average
3238 		 * usage to raise.
3239 		 */
3240 		alloc = sk_sockets_allocated_read_positive(sk);
3241 		if (sk_prot_mem_limits(sk, 2) > alloc *
3242 		    sk_mem_pages(sk->sk_wmem_queued +
3243 				 atomic_read(&sk->sk_rmem_alloc) +
3244 				 sk->sk_forward_alloc))
3245 			return 1;
3246 	}
3247 
3248 suppress_allocation:
3249 
3250 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3251 		sk_stream_moderate_sndbuf(sk);
3252 
3253 		/* Fail only if socket is _under_ its sndbuf.
3254 		 * In this case we cannot block, so that we have to fail.
3255 		 */
3256 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3257 			/* Force charge with __GFP_NOFAIL */
3258 			if (memcg && !charged) {
3259 				mem_cgroup_charge_skmem(memcg, amt,
3260 					gfp_memcg_charge() | __GFP_NOFAIL);
3261 			}
3262 			return 1;
3263 		}
3264 	}
3265 
3266 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3267 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3268 
3269 	sk_memory_allocated_sub(sk, amt);
3270 
3271 	if (charged)
3272 		mem_cgroup_uncharge_skmem(memcg, amt);
3273 
3274 	return 0;
3275 }
3276 
3277 /**
3278  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3279  *	@sk: socket
3280  *	@size: memory size to allocate
3281  *	@kind: allocation type
3282  *
3283  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3284  *	rmem allocation. This function assumes that protocols which have
3285  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3286  */
3287 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3288 {
3289 	int ret, amt = sk_mem_pages(size);
3290 
3291 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3292 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3293 	if (!ret)
3294 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3295 	return ret;
3296 }
3297 EXPORT_SYMBOL(__sk_mem_schedule);
3298 
3299 /**
3300  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3301  *	@sk: socket
3302  *	@amount: number of quanta
3303  *
3304  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3305  */
3306 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3307 {
3308 	sk_memory_allocated_sub(sk, amount);
3309 
3310 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3311 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3312 
3313 	if (sk_under_global_memory_pressure(sk) &&
3314 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3315 		sk_leave_memory_pressure(sk);
3316 }
3317 
3318 /**
3319  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3320  *	@sk: socket
3321  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3322  */
3323 void __sk_mem_reclaim(struct sock *sk, int amount)
3324 {
3325 	amount >>= PAGE_SHIFT;
3326 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3327 	__sk_mem_reduce_allocated(sk, amount);
3328 }
3329 EXPORT_SYMBOL(__sk_mem_reclaim);
3330 
3331 int sk_set_peek_off(struct sock *sk, int val)
3332 {
3333 	WRITE_ONCE(sk->sk_peek_off, val);
3334 	return 0;
3335 }
3336 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3337 
3338 /*
3339  * Set of default routines for initialising struct proto_ops when
3340  * the protocol does not support a particular function. In certain
3341  * cases where it makes no sense for a protocol to have a "do nothing"
3342  * function, some default processing is provided.
3343  */
3344 
3345 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3346 {
3347 	return -EOPNOTSUPP;
3348 }
3349 EXPORT_SYMBOL(sock_no_bind);
3350 
3351 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3352 		    int len, int flags)
3353 {
3354 	return -EOPNOTSUPP;
3355 }
3356 EXPORT_SYMBOL(sock_no_connect);
3357 
3358 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3359 {
3360 	return -EOPNOTSUPP;
3361 }
3362 EXPORT_SYMBOL(sock_no_socketpair);
3363 
3364 int sock_no_accept(struct socket *sock, struct socket *newsock,
3365 		   struct proto_accept_arg *arg)
3366 {
3367 	return -EOPNOTSUPP;
3368 }
3369 EXPORT_SYMBOL(sock_no_accept);
3370 
3371 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3372 		    int peer)
3373 {
3374 	return -EOPNOTSUPP;
3375 }
3376 EXPORT_SYMBOL(sock_no_getname);
3377 
3378 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3379 {
3380 	return -EOPNOTSUPP;
3381 }
3382 EXPORT_SYMBOL(sock_no_ioctl);
3383 
3384 int sock_no_listen(struct socket *sock, int backlog)
3385 {
3386 	return -EOPNOTSUPP;
3387 }
3388 EXPORT_SYMBOL(sock_no_listen);
3389 
3390 int sock_no_shutdown(struct socket *sock, int how)
3391 {
3392 	return -EOPNOTSUPP;
3393 }
3394 EXPORT_SYMBOL(sock_no_shutdown);
3395 
3396 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3397 {
3398 	return -EOPNOTSUPP;
3399 }
3400 EXPORT_SYMBOL(sock_no_sendmsg);
3401 
3402 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3403 {
3404 	return -EOPNOTSUPP;
3405 }
3406 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3407 
3408 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3409 		    int flags)
3410 {
3411 	return -EOPNOTSUPP;
3412 }
3413 EXPORT_SYMBOL(sock_no_recvmsg);
3414 
3415 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3416 {
3417 	/* Mirror missing mmap method error code */
3418 	return -ENODEV;
3419 }
3420 EXPORT_SYMBOL(sock_no_mmap);
3421 
3422 /*
3423  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3424  * various sock-based usage counts.
3425  */
3426 void __receive_sock(struct file *file)
3427 {
3428 	struct socket *sock;
3429 
3430 	sock = sock_from_file(file);
3431 	if (sock) {
3432 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3433 		sock_update_classid(&sock->sk->sk_cgrp_data);
3434 	}
3435 }
3436 
3437 /*
3438  *	Default Socket Callbacks
3439  */
3440 
3441 static void sock_def_wakeup(struct sock *sk)
3442 {
3443 	struct socket_wq *wq;
3444 
3445 	rcu_read_lock();
3446 	wq = rcu_dereference(sk->sk_wq);
3447 	if (skwq_has_sleeper(wq))
3448 		wake_up_interruptible_all(&wq->wait);
3449 	rcu_read_unlock();
3450 }
3451 
3452 static void sock_def_error_report(struct sock *sk)
3453 {
3454 	struct socket_wq *wq;
3455 
3456 	rcu_read_lock();
3457 	wq = rcu_dereference(sk->sk_wq);
3458 	if (skwq_has_sleeper(wq))
3459 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3460 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3461 	rcu_read_unlock();
3462 }
3463 
3464 void sock_def_readable(struct sock *sk)
3465 {
3466 	struct socket_wq *wq;
3467 
3468 	trace_sk_data_ready(sk);
3469 
3470 	rcu_read_lock();
3471 	wq = rcu_dereference(sk->sk_wq);
3472 	if (skwq_has_sleeper(wq))
3473 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3474 						EPOLLRDNORM | EPOLLRDBAND);
3475 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3476 	rcu_read_unlock();
3477 }
3478 
3479 static void sock_def_write_space(struct sock *sk)
3480 {
3481 	struct socket_wq *wq;
3482 
3483 	rcu_read_lock();
3484 
3485 	/* Do not wake up a writer until he can make "significant"
3486 	 * progress.  --DaveM
3487 	 */
3488 	if (sock_writeable(sk)) {
3489 		wq = rcu_dereference(sk->sk_wq);
3490 		if (skwq_has_sleeper(wq))
3491 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3492 						EPOLLWRNORM | EPOLLWRBAND);
3493 
3494 		/* Should agree with poll, otherwise some programs break */
3495 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3496 	}
3497 
3498 	rcu_read_unlock();
3499 }
3500 
3501 /* An optimised version of sock_def_write_space(), should only be called
3502  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3503  * ->sk_wmem_alloc.
3504  */
3505 static void sock_def_write_space_wfree(struct sock *sk)
3506 {
3507 	/* Do not wake up a writer until he can make "significant"
3508 	 * progress.  --DaveM
3509 	 */
3510 	if (sock_writeable(sk)) {
3511 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3512 
3513 		/* rely on refcount_sub from sock_wfree() */
3514 		smp_mb__after_atomic();
3515 		if (wq && waitqueue_active(&wq->wait))
3516 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3517 						EPOLLWRNORM | EPOLLWRBAND);
3518 
3519 		/* Should agree with poll, otherwise some programs break */
3520 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3521 	}
3522 }
3523 
3524 static void sock_def_destruct(struct sock *sk)
3525 {
3526 }
3527 
3528 void sk_send_sigurg(struct sock *sk)
3529 {
3530 	if (sk->sk_socket && sk->sk_socket->file)
3531 		if (send_sigurg(sk->sk_socket->file))
3532 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3533 }
3534 EXPORT_SYMBOL(sk_send_sigurg);
3535 
3536 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3537 		    unsigned long expires)
3538 {
3539 	if (!mod_timer(timer, expires))
3540 		sock_hold(sk);
3541 }
3542 EXPORT_SYMBOL(sk_reset_timer);
3543 
3544 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3545 {
3546 	if (del_timer(timer))
3547 		__sock_put(sk);
3548 }
3549 EXPORT_SYMBOL(sk_stop_timer);
3550 
3551 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3552 {
3553 	if (del_timer_sync(timer))
3554 		__sock_put(sk);
3555 }
3556 EXPORT_SYMBOL(sk_stop_timer_sync);
3557 
3558 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3559 {
3560 	sk_init_common(sk);
3561 	sk->sk_send_head	=	NULL;
3562 
3563 	timer_setup(&sk->sk_timer, NULL, 0);
3564 
3565 	sk->sk_allocation	=	GFP_KERNEL;
3566 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3567 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3568 	sk->sk_state		=	TCP_CLOSE;
3569 	sk->sk_use_task_frag	=	true;
3570 	sk_set_socket(sk, sock);
3571 
3572 	sock_set_flag(sk, SOCK_ZAPPED);
3573 
3574 	if (sock) {
3575 		sk->sk_type	=	sock->type;
3576 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3577 		sock->sk	=	sk;
3578 	} else {
3579 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3580 	}
3581 	sk->sk_uid	=	uid;
3582 
3583 	sk->sk_state_change	=	sock_def_wakeup;
3584 	sk->sk_data_ready	=	sock_def_readable;
3585 	sk->sk_write_space	=	sock_def_write_space;
3586 	sk->sk_error_report	=	sock_def_error_report;
3587 	sk->sk_destruct		=	sock_def_destruct;
3588 
3589 	sk->sk_frag.page	=	NULL;
3590 	sk->sk_frag.offset	=	0;
3591 	sk->sk_peek_off		=	-1;
3592 
3593 	sk->sk_peer_pid 	=	NULL;
3594 	sk->sk_peer_cred	=	NULL;
3595 	spin_lock_init(&sk->sk_peer_lock);
3596 
3597 	sk->sk_write_pending	=	0;
3598 	sk->sk_rcvlowat		=	1;
3599 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3600 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3601 
3602 	sk->sk_stamp = SK_DEFAULT_STAMP;
3603 #if BITS_PER_LONG==32
3604 	seqlock_init(&sk->sk_stamp_seq);
3605 #endif
3606 	atomic_set(&sk->sk_zckey, 0);
3607 
3608 #ifdef CONFIG_NET_RX_BUSY_POLL
3609 	sk->sk_napi_id		=	0;
3610 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3611 #endif
3612 
3613 	sk->sk_max_pacing_rate = ~0UL;
3614 	sk->sk_pacing_rate = ~0UL;
3615 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3616 	sk->sk_incoming_cpu = -1;
3617 
3618 	sk_rx_queue_clear(sk);
3619 	/*
3620 	 * Before updating sk_refcnt, we must commit prior changes to memory
3621 	 * (Documentation/RCU/rculist_nulls.rst for details)
3622 	 */
3623 	smp_wmb();
3624 	refcount_set(&sk->sk_refcnt, 1);
3625 	atomic_set(&sk->sk_drops, 0);
3626 }
3627 EXPORT_SYMBOL(sock_init_data_uid);
3628 
3629 void sock_init_data(struct socket *sock, struct sock *sk)
3630 {
3631 	kuid_t uid = sock ?
3632 		SOCK_INODE(sock)->i_uid :
3633 		make_kuid(sock_net(sk)->user_ns, 0);
3634 
3635 	sock_init_data_uid(sock, sk, uid);
3636 }
3637 EXPORT_SYMBOL(sock_init_data);
3638 
3639 void lock_sock_nested(struct sock *sk, int subclass)
3640 {
3641 	/* The sk_lock has mutex_lock() semantics here. */
3642 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3643 
3644 	might_sleep();
3645 	spin_lock_bh(&sk->sk_lock.slock);
3646 	if (sock_owned_by_user_nocheck(sk))
3647 		__lock_sock(sk);
3648 	sk->sk_lock.owned = 1;
3649 	spin_unlock_bh(&sk->sk_lock.slock);
3650 }
3651 EXPORT_SYMBOL(lock_sock_nested);
3652 
3653 void release_sock(struct sock *sk)
3654 {
3655 	spin_lock_bh(&sk->sk_lock.slock);
3656 	if (sk->sk_backlog.tail)
3657 		__release_sock(sk);
3658 
3659 	if (sk->sk_prot->release_cb)
3660 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3661 				     tcp_release_cb, sk);
3662 
3663 	sock_release_ownership(sk);
3664 	if (waitqueue_active(&sk->sk_lock.wq))
3665 		wake_up(&sk->sk_lock.wq);
3666 	spin_unlock_bh(&sk->sk_lock.slock);
3667 }
3668 EXPORT_SYMBOL(release_sock);
3669 
3670 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3671 {
3672 	might_sleep();
3673 	spin_lock_bh(&sk->sk_lock.slock);
3674 
3675 	if (!sock_owned_by_user_nocheck(sk)) {
3676 		/*
3677 		 * Fast path return with bottom halves disabled and
3678 		 * sock::sk_lock.slock held.
3679 		 *
3680 		 * The 'mutex' is not contended and holding
3681 		 * sock::sk_lock.slock prevents all other lockers to
3682 		 * proceed so the corresponding unlock_sock_fast() can
3683 		 * avoid the slow path of release_sock() completely and
3684 		 * just release slock.
3685 		 *
3686 		 * From a semantical POV this is equivalent to 'acquiring'
3687 		 * the 'mutex', hence the corresponding lockdep
3688 		 * mutex_release() has to happen in the fast path of
3689 		 * unlock_sock_fast().
3690 		 */
3691 		return false;
3692 	}
3693 
3694 	__lock_sock(sk);
3695 	sk->sk_lock.owned = 1;
3696 	__acquire(&sk->sk_lock.slock);
3697 	spin_unlock_bh(&sk->sk_lock.slock);
3698 	return true;
3699 }
3700 EXPORT_SYMBOL(__lock_sock_fast);
3701 
3702 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3703 		   bool timeval, bool time32)
3704 {
3705 	struct sock *sk = sock->sk;
3706 	struct timespec64 ts;
3707 
3708 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3709 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3710 	if (ts.tv_sec == -1)
3711 		return -ENOENT;
3712 	if (ts.tv_sec == 0) {
3713 		ktime_t kt = ktime_get_real();
3714 		sock_write_timestamp(sk, kt);
3715 		ts = ktime_to_timespec64(kt);
3716 	}
3717 
3718 	if (timeval)
3719 		ts.tv_nsec /= 1000;
3720 
3721 #ifdef CONFIG_COMPAT_32BIT_TIME
3722 	if (time32)
3723 		return put_old_timespec32(&ts, userstamp);
3724 #endif
3725 #ifdef CONFIG_SPARC64
3726 	/* beware of padding in sparc64 timeval */
3727 	if (timeval && !in_compat_syscall()) {
3728 		struct __kernel_old_timeval __user tv = {
3729 			.tv_sec = ts.tv_sec,
3730 			.tv_usec = ts.tv_nsec,
3731 		};
3732 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3733 			return -EFAULT;
3734 		return 0;
3735 	}
3736 #endif
3737 	return put_timespec64(&ts, userstamp);
3738 }
3739 EXPORT_SYMBOL(sock_gettstamp);
3740 
3741 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3742 {
3743 	if (!sock_flag(sk, flag)) {
3744 		unsigned long previous_flags = sk->sk_flags;
3745 
3746 		sock_set_flag(sk, flag);
3747 		/*
3748 		 * we just set one of the two flags which require net
3749 		 * time stamping, but time stamping might have been on
3750 		 * already because of the other one
3751 		 */
3752 		if (sock_needs_netstamp(sk) &&
3753 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3754 			net_enable_timestamp();
3755 	}
3756 }
3757 
3758 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3759 		       int level, int type)
3760 {
3761 	struct sock_exterr_skb *serr;
3762 	struct sk_buff *skb;
3763 	int copied, err;
3764 
3765 	err = -EAGAIN;
3766 	skb = sock_dequeue_err_skb(sk);
3767 	if (skb == NULL)
3768 		goto out;
3769 
3770 	copied = skb->len;
3771 	if (copied > len) {
3772 		msg->msg_flags |= MSG_TRUNC;
3773 		copied = len;
3774 	}
3775 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3776 	if (err)
3777 		goto out_free_skb;
3778 
3779 	sock_recv_timestamp(msg, sk, skb);
3780 
3781 	serr = SKB_EXT_ERR(skb);
3782 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3783 
3784 	msg->msg_flags |= MSG_ERRQUEUE;
3785 	err = copied;
3786 
3787 out_free_skb:
3788 	kfree_skb(skb);
3789 out:
3790 	return err;
3791 }
3792 EXPORT_SYMBOL(sock_recv_errqueue);
3793 
3794 /*
3795  *	Get a socket option on an socket.
3796  *
3797  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3798  *	asynchronous errors should be reported by getsockopt. We assume
3799  *	this means if you specify SO_ERROR (otherwise what is the point of it).
3800  */
3801 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3802 			   char __user *optval, int __user *optlen)
3803 {
3804 	struct sock *sk = sock->sk;
3805 
3806 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3807 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3808 }
3809 EXPORT_SYMBOL(sock_common_getsockopt);
3810 
3811 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3812 			int flags)
3813 {
3814 	struct sock *sk = sock->sk;
3815 	int addr_len = 0;
3816 	int err;
3817 
3818 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3819 	if (err >= 0)
3820 		msg->msg_namelen = addr_len;
3821 	return err;
3822 }
3823 EXPORT_SYMBOL(sock_common_recvmsg);
3824 
3825 /*
3826  *	Set socket options on an inet socket.
3827  */
3828 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3829 			   sockptr_t optval, unsigned int optlen)
3830 {
3831 	struct sock *sk = sock->sk;
3832 
3833 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3834 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3835 }
3836 EXPORT_SYMBOL(sock_common_setsockopt);
3837 
3838 void sk_common_release(struct sock *sk)
3839 {
3840 	if (sk->sk_prot->destroy)
3841 		sk->sk_prot->destroy(sk);
3842 
3843 	/*
3844 	 * Observation: when sk_common_release is called, processes have
3845 	 * no access to socket. But net still has.
3846 	 * Step one, detach it from networking:
3847 	 *
3848 	 * A. Remove from hash tables.
3849 	 */
3850 
3851 	sk->sk_prot->unhash(sk);
3852 
3853 	/*
3854 	 * In this point socket cannot receive new packets, but it is possible
3855 	 * that some packets are in flight because some CPU runs receiver and
3856 	 * did hash table lookup before we unhashed socket. They will achieve
3857 	 * receive queue and will be purged by socket destructor.
3858 	 *
3859 	 * Also we still have packets pending on receive queue and probably,
3860 	 * our own packets waiting in device queues. sock_destroy will drain
3861 	 * receive queue, but transmitted packets will delay socket destruction
3862 	 * until the last reference will be released.
3863 	 */
3864 
3865 	sock_orphan(sk);
3866 
3867 	xfrm_sk_free_policy(sk);
3868 
3869 	sock_put(sk);
3870 }
3871 EXPORT_SYMBOL(sk_common_release);
3872 
3873 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3874 {
3875 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3876 
3877 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3878 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3879 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3880 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3881 	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3882 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3883 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3884 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3885 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3886 }
3887 
3888 #ifdef CONFIG_PROC_FS
3889 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3890 
3891 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3892 {
3893 	int cpu, idx = prot->inuse_idx;
3894 	int res = 0;
3895 
3896 	for_each_possible_cpu(cpu)
3897 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3898 
3899 	return res >= 0 ? res : 0;
3900 }
3901 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3902 
3903 int sock_inuse_get(struct net *net)
3904 {
3905 	int cpu, res = 0;
3906 
3907 	for_each_possible_cpu(cpu)
3908 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3909 
3910 	return res;
3911 }
3912 
3913 EXPORT_SYMBOL_GPL(sock_inuse_get);
3914 
3915 static int __net_init sock_inuse_init_net(struct net *net)
3916 {
3917 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3918 	if (net->core.prot_inuse == NULL)
3919 		return -ENOMEM;
3920 	return 0;
3921 }
3922 
3923 static void __net_exit sock_inuse_exit_net(struct net *net)
3924 {
3925 	free_percpu(net->core.prot_inuse);
3926 }
3927 
3928 static struct pernet_operations net_inuse_ops = {
3929 	.init = sock_inuse_init_net,
3930 	.exit = sock_inuse_exit_net,
3931 };
3932 
3933 static __init int net_inuse_init(void)
3934 {
3935 	if (register_pernet_subsys(&net_inuse_ops))
3936 		panic("Cannot initialize net inuse counters");
3937 
3938 	return 0;
3939 }
3940 
3941 core_initcall(net_inuse_init);
3942 
3943 static int assign_proto_idx(struct proto *prot)
3944 {
3945 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3946 
3947 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3948 		pr_err("PROTO_INUSE_NR exhausted\n");
3949 		return -ENOSPC;
3950 	}
3951 
3952 	set_bit(prot->inuse_idx, proto_inuse_idx);
3953 	return 0;
3954 }
3955 
3956 static void release_proto_idx(struct proto *prot)
3957 {
3958 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3959 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3960 }
3961 #else
3962 static inline int assign_proto_idx(struct proto *prot)
3963 {
3964 	return 0;
3965 }
3966 
3967 static inline void release_proto_idx(struct proto *prot)
3968 {
3969 }
3970 
3971 #endif
3972 
3973 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3974 {
3975 	if (!twsk_prot)
3976 		return;
3977 	kfree(twsk_prot->twsk_slab_name);
3978 	twsk_prot->twsk_slab_name = NULL;
3979 	kmem_cache_destroy(twsk_prot->twsk_slab);
3980 	twsk_prot->twsk_slab = NULL;
3981 }
3982 
3983 static int tw_prot_init(const struct proto *prot)
3984 {
3985 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3986 
3987 	if (!twsk_prot)
3988 		return 0;
3989 
3990 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3991 					      prot->name);
3992 	if (!twsk_prot->twsk_slab_name)
3993 		return -ENOMEM;
3994 
3995 	twsk_prot->twsk_slab =
3996 		kmem_cache_create(twsk_prot->twsk_slab_name,
3997 				  twsk_prot->twsk_obj_size, 0,
3998 				  SLAB_ACCOUNT | prot->slab_flags,
3999 				  NULL);
4000 	if (!twsk_prot->twsk_slab) {
4001 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
4002 			prot->name);
4003 		return -ENOMEM;
4004 	}
4005 
4006 	return 0;
4007 }
4008 
4009 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
4010 {
4011 	if (!rsk_prot)
4012 		return;
4013 	kfree(rsk_prot->slab_name);
4014 	rsk_prot->slab_name = NULL;
4015 	kmem_cache_destroy(rsk_prot->slab);
4016 	rsk_prot->slab = NULL;
4017 }
4018 
4019 static int req_prot_init(const struct proto *prot)
4020 {
4021 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
4022 
4023 	if (!rsk_prot)
4024 		return 0;
4025 
4026 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4027 					prot->name);
4028 	if (!rsk_prot->slab_name)
4029 		return -ENOMEM;
4030 
4031 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4032 					   rsk_prot->obj_size, 0,
4033 					   SLAB_ACCOUNT | prot->slab_flags,
4034 					   NULL);
4035 
4036 	if (!rsk_prot->slab) {
4037 		pr_crit("%s: Can't create request sock SLAB cache!\n",
4038 			prot->name);
4039 		return -ENOMEM;
4040 	}
4041 	return 0;
4042 }
4043 
4044 int proto_register(struct proto *prot, int alloc_slab)
4045 {
4046 	int ret = -ENOBUFS;
4047 
4048 	if (prot->memory_allocated && !prot->sysctl_mem) {
4049 		pr_err("%s: missing sysctl_mem\n", prot->name);
4050 		return -EINVAL;
4051 	}
4052 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4053 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4054 		return -EINVAL;
4055 	}
4056 	if (alloc_slab) {
4057 		prot->slab = kmem_cache_create_usercopy(prot->name,
4058 					prot->obj_size, 0,
4059 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4060 					prot->slab_flags,
4061 					prot->useroffset, prot->usersize,
4062 					NULL);
4063 
4064 		if (prot->slab == NULL) {
4065 			pr_crit("%s: Can't create sock SLAB cache!\n",
4066 				prot->name);
4067 			goto out;
4068 		}
4069 
4070 		if (req_prot_init(prot))
4071 			goto out_free_request_sock_slab;
4072 
4073 		if (tw_prot_init(prot))
4074 			goto out_free_timewait_sock_slab;
4075 	}
4076 
4077 	mutex_lock(&proto_list_mutex);
4078 	ret = assign_proto_idx(prot);
4079 	if (ret) {
4080 		mutex_unlock(&proto_list_mutex);
4081 		goto out_free_timewait_sock_slab;
4082 	}
4083 	list_add(&prot->node, &proto_list);
4084 	mutex_unlock(&proto_list_mutex);
4085 	return ret;
4086 
4087 out_free_timewait_sock_slab:
4088 	if (alloc_slab)
4089 		tw_prot_cleanup(prot->twsk_prot);
4090 out_free_request_sock_slab:
4091 	if (alloc_slab) {
4092 		req_prot_cleanup(prot->rsk_prot);
4093 
4094 		kmem_cache_destroy(prot->slab);
4095 		prot->slab = NULL;
4096 	}
4097 out:
4098 	return ret;
4099 }
4100 EXPORT_SYMBOL(proto_register);
4101 
4102 void proto_unregister(struct proto *prot)
4103 {
4104 	mutex_lock(&proto_list_mutex);
4105 	release_proto_idx(prot);
4106 	list_del(&prot->node);
4107 	mutex_unlock(&proto_list_mutex);
4108 
4109 	kmem_cache_destroy(prot->slab);
4110 	prot->slab = NULL;
4111 
4112 	req_prot_cleanup(prot->rsk_prot);
4113 	tw_prot_cleanup(prot->twsk_prot);
4114 }
4115 EXPORT_SYMBOL(proto_unregister);
4116 
4117 int sock_load_diag_module(int family, int protocol)
4118 {
4119 	if (!protocol) {
4120 		if (!sock_is_registered(family))
4121 			return -ENOENT;
4122 
4123 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4124 				      NETLINK_SOCK_DIAG, family);
4125 	}
4126 
4127 #ifdef CONFIG_INET
4128 	if (family == AF_INET &&
4129 	    protocol != IPPROTO_RAW &&
4130 	    protocol < MAX_INET_PROTOS &&
4131 	    !rcu_access_pointer(inet_protos[protocol]))
4132 		return -ENOENT;
4133 #endif
4134 
4135 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4136 			      NETLINK_SOCK_DIAG, family, protocol);
4137 }
4138 EXPORT_SYMBOL(sock_load_diag_module);
4139 
4140 #ifdef CONFIG_PROC_FS
4141 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4142 	__acquires(proto_list_mutex)
4143 {
4144 	mutex_lock(&proto_list_mutex);
4145 	return seq_list_start_head(&proto_list, *pos);
4146 }
4147 
4148 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4149 {
4150 	return seq_list_next(v, &proto_list, pos);
4151 }
4152 
4153 static void proto_seq_stop(struct seq_file *seq, void *v)
4154 	__releases(proto_list_mutex)
4155 {
4156 	mutex_unlock(&proto_list_mutex);
4157 }
4158 
4159 static char proto_method_implemented(const void *method)
4160 {
4161 	return method == NULL ? 'n' : 'y';
4162 }
4163 static long sock_prot_memory_allocated(struct proto *proto)
4164 {
4165 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4166 }
4167 
4168 static const char *sock_prot_memory_pressure(struct proto *proto)
4169 {
4170 	return proto->memory_pressure != NULL ?
4171 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4172 }
4173 
4174 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4175 {
4176 
4177 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4178 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4179 		   proto->name,
4180 		   proto->obj_size,
4181 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4182 		   sock_prot_memory_allocated(proto),
4183 		   sock_prot_memory_pressure(proto),
4184 		   proto->max_header,
4185 		   proto->slab == NULL ? "no" : "yes",
4186 		   module_name(proto->owner),
4187 		   proto_method_implemented(proto->close),
4188 		   proto_method_implemented(proto->connect),
4189 		   proto_method_implemented(proto->disconnect),
4190 		   proto_method_implemented(proto->accept),
4191 		   proto_method_implemented(proto->ioctl),
4192 		   proto_method_implemented(proto->init),
4193 		   proto_method_implemented(proto->destroy),
4194 		   proto_method_implemented(proto->shutdown),
4195 		   proto_method_implemented(proto->setsockopt),
4196 		   proto_method_implemented(proto->getsockopt),
4197 		   proto_method_implemented(proto->sendmsg),
4198 		   proto_method_implemented(proto->recvmsg),
4199 		   proto_method_implemented(proto->bind),
4200 		   proto_method_implemented(proto->backlog_rcv),
4201 		   proto_method_implemented(proto->hash),
4202 		   proto_method_implemented(proto->unhash),
4203 		   proto_method_implemented(proto->get_port),
4204 		   proto_method_implemented(proto->enter_memory_pressure));
4205 }
4206 
4207 static int proto_seq_show(struct seq_file *seq, void *v)
4208 {
4209 	if (v == &proto_list)
4210 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4211 			   "protocol",
4212 			   "size",
4213 			   "sockets",
4214 			   "memory",
4215 			   "press",
4216 			   "maxhdr",
4217 			   "slab",
4218 			   "module",
4219 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4220 	else
4221 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4222 	return 0;
4223 }
4224 
4225 static const struct seq_operations proto_seq_ops = {
4226 	.start  = proto_seq_start,
4227 	.next   = proto_seq_next,
4228 	.stop   = proto_seq_stop,
4229 	.show   = proto_seq_show,
4230 };
4231 
4232 static __net_init int proto_init_net(struct net *net)
4233 {
4234 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4235 			sizeof(struct seq_net_private)))
4236 		return -ENOMEM;
4237 
4238 	return 0;
4239 }
4240 
4241 static __net_exit void proto_exit_net(struct net *net)
4242 {
4243 	remove_proc_entry("protocols", net->proc_net);
4244 }
4245 
4246 
4247 static __net_initdata struct pernet_operations proto_net_ops = {
4248 	.init = proto_init_net,
4249 	.exit = proto_exit_net,
4250 };
4251 
4252 static int __init proto_init(void)
4253 {
4254 	return register_pernet_subsys(&proto_net_ops);
4255 }
4256 
4257 subsys_initcall(proto_init);
4258 
4259 #endif /* PROC_FS */
4260 
4261 #ifdef CONFIG_NET_RX_BUSY_POLL
4262 bool sk_busy_loop_end(void *p, unsigned long start_time)
4263 {
4264 	struct sock *sk = p;
4265 
4266 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4267 		return true;
4268 
4269 	if (sk_is_udp(sk) &&
4270 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4271 		return true;
4272 
4273 	return sk_busy_loop_timeout(sk, start_time);
4274 }
4275 EXPORT_SYMBOL(sk_busy_loop_end);
4276 #endif /* CONFIG_NET_RX_BUSY_POLL */
4277 
4278 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4279 {
4280 	if (!sk->sk_prot->bind_add)
4281 		return -EOPNOTSUPP;
4282 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4283 }
4284 EXPORT_SYMBOL(sock_bind_add);
4285 
4286 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4287 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4288 		     void __user *arg, void *karg, size_t size)
4289 {
4290 	int ret;
4291 
4292 	if (copy_from_user(karg, arg, size))
4293 		return -EFAULT;
4294 
4295 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4296 	if (ret)
4297 		return ret;
4298 
4299 	if (copy_to_user(arg, karg, size))
4300 		return -EFAULT;
4301 
4302 	return 0;
4303 }
4304 EXPORT_SYMBOL(sock_ioctl_inout);
4305 
4306 /* This is the most common ioctl prep function, where the result (4 bytes) is
4307  * copied back to userspace if the ioctl() returns successfully. No input is
4308  * copied from userspace as input argument.
4309  */
4310 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4311 {
4312 	int ret, karg = 0;
4313 
4314 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4315 	if (ret)
4316 		return ret;
4317 
4318 	return put_user(karg, (int __user *)arg);
4319 }
4320 
4321 /* A wrapper around sock ioctls, which copies the data from userspace
4322  * (depending on the protocol/ioctl), and copies back the result to userspace.
4323  * The main motivation for this function is to pass kernel memory to the
4324  * protocol ioctl callbacks, instead of userspace memory.
4325  */
4326 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4327 {
4328 	int rc = 1;
4329 
4330 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4331 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4332 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4333 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4334 	else if (sk_is_phonet(sk))
4335 		rc = phonet_sk_ioctl(sk, cmd, arg);
4336 
4337 	/* If ioctl was processed, returns its value */
4338 	if (rc <= 0)
4339 		return rc;
4340 
4341 	/* Otherwise call the default handler */
4342 	return sock_ioctl_out(sk, cmd, arg);
4343 }
4344 EXPORT_SYMBOL(sk_ioctl);
4345 
4346 static int __init sock_struct_check(void)
4347 {
4348 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4349 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4350 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4351 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4352 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4353 
4354 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4355 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4356 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4357 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4358 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4359 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4360 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4361 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4362 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4363 
4364 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4365 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4366 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4367 
4368 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4369 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4370 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4371 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4372 
4373 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4374 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4375 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4376 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4377 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4378 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4379 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4380 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4381 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4382 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4383 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4384 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4385 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4386 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4387 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4388 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4389 
4390 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4391 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4392 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4393 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4394 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4395 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4396 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4397 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4398 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4399 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4400 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4401 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4402 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4403 	return 0;
4404 }
4405 
4406 core_initcall(sock_struct_check);
4407