xref: /linux/net/core/sock.c (revision 2f435137a0484f11b47554281091ef4908f8cb31)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138 
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142 
143 #include <trace/events/sock.h>
144 
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148 
149 #include <linux/ethtool.h>
150 
151 #include "dev.h"
152 
153 static DEFINE_MUTEX(proto_list_mutex);
154 static LIST_HEAD(proto_list);
155 
156 static void sock_def_write_space_wfree(struct sock *sk);
157 static void sock_def_write_space(struct sock *sk);
158 
159 /**
160  * sk_ns_capable - General socket capability test
161  * @sk: Socket to use a capability on or through
162  * @user_ns: The user namespace of the capability to use
163  * @cap: The capability to use
164  *
165  * Test to see if the opener of the socket had the capability @cap when
166  * the socket was created and the current process has the capability
167  * @cap in the user namespace @user_ns.
168  */
169 bool sk_ns_capable(const struct sock *sk,
170 		   struct user_namespace *user_ns, int cap)
171 {
172 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
173 		ns_capable(user_ns, cap);
174 }
175 EXPORT_SYMBOL(sk_ns_capable);
176 
177 /**
178  * sk_capable - Socket global capability test
179  * @sk: Socket to use a capability on or through
180  * @cap: The global capability to use
181  *
182  * Test to see if the opener of the socket had the capability @cap when
183  * the socket was created and the current process has the capability
184  * @cap in all user namespaces.
185  */
186 bool sk_capable(const struct sock *sk, int cap)
187 {
188 	return sk_ns_capable(sk, &init_user_ns, cap);
189 }
190 EXPORT_SYMBOL(sk_capable);
191 
192 /**
193  * sk_net_capable - Network namespace socket capability test
194  * @sk: Socket to use a capability on or through
195  * @cap: The capability to use
196  *
197  * Test to see if the opener of the socket had the capability @cap when the
198  * socket was created and the current process has the capability @cap over
199  * the network namespace the socket is a member of.
200  */
201 bool sk_net_capable(const struct sock *sk, int cap)
202 {
203 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
204 }
205 EXPORT_SYMBOL(sk_net_capable);
206 
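/*
 * Illustrative sketch (not part of the original file): a protocol-level
 * setsockopt handler might gate a privileged option on the capability
 * helpers above.  The helper name and the option it models are hypothetical;
 * only the sk_net_capable() call reflects the real API.
 */
#if 0	/* example only */
static int example_set_privileged_opt(struct sock *sk, int val)
{
	/* Require CAP_NET_ADMIN relative to the socket's netns owner. */
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	WRITE_ONCE(sk->sk_priority, val);
	return 0;
}
#endif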
207 /*
208  * Each address family might have different locking rules, so we have
209  * one slock key per address family and separate keys for internal and
210  * userspace sockets.
211  */
212 static struct lock_class_key af_family_keys[AF_MAX];
213 static struct lock_class_key af_family_kern_keys[AF_MAX];
214 static struct lock_class_key af_family_slock_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
216 
217 /*
218  * Make lock validator output more readable. (we pre-construct these
219  * strings at build time, so that runtime initialization of socket
220  * locks is fast):
221  */
222 
223 #define _sock_locks(x)						  \
224   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
225   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
226   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
227   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
228   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
229   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
230   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
231   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
232   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
233   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
234   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
235   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
236   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
237   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
238   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
239   x "AF_MCTP"  , \
240   x "AF_MAX"
241 
242 static const char *const af_family_key_strings[AF_MAX+1] = {
243 	_sock_locks("sk_lock-")
244 };
245 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
246 	_sock_locks("slock-")
247 };
248 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
249 	_sock_locks("clock-")
250 };
251 
252 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
253 	_sock_locks("k-sk_lock-")
254 };
255 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
256 	_sock_locks("k-slock-")
257 };
258 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
259 	_sock_locks("k-clock-")
260 };
261 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
262 	_sock_locks("rlock-")
263 };
264 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
265 	_sock_locks("wlock-")
266 };
267 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
268 	_sock_locks("elock-")
269 };
270 
271 /*
272  * sk_callback_lock and sk queues locking rules are per-address-family,
273  * so split the lock classes by using a per-AF key:
274  */
275 static struct lock_class_key af_callback_keys[AF_MAX];
276 static struct lock_class_key af_rlock_keys[AF_MAX];
277 static struct lock_class_key af_wlock_keys[AF_MAX];
278 static struct lock_class_key af_elock_keys[AF_MAX];
279 static struct lock_class_key af_kern_callback_keys[AF_MAX];
280 
281 /* Run time adjustable parameters. */
282 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
283 EXPORT_SYMBOL(sysctl_wmem_max);
284 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
285 EXPORT_SYMBOL(sysctl_rmem_max);
286 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
287 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
288 
289 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
290 EXPORT_SYMBOL_GPL(memalloc_socks_key);
291 
292 /**
293  * sk_set_memalloc - sets %SOCK_MEMALLOC
294  * @sk: socket to set it on
295  *
296  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297  * It's the responsibility of the admin to adjust min_free_kbytes
298  * to meet the requirements.
299  */
300 void sk_set_memalloc(struct sock *sk)
301 {
302 	sock_set_flag(sk, SOCK_MEMALLOC);
303 	sk->sk_allocation |= __GFP_MEMALLOC;
304 	static_branch_inc(&memalloc_socks_key);
305 }
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
307 
308 void sk_clear_memalloc(struct sock *sk)
309 {
310 	sock_reset_flag(sk, SOCK_MEMALLOC);
311 	sk->sk_allocation &= ~__GFP_MEMALLOC;
312 	static_branch_dec(&memalloc_socks_key);
313 
314 	/*
315 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 	 * it has rmem allocations due to the last swapfile being deactivated,
318 	 * but there is a risk that the socket is unusable due to exceeding
319 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
320 	 */
321 	sk_mem_reclaim(sk);
322 }
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
324 
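/*
 * Illustrative sketch (not part of the original file): a driver doing swap
 * over a network socket would mark that socket %SOCK_MEMALLOC while the swap
 * file is active and clear it on teardown.  The enclosing functions are
 * hypothetical; sk_set_memalloc()/sk_clear_memalloc() are the real helpers
 * defined above.
 */
#if 0	/* example only */
static void example_swap_socket_enable(struct sock *sk)
{
	sk_set_memalloc(sk);	/* allow dips into emergency reserves */
}

static void example_swap_socket_disable(struct sock *sk)
{
	sk_clear_memalloc(sk);	/* reclaims and re-applies rmem limits */
}
#endif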
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
326 {
327 	int ret;
328 	unsigned int noreclaim_flag;
329 
330 	/* these should have been dropped before queueing */
331 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
332 
333 	noreclaim_flag = memalloc_noreclaim_save();
334 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
335 				 tcp_v6_do_rcv,
336 				 tcp_v4_do_rcv,
337 				 sk, skb);
338 	memalloc_noreclaim_restore(noreclaim_flag);
339 
340 	return ret;
341 }
342 EXPORT_SYMBOL(__sk_backlog_rcv);
343 
344 void sk_error_report(struct sock *sk)
345 {
346 	sk->sk_error_report(sk);
347 
348 	switch (sk->sk_family) {
349 	case AF_INET:
350 		fallthrough;
351 	case AF_INET6:
352 		trace_inet_sk_error_report(sk);
353 		break;
354 	default:
355 		break;
356 	}
357 }
358 EXPORT_SYMBOL(sk_error_report);
359 
360 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
361 {
362 	struct __kernel_sock_timeval tv;
363 
364 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
365 		tv.tv_sec = 0;
366 		tv.tv_usec = 0;
367 	} else {
368 		tv.tv_sec = timeo / HZ;
369 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
370 	}
371 
372 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
373 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
374 		*(struct old_timeval32 *)optval = tv32;
375 		return sizeof(tv32);
376 	}
377 
378 	if (old_timeval) {
379 		struct __kernel_old_timeval old_tv;
380 		old_tv.tv_sec = tv.tv_sec;
381 		old_tv.tv_usec = tv.tv_usec;
382 		*(struct __kernel_old_timeval *)optval = old_tv;
383 		return sizeof(old_tv);
384 	}
385 
386 	*(struct __kernel_sock_timeval *)optval = tv;
387 	return sizeof(tv);
388 }
389 EXPORT_SYMBOL(sock_get_timeout);
390 
391 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
392 			   sockptr_t optval, int optlen, bool old_timeval)
393 {
394 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
395 		struct old_timeval32 tv32;
396 
397 		if (optlen < sizeof(tv32))
398 			return -EINVAL;
399 
400 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
401 			return -EFAULT;
402 		tv->tv_sec = tv32.tv_sec;
403 		tv->tv_usec = tv32.tv_usec;
404 	} else if (old_timeval) {
405 		struct __kernel_old_timeval old_tv;
406 
407 		if (optlen < sizeof(old_tv))
408 			return -EINVAL;
409 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
410 			return -EFAULT;
411 		tv->tv_sec = old_tv.tv_sec;
412 		tv->tv_usec = old_tv.tv_usec;
413 	} else {
414 		if (optlen < sizeof(*tv))
415 			return -EINVAL;
416 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
417 			return -EFAULT;
418 	}
419 
420 	return 0;
421 }
422 EXPORT_SYMBOL(sock_copy_user_timeval);
423 
424 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
425 			    bool old_timeval)
426 {
427 	struct __kernel_sock_timeval tv;
428 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
429 	long val;
430 
431 	if (err)
432 		return err;
433 
434 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
435 		return -EDOM;
436 
437 	if (tv.tv_sec < 0) {
438 		static int warned __read_mostly;
439 
440 		WRITE_ONCE(*timeo_p, 0);
441 		if (warned < 10 && net_ratelimit()) {
442 			warned++;
443 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
444 				__func__, current->comm, task_pid_nr(current));
445 		}
446 		return 0;
447 	}
448 	val = MAX_SCHEDULE_TIMEOUT;
449 	if ((tv.tv_sec || tv.tv_usec) &&
450 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
451 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
452 						    USEC_PER_SEC / HZ);
453 	WRITE_ONCE(*timeo_p, val);
454 	return 0;
455 }
456 
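/*
 * Illustrative user-space sketch (not part of the original file): per the
 * conversion above, a { .tv_sec = 2, .tv_usec = 500000 } timeval set via
 * SO_RCVTIMEO is rounded up to whole jiffies, and an all-zero timeval means
 * "wait forever" (MAX_SCHEDULE_TIMEOUT).
 */
#if 0	/* user-space example only */
#include <sys/socket.h>
#include <sys/time.h>

static int example_set_rcv_timeout(int fd)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };

	return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
}
#endif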
457 static bool sk_set_prio_allowed(const struct sock *sk, int val)
458 {
459 	return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
460 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
461 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
462 }
463 
464 static bool sock_needs_netstamp(const struct sock *sk)
465 {
466 	switch (sk->sk_family) {
467 	case AF_UNSPEC:
468 	case AF_UNIX:
469 		return false;
470 	default:
471 		return true;
472 	}
473 }
474 
475 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
476 {
477 	if (sk->sk_flags & flags) {
478 		sk->sk_flags &= ~flags;
479 		if (sock_needs_netstamp(sk) &&
480 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
481 			net_disable_timestamp();
482 	}
483 }
484 
485 
486 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
487 {
488 	unsigned long flags;
489 	struct sk_buff_head *list = &sk->sk_receive_queue;
490 
491 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
492 		atomic_inc(&sk->sk_drops);
493 		trace_sock_rcvqueue_full(sk, skb);
494 		return -ENOMEM;
495 	}
496 
497 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
498 		atomic_inc(&sk->sk_drops);
499 		return -ENOBUFS;
500 	}
501 
502 	skb->dev = NULL;
503 	skb_set_owner_r(skb, sk);
504 
505 	/* We escape from the RCU-protected region; make sure we don't leak
506 	 * a non-refcounted dst.
507 	 */
508 	skb_dst_force(skb);
509 
510 	spin_lock_irqsave(&list->lock, flags);
511 	sock_skb_set_dropcount(sk, skb);
512 	__skb_queue_tail(list, skb);
513 	spin_unlock_irqrestore(&list->lock, flags);
514 
515 	if (!sock_flag(sk, SOCK_DEAD))
516 		sk->sk_data_ready(sk);
517 	return 0;
518 }
519 EXPORT_SYMBOL(__sock_queue_rcv_skb);
520 
521 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
522 			      enum skb_drop_reason *reason)
523 {
524 	enum skb_drop_reason drop_reason;
525 	int err;
526 
527 	err = sk_filter(sk, skb);
528 	if (err) {
529 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
530 		goto out;
531 	}
532 	err = __sock_queue_rcv_skb(sk, skb);
533 	switch (err) {
534 	case -ENOMEM:
535 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
536 		break;
537 	case -ENOBUFS:
538 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
539 		break;
540 	default:
541 		drop_reason = SKB_NOT_DROPPED_YET;
542 		break;
543 	}
544 out:
545 	if (reason)
546 		*reason = drop_reason;
547 	return err;
548 }
549 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
550 
551 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
552 		     const int nested, unsigned int trim_cap, bool refcounted)
553 {
554 	int rc = NET_RX_SUCCESS;
555 
556 	if (sk_filter_trim_cap(sk, skb, trim_cap))
557 		goto discard_and_relse;
558 
559 	skb->dev = NULL;
560 
561 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
562 		atomic_inc(&sk->sk_drops);
563 		goto discard_and_relse;
564 	}
565 	if (nested)
566 		bh_lock_sock_nested(sk);
567 	else
568 		bh_lock_sock(sk);
569 	if (!sock_owned_by_user(sk)) {
570 		/*
571 		 * trylock + unlock semantics:
572 		 */
573 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
574 
575 		rc = sk_backlog_rcv(sk, skb);
576 
577 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
578 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
579 		bh_unlock_sock(sk);
580 		atomic_inc(&sk->sk_drops);
581 		goto discard_and_relse;
582 	}
583 
584 	bh_unlock_sock(sk);
585 out:
586 	if (refcounted)
587 		sock_put(sk);
588 	return rc;
589 discard_and_relse:
590 	kfree_skb(skb);
591 	goto out;
592 }
593 EXPORT_SYMBOL(__sk_receive_skb);
594 
595 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
596 							  u32));
597 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
598 							   u32));
599 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
600 {
601 	struct dst_entry *dst = __sk_dst_get(sk);
602 
603 	if (dst && dst->obsolete &&
604 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
605 			       dst, cookie) == NULL) {
606 		sk_tx_queue_clear(sk);
607 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
608 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
609 		dst_release(dst);
610 		return NULL;
611 	}
612 
613 	return dst;
614 }
615 EXPORT_SYMBOL(__sk_dst_check);
616 
617 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
618 {
619 	struct dst_entry *dst = sk_dst_get(sk);
620 
621 	if (dst && dst->obsolete &&
622 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
623 			       dst, cookie) == NULL) {
624 		sk_dst_reset(sk);
625 		dst_release(dst);
626 		return NULL;
627 	}
628 
629 	return dst;
630 }
631 EXPORT_SYMBOL(sk_dst_check);
632 
633 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
634 {
635 	int ret = -ENOPROTOOPT;
636 #ifdef CONFIG_NETDEVICES
637 	struct net *net = sock_net(sk);
638 
639 	/* Sorry... */
640 	ret = -EPERM;
641 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
642 		goto out;
643 
644 	ret = -EINVAL;
645 	if (ifindex < 0)
646 		goto out;
647 
648 	/* Paired with all READ_ONCE() done locklessly. */
649 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
650 
651 	if (sk->sk_prot->rehash)
652 		sk->sk_prot->rehash(sk);
653 	sk_dst_reset(sk);
654 
655 	ret = 0;
656 
657 out:
658 #endif
659 
660 	return ret;
661 }
662 
663 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
664 {
665 	int ret;
666 
667 	if (lock_sk)
668 		lock_sock(sk);
669 	ret = sock_bindtoindex_locked(sk, ifindex);
670 	if (lock_sk)
671 		release_sock(sk);
672 
673 	return ret;
674 }
675 EXPORT_SYMBOL(sock_bindtoindex);
676 
677 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
678 {
679 	int ret = -ENOPROTOOPT;
680 #ifdef CONFIG_NETDEVICES
681 	struct net *net = sock_net(sk);
682 	char devname[IFNAMSIZ];
683 	int index;
684 
685 	ret = -EINVAL;
686 	if (optlen < 0)
687 		goto out;
688 
689 	/* Bind this socket to a particular device like "eth0",
690 	 * as specified in the passed interface name. If the
691 	 * name is "" or the option length is zero the socket
692 	 * is not bound.
693 	 */
694 	if (optlen > IFNAMSIZ - 1)
695 		optlen = IFNAMSIZ - 1;
696 	memset(devname, 0, sizeof(devname));
697 
698 	ret = -EFAULT;
699 	if (copy_from_sockptr(devname, optval, optlen))
700 		goto out;
701 
702 	index = 0;
703 	if (devname[0] != '\0') {
704 		struct net_device *dev;
705 
706 		rcu_read_lock();
707 		dev = dev_get_by_name_rcu(net, devname);
708 		if (dev)
709 			index = dev->ifindex;
710 		rcu_read_unlock();
711 		ret = -ENODEV;
712 		if (!dev)
713 			goto out;
714 	}
715 
716 	sockopt_lock_sock(sk);
717 	ret = sock_bindtoindex_locked(sk, index);
718 	sockopt_release_sock(sk);
719 out:
720 #endif
721 
722 	return ret;
723 }
724 
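/*
 * Illustrative user-space sketch (not part of the original file): binding a
 * socket to an interface via SO_BINDTODEVICE; passing an empty name (or a
 * zero optlen) removes the binding, matching sock_setbindtodevice() above.
 * The "eth0" name a caller would pass is just an example.
 */
#if 0	/* user-space example only */
#include <string.h>
#include <sys/socket.h>

static int example_bind_to_device(int fd, const char *ifname)
{
	/* ifname == "" (or optlen == 0) unbinds the socket. */
	return setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
			  ifname, strlen(ifname));
}
#endif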
725 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
726 				sockptr_t optlen, int len)
727 {
728 	int ret = -ENOPROTOOPT;
729 #ifdef CONFIG_NETDEVICES
730 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
731 	struct net *net = sock_net(sk);
732 	char devname[IFNAMSIZ];
733 
734 	if (bound_dev_if == 0) {
735 		len = 0;
736 		goto zero;
737 	}
738 
739 	ret = -EINVAL;
740 	if (len < IFNAMSIZ)
741 		goto out;
742 
743 	ret = netdev_get_name(net, devname, bound_dev_if);
744 	if (ret)
745 		goto out;
746 
747 	len = strlen(devname) + 1;
748 
749 	ret = -EFAULT;
750 	if (copy_to_sockptr(optval, devname, len))
751 		goto out;
752 
753 zero:
754 	ret = -EFAULT;
755 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
756 		goto out;
757 
758 	ret = 0;
759 
760 out:
761 #endif
762 
763 	return ret;
764 }
765 
766 bool sk_mc_loop(const struct sock *sk)
767 {
768 	if (dev_recursion_level())
769 		return false;
770 	if (!sk)
771 		return true;
772 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
773 	switch (READ_ONCE(sk->sk_family)) {
774 	case AF_INET:
775 		return inet_test_bit(MC_LOOP, sk);
776 #if IS_ENABLED(CONFIG_IPV6)
777 	case AF_INET6:
778 		return inet6_test_bit(MC6_LOOP, sk);
779 #endif
780 	}
781 	WARN_ON_ONCE(1);
782 	return true;
783 }
784 EXPORT_SYMBOL(sk_mc_loop);
785 
786 void sock_set_reuseaddr(struct sock *sk)
787 {
788 	lock_sock(sk);
789 	sk->sk_reuse = SK_CAN_REUSE;
790 	release_sock(sk);
791 }
792 EXPORT_SYMBOL(sock_set_reuseaddr);
793 
794 void sock_set_reuseport(struct sock *sk)
795 {
796 	lock_sock(sk);
797 	sk->sk_reuseport = true;
798 	release_sock(sk);
799 }
800 EXPORT_SYMBOL(sock_set_reuseport);
801 
802 void sock_no_linger(struct sock *sk)
803 {
804 	lock_sock(sk);
805 	WRITE_ONCE(sk->sk_lingertime, 0);
806 	sock_set_flag(sk, SOCK_LINGER);
807 	release_sock(sk);
808 }
809 EXPORT_SYMBOL(sock_no_linger);
810 
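/*
 * Illustrative user-space sketch (not part of the original file): the
 * user-space equivalent of sock_no_linger() is SO_LINGER with l_onoff = 1
 * and l_linger = 0, which for a TCP socket makes close() abortive (RST)
 * instead of lingering on unsent data.
 */
#if 0	/* user-space example only */
#include <sys/socket.h>

static int example_abortive_close_setup(int fd)
{
	struct linger lin = { .l_onoff = 1, .l_linger = 0 };

	return setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
}
#endif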
811 void sock_set_priority(struct sock *sk, u32 priority)
812 {
813 	WRITE_ONCE(sk->sk_priority, priority);
814 }
815 EXPORT_SYMBOL(sock_set_priority);
816 
817 void sock_set_sndtimeo(struct sock *sk, s64 secs)
818 {
819 	lock_sock(sk);
820 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
821 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
822 	else
823 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
824 	release_sock(sk);
825 }
826 EXPORT_SYMBOL(sock_set_sndtimeo);
827 
828 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
829 {
830 	sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
831 	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
832 	if (val)  {
833 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
834 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
835 	}
836 }
837 
838 void sock_enable_timestamps(struct sock *sk)
839 {
840 	lock_sock(sk);
841 	__sock_set_timestamps(sk, true, false, true);
842 	release_sock(sk);
843 }
844 EXPORT_SYMBOL(sock_enable_timestamps);
845 
846 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
847 {
848 	switch (optname) {
849 	case SO_TIMESTAMP_OLD:
850 		__sock_set_timestamps(sk, valbool, false, false);
851 		break;
852 	case SO_TIMESTAMP_NEW:
853 		__sock_set_timestamps(sk, valbool, true, false);
854 		break;
855 	case SO_TIMESTAMPNS_OLD:
856 		__sock_set_timestamps(sk, valbool, false, true);
857 		break;
858 	case SO_TIMESTAMPNS_NEW:
859 		__sock_set_timestamps(sk, valbool, true, true);
860 		break;
861 	}
862 }
863 
864 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
865 {
866 	struct net *net = sock_net(sk);
867 	struct net_device *dev = NULL;
868 	bool match = false;
869 	int *vclock_index;
870 	int i, num;
871 
872 	if (sk->sk_bound_dev_if)
873 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
874 
875 	if (!dev) {
876 		pr_err("%s: socket is not bound to a device\n", __func__);
877 		return -EOPNOTSUPP;
878 	}
879 
880 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
881 	dev_put(dev);
882 
883 	for (i = 0; i < num; i++) {
884 		if (*(vclock_index + i) == phc_index) {
885 			match = true;
886 			break;
887 		}
888 	}
889 
890 	if (num > 0)
891 		kfree(vclock_index);
892 
893 	if (!match)
894 		return -EINVAL;
895 
896 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
897 
898 	return 0;
899 }
900 
901 int sock_set_timestamping(struct sock *sk, int optname,
902 			  struct so_timestamping timestamping)
903 {
904 	int val = timestamping.flags;
905 	int ret;
906 
907 	if (val & ~SOF_TIMESTAMPING_MASK)
908 		return -EINVAL;
909 
910 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
911 	    !(val & SOF_TIMESTAMPING_OPT_ID))
912 		return -EINVAL;
913 
914 	if (val & SOF_TIMESTAMPING_OPT_ID &&
915 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
916 		if (sk_is_tcp(sk)) {
917 			if ((1 << sk->sk_state) &
918 			    (TCPF_CLOSE | TCPF_LISTEN))
919 				return -EINVAL;
920 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
921 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
922 			else
923 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
924 		} else {
925 			atomic_set(&sk->sk_tskey, 0);
926 		}
927 	}
928 
929 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
930 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
931 		return -EINVAL;
932 
933 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
934 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
935 		if (ret)
936 			return ret;
937 	}
938 
939 	WRITE_ONCE(sk->sk_tsflags, val);
940 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
941 	sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));
942 
943 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
944 		sock_enable_timestamp(sk,
945 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
946 	else
947 		sock_disable_timestamp(sk,
948 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
949 	return 0;
950 }
951 
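/*
 * Illustrative user-space sketch (not part of the original file): enabling
 * software RX/TX timestamps through SO_TIMESTAMPING.  The flag combination
 * is one plausible choice, not a recommendation; struct so_timestamping
 * (with the optional bind_phc field validated above) comes from
 * linux/net_tstamp.h.
 */
#if 0	/* user-space example only */
#include <linux/net_tstamp.h>
#include <sys/socket.h>

static int example_enable_sw_timestamps(int fd)
{
	struct so_timestamping ts = {
		.flags = SOF_TIMESTAMPING_RX_SOFTWARE |
			 SOF_TIMESTAMPING_TX_SOFTWARE |
			 SOF_TIMESTAMPING_SOFTWARE,
	};

	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
}
#endif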
952 void sock_set_keepalive(struct sock *sk)
953 {
954 	lock_sock(sk);
955 	if (sk->sk_prot->keepalive)
956 		sk->sk_prot->keepalive(sk, true);
957 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
958 	release_sock(sk);
959 }
960 EXPORT_SYMBOL(sock_set_keepalive);
961 
962 static void __sock_set_rcvbuf(struct sock *sk, int val)
963 {
964 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
965 	 * as a negative value.
966 	 */
967 	val = min_t(int, val, INT_MAX / 2);
968 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
969 
970 	/* We double it on the way in to account for "struct sk_buff" etc.
971 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
972 	 * will allow that much actual data to be received on that socket.
973 	 *
974 	 * Applications are unaware that "struct sk_buff" and other overheads
975 	 * allocate from the receive buffer during socket buffer allocation.
976 	 *
977 	 * And after considering the possible alternatives, returning the value
978 	 * we actually used in getsockopt is the most desirable behavior.
979 	 */
980 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
981 }
982 
983 void sock_set_rcvbuf(struct sock *sk, int val)
984 {
985 	lock_sock(sk);
986 	__sock_set_rcvbuf(sk, val);
987 	release_sock(sk);
988 }
989 EXPORT_SYMBOL(sock_set_rcvbuf);
990 
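/*
 * Illustrative user-space sketch (not part of the original file): because
 * __sock_set_rcvbuf() doubles the requested value to account for sk_buff
 * overhead, asking for 64 KiB is expected to read back as roughly 128 KiB
 * (never below SOCK_MIN_RCVBUF, and capped by sysctl_rmem_max unless
 * SO_RCVBUFFORCE is used).
 */
#if 0	/* user-space example only */
#include <sys/socket.h>

static int example_rcvbuf_roundtrip(int fd)
{
	int req = 64 * 1024, eff = 0;
	socklen_t len = sizeof(eff);

	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req)))
		return -1;
	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len))
		return -1;
	return eff;	/* typically 2 * req */
}
#endif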
991 static void __sock_set_mark(struct sock *sk, u32 val)
992 {
993 	if (val != sk->sk_mark) {
994 		WRITE_ONCE(sk->sk_mark, val);
995 		sk_dst_reset(sk);
996 	}
997 }
998 
999 void sock_set_mark(struct sock *sk, u32 val)
1000 {
1001 	lock_sock(sk);
1002 	__sock_set_mark(sk, val);
1003 	release_sock(sk);
1004 }
1005 EXPORT_SYMBOL(sock_set_mark);
1006 
1007 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1008 {
1009 	/* Round down bytes to a multiple of the page size */
1010 	bytes = round_down(bytes, PAGE_SIZE);
1011 
1012 	WARN_ON(bytes > sk->sk_reserved_mem);
1013 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1014 	sk_mem_reclaim(sk);
1015 }
1016 
1017 static int sock_reserve_memory(struct sock *sk, int bytes)
1018 {
1019 	long allocated;
1020 	bool charged;
1021 	int pages;
1022 
1023 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1024 		return -EOPNOTSUPP;
1025 
1026 	if (!bytes)
1027 		return 0;
1028 
1029 	pages = sk_mem_pages(bytes);
1030 
1031 	/* pre-charge to memcg */
1032 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1033 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1034 	if (!charged)
1035 		return -ENOMEM;
1036 
1037 	/* pre-charge to forward_alloc */
1038 	sk_memory_allocated_add(sk, pages);
1039 	allocated = sk_memory_allocated(sk);
1040 	/* If the system goes into memory pressure with this
1041 	 * precharge, give up and return an error.
1042 	 */
1043 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1044 		sk_memory_allocated_sub(sk, pages);
1045 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1046 		return -ENOMEM;
1047 	}
1048 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1049 
1050 	WRITE_ONCE(sk->sk_reserved_mem,
1051 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1052 
1053 	return 0;
1054 }
1055 
1056 #ifdef CONFIG_PAGE_POOL
1057 
1058 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
1059  * in one syscall. The limit exists to bound the amount of memory the kernel
1060  * allocates to copy these tokens, and to prevent looping over the frags for
1061  * too long.
1062  */
1063 #define MAX_DONTNEED_TOKENS 128
1064 #define MAX_DONTNEED_FRAGS 1024
1065 
1066 static noinline_for_stack int
1067 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1068 {
1069 	unsigned int num_tokens, i, j, k, netmem_num = 0;
1070 	struct dmabuf_token *tokens;
1071 	int ret = 0, num_frags = 0;
1072 	netmem_ref netmems[16];
1073 
1074 	if (!sk_is_tcp(sk))
1075 		return -EBADF;
1076 
1077 	if (optlen % sizeof(*tokens) ||
1078 	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1079 		return -EINVAL;
1080 
1081 	num_tokens = optlen / sizeof(*tokens);
1082 	tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1083 	if (!tokens)
1084 		return -ENOMEM;
1085 
1086 	if (copy_from_sockptr(tokens, optval, optlen)) {
1087 		kvfree(tokens);
1088 		return -EFAULT;
1089 	}
1090 
1091 	xa_lock_bh(&sk->sk_user_frags);
1092 	for (i = 0; i < num_tokens; i++) {
1093 		for (j = 0; j < tokens[i].token_count; j++) {
1094 			if (++num_frags > MAX_DONTNEED_FRAGS)
1095 				goto frag_limit_reached;
1096 
1097 			netmem_ref netmem = (__force netmem_ref)__xa_erase(
1098 				&sk->sk_user_frags, tokens[i].token_start + j);
1099 
1100 			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1101 				continue;
1102 
1103 			netmems[netmem_num++] = netmem;
1104 			if (netmem_num == ARRAY_SIZE(netmems)) {
1105 				xa_unlock_bh(&sk->sk_user_frags);
1106 				for (k = 0; k < netmem_num; k++)
1107 					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1108 				netmem_num = 0;
1109 				xa_lock_bh(&sk->sk_user_frags);
1110 			}
1111 			ret++;
1112 		}
1113 	}
1114 
1115 frag_limit_reached:
1116 	xa_unlock_bh(&sk->sk_user_frags);
1117 	for (k = 0; k < netmem_num; k++)
1118 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1119 
1120 	kvfree(tokens);
1121 	return ret;
1122 }
1123 #endif
1124 
1125 void sockopt_lock_sock(struct sock *sk)
1126 {
1127 	/* When current->bpf_ctx is set, setsockopt() is being called from
1128 	 * a bpf prog.  bpf has already ensured that the sk lock has been
1129 	 * acquired before calling setsockopt().
1130 	 */
1131 	if (has_current_bpf_ctx())
1132 		return;
1133 
1134 	lock_sock(sk);
1135 }
1136 EXPORT_SYMBOL(sockopt_lock_sock);
1137 
1138 void sockopt_release_sock(struct sock *sk)
1139 {
1140 	if (has_current_bpf_ctx())
1141 		return;
1142 
1143 	release_sock(sk);
1144 }
1145 EXPORT_SYMBOL(sockopt_release_sock);
1146 
1147 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1148 {
1149 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1150 }
1151 EXPORT_SYMBOL(sockopt_ns_capable);
1152 
1153 bool sockopt_capable(int cap)
1154 {
1155 	return has_current_bpf_ctx() || capable(cap);
1156 }
1157 EXPORT_SYMBOL(sockopt_capable);
1158 
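/*
 * Illustrative sketch (not part of the original file): a setsockopt helper
 * that may also be invoked from a BPF program follows this pattern, using
 * the wrappers above so the locking and capability checks degrade gracefully
 * under a BPF context.  The option handled here is hypothetical; it mirrors
 * the SO_DEBUG handling later in this file.
 */
#if 0	/* example only */
static int example_set_debug_flag(struct sock *sk, int valbool)
{
	int ret = 0;

	sockopt_lock_sock(sk);		/* no-op when called from BPF */
	if (valbool && !sockopt_capable(CAP_NET_ADMIN))
		ret = -EACCES;
	else
		sock_valbool_flag(sk, SOCK_DBG, valbool);
	sockopt_release_sock(sk);

	return ret;
}
#endif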
1159 static int sockopt_validate_clockid(__kernel_clockid_t value)
1160 {
1161 	switch (value) {
1162 	case CLOCK_REALTIME:
1163 	case CLOCK_MONOTONIC:
1164 	case CLOCK_TAI:
1165 		return 0;
1166 	}
1167 	return -EINVAL;
1168 }
1169 
1170 /*
1171  *	This is meant for all protocols to use and covers goings on
1172  *	at the socket level. Everything here is generic.
1173  */
1174 
1175 int sk_setsockopt(struct sock *sk, int level, int optname,
1176 		  sockptr_t optval, unsigned int optlen)
1177 {
1178 	struct so_timestamping timestamping;
1179 	struct socket *sock = sk->sk_socket;
1180 	struct sock_txtime sk_txtime;
1181 	int val;
1182 	int valbool;
1183 	struct linger ling;
1184 	int ret = 0;
1185 
1186 	/*
1187 	 *	Options without arguments
1188 	 */
1189 
1190 	if (optname == SO_BINDTODEVICE)
1191 		return sock_setbindtodevice(sk, optval, optlen);
1192 
1193 	if (optlen < sizeof(int))
1194 		return -EINVAL;
1195 
1196 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1197 		return -EFAULT;
1198 
1199 	valbool = val ? 1 : 0;
1200 
1201 	/* handle options which do not require locking the socket. */
1202 	switch (optname) {
1203 	case SO_PRIORITY:
1204 		if (sk_set_prio_allowed(sk, val)) {
1205 			sock_set_priority(sk, val);
1206 			return 0;
1207 		}
1208 		return -EPERM;
1209 	case SO_PASSSEC:
1210 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1211 		return 0;
1212 	case SO_PASSCRED:
1213 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1214 		return 0;
1215 	case SO_PASSPIDFD:
1216 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1217 		return 0;
1218 	case SO_TYPE:
1219 	case SO_PROTOCOL:
1220 	case SO_DOMAIN:
1221 	case SO_ERROR:
1222 		return -ENOPROTOOPT;
1223 #ifdef CONFIG_NET_RX_BUSY_POLL
1224 	case SO_BUSY_POLL:
1225 		if (val < 0)
1226 			return -EINVAL;
1227 		WRITE_ONCE(sk->sk_ll_usec, val);
1228 		return 0;
1229 	case SO_PREFER_BUSY_POLL:
1230 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1231 			return -EPERM;
1232 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1233 		return 0;
1234 	case SO_BUSY_POLL_BUDGET:
1235 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1236 		    !sockopt_capable(CAP_NET_ADMIN))
1237 			return -EPERM;
1238 		if (val < 0 || val > U16_MAX)
1239 			return -EINVAL;
1240 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1241 		return 0;
1242 #endif
1243 	case SO_MAX_PACING_RATE:
1244 		{
1245 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1246 		unsigned long pacing_rate;
1247 
1248 		if (sizeof(ulval) != sizeof(val) &&
1249 		    optlen >= sizeof(ulval) &&
1250 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1251 			return -EFAULT;
1252 		}
1253 		if (ulval != ~0UL)
1254 			cmpxchg(&sk->sk_pacing_status,
1255 				SK_PACING_NONE,
1256 				SK_PACING_NEEDED);
1257 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1258 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1259 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1260 		if (ulval < pacing_rate)
1261 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1262 		return 0;
1263 		}
1264 	case SO_TXREHASH:
1265 		if (val < -1 || val > 1)
1266 			return -EINVAL;
1267 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1268 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1269 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1270 		 * and sk_getsockopt().
1271 		 */
1272 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1273 		return 0;
1274 	case SO_PEEK_OFF:
1275 		{
1276 		int (*set_peek_off)(struct sock *sk, int val);
1277 
1278 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1279 		if (set_peek_off)
1280 			ret = set_peek_off(sk, val);
1281 		else
1282 			ret = -EOPNOTSUPP;
1283 		return ret;
1284 		}
1285 #ifdef CONFIG_PAGE_POOL
1286 	case SO_DEVMEM_DONTNEED:
1287 		return sock_devmem_dontneed(sk, optval, optlen);
1288 #endif
1289 	}
1290 
1291 	sockopt_lock_sock(sk);
1292 
1293 	switch (optname) {
1294 	case SO_DEBUG:
1295 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1296 			ret = -EACCES;
1297 		else
1298 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1299 		break;
1300 	case SO_REUSEADDR:
1301 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1302 		break;
1303 	case SO_REUSEPORT:
1304 		if (valbool && !sk_is_inet(sk))
1305 			ret = -EOPNOTSUPP;
1306 		else
1307 			sk->sk_reuseport = valbool;
1308 		break;
1309 	case SO_DONTROUTE:
1310 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1311 		sk_dst_reset(sk);
1312 		break;
1313 	case SO_BROADCAST:
1314 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1315 		break;
1316 	case SO_SNDBUF:
1317 		/* Don't error on this; BSD doesn't, and if you think
1318 		 * about it this is right. Otherwise apps have to
1319 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1320 		 * are treated in BSD as hints.
1321 		 */
1322 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1323 set_sndbuf:
1324 		/* Ensure val * 2 fits into an int, to prevent max_t()
1325 		 * from treating it as a negative value.
1326 		 */
1327 		val = min_t(int, val, INT_MAX / 2);
1328 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1329 		WRITE_ONCE(sk->sk_sndbuf,
1330 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1331 		/* Wake up sending tasks if we upped the value. */
1332 		sk->sk_write_space(sk);
1333 		break;
1334 
1335 	case SO_SNDBUFFORCE:
1336 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1337 			ret = -EPERM;
1338 			break;
1339 		}
1340 
1341 		/* No negative values (to prevent underflow, as val will be
1342 		 * multiplied by 2).
1343 		 */
1344 		if (val < 0)
1345 			val = 0;
1346 		goto set_sndbuf;
1347 
1348 	case SO_RCVBUF:
1349 		/* Don't error on this; BSD doesn't, and if you think
1350 		 * about it this is right. Otherwise apps have to
1351 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1352 		 * are treated in BSD as hints.
1353 		 */
1354 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1355 		break;
1356 
1357 	case SO_RCVBUFFORCE:
1358 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1359 			ret = -EPERM;
1360 			break;
1361 		}
1362 
1363 		/* No negative values (to prevent underflow, as val will be
1364 		 * multiplied by 2).
1365 		 */
1366 		__sock_set_rcvbuf(sk, max(val, 0));
1367 		break;
1368 
1369 	case SO_KEEPALIVE:
1370 		if (sk->sk_prot->keepalive)
1371 			sk->sk_prot->keepalive(sk, valbool);
1372 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1373 		break;
1374 
1375 	case SO_OOBINLINE:
1376 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1377 		break;
1378 
1379 	case SO_NO_CHECK:
1380 		sk->sk_no_check_tx = valbool;
1381 		break;
1382 
1383 	case SO_LINGER:
1384 		if (optlen < sizeof(ling)) {
1385 			ret = -EINVAL;	/* 1003.1g */
1386 			break;
1387 		}
1388 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1389 			ret = -EFAULT;
1390 			break;
1391 		}
1392 		if (!ling.l_onoff) {
1393 			sock_reset_flag(sk, SOCK_LINGER);
1394 		} else {
1395 			unsigned long t_sec = ling.l_linger;
1396 
1397 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1398 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1399 			else
1400 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1401 			sock_set_flag(sk, SOCK_LINGER);
1402 		}
1403 		break;
1404 
1405 	case SO_BSDCOMPAT:
1406 		break;
1407 
1408 	case SO_TIMESTAMP_OLD:
1409 	case SO_TIMESTAMP_NEW:
1410 	case SO_TIMESTAMPNS_OLD:
1411 	case SO_TIMESTAMPNS_NEW:
1412 		sock_set_timestamp(sk, optname, valbool);
1413 		break;
1414 
1415 	case SO_TIMESTAMPING_NEW:
1416 	case SO_TIMESTAMPING_OLD:
1417 		if (optlen == sizeof(timestamping)) {
1418 			if (copy_from_sockptr(&timestamping, optval,
1419 					      sizeof(timestamping))) {
1420 				ret = -EFAULT;
1421 				break;
1422 			}
1423 		} else {
1424 			memset(&timestamping, 0, sizeof(timestamping));
1425 			timestamping.flags = val;
1426 		}
1427 		ret = sock_set_timestamping(sk, optname, timestamping);
1428 		break;
1429 
1430 	case SO_RCVLOWAT:
1431 		{
1432 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1433 
1434 		if (val < 0)
1435 			val = INT_MAX;
1436 		if (sock)
1437 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1438 		if (set_rcvlowat)
1439 			ret = set_rcvlowat(sk, val);
1440 		else
1441 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1442 		break;
1443 		}
1444 	case SO_RCVTIMEO_OLD:
1445 	case SO_RCVTIMEO_NEW:
1446 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1447 				       optlen, optname == SO_RCVTIMEO_OLD);
1448 		break;
1449 
1450 	case SO_SNDTIMEO_OLD:
1451 	case SO_SNDTIMEO_NEW:
1452 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1453 				       optlen, optname == SO_SNDTIMEO_OLD);
1454 		break;
1455 
1456 	case SO_ATTACH_FILTER: {
1457 		struct sock_fprog fprog;
1458 
1459 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1460 		if (!ret)
1461 			ret = sk_attach_filter(&fprog, sk);
1462 		break;
1463 	}
1464 	case SO_ATTACH_BPF:
1465 		ret = -EINVAL;
1466 		if (optlen == sizeof(u32)) {
1467 			u32 ufd;
1468 
1469 			ret = -EFAULT;
1470 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1471 				break;
1472 
1473 			ret = sk_attach_bpf(ufd, sk);
1474 		}
1475 		break;
1476 
1477 	case SO_ATTACH_REUSEPORT_CBPF: {
1478 		struct sock_fprog fprog;
1479 
1480 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1481 		if (!ret)
1482 			ret = sk_reuseport_attach_filter(&fprog, sk);
1483 		break;
1484 	}
1485 	case SO_ATTACH_REUSEPORT_EBPF:
1486 		ret = -EINVAL;
1487 		if (optlen == sizeof(u32)) {
1488 			u32 ufd;
1489 
1490 			ret = -EFAULT;
1491 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1492 				break;
1493 
1494 			ret = sk_reuseport_attach_bpf(ufd, sk);
1495 		}
1496 		break;
1497 
1498 	case SO_DETACH_REUSEPORT_BPF:
1499 		ret = reuseport_detach_prog(sk);
1500 		break;
1501 
1502 	case SO_DETACH_FILTER:
1503 		ret = sk_detach_filter(sk);
1504 		break;
1505 
1506 	case SO_LOCK_FILTER:
1507 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1508 			ret = -EPERM;
1509 		else
1510 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1511 		break;
1512 
1513 	case SO_MARK:
1514 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1515 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1516 			ret = -EPERM;
1517 			break;
1518 		}
1519 
1520 		__sock_set_mark(sk, val);
1521 		break;
1522 	case SO_RCVMARK:
1523 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1524 		break;
1525 
1526 	case SO_RCVPRIORITY:
1527 		sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
1528 		break;
1529 
1530 	case SO_RXQ_OVFL:
1531 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1532 		break;
1533 
1534 	case SO_WIFI_STATUS:
1535 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1536 		break;
1537 
1538 	case SO_NOFCS:
1539 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1540 		break;
1541 
1542 	case SO_SELECT_ERR_QUEUE:
1543 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1544 		break;
1545 
1546 
1547 	case SO_INCOMING_CPU:
1548 		reuseport_update_incoming_cpu(sk, val);
1549 		break;
1550 
1551 	case SO_CNX_ADVICE:
1552 		if (val == 1)
1553 			dst_negative_advice(sk);
1554 		break;
1555 
1556 	case SO_ZEROCOPY:
1557 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1558 			if (!(sk_is_tcp(sk) ||
1559 			      (sk->sk_type == SOCK_DGRAM &&
1560 			       sk->sk_protocol == IPPROTO_UDP)))
1561 				ret = -EOPNOTSUPP;
1562 		} else if (sk->sk_family != PF_RDS) {
1563 			ret = -EOPNOTSUPP;
1564 		}
1565 		if (!ret) {
1566 			if (val < 0 || val > 1)
1567 				ret = -EINVAL;
1568 			else
1569 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1570 		}
1571 		break;
1572 
1573 	case SO_TXTIME:
1574 		if (optlen != sizeof(struct sock_txtime)) {
1575 			ret = -EINVAL;
1576 			break;
1577 		} else if (copy_from_sockptr(&sk_txtime, optval,
1578 			   sizeof(struct sock_txtime))) {
1579 			ret = -EFAULT;
1580 			break;
1581 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1582 			ret = -EINVAL;
1583 			break;
1584 		}
1585 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1586 		 * scheduler has enough safeguards.
1587 		 */
1588 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1589 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1590 			ret = -EPERM;
1591 			break;
1592 		}
1593 
1594 		ret = sockopt_validate_clockid(sk_txtime.clockid);
1595 		if (ret)
1596 			break;
1597 
1598 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1599 		sk->sk_clockid = sk_txtime.clockid;
1600 		sk->sk_txtime_deadline_mode =
1601 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1602 		sk->sk_txtime_report_errors =
1603 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1604 		break;
1605 
1606 	case SO_BINDTOIFINDEX:
1607 		ret = sock_bindtoindex_locked(sk, val);
1608 		break;
1609 
1610 	case SO_BUF_LOCK:
1611 		if (val & ~SOCK_BUF_LOCK_MASK) {
1612 			ret = -EINVAL;
1613 			break;
1614 		}
1615 		sk->sk_userlocks = val | (sk->sk_userlocks &
1616 					  ~SOCK_BUF_LOCK_MASK);
1617 		break;
1618 
1619 	case SO_RESERVE_MEM:
1620 	{
1621 		int delta;
1622 
1623 		if (val < 0) {
1624 			ret = -EINVAL;
1625 			break;
1626 		}
1627 
1628 		delta = val - sk->sk_reserved_mem;
1629 		if (delta < 0)
1630 			sock_release_reserved_memory(sk, -delta);
1631 		else
1632 			ret = sock_reserve_memory(sk, delta);
1633 		break;
1634 	}
1635 
1636 	default:
1637 		ret = -ENOPROTOOPT;
1638 		break;
1639 	}
1640 	sockopt_release_sock(sk);
1641 	return ret;
1642 }
1643 
1644 int sock_setsockopt(struct socket *sock, int level, int optname,
1645 		    sockptr_t optval, unsigned int optlen)
1646 {
1647 	return sk_setsockopt(sock->sk, level, optname,
1648 			     optval, optlen);
1649 }
1650 EXPORT_SYMBOL(sock_setsockopt);
1651 
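/*
 * Illustrative sketch (not part of the original file): in-kernel callers
 * (e.g. code managing a kernel socket) pass their option value wrapped in
 * KERNEL_SOCKPTR() so that copy_from_sockptr() above reads from kernel
 * memory.  The helper shown is hypothetical; newer code would usually prefer
 * a dedicated helper such as sock_set_reuseaddr() defined earlier in this
 * file.
 */
#if 0	/* example only */
static int example_kernel_set_reuseaddr(struct socket *sock)
{
	int one = 1;

	return sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
			       KERNEL_SOCKPTR(&one), sizeof(one));
}
#endif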
1652 static const struct cred *sk_get_peer_cred(struct sock *sk)
1653 {
1654 	const struct cred *cred;
1655 
1656 	spin_lock(&sk->sk_peer_lock);
1657 	cred = get_cred(sk->sk_peer_cred);
1658 	spin_unlock(&sk->sk_peer_lock);
1659 
1660 	return cred;
1661 }
1662 
1663 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1664 			  struct ucred *ucred)
1665 {
1666 	ucred->pid = pid_vnr(pid);
1667 	ucred->uid = ucred->gid = -1;
1668 	if (cred) {
1669 		struct user_namespace *current_ns = current_user_ns();
1670 
1671 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1672 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1673 	}
1674 }
1675 
1676 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1677 {
1678 	struct user_namespace *user_ns = current_user_ns();
1679 	int i;
1680 
1681 	for (i = 0; i < src->ngroups; i++) {
1682 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1683 
1684 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1685 			return -EFAULT;
1686 	}
1687 
1688 	return 0;
1689 }
1690 
1691 int sk_getsockopt(struct sock *sk, int level, int optname,
1692 		  sockptr_t optval, sockptr_t optlen)
1693 {
1694 	struct socket *sock = sk->sk_socket;
1695 
1696 	union {
1697 		int val;
1698 		u64 val64;
1699 		unsigned long ulval;
1700 		struct linger ling;
1701 		struct old_timeval32 tm32;
1702 		struct __kernel_old_timeval tm;
1703 		struct  __kernel_sock_timeval stm;
1704 		struct sock_txtime txtime;
1705 		struct so_timestamping timestamping;
1706 	} v;
1707 
1708 	int lv = sizeof(int);
1709 	int len;
1710 
1711 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1712 		return -EFAULT;
1713 	if (len < 0)
1714 		return -EINVAL;
1715 
1716 	memset(&v, 0, sizeof(v));
1717 
1718 	switch (optname) {
1719 	case SO_DEBUG:
1720 		v.val = sock_flag(sk, SOCK_DBG);
1721 		break;
1722 
1723 	case SO_DONTROUTE:
1724 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1725 		break;
1726 
1727 	case SO_BROADCAST:
1728 		v.val = sock_flag(sk, SOCK_BROADCAST);
1729 		break;
1730 
1731 	case SO_SNDBUF:
1732 		v.val = READ_ONCE(sk->sk_sndbuf);
1733 		break;
1734 
1735 	case SO_RCVBUF:
1736 		v.val = READ_ONCE(sk->sk_rcvbuf);
1737 		break;
1738 
1739 	case SO_REUSEADDR:
1740 		v.val = sk->sk_reuse;
1741 		break;
1742 
1743 	case SO_REUSEPORT:
1744 		v.val = sk->sk_reuseport;
1745 		break;
1746 
1747 	case SO_KEEPALIVE:
1748 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1749 		break;
1750 
1751 	case SO_TYPE:
1752 		v.val = sk->sk_type;
1753 		break;
1754 
1755 	case SO_PROTOCOL:
1756 		v.val = sk->sk_protocol;
1757 		break;
1758 
1759 	case SO_DOMAIN:
1760 		v.val = sk->sk_family;
1761 		break;
1762 
1763 	case SO_ERROR:
1764 		v.val = -sock_error(sk);
1765 		if (v.val == 0)
1766 			v.val = xchg(&sk->sk_err_soft, 0);
1767 		break;
1768 
1769 	case SO_OOBINLINE:
1770 		v.val = sock_flag(sk, SOCK_URGINLINE);
1771 		break;
1772 
1773 	case SO_NO_CHECK:
1774 		v.val = sk->sk_no_check_tx;
1775 		break;
1776 
1777 	case SO_PRIORITY:
1778 		v.val = READ_ONCE(sk->sk_priority);
1779 		break;
1780 
1781 	case SO_LINGER:
1782 		lv		= sizeof(v.ling);
1783 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1784 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1785 		break;
1786 
1787 	case SO_BSDCOMPAT:
1788 		break;
1789 
1790 	case SO_TIMESTAMP_OLD:
1791 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1792 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1793 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1794 		break;
1795 
1796 	case SO_TIMESTAMPNS_OLD:
1797 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1798 		break;
1799 
1800 	case SO_TIMESTAMP_NEW:
1801 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1802 		break;
1803 
1804 	case SO_TIMESTAMPNS_NEW:
1805 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1806 		break;
1807 
1808 	case SO_TIMESTAMPING_OLD:
1809 	case SO_TIMESTAMPING_NEW:
1810 		lv = sizeof(v.timestamping);
1811 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1812 		 * returning the flags when they were set through the same option.
1813 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1814 		 */
1815 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1816 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1817 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1818 		}
1819 		break;
1820 
1821 	case SO_RCVTIMEO_OLD:
1822 	case SO_RCVTIMEO_NEW:
1823 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1824 				      SO_RCVTIMEO_OLD == optname);
1825 		break;
1826 
1827 	case SO_SNDTIMEO_OLD:
1828 	case SO_SNDTIMEO_NEW:
1829 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1830 				      SO_SNDTIMEO_OLD == optname);
1831 		break;
1832 
1833 	case SO_RCVLOWAT:
1834 		v.val = READ_ONCE(sk->sk_rcvlowat);
1835 		break;
1836 
1837 	case SO_SNDLOWAT:
1838 		v.val = 1;
1839 		break;
1840 
1841 	case SO_PASSCRED:
1842 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1843 		break;
1844 
1845 	case SO_PASSPIDFD:
1846 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1847 		break;
1848 
1849 	case SO_PEERCRED:
1850 	{
1851 		struct ucred peercred;
1852 		if (len > sizeof(peercred))
1853 			len = sizeof(peercred);
1854 
1855 		spin_lock(&sk->sk_peer_lock);
1856 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1857 		spin_unlock(&sk->sk_peer_lock);
1858 
1859 		if (copy_to_sockptr(optval, &peercred, len))
1860 			return -EFAULT;
1861 		goto lenout;
1862 	}
1863 
1864 	case SO_PEERPIDFD:
1865 	{
1866 		struct pid *peer_pid;
1867 		struct file *pidfd_file = NULL;
1868 		int pidfd;
1869 
1870 		if (len > sizeof(pidfd))
1871 			len = sizeof(pidfd);
1872 
1873 		spin_lock(&sk->sk_peer_lock);
1874 		peer_pid = get_pid(sk->sk_peer_pid);
1875 		spin_unlock(&sk->sk_peer_lock);
1876 
1877 		if (!peer_pid)
1878 			return -ENODATA;
1879 
1880 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1881 		put_pid(peer_pid);
1882 		if (pidfd < 0)
1883 			return pidfd;
1884 
1885 		if (copy_to_sockptr(optval, &pidfd, len) ||
1886 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1887 			put_unused_fd(pidfd);
1888 			fput(pidfd_file);
1889 
1890 			return -EFAULT;
1891 		}
1892 
1893 		fd_install(pidfd, pidfd_file);
1894 		return 0;
1895 	}
1896 
1897 	case SO_PEERGROUPS:
1898 	{
1899 		const struct cred *cred;
1900 		int ret, n;
1901 
1902 		cred = sk_get_peer_cred(sk);
1903 		if (!cred)
1904 			return -ENODATA;
1905 
1906 		n = cred->group_info->ngroups;
1907 		if (len < n * sizeof(gid_t)) {
1908 			len = n * sizeof(gid_t);
1909 			put_cred(cred);
1910 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1911 		}
1912 		len = n * sizeof(gid_t);
1913 
1914 		ret = groups_to_user(optval, cred->group_info);
1915 		put_cred(cred);
1916 		if (ret)
1917 			return ret;
1918 		goto lenout;
1919 	}
1920 
1921 	case SO_PEERNAME:
1922 	{
1923 		struct sockaddr_storage address;
1924 
1925 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1926 		if (lv < 0)
1927 			return -ENOTCONN;
1928 		if (lv < len)
1929 			return -EINVAL;
1930 		if (copy_to_sockptr(optval, &address, len))
1931 			return -EFAULT;
1932 		goto lenout;
1933 	}
1934 
1935 	/* Dubious BSD thing... Probably nobody even uses it, but
1936 	 * the UNIX standard wants it for whatever reason... -DaveM
1937 	 */
1938 	case SO_ACCEPTCONN:
1939 		v.val = sk->sk_state == TCP_LISTEN;
1940 		break;
1941 
1942 	case SO_PASSSEC:
1943 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1944 		break;
1945 
1946 	case SO_PEERSEC:
1947 		return security_socket_getpeersec_stream(sock,
1948 							 optval, optlen, len);
1949 
1950 	case SO_MARK:
1951 		v.val = READ_ONCE(sk->sk_mark);
1952 		break;
1953 
1954 	case SO_RCVMARK:
1955 		v.val = sock_flag(sk, SOCK_RCVMARK);
1956 		break;
1957 
1958 	case SO_RCVPRIORITY:
1959 		v.val = sock_flag(sk, SOCK_RCVPRIORITY);
1960 		break;
1961 
1962 	case SO_RXQ_OVFL:
1963 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1964 		break;
1965 
1966 	case SO_WIFI_STATUS:
1967 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1968 		break;
1969 
1970 	case SO_PEEK_OFF:
1971 		if (!READ_ONCE(sock->ops)->set_peek_off)
1972 			return -EOPNOTSUPP;
1973 
1974 		v.val = READ_ONCE(sk->sk_peek_off);
1975 		break;
1976 	case SO_NOFCS:
1977 		v.val = sock_flag(sk, SOCK_NOFCS);
1978 		break;
1979 
1980 	case SO_BINDTODEVICE:
1981 		return sock_getbindtodevice(sk, optval, optlen, len);
1982 
1983 	case SO_GET_FILTER:
1984 		len = sk_get_filter(sk, optval, len);
1985 		if (len < 0)
1986 			return len;
1987 
1988 		goto lenout;
1989 
1990 	case SO_LOCK_FILTER:
1991 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1992 		break;
1993 
1994 	case SO_BPF_EXTENSIONS:
1995 		v.val = bpf_tell_extensions();
1996 		break;
1997 
1998 	case SO_SELECT_ERR_QUEUE:
1999 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
2000 		break;
2001 
2002 #ifdef CONFIG_NET_RX_BUSY_POLL
2003 	case SO_BUSY_POLL:
2004 		v.val = READ_ONCE(sk->sk_ll_usec);
2005 		break;
2006 	case SO_PREFER_BUSY_POLL:
2007 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
2008 		break;
2009 #endif
2010 
2011 	case SO_MAX_PACING_RATE:
2012 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
2013 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2014 			lv = sizeof(v.ulval);
2015 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2016 		} else {
2017 			/* 32bit version */
2018 			v.val = min_t(unsigned long, ~0U,
2019 				      READ_ONCE(sk->sk_max_pacing_rate));
2020 		}
2021 		break;
2022 
2023 	case SO_INCOMING_CPU:
2024 		v.val = READ_ONCE(sk->sk_incoming_cpu);
2025 		break;
2026 
2027 	case SO_MEMINFO:
2028 	{
2029 		u32 meminfo[SK_MEMINFO_VARS];
2030 
2031 		sk_get_meminfo(sk, meminfo);
2032 
2033 		len = min_t(unsigned int, len, sizeof(meminfo));
2034 		if (copy_to_sockptr(optval, &meminfo, len))
2035 			return -EFAULT;
2036 
2037 		goto lenout;
2038 	}
2039 
2040 #ifdef CONFIG_NET_RX_BUSY_POLL
2041 	case SO_INCOMING_NAPI_ID:
2042 		v.val = READ_ONCE(sk->sk_napi_id);
2043 
2044 		/* aggregate non-NAPI IDs down to 0 */
2045 		if (v.val < MIN_NAPI_ID)
2046 			v.val = 0;
2047 
2048 		break;
2049 #endif
2050 
2051 	case SO_COOKIE:
2052 		lv = sizeof(u64);
2053 		if (len < lv)
2054 			return -EINVAL;
2055 		v.val64 = sock_gen_cookie(sk);
2056 		break;
2057 
2058 	case SO_ZEROCOPY:
2059 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
2060 		break;
2061 
2062 	case SO_TXTIME:
2063 		lv = sizeof(v.txtime);
2064 		v.txtime.clockid = sk->sk_clockid;
2065 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2066 				  SOF_TXTIME_DEADLINE_MODE : 0;
2067 		v.txtime.flags |= sk->sk_txtime_report_errors ?
2068 				  SOF_TXTIME_REPORT_ERRORS : 0;
2069 		break;
2070 
2071 	case SO_BINDTOIFINDEX:
2072 		v.val = READ_ONCE(sk->sk_bound_dev_if);
2073 		break;
2074 
2075 	case SO_NETNS_COOKIE:
2076 		lv = sizeof(u64);
2077 		if (len != lv)
2078 			return -EINVAL;
2079 		v.val64 = sock_net(sk)->net_cookie;
2080 		break;
2081 
2082 	case SO_BUF_LOCK:
2083 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2084 		break;
2085 
2086 	case SO_RESERVE_MEM:
2087 		v.val = READ_ONCE(sk->sk_reserved_mem);
2088 		break;
2089 
2090 	case SO_TXREHASH:
2091 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2092 		v.val = READ_ONCE(sk->sk_txrehash);
2093 		break;
2094 
2095 	default:
2096 		/* We implement SO_SNDLOWAT etc. as not settable
2097 		 * (1003.1g 7).
2098 		 */
2099 		return -ENOPROTOOPT;
2100 	}
2101 
2102 	if (len > lv)
2103 		len = lv;
2104 	if (copy_to_sockptr(optval, &v, len))
2105 		return -EFAULT;
2106 lenout:
2107 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2108 		return -EFAULT;
2109 	return 0;
2110 }
2111 
2112 /*
2113  * Initialize an sk_lock.
2114  *
2115  * (We also register the sk_lock with the lock validator.)
2116  */
2117 static inline void sock_lock_init(struct sock *sk)
2118 {
2119 	if (sk->sk_kern_sock)
2120 		sock_lock_init_class_and_name(
2121 			sk,
2122 			af_family_kern_slock_key_strings[sk->sk_family],
2123 			af_family_kern_slock_keys + sk->sk_family,
2124 			af_family_kern_key_strings[sk->sk_family],
2125 			af_family_kern_keys + sk->sk_family);
2126 	else
2127 		sock_lock_init_class_and_name(
2128 			sk,
2129 			af_family_slock_key_strings[sk->sk_family],
2130 			af_family_slock_keys + sk->sk_family,
2131 			af_family_key_strings[sk->sk_family],
2132 			af_family_keys + sk->sk_family);
2133 }
2134 
2135 /*
2136  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2137  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2138  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2139  */
2140 static void sock_copy(struct sock *nsk, const struct sock *osk)
2141 {
2142 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2143 #ifdef CONFIG_SECURITY_NETWORK
2144 	void *sptr = nsk->sk_security;
2145 #endif
2146 
2147 	/* If we move sk_tx_queue_mapping out of the private section,
2148 	 * we must check if sk_tx_queue_clear() is called after
2149 	 * sock_copy() in sk_clone_lock().
2150 	 */
2151 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2152 		     offsetof(struct sock, sk_dontcopy_begin) ||
2153 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2154 		     offsetof(struct sock, sk_dontcopy_end));
2155 
2156 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2157 
2158 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2159 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2160 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2161 
2162 #ifdef CONFIG_SECURITY_NETWORK
2163 	nsk->sk_security = sptr;
2164 	security_sk_clone(osk, nsk);
2165 #endif
2166 }
2167 
2168 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2169 		int family)
2170 {
2171 	struct sock *sk;
2172 	struct kmem_cache *slab;
2173 
2174 	slab = prot->slab;
2175 	if (slab != NULL) {
2176 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2177 		if (!sk)
2178 			return sk;
2179 		if (want_init_on_alloc(priority))
2180 			sk_prot_clear_nulls(sk, prot->obj_size);
2181 	} else
2182 		sk = kmalloc(prot->obj_size, priority);
2183 
2184 	if (sk != NULL) {
2185 		if (security_sk_alloc(sk, family, priority))
2186 			goto out_free;
2187 
2188 		if (!try_module_get(prot->owner))
2189 			goto out_free_sec;
2190 	}
2191 
2192 	return sk;
2193 
2194 out_free_sec:
2195 	security_sk_free(sk);
2196 out_free:
2197 	if (slab != NULL)
2198 		kmem_cache_free(slab, sk);
2199 	else
2200 		kfree(sk);
2201 	return NULL;
2202 }
2203 
2204 static void sk_prot_free(struct proto *prot, struct sock *sk)
2205 {
2206 	struct kmem_cache *slab;
2207 	struct module *owner;
2208 
2209 	owner = prot->owner;
2210 	slab = prot->slab;
2211 
2212 	cgroup_sk_free(&sk->sk_cgrp_data);
2213 	mem_cgroup_sk_free(sk);
2214 	security_sk_free(sk);
2215 	if (slab != NULL)
2216 		kmem_cache_free(slab, sk);
2217 	else
2218 		kfree(sk);
2219 	module_put(owner);
2220 }
2221 
2222 /**
2223  *	sk_alloc - All socket objects are allocated here
2224  *	@net: the applicable net namespace
2225  *	@family: protocol family
2226  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2227  *	@prot: struct proto associated with this new sock instance
2228  *	@kern: is this to be a kernel socket?
2229  */
2230 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2231 		      struct proto *prot, int kern)
2232 {
2233 	struct sock *sk;
2234 
2235 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2236 	if (sk) {
2237 		sk->sk_family = family;
2238 		/*
2239 		 * See comment in struct sock definition to understand
2240 		 * why we need sk_prot_creator -acme
2241 		 */
2242 		sk->sk_prot = sk->sk_prot_creator = prot;
2243 		sk->sk_kern_sock = kern;
2244 		sock_lock_init(sk);
2245 		sk->sk_net_refcnt = kern ? 0 : 1;
2246 		if (likely(sk->sk_net_refcnt)) {
2247 			get_net_track(net, &sk->ns_tracker, priority);
2248 			sock_inuse_add(net, 1);
2249 		} else {
2250 			__netns_tracker_alloc(net, &sk->ns_tracker,
2251 					      false, priority);
2252 		}
2253 
2254 		sock_net_set(sk, net);
2255 		refcount_set(&sk->sk_wmem_alloc, 1);
2256 
2257 		mem_cgroup_sk_alloc(sk);
2258 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2259 		sock_update_classid(&sk->sk_cgrp_data);
2260 		sock_update_netprioidx(&sk->sk_cgrp_data);
2261 		sk_tx_queue_clear(sk);
2262 	}
2263 
2264 	return sk;
2265 }
2266 EXPORT_SYMBOL(sk_alloc);
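
/* A minimal usage sketch (not part of this file): a protocol family's
 * ->create() hook typically pairs sk_alloc() with sock_init_data().
 * PF_EXAMPLE and example_proto below are placeholders, not real kernel
 * symbols.
 *
 *	static int example_create(struct net *net, struct socket *sock, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *		if (!sk)
 *			return -ENOBUFS;
 *		sock_init_data(sock, sk);
 *		return 0;
 *	}
 *
 * On a later error the protocol drops its reference with sk_free() (or
 * sock_put() once sk_refcnt is in use).
 */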
2267 
2268 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2269  * grace period. This is the case for UDP sockets and TCP listeners.
2270  */
2271 static void __sk_destruct(struct rcu_head *head)
2272 {
2273 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2274 	struct sk_filter *filter;
2275 
2276 	if (sk->sk_destruct)
2277 		sk->sk_destruct(sk);
2278 
2279 	filter = rcu_dereference_check(sk->sk_filter,
2280 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2281 	if (filter) {
2282 		sk_filter_uncharge(sk, filter);
2283 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2284 	}
2285 
2286 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2287 
2288 #ifdef CONFIG_BPF_SYSCALL
2289 	bpf_sk_storage_free(sk);
2290 #endif
2291 
2292 	if (atomic_read(&sk->sk_omem_alloc))
2293 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2294 			 __func__, atomic_read(&sk->sk_omem_alloc));
2295 
2296 	if (sk->sk_frag.page) {
2297 		put_page(sk->sk_frag.page);
2298 		sk->sk_frag.page = NULL;
2299 	}
2300 
2301 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2302 	put_cred(sk->sk_peer_cred);
2303 	put_pid(sk->sk_peer_pid);
2304 
2305 	if (likely(sk->sk_net_refcnt))
2306 		put_net_track(sock_net(sk), &sk->ns_tracker);
2307 	else
2308 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2309 
2310 	sk_prot_free(sk->sk_prot_creator, sk);
2311 }
2312 
2313 void sk_destruct(struct sock *sk)
2314 {
2315 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2316 
2317 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2318 		reuseport_detach_sock(sk);
2319 		use_call_rcu = true;
2320 	}
2321 
2322 	if (use_call_rcu)
2323 		call_rcu(&sk->sk_rcu, __sk_destruct);
2324 	else
2325 		__sk_destruct(&sk->sk_rcu);
2326 }
2327 
2328 static void __sk_free(struct sock *sk)
2329 {
2330 	if (likely(sk->sk_net_refcnt))
2331 		sock_inuse_add(sock_net(sk), -1);
2332 
2333 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2334 		sock_diag_broadcast_destroy(sk);
2335 	else
2336 		sk_destruct(sk);
2337 }
2338 
2339 void sk_free(struct sock *sk)
2340 {
2341 	/*
2342 	 * We subtract one from sk_wmem_alloc; if the result is not zero,
2343 	 * some packets are still in some tx queue and sock_wfree()
2344 	 * will call __sk_free(sk) later.
2345 	 */
2346 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2347 		__sk_free(sk);
2348 }
2349 EXPORT_SYMBOL(sk_free);
2350 
2351 static void sk_init_common(struct sock *sk)
2352 {
2353 	skb_queue_head_init(&sk->sk_receive_queue);
2354 	skb_queue_head_init(&sk->sk_write_queue);
2355 	skb_queue_head_init(&sk->sk_error_queue);
2356 
2357 	rwlock_init(&sk->sk_callback_lock);
2358 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2359 			af_rlock_keys + sk->sk_family,
2360 			af_family_rlock_key_strings[sk->sk_family]);
2361 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2362 			af_wlock_keys + sk->sk_family,
2363 			af_family_wlock_key_strings[sk->sk_family]);
2364 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2365 			af_elock_keys + sk->sk_family,
2366 			af_family_elock_key_strings[sk->sk_family]);
2367 	if (sk->sk_kern_sock)
2368 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2369 			af_kern_callback_keys + sk->sk_family,
2370 			af_family_kern_clock_key_strings[sk->sk_family]);
2371 	else
2372 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2373 			af_callback_keys + sk->sk_family,
2374 			af_family_clock_key_strings[sk->sk_family]);
2375 }
2376 
2377 /**
2378  *	sk_clone_lock - clone a socket, and lock its clone
2379  *	@sk: the socket to clone
2380  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2381  *
2382  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2383  */
2384 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2385 {
2386 	struct proto *prot = READ_ONCE(sk->sk_prot);
2387 	struct sk_filter *filter;
2388 	bool is_charged = true;
2389 	struct sock *newsk;
2390 
2391 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2392 	if (!newsk)
2393 		goto out;
2394 
2395 	sock_copy(newsk, sk);
2396 
2397 	newsk->sk_prot_creator = prot;
2398 
2399 	/* SANITY */
2400 	if (likely(newsk->sk_net_refcnt)) {
2401 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2402 		sock_inuse_add(sock_net(newsk), 1);
2403 	} else {
2404 		/* Kernel sockets do not elevate the struct net refcount.
2405 		 * Instead, use a tracker to more easily detect if a layer
2406 		 * is not properly dismantling its kernel sockets at netns
2407 		 * destroy time.
2408 		 */
2409 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2410 				      false, priority);
2411 	}
2412 	sk_node_init(&newsk->sk_node);
2413 	sock_lock_init(newsk);
2414 	bh_lock_sock(newsk);
2415 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2416 	newsk->sk_backlog.len = 0;
2417 
2418 	atomic_set(&newsk->sk_rmem_alloc, 0);
2419 
2420 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2421 	refcount_set(&newsk->sk_wmem_alloc, 1);
2422 
2423 	atomic_set(&newsk->sk_omem_alloc, 0);
2424 	sk_init_common(newsk);
2425 
2426 	newsk->sk_dst_cache	= NULL;
2427 	newsk->sk_dst_pending_confirm = 0;
2428 	newsk->sk_wmem_queued	= 0;
2429 	newsk->sk_forward_alloc = 0;
2430 	newsk->sk_reserved_mem  = 0;
2431 	atomic_set(&newsk->sk_drops, 0);
2432 	newsk->sk_send_head	= NULL;
2433 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2434 	atomic_set(&newsk->sk_zckey, 0);
2435 
2436 	sock_reset_flag(newsk, SOCK_DONE);
2437 
2438 	/* sk->sk_memcg will be populated at accept() time */
2439 	newsk->sk_memcg = NULL;
2440 
2441 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2442 
2443 	rcu_read_lock();
2444 	filter = rcu_dereference(sk->sk_filter);
2445 	if (filter != NULL)
2446 		/* Though it's an empty new sock, the charging may fail
2447 		 * if sysctl_optmem_max was changed between creation of the
2448 		 * original socket and its cloning.
2449 		 */
2450 		is_charged = sk_filter_charge(newsk, filter);
2451 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2452 	rcu_read_unlock();
2453 
2454 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2455 		/* We need to make sure that we don't uncharge the new
2456 		 * socket if we couldn't charge it in the first place
2457 		 * as otherwise we uncharge the parent's filter.
2458 		 */
2459 		if (!is_charged)
2460 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2461 		sk_free_unlock_clone(newsk);
2462 		newsk = NULL;
2463 		goto out;
2464 	}
2465 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2466 
2467 	if (bpf_sk_storage_clone(sk, newsk)) {
2468 		sk_free_unlock_clone(newsk);
2469 		newsk = NULL;
2470 		goto out;
2471 	}
2472 
2473 	/* Clear sk_user_data if parent had the pointer tagged
2474 	 * as not suitable for copying when cloning.
2475 	 */
2476 	if (sk_user_data_is_nocopy(newsk))
2477 		newsk->sk_user_data = NULL;
2478 
2479 	newsk->sk_err	   = 0;
2480 	newsk->sk_err_soft = 0;
2481 	newsk->sk_priority = 0;
2482 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2483 
2484 	/* Before updating sk_refcnt, we must commit prior changes to memory
2485 	 * (Documentation/RCU/rculist_nulls.rst for details)
2486 	 */
2487 	smp_wmb();
2488 	refcount_set(&newsk->sk_refcnt, 2);
2489 
2490 	sk_set_socket(newsk, NULL);
2491 	sk_tx_queue_clear(newsk);
2492 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2493 
2494 	if (newsk->sk_prot->sockets_allocated)
2495 		sk_sockets_allocated_inc(newsk);
2496 
2497 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2498 		net_enable_timestamp();
2499 out:
2500 	return newsk;
2501 }
2502 EXPORT_SYMBOL_GPL(sk_clone_lock);
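
/* Usage sketch: as the kernel-doc above notes, the caller must unlock the
 * clone even on its own error paths. A typical clone sequence looks roughly
 * like this (protocol-specific initialisation elided):
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (!newsk)
 *		return NULL;
 *	... copy protocol state into newsk ...
 *	bh_unlock_sock(newsk);
 *	return newsk;
 *
 * If an error is found after cloning, sk_free_unlock_clone() below both
 * unlocks and frees the half-initialised clone.
 */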
2503 
2504 void sk_free_unlock_clone(struct sock *sk)
2505 {
2506 	/* It is still a raw copy of the parent, so invalidate
2507 	 * the destructor and do a plain sk_free(). */
2508 	sk->sk_destruct = NULL;
2509 	bh_unlock_sock(sk);
2510 	sk_free(sk);
2511 }
2512 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2513 
2514 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2515 {
2516 	bool is_ipv6 = false;
2517 	u32 max_size;
2518 
2519 #if IS_ENABLED(CONFIG_IPV6)
2520 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2521 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2522 #endif
2523 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2524 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2525 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2526 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2527 		max_size = GSO_LEGACY_MAX_SIZE;
2528 
2529 	return max_size - (MAX_TCP_HEADER + 1);
2530 }
2531 
2532 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2533 {
2534 	u32 max_segs = 1;
2535 
2536 	sk->sk_route_caps = dst->dev->features;
2537 	if (sk_is_tcp(sk))
2538 		sk->sk_route_caps |= NETIF_F_GSO;
2539 	if (sk->sk_route_caps & NETIF_F_GSO)
2540 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2541 	if (unlikely(sk->sk_gso_disabled))
2542 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2543 	if (sk_can_gso(sk)) {
2544 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2545 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2546 		} else {
2547 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2548 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2549 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2550 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2551 		}
2552 	}
2553 	sk->sk_gso_max_segs = max_segs;
2554 	sk_dst_set(sk, dst);
2555 }
2556 EXPORT_SYMBOL_GPL(sk_setup_caps);
2557 
2558 /*
2559  *	Simple resource managers for sockets.
2560  */
2561 
2562 
2563 /*
2564  * Write buffer destructor automatically called from kfree_skb.
2565  */
2566 void sock_wfree(struct sk_buff *skb)
2567 {
2568 	struct sock *sk = skb->sk;
2569 	unsigned int len = skb->truesize;
2570 	bool free;
2571 
2572 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2573 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2574 		    sk->sk_write_space == sock_def_write_space) {
2575 			rcu_read_lock();
2576 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2577 			sock_def_write_space_wfree(sk);
2578 			rcu_read_unlock();
2579 			if (unlikely(free))
2580 				__sk_free(sk);
2581 			return;
2582 		}
2583 
2584 		/*
2585 		 * Keep a reference on sk_wmem_alloc; it will be released
2586 		 * after the sk_write_space() call.
2587 		 */
2588 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2589 		sk->sk_write_space(sk);
2590 		len = 1;
2591 	}
2592 	/*
2593 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2594 	 * could not do because of in-flight packets
2595 	 */
2596 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2597 		__sk_free(sk);
2598 }
2599 EXPORT_SYMBOL(sock_wfree);
2600 
2601 /* This variant of sock_wfree() is used by TCP,
2602  * since it sets SOCK_USE_WRITE_QUEUE.
2603  */
2604 void __sock_wfree(struct sk_buff *skb)
2605 {
2606 	struct sock *sk = skb->sk;
2607 
2608 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2609 		__sk_free(sk);
2610 }
2611 
2612 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2613 {
2614 	skb_orphan(skb);
2615 #ifdef CONFIG_INET
2616 	if (unlikely(!sk_fullsock(sk)))
2617 		return skb_set_owner_edemux(skb, sk);
2618 #endif
2619 	skb->sk = sk;
2620 	skb->destructor = sock_wfree;
2621 	skb_set_hash_from_sk(skb, sk);
2622 	/*
2623 	 * We used to take a refcount on sk, but the following operation
2624 	 * is enough to guarantee sk_free() won't free this sock until
2625 	 * all in-flight packets are completed.
2626 	 */
2627 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2628 }
2629 EXPORT_SYMBOL(skb_set_owner_w);
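
/* Ownership sketch: pairing an allocation with skb_set_owner_w() makes
 * kfree_skb() run sock_wfree(), which returns the truesize charge to
 * sk_wmem_alloc and may wake the writer:
 *
 *	skb = alloc_skb(size, sk->sk_allocation);
 *	if (skb)
 *		skb_set_owner_w(skb, sk);
 *	...
 *	kfree_skb(skb);		(charge released via sock_wfree())
 *
 * sock_wmalloc() further below wraps exactly this, adding an sk_sndbuf check.
 */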
2630 
2631 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2632 {
2633 	/* Drivers depend on in-order delivery for crypto offload;
2634 	 * a partial orphan breaks the out-of-order-OK logic.
2635 	 */
2636 	if (skb_is_decrypted(skb))
2637 		return false;
2638 
2639 	return (skb->destructor == sock_wfree ||
2640 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2641 }
2642 
2643 /* This helper is used by netem, as it can hold packets in its
2644  * delay queue. We want to allow the owner socket to send more
2645  * packets, as if they were already TX completed by a typical driver.
2646  * But we also want to keep skb->sk set because some packet schedulers
2647  * rely on it (sch_fq for example).
2648  */
2649 void skb_orphan_partial(struct sk_buff *skb)
2650 {
2651 	if (skb_is_tcp_pure_ack(skb))
2652 		return;
2653 
2654 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2655 		return;
2656 
2657 	skb_orphan(skb);
2658 }
2659 EXPORT_SYMBOL(skb_orphan_partial);
2660 
2661 /*
2662  * Read buffer destructor automatically called from kfree_skb.
2663  */
2664 void sock_rfree(struct sk_buff *skb)
2665 {
2666 	struct sock *sk = skb->sk;
2667 	unsigned int len = skb->truesize;
2668 
2669 	atomic_sub(len, &sk->sk_rmem_alloc);
2670 	sk_mem_uncharge(sk, len);
2671 }
2672 EXPORT_SYMBOL(sock_rfree);
2673 
2674 /*
2675  * Buffer destructor for skbs that are not used directly in read or write
2676  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2677  */
2678 void sock_efree(struct sk_buff *skb)
2679 {
2680 	sock_put(skb->sk);
2681 }
2682 EXPORT_SYMBOL(sock_efree);
2683 
2684 /* Buffer destructor for prefetch/receive path where reference count may
2685  * not be held, e.g. for listen sockets.
2686  */
2687 #ifdef CONFIG_INET
2688 void sock_pfree(struct sk_buff *skb)
2689 {
2690 	struct sock *sk = skb->sk;
2691 
2692 	if (!sk_is_refcounted(sk))
2693 		return;
2694 
2695 	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2696 		inet_reqsk(sk)->rsk_listener = NULL;
2697 		reqsk_free(inet_reqsk(sk));
2698 		return;
2699 	}
2700 
2701 	sock_gen_put(sk);
2702 }
2703 EXPORT_SYMBOL(sock_pfree);
2704 #endif /* CONFIG_INET */
2705 
2706 kuid_t sock_i_uid(struct sock *sk)
2707 {
2708 	kuid_t uid;
2709 
2710 	read_lock_bh(&sk->sk_callback_lock);
2711 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2712 	read_unlock_bh(&sk->sk_callback_lock);
2713 	return uid;
2714 }
2715 EXPORT_SYMBOL(sock_i_uid);
2716 
2717 unsigned long __sock_i_ino(struct sock *sk)
2718 {
2719 	unsigned long ino;
2720 
2721 	read_lock(&sk->sk_callback_lock);
2722 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2723 	read_unlock(&sk->sk_callback_lock);
2724 	return ino;
2725 }
2726 EXPORT_SYMBOL(__sock_i_ino);
2727 
2728 unsigned long sock_i_ino(struct sock *sk)
2729 {
2730 	unsigned long ino;
2731 
2732 	local_bh_disable();
2733 	ino = __sock_i_ino(sk);
2734 	local_bh_enable();
2735 	return ino;
2736 }
2737 EXPORT_SYMBOL(sock_i_ino);
2738 
2739 /*
2740  * Allocate a skb from the socket's send buffer.
2741  */
2742 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2743 			     gfp_t priority)
2744 {
2745 	if (force ||
2746 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2747 		struct sk_buff *skb = alloc_skb(size, priority);
2748 
2749 		if (skb) {
2750 			skb_set_owner_w(skb, sk);
2751 			return skb;
2752 		}
2753 	}
2754 	return NULL;
2755 }
2756 EXPORT_SYMBOL(sock_wmalloc);
2757 
2758 static void sock_ofree(struct sk_buff *skb)
2759 {
2760 	struct sock *sk = skb->sk;
2761 
2762 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2763 }
2764 
2765 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2766 			     gfp_t priority)
2767 {
2768 	struct sk_buff *skb;
2769 
2770 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2771 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2772 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2773 		return NULL;
2774 
2775 	skb = alloc_skb(size, priority);
2776 	if (!skb)
2777 		return NULL;
2778 
2779 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2780 	skb->sk = sk;
2781 	skb->destructor = sock_ofree;
2782 	return skb;
2783 }
2784 
2785 /*
2786  * Allocate a memory block from the socket's option memory buffer.
2787  */
2788 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2789 {
2790 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2791 
2792 	if ((unsigned int)size <= optmem_max &&
2793 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2794 		void *mem;
2795 		/* Do the add first, to avoid a race in case kmalloc
2796 		 * sleeps.
2797 		 */
2798 		atomic_add(size, &sk->sk_omem_alloc);
2799 		mem = kmalloc(size, priority);
2800 		if (mem)
2801 			return mem;
2802 		atomic_sub(size, &sk->sk_omem_alloc);
2803 	}
2804 	return NULL;
2805 }
2806 EXPORT_SYMBOL(sock_kmalloc);
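
/* Usage sketch: option memory must be released with the same size that was
 * charged, via sock_kfree_s() (or sock_kzfree_s() for sensitive data):
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 */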
2807 
2808 /* Free an option memory block. Note that we actually want the inline
2809  * here as this allows gcc to detect the nullify and fold away the
2810  * condition entirely.
2811  */
2812 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2813 				  const bool nullify)
2814 {
2815 	if (WARN_ON_ONCE(!mem))
2816 		return;
2817 	if (nullify)
2818 		kfree_sensitive(mem);
2819 	else
2820 		kfree(mem);
2821 	atomic_sub(size, &sk->sk_omem_alloc);
2822 }
2823 
2824 void sock_kfree_s(struct sock *sk, void *mem, int size)
2825 {
2826 	__sock_kfree_s(sk, mem, size, false);
2827 }
2828 EXPORT_SYMBOL(sock_kfree_s);
2829 
2830 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2831 {
2832 	__sock_kfree_s(sk, mem, size, true);
2833 }
2834 EXPORT_SYMBOL(sock_kzfree_s);
2835 
2836 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2837  * I think these locks should be removed for datagram sockets.
2838  */
2839 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2840 {
2841 	DEFINE_WAIT(wait);
2842 
2843 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2844 	for (;;) {
2845 		if (!timeo)
2846 			break;
2847 		if (signal_pending(current))
2848 			break;
2849 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2850 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2851 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2852 			break;
2853 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2854 			break;
2855 		if (READ_ONCE(sk->sk_err))
2856 			break;
2857 		timeo = schedule_timeout(timeo);
2858 	}
2859 	finish_wait(sk_sleep(sk), &wait);
2860 	return timeo;
2861 }
2862 
2863 
2864 /*
2865  *	Generic send/receive buffer handlers
2866  */
2867 
2868 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2869 				     unsigned long data_len, int noblock,
2870 				     int *errcode, int max_page_order)
2871 {
2872 	struct sk_buff *skb;
2873 	long timeo;
2874 	int err;
2875 
2876 	timeo = sock_sndtimeo(sk, noblock);
2877 	for (;;) {
2878 		err = sock_error(sk);
2879 		if (err != 0)
2880 			goto failure;
2881 
2882 		err = -EPIPE;
2883 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2884 			goto failure;
2885 
2886 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2887 			break;
2888 
2889 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2890 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2891 		err = -EAGAIN;
2892 		if (!timeo)
2893 			goto failure;
2894 		if (signal_pending(current))
2895 			goto interrupted;
2896 		timeo = sock_wait_for_wmem(sk, timeo);
2897 	}
2898 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2899 				   errcode, sk->sk_allocation);
2900 	if (skb)
2901 		skb_set_owner_w(skb, sk);
2902 	return skb;
2903 
2904 interrupted:
2905 	err = sock_intr_errno(timeo);
2906 failure:
2907 	*errcode = err;
2908 	return NULL;
2909 }
2910 EXPORT_SYMBOL(sock_alloc_send_pskb);
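
/* Sendmsg-side sketch: a datagram protocol typically reserves room for its
 * headers and lets this helper enforce sndbuf limits, blocking and signal
 * handling (hlen and dlen are illustrative local variables):
 *
 *	skb = sock_alloc_send_pskb(sk, hlen + dlen, 0,
 *				   msg->msg_flags & MSG_DONTWAIT, &err, 0);
 *	if (!skb)
 *		return err;
 *	skb_reserve(skb, hlen);
 *	err = memcpy_from_msg(skb_put(skb, dlen), msg, dlen);
 */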
2911 
2912 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2913 		     struct sockcm_cookie *sockc)
2914 {
2915 	u32 tsflags;
2916 
2917 	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
2918 
2919 	switch (cmsg->cmsg_type) {
2920 	case SO_MARK:
2921 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2922 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2923 			return -EPERM;
2924 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2925 			return -EINVAL;
2926 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2927 		break;
2928 	case SO_TIMESTAMPING_OLD:
2929 	case SO_TIMESTAMPING_NEW:
2930 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2931 			return -EINVAL;
2932 
2933 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2934 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2935 			return -EINVAL;
2936 
2937 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2938 		sockc->tsflags |= tsflags;
2939 		break;
2940 	case SCM_TXTIME:
2941 		if (!sock_flag(sk, SOCK_TXTIME))
2942 			return -EINVAL;
2943 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2944 			return -EINVAL;
2945 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2946 		break;
2947 	case SCM_TS_OPT_ID:
2948 		if (sk_is_tcp(sk))
2949 			return -EINVAL;
2950 		tsflags = READ_ONCE(sk->sk_tsflags);
2951 		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
2952 			return -EINVAL;
2953 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2954 			return -EINVAL;
2955 		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
2956 		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
2957 		break;
2958 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2959 	case SCM_RIGHTS:
2960 	case SCM_CREDENTIALS:
2961 		break;
2962 	case SO_PRIORITY:
2963 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2964 			return -EINVAL;
2965 		if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
2966 			return -EPERM;
2967 		sockc->priority = *(u32 *)CMSG_DATA(cmsg);
2968 		break;
2969 	default:
2970 		return -EINVAL;
2971 	}
2972 	return 0;
2973 }
2974 EXPORT_SYMBOL(__sock_cmsg_send);
2975 
2976 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2977 		   struct sockcm_cookie *sockc)
2978 {
2979 	struct cmsghdr *cmsg;
2980 	int ret;
2981 
2982 	for_each_cmsghdr(cmsg, msg) {
2983 		if (!CMSG_OK(msg, cmsg))
2984 			return -EINVAL;
2985 		if (cmsg->cmsg_level != SOL_SOCKET)
2986 			continue;
2987 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2988 		if (ret)
2989 			return ret;
2990 	}
2991 	return 0;
2992 }
2993 EXPORT_SYMBOL(sock_cmsg_send);
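
/* Userspace-side sketch of what this parser consumes: an SOL_SOCKET/SO_MARK
 * control message attached to sendmsg() carries a u32 mark (and needs
 * CAP_NET_ADMIN or CAP_NET_RAW, as checked in __sock_cmsg_send()); the data
 * iovec and the uint32_t mark value are elided:
 *
 *	char cbuf[CMSG_SPACE(sizeof(uint32_t))] = {};
 *	struct msghdr msg = { .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type  = SO_MARK;
 *	cm->cmsg_len   = CMSG_LEN(sizeof(uint32_t));
 *	memcpy(CMSG_DATA(cm), &mark, sizeof(uint32_t));
 *	sendmsg(fd, &msg, 0);
 */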
2994 
2995 static void sk_enter_memory_pressure(struct sock *sk)
2996 {
2997 	if (!sk->sk_prot->enter_memory_pressure)
2998 		return;
2999 
3000 	sk->sk_prot->enter_memory_pressure(sk);
3001 }
3002 
3003 static void sk_leave_memory_pressure(struct sock *sk)
3004 {
3005 	if (sk->sk_prot->leave_memory_pressure) {
3006 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
3007 				     tcp_leave_memory_pressure, sk);
3008 	} else {
3009 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
3010 
3011 		if (memory_pressure && READ_ONCE(*memory_pressure))
3012 			WRITE_ONCE(*memory_pressure, 0);
3013 	}
3014 }
3015 
3016 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
3017 
3018 /**
3019  * skb_page_frag_refill - check that a page_frag contains enough room
3020  * @sz: minimum size of the fragment we want to get
3021  * @pfrag: pointer to page_frag
3022  * @gfp: priority for memory allocation
3023  *
3024  * Note: While this allocator tries to use high order pages, there is
3025  * no guarantee that allocations succeed. Therefore, @sz MUST be
3026  * less than or equal to PAGE_SIZE.
3027  */
3028 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3029 {
3030 	if (pfrag->page) {
3031 		if (page_ref_count(pfrag->page) == 1) {
3032 			pfrag->offset = 0;
3033 			return true;
3034 		}
3035 		if (pfrag->offset + sz <= pfrag->size)
3036 			return true;
3037 		put_page(pfrag->page);
3038 	}
3039 
3040 	pfrag->offset = 0;
3041 	if (SKB_FRAG_PAGE_ORDER &&
3042 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3043 		/* Avoid direct reclaim but allow kswapd to wake */
3044 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3045 					  __GFP_COMP | __GFP_NOWARN |
3046 					  __GFP_NORETRY,
3047 					  SKB_FRAG_PAGE_ORDER);
3048 		if (likely(pfrag->page)) {
3049 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3050 			return true;
3051 		}
3052 	}
3053 	pfrag->page = alloc_page(gfp);
3054 	if (likely(pfrag->page)) {
3055 		pfrag->size = PAGE_SIZE;
3056 		return true;
3057 	}
3058 	return false;
3059 }
3060 EXPORT_SYMBOL(skb_page_frag_refill);
3061 
3062 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3063 {
3064 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3065 		return true;
3066 
3067 	sk_enter_memory_pressure(sk);
3068 	sk_stream_moderate_sndbuf(sk);
3069 	return false;
3070 }
3071 EXPORT_SYMBOL(sk_page_frag_refill);
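
/* Usage sketch: stream protocols fill the per-socket (or per-task) page_frag
 * returned by sk_page_frag() and only advance the offset once the data has
 * been copied in; the copy step is shown schematically:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, msg_data_left(msg), pfrag->size - pfrag->offset);
 *	... copy "copy" bytes to page_address(pfrag->page) + pfrag->offset ...
 *	pfrag->offset += copy;
 */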
3072 
3073 void __lock_sock(struct sock *sk)
3074 	__releases(&sk->sk_lock.slock)
3075 	__acquires(&sk->sk_lock.slock)
3076 {
3077 	DEFINE_WAIT(wait);
3078 
3079 	for (;;) {
3080 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3081 					TASK_UNINTERRUPTIBLE);
3082 		spin_unlock_bh(&sk->sk_lock.slock);
3083 		schedule();
3084 		spin_lock_bh(&sk->sk_lock.slock);
3085 		if (!sock_owned_by_user(sk))
3086 			break;
3087 	}
3088 	finish_wait(&sk->sk_lock.wq, &wait);
3089 }
3090 
3091 void __release_sock(struct sock *sk)
3092 	__releases(&sk->sk_lock.slock)
3093 	__acquires(&sk->sk_lock.slock)
3094 {
3095 	struct sk_buff *skb, *next;
3096 
3097 	while ((skb = sk->sk_backlog.head) != NULL) {
3098 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3099 
3100 		spin_unlock_bh(&sk->sk_lock.slock);
3101 
3102 		do {
3103 			next = skb->next;
3104 			prefetch(next);
3105 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3106 			skb_mark_not_on_list(skb);
3107 			sk_backlog_rcv(sk, skb);
3108 
3109 			cond_resched();
3110 
3111 			skb = next;
3112 		} while (skb != NULL);
3113 
3114 		spin_lock_bh(&sk->sk_lock.slock);
3115 	}
3116 
3117 	/*
3118 	 * Doing the zeroing here guarantees we cannot loop forever
3119 	 * while a wild producer attempts to flood us.
3120 	 */
3121 	sk->sk_backlog.len = 0;
3122 }
3123 
3124 void __sk_flush_backlog(struct sock *sk)
3125 {
3126 	spin_lock_bh(&sk->sk_lock.slock);
3127 	__release_sock(sk);
3128 
3129 	if (sk->sk_prot->release_cb)
3130 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3131 				     tcp_release_cb, sk);
3132 
3133 	spin_unlock_bh(&sk->sk_lock.slock);
3134 }
3135 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3136 
3137 /**
3138  * sk_wait_data - wait for data to arrive at sk_receive_queue
3139  * @sk:    sock to wait on
3140  * @timeo: for how long
3141  * @skb:   last skb seen on sk_receive_queue
3142  *
3143  * Socket state, including sk->sk_err, is changed only under the socket
3144  * lock, hence we may omit checks after joining the wait queue.
3145  * We check the receive queue before schedule() only as an optimization;
3146  * it is very likely that release_sock() added new data.
3147  */
3148 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3149 {
3150 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3151 	int rc;
3152 
3153 	add_wait_queue(sk_sleep(sk), &wait);
3154 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3155 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3156 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3157 	remove_wait_queue(sk_sleep(sk), &wait);
3158 	return rc;
3159 }
3160 EXPORT_SYMBOL(sk_wait_data);
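
/* Receive-side sketch: callers hold the socket lock and loop until data
 * arrives, the timeout expires or a signal is pending (real protocols also
 * check sk_err, sk_shutdown and sk_state here):
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	lock_sock(sk);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 *	release_sock(sk);
 */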
3161 
3162 /**
3163  *	__sk_mem_raise_allocated - increase memory_allocated
3164  *	@sk: socket
3165  *	@size: memory size to allocate
3166  *	@amt: pages to allocate
3167  *	@kind: allocation type
3168  *
3169  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3170  *
3171  *	Unlike the globally shared limits among sockets under the same protocol,
3172  *	consuming the budget of a memcg has no direct effect on other memcgs.
3173  *	So be optimistic about the memcg's tolerance, and leave it to the callers
3174  *	to decide whether or not to raise allocated, via sk_under_memory_pressure() or
3175  *	its variants.
3176  */
3177 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3178 {
3179 	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3180 	struct proto *prot = sk->sk_prot;
3181 	bool charged = false;
3182 	long allocated;
3183 
3184 	sk_memory_allocated_add(sk, amt);
3185 	allocated = sk_memory_allocated(sk);
3186 
3187 	if (memcg) {
3188 		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3189 			goto suppress_allocation;
3190 		charged = true;
3191 	}
3192 
3193 	/* Under limit. */
3194 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3195 		sk_leave_memory_pressure(sk);
3196 		return 1;
3197 	}
3198 
3199 	/* Under pressure. */
3200 	if (allocated > sk_prot_mem_limits(sk, 1))
3201 		sk_enter_memory_pressure(sk);
3202 
3203 	/* Over hard limit. */
3204 	if (allocated > sk_prot_mem_limits(sk, 2))
3205 		goto suppress_allocation;
3206 
3207 	/* Guarantee minimum buffer size under pressure (either global
3208 	 * or memcg) to make sure features described in RFC 7323 (TCP
3209 	 * Extensions for High Performance) work properly.
3210 	 *
3211 	 * This rule does NOT apply when we exceed the global or memcg hard
3212 	 * limit, or else a DoS attack could take place by spawning
3213 	 * lots of sockets whose usage stays under the minimum buffer size.
3214 	 */
3215 	if (kind == SK_MEM_RECV) {
3216 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3217 			return 1;
3218 
3219 	} else { /* SK_MEM_SEND */
3220 		int wmem0 = sk_get_wmem0(sk, prot);
3221 
3222 		if (sk->sk_type == SOCK_STREAM) {
3223 			if (sk->sk_wmem_queued < wmem0)
3224 				return 1;
3225 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3226 				return 1;
3227 		}
3228 	}
3229 
3230 	if (sk_has_memory_pressure(sk)) {
3231 		u64 alloc;
3232 
3233 		/* The following 'average' heuristic is within the
3234 		 * scope of global accounting, so it only makes
3235 		 * sense for global memory pressure.
3236 		 */
3237 		if (!sk_under_global_memory_pressure(sk))
3238 			return 1;
3239 
3240 		/* Try to be fair among all the sockets under global
3241 		 * pressure by allowing the ones whose usage is below
3242 		 * average to raise it.
3243 		 */
3244 		alloc = sk_sockets_allocated_read_positive(sk);
3245 		if (sk_prot_mem_limits(sk, 2) > alloc *
3246 		    sk_mem_pages(sk->sk_wmem_queued +
3247 				 atomic_read(&sk->sk_rmem_alloc) +
3248 				 sk->sk_forward_alloc))
3249 			return 1;
3250 	}
3251 
3252 suppress_allocation:
3253 
3254 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3255 		sk_stream_moderate_sndbuf(sk);
3256 
3257 		/* Fail only if socket is _under_ its sndbuf.
3258 		 * In this case we cannot block, so we have to fail.
3259 		 */
3260 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3261 			/* Force charge with __GFP_NOFAIL */
3262 			if (memcg && !charged) {
3263 				mem_cgroup_charge_skmem(memcg, amt,
3264 					gfp_memcg_charge() | __GFP_NOFAIL);
3265 			}
3266 			return 1;
3267 		}
3268 	}
3269 
3270 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3271 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3272 
3273 	sk_memory_allocated_sub(sk, amt);
3274 
3275 	if (charged)
3276 		mem_cgroup_uncharge_skmem(memcg, amt);
3277 
3278 	return 0;
3279 }
3280 
3281 /**
3282  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3283  *	@sk: socket
3284  *	@size: memory size to allocate
3285  *	@kind: allocation type
3286  *
3287  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3288  *	rmem allocation. This function assumes that protocols which have
3289  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3290  */
3291 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3292 {
3293 	int ret, amt = sk_mem_pages(size);
3294 
3295 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3296 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3297 	if (!ret)
3298 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3299 	return ret;
3300 }
3301 EXPORT_SYMBOL(__sk_mem_schedule);
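
/* Accounting sketch: a receive path usually charges through the
 * sk_rmem_schedule() wrapper from net/sock.h before queueing an skb; the
 * charge flows back via sk_mem_uncharge()/__sk_mem_reclaim() when the skb
 * destructor runs:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		return -ENOBUFS;
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 */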
3302 
3303 /**
3304  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3305  *	@sk: socket
3306  *	@amount: number of quanta
3307  *
3308  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3309  */
3310 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3311 {
3312 	sk_memory_allocated_sub(sk, amount);
3313 
3314 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3315 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3316 
3317 	if (sk_under_global_memory_pressure(sk) &&
3318 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3319 		sk_leave_memory_pressure(sk);
3320 }
3321 
3322 /**
3323  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3324  *	@sk: socket
3325  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3326  */
3327 void __sk_mem_reclaim(struct sock *sk, int amount)
3328 {
3329 	amount >>= PAGE_SHIFT;
3330 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3331 	__sk_mem_reduce_allocated(sk, amount);
3332 }
3333 EXPORT_SYMBOL(__sk_mem_reclaim);
3334 
3335 int sk_set_peek_off(struct sock *sk, int val)
3336 {
3337 	WRITE_ONCE(sk->sk_peek_off, val);
3338 	return 0;
3339 }
3340 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3341 
3342 /*
3343  * Set of default routines for initialising struct proto_ops when
3344  * the protocol does not support a particular function. In certain
3345  * cases where it makes no sense for a protocol to have a "do nothing"
3346  * function, some default processing is provided.
3347  */
3348 
3349 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3350 {
3351 	return -EOPNOTSUPP;
3352 }
3353 EXPORT_SYMBOL(sock_no_bind);
3354 
3355 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3356 		    int len, int flags)
3357 {
3358 	return -EOPNOTSUPP;
3359 }
3360 EXPORT_SYMBOL(sock_no_connect);
3361 
3362 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3363 {
3364 	return -EOPNOTSUPP;
3365 }
3366 EXPORT_SYMBOL(sock_no_socketpair);
3367 
3368 int sock_no_accept(struct socket *sock, struct socket *newsock,
3369 		   struct proto_accept_arg *arg)
3370 {
3371 	return -EOPNOTSUPP;
3372 }
3373 EXPORT_SYMBOL(sock_no_accept);
3374 
3375 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3376 		    int peer)
3377 {
3378 	return -EOPNOTSUPP;
3379 }
3380 EXPORT_SYMBOL(sock_no_getname);
3381 
3382 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3383 {
3384 	return -EOPNOTSUPP;
3385 }
3386 EXPORT_SYMBOL(sock_no_ioctl);
3387 
3388 int sock_no_listen(struct socket *sock, int backlog)
3389 {
3390 	return -EOPNOTSUPP;
3391 }
3392 EXPORT_SYMBOL(sock_no_listen);
3393 
3394 int sock_no_shutdown(struct socket *sock, int how)
3395 {
3396 	return -EOPNOTSUPP;
3397 }
3398 EXPORT_SYMBOL(sock_no_shutdown);
3399 
3400 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3401 {
3402 	return -EOPNOTSUPP;
3403 }
3404 EXPORT_SYMBOL(sock_no_sendmsg);
3405 
3406 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3407 {
3408 	return -EOPNOTSUPP;
3409 }
3410 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3411 
3412 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3413 		    int flags)
3414 {
3415 	return -EOPNOTSUPP;
3416 }
3417 EXPORT_SYMBOL(sock_no_recvmsg);
3418 
3419 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3420 {
3421 	/* Mirror missing mmap method error code */
3422 	return -ENODEV;
3423 }
3424 EXPORT_SYMBOL(sock_no_mmap);
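
/* Sketch: a proto_ops table for a family that only supports a subset of the
 * socket calls plugs the remaining entries with these stubs (example_ops and
 * the handlers it keeps are placeholders):
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *		...
 *	};
 */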
3425 
3426 /*
3427  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3428  * various sock-based usage counts.
3429  */
3430 void __receive_sock(struct file *file)
3431 {
3432 	struct socket *sock;
3433 
3434 	sock = sock_from_file(file);
3435 	if (sock) {
3436 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3437 		sock_update_classid(&sock->sk->sk_cgrp_data);
3438 	}
3439 }
3440 
3441 /*
3442  *	Default Socket Callbacks
3443  */
3444 
3445 static void sock_def_wakeup(struct sock *sk)
3446 {
3447 	struct socket_wq *wq;
3448 
3449 	rcu_read_lock();
3450 	wq = rcu_dereference(sk->sk_wq);
3451 	if (skwq_has_sleeper(wq))
3452 		wake_up_interruptible_all(&wq->wait);
3453 	rcu_read_unlock();
3454 }
3455 
3456 static void sock_def_error_report(struct sock *sk)
3457 {
3458 	struct socket_wq *wq;
3459 
3460 	rcu_read_lock();
3461 	wq = rcu_dereference(sk->sk_wq);
3462 	if (skwq_has_sleeper(wq))
3463 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3464 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3465 	rcu_read_unlock();
3466 }
3467 
3468 void sock_def_readable(struct sock *sk)
3469 {
3470 	struct socket_wq *wq;
3471 
3472 	trace_sk_data_ready(sk);
3473 
3474 	rcu_read_lock();
3475 	wq = rcu_dereference(sk->sk_wq);
3476 	if (skwq_has_sleeper(wq))
3477 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3478 						EPOLLRDNORM | EPOLLRDBAND);
3479 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3480 	rcu_read_unlock();
3481 }
3482 
3483 static void sock_def_write_space(struct sock *sk)
3484 {
3485 	struct socket_wq *wq;
3486 
3487 	rcu_read_lock();
3488 
3489 	/* Do not wake up a writer until he can make "significant"
3490 	 * progress.  --DaveM
3491 	 */
3492 	if (sock_writeable(sk)) {
3493 		wq = rcu_dereference(sk->sk_wq);
3494 		if (skwq_has_sleeper(wq))
3495 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3496 						EPOLLWRNORM | EPOLLWRBAND);
3497 
3498 		/* Should agree with poll, otherwise some programs break */
3499 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3500 	}
3501 
3502 	rcu_read_unlock();
3503 }
3504 
3505 /* An optimised version of sock_def_write_space(); it should only be called
3506  * for SOCK_RCU_FREE sockets under an RCU read-side section and after putting
3507  * ->sk_wmem_alloc.
3508  */
3509 static void sock_def_write_space_wfree(struct sock *sk)
3510 {
3511 	/* Do not wake up a writer until he can make "significant"
3512 	 * progress.  --DaveM
3513 	 */
3514 	if (sock_writeable(sk)) {
3515 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3516 
3517 		/* rely on refcount_sub from sock_wfree() */
3518 		smp_mb__after_atomic();
3519 		if (wq && waitqueue_active(&wq->wait))
3520 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3521 						EPOLLWRNORM | EPOLLWRBAND);
3522 
3523 		/* Should agree with poll, otherwise some programs break */
3524 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3525 	}
3526 }
3527 
3528 static void sock_def_destruct(struct sock *sk)
3529 {
3530 }
3531 
3532 void sk_send_sigurg(struct sock *sk)
3533 {
3534 	if (sk->sk_socket && sk->sk_socket->file)
3535 		if (send_sigurg(sk->sk_socket->file))
3536 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3537 }
3538 EXPORT_SYMBOL(sk_send_sigurg);
3539 
3540 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3541 		    unsigned long expires)
3542 {
3543 	if (!mod_timer(timer, expires))
3544 		sock_hold(sk);
3545 }
3546 EXPORT_SYMBOL(sk_reset_timer);
3547 
3548 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3549 {
3550 	if (del_timer(timer))
3551 		__sock_put(sk);
3552 }
3553 EXPORT_SYMBOL(sk_stop_timer);
3554 
3555 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3556 {
3557 	if (del_timer_sync(timer))
3558 		__sock_put(sk);
3559 }
3560 EXPORT_SYMBOL(sk_stop_timer_sync);
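
/* Timer sketch: sk_reset_timer() takes a reference when it arms a timer that
 * was not already pending, and sk_stop_timer() (or the expiry handler, via
 * sock_put()) drops it again, so the sock cannot be freed under a pending
 * timer ("delay" is an illustrative value):
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 */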
3561 
3562 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3563 {
3564 	sk_init_common(sk);
3565 	sk->sk_send_head	=	NULL;
3566 
3567 	timer_setup(&sk->sk_timer, NULL, 0);
3568 
3569 	sk->sk_allocation	=	GFP_KERNEL;
3570 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3571 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3572 	sk->sk_state		=	TCP_CLOSE;
3573 	sk->sk_use_task_frag	=	true;
3574 	sk_set_socket(sk, sock);
3575 
3576 	sock_set_flag(sk, SOCK_ZAPPED);
3577 
3578 	if (sock) {
3579 		sk->sk_type	=	sock->type;
3580 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3581 		sock->sk	=	sk;
3582 	} else {
3583 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3584 	}
3585 	sk->sk_uid	=	uid;
3586 
3587 	sk->sk_state_change	=	sock_def_wakeup;
3588 	sk->sk_data_ready	=	sock_def_readable;
3589 	sk->sk_write_space	=	sock_def_write_space;
3590 	sk->sk_error_report	=	sock_def_error_report;
3591 	sk->sk_destruct		=	sock_def_destruct;
3592 
3593 	sk->sk_frag.page	=	NULL;
3594 	sk->sk_frag.offset	=	0;
3595 	sk->sk_peek_off		=	-1;
3596 
3597 	sk->sk_peer_pid 	=	NULL;
3598 	sk->sk_peer_cred	=	NULL;
3599 	spin_lock_init(&sk->sk_peer_lock);
3600 
3601 	sk->sk_write_pending	=	0;
3602 	sk->sk_rcvlowat		=	1;
3603 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3604 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3605 
3606 	sk->sk_stamp = SK_DEFAULT_STAMP;
3607 #if BITS_PER_LONG==32
3608 	seqlock_init(&sk->sk_stamp_seq);
3609 #endif
3610 	atomic_set(&sk->sk_zckey, 0);
3611 
3612 #ifdef CONFIG_NET_RX_BUSY_POLL
3613 	sk->sk_napi_id		=	0;
3614 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3615 #endif
3616 
3617 	sk->sk_max_pacing_rate = ~0UL;
3618 	sk->sk_pacing_rate = ~0UL;
3619 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3620 	sk->sk_incoming_cpu = -1;
3621 
3622 	sk_rx_queue_clear(sk);
3623 	/*
3624 	 * Before updating sk_refcnt, we must commit prior changes to memory
3625 	 * (Documentation/RCU/rculist_nulls.rst for details)
3626 	 */
3627 	smp_wmb();
3628 	refcount_set(&sk->sk_refcnt, 1);
3629 	atomic_set(&sk->sk_drops, 0);
3630 }
3631 EXPORT_SYMBOL(sock_init_data_uid);
3632 
3633 void sock_init_data(struct socket *sock, struct sock *sk)
3634 {
3635 	kuid_t uid = sock ?
3636 		SOCK_INODE(sock)->i_uid :
3637 		make_kuid(sock_net(sk)->user_ns, 0);
3638 
3639 	sock_init_data_uid(sock, sk, uid);
3640 }
3641 EXPORT_SYMBOL(sock_init_data);
3642 
3643 void lock_sock_nested(struct sock *sk, int subclass)
3644 {
3645 	/* The sk_lock has mutex_lock() semantics here. */
3646 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3647 
3648 	might_sleep();
3649 	spin_lock_bh(&sk->sk_lock.slock);
3650 	if (sock_owned_by_user_nocheck(sk))
3651 		__lock_sock(sk);
3652 	sk->sk_lock.owned = 1;
3653 	spin_unlock_bh(&sk->sk_lock.slock);
3654 }
3655 EXPORT_SYMBOL(lock_sock_nested);
3656 
3657 void release_sock(struct sock *sk)
3658 {
3659 	spin_lock_bh(&sk->sk_lock.slock);
3660 	if (sk->sk_backlog.tail)
3661 		__release_sock(sk);
3662 
3663 	if (sk->sk_prot->release_cb)
3664 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3665 				     tcp_release_cb, sk);
3666 
3667 	sock_release_ownership(sk);
3668 	if (waitqueue_active(&sk->sk_lock.wq))
3669 		wake_up(&sk->sk_lock.wq);
3670 	spin_unlock_bh(&sk->sk_lock.slock);
3671 }
3672 EXPORT_SYMBOL(release_sock);
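
/* Locking sketch: process-context socket code brackets its work with
 * lock_sock()/release_sock(); packets that arrived in the meantime sit on
 * the backlog and are processed by __release_sock() on the way out:
 *
 *	lock_sock(sk);
 *	... modify sk state, e.g. a setsockopt or sendmsg body ...
 *	release_sock(sk);
 */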
3673 
3674 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3675 {
3676 	might_sleep();
3677 	spin_lock_bh(&sk->sk_lock.slock);
3678 
3679 	if (!sock_owned_by_user_nocheck(sk)) {
3680 		/*
3681 		 * Fast path return with bottom halves disabled and
3682 		 * sock::sk_lock.slock held.
3683 		 *
3684 		 * The 'mutex' is not contended and holding
3685 		 * sock::sk_lock.slock prevents all other lockers from
3686 		 * proceeding, so the corresponding unlock_sock_fast() can
3687 		 * avoid the slow path of release_sock() completely and
3688 		 * just release slock.
3689 		 *
3690 		 * From a semantic POV this is equivalent to 'acquiring'
3691 		 * the 'mutex', hence the corresponding lockdep
3692 		 * mutex_release() has to happen in the fast path of
3693 		 * unlock_sock_fast().
3694 		 */
3695 		return false;
3696 	}
3697 
3698 	__lock_sock(sk);
3699 	sk->sk_lock.owned = 1;
3700 	__acquire(&sk->sk_lock.slock);
3701 	spin_unlock_bh(&sk->sk_lock.slock);
3702 	return true;
3703 }
3704 EXPORT_SYMBOL(__lock_sock_fast);
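
/* Fast-lock sketch, using the lock_sock_fast()/unlock_sock_fast() wrappers
 * from net/sock.h that are built on top of __lock_sock_fast():
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... short, non-sleeping critical section ...
 *	unlock_sock_fast(sk, slow);
 */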
3705 
3706 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3707 		   bool timeval, bool time32)
3708 {
3709 	struct sock *sk = sock->sk;
3710 	struct timespec64 ts;
3711 
3712 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3713 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3714 	if (ts.tv_sec == -1)
3715 		return -ENOENT;
3716 	if (ts.tv_sec == 0) {
3717 		ktime_t kt = ktime_get_real();
3718 		sock_write_timestamp(sk, kt);
3719 		ts = ktime_to_timespec64(kt);
3720 	}
3721 
3722 	if (timeval)
3723 		ts.tv_nsec /= 1000;
3724 
3725 #ifdef CONFIG_COMPAT_32BIT_TIME
3726 	if (time32)
3727 		return put_old_timespec32(&ts, userstamp);
3728 #endif
3729 #ifdef CONFIG_SPARC64
3730 	/* beware of padding in sparc64 timeval */
3731 	if (timeval && !in_compat_syscall()) {
3732 		struct __kernel_old_timeval __user tv = {
3733 			.tv_sec = ts.tv_sec,
3734 			.tv_usec = ts.tv_nsec,
3735 		};
3736 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3737 			return -EFAULT;
3738 		return 0;
3739 	}
3740 #endif
3741 	return put_timespec64(&ts, userstamp);
3742 }
3743 EXPORT_SYMBOL(sock_gettstamp);
3744 
3745 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3746 {
3747 	if (!sock_flag(sk, flag)) {
3748 		unsigned long previous_flags = sk->sk_flags;
3749 
3750 		sock_set_flag(sk, flag);
3751 		/*
3752 		 * we just set one of the two flags which require net
3753 		 * time stamping, but time stamping might have been on
3754 		 * already because of the other one
3755 		 */
3756 		if (sock_needs_netstamp(sk) &&
3757 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3758 			net_enable_timestamp();
3759 	}
3760 }
3761 
3762 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3763 		       int level, int type)
3764 {
3765 	struct sock_exterr_skb *serr;
3766 	struct sk_buff *skb;
3767 	int copied, err;
3768 
3769 	err = -EAGAIN;
3770 	skb = sock_dequeue_err_skb(sk);
3771 	if (skb == NULL)
3772 		goto out;
3773 
3774 	copied = skb->len;
3775 	if (copied > len) {
3776 		msg->msg_flags |= MSG_TRUNC;
3777 		copied = len;
3778 	}
3779 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3780 	if (err)
3781 		goto out_free_skb;
3782 
3783 	sock_recv_timestamp(msg, sk, skb);
3784 
3785 	serr = SKB_EXT_ERR(skb);
3786 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3787 
3788 	msg->msg_flags |= MSG_ERRQUEUE;
3789 	err = copied;
3790 
3791 out_free_skb:
3792 	kfree_skb(skb);
3793 out:
3794 	return err;
3795 }
3796 EXPORT_SYMBOL(sock_recv_errqueue);
3797 
3798 /*
3799  *	Get a socket option on a socket.
3800  *
3801  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3802  *	asynchronous errors should be reported by getsockopt. We assume
3803  *	this means if you specify SO_ERROR (otherwise what is the point of it).
3804  */
3805 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3806 			   char __user *optval, int __user *optlen)
3807 {
3808 	struct sock *sk = sock->sk;
3809 
3810 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3811 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3812 }
3813 EXPORT_SYMBOL(sock_common_getsockopt);
3814 
3815 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3816 			int flags)
3817 {
3818 	struct sock *sk = sock->sk;
3819 	int addr_len = 0;
3820 	int err;
3821 
3822 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3823 	if (err >= 0)
3824 		msg->msg_namelen = addr_len;
3825 	return err;
3826 }
3827 EXPORT_SYMBOL(sock_common_recvmsg);
3828 
3829 /*
3830  *	Set socket options on an inet socket.
3831  */
3832 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3833 			   sockptr_t optval, unsigned int optlen)
3834 {
3835 	struct sock *sk = sock->sk;
3836 
3837 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3838 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3839 }
3840 EXPORT_SYMBOL(sock_common_setsockopt);
3841 
3842 void sk_common_release(struct sock *sk)
3843 {
3844 	if (sk->sk_prot->destroy)
3845 		sk->sk_prot->destroy(sk);
3846 
3847 	/*
3848 	 * Observation: when sk_common_release is called, processes have
3849 	 * no access to the socket, but the network stack still does.
3850 	 * Step one, detach it from networking:
3851 	 *
3852 	 * A. Remove from hash tables.
3853 	 */
3854 
3855 	sk->sk_prot->unhash(sk);
3856 
3857 	/*
3858 	 * At this point the socket cannot receive new packets, but it is possible
3859 	 * that some packets are still in flight because some CPU ran the receiver
3860 	 * and did the hash table lookup before we unhashed the socket. They will
3861 	 * reach the receive queue and be purged by the socket destructor.
3862 	 *
3863 	 * Also, we still have packets pending on the receive queue and probably
3864 	 * our own packets waiting in device queues. sock_destroy will drain the
3865 	 * receive queue, but transmitted packets will delay socket destruction
3866 	 * until the last reference is released.
3867 	 */
3868 
3869 	sock_orphan(sk);
3870 
3871 	xfrm_sk_free_policy(sk);
3872 
3873 	sock_put(sk);
3874 }
3875 EXPORT_SYMBOL(sk_common_release);
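
/*
 * Illustrative sketch: a protocol that needs no extra teardown can point
 * its .close handler straight at sk_common_release(); the foo_ names are
 * invented for the example.
 *
 *	static void foo_close(struct sock *sk, long timeout)
 *	{
 *		sk_common_release(sk);
 *	}
 */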
3876 
3877 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3878 {
3879 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3880 
3881 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3882 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3883 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3884 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3885 	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3886 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3887 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3888 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3889 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3890 }
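
/*
 * Illustrative sketch: diagnostic code typically dumps this array as one
 * netlink attribute, along the lines of
 *
 *	u32 mem[SK_MEMINFO_VARS];
 *
 *	sk_get_meminfo(sk, mem);
 *	err = nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), mem);
 *
 * (the attribute type is up to the caller).
 */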
3891 
3892 #ifdef CONFIG_PROC_FS
3893 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3894 
3895 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3896 {
3897 	int cpu, idx = prot->inuse_idx;
3898 	int res = 0;
3899 
3900 	for_each_possible_cpu(cpu)
3901 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3902 
3903 	return res >= 0 ? res : 0;
3904 }
3905 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3906 
3907 int sock_inuse_get(struct net *net)
3908 {
3909 	int cpu, res = 0;
3910 
3911 	for_each_possible_cpu(cpu)
3912 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3913 
3914 	return res;
3915 }
3917 EXPORT_SYMBOL_GPL(sock_inuse_get);
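
/*
 * Illustrative sketch: protocols feed these per-cpu counters through
 * sock_prot_inuse_add(), typically from their hash/unhash paths:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	on hash
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	on unhash
 */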
3918 
3919 static int __net_init sock_inuse_init_net(struct net *net)
3920 {
3921 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3922 	if (net->core.prot_inuse == NULL)
3923 		return -ENOMEM;
3924 	return 0;
3925 }
3926 
3927 static void __net_exit sock_inuse_exit_net(struct net *net)
3928 {
3929 	free_percpu(net->core.prot_inuse);
3930 }
3931 
3932 static struct pernet_operations net_inuse_ops = {
3933 	.init = sock_inuse_init_net,
3934 	.exit = sock_inuse_exit_net,
3935 };
3936 
3937 static __init int net_inuse_init(void)
3938 {
3939 	if (register_pernet_subsys(&net_inuse_ops))
3940 		panic("Cannot initialize net inuse counters");
3941 
3942 	return 0;
3943 }
3944 
3945 core_initcall(net_inuse_init);
3946 
3947 static int assign_proto_idx(struct proto *prot)
3948 {
3949 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3950 
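	/* The last index is deliberately left unused as an "out of slots"
	 * sentinel: if the first free bit is that one, report exhaustion
	 * without setting it (release_proto_idx() skips it likewise).
	 */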
3951 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3952 		pr_err("PROTO_INUSE_NR exhausted\n");
3953 		return -ENOSPC;
3954 	}
3955 
3956 	set_bit(prot->inuse_idx, proto_inuse_idx);
3957 	return 0;
3958 }
3959 
3960 static void release_proto_idx(struct proto *prot)
3961 {
3962 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3963 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3964 }
3965 #else
3966 static inline int assign_proto_idx(struct proto *prot)
3967 {
3968 	return 0;
3969 }
3970 
3971 static inline void release_proto_idx(struct proto *prot)
3972 {
3973 }
3974 
3975 #endif
3976 
3977 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3978 {
3979 	if (!twsk_prot)
3980 		return;
3981 	kfree(twsk_prot->twsk_slab_name);
3982 	twsk_prot->twsk_slab_name = NULL;
3983 	kmem_cache_destroy(twsk_prot->twsk_slab);
3984 	twsk_prot->twsk_slab = NULL;
3985 }
3986 
3987 static int tw_prot_init(const struct proto *prot)
3988 {
3989 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3990 
3991 	if (!twsk_prot)
3992 		return 0;
3993 
3994 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3995 					      prot->name);
3996 	if (!twsk_prot->twsk_slab_name)
3997 		return -ENOMEM;
3998 
3999 	twsk_prot->twsk_slab =
4000 		kmem_cache_create(twsk_prot->twsk_slab_name,
4001 				  twsk_prot->twsk_obj_size, 0,
4002 				  SLAB_ACCOUNT | prot->slab_flags,
4003 				  NULL);
4004 	if (!twsk_prot->twsk_slab) {
4005 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
4006 			prot->name);
4007 		return -ENOMEM;
4008 	}
4009 
4010 	return 0;
4011 }
4012 
4013 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
4014 {
4015 	if (!rsk_prot)
4016 		return;
4017 	kfree(rsk_prot->slab_name);
4018 	rsk_prot->slab_name = NULL;
4019 	kmem_cache_destroy(rsk_prot->slab);
4020 	rsk_prot->slab = NULL;
4021 }
4022 
4023 static int req_prot_init(const struct proto *prot)
4024 {
4025 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
4026 
4027 	if (!rsk_prot)
4028 		return 0;
4029 
4030 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4031 					prot->name);
4032 	if (!rsk_prot->slab_name)
4033 		return -ENOMEM;
4034 
4035 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4036 					   rsk_prot->obj_size, 0,
4037 					   SLAB_ACCOUNT | prot->slab_flags,
4038 					   NULL);
4039 
4040 	if (!rsk_prot->slab) {
4041 		pr_crit("%s: Can't create request sock SLAB cache!\n",
4042 			prot->name);
4043 		return -ENOMEM;
4044 	}
4045 	return 0;
4046 }
4047 
4048 int proto_register(struct proto *prot, int alloc_slab)
4049 {
4050 	int ret = -ENOBUFS;
4051 
4052 	if (prot->memory_allocated && !prot->sysctl_mem) {
4053 		pr_err("%s: missing sysctl_mem\n", prot->name);
4054 		return -EINVAL;
4055 	}
4056 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4057 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4058 		return -EINVAL;
4059 	}
4060 	if (alloc_slab) {
4061 		prot->slab = kmem_cache_create_usercopy(prot->name,
4062 					prot->obj_size, 0,
4063 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4064 					prot->slab_flags,
4065 					prot->useroffset, prot->usersize,
4066 					NULL);
4067 
4068 		if (prot->slab == NULL) {
4069 			pr_crit("%s: Can't create sock SLAB cache!\n",
4070 				prot->name);
4071 			goto out;
4072 		}
4073 
4074 		if (req_prot_init(prot))
4075 			goto out_free_request_sock_slab;
4076 
4077 		if (tw_prot_init(prot))
4078 			goto out_free_timewait_sock_slab;
4079 	}
4080 
4081 	mutex_lock(&proto_list_mutex);
4082 	ret = assign_proto_idx(prot);
4083 	if (ret) {
4084 		mutex_unlock(&proto_list_mutex);
4085 		goto out_free_timewait_sock_slab;
4086 	}
4087 	list_add(&prot->node, &proto_list);
4088 	mutex_unlock(&proto_list_mutex);
4089 	return ret;
4090 
4091 out_free_timewait_sock_slab:
4092 	if (alloc_slab)
4093 		tw_prot_cleanup(prot->twsk_prot);
4094 out_free_request_sock_slab:
4095 	if (alloc_slab) {
4096 		req_prot_cleanup(prot->rsk_prot);
4097 
4098 		kmem_cache_destroy(prot->slab);
4099 		prot->slab = NULL;
4100 	}
4101 out:
4102 	return ret;
4103 }
4104 EXPORT_SYMBOL(proto_register);
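
/*
 * Illustrative sketch: a minimal protocol module registers itself roughly
 * like this (all foo_ identifiers are invented for the example):
 *
 *	static struct proto foo_prot = {
 *		.name		= "FOO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct foo_sock),
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return proto_register(&foo_prot, 1);
 *	}
 *
 * and undoes it with proto_unregister(&foo_prot) on module exit.
 */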
4105 
4106 void proto_unregister(struct proto *prot)
4107 {
4108 	mutex_lock(&proto_list_mutex);
4109 	release_proto_idx(prot);
4110 	list_del(&prot->node);
4111 	mutex_unlock(&proto_list_mutex);
4112 
4113 	kmem_cache_destroy(prot->slab);
4114 	prot->slab = NULL;
4115 
4116 	req_prot_cleanup(prot->rsk_prot);
4117 	tw_prot_cleanup(prot->twsk_prot);
4118 }
4119 EXPORT_SYMBOL(proto_unregister);
4120 
4121 int sock_load_diag_module(int family, int protocol)
4122 {
4123 	if (!protocol) {
4124 		if (!sock_is_registered(family))
4125 			return -ENOENT;
4126 
4127 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4128 				      NETLINK_SOCK_DIAG, family);
4129 	}
4130 
4131 #ifdef CONFIG_INET
4132 	if (family == AF_INET &&
4133 	    protocol != IPPROTO_RAW &&
4134 	    protocol < MAX_INET_PROTOS &&
4135 	    !rcu_access_pointer(inet_protos[protocol]))
4136 		return -ENOENT;
4137 #endif
4138 
4139 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4140 			      NETLINK_SOCK_DIAG, family, protocol);
4141 }
4142 EXPORT_SYMBOL(sock_load_diag_module);
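
/*
 * Worked example: for family == AF_INET (2) and protocol == IPPROTO_TCP (6),
 * with PF_NETLINK == 16 and NETLINK_SOCK_DIAG == 4, the call above becomes
 *
 *	request_module("net-pf-16-proto-4-type-2-6");
 *
 * i.e. the alias a TCP diag module would be expected to advertise.
 */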
4143 
4144 #ifdef CONFIG_PROC_FS
4145 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4146 	__acquires(proto_list_mutex)
4147 {
4148 	mutex_lock(&proto_list_mutex);
4149 	return seq_list_start_head(&proto_list, *pos);
4150 }
4151 
4152 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4153 {
4154 	return seq_list_next(v, &proto_list, pos);
4155 }
4156 
4157 static void proto_seq_stop(struct seq_file *seq, void *v)
4158 	__releases(proto_list_mutex)
4159 {
4160 	mutex_unlock(&proto_list_mutex);
4161 }
4162 
4163 static char proto_method_implemented(const void *method)
4164 {
4165 	return method == NULL ? 'n' : 'y';
4166 }
4167 static long sock_prot_memory_allocated(struct proto *proto)
4168 {
4169 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4170 }
4171 
4172 static const char *sock_prot_memory_pressure(struct proto *proto)
4173 {
4174 	return proto->memory_pressure != NULL ?
4175 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4176 }
4177 
4178 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4179 {
4181 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4182 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4183 		   proto->name,
4184 		   proto->obj_size,
4185 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4186 		   sock_prot_memory_allocated(proto),
4187 		   sock_prot_memory_pressure(proto),
4188 		   proto->max_header,
4189 		   proto->slab == NULL ? "no" : "yes",
4190 		   module_name(proto->owner),
4191 		   proto_method_implemented(proto->close),
4192 		   proto_method_implemented(proto->connect),
4193 		   proto_method_implemented(proto->disconnect),
4194 		   proto_method_implemented(proto->accept),
4195 		   proto_method_implemented(proto->ioctl),
4196 		   proto_method_implemented(proto->init),
4197 		   proto_method_implemented(proto->destroy),
4198 		   proto_method_implemented(proto->shutdown),
4199 		   proto_method_implemented(proto->setsockopt),
4200 		   proto_method_implemented(proto->getsockopt),
4201 		   proto_method_implemented(proto->sendmsg),
4202 		   proto_method_implemented(proto->recvmsg),
4203 		   proto_method_implemented(proto->bind),
4204 		   proto_method_implemented(proto->backlog_rcv),
4205 		   proto_method_implemented(proto->hash),
4206 		   proto_method_implemented(proto->unhash),
4207 		   proto_method_implemented(proto->get_port),
4208 		   proto_method_implemented(proto->enter_memory_pressure));
4209 }
4210 
4211 static int proto_seq_show(struct seq_file *seq, void *v)
4212 {
4213 	if (v == &proto_list)
4214 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4215 			   "protocol",
4216 			   "size",
4217 			   "sockets",
4218 			   "memory",
4219 			   "press",
4220 			   "maxhdr",
4221 			   "slab",
4222 			   "module",
4223 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4224 	else
4225 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4226 	return 0;
4227 }
4228 
4229 static const struct seq_operations proto_seq_ops = {
4230 	.start  = proto_seq_start,
4231 	.next   = proto_seq_next,
4232 	.stop   = proto_seq_stop,
4233 	.show   = proto_seq_show,
4234 };
4235 
4236 static __net_init int proto_init_net(struct net *net)
4237 {
4238 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4239 			sizeof(struct seq_net_private)))
4240 		return -ENOMEM;
4241 
4242 	return 0;
4243 }
4244 
4245 static __net_exit void proto_exit_net(struct net *net)
4246 {
4247 	remove_proc_entry("protocols", net->proc_net);
4248 }
4249 
4251 static __net_initdata struct pernet_operations proto_net_ops = {
4252 	.init = proto_init_net,
4253 	.exit = proto_exit_net,
4254 };
4255 
4256 static int __init proto_init(void)
4257 {
4258 	return register_pernet_subsys(&proto_net_ops);
4259 }
4260 
4261 subsys_initcall(proto_init);
4262 
4263 #endif /* PROC_FS */
4264 
4265 #ifdef CONFIG_NET_RX_BUSY_POLL
4266 bool sk_busy_loop_end(void *p, unsigned long start_time)
4267 {
4268 	struct sock *sk = p;
4269 
4270 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4271 		return true;
4272 
4273 	if (sk_is_udp(sk) &&
4274 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4275 		return true;
4276 
4277 	return sk_busy_loop_timeout(sk, start_time);
4278 }
4279 EXPORT_SYMBOL(sk_busy_loop_end);
4280 #endif /* CONFIG_NET_RX_BUSY_POLL */
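
/*
 * Illustrative userspace sketch (not kernel code): the busy loop that ends
 * in sk_busy_loop_end() is usually enabled per socket with SO_BUSY_POLL,
 * a budget in microseconds:
 *
 *	unsigned int usecs = 50;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs));
 */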
4281 
4282 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4283 {
4284 	if (!sk->sk_prot->bind_add)
4285 		return -EOPNOTSUPP;
4286 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4287 }
4288 EXPORT_SYMBOL(sock_bind_add);
4289 
4290 /* Copy 'size' bytes in from userspace, run the ioctl, then copy 'size' result bytes back to userspace */
4291 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4292 		     void __user *arg, void *karg, size_t size)
4293 {
4294 	int ret;
4295 
4296 	if (copy_from_user(karg, arg, size))
4297 		return -EFAULT;
4298 
4299 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4300 	if (ret)
4301 		return ret;
4302 
4303 	if (copy_to_user(arg, karg, size))
4304 		return -EFAULT;
4305 
4306 	return 0;
4307 }
4308 EXPORT_SYMBOL(sock_ioctl_inout);
4309 
4310 /* This is the most common ioctl prep function: the 4-byte result is copied
4311  * back to userspace if the ioctl() returns successfully. No input is copied
4312  * in from userspace.
4313  */
4314 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4315 {
4316 	int ret, karg = 0;
4317 
4318 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4319 	if (ret)
4320 		return ret;
4321 
4322 	return put_user(karg, (int __user *)arg);
4323 }
4324 
4325 /* A wrapper around sock ioctls, which copies the data from userspace
4326  * (depending on the protocol/ioctl), and copies back the result to userspace.
4327  * The main motivation for this function is to pass kernel memory to the
4328  * protocol ioctl callbacks, instead of userspace memory.
4329  */
4330 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4331 {
4332 	int rc = 1;
4333 
4334 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4335 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4336 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4337 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4338 	else if (sk_is_phonet(sk))
4339 		rc = phonet_sk_ioctl(sk, cmd, arg);
4340 
4341 	/* If one of the helpers above handled the ioctl, return its result */
4342 	if (rc <= 0)
4343 		return rc;
4344 
4345 	/* Otherwise call the default handler */
4346 	return sock_ioctl_out(sk, cmd, arg);
4347 }
4348 EXPORT_SYMBOL(sk_ioctl);
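
/*
 * Illustrative sketch: with this wrapper a protocol's ->ioctl() works on
 * kernel memory only; a SIOCINQ-style handler might look roughly like this
 * (foo_ names invented for the example):
 *
 *	static int foo_ioctl(struct sock *sk, int cmd, int *karg)
 *	{
 *		switch (cmd) {
 *		case SIOCINQ:
 *			*karg = foo_readable_bytes(sk);
 *			return 0;
 *		default:
 *			return -ENOIOCTLCMD;
 *		}
 *	}
 */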
4349 
4350 static int __init sock_struct_check(void)
4351 {
4352 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4353 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4354 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4355 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4356 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4357 
4358 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4359 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4360 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4361 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4362 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4363 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4364 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4365 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4366 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4367 
4368 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4369 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4370 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4371 
4372 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4373 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4374 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4375 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4376 
4377 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4379 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4380 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4381 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4382 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4383 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4384 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4385 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4386 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4387 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4388 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4389 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4390 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4391 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4392 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4393 
4394 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4395 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4396 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4397 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4398 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4399 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4400 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4401 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4402 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4403 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4404 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4405 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4406 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4407 	return 0;
4408 }
4409 
4410 core_initcall(sock_struct_check);
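
/*
 * Illustrative sketch: the assertions above pair with the
 * __cacheline_group_begin()/__cacheline_group_end() markers that are
 * expected to delimit the corresponding groups inside struct sock, e.g.
 *
 *	__cacheline_group_begin(sock_write_rx);
 *	atomic_t	sk_drops;
 *	...
 *	__cacheline_group_end(sock_write_rx);
 */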
4411