xref: /linux/net/core/sock.c (revision add452d09a38c7a7c44aea55c1015392cebf9fa7)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138 
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142 
143 #include <trace/events/sock.h>
144 
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148 
149 #include <linux/ethtool.h>
150 
151 #include "dev.h"
152 
/* Forward declarations of the two write-space callbacks; their definitions
 * are not part of this excerpt.
 */
static void sock_def_write_space(struct sock *sk);
static void sock_def_write_space_wfree(struct sock *sk);

/* File-local protocol list and its companion mutex.
 * NOTE(review): presumably the mutex serialises access to the list; the
 * users are outside this excerpt - confirm.
 */
static LIST_HEAD(proto_list);
static DEFINE_MUTEX(proto_list_mutex);
158 
159 /**
160  * sk_ns_capable - General socket capability test
161  * @sk: Socket to use a capability on or through
162  * @user_ns: The user namespace of the capability to use
163  * @cap: The capability to use
164  *
165  * Test to see if the opener of the socket had when the socket was
166  * created and the current process has the capability @cap in the user
167  * namespace @user_ns.
168  */
169 bool sk_ns_capable(const struct sock *sk,
170 		   struct user_namespace *user_ns, int cap)
171 {
172 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
173 		ns_capable(user_ns, cap);
174 }
175 EXPORT_SYMBOL(sk_ns_capable);
176 
177 /**
178  * sk_capable - Socket global capability test
179  * @sk: Socket to use a capability on or through
180  * @cap: The global capability to use
181  *
182  * Test to see if the opener of the socket had when the socket was
183  * created and the current process has the capability @cap in all user
184  * namespaces.
185  */
186 bool sk_capable(const struct sock *sk, int cap)
187 {
188 	return sk_ns_capable(sk, &init_user_ns, cap);
189 }
190 EXPORT_SYMBOL(sk_capable);
191 
192 /**
193  * sk_net_capable - Network namespace socket capability test
194  * @sk: Socket to use a capability on or through
195  * @cap: The capability to use
196  *
197  * Test to see if the opener of the socket had when the socket was created
198  * and the current process has the capability @cap over the network namespace
199  * the socket is a member of.
200  */
201 bool sk_net_capable(const struct sock *sk, int cap)
202 {
203 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
204 }
205 EXPORT_SYMBOL(sk_net_capable);
206 
207 /*
208  * Each address family might have different locking rules, so we have
209  * one slock key per address family and separate keys for internal and
210  * userspace sockets.
211  */
212 static struct lock_class_key af_family_keys[AF_MAX];
213 static struct lock_class_key af_family_kern_keys[AF_MAX];
214 static struct lock_class_key af_family_slock_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
216 
/*
 * Human-readable lock class names for the lock validator.  The strings
 * are assembled at build time by pasting a prefix in front of every
 * address family name, so nothing has to be formatted when a socket lock
 * is initialised at run time.
 *
 * The list has AF_MAX + 1 entries, indexed by address family number.
 */

#define _sock_locks(pfx)						\
	pfx "AF_UNSPEC",	pfx "AF_UNIX",				\
	pfx "AF_INET",		pfx "AF_AX25",				\
	pfx "AF_IPX",		pfx "AF_APPLETALK",			\
	pfx "AF_NETROM",	pfx "AF_BRIDGE",			\
	pfx "AF_ATMPVC",	pfx "AF_X25",				\
	pfx "AF_INET6",		pfx "AF_ROSE",				\
	pfx "AF_DECnet",	pfx "AF_NETBEUI",			\
	pfx "AF_SECURITY",	pfx "AF_KEY",				\
	pfx "AF_NETLINK",	pfx "AF_PACKET",			\
	pfx "AF_ASH",		pfx "AF_ECONET",			\
	pfx "AF_ATMSVC",	pfx "AF_RDS",				\
	pfx "AF_SNA",		pfx "AF_IRDA",				\
	pfx "AF_PPPOX",		pfx "AF_WANPIPE",			\
	pfx "AF_LLC",		pfx "27",				\
	pfx "28",		pfx "AF_CAN",				\
	pfx "AF_TIPC",		pfx "AF_BLUETOOTH",			\
	pfx "IUCV",		pfx "AF_RXRPC",				\
	pfx "AF_ISDN",		pfx "AF_PHONET",			\
	pfx "AF_IEEE802154",	pfx "AF_CAIF",				\
	pfx "AF_ALG",		pfx "AF_NFC",				\
	pfx "AF_VSOCK",		pfx "AF_KCM",				\
	pfx "AF_QIPCRTR",	pfx "AF_SMC",				\
	pfx "AF_XDP",		pfx "AF_MCTP",				\
	pfx "AF_MAX"
241 
242 static const char *const af_family_key_strings[AF_MAX+1] = {
243 	_sock_locks("sk_lock-")
244 };
245 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
246 	_sock_locks("slock-")
247 };
248 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
249 	_sock_locks("clock-")
250 };
251 
252 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
253 	_sock_locks("k-sk_lock-")
254 };
255 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
256 	_sock_locks("k-slock-")
257 };
258 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
259 	_sock_locks("k-clock-")
260 };
261 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
262 	_sock_locks("rlock-")
263 };
264 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
265 	_sock_locks("wlock-")
266 };
267 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
268 	_sock_locks("elock-")
269 };
270 
271 /*
272  * sk_callback_lock and sk queues locking rules are per-address-family,
273  * so split the lock classes by using a per-AF key:
274  */
275 static struct lock_class_key af_callback_keys[AF_MAX];
276 static struct lock_class_key af_rlock_keys[AF_MAX];
277 static struct lock_class_key af_wlock_keys[AF_MAX];
278 static struct lock_class_key af_elock_keys[AF_MAX];
279 static struct lock_class_key af_kern_callback_keys[AF_MAX];
280 
281 /* Run time adjustable parameters. */
282 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
283 EXPORT_SYMBOL(sysctl_wmem_max);
284 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
285 EXPORT_SYMBOL(sysctl_rmem_max);
286 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
287 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
288 
289 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
290 EXPORT_SYMBOL_GPL(memalloc_socks_key);
291 
292 /**
293  * sk_set_memalloc - sets %SOCK_MEMALLOC
294  * @sk: socket to set it on
295  *
296  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297  * It's the responsibility of the admin to adjust min_free_kbytes
298  * to meet the requirements
299  */
300 void sk_set_memalloc(struct sock *sk)
301 {
302 	sock_set_flag(sk, SOCK_MEMALLOC);
303 	sk->sk_allocation |= __GFP_MEMALLOC;
304 	static_branch_inc(&memalloc_socks_key);
305 }
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
307 
308 void sk_clear_memalloc(struct sock *sk)
309 {
310 	sock_reset_flag(sk, SOCK_MEMALLOC);
311 	sk->sk_allocation &= ~__GFP_MEMALLOC;
312 	static_branch_dec(&memalloc_socks_key);
313 
314 	/*
315 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 	 * it has rmem allocations due to the last swapfile being deactivated
318 	 * but there is a risk that the socket is unusable due to exceeding
319 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
320 	 */
321 	sk_mem_reclaim(sk);
322 }
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
324 
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
326 {
327 	int ret;
328 	unsigned int noreclaim_flag;
329 
330 	/* these should have been dropped before queueing */
331 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
332 
333 	noreclaim_flag = memalloc_noreclaim_save();
334 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
335 				 tcp_v6_do_rcv,
336 				 tcp_v4_do_rcv,
337 				 sk, skb);
338 	memalloc_noreclaim_restore(noreclaim_flag);
339 
340 	return ret;
341 }
342 EXPORT_SYMBOL(__sk_backlog_rcv);
343 
344 void sk_error_report(struct sock *sk)
345 {
346 	sk->sk_error_report(sk);
347 
348 	switch (sk->sk_family) {
349 	case AF_INET:
350 		fallthrough;
351 	case AF_INET6:
352 		trace_inet_sk_error_report(sk);
353 		break;
354 	default:
355 		break;
356 	}
357 }
358 EXPORT_SYMBOL(sk_error_report);
359 
360 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
361 {
362 	struct __kernel_sock_timeval tv;
363 
364 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
365 		tv.tv_sec = 0;
366 		tv.tv_usec = 0;
367 	} else {
368 		tv.tv_sec = timeo / HZ;
369 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
370 	}
371 
372 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
373 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
374 		*(struct old_timeval32 *)optval = tv32;
375 		return sizeof(tv32);
376 	}
377 
378 	if (old_timeval) {
379 		struct __kernel_old_timeval old_tv;
380 		old_tv.tv_sec = tv.tv_sec;
381 		old_tv.tv_usec = tv.tv_usec;
382 		*(struct __kernel_old_timeval *)optval = old_tv;
383 		return sizeof(old_tv);
384 	}
385 
386 	*(struct __kernel_sock_timeval *)optval = tv;
387 	return sizeof(tv);
388 }
389 EXPORT_SYMBOL(sock_get_timeout);
390 
391 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
392 			   sockptr_t optval, int optlen, bool old_timeval)
393 {
394 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
395 		struct old_timeval32 tv32;
396 
397 		if (optlen < sizeof(tv32))
398 			return -EINVAL;
399 
400 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
401 			return -EFAULT;
402 		tv->tv_sec = tv32.tv_sec;
403 		tv->tv_usec = tv32.tv_usec;
404 	} else if (old_timeval) {
405 		struct __kernel_old_timeval old_tv;
406 
407 		if (optlen < sizeof(old_tv))
408 			return -EINVAL;
409 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
410 			return -EFAULT;
411 		tv->tv_sec = old_tv.tv_sec;
412 		tv->tv_usec = old_tv.tv_usec;
413 	} else {
414 		if (optlen < sizeof(*tv))
415 			return -EINVAL;
416 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
417 			return -EFAULT;
418 	}
419 
420 	return 0;
421 }
422 EXPORT_SYMBOL(sock_copy_user_timeval);
423 
424 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
425 			    bool old_timeval)
426 {
427 	struct __kernel_sock_timeval tv;
428 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
429 	long val;
430 
431 	if (err)
432 		return err;
433 
434 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
435 		return -EDOM;
436 
437 	if (tv.tv_sec < 0) {
438 		static int warned __read_mostly;
439 
440 		WRITE_ONCE(*timeo_p, 0);
441 		if (warned < 10 && net_ratelimit()) {
442 			warned++;
443 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
444 				__func__, current->comm, task_pid_nr(current));
445 		}
446 		return 0;
447 	}
448 	val = MAX_SCHEDULE_TIMEOUT;
449 	if ((tv.tv_sec || tv.tv_usec) &&
450 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
451 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
452 						    USEC_PER_SEC / HZ);
453 	WRITE_ONCE(*timeo_p, val);
454 	return 0;
455 }
456 
457 static bool sock_needs_netstamp(const struct sock *sk)
458 {
459 	switch (sk->sk_family) {
460 	case AF_UNSPEC:
461 	case AF_UNIX:
462 		return false;
463 	default:
464 		return true;
465 	}
466 }
467 
468 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
469 {
470 	if (sk->sk_flags & flags) {
471 		sk->sk_flags &= ~flags;
472 		if (sock_needs_netstamp(sk) &&
473 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
474 			net_disable_timestamp();
475 	}
476 }
477 
478 
479 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
480 {
481 	unsigned long flags;
482 	struct sk_buff_head *list = &sk->sk_receive_queue;
483 
484 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
485 		atomic_inc(&sk->sk_drops);
486 		trace_sock_rcvqueue_full(sk, skb);
487 		return -ENOMEM;
488 	}
489 
490 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
491 		atomic_inc(&sk->sk_drops);
492 		return -ENOBUFS;
493 	}
494 
495 	skb->dev = NULL;
496 	skb_set_owner_r(skb, sk);
497 
498 	/* we escape from rcu protected region, make sure we dont leak
499 	 * a norefcounted dst
500 	 */
501 	skb_dst_force(skb);
502 
503 	spin_lock_irqsave(&list->lock, flags);
504 	sock_skb_set_dropcount(sk, skb);
505 	__skb_queue_tail(list, skb);
506 	spin_unlock_irqrestore(&list->lock, flags);
507 
508 	if (!sock_flag(sk, SOCK_DEAD))
509 		sk->sk_data_ready(sk);
510 	return 0;
511 }
512 EXPORT_SYMBOL(__sock_queue_rcv_skb);
513 
514 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
515 			      enum skb_drop_reason *reason)
516 {
517 	enum skb_drop_reason drop_reason;
518 	int err;
519 
520 	err = sk_filter(sk, skb);
521 	if (err) {
522 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
523 		goto out;
524 	}
525 	err = __sock_queue_rcv_skb(sk, skb);
526 	switch (err) {
527 	case -ENOMEM:
528 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
529 		break;
530 	case -ENOBUFS:
531 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
532 		break;
533 	default:
534 		drop_reason = SKB_NOT_DROPPED_YET;
535 		break;
536 	}
537 out:
538 	if (reason)
539 		*reason = drop_reason;
540 	return err;
541 }
542 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
543 
544 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
545 		     const int nested, unsigned int trim_cap, bool refcounted)
546 {
547 	int rc = NET_RX_SUCCESS;
548 
549 	if (sk_filter_trim_cap(sk, skb, trim_cap))
550 		goto discard_and_relse;
551 
552 	skb->dev = NULL;
553 
554 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
555 		atomic_inc(&sk->sk_drops);
556 		goto discard_and_relse;
557 	}
558 	if (nested)
559 		bh_lock_sock_nested(sk);
560 	else
561 		bh_lock_sock(sk);
562 	if (!sock_owned_by_user(sk)) {
563 		/*
564 		 * trylock + unlock semantics:
565 		 */
566 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
567 
568 		rc = sk_backlog_rcv(sk, skb);
569 
570 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
571 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
572 		bh_unlock_sock(sk);
573 		atomic_inc(&sk->sk_drops);
574 		goto discard_and_relse;
575 	}
576 
577 	bh_unlock_sock(sk);
578 out:
579 	if (refcounted)
580 		sock_put(sk);
581 	return rc;
582 discard_and_relse:
583 	kfree_skb(skb);
584 	goto out;
585 }
586 EXPORT_SYMBOL(__sk_receive_skb);
587 
588 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
589 							  u32));
590 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
591 							   u32));
592 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
593 {
594 	struct dst_entry *dst = __sk_dst_get(sk);
595 
596 	if (dst && dst->obsolete &&
597 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
598 			       dst, cookie) == NULL) {
599 		sk_tx_queue_clear(sk);
600 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
601 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
602 		dst_release(dst);
603 		return NULL;
604 	}
605 
606 	return dst;
607 }
608 EXPORT_SYMBOL(__sk_dst_check);
609 
610 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
611 {
612 	struct dst_entry *dst = sk_dst_get(sk);
613 
614 	if (dst && dst->obsolete &&
615 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
616 			       dst, cookie) == NULL) {
617 		sk_dst_reset(sk);
618 		dst_release(dst);
619 		return NULL;
620 	}
621 
622 	return dst;
623 }
624 EXPORT_SYMBOL(sk_dst_check);
625 
626 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
627 {
628 	int ret = -ENOPROTOOPT;
629 #ifdef CONFIG_NETDEVICES
630 	struct net *net = sock_net(sk);
631 
632 	/* Sorry... */
633 	ret = -EPERM;
634 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
635 		goto out;
636 
637 	ret = -EINVAL;
638 	if (ifindex < 0)
639 		goto out;
640 
641 	/* Paired with all READ_ONCE() done locklessly. */
642 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
643 
644 	if (sk->sk_prot->rehash)
645 		sk->sk_prot->rehash(sk);
646 	sk_dst_reset(sk);
647 
648 	ret = 0;
649 
650 out:
651 #endif
652 
653 	return ret;
654 }
655 
656 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
657 {
658 	int ret;
659 
660 	if (lock_sk)
661 		lock_sock(sk);
662 	ret = sock_bindtoindex_locked(sk, ifindex);
663 	if (lock_sk)
664 		release_sock(sk);
665 
666 	return ret;
667 }
668 EXPORT_SYMBOL(sock_bindtoindex);
669 
670 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
671 {
672 	int ret = -ENOPROTOOPT;
673 #ifdef CONFIG_NETDEVICES
674 	struct net *net = sock_net(sk);
675 	char devname[IFNAMSIZ];
676 	int index;
677 
678 	ret = -EINVAL;
679 	if (optlen < 0)
680 		goto out;
681 
682 	/* Bind this socket to a particular device like "eth0",
683 	 * as specified in the passed interface name. If the
684 	 * name is "" or the option length is zero the socket
685 	 * is not bound.
686 	 */
687 	if (optlen > IFNAMSIZ - 1)
688 		optlen = IFNAMSIZ - 1;
689 	memset(devname, 0, sizeof(devname));
690 
691 	ret = -EFAULT;
692 	if (copy_from_sockptr(devname, optval, optlen))
693 		goto out;
694 
695 	index = 0;
696 	if (devname[0] != '\0') {
697 		struct net_device *dev;
698 
699 		rcu_read_lock();
700 		dev = dev_get_by_name_rcu(net, devname);
701 		if (dev)
702 			index = dev->ifindex;
703 		rcu_read_unlock();
704 		ret = -ENODEV;
705 		if (!dev)
706 			goto out;
707 	}
708 
709 	sockopt_lock_sock(sk);
710 	ret = sock_bindtoindex_locked(sk, index);
711 	sockopt_release_sock(sk);
712 out:
713 #endif
714 
715 	return ret;
716 }
717 
718 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
719 				sockptr_t optlen, int len)
720 {
721 	int ret = -ENOPROTOOPT;
722 #ifdef CONFIG_NETDEVICES
723 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
724 	struct net *net = sock_net(sk);
725 	char devname[IFNAMSIZ];
726 
727 	if (bound_dev_if == 0) {
728 		len = 0;
729 		goto zero;
730 	}
731 
732 	ret = -EINVAL;
733 	if (len < IFNAMSIZ)
734 		goto out;
735 
736 	ret = netdev_get_name(net, devname, bound_dev_if);
737 	if (ret)
738 		goto out;
739 
740 	len = strlen(devname) + 1;
741 
742 	ret = -EFAULT;
743 	if (copy_to_sockptr(optval, devname, len))
744 		goto out;
745 
746 zero:
747 	ret = -EFAULT;
748 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
749 		goto out;
750 
751 	ret = 0;
752 
753 out:
754 #endif
755 
756 	return ret;
757 }
758 
759 bool sk_mc_loop(const struct sock *sk)
760 {
761 	if (dev_recursion_level())
762 		return false;
763 	if (!sk)
764 		return true;
765 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
766 	switch (READ_ONCE(sk->sk_family)) {
767 	case AF_INET:
768 		return inet_test_bit(MC_LOOP, sk);
769 #if IS_ENABLED(CONFIG_IPV6)
770 	case AF_INET6:
771 		return inet6_test_bit(MC6_LOOP, sk);
772 #endif
773 	}
774 	WARN_ON_ONCE(1);
775 	return true;
776 }
777 EXPORT_SYMBOL(sk_mc_loop);
778 
779 void sock_set_reuseaddr(struct sock *sk)
780 {
781 	lock_sock(sk);
782 	sk->sk_reuse = SK_CAN_REUSE;
783 	release_sock(sk);
784 }
785 EXPORT_SYMBOL(sock_set_reuseaddr);
786 
787 void sock_set_reuseport(struct sock *sk)
788 {
789 	lock_sock(sk);
790 	sk->sk_reuseport = true;
791 	release_sock(sk);
792 }
793 EXPORT_SYMBOL(sock_set_reuseport);
794 
795 void sock_no_linger(struct sock *sk)
796 {
797 	lock_sock(sk);
798 	WRITE_ONCE(sk->sk_lingertime, 0);
799 	sock_set_flag(sk, SOCK_LINGER);
800 	release_sock(sk);
801 }
802 EXPORT_SYMBOL(sock_no_linger);
803 
804 void sock_set_priority(struct sock *sk, u32 priority)
805 {
806 	WRITE_ONCE(sk->sk_priority, priority);
807 }
808 EXPORT_SYMBOL(sock_set_priority);
809 
810 void sock_set_sndtimeo(struct sock *sk, s64 secs)
811 {
812 	lock_sock(sk);
813 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
814 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
815 	else
816 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
817 	release_sock(sk);
818 }
819 EXPORT_SYMBOL(sock_set_sndtimeo);
820 
821 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
822 {
823 	if (val)  {
824 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
825 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
826 		sock_set_flag(sk, SOCK_RCVTSTAMP);
827 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
828 	} else {
829 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
830 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
831 	}
832 }
833 
834 void sock_enable_timestamps(struct sock *sk)
835 {
836 	lock_sock(sk);
837 	__sock_set_timestamps(sk, true, false, true);
838 	release_sock(sk);
839 }
840 EXPORT_SYMBOL(sock_enable_timestamps);
841 
842 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
843 {
844 	switch (optname) {
845 	case SO_TIMESTAMP_OLD:
846 		__sock_set_timestamps(sk, valbool, false, false);
847 		break;
848 	case SO_TIMESTAMP_NEW:
849 		__sock_set_timestamps(sk, valbool, true, false);
850 		break;
851 	case SO_TIMESTAMPNS_OLD:
852 		__sock_set_timestamps(sk, valbool, false, true);
853 		break;
854 	case SO_TIMESTAMPNS_NEW:
855 		__sock_set_timestamps(sk, valbool, true, true);
856 		break;
857 	}
858 }
859 
860 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
861 {
862 	struct net *net = sock_net(sk);
863 	struct net_device *dev = NULL;
864 	bool match = false;
865 	int *vclock_index;
866 	int i, num;
867 
868 	if (sk->sk_bound_dev_if)
869 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
870 
871 	if (!dev) {
872 		pr_err("%s: sock not bind to device\n", __func__);
873 		return -EOPNOTSUPP;
874 	}
875 
876 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
877 	dev_put(dev);
878 
879 	for (i = 0; i < num; i++) {
880 		if (*(vclock_index + i) == phc_index) {
881 			match = true;
882 			break;
883 		}
884 	}
885 
886 	if (num > 0)
887 		kfree(vclock_index);
888 
889 	if (!match)
890 		return -EINVAL;
891 
892 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
893 
894 	return 0;
895 }
896 
897 int sock_set_timestamping(struct sock *sk, int optname,
898 			  struct so_timestamping timestamping)
899 {
900 	int val = timestamping.flags;
901 	int ret;
902 
903 	if (val & ~SOF_TIMESTAMPING_MASK)
904 		return -EINVAL;
905 
906 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
907 	    !(val & SOF_TIMESTAMPING_OPT_ID))
908 		return -EINVAL;
909 
910 	if (val & SOF_TIMESTAMPING_OPT_ID &&
911 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
912 		if (sk_is_tcp(sk)) {
913 			if ((1 << sk->sk_state) &
914 			    (TCPF_CLOSE | TCPF_LISTEN))
915 				return -EINVAL;
916 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
917 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
918 			else
919 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
920 		} else {
921 			atomic_set(&sk->sk_tskey, 0);
922 		}
923 	}
924 
925 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
926 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
927 		return -EINVAL;
928 
929 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
930 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
931 		if (ret)
932 			return ret;
933 	}
934 
935 	WRITE_ONCE(sk->sk_tsflags, val);
936 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
937 
938 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
939 		sock_enable_timestamp(sk,
940 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
941 	else
942 		sock_disable_timestamp(sk,
943 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
944 	return 0;
945 }
946 
947 void sock_set_keepalive(struct sock *sk)
948 {
949 	lock_sock(sk);
950 	if (sk->sk_prot->keepalive)
951 		sk->sk_prot->keepalive(sk, true);
952 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
953 	release_sock(sk);
954 }
955 EXPORT_SYMBOL(sock_set_keepalive);
956 
957 static void __sock_set_rcvbuf(struct sock *sk, int val)
958 {
959 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
960 	 * as a negative value.
961 	 */
962 	val = min_t(int, val, INT_MAX / 2);
963 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
964 
965 	/* We double it on the way in to account for "struct sk_buff" etc.
966 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
967 	 * will allow that much actual data to be received on that socket.
968 	 *
969 	 * Applications are unaware that "struct sk_buff" and other overheads
970 	 * allocate from the receive buffer during socket buffer allocation.
971 	 *
972 	 * And after considering the possible alternatives, returning the value
973 	 * we actually used in getsockopt is the most desirable behavior.
974 	 */
975 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
976 }
977 
/* Locked wrapper: apply __sock_set_rcvbuf() while holding the socket lock. */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);

	__sock_set_rcvbuf(sk, val);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
985 
986 static void __sock_set_mark(struct sock *sk, u32 val)
987 {
988 	if (val != sk->sk_mark) {
989 		WRITE_ONCE(sk->sk_mark, val);
990 		sk_dst_reset(sk);
991 	}
992 }
993 
994 void sock_set_mark(struct sock *sk, u32 val)
995 {
996 	lock_sock(sk);
997 	__sock_set_mark(sk, val);
998 	release_sock(sk);
999 }
1000 EXPORT_SYMBOL(sock_set_mark);
1001 
1002 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1003 {
1004 	/* Round down bytes to multiple of pages */
1005 	bytes = round_down(bytes, PAGE_SIZE);
1006 
1007 	WARN_ON(bytes > sk->sk_reserved_mem);
1008 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1009 	sk_mem_reclaim(sk);
1010 }
1011 
1012 static int sock_reserve_memory(struct sock *sk, int bytes)
1013 {
1014 	long allocated;
1015 	bool charged;
1016 	int pages;
1017 
1018 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1019 		return -EOPNOTSUPP;
1020 
1021 	if (!bytes)
1022 		return 0;
1023 
1024 	pages = sk_mem_pages(bytes);
1025 
1026 	/* pre-charge to memcg */
1027 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1028 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1029 	if (!charged)
1030 		return -ENOMEM;
1031 
1032 	/* pre-charge to forward_alloc */
1033 	sk_memory_allocated_add(sk, pages);
1034 	allocated = sk_memory_allocated(sk);
1035 	/* If the system goes into memory pressure with this
1036 	 * precharge, give up and return error.
1037 	 */
1038 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1039 		sk_memory_allocated_sub(sk, pages);
1040 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1041 		return -ENOMEM;
1042 	}
1043 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1044 
1045 	WRITE_ONCE(sk->sk_reserved_mem,
1046 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1047 
1048 	return 0;
1049 }
1050 
1051 #ifdef CONFIG_PAGE_POOL
1052 
1053 /* This is the number of tokens that the user can SO_DEVMEM_DONTNEED in
1054  * 1 syscall. The limit exists to limit the amount of memory the kernel
1055  * allocates to copy these tokens.
1056  */
1057 #define MAX_DONTNEED_TOKENS 128
1058 
1059 static noinline_for_stack int
1060 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1061 {
1062 	unsigned int num_tokens, i, j, k, netmem_num = 0;
1063 	struct dmabuf_token *tokens;
1064 	netmem_ref netmems[16];
1065 	int ret = 0;
1066 
1067 	if (!sk_is_tcp(sk))
1068 		return -EBADF;
1069 
1070 	if (optlen % sizeof(struct dmabuf_token) ||
1071 	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1072 		return -EINVAL;
1073 
1074 	tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL);
1075 	if (!tokens)
1076 		return -ENOMEM;
1077 
1078 	num_tokens = optlen / sizeof(struct dmabuf_token);
1079 	if (copy_from_sockptr(tokens, optval, optlen)) {
1080 		kvfree(tokens);
1081 		return -EFAULT;
1082 	}
1083 
1084 	xa_lock_bh(&sk->sk_user_frags);
1085 	for (i = 0; i < num_tokens; i++) {
1086 		for (j = 0; j < tokens[i].token_count; j++) {
1087 			netmem_ref netmem = (__force netmem_ref)__xa_erase(
1088 				&sk->sk_user_frags, tokens[i].token_start + j);
1089 
1090 			if (netmem &&
1091 			    !WARN_ON_ONCE(!netmem_is_net_iov(netmem))) {
1092 				netmems[netmem_num++] = netmem;
1093 				if (netmem_num == ARRAY_SIZE(netmems)) {
1094 					xa_unlock_bh(&sk->sk_user_frags);
1095 					for (k = 0; k < netmem_num; k++)
1096 						WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1097 					netmem_num = 0;
1098 					xa_lock_bh(&sk->sk_user_frags);
1099 				}
1100 				ret++;
1101 			}
1102 		}
1103 	}
1104 
1105 	xa_unlock_bh(&sk->sk_user_frags);
1106 	for (k = 0; k < netmem_num; k++)
1107 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1108 
1109 	kvfree(tokens);
1110 	return ret;
1111 }
1112 #endif
1113 
/*
 * Take the socket lock on behalf of a {get,set}sockopt handler.
 *
 * Nothing is done when current->bpf_ctx is set: in that case the call
 * comes from a bpf prog and bpf has already made sure the sk lock is
 * held before calling setsockopt().
 */
void sockopt_lock_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1126 
/*
 * Counterpart of sockopt_lock_sock(): drop the socket lock unless the
 * caller runs with a bpf context, in which case the lock was never
 * taken by sockopt_lock_sock() and must be left alone.
 */
void sockopt_release_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);
1135 
1136 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1137 {
1138 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1139 }
1140 EXPORT_SYMBOL(sockopt_ns_capable);
1141 
1142 bool sockopt_capable(int cap)
1143 {
1144 	return has_current_bpf_ctx() || capable(cap);
1145 }
1146 EXPORT_SYMBOL(sockopt_capable);
1147 
1148 static int sockopt_validate_clockid(__kernel_clockid_t value)
1149 {
1150 	switch (value) {
1151 	case CLOCK_REALTIME:
1152 	case CLOCK_MONOTONIC:
1153 	case CLOCK_TAI:
1154 		return 0;
1155 	}
1156 	return -EINVAL;
1157 }
1158 
1159 /*
1160  *	This is meant for all protocols to use and covers goings on
1161  *	at the socket level. Everything here is generic.
 *
 *	@sk:      socket to configure
 *	@level:   not examined by this function
 *	@optname: SO_* option to set
 *	@optval:  user or kernel pointer to the option value
 *	@optlen:  size of the value behind @optval, in bytes
 *
 *	The function runs in two phases.  A first switch handles the
 *	options that do not need the socket lock and returns directly.
 *	Everything else is handled by a second switch with the socket
 *	lock held via sockopt_lock_sock(); those cases must leave with
 *	"break" so that sockopt_release_sock() runs.
 *
 *	Returns 0 on success or a negative errno.  SO_DEVMEM_DONTNEED
 *	returns whatever sock_devmem_dontneed() returns.
1162  */
1163 
1164 int sk_setsockopt(struct sock *sk, int level, int optname,
1165 		  sockptr_t optval, unsigned int optlen)
1166 {
1167 	struct so_timestamping timestamping;
1168 	struct socket *sock = sk->sk_socket;
1169 	struct sock_txtime sk_txtime;
1170 	int val;
1171 	int valbool;
1172 	struct linger ling;
1173 	int ret = 0;
1174 
1175 	/*
1176 	 *	Options without arguments
1177 	 */
1178 
	/* SO_BINDTODEVICE is dispatched before the generic int argument
	 * is read; sock_setbindtodevice() gets the raw optval/optlen.
	 */
1179 	if (optname == SO_BINDTODEVICE)
1180 		return sock_setbindtodevice(sk, optval, optlen);
1181 
	/* Every remaining option needs at least an int-sized argument.
	 * It is read into val; valbool is val normalised to 0/1.
	 */
1182 	if (optlen < sizeof(int))
1183 		return -EINVAL;
1184 
1185 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1186 		return -EFAULT;
1187 
1188 	valbool = val ? 1 : 0;
1189 
1190 	/* handle options which do not require locking the socket. */
1191 	switch (optname) {
1192 	case SO_PRIORITY:
		/* Priorities 0..6 are open to everybody, anything else
		 * needs CAP_NET_RAW or CAP_NET_ADMIN in the user namespace
		 * of the socket's netns.
		 */
1193 		if ((val >= 0 && val <= 6) ||
1194 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1195 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1196 			sock_set_priority(sk, val);
1197 			return 0;
1198 		}
1199 		return -EPERM;
1200 	case SO_PASSSEC:
1201 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1202 		return 0;
1203 	case SO_PASSCRED:
1204 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1205 		return 0;
1206 	case SO_PASSPIDFD:
1207 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1208 		return 0;
	/* These four can be read through sk_getsockopt() but not set. */
1209 	case SO_TYPE:
1210 	case SO_PROTOCOL:
1211 	case SO_DOMAIN:
1212 	case SO_ERROR:
1213 		return -ENOPROTOOPT;
1214 #ifdef CONFIG_NET_RX_BUSY_POLL
1215 	case SO_BUSY_POLL:
1216 		if (val < 0)
1217 			return -EINVAL;
1218 		WRITE_ONCE(sk->sk_ll_usec, val);
1219 		return 0;
1220 	case SO_PREFER_BUSY_POLL:
		/* Turning the preference on requires CAP_NET_ADMIN. */
1221 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1222 			return -EPERM;
1223 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1224 		return 0;
1225 	case SO_BUSY_POLL_BUDGET:
		/* Raising the budget above its current value requires
		 * CAP_NET_ADMIN; the value must fit in 16 bits.
		 */
1226 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1227 		    !sockopt_capable(CAP_NET_ADMIN))
1228 			return -EPERM;
1229 		if (val < 0 || val > U16_MAX)
1230 			return -EINVAL;
1231 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1232 		return 0;
1233 #endif
1234 	case SO_MAX_PACING_RATE:
1235 		{
		/* An int argument of ~0U is widened to ~0UL; any other
		 * int is taken as unsigned.
		 */
1236 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1237 		unsigned long pacing_rate;
1238 
		/* Where unsigned long is wider than int and the caller
		 * passed enough bytes, re-read the full unsigned long.
		 */
1239 		if (sizeof(ulval) != sizeof(val) &&
1240 		    optlen >= sizeof(ulval) &&
1241 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1242 			return -EFAULT;
1243 		}
1244 		if (ulval != ~0UL)
1245 			cmpxchg(&sk->sk_pacing_status,
1246 				SK_PACING_NONE,
1247 				SK_PACING_NEEDED);
1248 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1249 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
		/* Lower the current pacing rate if it exceeds the new cap. */
1250 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1251 		if (ulval < pacing_rate)
1252 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1253 		return 0;
1254 		}
1255 	case SO_TXREHASH:
1256 		if (val < -1 || val > 1)
1257 			return -EINVAL;
		/* The "default" value is replaced by the per-netns sysctl. */
1258 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1259 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1260 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1261 		 * and sk_getsockopt().
1262 		 */
1263 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1264 		return 0;
1265 	case SO_PEEK_OFF:
1266 		{
1267 		int (*set_peek_off)(struct sock *sk, int val);
1268 
		/* Delegated to the socket's ops; unsupported if the
		 * ops provide no set_peek_off handler.
		 */
1269 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1270 		if (set_peek_off)
1271 			ret = set_peek_off(sk, val);
1272 		else
1273 			ret = -EOPNOTSUPP;
1274 		return ret;
1275 		}
1276 #ifdef CONFIG_PAGE_POOL
1277 	case SO_DEVMEM_DONTNEED:
1278 		return sock_devmem_dontneed(sk, optval, optlen);
1279 #endif
1280 	}
1281 
	/* From here on the socket lock is held (unless running with a
	 * bpf context, see sockopt_lock_sock()).  Cases below must
	 * "break" rather than return so the lock is released.
	 */
1282 	sockopt_lock_sock(sk);
1283 
1284 	switch (optname) {
1285 	case SO_DEBUG:
1286 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1287 			ret = -EACCES;
1288 		else
1289 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1290 		break;
1291 	case SO_REUSEADDR:
1292 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1293 		break;
1294 	case SO_REUSEPORT:
1295 		sk->sk_reuseport = valbool;
1296 		break;
1297 	case SO_DONTROUTE:
1298 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1299 		sk_dst_reset(sk);
1300 		break;
1301 	case SO_BROADCAST:
1302 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1303 		break;
1304 	case SO_SNDBUF:
1305 		/* Don't error on this BSD doesn't and if you think
1306 		 * about it this is right. Otherwise apps have to
1307 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1308 		 * are treated in BSD as hints
1309 		 */
1310 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
	/* SO_SNDBUFFORCE jumps here, skipping the sysctl_wmem_max clamp. */
1311 set_sndbuf:
1312 		/* Ensure val * 2 fits into an int, to prevent max_t()
1313 		 * from treating it as a negative value.
1314 		 */
1315 		val = min_t(int, val, INT_MAX / 2);
1316 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		/* The stored size is twice the requested value, but never
		 * below SOCK_MIN_SNDBUF.
		 */
1317 		WRITE_ONCE(sk->sk_sndbuf,
1318 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1319 		/* Wake up sending tasks if we upped the value. */
1320 		sk->sk_write_space(sk);
1321 		break;
1322 
1323 	case SO_SNDBUFFORCE:
1324 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1325 			ret = -EPERM;
1326 			break;
1327 		}
1328 
1329 		/* No negative values (to prevent underflow, as val will be
1330 		 * multiplied by 2).
1331 		 */
1332 		if (val < 0)
1333 			val = 0;
1334 		goto set_sndbuf;
1335 
1336 	case SO_RCVBUF:
1337 		/* Don't error on this BSD doesn't and if you think
1338 		 * about it this is right. Otherwise apps have to
1339 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1340 		 * are treated in BSD as hints
1341 		 */
1342 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1343 		break;
1344 
1345 	case SO_RCVBUFFORCE:
1346 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1347 			ret = -EPERM;
1348 			break;
1349 		}
1350 
1351 		/* No negative values (to prevent underflow, as val will be
1352 		 * multiplied by 2).
1353 		 */
1354 		__sock_set_rcvbuf(sk, max(val, 0));
1355 		break;
1356 
1357 	case SO_KEEPALIVE:
		/* Let the protocol react first, if it has a hook. */
1358 		if (sk->sk_prot->keepalive)
1359 			sk->sk_prot->keepalive(sk, valbool);
1360 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1361 		break;
1362 
1363 	case SO_OOBINLINE:
1364 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1365 		break;
1366 
1367 	case SO_NO_CHECK:
1368 		sk->sk_no_check_tx = valbool;
1369 		break;
1370 
1371 	case SO_LINGER:
		/* Needs a full struct linger, re-read from optval. */
1372 		if (optlen < sizeof(ling)) {
1373 			ret = -EINVAL;	/* 1003.1g */
1374 			break;
1375 		}
1376 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1377 			ret = -EFAULT;
1378 			break;
1379 		}
1380 		if (!ling.l_onoff) {
1381 			sock_reset_flag(sk, SOCK_LINGER);
1382 		} else {
1383 			unsigned long t_sec = ling.l_linger;
1384 
			/* Seconds are converted to jiffies (t_sec * HZ),
			 * saturating at MAX_SCHEDULE_TIMEOUT.
			 */
1385 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1386 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1387 			else
1388 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1389 			sock_set_flag(sk, SOCK_LINGER);
1390 		}
1391 		break;
1392 
	/* Accepted, but has no effect. */
1393 	case SO_BSDCOMPAT:
1394 		break;
1395 
1396 	case SO_TIMESTAMP_OLD:
1397 	case SO_TIMESTAMP_NEW:
1398 	case SO_TIMESTAMPNS_OLD:
1399 	case SO_TIMESTAMPNS_NEW:
1400 		sock_set_timestamp(sk, optname, valbool);
1401 		break;
1402 
1403 	case SO_TIMESTAMPING_NEW:
1404 	case SO_TIMESTAMPING_OLD:
		/* Two argument layouts are accepted: a full struct
		 * so_timestamping (when optlen matches exactly), or a plain
		 * int that only carries the flags.
		 */
1405 		if (optlen == sizeof(timestamping)) {
1406 			if (copy_from_sockptr(&timestamping, optval,
1407 					      sizeof(timestamping))) {
1408 				ret = -EFAULT;
1409 				break;
1410 			}
1411 		} else {
1412 			memset(&timestamping, 0, sizeof(timestamping));
1413 			timestamping.flags = val;
1414 		}
1415 		ret = sock_set_timestamping(sk, optname, timestamping);
1416 		break;
1417 
1418 	case SO_RCVLOWAT:
1419 		{
1420 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1421 
		/* Negative values are treated as INT_MAX.  Without an ops
		 * handler the value is stored directly, with 0 stored as 1.
		 */
1422 		if (val < 0)
1423 			val = INT_MAX;
1424 		if (sock)
1425 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1426 		if (set_rcvlowat)
1427 			ret = set_rcvlowat(sk, val);
1428 		else
1429 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1430 		break;
1431 		}
1432 	case SO_RCVTIMEO_OLD:
1433 	case SO_RCVTIMEO_NEW:
1434 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1435 				       optlen, optname == SO_RCVTIMEO_OLD);
1436 		break;
1437 
1438 	case SO_SNDTIMEO_OLD:
1439 	case SO_SNDTIMEO_NEW:
1440 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1441 				       optlen, optname == SO_SNDTIMEO_OLD);
1442 		break;
1443 
1444 	case SO_ATTACH_FILTER: {
1445 		struct sock_fprog fprog;
1446 
1447 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1448 		if (!ret)
1449 			ret = sk_attach_filter(&fprog, sk);
1450 		break;
1451 	}
1452 	case SO_ATTACH_BPF:
		/* The argument must be exactly a u32, which is handed to
		 * sk_attach_bpf(); any other length yields -EINVAL.
		 */
1453 		ret = -EINVAL;
1454 		if (optlen == sizeof(u32)) {
1455 			u32 ufd;
1456 
1457 			ret = -EFAULT;
1458 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1459 				break;
1460 
1461 			ret = sk_attach_bpf(ufd, sk);
1462 		}
1463 		break;
1464 
1465 	case SO_ATTACH_REUSEPORT_CBPF: {
1466 		struct sock_fprog fprog;
1467 
1468 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1469 		if (!ret)
1470 			ret = sk_reuseport_attach_filter(&fprog, sk);
1471 		break;
1472 	}
1473 	case SO_ATTACH_REUSEPORT_EBPF:
		/* Same argument rules as SO_ATTACH_BPF. */
1474 		ret = -EINVAL;
1475 		if (optlen == sizeof(u32)) {
1476 			u32 ufd;
1477 
1478 			ret = -EFAULT;
1479 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1480 				break;
1481 
1482 			ret = sk_reuseport_attach_bpf(ufd, sk);
1483 		}
1484 		break;
1485 
1486 	case SO_DETACH_REUSEPORT_BPF:
1487 		ret = reuseport_detach_prog(sk);
1488 		break;
1489 
1490 	case SO_DETACH_FILTER:
1491 		ret = sk_detach_filter(sk);
1492 		break;
1493 
1494 	case SO_LOCK_FILTER:
		/* One-way switch: once set, the flag cannot be cleared. */
1495 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1496 			ret = -EPERM;
1497 		else
1498 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1499 		break;
1500 
1501 	case SO_MARK:
		/* Needs CAP_NET_RAW or CAP_NET_ADMIN in the user namespace
		 * of the socket's netns.
		 */
1502 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1503 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1504 			ret = -EPERM;
1505 			break;
1506 		}
1507 
1508 		__sock_set_mark(sk, val);
1509 		break;
1510 	case SO_RCVMARK:
1511 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1512 		break;
1513 
1514 	case SO_RXQ_OVFL:
1515 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1516 		break;
1517 
1518 	case SO_WIFI_STATUS:
1519 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1520 		break;
1521 
1522 	case SO_NOFCS:
1523 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1524 		break;
1525 
1526 	case SO_SELECT_ERR_QUEUE:
1527 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1528 		break;
1529 
1530 
1531 	case SO_INCOMING_CPU:
1532 		reuseport_update_incoming_cpu(sk, val);
1533 		break;
1534 
1535 	case SO_CNX_ADVICE:
		/* Only the value 1 triggers an action; others are ignored. */
1536 		if (val == 1)
1537 			dst_negative_advice(sk);
1538 		break;
1539 
1540 	case SO_ZEROCOPY:
		/* Supported on PF_INET/PF_INET6 for TCP and for UDP
		 * datagram sockets, and on PF_RDS; everything else gets
		 * -EOPNOTSUPP.  The value itself must be 0 or 1.
		 */
1541 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1542 			if (!(sk_is_tcp(sk) ||
1543 			      (sk->sk_type == SOCK_DGRAM &&
1544 			       sk->sk_protocol == IPPROTO_UDP)))
1545 				ret = -EOPNOTSUPP;
1546 		} else if (sk->sk_family != PF_RDS) {
1547 			ret = -EOPNOTSUPP;
1548 		}
1549 		if (!ret) {
1550 			if (val < 0 || val > 1)
1551 				ret = -EINVAL;
1552 			else
1553 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1554 		}
1555 		break;
1556 
1557 	case SO_TXTIME:
		/* Argument must be exactly a struct sock_txtime carrying
		 * only flags from SOF_TXTIME_FLAGS_MASK.
		 */
1558 		if (optlen != sizeof(struct sock_txtime)) {
1559 			ret = -EINVAL;
1560 			break;
1561 		} else if (copy_from_sockptr(&sk_txtime, optval,
1562 			   sizeof(struct sock_txtime))) {
1563 			ret = -EFAULT;
1564 			break;
1565 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1566 			ret = -EINVAL;
1567 			break;
1568 		}
1569 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1570 		 * scheduler has enough safe guards.
1571 		 */
1572 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1573 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1574 			ret = -EPERM;
1575 			break;
1576 		}
1577 
1578 		ret = sockopt_validate_clockid(sk_txtime.clockid);
1579 		if (ret)
1580 			break;
1581 
1582 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1583 		sk->sk_clockid = sk_txtime.clockid;
1584 		sk->sk_txtime_deadline_mode =
1585 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1586 		sk->sk_txtime_report_errors =
1587 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1588 		break;
1589 
1590 	case SO_BINDTOIFINDEX:
1591 		ret = sock_bindtoindex_locked(sk, val);
1592 		break;
1593 
1594 	case SO_BUF_LOCK:
		/* Replace only the SOCK_BUF_LOCK_MASK bits of sk_userlocks,
		 * leaving the other bits untouched.
		 */
1595 		if (val & ~SOCK_BUF_LOCK_MASK) {
1596 			ret = -EINVAL;
1597 			break;
1598 		}
1599 		sk->sk_userlocks = val | (sk->sk_userlocks &
1600 					  ~SOCK_BUF_LOCK_MASK);
1601 		break;
1602 
1603 	case SO_RESERVE_MEM:
1604 	{
1605 		int delta;
1606 
1607 		if (val < 0) {
1608 			ret = -EINVAL;
1609 			break;
1610 		}
1611 
		/* val is the new absolute reservation; grow or shrink the
		 * current one by the difference.
		 */
1612 		delta = val - sk->sk_reserved_mem;
1613 		if (delta < 0)
1614 			sock_release_reserved_memory(sk, -delta);
1615 		else
1616 			ret = sock_reserve_memory(sk, delta);
1617 		break;
1618 	}
1619 
1620 	default:
1621 		ret = -ENOPROTOOPT;
1622 		break;
1623 	}
1624 	sockopt_release_sock(sk);
1625 	return ret;
1626 }
1627 
1628 int sock_setsockopt(struct socket *sock, int level, int optname,
1629 		    sockptr_t optval, unsigned int optlen)
1630 {
1631 	return sk_setsockopt(sock->sk, level, optname,
1632 			     optval, optlen);
1633 }
1634 EXPORT_SYMBOL(sock_setsockopt);
1635 
1636 static const struct cred *sk_get_peer_cred(struct sock *sk)
1637 {
1638 	const struct cred *cred;
1639 
1640 	spin_lock(&sk->sk_peer_lock);
1641 	cred = get_cred(sk->sk_peer_cred);
1642 	spin_unlock(&sk->sk_peer_lock);
1643 
1644 	return cred;
1645 }
1646 
1647 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1648 			  struct ucred *ucred)
1649 {
1650 	ucred->pid = pid_vnr(pid);
1651 	ucred->uid = ucred->gid = -1;
1652 	if (cred) {
1653 		struct user_namespace *current_ns = current_user_ns();
1654 
1655 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1656 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1657 	}
1658 }
1659 
1660 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1661 {
1662 	struct user_namespace *user_ns = current_user_ns();
1663 	int i;
1664 
1665 	for (i = 0; i < src->ngroups; i++) {
1666 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1667 
1668 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1669 			return -EFAULT;
1670 	}
1671 
1672 	return 0;
1673 }
1674 
/*
 *	Generic socket level getsockopt() handler.
 *
 *	@sk:      socket to query
 *	@level:   not examined by this function
 *	@optname: SO_* option to read
 *	@optval:  user or kernel buffer receiving the value
 *	@optlen:  in: size of @optval as an int; out: bytes written
 *
 *	Most options store their result in the local union "v" and set
 *	"lv" to the number of valid bytes; the common tail then copies
 *	min(len, lv) bytes to @optval and writes that length back through
 *	@optlen.  Options with variable-size or larger results copy to
 *	@optval themselves and jump to "lenout", or return directly.
 *
 *	Returns 0 on success or a negative errno.
 */
1675 int sk_getsockopt(struct sock *sk, int level, int optname,
1676 		  sockptr_t optval, sockptr_t optlen)
1677 {
1678 	struct socket *sock = sk->sk_socket;
1679 
	/* Scratch space for every fixed-size result type. */
1680 	union {
1681 		int val;
1682 		u64 val64;
1683 		unsigned long ulval;
1684 		struct linger ling;
1685 		struct old_timeval32 tm32;
1686 		struct __kernel_old_timeval tm;
1687 		struct  __kernel_sock_timeval stm;
1688 		struct sock_txtime txtime;
1689 		struct so_timestamping timestamping;
1690 	} v;
1691 
	/* lv: size of the result held in v; defaults to an int. */
1692 	int lv = sizeof(int);
1693 	int len;
1694 
1695 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1696 		return -EFAULT;
1697 	if (len < 0)
1698 		return -EINVAL;
1699 
	/* Zeroed up front so options that set nothing report 0. */
1700 	memset(&v, 0, sizeof(v));
1701 
1702 	switch (optname) {
1703 	case SO_DEBUG:
1704 		v.val = sock_flag(sk, SOCK_DBG);
1705 		break;
1706 
1707 	case SO_DONTROUTE:
1708 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1709 		break;
1710 
1711 	case SO_BROADCAST:
1712 		v.val = sock_flag(sk, SOCK_BROADCAST);
1713 		break;
1714 
1715 	case SO_SNDBUF:
1716 		v.val = READ_ONCE(sk->sk_sndbuf);
1717 		break;
1718 
1719 	case SO_RCVBUF:
1720 		v.val = READ_ONCE(sk->sk_rcvbuf);
1721 		break;
1722 
1723 	case SO_REUSEADDR:
1724 		v.val = sk->sk_reuse;
1725 		break;
1726 
1727 	case SO_REUSEPORT:
1728 		v.val = sk->sk_reuseport;
1729 		break;
1730 
1731 	case SO_KEEPALIVE:
1732 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1733 		break;
1734 
1735 	case SO_TYPE:
1736 		v.val = sk->sk_type;
1737 		break;
1738 
1739 	case SO_PROTOCOL:
1740 		v.val = sk->sk_protocol;
1741 		break;
1742 
1743 	case SO_DOMAIN:
1744 		v.val = sk->sk_family;
1745 		break;
1746 
1747 	case SO_ERROR:
		/* Report the negated sock_error() result; if that is 0,
		 * fetch and clear sk_err_soft instead.
		 */
1748 		v.val = -sock_error(sk);
1749 		if (v.val == 0)
1750 			v.val = xchg(&sk->sk_err_soft, 0);
1751 		break;
1752 
1753 	case SO_OOBINLINE:
1754 		v.val = sock_flag(sk, SOCK_URGINLINE);
1755 		break;
1756 
1757 	case SO_NO_CHECK:
1758 		v.val = sk->sk_no_check_tx;
1759 		break;
1760 
1761 	case SO_PRIORITY:
1762 		v.val = READ_ONCE(sk->sk_priority);
1763 		break;
1764 
1765 	case SO_LINGER:
		/* sk_lingertime is divided by HZ to report seconds. */
1766 		lv		= sizeof(v.ling);
1767 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1768 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1769 		break;
1770 
	/* Nothing stored: reads back as 0 from the zeroed union. */
1771 	case SO_BSDCOMPAT:
1772 		break;
1773 
1774 	case SO_TIMESTAMP_OLD:
1775 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1776 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1777 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1778 		break;
1779 
1780 	case SO_TIMESTAMPNS_OLD:
1781 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1782 		break;
1783 
1784 	case SO_TIMESTAMP_NEW:
1785 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1786 		break;
1787 
1788 	case SO_TIMESTAMPNS_NEW:
1789 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1790 		break;
1791 
1792 	case SO_TIMESTAMPING_OLD:
1793 	case SO_TIMESTAMPING_NEW:
1794 		lv = sizeof(v.timestamping);
1795 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1796 		 * returning the flags when they were set through the same option.
1797 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1798 		 */
1799 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1800 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1801 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1802 		}
1803 		break;
1804 
1805 	case SO_RCVTIMEO_OLD:
1806 	case SO_RCVTIMEO_NEW:
		/* sock_get_timeout() fills v and its return value
		 * is used as the result size.
		 */
1807 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1808 				      SO_RCVTIMEO_OLD == optname);
1809 		break;
1810 
1811 	case SO_SNDTIMEO_OLD:
1812 	case SO_SNDTIMEO_NEW:
1813 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1814 				      SO_SNDTIMEO_OLD == optname);
1815 		break;
1816 
1817 	case SO_RCVLOWAT:
1818 		v.val = READ_ONCE(sk->sk_rcvlowat);
1819 		break;
1820 
	/* Always reported as 1. */
1821 	case SO_SNDLOWAT:
1822 		v.val = 1;
1823 		break;
1824 
1825 	case SO_PASSCRED:
1826 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1827 		break;
1828 
1829 	case SO_PASSPIDFD:
1830 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1831 		break;
1832 
1833 	case SO_PEERCRED:
1834 	{
		/* Built under sk_peer_lock, then copied out directly,
		 * truncated to at most sizeof(struct ucred).
		 */
1835 		struct ucred peercred;
1836 		if (len > sizeof(peercred))
1837 			len = sizeof(peercred);
1838 
1839 		spin_lock(&sk->sk_peer_lock);
1840 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1841 		spin_unlock(&sk->sk_peer_lock);
1842 
1843 		if (copy_to_sockptr(optval, &peercred, len))
1844 			return -EFAULT;
1845 		goto lenout;
1846 	}
1847 
1848 	case SO_PEERPIDFD:
1849 	{
1850 		struct pid *peer_pid;
1851 		struct file *pidfd_file = NULL;
1852 		int pidfd;
1853 
1854 		if (len > sizeof(pidfd))
1855 			len = sizeof(pidfd);
1856 
		/* Take our own reference on the peer pid under the lock. */
1857 		spin_lock(&sk->sk_peer_lock);
1858 		peer_pid = get_pid(sk->sk_peer_pid);
1859 		spin_unlock(&sk->sk_peer_lock);
1860 
1861 		if (!peer_pid)
1862 			return -ENODATA;
1863 
1864 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1865 		put_pid(peer_pid);
1866 		if (pidfd < 0)
1867 			return pidfd;
1868 
		/* The fd is installed only after both copies succeeded;
		 * on failure the reserved fd and the file are released.
		 */
1869 		if (copy_to_sockptr(optval, &pidfd, len) ||
1870 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1871 			put_unused_fd(pidfd);
1872 			fput(pidfd_file);
1873 
1874 			return -EFAULT;
1875 		}
1876 
1877 		fd_install(pidfd, pidfd_file);
1878 		return 0;
1879 	}
1880 
1881 	case SO_PEERGROUPS:
1882 	{
1883 		const struct cred *cred;
1884 		int ret, n;
1885 
1886 		cred = sk_get_peer_cred(sk);
1887 		if (!cred)
1888 			return -ENODATA;
1889 
		/* Buffer too small: report the required size through
		 * optlen and fail with -ERANGE.
		 */
1890 		n = cred->group_info->ngroups;
1891 		if (len < n * sizeof(gid_t)) {
1892 			len = n * sizeof(gid_t);
1893 			put_cred(cred);
1894 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1895 		}
1896 		len = n * sizeof(gid_t);
1897 
1898 		ret = groups_to_user(optval, cred->group_info);
1899 		put_cred(cred);
1900 		if (ret)
1901 			return ret;
1902 		goto lenout;
1903 	}
1904 
1905 	case SO_PEERNAME:
1906 	{
1907 		struct sockaddr_storage address;
1908 
		/* A getname() failure is reported as -ENOTCONN.  A user
		 * buffer longer than the returned address is rejected;
		 * a shorter one receives a truncated copy.
		 */
1909 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1910 		if (lv < 0)
1911 			return -ENOTCONN;
1912 		if (lv < len)
1913 			return -EINVAL;
1914 		if (copy_to_sockptr(optval, &address, len))
1915 			return -EFAULT;
1916 		goto lenout;
1917 	}
1918 
1919 	/* Dubious BSD thing... Probably nobody even uses it, but
1920 	 * the UNIX standard wants it for whatever reason... -DaveM
1921 	 */
1922 	case SO_ACCEPTCONN:
1923 		v.val = sk->sk_state == TCP_LISTEN;
1924 		break;
1925 
1926 	case SO_PASSSEC:
1927 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1928 		break;
1929 
	/* Fully handled, including the length write-back, by the callee. */
1930 	case SO_PEERSEC:
1931 		return security_socket_getpeersec_stream(sock,
1932 							 optval, optlen, len);
1933 
1934 	case SO_MARK:
1935 		v.val = READ_ONCE(sk->sk_mark);
1936 		break;
1937 
1938 	case SO_RCVMARK:
1939 		v.val = sock_flag(sk, SOCK_RCVMARK);
1940 		break;
1941 
1942 	case SO_RXQ_OVFL:
1943 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1944 		break;
1945 
1946 	case SO_WIFI_STATUS:
1947 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1948 		break;
1949 
1950 	case SO_PEEK_OFF:
		/* Only readable when the ops can also set it. */
1951 		if (!READ_ONCE(sock->ops)->set_peek_off)
1952 			return -EOPNOTSUPP;
1953 
1954 		v.val = READ_ONCE(sk->sk_peek_off);
1955 		break;
1956 	case SO_NOFCS:
1957 		v.val = sock_flag(sk, SOCK_NOFCS);
1958 		break;
1959 
1960 	case SO_BINDTODEVICE:
1961 		return sock_getbindtodevice(sk, optval, optlen, len);
1962 
1963 	case SO_GET_FILTER:
		/* sk_get_filter() writes to optval itself; its non-negative
		 * return value becomes the length reported to the caller.
		 */
1964 		len = sk_get_filter(sk, optval, len);
1965 		if (len < 0)
1966 			return len;
1967 
1968 		goto lenout;
1969 
1970 	case SO_LOCK_FILTER:
1971 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1972 		break;
1973 
1974 	case SO_BPF_EXTENSIONS:
1975 		v.val = bpf_tell_extensions();
1976 		break;
1977 
1978 	case SO_SELECT_ERR_QUEUE:
1979 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1980 		break;
1981 
1982 #ifdef CONFIG_NET_RX_BUSY_POLL
1983 	case SO_BUSY_POLL:
1984 		v.val = READ_ONCE(sk->sk_ll_usec);
1985 		break;
1986 	case SO_PREFER_BUSY_POLL:
1987 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1988 		break;
1989 #endif
1990 
1991 	case SO_MAX_PACING_RATE:
1992 		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
		/* Return the full unsigned long when it is wider than an
		 * int and the buffer is large enough; otherwise return an
		 * int-sized value saturated at ~0U.
		 */
1993 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1994 			lv = sizeof(v.ulval);
1995 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1996 		} else {
1997 			/* 32bit version */
1998 			v.val = min_t(unsigned long, ~0U,
1999 				      READ_ONCE(sk->sk_max_pacing_rate));
2000 		}
2001 		break;
2002 
2003 	case SO_INCOMING_CPU:
2004 		v.val = READ_ONCE(sk->sk_incoming_cpu);
2005 		break;
2006 
2007 	case SO_MEMINFO:
2008 	{
2009 		u32 meminfo[SK_MEMINFO_VARS];
2010 
2011 		sk_get_meminfo(sk, meminfo);
2012 
		/* Copied out directly, truncated to the caller's buffer. */
2013 		len = min_t(unsigned int, len, sizeof(meminfo));
2014 		if (copy_to_sockptr(optval, &meminfo, len))
2015 			return -EFAULT;
2016 
2017 		goto lenout;
2018 	}
2019 
2020 #ifdef CONFIG_NET_RX_BUSY_POLL
2021 	case SO_INCOMING_NAPI_ID:
2022 		v.val = READ_ONCE(sk->sk_napi_id);
2023 
2024 		/* aggregate non-NAPI IDs down to 0 */
2025 		if (v.val < MIN_NAPI_ID)
2026 			v.val = 0;
2027 
2028 		break;
2029 #endif
2030 
2031 	case SO_COOKIE:
		/* 64-bit result: the buffer must hold at least a u64. */
2032 		lv = sizeof(u64);
2033 		if (len < lv)
2034 			return -EINVAL;
2035 		v.val64 = sock_gen_cookie(sk);
2036 		break;
2037 
2038 	case SO_ZEROCOPY:
2039 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
2040 		break;
2041 
2042 	case SO_TXTIME:
		/* Rebuild struct sock_txtime from the per-socket fields. */
2043 		lv = sizeof(v.txtime);
2044 		v.txtime.clockid = sk->sk_clockid;
2045 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2046 				  SOF_TXTIME_DEADLINE_MODE : 0;
2047 		v.txtime.flags |= sk->sk_txtime_report_errors ?
2048 				  SOF_TXTIME_REPORT_ERRORS : 0;
2049 		break;
2050 
2051 	case SO_BINDTOIFINDEX:
2052 		v.val = READ_ONCE(sk->sk_bound_dev_if);
2053 		break;
2054 
2055 	case SO_NETNS_COOKIE:
		/* Unlike SO_COOKIE, the buffer must be exactly a u64. */
2056 		lv = sizeof(u64);
2057 		if (len != lv)
2058 			return -EINVAL;
2059 		v.val64 = sock_net(sk)->net_cookie;
2060 		break;
2061 
2062 	case SO_BUF_LOCK:
2063 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2064 		break;
2065 
2066 	case SO_RESERVE_MEM:
2067 		v.val = READ_ONCE(sk->sk_reserved_mem);
2068 		break;
2069 
2070 	case SO_TXREHASH:
2071 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2072 		v.val = READ_ONCE(sk->sk_txrehash);
2073 		break;
2074 
2075 	default:
2076 		/* We implement the SO_SNDLOWAT etc to not be settable
2077 		 * (1003.1g 7).
2078 		 */
2079 		return -ENOPROTOOPT;
2080 	}
2081 
	/* Common tail: copy at most lv bytes of v, then report how many
	 * bytes were written through optlen.
	 */
2082 	if (len > lv)
2083 		len = lv;
2084 	if (copy_to_sockptr(optval, &v, len))
2085 		return -EFAULT;
2086 lenout:
2087 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2088 		return -EFAULT;
2089 	return 0;
2090 }
2091 
2092 /*
2093  * Initialize an sk_lock.
2094  *
2095  * (We also register the sk_lock with the lock validator.)
2096  */
2097 static inline void sock_lock_init(struct sock *sk)
2098 {
2099 	if (sk->sk_kern_sock)
2100 		sock_lock_init_class_and_name(
2101 			sk,
2102 			af_family_kern_slock_key_strings[sk->sk_family],
2103 			af_family_kern_slock_keys + sk->sk_family,
2104 			af_family_kern_key_strings[sk->sk_family],
2105 			af_family_kern_keys + sk->sk_family);
2106 	else
2107 		sock_lock_init_class_and_name(
2108 			sk,
2109 			af_family_slock_key_strings[sk->sk_family],
2110 			af_family_slock_keys + sk->sk_family,
2111 			af_family_key_strings[sk->sk_family],
2112 			af_family_keys + sk->sk_family);
2113 }
2114 
2115 /*
2116  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2117  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2118  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2119  */
2120 static void sock_copy(struct sock *nsk, const struct sock *osk)
2121 {
2122 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2123 #ifdef CONFIG_SECURITY_NETWORK
2124 	void *sptr = nsk->sk_security;
2125 #endif
2126 
2127 	/* If we move sk_tx_queue_mapping out of the private section,
2128 	 * we must check if sk_tx_queue_clear() is called after
2129 	 * sock_copy() in sk_clone_lock().
2130 	 */
2131 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2132 		     offsetof(struct sock, sk_dontcopy_begin) ||
2133 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2134 		     offsetof(struct sock, sk_dontcopy_end));
2135 
2136 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2137 
2138 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2139 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2140 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2141 
2142 #ifdef CONFIG_SECURITY_NETWORK
2143 	nsk->sk_security = sptr;
2144 	security_sk_clone(osk, nsk);
2145 #endif
2146 }
2147 
2148 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2149 		int family)
2150 {
2151 	struct sock *sk;
2152 	struct kmem_cache *slab;
2153 
2154 	slab = prot->slab;
2155 	if (slab != NULL) {
2156 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2157 		if (!sk)
2158 			return sk;
2159 		if (want_init_on_alloc(priority))
2160 			sk_prot_clear_nulls(sk, prot->obj_size);
2161 	} else
2162 		sk = kmalloc(prot->obj_size, priority);
2163 
2164 	if (sk != NULL) {
2165 		if (security_sk_alloc(sk, family, priority))
2166 			goto out_free;
2167 
2168 		if (!try_module_get(prot->owner))
2169 			goto out_free_sec;
2170 	}
2171 
2172 	return sk;
2173 
2174 out_free_sec:
2175 	security_sk_free(sk);
2176 out_free:
2177 	if (slab != NULL)
2178 		kmem_cache_free(slab, sk);
2179 	else
2180 		kfree(sk);
2181 	return NULL;
2182 }
2183 
2184 static void sk_prot_free(struct proto *prot, struct sock *sk)
2185 {
2186 	struct kmem_cache *slab;
2187 	struct module *owner;
2188 
2189 	owner = prot->owner;
2190 	slab = prot->slab;
2191 
2192 	cgroup_sk_free(&sk->sk_cgrp_data);
2193 	mem_cgroup_sk_free(sk);
2194 	security_sk_free(sk);
2195 	if (slab != NULL)
2196 		kmem_cache_free(slab, sk);
2197 	else
2198 		kfree(sk);
2199 	module_put(owner);
2200 }
2201 
2202 /**
2203  *	sk_alloc - All socket objects are allocated here
2204  *	@net: the applicable net namespace
2205  *	@family: protocol family
2206  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2207  *	@prot: struct proto associated with this new sock instance
2208  *	@kern: is this to be a kernel socket?
2209  */
2210 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2211 		      struct proto *prot, int kern)
2212 {
2213 	struct sock *sk;
2214 
2215 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2216 	if (sk) {
2217 		sk->sk_family = family;
2218 		/*
2219 		 * See comment in struct sock definition to understand
2220 		 * why we need sk_prot_creator -acme
2221 		 */
2222 		sk->sk_prot = sk->sk_prot_creator = prot;
2223 		sk->sk_kern_sock = kern;
2224 		sock_lock_init(sk);
2225 		sk->sk_net_refcnt = kern ? 0 : 1;
2226 		if (likely(sk->sk_net_refcnt)) {
2227 			get_net_track(net, &sk->ns_tracker, priority);
2228 			sock_inuse_add(net, 1);
2229 		} else {
2230 			__netns_tracker_alloc(net, &sk->ns_tracker,
2231 					      false, priority);
2232 		}
2233 
2234 		sock_net_set(sk, net);
2235 		refcount_set(&sk->sk_wmem_alloc, 1);
2236 
2237 		mem_cgroup_sk_alloc(sk);
2238 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2239 		sock_update_classid(&sk->sk_cgrp_data);
2240 		sock_update_netprioidx(&sk->sk_cgrp_data);
2241 		sk_tx_queue_clear(sk);
2242 	}
2243 
2244 	return sk;
2245 }
2246 EXPORT_SYMBOL(sk_alloc);
2247 
/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 *
 * Final teardown of a sock, reached either directly from sk_destruct()
 * or as a call_rcu() callback. @head is the sk_rcu member embedded in
 * the sock being destroyed.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	/* Run the per-socket destructor first, if one is installed. */
	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	/* Detach and uncharge any attached socket filter. */
	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

#ifdef CONFIG_BPF_SYSCALL
	bpf_sk_storage_free(sk);
#endif

	/* Option memory should be fully released by now; report if not. */
	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	/* Drop the cached page fragment, if any. */
	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
	put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);

	/* Mirror of the netns accounting done at allocation/clone time. */
	if (likely(sk->sk_net_refcnt))
		put_net_track(sock_net(sk), &sk->ns_tracker);
	else
		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);

	/* Free through the proto that created the sock (sk_prot_creator). */
	sk_prot_free(sk->sk_prot_creator, sk);
}
2292 
2293 void sk_destruct(struct sock *sk)
2294 {
2295 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2296 
2297 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2298 		reuseport_detach_sock(sk);
2299 		use_call_rcu = true;
2300 	}
2301 
2302 	if (use_call_rcu)
2303 		call_rcu(&sk->sk_rcu, __sk_destruct);
2304 	else
2305 		__sk_destruct(&sk->sk_rcu);
2306 }
2307 
2308 static void __sk_free(struct sock *sk)
2309 {
2310 	if (likely(sk->sk_net_refcnt))
2311 		sock_inuse_add(sock_net(sk), -1);
2312 
2313 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2314 		sock_diag_broadcast_destroy(sk);
2315 	else
2316 		sk_destruct(sk);
2317 }
2318 
2319 void sk_free(struct sock *sk)
2320 {
2321 	/*
2322 	 * We subtract one from sk_wmem_alloc and can know if
2323 	 * some packets are still in some tx queue.
2324 	 * If not null, sock_wfree() will call __sk_free(sk) later
2325 	 */
2326 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2327 		__sk_free(sk);
2328 }
2329 EXPORT_SYMBOL(sk_free);
2330 
2331 static void sk_init_common(struct sock *sk)
2332 {
2333 	skb_queue_head_init(&sk->sk_receive_queue);
2334 	skb_queue_head_init(&sk->sk_write_queue);
2335 	skb_queue_head_init(&sk->sk_error_queue);
2336 
2337 	rwlock_init(&sk->sk_callback_lock);
2338 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2339 			af_rlock_keys + sk->sk_family,
2340 			af_family_rlock_key_strings[sk->sk_family]);
2341 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2342 			af_wlock_keys + sk->sk_family,
2343 			af_family_wlock_key_strings[sk->sk_family]);
2344 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2345 			af_elock_keys + sk->sk_family,
2346 			af_family_elock_key_strings[sk->sk_family]);
2347 	if (sk->sk_kern_sock)
2348 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2349 			af_kern_callback_keys + sk->sk_family,
2350 			af_family_kern_clock_key_strings[sk->sk_family]);
2351 	else
2352 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2353 			af_callback_keys + sk->sk_family,
2354 			af_family_clock_key_strings[sk->sk_family]);
2355 }
2356 
/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 *
 *	Return: the clone, bh-locked and with sk_refcnt set to 2, or %NULL
 *	if allocation, filter charging, xfrm policy cloning or BPF storage
 *	cloning failed. On those internal failures the partially built clone
 *	is unlocked and freed here via sk_free_unlock_clone().
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct proto *prot = READ_ONCE(sk->sk_prot);
	struct sk_filter *filter;
	bool is_charged = true;
	struct sock *newsk;

	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	if (!newsk)
		goto out;

	/* Start from a copy of the parent; the rest resets per-socket state. */
	sock_copy(newsk, sk);

	newsk->sk_prot_creator = prot;

	/* SANITY */
	if (likely(newsk->sk_net_refcnt)) {
		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
		sock_inuse_add(sock_net(newsk), 1);
	} else {
		/* Kernel sockets are not elevating the struct net refcount.
		 * Instead, use a tracker to more easily detect if a layer
		 * is not properly dismantling its kernel sockets at netns
		 * destroy time.
		 */
		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
				      false, priority);
	}
	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);
	/* From here on the clone is locked; every exit path must account for it. */
	bh_lock_sock(newsk);
	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
	newsk->sk_backlog.len = 0;

	atomic_set(&newsk->sk_rmem_alloc, 0);

	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
	refcount_set(&newsk->sk_wmem_alloc, 1);

	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);

	/* Reset routing, memory accounting and send state inherited by the copy. */
	newsk->sk_dst_cache	= NULL;
	newsk->sk_dst_pending_confirm = 0;
	newsk->sk_wmem_queued	= 0;
	newsk->sk_forward_alloc = 0;
	newsk->sk_reserved_mem  = 0;
	atomic_set(&newsk->sk_drops, 0);
	newsk->sk_send_head	= NULL;
	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
	atomic_set(&newsk->sk_zckey, 0);

	sock_reset_flag(newsk, SOCK_DONE);

	/* sk->sk_memcg will be populated at accept() time */
	newsk->sk_memcg = NULL;

	cgroup_sk_clone(&newsk->sk_cgrp_data);

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		/* though it's an empty new sock, the charging may fail
		 * if sysctl_optmem_max was changed between creation of
		 * original socket and cloning
		 */
		is_charged = sk_filter_charge(newsk, filter);
	RCU_INIT_POINTER(newsk->sk_filter, filter);
	rcu_read_unlock();

	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
		/* We need to make sure that we don't uncharge the new
		 * socket if we couldn't charge it in the first place
		 * as otherwise we uncharge the parent's filter.
		 */
		if (!is_charged)
			RCU_INIT_POINTER(newsk->sk_filter, NULL);
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}
	/* The clone does not inherit the parent's reuseport group pointer. */
	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

	if (bpf_sk_storage_clone(sk, newsk)) {
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}

	/* Clear sk_user_data if parent had the pointer tagged
	 * as not suitable for copying when cloning.
	 */
	if (sk_user_data_is_nocopy(newsk))
		newsk->sk_user_data = NULL;

	newsk->sk_err	   = 0;
	newsk->sk_err_soft = 0;
	newsk->sk_priority = 0;
	newsk->sk_incoming_cpu = raw_smp_processor_id();

	/* Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&newsk->sk_refcnt, 2);

	/* The clone is not yet attached to a struct socket or wait queue. */
	sk_set_socket(newsk, NULL);
	sk_tx_queue_clear(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, NULL);

	if (newsk->sk_prot->sockets_allocated)
		sk_sockets_allocated_inc(newsk);

	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
		net_enable_timestamp();
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
2483 
/*
 * Dispose of a clone that is being abandoned: clear its destructor,
 * release the bh lock on it and drop it through sk_free().
 */
void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still raw copy of parent, so invalidate
	 * destructor and make plain sk_free() */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2493 
2494 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2495 {
2496 	bool is_ipv6 = false;
2497 	u32 max_size;
2498 
2499 #if IS_ENABLED(CONFIG_IPV6)
2500 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2501 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2502 #endif
2503 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2504 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2505 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2506 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2507 		max_size = GSO_LEGACY_MAX_SIZE;
2508 
2509 	return max_size - (MAX_TCP_HEADER + 1);
2510 }
2511 
2512 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2513 {
2514 	u32 max_segs = 1;
2515 
2516 	sk->sk_route_caps = dst->dev->features;
2517 	if (sk_is_tcp(sk))
2518 		sk->sk_route_caps |= NETIF_F_GSO;
2519 	if (sk->sk_route_caps & NETIF_F_GSO)
2520 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2521 	if (unlikely(sk->sk_gso_disabled))
2522 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2523 	if (sk_can_gso(sk)) {
2524 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2525 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2526 		} else {
2527 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2528 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2529 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2530 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2531 		}
2532 	}
2533 	sk->sk_gso_max_segs = max_segs;
2534 	sk_dst_set(sk, dst);
2535 }
2536 EXPORT_SYMBOL_GPL(sk_setup_caps);
2537 
2538 /*
2539  *	Simple resource managers for sockets.
2540  */
2541 
2542 
/*
 * Write buffer destructor automatically called from kfree_skb.
 *
 * Gives back the skb's truesize to sk_wmem_alloc, notifies the socket that
 * write space became available (unless SOCK_USE_WRITE_QUEUE is set), and
 * frees the sock if this was the last outstanding reference.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;
	bool free;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/* Path for SOCK_RCU_FREE sockets using the default
		 * write-space callback: the whole truesize is dropped in one
		 * step and the wakeup runs inside an RCU read-side section.
		 */
		if (sock_flag(sk, SOCK_RCU_FREE) &&
		    sk->sk_write_space == sock_def_write_space) {
			rcu_read_lock();
			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
			sock_def_write_space_wfree(sk);
			rcu_read_unlock();
			if (unlikely(free))
				__sk_free(sk);
			return;
		}

		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		/* Only the reference kept above is left to drop. */
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
2580 
2581 /* This variant of sock_wfree() is used by TCP,
2582  * since it sets SOCK_USE_WRITE_QUEUE.
2583  */
2584 void __sock_wfree(struct sk_buff *skb)
2585 {
2586 	struct sock *sk = skb->sk;
2587 
2588 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2589 		__sk_free(sk);
2590 }
2591 
/*
 * Make @sk the owner of @skb on the write side. Any previous owner is
 * dropped first via skb_orphan(). For full sockets the skb's truesize is
 * added to sk_wmem_alloc and sock_wfree() is installed as destructor.
 */
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	/* Non-full sockets take a plain reference, released by sock_edemux(). */
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
2613 
2614 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2615 {
2616 	/* Drivers depend on in-order delivery for crypto offload,
2617 	 * partial orphan breaks out-of-order-OK logic.
2618 	 */
2619 	if (skb_is_decrypted(skb))
2620 		return false;
2621 
2622 	return (skb->destructor == sock_wfree ||
2623 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2624 }
2625 
2626 /* This helper is used by netem, as it can hold packets in its
2627  * delay queue. We want to allow the owner socket to send more
2628  * packets, as if they were already TX completed by a typical driver.
2629  * But we also want to keep skb->sk set because some packet schedulers
2630  * rely on it (sch_fq for example).
2631  */
2632 void skb_orphan_partial(struct sk_buff *skb)
2633 {
2634 	if (skb_is_tcp_pure_ack(skb))
2635 		return;
2636 
2637 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2638 		return;
2639 
2640 	skb_orphan(skb);
2641 }
2642 EXPORT_SYMBOL(skb_orphan_partial);
2643 
2644 /*
2645  * Read buffer destructor automatically called from kfree_skb.
2646  */
2647 void sock_rfree(struct sk_buff *skb)
2648 {
2649 	struct sock *sk = skb->sk;
2650 	unsigned int len = skb->truesize;
2651 
2652 	atomic_sub(len, &sk->sk_rmem_alloc);
2653 	sk_mem_uncharge(sk, len);
2654 }
2655 EXPORT_SYMBOL(sock_rfree);
2656 
/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 *
 * Only drops the socket reference held through skb->sk.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);
2666 
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 *
 * Nothing is done for sockets that are not refcounted. A request socket
 * in TCP_NEW_SYN_RECV state created from a syncookie is freed directly
 * after clearing its listener pointer; everything else is released with
 * sock_gen_put().
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);

		if (req->syncookie) {
			req->rsk_listener = NULL;
			reqsk_free(req);
			return;
		}
	}

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2688 
2689 kuid_t sock_i_uid(struct sock *sk)
2690 {
2691 	kuid_t uid;
2692 
2693 	read_lock_bh(&sk->sk_callback_lock);
2694 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2695 	read_unlock_bh(&sk->sk_callback_lock);
2696 	return uid;
2697 }
2698 EXPORT_SYMBOL(sock_i_uid);
2699 
2700 unsigned long __sock_i_ino(struct sock *sk)
2701 {
2702 	unsigned long ino;
2703 
2704 	read_lock(&sk->sk_callback_lock);
2705 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2706 	read_unlock(&sk->sk_callback_lock);
2707 	return ino;
2708 }
2709 EXPORT_SYMBOL(__sock_i_ino);
2710 
/*
 * Same as __sock_i_ino(), but with bottom halves disabled around the
 * lookup.
 */
unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	local_bh_disable();
	ino = __sock_i_ino(sk);
	local_bh_enable();
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);
2721 
2722 /*
2723  * Allocate a skb from the socket's send buffer.
2724  */
2725 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2726 			     gfp_t priority)
2727 {
2728 	if (force ||
2729 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2730 		struct sk_buff *skb = alloc_skb(size, priority);
2731 
2732 		if (skb) {
2733 			skb_set_owner_w(skb, sk);
2734 			return skb;
2735 		}
2736 	}
2737 	return NULL;
2738 }
2739 EXPORT_SYMBOL(sock_wmalloc);
2740 
2741 static void sock_ofree(struct sk_buff *skb)
2742 {
2743 	struct sock *sk = skb->sk;
2744 
2745 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2746 }
2747 
2748 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2749 			     gfp_t priority)
2750 {
2751 	struct sk_buff *skb;
2752 
2753 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2754 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2755 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2756 		return NULL;
2757 
2758 	skb = alloc_skb(size, priority);
2759 	if (!skb)
2760 		return NULL;
2761 
2762 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2763 	skb->sk = sk;
2764 	skb->destructor = sock_ofree;
2765 	return skb;
2766 }
2767 
2768 /*
2769  * Allocate a memory block from the socket's option memory buffer.
2770  */
2771 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2772 {
2773 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2774 
2775 	if ((unsigned int)size <= optmem_max &&
2776 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2777 		void *mem;
2778 		/* First do the add, to avoid the race if kmalloc
2779 		 * might sleep.
2780 		 */
2781 		atomic_add(size, &sk->sk_omem_alloc);
2782 		mem = kmalloc(size, priority);
2783 		if (mem)
2784 			return mem;
2785 		atomic_sub(size, &sk->sk_omem_alloc);
2786 	}
2787 	return NULL;
2788 }
2789 EXPORT_SYMBOL(sock_kmalloc);
2790 
2791 /* Free an option memory block. Note, we actually want the inline
2792  * here as this allows gcc to detect the nullify and fold away the
2793  * condition entirely.
2794  */
2795 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2796 				  const bool nullify)
2797 {
2798 	if (WARN_ON_ONCE(!mem))
2799 		return;
2800 	if (nullify)
2801 		kfree_sensitive(mem);
2802 	else
2803 		kfree(mem);
2804 	atomic_sub(size, &sk->sk_omem_alloc);
2805 }
2806 
/*
 * Free an option memory block of @size bytes with plain kfree() and
 * return @size to the socket's sk_omem_alloc counter.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);
2812 
/*
 * Like sock_kfree_s(), but releases the block with kfree_sensitive().
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
2818 
2819 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2820    I think, these locks should be removed for datagram sockets.
2821  */
2822 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2823 {
2824 	DEFINE_WAIT(wait);
2825 
2826 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2827 	for (;;) {
2828 		if (!timeo)
2829 			break;
2830 		if (signal_pending(current))
2831 			break;
2832 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2833 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2834 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2835 			break;
2836 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2837 			break;
2838 		if (READ_ONCE(sk->sk_err))
2839 			break;
2840 		timeo = schedule_timeout(timeo);
2841 	}
2842 	finish_wait(sk_sleep(sk), &wait);
2843 	return timeo;
2844 }
2845 
2846 
2847 /*
2848  *	Generic send/receive buffer handlers
2849  */
2850 
2851 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2852 				     unsigned long data_len, int noblock,
2853 				     int *errcode, int max_page_order)
2854 {
2855 	struct sk_buff *skb;
2856 	long timeo;
2857 	int err;
2858 
2859 	timeo = sock_sndtimeo(sk, noblock);
2860 	for (;;) {
2861 		err = sock_error(sk);
2862 		if (err != 0)
2863 			goto failure;
2864 
2865 		err = -EPIPE;
2866 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2867 			goto failure;
2868 
2869 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2870 			break;
2871 
2872 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2873 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2874 		err = -EAGAIN;
2875 		if (!timeo)
2876 			goto failure;
2877 		if (signal_pending(current))
2878 			goto interrupted;
2879 		timeo = sock_wait_for_wmem(sk, timeo);
2880 	}
2881 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2882 				   errcode, sk->sk_allocation);
2883 	if (skb)
2884 		skb_set_owner_w(skb, sk);
2885 	return skb;
2886 
2887 interrupted:
2888 	err = sock_intr_errno(timeo);
2889 failure:
2890 	*errcode = err;
2891 	return NULL;
2892 }
2893 EXPORT_SYMBOL(sock_alloc_send_pskb);
2894 
2895 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2896 		     struct sockcm_cookie *sockc)
2897 {
2898 	u32 tsflags;
2899 
2900 	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
2901 
2902 	switch (cmsg->cmsg_type) {
2903 	case SO_MARK:
2904 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2905 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2906 			return -EPERM;
2907 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2908 			return -EINVAL;
2909 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2910 		break;
2911 	case SO_TIMESTAMPING_OLD:
2912 	case SO_TIMESTAMPING_NEW:
2913 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2914 			return -EINVAL;
2915 
2916 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2917 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2918 			return -EINVAL;
2919 
2920 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2921 		sockc->tsflags |= tsflags;
2922 		break;
2923 	case SCM_TXTIME:
2924 		if (!sock_flag(sk, SOCK_TXTIME))
2925 			return -EINVAL;
2926 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2927 			return -EINVAL;
2928 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2929 		break;
2930 	case SCM_TS_OPT_ID:
2931 		if (sk_is_tcp(sk))
2932 			return -EINVAL;
2933 		tsflags = READ_ONCE(sk->sk_tsflags);
2934 		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
2935 			return -EINVAL;
2936 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2937 			return -EINVAL;
2938 		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
2939 		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
2940 		break;
2941 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2942 	case SCM_RIGHTS:
2943 	case SCM_CREDENTIALS:
2944 		break;
2945 	default:
2946 		return -EINVAL;
2947 	}
2948 	return 0;
2949 }
2950 EXPORT_SYMBOL(__sock_cmsg_send);
2951 
2952 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2953 		   struct sockcm_cookie *sockc)
2954 {
2955 	struct cmsghdr *cmsg;
2956 	int ret;
2957 
2958 	for_each_cmsghdr(cmsg, msg) {
2959 		if (!CMSG_OK(msg, cmsg))
2960 			return -EINVAL;
2961 		if (cmsg->cmsg_level != SOL_SOCKET)
2962 			continue;
2963 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2964 		if (ret)
2965 			return ret;
2966 	}
2967 	return 0;
2968 }
2969 EXPORT_SYMBOL(sock_cmsg_send);
2970 
2971 static void sk_enter_memory_pressure(struct sock *sk)
2972 {
2973 	if (!sk->sk_prot->enter_memory_pressure)
2974 		return;
2975 
2976 	sk->sk_prot->enter_memory_pressure(sk);
2977 }
2978 
2979 static void sk_leave_memory_pressure(struct sock *sk)
2980 {
2981 	if (sk->sk_prot->leave_memory_pressure) {
2982 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2983 				     tcp_leave_memory_pressure, sk);
2984 	} else {
2985 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2986 
2987 		if (memory_pressure && READ_ONCE(*memory_pressure))
2988 			WRITE_ONCE(*memory_pressure, 0);
2989 	}
2990 }
2991 
2992 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2993 
2994 /**
2995  * skb_page_frag_refill - check that a page_frag contains enough room
2996  * @sz: minimum size of the fragment we want to get
2997  * @pfrag: pointer to page_frag
2998  * @gfp: priority for memory allocation
2999  *
3000  * Note: While this allocator tries to use high order pages, there is
3001  * no guarantee that allocations succeed. Therefore, @sz MUST be
3002  * less or equal than PAGE_SIZE.
3003  */
3004 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3005 {
3006 	if (pfrag->page) {
3007 		if (page_ref_count(pfrag->page) == 1) {
3008 			pfrag->offset = 0;
3009 			return true;
3010 		}
3011 		if (pfrag->offset + sz <= pfrag->size)
3012 			return true;
3013 		put_page(pfrag->page);
3014 	}
3015 
3016 	pfrag->offset = 0;
3017 	if (SKB_FRAG_PAGE_ORDER &&
3018 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3019 		/* Avoid direct reclaim but allow kswapd to wake */
3020 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3021 					  __GFP_COMP | __GFP_NOWARN |
3022 					  __GFP_NORETRY,
3023 					  SKB_FRAG_PAGE_ORDER);
3024 		if (likely(pfrag->page)) {
3025 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3026 			return true;
3027 		}
3028 	}
3029 	pfrag->page = alloc_page(gfp);
3030 	if (likely(pfrag->page)) {
3031 		pfrag->size = PAGE_SIZE;
3032 		return true;
3033 	}
3034 	return false;
3035 }
3036 EXPORT_SYMBOL(skb_page_frag_refill);
3037 
3038 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3039 {
3040 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3041 		return true;
3042 
3043 	sk_enter_memory_pressure(sk);
3044 	sk_stream_moderate_sndbuf(sk);
3045 	return false;
3046 }
3047 EXPORT_SYMBOL(sk_page_frag_refill);
3048 
3049 void __lock_sock(struct sock *sk)
3050 	__releases(&sk->sk_lock.slock)
3051 	__acquires(&sk->sk_lock.slock)
3052 {
3053 	DEFINE_WAIT(wait);
3054 
3055 	for (;;) {
3056 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3057 					TASK_UNINTERRUPTIBLE);
3058 		spin_unlock_bh(&sk->sk_lock.slock);
3059 		schedule();
3060 		spin_lock_bh(&sk->sk_lock.slock);
3061 		if (!sock_owned_by_user(sk))
3062 			break;
3063 	}
3064 	finish_wait(&sk->sk_lock.wq, &wait);
3065 }
3066 
3067 void __release_sock(struct sock *sk)
3068 	__releases(&sk->sk_lock.slock)
3069 	__acquires(&sk->sk_lock.slock)
3070 {
3071 	struct sk_buff *skb, *next;
3072 
3073 	while ((skb = sk->sk_backlog.head) != NULL) {
3074 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3075 
3076 		spin_unlock_bh(&sk->sk_lock.slock);
3077 
3078 		do {
3079 			next = skb->next;
3080 			prefetch(next);
3081 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3082 			skb_mark_not_on_list(skb);
3083 			sk_backlog_rcv(sk, skb);
3084 
3085 			cond_resched();
3086 
3087 			skb = next;
3088 		} while (skb != NULL);
3089 
3090 		spin_lock_bh(&sk->sk_lock.slock);
3091 	}
3092 
3093 	/*
3094 	 * Doing the zeroing here guarantee we can not loop forever
3095 	 * while a wild producer attempts to flood us.
3096 	 */
3097 	sk->sk_backlog.len = 0;
3098 }
3099 
/*
 * Take sk_lock.slock, process the backlog through __release_sock(), run
 * the protocol's release_cb hook if it has one, and drop the spinlock.
 */
void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);

	if (sk->sk_prot->release_cb)
		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
				     tcp_release_cb, sk);

	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3112 
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * Return: the result of sk_wait_event() for the condition "the tail of
 * sk_receive_queue is no longer @skb".
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	/* SOCKWQ_ASYNC_WAITDATA stays set only for the duration of the wait. */
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
3137 
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
 *
 *	Unlike the globally shared limits among the sockets under same protocol,
 *	consuming the budget of a memcg won't have direct effect on other ones.
 *	So be optimistic about memcg's tolerance, and leave the callers to decide
 *	whether or not to raise allocated through sk_under_memory_pressure() or
 *	its variants.
 *
 *	Return: 1 if the allocation is accepted, 0 if it is suppressed (in
 *	which case the protocol and memcg charges taken here are undone).
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
	struct proto *prot = sk->sk_prot;
	bool charged = false;
	long allocated;

	/* Charge the protocol counter first; it is rolled back on failure. */
	sk_memory_allocated_add(sk, amt);
	allocated = sk_memory_allocated(sk);

	if (memcg) {
		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
			goto suppress_allocation;
		charged = true;
	}

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* Guarantee minimum buffer size under pressure (either global
	 * or memcg) to make sure features described in RFC 7323 (TCP
	 * Extensions for High Performance) work properly.
	 *
	 * This rule does NOT apply once the global or memcg hard limit
	 * is exceeded, or else a DoS attack could be mounted by spawning
	 * lots of sockets whose usage stays under the minimum buffer size.
	 */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
				return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		/* The following 'average' heuristic is within the
		 * scope of global accounting, so it only makes
		 * sense for global memory pressure.
		 */
		if (!sk_under_global_memory_pressure(sk))
			return 1;

		/* Try to be fair among all the sockets under global
		 * pressure by allowing the ones that are below average
		 * usage to raise.
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
			/* Force charge with __GFP_NOFAIL */
			if (memcg && !charged) {
				mem_cgroup_charge_skmem(memcg, amt,
					gfp_memcg_charge() | __GFP_NOFAIL);
			}
			return 1;
		}
	}

	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	/* Roll back the charges taken at the top of the function. */
	sk_memory_allocated_sub(sk, amt);

	if (charged)
		mem_cgroup_uncharge_skmem(memcg, amt);

	return 0;
}
3256 
3257 /**
3258  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3259  *	@sk: socket
3260  *	@size: memory size to allocate
3261  *	@kind: allocation type
3262  *
3263  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3264  *	rmem allocation. This function assumes that protocols which have
3265  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3266  */
3267 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3268 {
3269 	int ret, amt = sk_mem_pages(size);
3270 
3271 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3272 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3273 	if (!ret)
3274 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3275 	return ret;
3276 }
3277 EXPORT_SYMBOL(__sk_mem_schedule);
3278 
3279 /**
3280  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3281  *	@sk: socket
3282  *	@amount: number of quanta
3283  *
3284  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3285  */
3286 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3287 {
3288 	sk_memory_allocated_sub(sk, amount);
3289 
3290 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3291 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3292 
3293 	if (sk_under_global_memory_pressure(sk) &&
3294 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3295 		sk_leave_memory_pressure(sk);
3296 }
3297 
3298 /**
3299  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3300  *	@sk: socket
3301  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3302  */
3303 void __sk_mem_reclaim(struct sock *sk, int amount)
3304 {
3305 	amount >>= PAGE_SHIFT;
3306 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3307 	__sk_mem_reduce_allocated(sk, amount);
3308 }
3309 EXPORT_SYMBOL(__sk_mem_reclaim);
3310 
3311 int sk_set_peek_off(struct sock *sk, int val)
3312 {
3313 	WRITE_ONCE(sk->sk_peek_off, val);
3314 	return 0;
3315 }
3316 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3317 
3318 /*
3319  * Set of default routines for initialising struct proto_ops when
3320  * the protocol does not support a particular function. In certain
3321  * cases where it makes no sense for a protocol to have a "do nothing"
3322  * function, some default processing is provided.
3323  */
3324 
/* bind() placeholder: unconditionally fails with -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	const int err = -EOPNOTSUPP;

	return err;
}
3329 EXPORT_SYMBOL(sock_no_bind);
3330 
/* connect() placeholder: unconditionally fails with -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	const int err = -EOPNOTSUPP;

	return err;
}
3336 EXPORT_SYMBOL(sock_no_connect);
3337 
/* socketpair() placeholder: unconditionally fails with -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	const int err = -EOPNOTSUPP;

	return err;
}
3342 EXPORT_SYMBOL(sock_no_socketpair);
3343 
/* accept() placeholder: unconditionally fails with -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock,
		   struct proto_accept_arg *arg)
{
	const int err = -EOPNOTSUPP;

	return err;
}
3349 EXPORT_SYMBOL(sock_no_accept);
3350 
/* getname() placeholder: unconditionally fails with -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	const int err = -EOPNOTSUPP;

	return err;
}
3356 EXPORT_SYMBOL(sock_no_getname);
3357 
/* ioctl() placeholder: unconditionally fails with -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	const int err = -EOPNOTSUPP;

	return err;
}
3362 EXPORT_SYMBOL(sock_no_ioctl);
3363 
/* listen() placeholder: unconditionally fails with -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	const int err = -EOPNOTSUPP;

	return err;
}
3368 EXPORT_SYMBOL(sock_no_listen);
3369 
/* shutdown() placeholder: unconditionally fails with -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	const int err = -EOPNOTSUPP;

	return err;
}
3374 EXPORT_SYMBOL(sock_no_shutdown);
3375 
/* sendmsg() placeholder: unconditionally fails with -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	const int err = -EOPNOTSUPP;

	return err;
}
3380 EXPORT_SYMBOL(sock_no_sendmsg);
3381 
/*
 * sendmsg placeholder taking a struct sock rather than a struct socket:
 * unconditionally fails with -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	const int err = -EOPNOTSUPP;

	return err;
}
3386 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3387 
/* recvmsg() placeholder: unconditionally fails with -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	const int err = -EOPNOTSUPP;

	return err;
}
3393 EXPORT_SYMBOL(sock_no_recvmsg);
3394 
/*
 * mmap() placeholder. Unlike the other sock_no_*() helpers this one
 * returns -ENODEV, mirroring the error code of a missing mmap method.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	const int err = -ENODEV;

	return err;
}
3400 EXPORT_SYMBOL(sock_no_mmap);
3401 
3402 /*
3403  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3404  * various sock-based usage counts.
3405  */
3406 void __receive_sock(struct file *file)
3407 {
3408 	struct socket *sock;
3409 
3410 	sock = sock_from_file(file);
3411 	if (sock) {
3412 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3413 		sock_update_classid(&sock->sk->sk_cgrp_data);
3414 	}
3415 }
3416 
3417 /*
3418  *	Default Socket Callbacks
3419  */
3420 
3421 static void sock_def_wakeup(struct sock *sk)
3422 {
3423 	struct socket_wq *wq;
3424 
3425 	rcu_read_lock();
3426 	wq = rcu_dereference(sk->sk_wq);
3427 	if (skwq_has_sleeper(wq))
3428 		wake_up_interruptible_all(&wq->wait);
3429 	rcu_read_unlock();
3430 }
3431 
3432 static void sock_def_error_report(struct sock *sk)
3433 {
3434 	struct socket_wq *wq;
3435 
3436 	rcu_read_lock();
3437 	wq = rcu_dereference(sk->sk_wq);
3438 	if (skwq_has_sleeper(wq))
3439 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3440 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3441 	rcu_read_unlock();
3442 }
3443 
3444 void sock_def_readable(struct sock *sk)
3445 {
3446 	struct socket_wq *wq;
3447 
3448 	trace_sk_data_ready(sk);
3449 
3450 	rcu_read_lock();
3451 	wq = rcu_dereference(sk->sk_wq);
3452 	if (skwq_has_sleeper(wq))
3453 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3454 						EPOLLRDNORM | EPOLLRDBAND);
3455 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3456 	rcu_read_unlock();
3457 }
3458 
3459 static void sock_def_write_space(struct sock *sk)
3460 {
3461 	struct socket_wq *wq;
3462 
3463 	rcu_read_lock();
3464 
3465 	/* Do not wake up a writer until he can make "significant"
3466 	 * progress.  --DaveM
3467 	 */
3468 	if (sock_writeable(sk)) {
3469 		wq = rcu_dereference(sk->sk_wq);
3470 		if (skwq_has_sleeper(wq))
3471 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3472 						EPOLLWRNORM | EPOLLWRBAND);
3473 
3474 		/* Should agree with poll, otherwise some programs break */
3475 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3476 	}
3477 
3478 	rcu_read_unlock();
3479 }
3480 
3481 /* An optimised version of sock_def_write_space(), should only be called
3482  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3483  * ->sk_wmem_alloc.
3484  */
3485 static void sock_def_write_space_wfree(struct sock *sk)
3486 {
3487 	/* Do not wake up a writer until he can make "significant"
3488 	 * progress.  --DaveM
3489 	 */
3490 	if (sock_writeable(sk)) {
3491 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3492 
3493 		/* rely on refcount_sub from sock_wfree() */
3494 		smp_mb__after_atomic();
3495 		if (wq && waitqueue_active(&wq->wait))
3496 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3497 						EPOLLWRNORM | EPOLLWRBAND);
3498 
3499 		/* Should agree with poll, otherwise some programs break */
3500 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3501 	}
3502 }
3503 
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
	/* nothing to release */
}
3507 
3508 void sk_send_sigurg(struct sock *sk)
3509 {
3510 	if (sk->sk_socket && sk->sk_socket->file)
3511 		if (send_sigurg(sk->sk_socket->file))
3512 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3513 }
3514 EXPORT_SYMBOL(sk_send_sigurg);
3515 
/*
 * (Re)arm @timer to fire at @expires. A socket reference is taken only
 * when mod_timer() returns zero; sk_stop_timer() drops a reference in the
 * matching del_timer() case.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
3522 EXPORT_SYMBOL(sk_reset_timer);
3523 
/*
 * Cancel @timer. When del_timer() returns non-zero, the socket reference
 * is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
3529 EXPORT_SYMBOL(sk_stop_timer);
3530 
/*
 * Same as sk_stop_timer() but cancels the timer with del_timer_sync().
 * When that returns non-zero, the socket reference is dropped with
 * __sock_put().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer_sync(timer))
		return;

	__sock_put(sk);
}
3536 EXPORT_SYMBOL(sk_stop_timer_sync);
3537 
/*
 * Initialise the generic part of @sk and, when @sock is non-NULL, link
 * the two together.
 *
 * @sock: owning socket, or NULL for a sock without a struct socket
 * @sk:   sock to initialise
 * @uid:  value stored in sk->sk_uid
 *
 * Buffer sizes and the busy-poll budget are seeded from the sysctl
 * defaults, the callbacks are pointed at the sock_def_*() helpers in this
 * file, and the reference count is set to 1 as the very last
 * visibility-relevant step (see the barrier comment below).
 */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	/* The timer is set up without a handler. */
	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_use_task_frag	=	true;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	/* With a socket: inherit its type, share its wait queue and make
	 * the socket point back at us. Without one there is no wait queue.
	 */
	if (sock) {
		sk->sk_type	=	sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk	=	sk;
	} else {
		RCU_INIT_POINTER(sk->sk_wq, NULL);
	}
	sk->sk_uid	=	uid;

	/* Default callbacks, all defined in this file. */
	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	spin_lock_init(&sk->sk_peer_lock);

	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	/* The stamp seqlock is only initialised on 32-bit builds. */
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
#endif

	/* Pacing rates start at the all-ones value (~0UL). */
	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
3607 EXPORT_SYMBOL(sock_init_data_uid);
3608 
3609 void sock_init_data(struct socket *sock, struct sock *sk)
3610 {
3611 	kuid_t uid = sock ?
3612 		SOCK_INODE(sock)->i_uid :
3613 		make_kuid(sock_net(sk)->user_ns, 0);
3614 
3615 	sock_init_data_uid(sock, sk, uid);
3616 }
3617 EXPORT_SYMBOL(sock_init_data);
3618 
/*
 * Take ownership of the socket lock on behalf of the caller.
 *
 * @sk:       socket to lock
 * @subclass: lockdep subclass passed to mutex_acquire()
 *
 * May sleep (see might_sleep()). Under sk_lock.slock with bottom halves
 * disabled, __lock_sock() is called if the lock is already owned, then
 * the owned flag is set and the spinlock is released again.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
	/* The sk_lock has mutex_lock() semantics here. */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	/* Somebody else owns the lock: go through the slow path first. */
	if (sock_owned_by_user_nocheck(sk))
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock_bh(&sk->sk_lock.slock);
}
3631 EXPORT_SYMBOL(lock_sock_nested);
3632 
/*
 * Give up ownership of the socket lock.
 *
 * Everything below runs under sk_lock.slock with bottom halves disabled:
 * a non-empty backlog is handed to __release_sock(), the protocol's
 * release_cb (if any) is invoked, ownership is released and waiters on
 * sk_lock.wq are woken.
 */
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	/* A non-NULL tail means packets were queued to the backlog. */
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	if (sk->sk_prot->release_cb)
		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
				     tcp_release_cb, sk);

	sock_release_ownership(sk);
	/* Only call wake_up() when somebody is actually queued. */
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
3648 EXPORT_SYMBOL(release_sock);
3649 
3650 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3651 {
3652 	might_sleep();
3653 	spin_lock_bh(&sk->sk_lock.slock);
3654 
3655 	if (!sock_owned_by_user_nocheck(sk)) {
3656 		/*
3657 		 * Fast path return with bottom halves disabled and
3658 		 * sock::sk_lock.slock held.
3659 		 *
3660 		 * The 'mutex' is not contended and holding
3661 		 * sock::sk_lock.slock prevents all other lockers to
3662 		 * proceed so the corresponding unlock_sock_fast() can
3663 		 * avoid the slow path of release_sock() completely and
3664 		 * just release slock.
3665 		 *
3666 		 * From a semantical POV this is equivalent to 'acquiring'
3667 		 * the 'mutex', hence the corresponding lockdep
3668 		 * mutex_release() has to happen in the fast path of
3669 		 * unlock_sock_fast().
3670 		 */
3671 		return false;
3672 	}
3673 
3674 	__lock_sock(sk);
3675 	sk->sk_lock.owned = 1;
3676 	__acquire(&sk->sk_lock.slock);
3677 	spin_unlock_bh(&sk->sk_lock.slock);
3678 	return true;
3679 }
3680 EXPORT_SYMBOL(__lock_sock_fast);
3681 
3682 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3683 		   bool timeval, bool time32)
3684 {
3685 	struct sock *sk = sock->sk;
3686 	struct timespec64 ts;
3687 
3688 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3689 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3690 	if (ts.tv_sec == -1)
3691 		return -ENOENT;
3692 	if (ts.tv_sec == 0) {
3693 		ktime_t kt = ktime_get_real();
3694 		sock_write_timestamp(sk, kt);
3695 		ts = ktime_to_timespec64(kt);
3696 	}
3697 
3698 	if (timeval)
3699 		ts.tv_nsec /= 1000;
3700 
3701 #ifdef CONFIG_COMPAT_32BIT_TIME
3702 	if (time32)
3703 		return put_old_timespec32(&ts, userstamp);
3704 #endif
3705 #ifdef CONFIG_SPARC64
3706 	/* beware of padding in sparc64 timeval */
3707 	if (timeval && !in_compat_syscall()) {
3708 		struct __kernel_old_timeval __user tv = {
3709 			.tv_sec = ts.tv_sec,
3710 			.tv_usec = ts.tv_nsec,
3711 		};
3712 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3713 			return -EFAULT;
3714 		return 0;
3715 	}
3716 #endif
3717 	return put_timespec64(&ts, userstamp);
3718 }
3719 EXPORT_SYMBOL(sock_gettstamp);
3720 
3721 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3722 {
3723 	if (!sock_flag(sk, flag)) {
3724 		unsigned long previous_flags = sk->sk_flags;
3725 
3726 		sock_set_flag(sk, flag);
3727 		/*
3728 		 * we just set one of the two flags which require net
3729 		 * time stamping, but time stamping might have been on
3730 		 * already because of the other one
3731 		 */
3732 		if (sock_needs_netstamp(sk) &&
3733 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3734 			net_enable_timestamp();
3735 	}
3736 }
3737 
3738 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3739 		       int level, int type)
3740 {
3741 	struct sock_exterr_skb *serr;
3742 	struct sk_buff *skb;
3743 	int copied, err;
3744 
3745 	err = -EAGAIN;
3746 	skb = sock_dequeue_err_skb(sk);
3747 	if (skb == NULL)
3748 		goto out;
3749 
3750 	copied = skb->len;
3751 	if (copied > len) {
3752 		msg->msg_flags |= MSG_TRUNC;
3753 		copied = len;
3754 	}
3755 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3756 	if (err)
3757 		goto out_free_skb;
3758 
3759 	sock_recv_timestamp(msg, sk, skb);
3760 
3761 	serr = SKB_EXT_ERR(skb);
3762 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3763 
3764 	msg->msg_flags |= MSG_ERRQUEUE;
3765 	err = copied;
3766 
3767 out_free_skb:
3768 	kfree_skb(skb);
3769 out:
3770 	return err;
3771 }
3772 EXPORT_SYMBOL(sock_recv_errqueue);
3773 
3774 /*
 *	Get a socket option on a socket.
3776  *
3777  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3778  *	asynchronous errors should be reported by getsockopt. We assume
3779  *	this means if you specify SO_ERROR (otherwise what is the point of it).
3780  */
3781 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3782 			   char __user *optval, int __user *optlen)
3783 {
3784 	struct sock *sk = sock->sk;
3785 
3786 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3787 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3788 }
3789 EXPORT_SYMBOL(sock_common_getsockopt);
3790 
3791 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3792 			int flags)
3793 {
3794 	struct sock *sk = sock->sk;
3795 	int addr_len = 0;
3796 	int err;
3797 
3798 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3799 	if (err >= 0)
3800 		msg->msg_namelen = addr_len;
3801 	return err;
3802 }
3803 EXPORT_SYMBOL(sock_common_recvmsg);
3804 
3805 /*
3806  *	Set socket options on an inet socket.
3807  */
3808 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3809 			   sockptr_t optval, unsigned int optlen)
3810 {
3811 	struct sock *sk = sock->sk;
3812 
3813 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3814 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3815 }
3816 EXPORT_SYMBOL(sock_common_setsockopt);
3817 
/*
 * Common release path for a sock: run the protocol's destroy hook (if
 * any), unhash the sock, detach it from its struct socket, orphan it,
 * free its xfrm policy and finally drop a reference with sock_put().
 */
void sk_common_release(struct sock *sk)
{
	/* The destroy hook is optional. */
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to socket. But net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/* Clear the socket's back-pointer to this sock. */
	if (sk->sk_socket)
		sk->sk_socket->sk = NULL;

	/*
	 * In this point socket cannot receive new packets, but it is possible
	 * that some packets are in flight because some CPU runs receiver and
	 * did hash table lookup before we unhashed socket. They will achieve
	 * receive queue and will be purged by socket destructor.
	 *
	 * Also we still have packets pending on receive queue and probably,
	 * our own packets waiting in device queues. sock_destroy will drain
	 * receive queue, but transmitted packets will delay socket destruction
	 * until the last reference will be released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sock_put(sk);
}
3854 EXPORT_SYMBOL(sk_common_release);
3855 
3856 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3857 {
3858 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3859 
3860 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3861 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3862 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3863 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3864 	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3865 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3866 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3867 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3868 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3869 }
3870 
3871 #ifdef CONFIG_PROC_FS
/* Bitmap of inuse_idx slots: assign_proto_idx() sets the bit of the index
 * it hands out and release_proto_idx() clears it again.
 */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3873 
3874 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3875 {
3876 	int cpu, idx = prot->inuse_idx;
3877 	int res = 0;
3878 
3879 	for_each_possible_cpu(cpu)
3880 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3881 
3882 	return res >= 0 ? res : 0;
3883 }
3884 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3885 
3886 int sock_inuse_get(struct net *net)
3887 {
3888 	int cpu, res = 0;
3889 
3890 	for_each_possible_cpu(cpu)
3891 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3892 
3893 	return res;
3894 }
3895 
3896 EXPORT_SYMBOL_GPL(sock_inuse_get);
3897 
3898 static int __net_init sock_inuse_init_net(struct net *net)
3899 {
3900 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3901 	if (net->core.prot_inuse == NULL)
3902 		return -ENOMEM;
3903 	return 0;
3904 }
3905 
3906 static void __net_exit sock_inuse_exit_net(struct net *net)
3907 {
3908 	free_percpu(net->core.prot_inuse);
3909 }
3910 
/* Per-netns hooks that allocate and free the prot_inuse counters. */
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
3915 
3916 static __init int net_inuse_init(void)
3917 {
3918 	if (register_pernet_subsys(&net_inuse_ops))
3919 		panic("Cannot initialize net inuse counters");
3920 
3921 	return 0;
3922 }
3923 
3924 core_initcall(net_inuse_init);
3925 
3926 static int assign_proto_idx(struct proto *prot)
3927 {
3928 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3929 
3930 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3931 		pr_err("PROTO_INUSE_NR exhausted\n");
3932 		return -ENOSPC;
3933 	}
3934 
3935 	set_bit(prot->inuse_idx, proto_inuse_idx);
3936 	return 0;
3937 }
3938 
3939 static void release_proto_idx(struct proto *prot)
3940 {
3941 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3942 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3943 }
3944 #else
/* !CONFIG_PROC_FS: there is no index to assign; always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
	const int ok = 0;

	return ok;
}
3949 
/* !CONFIG_PROC_FS: there is no index to release. */
static inline void release_proto_idx(struct proto *prot)
{
	/* nothing to do */
}
3953 
3954 #endif
3955 
3956 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3957 {
3958 	if (!twsk_prot)
3959 		return;
3960 	kfree(twsk_prot->twsk_slab_name);
3961 	twsk_prot->twsk_slab_name = NULL;
3962 	kmem_cache_destroy(twsk_prot->twsk_slab);
3963 	twsk_prot->twsk_slab = NULL;
3964 }
3965 
3966 static int tw_prot_init(const struct proto *prot)
3967 {
3968 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3969 
3970 	if (!twsk_prot)
3971 		return 0;
3972 
3973 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3974 					      prot->name);
3975 	if (!twsk_prot->twsk_slab_name)
3976 		return -ENOMEM;
3977 
3978 	twsk_prot->twsk_slab =
3979 		kmem_cache_create(twsk_prot->twsk_slab_name,
3980 				  twsk_prot->twsk_obj_size, 0,
3981 				  SLAB_ACCOUNT | prot->slab_flags,
3982 				  NULL);
3983 	if (!twsk_prot->twsk_slab) {
3984 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3985 			prot->name);
3986 		return -ENOMEM;
3987 	}
3988 
3989 	return 0;
3990 }
3991 
3992 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3993 {
3994 	if (!rsk_prot)
3995 		return;
3996 	kfree(rsk_prot->slab_name);
3997 	rsk_prot->slab_name = NULL;
3998 	kmem_cache_destroy(rsk_prot->slab);
3999 	rsk_prot->slab = NULL;
4000 }
4001 
4002 static int req_prot_init(const struct proto *prot)
4003 {
4004 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
4005 
4006 	if (!rsk_prot)
4007 		return 0;
4008 
4009 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4010 					prot->name);
4011 	if (!rsk_prot->slab_name)
4012 		return -ENOMEM;
4013 
4014 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4015 					   rsk_prot->obj_size, 0,
4016 					   SLAB_ACCOUNT | prot->slab_flags,
4017 					   NULL);
4018 
4019 	if (!rsk_prot->slab) {
4020 		pr_crit("%s: Can't create request sock SLAB cache!\n",
4021 			prot->name);
4022 		return -ENOMEM;
4023 	}
4024 	return 0;
4025 }
4026 
/*
 * Register @prot on the global proto_list.
 *
 * @prot:       protocol to register
 * @alloc_slab: when non-zero, also create the sock slab cache plus the
 *              request-sock and timewait-sock caches
 *
 * Returns 0 on success; -EINVAL when memory accounting is requested
 * without sysctl_mem or per_cpu_fw_alloc; -ENOBUFS when a slab cache
 * cannot be set up; or the error from assign_proto_idx(). On failure all
 * caches created here are destroyed again.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
	/* Default error for the slab-creation failures below. */
	int ret = -ENOBUFS;

	if (prot->memory_allocated && !prot->sysctl_mem) {
		pr_err("%s: missing sysctl_mem\n", prot->name);
		return -EINVAL;
	}
	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
		return -EINVAL;
	}
	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		/* A failed init may have allocated its slab name already,
		 * so each failure jumps to the label that runs the matching
		 * cleanup as well.
		 */
		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	/* Index assignment and list insertion happen under the mutex. */
	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

	/* Error unwinding; the labels fall through into each other. */
out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
4083 EXPORT_SYMBOL(proto_register);
4084 
/*
 * Undo proto_register(): release the inuse index and unlink @prot from
 * proto_list under proto_list_mutex, then destroy the sock slab cache and
 * the request-sock / timewait-sock caches.
 */
void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	/* Cache teardown is done outside the mutex. */
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	/* Both cleanup helpers tolerate a NULL ops pointer. */
	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
4098 EXPORT_SYMBOL(proto_unregister);
4099 
4100 int sock_load_diag_module(int family, int protocol)
4101 {
4102 	if (!protocol) {
4103 		if (!sock_is_registered(family))
4104 			return -ENOENT;
4105 
4106 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4107 				      NETLINK_SOCK_DIAG, family);
4108 	}
4109 
4110 #ifdef CONFIG_INET
4111 	if (family == AF_INET &&
4112 	    protocol != IPPROTO_RAW &&
4113 	    protocol < MAX_INET_PROTOS &&
4114 	    !rcu_access_pointer(inet_protos[protocol]))
4115 		return -ENOENT;
4116 #endif
4117 
4118 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4119 			      NETLINK_SOCK_DIAG, family, protocol);
4120 }
4121 EXPORT_SYMBOL(sock_load_diag_module);
4122 
4123 #ifdef CONFIG_PROC_FS
4124 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4125 	__acquires(proto_list_mutex)
4126 {
4127 	mutex_lock(&proto_list_mutex);
4128 	return seq_list_start_head(&proto_list, *pos);
4129 }
4130 
4131 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4132 {
4133 	return seq_list_next(v, &proto_list, pos);
4134 }
4135 
4136 static void proto_seq_stop(struct seq_file *seq, void *v)
4137 	__releases(proto_list_mutex)
4138 {
4139 	mutex_unlock(&proto_list_mutex);
4140 }
4141 
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
4146 static long sock_prot_memory_allocated(struct proto *proto)
4147 {
4148 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4149 }
4150 
4151 static const char *sock_prot_memory_pressure(struct proto *proto)
4152 {
4153 	return proto->memory_pressure != NULL ?
4154 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4155 }
4156 
/*
 * Emit one /proc "protocols" row for @proto: name, object size, sockets
 * in use, memory allocated, memory pressure, max header, slab yes/no,
 * owning module, followed by one y/n column per protocol method. The
 * method columns are positional and must stay in the order of the header
 * printed by proto_seq_show().
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   /* cl co di ac io in de sh ss gs se re bi br ha uh gp em */
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
4189 
4190 static int proto_seq_show(struct seq_file *seq, void *v)
4191 {
4192 	if (v == &proto_list)
4193 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4194 			   "protocol",
4195 			   "size",
4196 			   "sockets",
4197 			   "memory",
4198 			   "press",
4199 			   "maxhdr",
4200 			   "slab",
4201 			   "module",
4202 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4203 	else
4204 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4205 	return 0;
4206 }
4207 
/* seq_file iterator callbacks, registered for the "protocols" proc entry
 * in proto_init_net().
 */
static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};
4214 
4215 static __net_init int proto_init_net(struct net *net)
4216 {
4217 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4218 			sizeof(struct seq_net_private)))
4219 		return -ENOMEM;
4220 
4221 	return 0;
4222 }
4223 
4224 static __net_exit void proto_exit_net(struct net *net)
4225 {
4226 	remove_proc_entry("protocols", net->proc_net);
4227 }
4228 
4229 
/* Per-netns hooks that create and remove the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};
4234 
4235 static int __init proto_init(void)
4236 {
4237 	return register_pernet_subsys(&proto_net_ops);
4238 }
4239 
4240 subsys_initcall(proto_init);
4241 
4242 #endif /* PROC_FS */
4243 
4244 #ifdef CONFIG_NET_RX_BUSY_POLL
4245 bool sk_busy_loop_end(void *p, unsigned long start_time)
4246 {
4247 	struct sock *sk = p;
4248 
4249 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4250 		return true;
4251 
4252 	if (sk_is_udp(sk) &&
4253 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4254 		return true;
4255 
4256 	return sk_busy_loop_timeout(sk, start_time);
4257 }
4258 EXPORT_SYMBOL(sk_busy_loop_end);
4259 #endif /* CONFIG_NET_RX_BUSY_POLL */
4260 
4261 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4262 {
4263 	if (!sk->sk_prot->bind_add)
4264 		return -EOPNOTSUPP;
4265 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4266 }
4267 EXPORT_SYMBOL(sock_bind_add);
4268 
4269 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4270 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4271 		     void __user *arg, void *karg, size_t size)
4272 {
4273 	int ret;
4274 
4275 	if (copy_from_user(karg, arg, size))
4276 		return -EFAULT;
4277 
4278 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4279 	if (ret)
4280 		return ret;
4281 
4282 	if (copy_to_user(arg, karg, size))
4283 		return -EFAULT;
4284 
4285 	return 0;
4286 }
4287 EXPORT_SYMBOL(sock_ioctl_inout);
4288 
4289 /* This is the most common ioctl prep function, where the result (4 bytes) is
4290  * copied back to userspace if the ioctl() returns successfully. No input is
4291  * copied from userspace as input argument.
4292  */
4293 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4294 {
4295 	int ret, karg = 0;
4296 
4297 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4298 	if (ret)
4299 		return ret;
4300 
4301 	return put_user(karg, (int __user *)arg);
4302 }
4303 
4304 /* A wrapper around sock ioctls, which copies the data from userspace
4305  * (depending on the protocol/ioctl), and copies back the result to userspace.
4306  * The main motivation for this function is to pass kernel memory to the
4307  * protocol ioctl callbacks, instead of userspace memory.
4308  */
4309 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4310 {
4311 	int rc = 1;
4312 
4313 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4314 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4315 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4316 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4317 	else if (sk_is_phonet(sk))
4318 		rc = phonet_sk_ioctl(sk, cmd, arg);
4319 
4320 	/* If ioctl was processed, returns its value */
4321 	if (rc <= 0)
4322 		return rc;
4323 
4324 	/* Otherwise call the default handler */
4325 	return sock_ioctl_out(sk, cmd, arg);
4326 }
4327 EXPORT_SYMBOL(sk_ioctl);
4328 
4329 static int __init sock_struct_check(void)
4330 {
4331 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4332 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4333 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4334 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4335 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4336 
4337 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4338 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4339 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4340 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4341 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4342 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4343 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4344 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4345 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4346 
4347 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4348 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4349 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4350 
4351 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4352 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4353 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4354 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4355 
4356 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4357 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4358 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4359 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4360 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4361 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4362 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4363 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4364 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4365 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4366 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4367 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4368 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4369 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4370 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4371 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4372 
4373 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4374 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4375 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4376 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4377 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4378 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4379 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4380 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4381 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4382 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4383 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4384 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4385 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4386 	return 0;
4387 }
4388 
4389 core_initcall(sock_struct_check);
4390