1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines: memory allocators, socket lock/release
8  *		handlers for protocols to use, and a generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138 
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142 
143 #include <trace/events/sock.h>
144 
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148 
149 #include <linux/ethtool.h>
150 
151 #include "dev.h"
152 
153 static DEFINE_MUTEX(proto_list_mutex);
154 static LIST_HEAD(proto_list);
155 
156 static void sock_def_write_space_wfree(struct sock *sk);
157 static void sock_def_write_space(struct sock *sk);
158 
159 /**
160  * sk_ns_capable - General socket capability test
161  * @sk: Socket to use a capability on or through
162  * @user_ns: The user namespace of the capability to use
163  * @cap: The capability to use
164  *
165  * Test to see if the opener of the socket had the capability @cap when
166  * the socket was created and the current process has the capability @cap
167  * in the user namespace @user_ns.
168  */
169 bool sk_ns_capable(const struct sock *sk,
170 		   struct user_namespace *user_ns, int cap)
171 {
172 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
173 		ns_capable(user_ns, cap);
174 }
175 EXPORT_SYMBOL(sk_ns_capable);
176 
177 /**
178  * sk_capable - Socket global capability test
179  * @sk: Socket to use a capability on or through
180  * @cap: The global capability to use
181  *
182  * Test to see if the opener of the socket had the capability @cap when
183  * the socket was created and the current process has the capability @cap
184  * in all user namespaces.
185  */
186 bool sk_capable(const struct sock *sk, int cap)
187 {
188 	return sk_ns_capable(sk, &init_user_ns, cap);
189 }
190 EXPORT_SYMBOL(sk_capable);
191 
192 /**
193  * sk_net_capable - Network namespace socket capability test
194  * @sk: Socket to use a capability on or through
195  * @cap: The capability to use
196  *
197  * Test to see if the opener of the socket had the capability @cap when the
198  * socket was created and the current process has the capability @cap over
199  * the network namespace the socket is a member of.
200  */
201 bool sk_net_capable(const struct sock *sk, int cap)
202 {
203 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
204 }
205 EXPORT_SYMBOL(sk_net_capable);
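
/* Illustrative sketch (not part of this file): a protocol handler would
 * typically gate a privileged option on both the socket opener's and the
 * caller's capabilities, roughly like:
 *
 *	static int myproto_set_priv_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		WRITE_ONCE(sk->sk_priority, val);
 *		return 0;
 *	}
 *
 * "myproto_set_priv_opt" is a made-up name; the point is that the check
 * only succeeds if the socket was opened with CAP_NET_ADMIN and the
 * current task holds it in the socket's network namespace.
 */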
206 
207 /*
208  * Each address family might have different locking rules, so we have
209  * one slock key per address family and separate keys for internal and
210  * userspace sockets.
211  */
212 static struct lock_class_key af_family_keys[AF_MAX];
213 static struct lock_class_key af_family_kern_keys[AF_MAX];
214 static struct lock_class_key af_family_slock_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
216 
217 /*
218  * Make lock validator output more readable. (We pre-construct these
219  * strings at build time, so that runtime initialization of socket
220  * locks is fast):
221  */
222 
223 #define _sock_locks(x)						  \
224   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
225   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
226   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
227   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
228   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
229   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
230   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
231   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
232   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
233   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
234   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
235   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
236   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
237   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
238   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
239   x "AF_MCTP"  , \
240   x "AF_MAX"
241 
242 static const char *const af_family_key_strings[AF_MAX+1] = {
243 	_sock_locks("sk_lock-")
244 };
245 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
246 	_sock_locks("slock-")
247 };
248 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
249 	_sock_locks("clock-")
250 };
251 
252 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
253 	_sock_locks("k-sk_lock-")
254 };
255 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
256 	_sock_locks("k-slock-")
257 };
258 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
259 	_sock_locks("k-clock-")
260 };
261 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
262 	_sock_locks("rlock-")
263 };
264 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
265 	_sock_locks("wlock-")
266 };
267 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
268 	_sock_locks("elock-")
269 };
270 
271 /*
272  * sk_callback_lock and sk queues locking rules are per-address-family,
273  * so split the lock classes by using a per-AF key:
274  */
275 static struct lock_class_key af_callback_keys[AF_MAX];
276 static struct lock_class_key af_rlock_keys[AF_MAX];
277 static struct lock_class_key af_wlock_keys[AF_MAX];
278 static struct lock_class_key af_elock_keys[AF_MAX];
279 static struct lock_class_key af_kern_callback_keys[AF_MAX];
280 
281 /* Run time adjustable parameters. */
282 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
283 EXPORT_SYMBOL(sysctl_wmem_max);
284 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
285 EXPORT_SYMBOL(sysctl_rmem_max);
286 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
287 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
288 
289 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
290 EXPORT_SYMBOL_GPL(memalloc_socks_key);
291 
292 /**
293  * sk_set_memalloc - sets %SOCK_MEMALLOC
294  * @sk: socket to set it on
295  *
296  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297  * It's the responsibility of the admin to adjust min_free_kbytes
298  * to meet the requirements.
299  */
300 void sk_set_memalloc(struct sock *sk)
301 {
302 	sock_set_flag(sk, SOCK_MEMALLOC);
303 	sk->sk_allocation |= __GFP_MEMALLOC;
304 	static_branch_inc(&memalloc_socks_key);
305 }
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
307 
308 void sk_clear_memalloc(struct sock *sk)
309 {
310 	sock_reset_flag(sk, SOCK_MEMALLOC);
311 	sk->sk_allocation &= ~__GFP_MEMALLOC;
312 	static_branch_dec(&memalloc_socks_key);
313 
314 	/*
315 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 	 * it has rmem allocations due to the last swapfile being deactivated
318 	 * but there is a risk that the socket is unusable due to exceeding
319 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
320 	 */
321 	sk_mem_reclaim(sk);
322 }
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
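
/* Illustrative sketch (not part of this file): a swap-over-network user
 * such as a block or filesystem transport would bracket the lifetime of
 * the socket carrying swap traffic with these helpers, roughly:
 *
 *	sk_set_memalloc(sock->sk);	// may dip into PFMEMALLOC reserves
 *	... socket carries swap I/O ...
 *	sk_clear_memalloc(sock->sk);	// reclaim and obey rmem limits again
 *
 * The admin is still expected to raise vm.min_free_kbytes so that the
 * emergency reserves the socket may dip into actually exist.
 */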
324 
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
326 {
327 	int ret;
328 	unsigned int noreclaim_flag;
329 
330 	/* these should have been dropped before queueing */
331 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
332 
333 	noreclaim_flag = memalloc_noreclaim_save();
334 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
335 				 tcp_v6_do_rcv,
336 				 tcp_v4_do_rcv,
337 				 sk, skb);
338 	memalloc_noreclaim_restore(noreclaim_flag);
339 
340 	return ret;
341 }
342 EXPORT_SYMBOL(__sk_backlog_rcv);
343 
344 void sk_error_report(struct sock *sk)
345 {
346 	sk->sk_error_report(sk);
347 
348 	switch (sk->sk_family) {
349 	case AF_INET:
350 		fallthrough;
351 	case AF_INET6:
352 		trace_inet_sk_error_report(sk);
353 		break;
354 	default:
355 		break;
356 	}
357 }
358 EXPORT_SYMBOL(sk_error_report);
359 
360 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
361 {
362 	struct __kernel_sock_timeval tv;
363 
364 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
365 		tv.tv_sec = 0;
366 		tv.tv_usec = 0;
367 	} else {
368 		tv.tv_sec = timeo / HZ;
369 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
370 	}
371 
372 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
373 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
374 		*(struct old_timeval32 *)optval = tv32;
375 		return sizeof(tv32);
376 	}
377 
378 	if (old_timeval) {
379 		struct __kernel_old_timeval old_tv;
380 		old_tv.tv_sec = tv.tv_sec;
381 		old_tv.tv_usec = tv.tv_usec;
382 		*(struct __kernel_old_timeval *)optval = old_tv;
383 		return sizeof(old_tv);
384 	}
385 
386 	*(struct __kernel_sock_timeval *)optval = tv;
387 	return sizeof(tv);
388 }
389 EXPORT_SYMBOL(sock_get_timeout);
390 
391 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
392 			   sockptr_t optval, int optlen, bool old_timeval)
393 {
394 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
395 		struct old_timeval32 tv32;
396 
397 		if (optlen < sizeof(tv32))
398 			return -EINVAL;
399 
400 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
401 			return -EFAULT;
402 		tv->tv_sec = tv32.tv_sec;
403 		tv->tv_usec = tv32.tv_usec;
404 	} else if (old_timeval) {
405 		struct __kernel_old_timeval old_tv;
406 
407 		if (optlen < sizeof(old_tv))
408 			return -EINVAL;
409 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
410 			return -EFAULT;
411 		tv->tv_sec = old_tv.tv_sec;
412 		tv->tv_usec = old_tv.tv_usec;
413 	} else {
414 		if (optlen < sizeof(*tv))
415 			return -EINVAL;
416 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
417 			return -EFAULT;
418 	}
419 
420 	return 0;
421 }
422 EXPORT_SYMBOL(sock_copy_user_timeval);
423 
424 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
425 			    bool old_timeval)
426 {
427 	struct __kernel_sock_timeval tv;
428 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
429 	long val;
430 
431 	if (err)
432 		return err;
433 
434 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
435 		return -EDOM;
436 
437 	if (tv.tv_sec < 0) {
438 		static int warned __read_mostly;
439 
440 		WRITE_ONCE(*timeo_p, 0);
441 		if (warned < 10 && net_ratelimit()) {
442 			warned++;
443 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
444 				__func__, current->comm, task_pid_nr(current));
445 		}
446 		return 0;
447 	}
448 	val = MAX_SCHEDULE_TIMEOUT;
449 	if ((tv.tv_sec || tv.tv_usec) &&
450 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
451 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
452 						    USEC_PER_SEC / HZ);
453 	WRITE_ONCE(*timeo_p, val);
454 	return 0;
455 }
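
/* Worked example (assuming HZ == 1000): a userspace timeout of
 * { .tv_sec = 2, .tv_usec = 500000 } passed via SO_RCVTIMEO is stored as
 *
 *	2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) = 2500 jiffies,
 *
 * and sock_get_timeout() converts it back to 2.5s on getsockopt().
 * A value of {0, 0} means "no timeout" and maps to MAX_SCHEDULE_TIMEOUT.
 */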
456 
457 static bool sock_needs_netstamp(const struct sock *sk)
458 {
459 	switch (sk->sk_family) {
460 	case AF_UNSPEC:
461 	case AF_UNIX:
462 		return false;
463 	default:
464 		return true;
465 	}
466 }
467 
468 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
469 {
470 	if (sk->sk_flags & flags) {
471 		sk->sk_flags &= ~flags;
472 		if (sock_needs_netstamp(sk) &&
473 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
474 			net_disable_timestamp();
475 	}
476 }
477 
478 
479 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
480 {
481 	unsigned long flags;
482 	struct sk_buff_head *list = &sk->sk_receive_queue;
483 
484 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
485 		atomic_inc(&sk->sk_drops);
486 		trace_sock_rcvqueue_full(sk, skb);
487 		return -ENOMEM;
488 	}
489 
490 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
491 		atomic_inc(&sk->sk_drops);
492 		return -ENOBUFS;
493 	}
494 
495 	skb->dev = NULL;
496 	skb_set_owner_r(skb, sk);
497 
498 	/* We escape from the RCU-protected region, so make sure we don't
499 	 * leak a non-refcounted dst.
500 	 */
501 	skb_dst_force(skb);
502 
503 	spin_lock_irqsave(&list->lock, flags);
504 	sock_skb_set_dropcount(sk, skb);
505 	__skb_queue_tail(list, skb);
506 	spin_unlock_irqrestore(&list->lock, flags);
507 
508 	if (!sock_flag(sk, SOCK_DEAD))
509 		sk->sk_data_ready(sk);
510 	return 0;
511 }
512 EXPORT_SYMBOL(__sock_queue_rcv_skb);
513 
514 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
515 			      enum skb_drop_reason *reason)
516 {
517 	enum skb_drop_reason drop_reason;
518 	int err;
519 
520 	err = sk_filter(sk, skb);
521 	if (err) {
522 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
523 		goto out;
524 	}
525 	err = __sock_queue_rcv_skb(sk, skb);
526 	switch (err) {
527 	case -ENOMEM:
528 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
529 		break;
530 	case -ENOBUFS:
531 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
532 		break;
533 	default:
534 		drop_reason = SKB_NOT_DROPPED_YET;
535 		break;
536 	}
537 out:
538 	if (reason)
539 		*reason = drop_reason;
540 	return err;
541 }
542 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
543 
544 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
545 		     const int nested, unsigned int trim_cap, bool refcounted)
546 {
547 	int rc = NET_RX_SUCCESS;
548 
549 	if (sk_filter_trim_cap(sk, skb, trim_cap))
550 		goto discard_and_relse;
551 
552 	skb->dev = NULL;
553 
554 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
555 		atomic_inc(&sk->sk_drops);
556 		goto discard_and_relse;
557 	}
558 	if (nested)
559 		bh_lock_sock_nested(sk);
560 	else
561 		bh_lock_sock(sk);
562 	if (!sock_owned_by_user(sk)) {
563 		/*
564 		 * trylock + unlock semantics:
565 		 */
566 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
567 
568 		rc = sk_backlog_rcv(sk, skb);
569 
570 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
571 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
572 		bh_unlock_sock(sk);
573 		atomic_inc(&sk->sk_drops);
574 		goto discard_and_relse;
575 	}
576 
577 	bh_unlock_sock(sk);
578 out:
579 	if (refcounted)
580 		sock_put(sk);
581 	return rc;
582 discard_and_relse:
583 	kfree_skb(skb);
584 	goto out;
585 }
586 EXPORT_SYMBOL(__sk_receive_skb);
587 
588 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
589 							  u32));
590 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
591 							   u32));
592 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
593 {
594 	struct dst_entry *dst = __sk_dst_get(sk);
595 
596 	if (dst && dst->obsolete &&
597 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
598 			       dst, cookie) == NULL) {
599 		sk_tx_queue_clear(sk);
600 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
601 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
602 		dst_release(dst);
603 		return NULL;
604 	}
605 
606 	return dst;
607 }
608 EXPORT_SYMBOL(__sk_dst_check);
609 
610 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
611 {
612 	struct dst_entry *dst = sk_dst_get(sk);
613 
614 	if (dst && dst->obsolete &&
615 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
616 			       dst, cookie) == NULL) {
617 		sk_dst_reset(sk);
618 		dst_release(dst);
619 		return NULL;
620 	}
621 
622 	return dst;
623 }
624 EXPORT_SYMBOL(sk_dst_check);
625 
626 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
627 {
628 	int ret = -ENOPROTOOPT;
629 #ifdef CONFIG_NETDEVICES
630 	struct net *net = sock_net(sk);
631 
632 	/* Sorry... */
633 	ret = -EPERM;
634 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
635 		goto out;
636 
637 	ret = -EINVAL;
638 	if (ifindex < 0)
639 		goto out;
640 
641 	/* Paired with all READ_ONCE() done locklessly. */
642 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
643 
644 	if (sk->sk_prot->rehash)
645 		sk->sk_prot->rehash(sk);
646 	sk_dst_reset(sk);
647 
648 	ret = 0;
649 
650 out:
651 #endif
652 
653 	return ret;
654 }
655 
656 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
657 {
658 	int ret;
659 
660 	if (lock_sk)
661 		lock_sock(sk);
662 	ret = sock_bindtoindex_locked(sk, ifindex);
663 	if (lock_sk)
664 		release_sock(sk);
665 
666 	return ret;
667 }
668 EXPORT_SYMBOL(sock_bindtoindex);
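
/* Illustrative sketch (not part of this file): in-kernel users that already
 * hold the socket lock pass lock_sk == false, others let the helper take
 * it, roughly:
 *
 *	err = sock_bindtoindex(sock->sk, ifindex, true);
 *	if (err)
 *		goto fail;
 *
 * This is the kernel-side counterpart of SO_BINDTOIFINDEX; binding to
 * index 0 removes any existing device binding.
 */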
669 
670 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
671 {
672 	int ret = -ENOPROTOOPT;
673 #ifdef CONFIG_NETDEVICES
674 	struct net *net = sock_net(sk);
675 	char devname[IFNAMSIZ];
676 	int index;
677 
678 	ret = -EINVAL;
679 	if (optlen < 0)
680 		goto out;
681 
682 	/* Bind this socket to a particular device like "eth0",
683 	 * as specified in the passed interface name. If the
684 	 * name is "" or the option length is zero the socket
685 	 * is not bound.
686 	 */
687 	if (optlen > IFNAMSIZ - 1)
688 		optlen = IFNAMSIZ - 1;
689 	memset(devname, 0, sizeof(devname));
690 
691 	ret = -EFAULT;
692 	if (copy_from_sockptr(devname, optval, optlen))
693 		goto out;
694 
695 	index = 0;
696 	if (devname[0] != '\0') {
697 		struct net_device *dev;
698 
699 		rcu_read_lock();
700 		dev = dev_get_by_name_rcu(net, devname);
701 		if (dev)
702 			index = dev->ifindex;
703 		rcu_read_unlock();
704 		ret = -ENODEV;
705 		if (!dev)
706 			goto out;
707 	}
708 
709 	sockopt_lock_sock(sk);
710 	ret = sock_bindtoindex_locked(sk, index);
711 	sockopt_release_sock(sk);
712 out:
713 #endif
714 
715 	return ret;
716 }
717 
718 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
719 				sockptr_t optlen, int len)
720 {
721 	int ret = -ENOPROTOOPT;
722 #ifdef CONFIG_NETDEVICES
723 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
724 	struct net *net = sock_net(sk);
725 	char devname[IFNAMSIZ];
726 
727 	if (bound_dev_if == 0) {
728 		len = 0;
729 		goto zero;
730 	}
731 
732 	ret = -EINVAL;
733 	if (len < IFNAMSIZ)
734 		goto out;
735 
736 	ret = netdev_get_name(net, devname, bound_dev_if);
737 	if (ret)
738 		goto out;
739 
740 	len = strlen(devname) + 1;
741 
742 	ret = -EFAULT;
743 	if (copy_to_sockptr(optval, devname, len))
744 		goto out;
745 
746 zero:
747 	ret = -EFAULT;
748 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
749 		goto out;
750 
751 	ret = 0;
752 
753 out:
754 #endif
755 
756 	return ret;
757 }
758 
759 bool sk_mc_loop(const struct sock *sk)
760 {
761 	if (dev_recursion_level())
762 		return false;
763 	if (!sk)
764 		return true;
765 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
766 	switch (READ_ONCE(sk->sk_family)) {
767 	case AF_INET:
768 		return inet_test_bit(MC_LOOP, sk);
769 #if IS_ENABLED(CONFIG_IPV6)
770 	case AF_INET6:
771 		return inet6_test_bit(MC6_LOOP, sk);
772 #endif
773 	}
774 	WARN_ON_ONCE(1);
775 	return true;
776 }
777 EXPORT_SYMBOL(sk_mc_loop);
778 
779 void sock_set_reuseaddr(struct sock *sk)
780 {
781 	lock_sock(sk);
782 	sk->sk_reuse = SK_CAN_REUSE;
783 	release_sock(sk);
784 }
785 EXPORT_SYMBOL(sock_set_reuseaddr);
786 
787 void sock_set_reuseport(struct sock *sk)
788 {
789 	lock_sock(sk);
790 	sk->sk_reuseport = true;
791 	release_sock(sk);
792 }
793 EXPORT_SYMBOL(sock_set_reuseport);
794 
795 void sock_no_linger(struct sock *sk)
796 {
797 	lock_sock(sk);
798 	WRITE_ONCE(sk->sk_lingertime, 0);
799 	sock_set_flag(sk, SOCK_LINGER);
800 	release_sock(sk);
801 }
802 EXPORT_SYMBOL(sock_no_linger);
803 
804 void sock_set_priority(struct sock *sk, u32 priority)
805 {
806 	WRITE_ONCE(sk->sk_priority, priority);
807 }
808 EXPORT_SYMBOL(sock_set_priority);
809 
810 void sock_set_sndtimeo(struct sock *sk, s64 secs)
811 {
812 	lock_sock(sk);
813 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
814 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
815 	else
816 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
817 	release_sock(sk);
818 }
819 EXPORT_SYMBOL(sock_set_sndtimeo);
820 
821 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
822 {
823 	sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
824 	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
825 	if (val)  {
826 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
827 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
828 	}
829 }
830 
831 void sock_enable_timestamps(struct sock *sk)
832 {
833 	lock_sock(sk);
834 	__sock_set_timestamps(sk, true, false, true);
835 	release_sock(sk);
836 }
837 EXPORT_SYMBOL(sock_enable_timestamps);
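
/* Illustrative sketch (not part of this file): kernel socket users (for
 * example network filesystems) call this family of helpers instead of
 * going through sock_setsockopt() with kernel pointers, roughly:
 *
 *	sock_set_reuseaddr(sock->sk);
 *	sock_no_linger(sock->sk);
 *	sock_set_sndtimeo(sock->sk, 15);
 *	sock_enable_timestamps(sock->sk);
 *
 * Each of these helpers takes the socket lock itself, so they must not be
 * called with the lock already held.
 */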
838 
839 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
840 {
841 	switch (optname) {
842 	case SO_TIMESTAMP_OLD:
843 		__sock_set_timestamps(sk, valbool, false, false);
844 		break;
845 	case SO_TIMESTAMP_NEW:
846 		__sock_set_timestamps(sk, valbool, true, false);
847 		break;
848 	case SO_TIMESTAMPNS_OLD:
849 		__sock_set_timestamps(sk, valbool, false, true);
850 		break;
851 	case SO_TIMESTAMPNS_NEW:
852 		__sock_set_timestamps(sk, valbool, true, true);
853 		break;
854 	}
855 }
856 
857 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
858 {
859 	struct net *net = sock_net(sk);
860 	struct net_device *dev = NULL;
861 	bool match = false;
862 	int *vclock_index;
863 	int i, num;
864 
865 	if (sk->sk_bound_dev_if)
866 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
867 
868 	if (!dev) {
869 		pr_err("%s: socket is not bound to a device\n", __func__);
870 		return -EOPNOTSUPP;
871 	}
872 
873 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
874 	dev_put(dev);
875 
876 	for (i = 0; i < num; i++) {
877 		if (*(vclock_index + i) == phc_index) {
878 			match = true;
879 			break;
880 		}
881 	}
882 
883 	if (num > 0)
884 		kfree(vclock_index);
885 
886 	if (!match)
887 		return -EINVAL;
888 
889 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
890 
891 	return 0;
892 }
893 
894 int sock_set_timestamping(struct sock *sk, int optname,
895 			  struct so_timestamping timestamping)
896 {
897 	int val = timestamping.flags;
898 	int ret;
899 
900 	if (val & ~SOF_TIMESTAMPING_MASK)
901 		return -EINVAL;
902 
903 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
904 	    !(val & SOF_TIMESTAMPING_OPT_ID))
905 		return -EINVAL;
906 
907 	if (val & SOF_TIMESTAMPING_OPT_ID &&
908 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
909 		if (sk_is_tcp(sk)) {
910 			if ((1 << sk->sk_state) &
911 			    (TCPF_CLOSE | TCPF_LISTEN))
912 				return -EINVAL;
913 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
914 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
915 			else
916 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
917 		} else {
918 			atomic_set(&sk->sk_tskey, 0);
919 		}
920 	}
921 
922 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
923 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
924 		return -EINVAL;
925 
926 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
927 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
928 		if (ret)
929 			return ret;
930 	}
931 
932 	WRITE_ONCE(sk->sk_tsflags, val);
933 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
934 
935 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
936 		sock_enable_timestamp(sk,
937 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
938 	else
939 		sock_disable_timestamp(sk,
940 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
941 	return 0;
942 }
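
/* Illustrative userspace sketch (not part of this file): SO_TIMESTAMPING
 * takes either a plain int of SOF_TIMESTAMPING_* flags or, when binding to
 * a PHC vclock, a struct so_timestamping:
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_HARDWARE |
 *			 SOF_TIMESTAMPING_RAW_HARDWARE |
 *			 SOF_TIMESTAMPING_BIND_PHC,
 *		.bind_phc = phc_index,	// a vclock index of the bound device
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 *
 * "phc_index" is a placeholder; the socket must already be bound to a
 * device (SO_BINDTODEVICE or SO_BINDTOIFINDEX) for the PHC lookup in
 * sock_timestamping_bind_phc() to succeed.
 */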
943 
944 void sock_set_keepalive(struct sock *sk)
945 {
946 	lock_sock(sk);
947 	if (sk->sk_prot->keepalive)
948 		sk->sk_prot->keepalive(sk, true);
949 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
950 	release_sock(sk);
951 }
952 EXPORT_SYMBOL(sock_set_keepalive);
953 
954 static void __sock_set_rcvbuf(struct sock *sk, int val)
955 {
956 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
957 	 * as a negative value.
958 	 */
959 	val = min_t(int, val, INT_MAX / 2);
960 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
961 
962 	/* We double it on the way in to account for "struct sk_buff" etc.
963 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
964 	 * will allow that much actual data to be received on that socket.
965 	 *
966 	 * Applications are unaware that "struct sk_buff" and other overheads
967 	 * allocate from the receive buffer during socket buffer allocation.
968 	 *
969 	 * And after considering the possible alternatives, returning the value
970 	 * we actually used in getsockopt is the most desirable behavior.
971 	 */
972 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
973 }
974 
975 void sock_set_rcvbuf(struct sock *sk, int val)
976 {
977 	lock_sock(sk);
978 	__sock_set_rcvbuf(sk, val);
979 	release_sock(sk);
980 }
981 EXPORT_SYMBOL(sock_set_rcvbuf);
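
/* Worked example: setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &(int){65536}, ...)
 * stores 131072 in sk->sk_rcvbuf (the value is doubled to cover struct
 * sk_buff and other overhead), and that doubled value is what a subsequent
 * getsockopt(SO_RCVBUF) reports. Values below SOCK_MIN_RCVBUF are rounded
 * up, and SO_RCVBUFFORCE (with CAP_NET_ADMIN) is needed to exceed
 * net.core.rmem_max.
 */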
982 
983 static void __sock_set_mark(struct sock *sk, u32 val)
984 {
985 	if (val != sk->sk_mark) {
986 		WRITE_ONCE(sk->sk_mark, val);
987 		sk_dst_reset(sk);
988 	}
989 }
990 
991 void sock_set_mark(struct sock *sk, u32 val)
992 {
993 	lock_sock(sk);
994 	__sock_set_mark(sk, val);
995 	release_sock(sk);
996 }
997 EXPORT_SYMBOL(sock_set_mark);
998 
999 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1000 {
1001 	/* Round down bytes to multiple of pages */
1002 	bytes = round_down(bytes, PAGE_SIZE);
1003 
1004 	WARN_ON(bytes > sk->sk_reserved_mem);
1005 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1006 	sk_mem_reclaim(sk);
1007 }
1008 
1009 static int sock_reserve_memory(struct sock *sk, int bytes)
1010 {
1011 	long allocated;
1012 	bool charged;
1013 	int pages;
1014 
1015 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1016 		return -EOPNOTSUPP;
1017 
1018 	if (!bytes)
1019 		return 0;
1020 
1021 	pages = sk_mem_pages(bytes);
1022 
1023 	/* pre-charge to memcg */
1024 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1025 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1026 	if (!charged)
1027 		return -ENOMEM;
1028 
1029 	/* pre-charge to forward_alloc */
1030 	sk_memory_allocated_add(sk, pages);
1031 	allocated = sk_memory_allocated(sk);
1032 	/* If the system goes into memory pressure with this
1033 	 * precharge, give up and return an error.
1034 	 */
1035 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1036 		sk_memory_allocated_sub(sk, pages);
1037 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1038 		return -ENOMEM;
1039 	}
1040 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1041 
1042 	WRITE_ONCE(sk->sk_reserved_mem,
1043 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1044 
1045 	return 0;
1046 }
1047 
1048 #ifdef CONFIG_PAGE_POOL
1049 
1050 /* This is the maximum number of tokens that the user can pass to
1051  * SO_DEVMEM_DONTNEED in one syscall. The limit exists to bound the amount
1052  * of memory the kernel allocates to copy these tokens.
1053  */
1054 #define MAX_DONTNEED_TOKENS 128
1055 
1056 static noinline_for_stack int
1057 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1058 {
1059 	unsigned int num_tokens, i, j, k, netmem_num = 0;
1060 	struct dmabuf_token *tokens;
1061 	netmem_ref netmems[16];
1062 	int ret = 0;
1063 
1064 	if (!sk_is_tcp(sk))
1065 		return -EBADF;
1066 
1067 	if (optlen % sizeof(struct dmabuf_token) ||
1068 	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1069 		return -EINVAL;
1070 
1071 	tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL);
1072 	if (!tokens)
1073 		return -ENOMEM;
1074 
1075 	num_tokens = optlen / sizeof(struct dmabuf_token);
1076 	if (copy_from_sockptr(tokens, optval, optlen)) {
1077 		kvfree(tokens);
1078 		return -EFAULT;
1079 	}
1080 
1081 	xa_lock_bh(&sk->sk_user_frags);
1082 	for (i = 0; i < num_tokens; i++) {
1083 		for (j = 0; j < tokens[i].token_count; j++) {
1084 			netmem_ref netmem = (__force netmem_ref)__xa_erase(
1085 				&sk->sk_user_frags, tokens[i].token_start + j);
1086 
1087 			if (netmem &&
1088 			    !WARN_ON_ONCE(!netmem_is_net_iov(netmem))) {
1089 				netmems[netmem_num++] = netmem;
1090 				if (netmem_num == ARRAY_SIZE(netmems)) {
1091 					xa_unlock_bh(&sk->sk_user_frags);
1092 					for (k = 0; k < netmem_num; k++)
1093 						WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1094 					netmem_num = 0;
1095 					xa_lock_bh(&sk->sk_user_frags);
1096 				}
1097 				ret++;
1098 			}
1099 		}
1100 	}
1101 
1102 	xa_unlock_bh(&sk->sk_user_frags);
1103 	for (k = 0; k < netmem_num; k++)
1104 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1105 
1106 	kvfree(tokens);
1107 	return ret;
1108 }
1109 #endif
1110 
1111 void sockopt_lock_sock(struct sock *sk)
1112 {
1113 	/* When current->bpf_ctx is set, the setsockopt is called from
1114 	 * a bpf prog.  bpf has ensured the sk lock has been
1115 	 * acquired before calling setsockopt().
1116 	 */
1117 	if (has_current_bpf_ctx())
1118 		return;
1119 
1120 	lock_sock(sk);
1121 }
1122 EXPORT_SYMBOL(sockopt_lock_sock);
1123 
1124 void sockopt_release_sock(struct sock *sk)
1125 {
1126 	if (has_current_bpf_ctx())
1127 		return;
1128 
1129 	release_sock(sk);
1130 }
1131 EXPORT_SYMBOL(sockopt_release_sock);
1132 
1133 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1134 {
1135 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1136 }
1137 EXPORT_SYMBOL(sockopt_ns_capable);
1138 
1139 bool sockopt_capable(int cap)
1140 {
1141 	return has_current_bpf_ctx() || capable(cap);
1142 }
1143 EXPORT_SYMBOL(sockopt_capable);
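
/* Illustrative sketch (not part of this file): option handlers that can be
 * reached both from the setsockopt() syscall and from BPF programs use
 * these wrappers instead of lock_sock()/capable() directly, e.g.:
 *
 *	sockopt_lock_sock(sk);		// no-op when called from BPF,
 *	ret = do_the_option(sk, val);	//  which already holds the lock
 *	sockopt_release_sock(sk);
 *
 * "do_the_option" is a placeholder; sockopt_capable() follows the same
 * pattern and treats BPF callers as privileged.
 */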
1144 
1145 static int sockopt_validate_clockid(__kernel_clockid_t value)
1146 {
1147 	switch (value) {
1148 	case CLOCK_REALTIME:
1149 	case CLOCK_MONOTONIC:
1150 	case CLOCK_TAI:
1151 		return 0;
1152 	}
1153 	return -EINVAL;
1154 }
1155 
1156 /*
1157  *	This is meant for all protocols to use and covers goings on
1158  *	at the socket level. Everything here is generic.
1159  */
1160 
1161 int sk_setsockopt(struct sock *sk, int level, int optname,
1162 		  sockptr_t optval, unsigned int optlen)
1163 {
1164 	struct so_timestamping timestamping;
1165 	struct socket *sock = sk->sk_socket;
1166 	struct sock_txtime sk_txtime;
1167 	int val;
1168 	int valbool;
1169 	struct linger ling;
1170 	int ret = 0;
1171 
1172 	/*
1173 	 *	Options without arguments
1174 	 */
1175 
1176 	if (optname == SO_BINDTODEVICE)
1177 		return sock_setbindtodevice(sk, optval, optlen);
1178 
1179 	if (optlen < sizeof(int))
1180 		return -EINVAL;
1181 
1182 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1183 		return -EFAULT;
1184 
1185 	valbool = val ? 1 : 0;
1186 
1187 	/* handle options which do not require locking the socket. */
1188 	switch (optname) {
1189 	case SO_PRIORITY:
1190 		if ((val >= 0 && val <= 6) ||
1191 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1192 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1193 			sock_set_priority(sk, val);
1194 			return 0;
1195 		}
1196 		return -EPERM;
1197 	case SO_PASSSEC:
1198 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1199 		return 0;
1200 	case SO_PASSCRED:
1201 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1202 		return 0;
1203 	case SO_PASSPIDFD:
1204 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1205 		return 0;
1206 	case SO_TYPE:
1207 	case SO_PROTOCOL:
1208 	case SO_DOMAIN:
1209 	case SO_ERROR:
1210 		return -ENOPROTOOPT;
1211 #ifdef CONFIG_NET_RX_BUSY_POLL
1212 	case SO_BUSY_POLL:
1213 		if (val < 0)
1214 			return -EINVAL;
1215 		WRITE_ONCE(sk->sk_ll_usec, val);
1216 		return 0;
1217 	case SO_PREFER_BUSY_POLL:
1218 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1219 			return -EPERM;
1220 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1221 		return 0;
1222 	case SO_BUSY_POLL_BUDGET:
1223 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1224 		    !sockopt_capable(CAP_NET_ADMIN))
1225 			return -EPERM;
1226 		if (val < 0 || val > U16_MAX)
1227 			return -EINVAL;
1228 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1229 		return 0;
1230 #endif
1231 	case SO_MAX_PACING_RATE:
1232 		{
1233 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1234 		unsigned long pacing_rate;
1235 
1236 		if (sizeof(ulval) != sizeof(val) &&
1237 		    optlen >= sizeof(ulval) &&
1238 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1239 			return -EFAULT;
1240 		}
1241 		if (ulval != ~0UL)
1242 			cmpxchg(&sk->sk_pacing_status,
1243 				SK_PACING_NONE,
1244 				SK_PACING_NEEDED);
1245 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1246 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1247 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1248 		if (ulval < pacing_rate)
1249 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1250 		return 0;
1251 		}
1252 	case SO_TXREHASH:
1253 		if (val < -1 || val > 1)
1254 			return -EINVAL;
1255 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1256 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1257 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1258 		 * and sk_getsockopt().
1259 		 */
1260 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1261 		return 0;
1262 	case SO_PEEK_OFF:
1263 		{
1264 		int (*set_peek_off)(struct sock *sk, int val);
1265 
1266 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1267 		if (set_peek_off)
1268 			ret = set_peek_off(sk, val);
1269 		else
1270 			ret = -EOPNOTSUPP;
1271 		return ret;
1272 		}
1273 #ifdef CONFIG_PAGE_POOL
1274 	case SO_DEVMEM_DONTNEED:
1275 		return sock_devmem_dontneed(sk, optval, optlen);
1276 #endif
1277 	}
1278 
1279 	sockopt_lock_sock(sk);
1280 
1281 	switch (optname) {
1282 	case SO_DEBUG:
1283 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1284 			ret = -EACCES;
1285 		else
1286 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1287 		break;
1288 	case SO_REUSEADDR:
1289 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1290 		break;
1291 	case SO_REUSEPORT:
1292 		sk->sk_reuseport = valbool;
1293 		break;
1294 	case SO_DONTROUTE:
1295 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1296 		sk_dst_reset(sk);
1297 		break;
1298 	case SO_BROADCAST:
1299 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1300 		break;
1301 	case SO_SNDBUF:
1302 		/* Don't return an error on this; BSD doesn't, and if you
1303 		 * think about it this is right. Otherwise apps have to
1304 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1305 		 * are treated in BSD as hints.
1306 		 */
1307 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1308 set_sndbuf:
1309 		/* Ensure val * 2 fits into an int, to prevent max_t()
1310 		 * from treating it as a negative value.
1311 		 */
1312 		val = min_t(int, val, INT_MAX / 2);
1313 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1314 		WRITE_ONCE(sk->sk_sndbuf,
1315 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1316 		/* Wake up sending tasks if we upped the value. */
1317 		sk->sk_write_space(sk);
1318 		break;
1319 
1320 	case SO_SNDBUFFORCE:
1321 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1322 			ret = -EPERM;
1323 			break;
1324 		}
1325 
1326 		/* No negative values (to prevent underflow, as val will be
1327 		 * multiplied by 2).
1328 		 */
1329 		if (val < 0)
1330 			val = 0;
1331 		goto set_sndbuf;
1332 
1333 	case SO_RCVBUF:
1334 		/* Don't return an error on this; BSD doesn't, and if you
1335 		 * think about it this is right. Otherwise apps have to
1336 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1337 		 * are treated in BSD as hints.
1338 		 */
1339 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1340 		break;
1341 
1342 	case SO_RCVBUFFORCE:
1343 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1344 			ret = -EPERM;
1345 			break;
1346 		}
1347 
1348 		/* No negative values (to prevent underflow, as val will be
1349 		 * multiplied by 2).
1350 		 */
1351 		__sock_set_rcvbuf(sk, max(val, 0));
1352 		break;
1353 
1354 	case SO_KEEPALIVE:
1355 		if (sk->sk_prot->keepalive)
1356 			sk->sk_prot->keepalive(sk, valbool);
1357 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1358 		break;
1359 
1360 	case SO_OOBINLINE:
1361 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1362 		break;
1363 
1364 	case SO_NO_CHECK:
1365 		sk->sk_no_check_tx = valbool;
1366 		break;
1367 
1368 	case SO_LINGER:
1369 		if (optlen < sizeof(ling)) {
1370 			ret = -EINVAL;	/* 1003.1g */
1371 			break;
1372 		}
1373 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1374 			ret = -EFAULT;
1375 			break;
1376 		}
1377 		if (!ling.l_onoff) {
1378 			sock_reset_flag(sk, SOCK_LINGER);
1379 		} else {
1380 			unsigned long t_sec = ling.l_linger;
1381 
1382 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1383 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1384 			else
1385 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1386 			sock_set_flag(sk, SOCK_LINGER);
1387 		}
1388 		break;
1389 
1390 	case SO_BSDCOMPAT:
1391 		break;
1392 
1393 	case SO_TIMESTAMP_OLD:
1394 	case SO_TIMESTAMP_NEW:
1395 	case SO_TIMESTAMPNS_OLD:
1396 	case SO_TIMESTAMPNS_NEW:
1397 		sock_set_timestamp(sk, optname, valbool);
1398 		break;
1399 
1400 	case SO_TIMESTAMPING_NEW:
1401 	case SO_TIMESTAMPING_OLD:
1402 		if (optlen == sizeof(timestamping)) {
1403 			if (copy_from_sockptr(&timestamping, optval,
1404 					      sizeof(timestamping))) {
1405 				ret = -EFAULT;
1406 				break;
1407 			}
1408 		} else {
1409 			memset(&timestamping, 0, sizeof(timestamping));
1410 			timestamping.flags = val;
1411 		}
1412 		ret = sock_set_timestamping(sk, optname, timestamping);
1413 		break;
1414 
1415 	case SO_RCVLOWAT:
1416 		{
1417 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1418 
1419 		if (val < 0)
1420 			val = INT_MAX;
1421 		if (sock)
1422 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1423 		if (set_rcvlowat)
1424 			ret = set_rcvlowat(sk, val);
1425 		else
1426 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1427 		break;
1428 		}
1429 	case SO_RCVTIMEO_OLD:
1430 	case SO_RCVTIMEO_NEW:
1431 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1432 				       optlen, optname == SO_RCVTIMEO_OLD);
1433 		break;
1434 
1435 	case SO_SNDTIMEO_OLD:
1436 	case SO_SNDTIMEO_NEW:
1437 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1438 				       optlen, optname == SO_SNDTIMEO_OLD);
1439 		break;
1440 
1441 	case SO_ATTACH_FILTER: {
1442 		struct sock_fprog fprog;
1443 
1444 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1445 		if (!ret)
1446 			ret = sk_attach_filter(&fprog, sk);
1447 		break;
1448 	}
1449 	case SO_ATTACH_BPF:
1450 		ret = -EINVAL;
1451 		if (optlen == sizeof(u32)) {
1452 			u32 ufd;
1453 
1454 			ret = -EFAULT;
1455 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1456 				break;
1457 
1458 			ret = sk_attach_bpf(ufd, sk);
1459 		}
1460 		break;
1461 
1462 	case SO_ATTACH_REUSEPORT_CBPF: {
1463 		struct sock_fprog fprog;
1464 
1465 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1466 		if (!ret)
1467 			ret = sk_reuseport_attach_filter(&fprog, sk);
1468 		break;
1469 	}
1470 	case SO_ATTACH_REUSEPORT_EBPF:
1471 		ret = -EINVAL;
1472 		if (optlen == sizeof(u32)) {
1473 			u32 ufd;
1474 
1475 			ret = -EFAULT;
1476 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1477 				break;
1478 
1479 			ret = sk_reuseport_attach_bpf(ufd, sk);
1480 		}
1481 		break;
1482 
1483 	case SO_DETACH_REUSEPORT_BPF:
1484 		ret = reuseport_detach_prog(sk);
1485 		break;
1486 
1487 	case SO_DETACH_FILTER:
1488 		ret = sk_detach_filter(sk);
1489 		break;
1490 
1491 	case SO_LOCK_FILTER:
1492 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1493 			ret = -EPERM;
1494 		else
1495 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1496 		break;
1497 
1498 	case SO_MARK:
1499 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1500 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1501 			ret = -EPERM;
1502 			break;
1503 		}
1504 
1505 		__sock_set_mark(sk, val);
1506 		break;
1507 	case SO_RCVMARK:
1508 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1509 		break;
1510 
1511 	case SO_RXQ_OVFL:
1512 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1513 		break;
1514 
1515 	case SO_WIFI_STATUS:
1516 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1517 		break;
1518 
1519 	case SO_NOFCS:
1520 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1521 		break;
1522 
1523 	case SO_SELECT_ERR_QUEUE:
1524 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1525 		break;
1526 
1527 
1528 	case SO_INCOMING_CPU:
1529 		reuseport_update_incoming_cpu(sk, val);
1530 		break;
1531 
1532 	case SO_CNX_ADVICE:
1533 		if (val == 1)
1534 			dst_negative_advice(sk);
1535 		break;
1536 
1537 	case SO_ZEROCOPY:
1538 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1539 			if (!(sk_is_tcp(sk) ||
1540 			      (sk->sk_type == SOCK_DGRAM &&
1541 			       sk->sk_protocol == IPPROTO_UDP)))
1542 				ret = -EOPNOTSUPP;
1543 		} else if (sk->sk_family != PF_RDS) {
1544 			ret = -EOPNOTSUPP;
1545 		}
1546 		if (!ret) {
1547 			if (val < 0 || val > 1)
1548 				ret = -EINVAL;
1549 			else
1550 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1551 		}
1552 		break;
1553 
1554 	case SO_TXTIME:
1555 		if (optlen != sizeof(struct sock_txtime)) {
1556 			ret = -EINVAL;
1557 			break;
1558 		} else if (copy_from_sockptr(&sk_txtime, optval,
1559 			   sizeof(struct sock_txtime))) {
1560 			ret = -EFAULT;
1561 			break;
1562 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1563 			ret = -EINVAL;
1564 			break;
1565 		}
1566 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1567 		 * scheduler has enough safeguards.
1568 		 */
1569 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1570 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1571 			ret = -EPERM;
1572 			break;
1573 		}
1574 
1575 		ret = sockopt_validate_clockid(sk_txtime.clockid);
1576 		if (ret)
1577 			break;
1578 
1579 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1580 		sk->sk_clockid = sk_txtime.clockid;
1581 		sk->sk_txtime_deadline_mode =
1582 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1583 		sk->sk_txtime_report_errors =
1584 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1585 		break;
1586 
1587 	case SO_BINDTOIFINDEX:
1588 		ret = sock_bindtoindex_locked(sk, val);
1589 		break;
1590 
1591 	case SO_BUF_LOCK:
1592 		if (val & ~SOCK_BUF_LOCK_MASK) {
1593 			ret = -EINVAL;
1594 			break;
1595 		}
1596 		sk->sk_userlocks = val | (sk->sk_userlocks &
1597 					  ~SOCK_BUF_LOCK_MASK);
1598 		break;
1599 
1600 	case SO_RESERVE_MEM:
1601 	{
1602 		int delta;
1603 
1604 		if (val < 0) {
1605 			ret = -EINVAL;
1606 			break;
1607 		}
1608 
1609 		delta = val - sk->sk_reserved_mem;
1610 		if (delta < 0)
1611 			sock_release_reserved_memory(sk, -delta);
1612 		else
1613 			ret = sock_reserve_memory(sk, delta);
1614 		break;
1615 	}
1616 
1617 	default:
1618 		ret = -ENOPROTOOPT;
1619 		break;
1620 	}
1621 	sockopt_release_sock(sk);
1622 	return ret;
1623 }
1624 
1625 int sock_setsockopt(struct socket *sock, int level, int optname,
1626 		    sockptr_t optval, unsigned int optlen)
1627 {
1628 	return sk_setsockopt(sock->sk, level, optname,
1629 			     optval, optlen);
1630 }
1631 EXPORT_SYMBOL(sock_setsockopt);
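
/* Illustrative sketch (not part of this file): in-kernel callers wrap their
 * buffer in a kernel sockptr_t, e.g.:
 *
 *	int one = 1;
 *
 *	sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
 *			KERNEL_SOCKPTR(&one), sizeof(one));
 *
 * Most such callers are better served by the dedicated sock_set_*()
 * helpers above, which avoid the generic option parsing entirely.
 */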
1632 
1633 static const struct cred *sk_get_peer_cred(struct sock *sk)
1634 {
1635 	const struct cred *cred;
1636 
1637 	spin_lock(&sk->sk_peer_lock);
1638 	cred = get_cred(sk->sk_peer_cred);
1639 	spin_unlock(&sk->sk_peer_lock);
1640 
1641 	return cred;
1642 }
1643 
1644 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1645 			  struct ucred *ucred)
1646 {
1647 	ucred->pid = pid_vnr(pid);
1648 	ucred->uid = ucred->gid = -1;
1649 	if (cred) {
1650 		struct user_namespace *current_ns = current_user_ns();
1651 
1652 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1653 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1654 	}
1655 }
1656 
1657 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1658 {
1659 	struct user_namespace *user_ns = current_user_ns();
1660 	int i;
1661 
1662 	for (i = 0; i < src->ngroups; i++) {
1663 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1664 
1665 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1666 			return -EFAULT;
1667 	}
1668 
1669 	return 0;
1670 }
1671 
1672 int sk_getsockopt(struct sock *sk, int level, int optname,
1673 		  sockptr_t optval, sockptr_t optlen)
1674 {
1675 	struct socket *sock = sk->sk_socket;
1676 
1677 	union {
1678 		int val;
1679 		u64 val64;
1680 		unsigned long ulval;
1681 		struct linger ling;
1682 		struct old_timeval32 tm32;
1683 		struct __kernel_old_timeval tm;
1684 		struct  __kernel_sock_timeval stm;
1685 		struct sock_txtime txtime;
1686 		struct so_timestamping timestamping;
1687 	} v;
1688 
1689 	int lv = sizeof(int);
1690 	int len;
1691 
1692 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1693 		return -EFAULT;
1694 	if (len < 0)
1695 		return -EINVAL;
1696 
1697 	memset(&v, 0, sizeof(v));
1698 
1699 	switch (optname) {
1700 	case SO_DEBUG:
1701 		v.val = sock_flag(sk, SOCK_DBG);
1702 		break;
1703 
1704 	case SO_DONTROUTE:
1705 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1706 		break;
1707 
1708 	case SO_BROADCAST:
1709 		v.val = sock_flag(sk, SOCK_BROADCAST);
1710 		break;
1711 
1712 	case SO_SNDBUF:
1713 		v.val = READ_ONCE(sk->sk_sndbuf);
1714 		break;
1715 
1716 	case SO_RCVBUF:
1717 		v.val = READ_ONCE(sk->sk_rcvbuf);
1718 		break;
1719 
1720 	case SO_REUSEADDR:
1721 		v.val = sk->sk_reuse;
1722 		break;
1723 
1724 	case SO_REUSEPORT:
1725 		v.val = sk->sk_reuseport;
1726 		break;
1727 
1728 	case SO_KEEPALIVE:
1729 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1730 		break;
1731 
1732 	case SO_TYPE:
1733 		v.val = sk->sk_type;
1734 		break;
1735 
1736 	case SO_PROTOCOL:
1737 		v.val = sk->sk_protocol;
1738 		break;
1739 
1740 	case SO_DOMAIN:
1741 		v.val = sk->sk_family;
1742 		break;
1743 
1744 	case SO_ERROR:
1745 		v.val = -sock_error(sk);
1746 		if (v.val == 0)
1747 			v.val = xchg(&sk->sk_err_soft, 0);
1748 		break;
1749 
1750 	case SO_OOBINLINE:
1751 		v.val = sock_flag(sk, SOCK_URGINLINE);
1752 		break;
1753 
1754 	case SO_NO_CHECK:
1755 		v.val = sk->sk_no_check_tx;
1756 		break;
1757 
1758 	case SO_PRIORITY:
1759 		v.val = READ_ONCE(sk->sk_priority);
1760 		break;
1761 
1762 	case SO_LINGER:
1763 		lv		= sizeof(v.ling);
1764 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1765 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1766 		break;
1767 
1768 	case SO_BSDCOMPAT:
1769 		break;
1770 
1771 	case SO_TIMESTAMP_OLD:
1772 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1773 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1774 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1775 		break;
1776 
1777 	case SO_TIMESTAMPNS_OLD:
1778 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1779 		break;
1780 
1781 	case SO_TIMESTAMP_NEW:
1782 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1783 		break;
1784 
1785 	case SO_TIMESTAMPNS_NEW:
1786 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1787 		break;
1788 
1789 	case SO_TIMESTAMPING_OLD:
1790 	case SO_TIMESTAMPING_NEW:
1791 		lv = sizeof(v.timestamping);
1792 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1793 		 * returning the flags when they were set through the same option.
1794 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1795 		 */
1796 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1797 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1798 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1799 		}
1800 		break;
1801 
1802 	case SO_RCVTIMEO_OLD:
1803 	case SO_RCVTIMEO_NEW:
1804 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1805 				      SO_RCVTIMEO_OLD == optname);
1806 		break;
1807 
1808 	case SO_SNDTIMEO_OLD:
1809 	case SO_SNDTIMEO_NEW:
1810 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1811 				      SO_SNDTIMEO_OLD == optname);
1812 		break;
1813 
1814 	case SO_RCVLOWAT:
1815 		v.val = READ_ONCE(sk->sk_rcvlowat);
1816 		break;
1817 
1818 	case SO_SNDLOWAT:
1819 		v.val = 1;
1820 		break;
1821 
1822 	case SO_PASSCRED:
1823 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1824 		break;
1825 
1826 	case SO_PASSPIDFD:
1827 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1828 		break;
1829 
1830 	case SO_PEERCRED:
1831 	{
1832 		struct ucred peercred;
1833 		if (len > sizeof(peercred))
1834 			len = sizeof(peercred);
1835 
1836 		spin_lock(&sk->sk_peer_lock);
1837 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1838 		spin_unlock(&sk->sk_peer_lock);
1839 
1840 		if (copy_to_sockptr(optval, &peercred, len))
1841 			return -EFAULT;
1842 		goto lenout;
1843 	}
1844 
1845 	case SO_PEERPIDFD:
1846 	{
1847 		struct pid *peer_pid;
1848 		struct file *pidfd_file = NULL;
1849 		int pidfd;
1850 
1851 		if (len > sizeof(pidfd))
1852 			len = sizeof(pidfd);
1853 
1854 		spin_lock(&sk->sk_peer_lock);
1855 		peer_pid = get_pid(sk->sk_peer_pid);
1856 		spin_unlock(&sk->sk_peer_lock);
1857 
1858 		if (!peer_pid)
1859 			return -ENODATA;
1860 
1861 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1862 		put_pid(peer_pid);
1863 		if (pidfd < 0)
1864 			return pidfd;
1865 
1866 		if (copy_to_sockptr(optval, &pidfd, len) ||
1867 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1868 			put_unused_fd(pidfd);
1869 			fput(pidfd_file);
1870 
1871 			return -EFAULT;
1872 		}
1873 
1874 		fd_install(pidfd, pidfd_file);
1875 		return 0;
1876 	}
1877 
1878 	case SO_PEERGROUPS:
1879 	{
1880 		const struct cred *cred;
1881 		int ret, n;
1882 
1883 		cred = sk_get_peer_cred(sk);
1884 		if (!cred)
1885 			return -ENODATA;
1886 
1887 		n = cred->group_info->ngroups;
1888 		if (len < n * sizeof(gid_t)) {
1889 			len = n * sizeof(gid_t);
1890 			put_cred(cred);
1891 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1892 		}
1893 		len = n * sizeof(gid_t);
1894 
1895 		ret = groups_to_user(optval, cred->group_info);
1896 		put_cred(cred);
1897 		if (ret)
1898 			return ret;
1899 		goto lenout;
1900 	}
1901 
1902 	case SO_PEERNAME:
1903 	{
1904 		struct sockaddr_storage address;
1905 
1906 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1907 		if (lv < 0)
1908 			return -ENOTCONN;
1909 		if (lv < len)
1910 			return -EINVAL;
1911 		if (copy_to_sockptr(optval, &address, len))
1912 			return -EFAULT;
1913 		goto lenout;
1914 	}
1915 
1916 	/* Dubious BSD thing... Probably nobody even uses it, but
1917 	 * the UNIX standard wants it for whatever reason... -DaveM
1918 	 */
1919 	case SO_ACCEPTCONN:
1920 		v.val = sk->sk_state == TCP_LISTEN;
1921 		break;
1922 
1923 	case SO_PASSSEC:
1924 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1925 		break;
1926 
1927 	case SO_PEERSEC:
1928 		return security_socket_getpeersec_stream(sock,
1929 							 optval, optlen, len);
1930 
1931 	case SO_MARK:
1932 		v.val = READ_ONCE(sk->sk_mark);
1933 		break;
1934 
1935 	case SO_RCVMARK:
1936 		v.val = sock_flag(sk, SOCK_RCVMARK);
1937 		break;
1938 
1939 	case SO_RXQ_OVFL:
1940 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1941 		break;
1942 
1943 	case SO_WIFI_STATUS:
1944 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1945 		break;
1946 
1947 	case SO_PEEK_OFF:
1948 		if (!READ_ONCE(sock->ops)->set_peek_off)
1949 			return -EOPNOTSUPP;
1950 
1951 		v.val = READ_ONCE(sk->sk_peek_off);
1952 		break;
1953 	case SO_NOFCS:
1954 		v.val = sock_flag(sk, SOCK_NOFCS);
1955 		break;
1956 
1957 	case SO_BINDTODEVICE:
1958 		return sock_getbindtodevice(sk, optval, optlen, len);
1959 
1960 	case SO_GET_FILTER:
1961 		len = sk_get_filter(sk, optval, len);
1962 		if (len < 0)
1963 			return len;
1964 
1965 		goto lenout;
1966 
1967 	case SO_LOCK_FILTER:
1968 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1969 		break;
1970 
1971 	case SO_BPF_EXTENSIONS:
1972 		v.val = bpf_tell_extensions();
1973 		break;
1974 
1975 	case SO_SELECT_ERR_QUEUE:
1976 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1977 		break;
1978 
1979 #ifdef CONFIG_NET_RX_BUSY_POLL
1980 	case SO_BUSY_POLL:
1981 		v.val = READ_ONCE(sk->sk_ll_usec);
1982 		break;
1983 	case SO_PREFER_BUSY_POLL:
1984 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1985 		break;
1986 #endif
1987 
1988 	case SO_MAX_PACING_RATE:
1989 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
1990 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1991 			lv = sizeof(v.ulval);
1992 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1993 		} else {
1994 			/* 32bit version */
1995 			v.val = min_t(unsigned long, ~0U,
1996 				      READ_ONCE(sk->sk_max_pacing_rate));
1997 		}
1998 		break;
1999 
2000 	case SO_INCOMING_CPU:
2001 		v.val = READ_ONCE(sk->sk_incoming_cpu);
2002 		break;
2003 
2004 	case SO_MEMINFO:
2005 	{
2006 		u32 meminfo[SK_MEMINFO_VARS];
2007 
2008 		sk_get_meminfo(sk, meminfo);
2009 
2010 		len = min_t(unsigned int, len, sizeof(meminfo));
2011 		if (copy_to_sockptr(optval, &meminfo, len))
2012 			return -EFAULT;
2013 
2014 		goto lenout;
2015 	}
2016 
2017 #ifdef CONFIG_NET_RX_BUSY_POLL
2018 	case SO_INCOMING_NAPI_ID:
2019 		v.val = READ_ONCE(sk->sk_napi_id);
2020 
2021 		/* aggregate non-NAPI IDs down to 0 */
2022 		if (v.val < MIN_NAPI_ID)
2023 			v.val = 0;
2024 
2025 		break;
2026 #endif
2027 
2028 	case SO_COOKIE:
2029 		lv = sizeof(u64);
2030 		if (len < lv)
2031 			return -EINVAL;
2032 		v.val64 = sock_gen_cookie(sk);
2033 		break;
2034 
2035 	case SO_ZEROCOPY:
2036 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
2037 		break;
2038 
2039 	case SO_TXTIME:
2040 		lv = sizeof(v.txtime);
2041 		v.txtime.clockid = sk->sk_clockid;
2042 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2043 				  SOF_TXTIME_DEADLINE_MODE : 0;
2044 		v.txtime.flags |= sk->sk_txtime_report_errors ?
2045 				  SOF_TXTIME_REPORT_ERRORS : 0;
2046 		break;
2047 
2048 	case SO_BINDTOIFINDEX:
2049 		v.val = READ_ONCE(sk->sk_bound_dev_if);
2050 		break;
2051 
2052 	case SO_NETNS_COOKIE:
2053 		lv = sizeof(u64);
2054 		if (len != lv)
2055 			return -EINVAL;
2056 		v.val64 = sock_net(sk)->net_cookie;
2057 		break;
2058 
2059 	case SO_BUF_LOCK:
2060 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2061 		break;
2062 
2063 	case SO_RESERVE_MEM:
2064 		v.val = READ_ONCE(sk->sk_reserved_mem);
2065 		break;
2066 
2067 	case SO_TXREHASH:
2068 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2069 		v.val = READ_ONCE(sk->sk_txrehash);
2070 		break;
2071 
2072 	default:
2073 		/* We implement the SO_SNDLOWAT etc to not be settable
2074 		 * (1003.1g 7).
2075 		 */
2076 		return -ENOPROTOOPT;
2077 	}
2078 
2079 	if (len > lv)
2080 		len = lv;
2081 	if (copy_to_sockptr(optval, &v, len))
2082 		return -EFAULT;
2083 lenout:
2084 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2085 		return -EFAULT;
2086 	return 0;
2087 }
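
/* A minimal userspace-side sketch (not kernel code from this file) of how the
 * SO_PEERCRED and SO_PEERPIDFD options handled above might be consumed,
 * assuming a connected AF_UNIX socket descriptor 'fd':
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *	int pidfd;
 *
 *	if (!getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len))
 *		printf("peer pid=%d uid=%u\n", peer.pid, peer.uid);
 *
 *	len = sizeof(pidfd);
 *	if (!getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &pidfd, &len))
 *		close(pidfd);	// pidfd refers to the peer task; close when done
 */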
2088 
2089 /*
2090  * Initialize an sk_lock.
2091  *
2092  * (We also register the sk_lock with the lock validator.)
2093  */
2094 static inline void sock_lock_init(struct sock *sk)
2095 {
2096 	if (sk->sk_kern_sock)
2097 		sock_lock_init_class_and_name(
2098 			sk,
2099 			af_family_kern_slock_key_strings[sk->sk_family],
2100 			af_family_kern_slock_keys + sk->sk_family,
2101 			af_family_kern_key_strings[sk->sk_family],
2102 			af_family_kern_keys + sk->sk_family);
2103 	else
2104 		sock_lock_init_class_and_name(
2105 			sk,
2106 			af_family_slock_key_strings[sk->sk_family],
2107 			af_family_slock_keys + sk->sk_family,
2108 			af_family_key_strings[sk->sk_family],
2109 			af_family_keys + sk->sk_family);
2110 }
2111 
2112 /*
2113  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2114  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2115  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2116  */
2117 static void sock_copy(struct sock *nsk, const struct sock *osk)
2118 {
2119 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2120 #ifdef CONFIG_SECURITY_NETWORK
2121 	void *sptr = nsk->sk_security;
2122 #endif
2123 
2124 	/* If we move sk_tx_queue_mapping out of the private section,
2125 	 * we must check if sk_tx_queue_clear() is called after
2126 	 * sock_copy() in sk_clone_lock().
2127 	 */
2128 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2129 		     offsetof(struct sock, sk_dontcopy_begin) ||
2130 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2131 		     offsetof(struct sock, sk_dontcopy_end));
2132 
2133 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2134 
2135 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2136 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2137 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2138 
2139 #ifdef CONFIG_SECURITY_NETWORK
2140 	nsk->sk_security = sptr;
2141 	security_sk_clone(osk, nsk);
2142 #endif
2143 }
2144 
2145 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2146 		int family)
2147 {
2148 	struct sock *sk;
2149 	struct kmem_cache *slab;
2150 
2151 	slab = prot->slab;
2152 	if (slab != NULL) {
2153 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2154 		if (!sk)
2155 			return sk;
2156 		if (want_init_on_alloc(priority))
2157 			sk_prot_clear_nulls(sk, prot->obj_size);
2158 	} else
2159 		sk = kmalloc(prot->obj_size, priority);
2160 
2161 	if (sk != NULL) {
2162 		if (security_sk_alloc(sk, family, priority))
2163 			goto out_free;
2164 
2165 		if (!try_module_get(prot->owner))
2166 			goto out_free_sec;
2167 	}
2168 
2169 	return sk;
2170 
2171 out_free_sec:
2172 	security_sk_free(sk);
2173 out_free:
2174 	if (slab != NULL)
2175 		kmem_cache_free(slab, sk);
2176 	else
2177 		kfree(sk);
2178 	return NULL;
2179 }
2180 
2181 static void sk_prot_free(struct proto *prot, struct sock *sk)
2182 {
2183 	struct kmem_cache *slab;
2184 	struct module *owner;
2185 
2186 	owner = prot->owner;
2187 	slab = prot->slab;
2188 
2189 	cgroup_sk_free(&sk->sk_cgrp_data);
2190 	mem_cgroup_sk_free(sk);
2191 	security_sk_free(sk);
2192 	if (slab != NULL)
2193 		kmem_cache_free(slab, sk);
2194 	else
2195 		kfree(sk);
2196 	module_put(owner);
2197 }
2198 
2199 /**
2200  *	sk_alloc - All socket objects are allocated here
2201  *	@net: the applicable net namespace
2202  *	@family: protocol family
2203  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2204  *	@prot: struct proto associated with this new sock instance
2205  *	@kern: is this to be a kernel socket?
2206  */
2207 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2208 		      struct proto *prot, int kern)
2209 {
2210 	struct sock *sk;
2211 
2212 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2213 	if (sk) {
2214 		sk->sk_family = family;
2215 		/*
2216 		 * See comment in struct sock definition to understand
2217 		 * why we need sk_prot_creator -acme
2218 		 */
2219 		sk->sk_prot = sk->sk_prot_creator = prot;
2220 		sk->sk_kern_sock = kern;
2221 		sock_lock_init(sk);
2222 		sk->sk_net_refcnt = kern ? 0 : 1;
2223 		if (likely(sk->sk_net_refcnt)) {
2224 			get_net_track(net, &sk->ns_tracker, priority);
2225 			sock_inuse_add(net, 1);
2226 		} else {
2227 			__netns_tracker_alloc(net, &sk->ns_tracker,
2228 					      false, priority);
2229 		}
2230 
2231 		sock_net_set(sk, net);
2232 		refcount_set(&sk->sk_wmem_alloc, 1);
2233 
2234 		mem_cgroup_sk_alloc(sk);
2235 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2236 		sock_update_classid(&sk->sk_cgrp_data);
2237 		sock_update_netprioidx(&sk->sk_cgrp_data);
2238 		sk_tx_queue_clear(sk);
2239 	}
2240 
2241 	return sk;
2242 }
2243 EXPORT_SYMBOL(sk_alloc);
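
/* A rough sketch of how a protocol family's create() hook typically pairs
 * sk_alloc() with sock_init_data() further below; PF_EXAMPLE and example_proto
 * are illustrative names, not a real family in the tree:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 */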
2244 
2245 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2246  * grace period. This is the case for UDP sockets and TCP listeners.
2247  */
2248 static void __sk_destruct(struct rcu_head *head)
2249 {
2250 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2251 	struct sk_filter *filter;
2252 
2253 	if (sk->sk_destruct)
2254 		sk->sk_destruct(sk);
2255 
2256 	filter = rcu_dereference_check(sk->sk_filter,
2257 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2258 	if (filter) {
2259 		sk_filter_uncharge(sk, filter);
2260 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2261 	}
2262 
2263 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2264 
2265 #ifdef CONFIG_BPF_SYSCALL
2266 	bpf_sk_storage_free(sk);
2267 #endif
2268 
2269 	if (atomic_read(&sk->sk_omem_alloc))
2270 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2271 			 __func__, atomic_read(&sk->sk_omem_alloc));
2272 
2273 	if (sk->sk_frag.page) {
2274 		put_page(sk->sk_frag.page);
2275 		sk->sk_frag.page = NULL;
2276 	}
2277 
2278 	/* We do not need to acquire sk->sk_peer_lock; we are the last user. */
2279 	put_cred(sk->sk_peer_cred);
2280 	put_pid(sk->sk_peer_pid);
2281 
2282 	if (likely(sk->sk_net_refcnt))
2283 		put_net_track(sock_net(sk), &sk->ns_tracker);
2284 	else
2285 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2286 
2287 	sk_prot_free(sk->sk_prot_creator, sk);
2288 }
2289 
2290 void sk_destruct(struct sock *sk)
2291 {
2292 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2293 
2294 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2295 		reuseport_detach_sock(sk);
2296 		use_call_rcu = true;
2297 	}
2298 
2299 	if (use_call_rcu)
2300 		call_rcu(&sk->sk_rcu, __sk_destruct);
2301 	else
2302 		__sk_destruct(&sk->sk_rcu);
2303 }
2304 
2305 static void __sk_free(struct sock *sk)
2306 {
2307 	if (likely(sk->sk_net_refcnt))
2308 		sock_inuse_add(sock_net(sk), -1);
2309 
2310 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2311 		sock_diag_broadcast_destroy(sk);
2312 	else
2313 		sk_destruct(sk);
2314 }
2315 
2316 void sk_free(struct sock *sk)
2317 {
2318 	/*
2319 	 * We subtract one from sk_wmem_alloc and can know if
2320 	 * some packets are still in some tx queue.
2321 	 * If not null, sock_wfree() will call __sk_free(sk) later
2322 	 */
2323 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2324 		__sk_free(sk);
2325 }
2326 EXPORT_SYMBOL(sk_free);
2327 
2328 static void sk_init_common(struct sock *sk)
2329 {
2330 	skb_queue_head_init(&sk->sk_receive_queue);
2331 	skb_queue_head_init(&sk->sk_write_queue);
2332 	skb_queue_head_init(&sk->sk_error_queue);
2333 
2334 	rwlock_init(&sk->sk_callback_lock);
2335 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2336 			af_rlock_keys + sk->sk_family,
2337 			af_family_rlock_key_strings[sk->sk_family]);
2338 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2339 			af_wlock_keys + sk->sk_family,
2340 			af_family_wlock_key_strings[sk->sk_family]);
2341 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2342 			af_elock_keys + sk->sk_family,
2343 			af_family_elock_key_strings[sk->sk_family]);
2344 	if (sk->sk_kern_sock)
2345 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2346 			af_kern_callback_keys + sk->sk_family,
2347 			af_family_kern_clock_key_strings[sk->sk_family]);
2348 	else
2349 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2350 			af_callback_keys + sk->sk_family,
2351 			af_family_clock_key_strings[sk->sk_family]);
2352 }
2353 
2354 /**
2355  *	sk_clone_lock - clone a socket, and lock its clone
2356  *	@sk: the socket to clone
2357  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2358  *
2359  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2360  */
2361 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2362 {
2363 	struct proto *prot = READ_ONCE(sk->sk_prot);
2364 	struct sk_filter *filter;
2365 	bool is_charged = true;
2366 	struct sock *newsk;
2367 
2368 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2369 	if (!newsk)
2370 		goto out;
2371 
2372 	sock_copy(newsk, sk);
2373 
2374 	newsk->sk_prot_creator = prot;
2375 
2376 	/* SANITY */
2377 	if (likely(newsk->sk_net_refcnt)) {
2378 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2379 		sock_inuse_add(sock_net(newsk), 1);
2380 	} else {
2381 		/* Kernel sockets do not elevate the struct net refcount.
2382 		 * Instead, use a tracker to more easily detect if a layer
2383 		 * is not properly dismantling its kernel sockets at netns
2384 		 * destroy time.
2385 		 */
2386 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2387 				      false, priority);
2388 	}
2389 	sk_node_init(&newsk->sk_node);
2390 	sock_lock_init(newsk);
2391 	bh_lock_sock(newsk);
2392 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2393 	newsk->sk_backlog.len = 0;
2394 
2395 	atomic_set(&newsk->sk_rmem_alloc, 0);
2396 
2397 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2398 	refcount_set(&newsk->sk_wmem_alloc, 1);
2399 
2400 	atomic_set(&newsk->sk_omem_alloc, 0);
2401 	sk_init_common(newsk);
2402 
2403 	newsk->sk_dst_cache	= NULL;
2404 	newsk->sk_dst_pending_confirm = 0;
2405 	newsk->sk_wmem_queued	= 0;
2406 	newsk->sk_forward_alloc = 0;
2407 	newsk->sk_reserved_mem  = 0;
2408 	atomic_set(&newsk->sk_drops, 0);
2409 	newsk->sk_send_head	= NULL;
2410 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2411 	atomic_set(&newsk->sk_zckey, 0);
2412 
2413 	sock_reset_flag(newsk, SOCK_DONE);
2414 
2415 	/* sk->sk_memcg will be populated at accept() time */
2416 	newsk->sk_memcg = NULL;
2417 
2418 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2419 
2420 	rcu_read_lock();
2421 	filter = rcu_dereference(sk->sk_filter);
2422 	if (filter != NULL)
2423 		/* though it's an empty new sock, the charging may fail
2424 		 * if sysctl_optmem_max was changed between the creation of
2425 		 * the original socket and this clone
2426 		 */
2427 		is_charged = sk_filter_charge(newsk, filter);
2428 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2429 	rcu_read_unlock();
2430 
2431 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2432 		/* We need to make sure that we don't uncharge the new
2433 		 * socket if we couldn't charge it in the first place
2434 		 * as otherwise we uncharge the parent's filter.
2435 		 */
2436 		if (!is_charged)
2437 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2438 		sk_free_unlock_clone(newsk);
2439 		newsk = NULL;
2440 		goto out;
2441 	}
2442 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2443 
2444 	if (bpf_sk_storage_clone(sk, newsk)) {
2445 		sk_free_unlock_clone(newsk);
2446 		newsk = NULL;
2447 		goto out;
2448 	}
2449 
2450 	/* Clear sk_user_data if parent had the pointer tagged
2451 	 * as not suitable for copying when cloning.
2452 	 */
2453 	if (sk_user_data_is_nocopy(newsk))
2454 		newsk->sk_user_data = NULL;
2455 
2456 	newsk->sk_err	   = 0;
2457 	newsk->sk_err_soft = 0;
2458 	newsk->sk_priority = 0;
2459 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2460 
2461 	/* Before updating sk_refcnt, we must commit prior changes to memory
2462 	 * (Documentation/RCU/rculist_nulls.rst for details)
2463 	 */
2464 	smp_wmb();
2465 	refcount_set(&newsk->sk_refcnt, 2);
2466 
2467 	sk_set_socket(newsk, NULL);
2468 	sk_tx_queue_clear(newsk);
2469 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2470 
2471 	if (newsk->sk_prot->sockets_allocated)
2472 		sk_sockets_allocated_inc(newsk);
2473 
2474 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2475 		net_enable_timestamp();
2476 out:
2477 	return newsk;
2478 }
2479 EXPORT_SYMBOL_GPL(sk_clone_lock);
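
/* As the kernel-doc above notes, the clone comes back locked by bh_lock_sock();
 * a hedged sketch of the caller side (compare the TCP/SCTP accept paths):
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... protocol-specific child initialisation ...
 *		bh_unlock_sock(newsk);
 *	}
 */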
2480 
2481 void sk_free_unlock_clone(struct sock *sk)
2482 {
2483 	/* It is still a raw copy of the parent, so invalidate
2484 	 * the destructor and do a plain sk_free() */
2485 	sk->sk_destruct = NULL;
2486 	bh_unlock_sock(sk);
2487 	sk_free(sk);
2488 }
2489 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2490 
2491 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2492 {
2493 	bool is_ipv6 = false;
2494 	u32 max_size;
2495 
2496 #if IS_ENABLED(CONFIG_IPV6)
2497 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2498 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2499 #endif
2500 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2501 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2502 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2503 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2504 		max_size = GSO_LEGACY_MAX_SIZE;
2505 
2506 	return max_size - (MAX_TCP_HEADER + 1);
2507 }
2508 
2509 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2510 {
2511 	u32 max_segs = 1;
2512 
2513 	sk->sk_route_caps = dst->dev->features;
2514 	if (sk_is_tcp(sk))
2515 		sk->sk_route_caps |= NETIF_F_GSO;
2516 	if (sk->sk_route_caps & NETIF_F_GSO)
2517 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2518 	if (unlikely(sk->sk_gso_disabled))
2519 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2520 	if (sk_can_gso(sk)) {
2521 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2522 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2523 		} else {
2524 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2525 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2526 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2527 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2528 		}
2529 	}
2530 	sk->sk_gso_max_segs = max_segs;
2531 	sk_dst_set(sk, dst);
2532 }
2533 EXPORT_SYMBOL_GPL(sk_setup_caps);
2534 
2535 /*
2536  *	Simple resource managers for sockets.
2537  */
2538 
2539 
2540 /*
2541  * Write buffer destructor automatically called from kfree_skb.
2542  */
2543 void sock_wfree(struct sk_buff *skb)
2544 {
2545 	struct sock *sk = skb->sk;
2546 	unsigned int len = skb->truesize;
2547 	bool free;
2548 
2549 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2550 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2551 		    sk->sk_write_space == sock_def_write_space) {
2552 			rcu_read_lock();
2553 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2554 			sock_def_write_space_wfree(sk);
2555 			rcu_read_unlock();
2556 			if (unlikely(free))
2557 				__sk_free(sk);
2558 			return;
2559 		}
2560 
2561 		/*
2562 		 * Keep a reference on sk_wmem_alloc; this will be released
2563 		 * after the sk_write_space() call
2564 		 */
2565 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2566 		sk->sk_write_space(sk);
2567 		len = 1;
2568 	}
2569 	/*
2570 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2571 	 * could not do because of in-flight packets
2572 	 */
2573 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2574 		__sk_free(sk);
2575 }
2576 EXPORT_SYMBOL(sock_wfree);
2577 
2578 /* This variant of sock_wfree() is used by TCP,
2579  * since it sets SOCK_USE_WRITE_QUEUE.
2580  */
2581 void __sock_wfree(struct sk_buff *skb)
2582 {
2583 	struct sock *sk = skb->sk;
2584 
2585 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2586 		__sk_free(sk);
2587 }
2588 
2589 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2590 {
2591 	skb_orphan(skb);
2592 #ifdef CONFIG_INET
2593 	if (unlikely(!sk_fullsock(sk)))
2594 		return skb_set_owner_edemux(skb, sk);
2595 #endif
2596 	skb->sk = sk;
2597 	skb->destructor = sock_wfree;
2598 	skb_set_hash_from_sk(skb, sk);
2599 	/*
2600 	 * We used to take a refcount on sk, but the following operation
2601 	 * is enough to guarantee sk_free() won't free this sock until
2602 	 * all in-flight packets are completed
2603 	 */
2604 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2605 }
2606 EXPORT_SYMBOL(skb_set_owner_w);
2607 
2608 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2609 {
2610 	/* Drivers depend on in-order delivery for crypto offload,
2611 	 * partial orphan breaks out-of-order-OK logic.
2612 	 */
2613 	if (skb_is_decrypted(skb))
2614 		return false;
2615 
2616 	return (skb->destructor == sock_wfree ||
2617 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2618 }
2619 
2620 /* This helper is used by netem, as it can hold packets in its
2621  * delay queue. We want to allow the owner socket to send more
2622  * packets, as if they were already TX completed by a typical driver.
2623  * But we also want to keep skb->sk set because some packet schedulers
2624  * rely on it (sch_fq for example).
2625  */
2626 void skb_orphan_partial(struct sk_buff *skb)
2627 {
2628 	if (skb_is_tcp_pure_ack(skb))
2629 		return;
2630 
2631 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2632 		return;
2633 
2634 	skb_orphan(skb);
2635 }
2636 EXPORT_SYMBOL(skb_orphan_partial);
2637 
2638 /*
2639  * Read buffer destructor automatically called from kfree_skb.
2640  */
2641 void sock_rfree(struct sk_buff *skb)
2642 {
2643 	struct sock *sk = skb->sk;
2644 	unsigned int len = skb->truesize;
2645 
2646 	atomic_sub(len, &sk->sk_rmem_alloc);
2647 	sk_mem_uncharge(sk, len);
2648 }
2649 EXPORT_SYMBOL(sock_rfree);
2650 
2651 /*
2652  * Buffer destructor for skbs that are not used directly in read or write
2653  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2654  */
2655 void sock_efree(struct sk_buff *skb)
2656 {
2657 	sock_put(skb->sk);
2658 }
2659 EXPORT_SYMBOL(sock_efree);
2660 
2661 /* Buffer destructor for prefetch/receive path where reference count may
2662  * not be held, e.g. for listen sockets.
2663  */
2664 #ifdef CONFIG_INET
2665 void sock_pfree(struct sk_buff *skb)
2666 {
2667 	struct sock *sk = skb->sk;
2668 
2669 	if (!sk_is_refcounted(sk))
2670 		return;
2671 
2672 	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2673 		inet_reqsk(sk)->rsk_listener = NULL;
2674 		reqsk_free(inet_reqsk(sk));
2675 		return;
2676 	}
2677 
2678 	sock_gen_put(sk);
2679 }
2680 EXPORT_SYMBOL(sock_pfree);
2681 #endif /* CONFIG_INET */
2682 
2683 kuid_t sock_i_uid(struct sock *sk)
2684 {
2685 	kuid_t uid;
2686 
2687 	read_lock_bh(&sk->sk_callback_lock);
2688 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2689 	read_unlock_bh(&sk->sk_callback_lock);
2690 	return uid;
2691 }
2692 EXPORT_SYMBOL(sock_i_uid);
2693 
2694 unsigned long __sock_i_ino(struct sock *sk)
2695 {
2696 	unsigned long ino;
2697 
2698 	read_lock(&sk->sk_callback_lock);
2699 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2700 	read_unlock(&sk->sk_callback_lock);
2701 	return ino;
2702 }
2703 EXPORT_SYMBOL(__sock_i_ino);
2704 
2705 unsigned long sock_i_ino(struct sock *sk)
2706 {
2707 	unsigned long ino;
2708 
2709 	local_bh_disable();
2710 	ino = __sock_i_ino(sk);
2711 	local_bh_enable();
2712 	return ino;
2713 }
2714 EXPORT_SYMBOL(sock_i_ino);
2715 
2716 /*
2717  * Allocate a skb from the socket's send buffer.
2718  */
2719 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2720 			     gfp_t priority)
2721 {
2722 	if (force ||
2723 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2724 		struct sk_buff *skb = alloc_skb(size, priority);
2725 
2726 		if (skb) {
2727 			skb_set_owner_w(skb, sk);
2728 			return skb;
2729 		}
2730 	}
2731 	return NULL;
2732 }
2733 EXPORT_SYMBOL(sock_wmalloc);
2734 
2735 static void sock_ofree(struct sk_buff *skb)
2736 {
2737 	struct sock *sk = skb->sk;
2738 
2739 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2740 }
2741 
2742 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2743 			     gfp_t priority)
2744 {
2745 	struct sk_buff *skb;
2746 
2747 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2748 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2749 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2750 		return NULL;
2751 
2752 	skb = alloc_skb(size, priority);
2753 	if (!skb)
2754 		return NULL;
2755 
2756 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2757 	skb->sk = sk;
2758 	skb->destructor = sock_ofree;
2759 	return skb;
2760 }
2761 
2762 /*
2763  * Allocate a memory block from the socket's option memory buffer.
2764  */
2765 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2766 {
2767 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2768 
2769 	if ((unsigned int)size <= optmem_max &&
2770 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2771 		void *mem;
2772 		/* First do the add, to avoid the race if kmalloc
2773 		 * might sleep.
2774 		 */
2775 		atomic_add(size, &sk->sk_omem_alloc);
2776 		mem = kmalloc(size, priority);
2777 		if (mem)
2778 			return mem;
2779 		atomic_sub(size, &sk->sk_omem_alloc);
2780 	}
2781 	return NULL;
2782 }
2783 EXPORT_SYMBOL(sock_kmalloc);
2784 
2785 /* Free an option memory block. Note, we actually want the inline
2786  * here as this allows gcc to detect the nullify and fold away the
2787  * condition entirely.
2788  */
2789 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2790 				  const bool nullify)
2791 {
2792 	if (WARN_ON_ONCE(!mem))
2793 		return;
2794 	if (nullify)
2795 		kfree_sensitive(mem);
2796 	else
2797 		kfree(mem);
2798 	atomic_sub(size, &sk->sk_omem_alloc);
2799 }
2800 
2801 void sock_kfree_s(struct sock *sk, void *mem, int size)
2802 {
2803 	__sock_kfree_s(sk, mem, size, false);
2804 }
2805 EXPORT_SYMBOL(sock_kfree_s);
2806 
2807 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2808 {
2809 	__sock_kfree_s(sk, mem, size, true);
2810 }
2811 EXPORT_SYMBOL(sock_kzfree_s);
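
/* A pairing sketch for the option-memory helpers above, assuming a protocol
 * keeping a small per-socket blob of length 'len' (hypothetical code, not a
 * caller in this file):
 *
 *	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	// ... use buf ...
 *	sock_kfree_s(sk, buf, len);	// or sock_kzfree_s() for key material
 */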
2812 
2813 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2814    I think these locks should be removed for datagram sockets.
2815  */
2816 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2817 {
2818 	DEFINE_WAIT(wait);
2819 
2820 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2821 	for (;;) {
2822 		if (!timeo)
2823 			break;
2824 		if (signal_pending(current))
2825 			break;
2826 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2827 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2828 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2829 			break;
2830 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2831 			break;
2832 		if (READ_ONCE(sk->sk_err))
2833 			break;
2834 		timeo = schedule_timeout(timeo);
2835 	}
2836 	finish_wait(sk_sleep(sk), &wait);
2837 	return timeo;
2838 }
2839 
2840 
2841 /*
2842  *	Generic send/receive buffer handlers
2843  */
2844 
2845 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2846 				     unsigned long data_len, int noblock,
2847 				     int *errcode, int max_page_order)
2848 {
2849 	struct sk_buff *skb;
2850 	long timeo;
2851 	int err;
2852 
2853 	timeo = sock_sndtimeo(sk, noblock);
2854 	for (;;) {
2855 		err = sock_error(sk);
2856 		if (err != 0)
2857 			goto failure;
2858 
2859 		err = -EPIPE;
2860 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2861 			goto failure;
2862 
2863 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2864 			break;
2865 
2866 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2867 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2868 		err = -EAGAIN;
2869 		if (!timeo)
2870 			goto failure;
2871 		if (signal_pending(current))
2872 			goto interrupted;
2873 		timeo = sock_wait_for_wmem(sk, timeo);
2874 	}
2875 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2876 				   errcode, sk->sk_allocation);
2877 	if (skb)
2878 		skb_set_owner_w(skb, sk);
2879 	return skb;
2880 
2881 interrupted:
2882 	err = sock_intr_errno(timeo);
2883 failure:
2884 	*errcode = err;
2885 	return NULL;
2886 }
2887 EXPORT_SYMBOL(sock_alloc_send_pskb);
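
/* A hedged sketch of a datagram sendmsg() path using the allocator above;
 * 'hlen' and 'dlen' are illustrative header/data lengths:
 *
 *	skb = sock_alloc_send_pskb(sk, hlen, dlen,
 *				   msg->msg_flags & MSG_DONTWAIT, &err, 0);
 *	if (!skb)
 *		goto out_err;	// err already holds -EAGAIN, -EPIPE, ...
 */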
2888 
2889 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2890 		     struct sockcm_cookie *sockc)
2891 {
2892 	u32 tsflags;
2893 
2894 	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
2895 
2896 	switch (cmsg->cmsg_type) {
2897 	case SO_MARK:
2898 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2899 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2900 			return -EPERM;
2901 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2902 			return -EINVAL;
2903 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2904 		break;
2905 	case SO_TIMESTAMPING_OLD:
2906 	case SO_TIMESTAMPING_NEW:
2907 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2908 			return -EINVAL;
2909 
2910 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2911 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2912 			return -EINVAL;
2913 
2914 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2915 		sockc->tsflags |= tsflags;
2916 		break;
2917 	case SCM_TXTIME:
2918 		if (!sock_flag(sk, SOCK_TXTIME))
2919 			return -EINVAL;
2920 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2921 			return -EINVAL;
2922 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2923 		break;
2924 	case SCM_TS_OPT_ID:
2925 		if (sk_is_tcp(sk))
2926 			return -EINVAL;
2927 		tsflags = READ_ONCE(sk->sk_tsflags);
2928 		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
2929 			return -EINVAL;
2930 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2931 			return -EINVAL;
2932 		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
2933 		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
2934 		break;
2935 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2936 	case SCM_RIGHTS:
2937 	case SCM_CREDENTIALS:
2938 		break;
2939 	default:
2940 		return -EINVAL;
2941 	}
2942 	return 0;
2943 }
2944 EXPORT_SYMBOL(__sock_cmsg_send);
2945 
2946 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2947 		   struct sockcm_cookie *sockc)
2948 {
2949 	struct cmsghdr *cmsg;
2950 	int ret;
2951 
2952 	for_each_cmsghdr(cmsg, msg) {
2953 		if (!CMSG_OK(msg, cmsg))
2954 			return -EINVAL;
2955 		if (cmsg->cmsg_level != SOL_SOCKET)
2956 			continue;
2957 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2958 		if (ret)
2959 			return ret;
2960 	}
2961 	return 0;
2962 }
2963 EXPORT_SYMBOL(sock_cmsg_send);
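
/* A userspace-side sketch of the SOL_SOCKET control messages parsed above,
 * here attaching SO_MARK to a single sendmsg() call (requires CAP_NET_ADMIN
 * or CAP_NET_RAW, as checked in __sock_cmsg_send()):
 *
 *	char cbuf[CMSG_SPACE(sizeof(uint32_t))] = {};
 *	uint32_t mark = 42;
 *	struct msghdr msg = { .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type  = SO_MARK;
 *	cm->cmsg_len   = CMSG_LEN(sizeof(uint32_t));
 *	memcpy(CMSG_DATA(cm), &mark, sizeof(mark));
 *	// set msg.msg_iov / msg_name as usual, then sendmsg(fd, &msg, 0)
 */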
2964 
2965 static void sk_enter_memory_pressure(struct sock *sk)
2966 {
2967 	if (!sk->sk_prot->enter_memory_pressure)
2968 		return;
2969 
2970 	sk->sk_prot->enter_memory_pressure(sk);
2971 }
2972 
2973 static void sk_leave_memory_pressure(struct sock *sk)
2974 {
2975 	if (sk->sk_prot->leave_memory_pressure) {
2976 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2977 				     tcp_leave_memory_pressure, sk);
2978 	} else {
2979 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2980 
2981 		if (memory_pressure && READ_ONCE(*memory_pressure))
2982 			WRITE_ONCE(*memory_pressure, 0);
2983 	}
2984 }
2985 
2986 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2987 
2988 /**
2989  * skb_page_frag_refill - check that a page_frag contains enough room
2990  * @sz: minimum size of the fragment we want to get
2991  * @pfrag: pointer to page_frag
2992  * @gfp: priority for memory allocation
2993  *
2994  * Note: While this allocator tries to use high order pages, there is
2995  * no guarantee that allocations succeed. Therefore, @sz MUST be
2996  * less or equal than PAGE_SIZE.
2997  * less than or equal to PAGE_SIZE.
2998 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2999 {
3000 	if (pfrag->page) {
3001 		if (page_ref_count(pfrag->page) == 1) {
3002 			pfrag->offset = 0;
3003 			return true;
3004 		}
3005 		if (pfrag->offset + sz <= pfrag->size)
3006 			return true;
3007 		put_page(pfrag->page);
3008 	}
3009 
3010 	pfrag->offset = 0;
3011 	if (SKB_FRAG_PAGE_ORDER &&
3012 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3013 		/* Avoid direct reclaim but allow kswapd to wake */
3014 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3015 					  __GFP_COMP | __GFP_NOWARN |
3016 					  __GFP_NORETRY,
3017 					  SKB_FRAG_PAGE_ORDER);
3018 		if (likely(pfrag->page)) {
3019 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3020 			return true;
3021 		}
3022 	}
3023 	pfrag->page = alloc_page(gfp);
3024 	if (likely(pfrag->page)) {
3025 		pfrag->size = PAGE_SIZE;
3026 		return true;
3027 	}
3028 	return false;
3029 }
3030 EXPORT_SYMBOL(skb_page_frag_refill);
3031 
3032 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3033 {
3034 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3035 		return true;
3036 
3037 	sk_enter_memory_pressure(sk);
3038 	sk_stream_moderate_sndbuf(sk);
3039 	return false;
3040 }
3041 EXPORT_SYMBOL(sk_page_frag_refill);
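
/* The usual caller pattern for the refill helpers above, roughly as in
 * tcp_sendmsg_locked() (a sketch, not a verbatim copy):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	// copy user data into pfrag->page at pfrag->offset,
 *	// then advance pfrag->offset by the amount copied
 */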
3042 
3043 void __lock_sock(struct sock *sk)
3044 	__releases(&sk->sk_lock.slock)
3045 	__acquires(&sk->sk_lock.slock)
3046 {
3047 	DEFINE_WAIT(wait);
3048 
3049 	for (;;) {
3050 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3051 					TASK_UNINTERRUPTIBLE);
3052 		spin_unlock_bh(&sk->sk_lock.slock);
3053 		schedule();
3054 		spin_lock_bh(&sk->sk_lock.slock);
3055 		if (!sock_owned_by_user(sk))
3056 			break;
3057 	}
3058 	finish_wait(&sk->sk_lock.wq, &wait);
3059 }
3060 
3061 void __release_sock(struct sock *sk)
3062 	__releases(&sk->sk_lock.slock)
3063 	__acquires(&sk->sk_lock.slock)
3064 {
3065 	struct sk_buff *skb, *next;
3066 
3067 	while ((skb = sk->sk_backlog.head) != NULL) {
3068 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3069 
3070 		spin_unlock_bh(&sk->sk_lock.slock);
3071 
3072 		do {
3073 			next = skb->next;
3074 			prefetch(next);
3075 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3076 			skb_mark_not_on_list(skb);
3077 			sk_backlog_rcv(sk, skb);
3078 
3079 			cond_resched();
3080 
3081 			skb = next;
3082 		} while (skb != NULL);
3083 
3084 		spin_lock_bh(&sk->sk_lock.slock);
3085 	}
3086 
3087 	/*
3088 	 * Doing the zeroing here guarantees we cannot loop forever
3089 	 * while a wild producer attempts to flood us.
3090 	 */
3091 	sk->sk_backlog.len = 0;
3092 }
3093 
3094 void __sk_flush_backlog(struct sock *sk)
3095 {
3096 	spin_lock_bh(&sk->sk_lock.slock);
3097 	__release_sock(sk);
3098 
3099 	if (sk->sk_prot->release_cb)
3100 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3101 				     tcp_release_cb, sk);
3102 
3103 	spin_unlock_bh(&sk->sk_lock.slock);
3104 }
3105 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3106 
3107 /**
3108  * sk_wait_data - wait for data to arrive at sk_receive_queue
3109  * @sk:    sock to wait on
3110  * @timeo: for how long
3111  * @skb:   last skb seen on sk_receive_queue
3112  *
3113  * Socket state, including sk->sk_err, is now changed only under the lock,
3114  * hence we may omit checks after joining the wait queue.
3115  * We check the receive queue before schedule() only as an optimization;
3116  * it is very likely that release_sock() added new data.
3117  */
3118 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3119 {
3120 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3121 	int rc;
3122 
3123 	add_wait_queue(sk_sleep(sk), &wait);
3124 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3125 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3126 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3127 	remove_wait_queue(sk_sleep(sk), &wait);
3128 	return rc;
3129 }
3130 EXPORT_SYMBOL(sk_wait_data);
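
/* A sketch of the receive-side caller pattern, under lock_sock() as required
 * (loosely modelled on simple datagram protocols; error handling trimmed):
 *
 *	lock_sock(sk);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 *	release_sock(sk);
 */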
3131 
3132 /**
3133  *	__sk_mem_raise_allocated - increase memory_allocated
3134  *	@sk: socket
3135  *	@size: memory size to allocate
3136  *	@amt: pages to allocate
3137  *	@kind: allocation type
3138  *
3139  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3140  *
3141  *	Unlike the globally shared limits among the sockets under the same protocol,
3142  *	consuming the budget of a memcg won't have a direct effect on other memcgs.
3143  *	So be optimistic about memcg's tolerance, and leave the callers to decide
3144  *	whether or not to raise allocated through sk_under_memory_pressure() or
3145  *	its variants.
3146  */
3147 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3148 {
3149 	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3150 	struct proto *prot = sk->sk_prot;
3151 	bool charged = false;
3152 	long allocated;
3153 
3154 	sk_memory_allocated_add(sk, amt);
3155 	allocated = sk_memory_allocated(sk);
3156 
3157 	if (memcg) {
3158 		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3159 			goto suppress_allocation;
3160 		charged = true;
3161 	}
3162 
3163 	/* Under limit. */
3164 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3165 		sk_leave_memory_pressure(sk);
3166 		return 1;
3167 	}
3168 
3169 	/* Under pressure. */
3170 	if (allocated > sk_prot_mem_limits(sk, 1))
3171 		sk_enter_memory_pressure(sk);
3172 
3173 	/* Over hard limit. */
3174 	if (allocated > sk_prot_mem_limits(sk, 2))
3175 		goto suppress_allocation;
3176 
3177 	/* Guarantee minimum buffer size under pressure (either global
3178 	 * or memcg) to make sure features described in RFC 7323 (TCP
3179 	 * Extensions for High Performance) work properly.
3180 	 *
3181 	 * This rule does NOT stand when usage exceeds the global or memcg's hard
3182 	 * limit, or else a DoS attack could take place by spawning
3183 	 * lots of sockets whose usage is under the minimum buffer size.
3184 	 */
3185 	if (kind == SK_MEM_RECV) {
3186 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3187 			return 1;
3188 
3189 	} else { /* SK_MEM_SEND */
3190 		int wmem0 = sk_get_wmem0(sk, prot);
3191 
3192 		if (sk->sk_type == SOCK_STREAM) {
3193 			if (sk->sk_wmem_queued < wmem0)
3194 				return 1;
3195 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3196 				return 1;
3197 		}
3198 	}
3199 
3200 	if (sk_has_memory_pressure(sk)) {
3201 		u64 alloc;
3202 
3203 		/* The following 'average' heuristic is within the
3204 		 * scope of global accounting, so it only makes
3205 		 * sense for global memory pressure.
3206 		 */
3207 		if (!sk_under_global_memory_pressure(sk))
3208 			return 1;
3209 
3210 		/* Try to be fair among all the sockets under global
3211 		 * pressure by allowing the ones with below-average
3212 		 * usage to raise their allocation.
3213 		 */
3214 		alloc = sk_sockets_allocated_read_positive(sk);
3215 		if (sk_prot_mem_limits(sk, 2) > alloc *
3216 		    sk_mem_pages(sk->sk_wmem_queued +
3217 				 atomic_read(&sk->sk_rmem_alloc) +
3218 				 sk->sk_forward_alloc))
3219 			return 1;
3220 	}
3221 
3222 suppress_allocation:
3223 
3224 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3225 		sk_stream_moderate_sndbuf(sk);
3226 
3227 		/* Fail only if socket is _under_ its sndbuf.
3228 		 * In this case we cannot block, so that we have to fail.
3229 		 */
3230 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3231 			/* Force charge with __GFP_NOFAIL */
3232 			if (memcg && !charged) {
3233 				mem_cgroup_charge_skmem(memcg, amt,
3234 					gfp_memcg_charge() | __GFP_NOFAIL);
3235 			}
3236 			return 1;
3237 		}
3238 	}
3239 
3240 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3241 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3242 
3243 	sk_memory_allocated_sub(sk, amt);
3244 
3245 	if (charged)
3246 		mem_cgroup_uncharge_skmem(memcg, amt);
3247 
3248 	return 0;
3249 }
3250 
3251 /**
3252  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3253  *	@sk: socket
3254  *	@size: memory size to allocate
3255  *	@kind: allocation type
3256  *
3257  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3258  *	rmem allocation. This function assumes that protocols which have
3259  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3260  */
3261 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3262 {
3263 	int ret, amt = sk_mem_pages(size);
3264 
3265 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3266 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3267 	if (!ret)
3268 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3269 	return ret;
3270 }
3271 EXPORT_SYMBOL(__sk_mem_schedule);
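
/* A worked example of the accounting above, assuming PAGE_SIZE == 4096:
 * charging a 1500-byte skb gives amt = sk_mem_pages(1500) = 1, so
 * sk_forward_alloc grows by 1 << PAGE_SHIFT = 4096 bytes while
 * memory_allocated grows by one page; the unused 4096 - 1500 bytes stay in
 * sk_forward_alloc for later charges until __sk_mem_reclaim() returns them.
 */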
3272 
3273 /**
3274  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3275  *	@sk: socket
3276  *	@amount: number of quanta
3277  *
3278  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3279  */
3280 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3281 {
3282 	sk_memory_allocated_sub(sk, amount);
3283 
3284 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3285 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3286 
3287 	if (sk_under_global_memory_pressure(sk) &&
3288 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3289 		sk_leave_memory_pressure(sk);
3290 }
3291 
3292 /**
3293  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3294  *	@sk: socket
3295  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3296  */
3297 void __sk_mem_reclaim(struct sock *sk, int amount)
3298 {
3299 	amount >>= PAGE_SHIFT;
3300 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3301 	__sk_mem_reduce_allocated(sk, amount);
3302 }
3303 EXPORT_SYMBOL(__sk_mem_reclaim);
3304 
3305 int sk_set_peek_off(struct sock *sk, int val)
3306 {
3307 	WRITE_ONCE(sk->sk_peek_off, val);
3308 	return 0;
3309 }
3310 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3311 
3312 /*
3313  * Set of default routines for initialising struct proto_ops when
3314  * the protocol does not support a particular function. In certain
3315  * cases where it makes no sense for a protocol to have a "do nothing"
3316  * function, some default processing is provided.
3317  */
3318 
3319 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3320 {
3321 	return -EOPNOTSUPP;
3322 }
3323 EXPORT_SYMBOL(sock_no_bind);
3324 
3325 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3326 		    int len, int flags)
3327 {
3328 	return -EOPNOTSUPP;
3329 }
3330 EXPORT_SYMBOL(sock_no_connect);
3331 
3332 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3333 {
3334 	return -EOPNOTSUPP;
3335 }
3336 EXPORT_SYMBOL(sock_no_socketpair);
3337 
3338 int sock_no_accept(struct socket *sock, struct socket *newsock,
3339 		   struct proto_accept_arg *arg)
3340 {
3341 	return -EOPNOTSUPP;
3342 }
3343 EXPORT_SYMBOL(sock_no_accept);
3344 
3345 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3346 		    int peer)
3347 {
3348 	return -EOPNOTSUPP;
3349 }
3350 EXPORT_SYMBOL(sock_no_getname);
3351 
3352 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3353 {
3354 	return -EOPNOTSUPP;
3355 }
3356 EXPORT_SYMBOL(sock_no_ioctl);
3357 
3358 int sock_no_listen(struct socket *sock, int backlog)
3359 {
3360 	return -EOPNOTSUPP;
3361 }
3362 EXPORT_SYMBOL(sock_no_listen);
3363 
3364 int sock_no_shutdown(struct socket *sock, int how)
3365 {
3366 	return -EOPNOTSUPP;
3367 }
3368 EXPORT_SYMBOL(sock_no_shutdown);
3369 
3370 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3371 {
3372 	return -EOPNOTSUPP;
3373 }
3374 EXPORT_SYMBOL(sock_no_sendmsg);
3375 
3376 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3377 {
3378 	return -EOPNOTSUPP;
3379 }
3380 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3381 
3382 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3383 		    int flags)
3384 {
3385 	return -EOPNOTSUPP;
3386 }
3387 EXPORT_SYMBOL(sock_no_recvmsg);
3388 
3389 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3390 {
3391 	/* Mirror missing mmap method error code */
3392 	return -ENODEV;
3393 }
3394 EXPORT_SYMBOL(sock_no_mmap);
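
/* How the stubs above are typically wired up: a protocol family lists them in
 * its proto_ops for the operations it does not support. A hypothetical
 * datagram family might look like (names are illustrative only):
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.listen		= sock_no_listen,
 *		.accept		= sock_no_accept,
 *		.shutdown	= sock_no_shutdown,
 *		.mmap		= sock_no_mmap,
 *		// real handlers for bind/sendmsg/recvmsg/... go here
 *	};
 */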
3395 
3396 /*
3397  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3398  * various sock-based usage counts.
3399  */
3400 void __receive_sock(struct file *file)
3401 {
3402 	struct socket *sock;
3403 
3404 	sock = sock_from_file(file);
3405 	if (sock) {
3406 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3407 		sock_update_classid(&sock->sk->sk_cgrp_data);
3408 	}
3409 }
3410 
3411 /*
3412  *	Default Socket Callbacks
3413  */
3414 
3415 static void sock_def_wakeup(struct sock *sk)
3416 {
3417 	struct socket_wq *wq;
3418 
3419 	rcu_read_lock();
3420 	wq = rcu_dereference(sk->sk_wq);
3421 	if (skwq_has_sleeper(wq))
3422 		wake_up_interruptible_all(&wq->wait);
3423 	rcu_read_unlock();
3424 }
3425 
3426 static void sock_def_error_report(struct sock *sk)
3427 {
3428 	struct socket_wq *wq;
3429 
3430 	rcu_read_lock();
3431 	wq = rcu_dereference(sk->sk_wq);
3432 	if (skwq_has_sleeper(wq))
3433 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3434 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3435 	rcu_read_unlock();
3436 }
3437 
3438 void sock_def_readable(struct sock *sk)
3439 {
3440 	struct socket_wq *wq;
3441 
3442 	trace_sk_data_ready(sk);
3443 
3444 	rcu_read_lock();
3445 	wq = rcu_dereference(sk->sk_wq);
3446 	if (skwq_has_sleeper(wq))
3447 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3448 						EPOLLRDNORM | EPOLLRDBAND);
3449 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3450 	rcu_read_unlock();
3451 }
3452 
3453 static void sock_def_write_space(struct sock *sk)
3454 {
3455 	struct socket_wq *wq;
3456 
3457 	rcu_read_lock();
3458 
3459 	/* Do not wake up a writer until he can make "significant"
3460 	 * progress.  --DaveM
3461 	 */
3462 	if (sock_writeable(sk)) {
3463 		wq = rcu_dereference(sk->sk_wq);
3464 		if (skwq_has_sleeper(wq))
3465 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3466 						EPOLLWRNORM | EPOLLWRBAND);
3467 
3468 		/* Should agree with poll, otherwise some programs break */
3469 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3470 	}
3471 
3472 	rcu_read_unlock();
3473 }
3474 
3475 /* An optimised version of sock_def_write_space(); it should only be called
3476  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3477  * ->sk_wmem_alloc.
3478  */
3479 static void sock_def_write_space_wfree(struct sock *sk)
3480 {
3481 	/* Do not wake up a writer until he can make "significant"
3482 	 * progress.  --DaveM
3483 	 */
3484 	if (sock_writeable(sk)) {
3485 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3486 
3487 		/* rely on refcount_sub from sock_wfree() */
3488 		smp_mb__after_atomic();
3489 		if (wq && waitqueue_active(&wq->wait))
3490 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3491 						EPOLLWRNORM | EPOLLWRBAND);
3492 
3493 		/* Should agree with poll, otherwise some programs break */
3494 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3495 	}
3496 }
3497 
3498 static void sock_def_destruct(struct sock *sk)
3499 {
3500 }
3501 
3502 void sk_send_sigurg(struct sock *sk)
3503 {
3504 	if (sk->sk_socket && sk->sk_socket->file)
3505 		if (send_sigurg(sk->sk_socket->file))
3506 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3507 }
3508 EXPORT_SYMBOL(sk_send_sigurg);
3509 
3510 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3511 		    unsigned long expires)
3512 {
3513 	if (!mod_timer(timer, expires))
3514 		sock_hold(sk);
3515 }
3516 EXPORT_SYMBOL(sk_reset_timer);
3517 
3518 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3519 {
3520 	if (del_timer(timer))
3521 		__sock_put(sk);
3522 }
3523 EXPORT_SYMBOL(sk_stop_timer);
3524 
3525 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3526 {
3527 	if (del_timer_sync(timer))
3528 		__sock_put(sk);
3529 }
3530 EXPORT_SYMBOL(sk_stop_timer_sync);
3531 
3532 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3533 {
3534 	sk_init_common(sk);
3535 	sk->sk_send_head	=	NULL;
3536 
3537 	timer_setup(&sk->sk_timer, NULL, 0);
3538 
3539 	sk->sk_allocation	=	GFP_KERNEL;
3540 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3541 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3542 	sk->sk_state		=	TCP_CLOSE;
3543 	sk->sk_use_task_frag	=	true;
3544 	sk_set_socket(sk, sock);
3545 
3546 	sock_set_flag(sk, SOCK_ZAPPED);
3547 
3548 	if (sock) {
3549 		sk->sk_type	=	sock->type;
3550 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3551 		sock->sk	=	sk;
3552 	} else {
3553 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3554 	}
3555 	sk->sk_uid	=	uid;
3556 
3557 	sk->sk_state_change	=	sock_def_wakeup;
3558 	sk->sk_data_ready	=	sock_def_readable;
3559 	sk->sk_write_space	=	sock_def_write_space;
3560 	sk->sk_error_report	=	sock_def_error_report;
3561 	sk->sk_destruct		=	sock_def_destruct;
3562 
3563 	sk->sk_frag.page	=	NULL;
3564 	sk->sk_frag.offset	=	0;
3565 	sk->sk_peek_off		=	-1;
3566 
3567 	sk->sk_peer_pid 	=	NULL;
3568 	sk->sk_peer_cred	=	NULL;
3569 	spin_lock_init(&sk->sk_peer_lock);
3570 
3571 	sk->sk_write_pending	=	0;
3572 	sk->sk_rcvlowat		=	1;
3573 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3574 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3575 
3576 	sk->sk_stamp = SK_DEFAULT_STAMP;
3577 #if BITS_PER_LONG==32
3578 	seqlock_init(&sk->sk_stamp_seq);
3579 #endif
3580 	atomic_set(&sk->sk_zckey, 0);
3581 
3582 #ifdef CONFIG_NET_RX_BUSY_POLL
3583 	sk->sk_napi_id		=	0;
3584 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3585 #endif
3586 
3587 	sk->sk_max_pacing_rate = ~0UL;
3588 	sk->sk_pacing_rate = ~0UL;
3589 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3590 	sk->sk_incoming_cpu = -1;
3591 
3592 	sk_rx_queue_clear(sk);
3593 	/*
3594 	 * Before updating sk_refcnt, we must commit prior changes to memory
3595 	 * (Documentation/RCU/rculist_nulls.rst for details)
3596 	 */
3597 	smp_wmb();
3598 	refcount_set(&sk->sk_refcnt, 1);
3599 	atomic_set(&sk->sk_drops, 0);
3600 }
3601 EXPORT_SYMBOL(sock_init_data_uid);
3602 
3603 void sock_init_data(struct socket *sock, struct sock *sk)
3604 {
3605 	kuid_t uid = sock ?
3606 		SOCK_INODE(sock)->i_uid :
3607 		make_kuid(sock_net(sk)->user_ns, 0);
3608 
3609 	sock_init_data_uid(sock, sk, uid);
3610 }
3611 EXPORT_SYMBOL(sock_init_data);
3612 
3613 void lock_sock_nested(struct sock *sk, int subclass)
3614 {
3615 	/* The sk_lock has mutex_lock() semantics here. */
3616 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3617 
3618 	might_sleep();
3619 	spin_lock_bh(&sk->sk_lock.slock);
3620 	if (sock_owned_by_user_nocheck(sk))
3621 		__lock_sock(sk);
3622 	sk->sk_lock.owned = 1;
3623 	spin_unlock_bh(&sk->sk_lock.slock);
3624 }
3625 EXPORT_SYMBOL(lock_sock_nested);
3626 
3627 void release_sock(struct sock *sk)
3628 {
3629 	spin_lock_bh(&sk->sk_lock.slock);
3630 	if (sk->sk_backlog.tail)
3631 		__release_sock(sk);
3632 
3633 	if (sk->sk_prot->release_cb)
3634 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3635 				     tcp_release_cb, sk);
3636 
3637 	sock_release_ownership(sk);
3638 	if (waitqueue_active(&sk->sk_lock.wq))
3639 		wake_up(&sk->sk_lock.wq);
3640 	spin_unlock_bh(&sk->sk_lock.slock);
3641 }
3642 EXPORT_SYMBOL(release_sock);
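
/* The canonical process-context locking pattern built on the two helpers
 * above (a sketch; any packets queued to the backlog while the lock was
 * owned are processed inside release_sock()):
 *
 *	lock_sock(sk);
 *	// ... modify socket state, sleeping allowed ...
 *	release_sock(sk);
 */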
3643 
3644 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3645 {
3646 	might_sleep();
3647 	spin_lock_bh(&sk->sk_lock.slock);
3648 
3649 	if (!sock_owned_by_user_nocheck(sk)) {
3650 		/*
3651 		 * Fast path return with bottom halves disabled and
3652 		 * sock::sk_lock.slock held.
3653 		 *
3654 		 * The 'mutex' is not contended and holding
3655 		 * sock::sk_lock.slock prevents all other lockers to
3656 		 * proceed so the corresponding unlock_sock_fast() can
3657 		 * avoid the slow path of release_sock() completely and
3658 		 * just release slock.
3659 		 *
3660 		 * From a semantical POV this is equivalent to 'acquiring'
3661 		 * the 'mutex', hence the corresponding lockdep
3662 		 * mutex_release() has to happen in the fast path of
3663 		 * unlock_sock_fast().
3664 		 */
3665 		return false;
3666 	}
3667 
3668 	__lock_sock(sk);
3669 	sk->sk_lock.owned = 1;
3670 	__acquire(&sk->sk_lock.slock);
3671 	spin_unlock_bh(&sk->sk_lock.slock);
3672 	return true;
3673 }
3674 EXPORT_SYMBOL(__lock_sock_fast);
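
/* Callers normally use the lock_sock_fast()/unlock_sock_fast() wrappers from
 * include/net/sock.h around short, mostly-uncontended sections; a sketch:
 *
 *	bool slow = lock_sock_fast(sk);
 *	// ... brief work on the socket ...
 *	unlock_sock_fast(sk, slow);
 */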
3675 
3676 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3677 		   bool timeval, bool time32)
3678 {
3679 	struct sock *sk = sock->sk;
3680 	struct timespec64 ts;
3681 
3682 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3683 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3684 	if (ts.tv_sec == -1)
3685 		return -ENOENT;
3686 	if (ts.tv_sec == 0) {
3687 		ktime_t kt = ktime_get_real();
3688 		sock_write_timestamp(sk, kt);
3689 		ts = ktime_to_timespec64(kt);
3690 	}
3691 
3692 	if (timeval)
3693 		ts.tv_nsec /= 1000;
3694 
3695 #ifdef CONFIG_COMPAT_32BIT_TIME
3696 	if (time32)
3697 		return put_old_timespec32(&ts, userstamp);
3698 #endif
3699 #ifdef CONFIG_SPARC64
3700 	/* beware of padding in sparc64 timeval */
3701 	if (timeval && !in_compat_syscall()) {
3702 		struct __kernel_old_timeval __user tv = {
3703 			.tv_sec = ts.tv_sec,
3704 			.tv_usec = ts.tv_nsec,
3705 		};
3706 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3707 			return -EFAULT;
3708 		return 0;
3709 	}
3710 #endif
3711 	return put_timespec64(&ts, userstamp);
3712 }
3713 EXPORT_SYMBOL(sock_gettstamp);
3714 
3715 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3716 {
3717 	if (!sock_flag(sk, flag)) {
3718 		unsigned long previous_flags = sk->sk_flags;
3719 
3720 		sock_set_flag(sk, flag);
3721 		/*
3722 		 * we just set one of the two flags which require net
3723 		 * time stamping, but time stamping might have been on
3724 		 * already because of the other one
3725 		 */
3726 		if (sock_needs_netstamp(sk) &&
3727 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3728 			net_enable_timestamp();
3729 	}
3730 }
3731 
3732 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3733 		       int level, int type)
3734 {
3735 	struct sock_exterr_skb *serr;
3736 	struct sk_buff *skb;
3737 	int copied, err;
3738 
3739 	err = -EAGAIN;
3740 	skb = sock_dequeue_err_skb(sk);
3741 	if (skb == NULL)
3742 		goto out;
3743 
3744 	copied = skb->len;
3745 	if (copied > len) {
3746 		msg->msg_flags |= MSG_TRUNC;
3747 		copied = len;
3748 	}
3749 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3750 	if (err)
3751 		goto out_free_skb;
3752 
3753 	sock_recv_timestamp(msg, sk, skb);
3754 
3755 	serr = SKB_EXT_ERR(skb);
3756 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3757 
3758 	msg->msg_flags |= MSG_ERRQUEUE;
3759 	err = copied;
3760 
3761 out_free_skb:
3762 	kfree_skb(skb);
3763 out:
3764 	return err;
3765 }
3766 EXPORT_SYMBOL(sock_recv_errqueue);
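
/* On the userspace side, the queued errors surface via MSG_ERRQUEUE; a hedged
 * sketch of draining them (the extended error arrives as a control message):
 *
 *	char cbuf[512];
 *	struct msghdr msg = { .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *
 *	if (recvmsg(fd, &msg, MSG_ERRQUEUE) >= 0) {
 *		struct cmsghdr *cm;
 *		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
 *			;	// look for the sock_extended_err payload
 *	}
 */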
3767 
3768 /*
3769  *	Get a socket option on a socket.
3770  *
3771  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3772  *	asynchronous errors should be reported by getsockopt. We assume
3773  *	this means if you specify SO_ERROR (otherwise what is the point of it).
3774  */
3775 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3776 			   char __user *optval, int __user *optlen)
3777 {
3778 	struct sock *sk = sock->sk;
3779 
3780 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3781 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3782 }
3783 EXPORT_SYMBOL(sock_common_getsockopt);
3784 
3785 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3786 			int flags)
3787 {
3788 	struct sock *sk = sock->sk;
3789 	int addr_len = 0;
3790 	int err;
3791 
3792 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3793 	if (err >= 0)
3794 		msg->msg_namelen = addr_len;
3795 	return err;
3796 }
3797 EXPORT_SYMBOL(sock_common_recvmsg);
3798 
3799 /*
3800  *	Set socket options on an inet socket.
3801  */
3802 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3803 			   sockptr_t optval, unsigned int optlen)
3804 {
3805 	struct sock *sk = sock->sk;
3806 
3807 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3808 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3809 }
3810 EXPORT_SYMBOL(sock_common_setsockopt);
3811 
3812 void sk_common_release(struct sock *sk)
3813 {
3814 	if (sk->sk_prot->destroy)
3815 		sk->sk_prot->destroy(sk);
3816 
3817 	/*
3818 	 * Observation: when sk_common_release is called, processes have
3819 	 * no access to the socket. But the network stack still does.
3820 	 * Step one, detach it from networking:
3821 	 *
3822 	 * A. Remove from hash tables.
3823 	 */
3824 
3825 	sk->sk_prot->unhash(sk);
3826 
3827 	/*
3828 	 * At this point the socket cannot receive new packets, but it is possible
3829 	 * that some packets are in flight, because some CPU was running the receiver
3830 	 * and did a hash table lookup before we unhashed the socket. They will reach
3831 	 * the receive queue and will be purged by the socket destructor.
3832 	 *
3833 	 * Also we still have packets pending on the receive queue and, probably,
3834 	 * our own packets waiting in device queues. sock_destroy will drain the
3835 	 * receive queue, but transmitted packets will delay socket destruction
3836 	 * until the last reference is released.
3837 	 */
3838 
3839 	sock_orphan(sk);
3840 
3841 	xfrm_sk_free_policy(sk);
3842 
3843 	sock_put(sk);
3844 }
3845 EXPORT_SYMBOL(sk_common_release);
3846 
3847 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3848 {
3849 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3850 
3851 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3852 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3853 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3854 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3855 	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3856 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3857 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3858 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3859 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3860 }
3861 
3862 #ifdef CONFIG_PROC_FS
3863 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3864 
3865 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3866 {
3867 	int cpu, idx = prot->inuse_idx;
3868 	int res = 0;
3869 
3870 	for_each_possible_cpu(cpu)
3871 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3872 
3873 	return res >= 0 ? res : 0;
3874 }
3875 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3876 
3877 int sock_inuse_get(struct net *net)
3878 {
3879 	int cpu, res = 0;
3880 
3881 	for_each_possible_cpu(cpu)
3882 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3883 
3884 	return res;
3885 }
3887 EXPORT_SYMBOL_GPL(sock_inuse_get);
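
/*
 * Illustrative sketch, not part of this file: the per-cpu counters read by
 * sock_prot_inuse_get()/sock_inuse_get() are bumped by protocols, typically
 * from their hash/unhash paths, via sock_prot_inuse_add():
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot,  1);	<- on hash
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	<- on unhash
 */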
3888 
3889 static int __net_init sock_inuse_init_net(struct net *net)
3890 {
3891 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3892 	if (net->core.prot_inuse == NULL)
3893 		return -ENOMEM;
3894 	return 0;
3895 }
3896 
3897 static void __net_exit sock_inuse_exit_net(struct net *net)
3898 {
3899 	free_percpu(net->core.prot_inuse);
3900 }
3901 
3902 static struct pernet_operations net_inuse_ops = {
3903 	.init = sock_inuse_init_net,
3904 	.exit = sock_inuse_exit_net,
3905 };
3906 
3907 static __init int net_inuse_init(void)
3908 {
3909 	if (register_pernet_subsys(&net_inuse_ops))
3910 		panic("Cannot initialize net inuse counters");
3911 
3912 	return 0;
3913 }
3914 
3915 core_initcall(net_inuse_init);
3916 
3917 static int assign_proto_idx(struct proto *prot)
3918 {
3919 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3920 
3921 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3922 		pr_err("PROTO_INUSE_NR exhausted\n");
3923 		return -ENOSPC;
3924 	}
3925 
3926 	set_bit(prot->inuse_idx, proto_inuse_idx);
3927 	return 0;
3928 }
3929 
3930 static void release_proto_idx(struct proto *prot)
3931 {
3932 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3933 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3934 }
3935 #else
3936 static inline int assign_proto_idx(struct proto *prot)
3937 {
3938 	return 0;
3939 }
3940 
3941 static inline void release_proto_idx(struct proto *prot)
3942 {
3943 }
3944 
3945 #endif
3946 
3947 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3948 {
3949 	if (!twsk_prot)
3950 		return;
3951 	kfree(twsk_prot->twsk_slab_name);
3952 	twsk_prot->twsk_slab_name = NULL;
3953 	kmem_cache_destroy(twsk_prot->twsk_slab);
3954 	twsk_prot->twsk_slab = NULL;
3955 }
3956 
3957 static int tw_prot_init(const struct proto *prot)
3958 {
3959 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3960 
3961 	if (!twsk_prot)
3962 		return 0;
3963 
3964 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3965 					      prot->name);
3966 	if (!twsk_prot->twsk_slab_name)
3967 		return -ENOMEM;
3968 
3969 	twsk_prot->twsk_slab =
3970 		kmem_cache_create(twsk_prot->twsk_slab_name,
3971 				  twsk_prot->twsk_obj_size, 0,
3972 				  SLAB_ACCOUNT | prot->slab_flags,
3973 				  NULL);
3974 	if (!twsk_prot->twsk_slab) {
3975 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3976 			prot->name);
3977 		return -ENOMEM;
3978 	}
3979 
3980 	return 0;
3981 }
3982 
3983 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3984 {
3985 	if (!rsk_prot)
3986 		return;
3987 	kfree(rsk_prot->slab_name);
3988 	rsk_prot->slab_name = NULL;
3989 	kmem_cache_destroy(rsk_prot->slab);
3990 	rsk_prot->slab = NULL;
3991 }
3992 
3993 static int req_prot_init(const struct proto *prot)
3994 {
3995 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3996 
3997 	if (!rsk_prot)
3998 		return 0;
3999 
4000 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4001 					prot->name);
4002 	if (!rsk_prot->slab_name)
4003 		return -ENOMEM;
4004 
4005 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4006 					   rsk_prot->obj_size, 0,
4007 					   SLAB_ACCOUNT | prot->slab_flags,
4008 					   NULL);
4009 
4010 	if (!rsk_prot->slab) {
4011 		pr_crit("%s: Can't create request sock SLAB cache!\n",
4012 			prot->name);
4013 		return -ENOMEM;
4014 	}
4015 	return 0;
4016 }
4017 
4018 int proto_register(struct proto *prot, int alloc_slab)
4019 {
4020 	int ret = -ENOBUFS;
4021 
4022 	if (prot->memory_allocated && !prot->sysctl_mem) {
4023 		pr_err("%s: missing sysctl_mem\n", prot->name);
4024 		return -EINVAL;
4025 	}
4026 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4027 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4028 		return -EINVAL;
4029 	}
4030 	if (alloc_slab) {
4031 		prot->slab = kmem_cache_create_usercopy(prot->name,
4032 					prot->obj_size, 0,
4033 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4034 					prot->slab_flags,
4035 					prot->useroffset, prot->usersize,
4036 					NULL);
4037 
4038 		if (prot->slab == NULL) {
4039 			pr_crit("%s: Can't create sock SLAB cache!\n",
4040 				prot->name);
4041 			goto out;
4042 		}
4043 
4044 		if (req_prot_init(prot))
4045 			goto out_free_request_sock_slab;
4046 
4047 		if (tw_prot_init(prot))
4048 			goto out_free_timewait_sock_slab;
4049 	}
4050 
4051 	mutex_lock(&proto_list_mutex);
4052 	ret = assign_proto_idx(prot);
4053 	if (ret) {
4054 		mutex_unlock(&proto_list_mutex);
4055 		goto out_free_timewait_sock_slab;
4056 	}
4057 	list_add(&prot->node, &proto_list);
4058 	mutex_unlock(&proto_list_mutex);
4059 	return ret;
4060 
4061 out_free_timewait_sock_slab:
4062 	if (alloc_slab)
4063 		tw_prot_cleanup(prot->twsk_prot);
4064 out_free_request_sock_slab:
4065 	if (alloc_slab) {
4066 		req_prot_cleanup(prot->rsk_prot);
4067 
4068 		kmem_cache_destroy(prot->slab);
4069 		prot->slab = NULL;
4070 	}
4071 out:
4072 	return ret;
4073 }
4074 EXPORT_SYMBOL(proto_register);
4075 
4076 void proto_unregister(struct proto *prot)
4077 {
4078 	mutex_lock(&proto_list_mutex);
4079 	release_proto_idx(prot);
4080 	list_del(&prot->node);
4081 	mutex_unlock(&proto_list_mutex);
4082 
4083 	kmem_cache_destroy(prot->slab);
4084 	prot->slab = NULL;
4085 
4086 	req_prot_cleanup(prot->rsk_prot);
4087 	tw_prot_cleanup(prot->twsk_prot);
4088 }
4089 EXPORT_SYMBOL(proto_unregister);
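
/*
 * Illustrative sketch, not part of this file: a protocol module registers
 * its struct proto at init time (passing alloc_slab = 1 so a kmem cache of
 * obj_size objects is created) and unregisters it on exit.  All "example"
 * names below are hypothetical.
 *
 *	static struct proto example_prot = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_prot, 1);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_prot);
 *	}
 */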
4090 
4091 int sock_load_diag_module(int family, int protocol)
4092 {
4093 	if (!protocol) {
4094 		if (!sock_is_registered(family))
4095 			return -ENOENT;
4096 
4097 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4098 				      NETLINK_SOCK_DIAG, family);
4099 	}
4100 
4101 #ifdef CONFIG_INET
4102 	if (family == AF_INET &&
4103 	    protocol != IPPROTO_RAW &&
4104 	    protocol < MAX_INET_PROTOS &&
4105 	    !rcu_access_pointer(inet_protos[protocol]))
4106 		return -ENOENT;
4107 #endif
4108 
4109 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4110 			      NETLINK_SOCK_DIAG, family, protocol);
4111 }
4112 EXPORT_SYMBOL(sock_load_diag_module);
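
/*
 * Worked example of the module request above (values from the standard
 * uapi headers): PF_NETLINK is 16 and NETLINK_SOCK_DIAG is 4, so
 * sock_load_diag_module(AF_INET, IPPROTO_TCP) requests
 * "net-pf-16-proto-4-type-2-6", while sock_load_diag_module(AF_UNIX, 0)
 * requests "net-pf-16-proto-4-type-1".  Diag modules are expected to
 * advertise matching MODULE_ALIAS strings so these requests resolve.
 */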
4113 
4114 #ifdef CONFIG_PROC_FS
4115 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4116 	__acquires(proto_list_mutex)
4117 {
4118 	mutex_lock(&proto_list_mutex);
4119 	return seq_list_start_head(&proto_list, *pos);
4120 }
4121 
4122 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4123 {
4124 	return seq_list_next(v, &proto_list, pos);
4125 }
4126 
4127 static void proto_seq_stop(struct seq_file *seq, void *v)
4128 	__releases(proto_list_mutex)
4129 {
4130 	mutex_unlock(&proto_list_mutex);
4131 }
4132 
4133 static char proto_method_implemented(const void *method)
4134 {
4135 	return method == NULL ? 'n' : 'y';
4136 }
4137 static long sock_prot_memory_allocated(struct proto *proto)
4138 {
4139 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4140 }
4141 
4142 static const char *sock_prot_memory_pressure(struct proto *proto)
4143 {
4144 	return proto->memory_pressure != NULL ?
4145 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4146 }
4147 
4148 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4149 {
4151 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4152 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4153 		   proto->name,
4154 		   proto->obj_size,
4155 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4156 		   sock_prot_memory_allocated(proto),
4157 		   sock_prot_memory_pressure(proto),
4158 		   proto->max_header,
4159 		   proto->slab == NULL ? "no" : "yes",
4160 		   module_name(proto->owner),
4161 		   proto_method_implemented(proto->close),
4162 		   proto_method_implemented(proto->connect),
4163 		   proto_method_implemented(proto->disconnect),
4164 		   proto_method_implemented(proto->accept),
4165 		   proto_method_implemented(proto->ioctl),
4166 		   proto_method_implemented(proto->init),
4167 		   proto_method_implemented(proto->destroy),
4168 		   proto_method_implemented(proto->shutdown),
4169 		   proto_method_implemented(proto->setsockopt),
4170 		   proto_method_implemented(proto->getsockopt),
4171 		   proto_method_implemented(proto->sendmsg),
4172 		   proto_method_implemented(proto->recvmsg),
4173 		   proto_method_implemented(proto->bind),
4174 		   proto_method_implemented(proto->backlog_rcv),
4175 		   proto_method_implemented(proto->hash),
4176 		   proto_method_implemented(proto->unhash),
4177 		   proto_method_implemented(proto->get_port),
4178 		   proto_method_implemented(proto->enter_memory_pressure));
4179 }
4180 
4181 static int proto_seq_show(struct seq_file *seq, void *v)
4182 {
4183 	if (v == &proto_list)
4184 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4185 			   "protocol",
4186 			   "size",
4187 			   "sockets",
4188 			   "memory",
4189 			   "press",
4190 			   "maxhdr",
4191 			   "slab",
4192 			   "module",
4193 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4194 	else
4195 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4196 	return 0;
4197 }
4198 
4199 static const struct seq_operations proto_seq_ops = {
4200 	.start  = proto_seq_start,
4201 	.next   = proto_seq_next,
4202 	.stop   = proto_seq_stop,
4203 	.show   = proto_seq_show,
4204 };
4205 
4206 static __net_init int proto_init_net(struct net *net)
4207 {
4208 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4209 			sizeof(struct seq_net_private)))
4210 		return -ENOMEM;
4211 
4212 	return 0;
4213 }
4214 
4215 static __net_exit void proto_exit_net(struct net *net)
4216 {
4217 	remove_proc_entry("protocols", net->proc_net);
4218 }
4219 
4221 static __net_initdata struct pernet_operations proto_net_ops = {
4222 	.init = proto_init_net,
4223 	.exit = proto_exit_net,
4224 };
4225 
4226 static int __init proto_init(void)
4227 {
4228 	return register_pernet_subsys(&proto_net_ops);
4229 }
4230 
4231 subsys_initcall(proto_init);
4232 
4233 #endif /* PROC_FS */
4234 
4235 #ifdef CONFIG_NET_RX_BUSY_POLL
4236 bool sk_busy_loop_end(void *p, unsigned long start_time)
4237 {
4238 	struct sock *sk = p;
4239 
4240 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4241 		return true;
4242 
4243 	if (sk_is_udp(sk) &&
4244 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4245 		return true;
4246 
4247 	return sk_busy_loop_timeout(sk, start_time);
4248 }
4249 EXPORT_SYMBOL(sk_busy_loop_end);
4250 #endif /* CONFIG_NET_RX_BUSY_POLL */
4251 
4252 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4253 {
4254 	if (!sk->sk_prot->bind_add)
4255 		return -EOPNOTSUPP;
4256 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4257 }
4258 EXPORT_SYMBOL(sock_bind_add);
4259 
4260 /* Copy 'size' bytes in from userspace and, on success, 'size' bytes back out. */
4261 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4262 		     void __user *arg, void *karg, size_t size)
4263 {
4264 	int ret;
4265 
4266 	if (copy_from_user(karg, arg, size))
4267 		return -EFAULT;
4268 
4269 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4270 	if (ret)
4271 		return ret;
4272 
4273 	if (copy_to_user(arg, karg, size))
4274 		return -EFAULT;
4275 
4276 	return 0;
4277 }
4278 EXPORT_SYMBOL(sock_ioctl_inout);
4279 
4280 /* This is the most common ioctl prep function: the result (4 bytes) is
4281  * copied back to userspace if the ioctl() returns successfully. No input
4282  * is copied in from userspace.
4283  */
4284 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4285 {
4286 	int ret, karg = 0;
4287 
4288 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4289 	if (ret)
4290 		return ret;
4291 
4292 	return put_user(karg, (int __user *)arg);
4293 }
4294 
4295 /* A wrapper around sock ioctls, which copies the data from userspace
4296  * (depending on the protocol/ioctl), and copies the result back to userspace.
4297  * The main motivation for this function is to pass kernel memory to the
4298  * protocol ioctl callbacks, instead of userspace memory.
4299  */
4300 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4301 {
4302 	int rc = 1;
4303 
4304 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4305 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4306 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4307 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4308 	else if (sk_is_phonet(sk))
4309 		rc = phonet_sk_ioctl(sk, cmd, arg);
4310 
4311 	/* If the ioctl was handled above, return its result */
4312 	if (rc <= 0)
4313 		return rc;
4314 
4315 	/* Otherwise call the default handler */
4316 	return sock_ioctl_out(sk, cmd, arg);
4317 }
4318 EXPORT_SYMBOL(sk_ioctl);
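
/*
 * Illustrative sketch, not part of this file: with sk_ioctl() doing the
 * user/kernel copies, a protocol's ->ioctl callback only sees kernel
 * memory.  Assuming the current "int *karg" prototype, a hypothetical
 * handler could look like:
 *
 *	static int example_ioctl(struct sock *sk, int cmd, int *karg)
 *	{
 *		switch (cmd) {
 *		case SIOCINQ:
 *			*karg = example_first_packet_length(sk);
 *			return 0;
 *		default:
 *			return -ENOIOCTLCMD;
 *		}
 *	}
 *
 * sock_ioctl_out() then copies *karg back with put_user() on success.
 */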
4319 
4320 static int __init sock_struct_check(void)
4321 {
4322 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4323 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4324 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4325 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4326 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4327 
4328 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4329 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4330 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4331 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4332 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4333 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4334 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4335 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4336 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4337 
4338 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4339 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4340 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4341 
4342 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4343 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4344 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4345 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4346 
4347 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4349 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4350 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4351 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4352 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4353 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4354 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4355 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4356 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4357 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4358 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4359 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4360 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4361 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4362 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4363 
4364 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4365 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4366 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4367 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4368 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4369 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4370 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4371 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4372 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4373 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4374 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4375 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4376 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4377 	return 0;
4378 }
4379 
4380 core_initcall(sock_struct_check);
4381