xref: /linux/net/core/sock.c (revision 6e7fd890f1d6ac83805409e9c346240de2705584)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <net/proto_memory.h>
131 #include <linux/net_tstamp.h>
132 #include <net/xfrm.h>
133 #include <linux/ipsec.h>
134 #include <net/cls_cgroup.h>
135 #include <net/netprio_cgroup.h>
136 #include <linux/sock_diag.h>
137 
138 #include <linux/filter.h>
139 #include <net/sock_reuseport.h>
140 #include <net/bpf_sk_storage.h>
141 
142 #include <trace/events/sock.h>
143 
144 #include <net/tcp.h>
145 #include <net/busy_poll.h>
146 #include <net/phonet/phonet.h>
147 
148 #include <linux/ethtool.h>
149 
150 #include "dev.h"
151 
152 static DEFINE_MUTEX(proto_list_mutex);
153 static LIST_HEAD(proto_list);
154 
155 static void sock_def_write_space_wfree(struct sock *sk);
156 static void sock_def_write_space(struct sock *sk);
157 
158 /**
159  * sk_ns_capable - General socket capability test
160  * @sk: Socket to use a capability on or through
161  * @user_ns: The user namespace of the capability to use
162  * @cap: The capability to use
163  *
164  * Test to see if the opener of the socket had when the socket was
165  * created and the current process has the capability @cap in the user
166  * namespace @user_ns.
167  */
168 bool sk_ns_capable(const struct sock *sk,
169 		   struct user_namespace *user_ns, int cap)
170 {
171 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
172 		ns_capable(user_ns, cap);
173 }
174 EXPORT_SYMBOL(sk_ns_capable);
175 
176 /**
177  * sk_capable - Socket global capability test
178  * @sk: Socket to use a capability on or through
179  * @cap: The global capability to use
180  *
181  * Test to see if the opener of the socket had the capability @cap when
182  * the socket was created and if the current process has the capability
183  * @cap in all user namespaces.
184  */
185 bool sk_capable(const struct sock *sk, int cap)
186 {
187 	return sk_ns_capable(sk, &init_user_ns, cap);
188 }
189 EXPORT_SYMBOL(sk_capable);
190 
191 /**
192  * sk_net_capable - Network namespace socket capability test
193  * @sk: Socket to use a capability on or through
194  * @cap: The capability to use
195  *
196  * Test to see if the opener of the socket had the capability @cap when the
197  * socket was created and if the current process has the capability @cap over
198  * the network namespace the socket is a member of.
199  */
200 bool sk_net_capable(const struct sock *sk, int cap)
201 {
202 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
203 }
204 EXPORT_SYMBOL(sk_net_capable);
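/*
 * Illustrative sketch (not part of the original file): a protocol handler
 * that wants to gate a privileged request on both the socket opener and the
 * current task would typically use one of the helpers above, e.g.:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * CAP_NET_ADMIN is only an example capability here.
 */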
205 
206 /*
207  * Each address family might have different locking rules, so we have
208  * one slock key per address family and separate keys for internal and
209  * userspace sockets.
210  */
211 static struct lock_class_key af_family_keys[AF_MAX];
212 static struct lock_class_key af_family_kern_keys[AF_MAX];
213 static struct lock_class_key af_family_slock_keys[AF_MAX];
214 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
215 
216 /*
217  * Make lock validator output more readable. (We pre-construct these
218  * strings at build time, so that runtime initialization of socket
219  * locks is fast.)
220  */
221 
222 #define _sock_locks(x)						  \
223   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
224   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
225   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
226   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
227   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
228   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
229   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
230   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
231   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
232   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
233   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
234   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
235   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
236   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
237   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
238   x "AF_MCTP"  , \
239   x "AF_MAX"
240 
241 static const char *const af_family_key_strings[AF_MAX+1] = {
242 	_sock_locks("sk_lock-")
243 };
244 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
245 	_sock_locks("slock-")
246 };
247 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
248 	_sock_locks("clock-")
249 };
250 
251 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
252 	_sock_locks("k-sk_lock-")
253 };
254 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
255 	_sock_locks("k-slock-")
256 };
257 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
258 	_sock_locks("k-clock-")
259 };
260 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
261 	_sock_locks("rlock-")
262 };
263 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
264 	_sock_locks("wlock-")
265 };
266 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
267 	_sock_locks("elock-")
268 };
269 
270 /*
271  * sk_callback_lock and sk queues locking rules are per-address-family,
272  * so split the lock classes by using a per-AF key:
273  */
274 static struct lock_class_key af_callback_keys[AF_MAX];
275 static struct lock_class_key af_rlock_keys[AF_MAX];
276 static struct lock_class_key af_wlock_keys[AF_MAX];
277 static struct lock_class_key af_elock_keys[AF_MAX];
278 static struct lock_class_key af_kern_callback_keys[AF_MAX];
279 
280 /* Run time adjustable parameters. */
281 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
282 EXPORT_SYMBOL(sysctl_wmem_max);
283 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
284 EXPORT_SYMBOL(sysctl_rmem_max);
285 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
286 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
287 
288 int sysctl_tstamp_allow_data __read_mostly = 1;
289 
290 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
291 EXPORT_SYMBOL_GPL(memalloc_socks_key);
292 
293 /**
294  * sk_set_memalloc - sets %SOCK_MEMALLOC
295  * @sk: socket to set it on
296  *
297  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
298  * It's the responsibility of the admin to adjust min_free_kbytes
299  * to meet the requirements.
300  */
301 void sk_set_memalloc(struct sock *sk)
302 {
303 	sock_set_flag(sk, SOCK_MEMALLOC);
304 	sk->sk_allocation |= __GFP_MEMALLOC;
305 	static_branch_inc(&memalloc_socks_key);
306 }
307 EXPORT_SYMBOL_GPL(sk_set_memalloc);
308 
309 void sk_clear_memalloc(struct sock *sk)
310 {
311 	sock_reset_flag(sk, SOCK_MEMALLOC);
312 	sk->sk_allocation &= ~__GFP_MEMALLOC;
313 	static_branch_dec(&memalloc_socks_key);
314 
315 	/*
316 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
317 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
318 	 * it has rmem allocations due to the last swapfile being deactivated
319 	 * but there is a risk that the socket is unusable due to exceeding
320 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
321 	 */
322 	sk_mem_reclaim(sk);
323 }
324 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
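/*
 * Illustrative sketch (assumed caller, not from this file): a subsystem that
 * swaps over the network would mark its transport socket so it may dip into
 * the emergency reserves, and undo that when the swap device goes away:
 *
 *	sk_set_memalloc(sock->sk);	// while the socket backs swap I/O
 *	...
 *	sk_clear_memalloc(sock->sk);	// when the last swapfile is gone
 */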
325 
326 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
327 {
328 	int ret;
329 	unsigned int noreclaim_flag;
330 
331 	/* these should have been dropped before queueing */
332 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
333 
334 	noreclaim_flag = memalloc_noreclaim_save();
335 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
336 				 tcp_v6_do_rcv,
337 				 tcp_v4_do_rcv,
338 				 sk, skb);
339 	memalloc_noreclaim_restore(noreclaim_flag);
340 
341 	return ret;
342 }
343 EXPORT_SYMBOL(__sk_backlog_rcv);
344 
345 void sk_error_report(struct sock *sk)
346 {
347 	sk->sk_error_report(sk);
348 
349 	switch (sk->sk_family) {
350 	case AF_INET:
351 		fallthrough;
352 	case AF_INET6:
353 		trace_inet_sk_error_report(sk);
354 		break;
355 	default:
356 		break;
357 	}
358 }
359 EXPORT_SYMBOL(sk_error_report);
360 
361 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
362 {
363 	struct __kernel_sock_timeval tv;
364 
365 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
366 		tv.tv_sec = 0;
367 		tv.tv_usec = 0;
368 	} else {
369 		tv.tv_sec = timeo / HZ;
370 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
371 	}
372 
373 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
374 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
375 		*(struct old_timeval32 *)optval = tv32;
376 		return sizeof(tv32);
377 	}
378 
379 	if (old_timeval) {
380 		struct __kernel_old_timeval old_tv;
381 		old_tv.tv_sec = tv.tv_sec;
382 		old_tv.tv_usec = tv.tv_usec;
383 		*(struct __kernel_old_timeval *)optval = old_tv;
384 		return sizeof(old_tv);
385 	}
386 
387 	*(struct __kernel_sock_timeval *)optval = tv;
388 	return sizeof(tv);
389 }
390 EXPORT_SYMBOL(sock_get_timeout);
391 
392 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
393 			   sockptr_t optval, int optlen, bool old_timeval)
394 {
395 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
396 		struct old_timeval32 tv32;
397 
398 		if (optlen < sizeof(tv32))
399 			return -EINVAL;
400 
401 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
402 			return -EFAULT;
403 		tv->tv_sec = tv32.tv_sec;
404 		tv->tv_usec = tv32.tv_usec;
405 	} else if (old_timeval) {
406 		struct __kernel_old_timeval old_tv;
407 
408 		if (optlen < sizeof(old_tv))
409 			return -EINVAL;
410 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
411 			return -EFAULT;
412 		tv->tv_sec = old_tv.tv_sec;
413 		tv->tv_usec = old_tv.tv_usec;
414 	} else {
415 		if (optlen < sizeof(*tv))
416 			return -EINVAL;
417 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
418 			return -EFAULT;
419 	}
420 
421 	return 0;
422 }
423 EXPORT_SYMBOL(sock_copy_user_timeval);
424 
425 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
426 			    bool old_timeval)
427 {
428 	struct __kernel_sock_timeval tv;
429 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
430 	long val;
431 
432 	if (err)
433 		return err;
434 
435 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
436 		return -EDOM;
437 
438 	if (tv.tv_sec < 0) {
439 		static int warned __read_mostly;
440 
441 		WRITE_ONCE(*timeo_p, 0);
442 		if (warned < 10 && net_ratelimit()) {
443 			warned++;
444 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
445 				__func__, current->comm, task_pid_nr(current));
446 		}
447 		return 0;
448 	}
449 	val = MAX_SCHEDULE_TIMEOUT;
450 	if ((tv.tv_sec || tv.tv_usec) &&
451 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
452 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
453 						    USEC_PER_SEC / HZ);
454 	WRITE_ONCE(*timeo_p, val);
455 	return 0;
456 }
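/*
 * Hedged userspace example (not part of this file): the jiffies value stored
 * by sock_set_timeout() is what SO_RCVTIMEO/SO_SNDTIMEO set, e.g.:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * An all-zero timeval means "wait forever" (MAX_SCHEDULE_TIMEOUT above), and
 * a tv_usec outside [0, USEC_PER_SEC) is rejected with -EDOM.
 */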
457 
458 static bool sock_needs_netstamp(const struct sock *sk)
459 {
460 	switch (sk->sk_family) {
461 	case AF_UNSPEC:
462 	case AF_UNIX:
463 		return false;
464 	default:
465 		return true;
466 	}
467 }
468 
469 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
470 {
471 	if (sk->sk_flags & flags) {
472 		sk->sk_flags &= ~flags;
473 		if (sock_needs_netstamp(sk) &&
474 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
475 			net_disable_timestamp();
476 	}
477 }
478 
479 
480 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
481 {
482 	unsigned long flags;
483 	struct sk_buff_head *list = &sk->sk_receive_queue;
484 
485 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
486 		atomic_inc(&sk->sk_drops);
487 		trace_sock_rcvqueue_full(sk, skb);
488 		return -ENOMEM;
489 	}
490 
491 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
492 		atomic_inc(&sk->sk_drops);
493 		return -ENOBUFS;
494 	}
495 
496 	skb->dev = NULL;
497 	skb_set_owner_r(skb, sk);
498 
499 	/* We escape from the RCU-protected region; make sure we don't leak
500 	 * a non-refcounted dst.
501 	 */
502 	skb_dst_force(skb);
503 
504 	spin_lock_irqsave(&list->lock, flags);
505 	sock_skb_set_dropcount(sk, skb);
506 	__skb_queue_tail(list, skb);
507 	spin_unlock_irqrestore(&list->lock, flags);
508 
509 	if (!sock_flag(sk, SOCK_DEAD))
510 		sk->sk_data_ready(sk);
511 	return 0;
512 }
513 EXPORT_SYMBOL(__sock_queue_rcv_skb);
514 
515 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
516 			      enum skb_drop_reason *reason)
517 {
518 	enum skb_drop_reason drop_reason;
519 	int err;
520 
521 	err = sk_filter(sk, skb);
522 	if (err) {
523 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
524 		goto out;
525 	}
526 	err = __sock_queue_rcv_skb(sk, skb);
527 	switch (err) {
528 	case -ENOMEM:
529 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
530 		break;
531 	case -ENOBUFS:
532 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
533 		break;
534 	default:
535 		drop_reason = SKB_NOT_DROPPED_YET;
536 		break;
537 	}
538 out:
539 	if (reason)
540 		*reason = drop_reason;
541 	return err;
542 }
543 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
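/*
 * Illustrative sketch (assumption about a typical caller, not from this
 * file): protocols that deliver to a socket's receive queue usually go
 * through the sock_queue_rcv_skb() wrapper and free the skb themselves on
 * failure, roughly:
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		return NET_RX_DROP;
 *	}
 */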
544 
545 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
546 		     const int nested, unsigned int trim_cap, bool refcounted)
547 {
548 	int rc = NET_RX_SUCCESS;
549 
550 	if (sk_filter_trim_cap(sk, skb, trim_cap))
551 		goto discard_and_relse;
552 
553 	skb->dev = NULL;
554 
555 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
556 		atomic_inc(&sk->sk_drops);
557 		goto discard_and_relse;
558 	}
559 	if (nested)
560 		bh_lock_sock_nested(sk);
561 	else
562 		bh_lock_sock(sk);
563 	if (!sock_owned_by_user(sk)) {
564 		/*
565 		 * trylock + unlock semantics:
566 		 */
567 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
568 
569 		rc = sk_backlog_rcv(sk, skb);
570 
571 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
572 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
573 		bh_unlock_sock(sk);
574 		atomic_inc(&sk->sk_drops);
575 		goto discard_and_relse;
576 	}
577 
578 	bh_unlock_sock(sk);
579 out:
580 	if (refcounted)
581 		sock_put(sk);
582 	return rc;
583 discard_and_relse:
584 	kfree_skb(skb);
585 	goto out;
586 }
587 EXPORT_SYMBOL(__sk_receive_skb);
588 
589 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
590 							  u32));
591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
592 							   u32));
593 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
594 {
595 	struct dst_entry *dst = __sk_dst_get(sk);
596 
597 	if (dst && dst->obsolete &&
598 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
599 			       dst, cookie) == NULL) {
600 		sk_tx_queue_clear(sk);
601 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
602 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
603 		dst_release(dst);
604 		return NULL;
605 	}
606 
607 	return dst;
608 }
609 EXPORT_SYMBOL(__sk_dst_check);
610 
611 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
612 {
613 	struct dst_entry *dst = sk_dst_get(sk);
614 
615 	if (dst && dst->obsolete &&
616 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
617 			       dst, cookie) == NULL) {
618 		sk_dst_reset(sk);
619 		dst_release(dst);
620 		return NULL;
621 	}
622 
623 	return dst;
624 }
625 EXPORT_SYMBOL(sk_dst_check);
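/*
 * Hedged example (sketch of a typical transmit path, not from this file):
 * callers revalidate the cached route before using it and perform a fresh
 * lookup when the check fails, roughly:
 *
 *	rt = (struct rtable *)__sk_dst_check(sk, 0);
 *	if (!rt)
 *		... do a fresh route lookup and sk_dst_set() ...
 *
 * The cookie argument (0 for IPv4 here) lets IPv6 detect stale routes.
 */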
626 
627 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
628 {
629 	int ret = -ENOPROTOOPT;
630 #ifdef CONFIG_NETDEVICES
631 	struct net *net = sock_net(sk);
632 
633 	/* Sorry... */
634 	ret = -EPERM;
635 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
636 		goto out;
637 
638 	ret = -EINVAL;
639 	if (ifindex < 0)
640 		goto out;
641 
642 	/* Paired with all READ_ONCE() done locklessly. */
643 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
644 
645 	if (sk->sk_prot->rehash)
646 		sk->sk_prot->rehash(sk);
647 	sk_dst_reset(sk);
648 
649 	ret = 0;
650 
651 out:
652 #endif
653 
654 	return ret;
655 }
656 
657 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
658 {
659 	int ret;
660 
661 	if (lock_sk)
662 		lock_sock(sk);
663 	ret = sock_bindtoindex_locked(sk, ifindex);
664 	if (lock_sk)
665 		release_sock(sk);
666 
667 	return ret;
668 }
669 EXPORT_SYMBOL(sock_bindtoindex);
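/*
 * Illustrative sketch (assumed in-kernel caller, not from this file): kernel
 * users that already know the interface index, such as tunnel setup code,
 * can bind a socket directly and let the helper take the socket lock
 * (ifindex and err_close are placeholders):
 *
 *	err = sock_bindtoindex(sock->sk, ifindex, true);
 *	if (err)
 *		goto err_close;
 */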
670 
671 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
672 {
673 	int ret = -ENOPROTOOPT;
674 #ifdef CONFIG_NETDEVICES
675 	struct net *net = sock_net(sk);
676 	char devname[IFNAMSIZ];
677 	int index;
678 
679 	ret = -EINVAL;
680 	if (optlen < 0)
681 		goto out;
682 
683 	/* Bind this socket to a particular device like "eth0",
684 	 * as specified in the passed interface name. If the
685 	 * name is "" or the option length is zero the socket
686 	 * is not bound.
687 	 */
688 	if (optlen > IFNAMSIZ - 1)
689 		optlen = IFNAMSIZ - 1;
690 	memset(devname, 0, sizeof(devname));
691 
692 	ret = -EFAULT;
693 	if (copy_from_sockptr(devname, optval, optlen))
694 		goto out;
695 
696 	index = 0;
697 	if (devname[0] != '\0') {
698 		struct net_device *dev;
699 
700 		rcu_read_lock();
701 		dev = dev_get_by_name_rcu(net, devname);
702 		if (dev)
703 			index = dev->ifindex;
704 		rcu_read_unlock();
705 		ret = -ENODEV;
706 		if (!dev)
707 			goto out;
708 	}
709 
710 	sockopt_lock_sock(sk);
711 	ret = sock_bindtoindex_locked(sk, index);
712 	sockopt_release_sock(sk);
713 out:
714 #endif
715 
716 	return ret;
717 }
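/*
 * Hedged userspace example (not part of this file): the handler above backs
 * SO_BINDTODEVICE, which takes an interface name rather than an index:
 *
 *	static const char ifname[] = "eth0";
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, sizeof(ifname));
 *
 * Passing an empty name (or a zero option length) unbinds the socket again;
 * "eth0" is just a placeholder device name.
 */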
718 
719 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
720 				sockptr_t optlen, int len)
721 {
722 	int ret = -ENOPROTOOPT;
723 #ifdef CONFIG_NETDEVICES
724 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
725 	struct net *net = sock_net(sk);
726 	char devname[IFNAMSIZ];
727 
728 	if (bound_dev_if == 0) {
729 		len = 0;
730 		goto zero;
731 	}
732 
733 	ret = -EINVAL;
734 	if (len < IFNAMSIZ)
735 		goto out;
736 
737 	ret = netdev_get_name(net, devname, bound_dev_if);
738 	if (ret)
739 		goto out;
740 
741 	len = strlen(devname) + 1;
742 
743 	ret = -EFAULT;
744 	if (copy_to_sockptr(optval, devname, len))
745 		goto out;
746 
747 zero:
748 	ret = -EFAULT;
749 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
750 		goto out;
751 
752 	ret = 0;
753 
754 out:
755 #endif
756 
757 	return ret;
758 }
759 
760 bool sk_mc_loop(const struct sock *sk)
761 {
762 	if (dev_recursion_level())
763 		return false;
764 	if (!sk)
765 		return true;
766 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
767 	switch (READ_ONCE(sk->sk_family)) {
768 	case AF_INET:
769 		return inet_test_bit(MC_LOOP, sk);
770 #if IS_ENABLED(CONFIG_IPV6)
771 	case AF_INET6:
772 		return inet6_test_bit(MC6_LOOP, sk);
773 #endif
774 	}
775 	WARN_ON_ONCE(1);
776 	return true;
777 }
778 EXPORT_SYMBOL(sk_mc_loop);
779 
780 void sock_set_reuseaddr(struct sock *sk)
781 {
782 	lock_sock(sk);
783 	sk->sk_reuse = SK_CAN_REUSE;
784 	release_sock(sk);
785 }
786 EXPORT_SYMBOL(sock_set_reuseaddr);
787 
788 void sock_set_reuseport(struct sock *sk)
789 {
790 	lock_sock(sk);
791 	sk->sk_reuseport = true;
792 	release_sock(sk);
793 }
794 EXPORT_SYMBOL(sock_set_reuseport);
795 
796 void sock_no_linger(struct sock *sk)
797 {
798 	lock_sock(sk);
799 	WRITE_ONCE(sk->sk_lingertime, 0);
800 	sock_set_flag(sk, SOCK_LINGER);
801 	release_sock(sk);
802 }
803 EXPORT_SYMBOL(sock_no_linger);
804 
805 void sock_set_priority(struct sock *sk, u32 priority)
806 {
807 	WRITE_ONCE(sk->sk_priority, priority);
808 }
809 EXPORT_SYMBOL(sock_set_priority);
810 
811 void sock_set_sndtimeo(struct sock *sk, s64 secs)
812 {
813 	lock_sock(sk);
814 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
815 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
816 	else
817 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
818 	release_sock(sk);
819 }
820 EXPORT_SYMBOL(sock_set_sndtimeo);
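/*
 * Illustrative sketch (assumption, not from this file): the sock_set_*()
 * helpers above exist so in-kernel socket users can tweak options without
 * going through setsockopt(), e.g. when setting up a transport socket:
 *
 *	sock_set_reuseaddr(sock->sk);
 *	sock_no_linger(sock->sk);
 *	sock_set_sndtimeo(sock->sk, 30);	// 30 seconds
 */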
821 
822 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
823 {
824 	if (val)  {
825 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
826 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
827 		sock_set_flag(sk, SOCK_RCVTSTAMP);
828 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
829 	} else {
830 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
831 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
832 	}
833 }
834 
835 void sock_enable_timestamps(struct sock *sk)
836 {
837 	lock_sock(sk);
838 	__sock_set_timestamps(sk, true, false, true);
839 	release_sock(sk);
840 }
841 EXPORT_SYMBOL(sock_enable_timestamps);
842 
843 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
844 {
845 	switch (optname) {
846 	case SO_TIMESTAMP_OLD:
847 		__sock_set_timestamps(sk, valbool, false, false);
848 		break;
849 	case SO_TIMESTAMP_NEW:
850 		__sock_set_timestamps(sk, valbool, true, false);
851 		break;
852 	case SO_TIMESTAMPNS_OLD:
853 		__sock_set_timestamps(sk, valbool, false, true);
854 		break;
855 	case SO_TIMESTAMPNS_NEW:
856 		__sock_set_timestamps(sk, valbool, true, true);
857 		break;
858 	}
859 }
860 
861 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
862 {
863 	struct net *net = sock_net(sk);
864 	struct net_device *dev = NULL;
865 	bool match = false;
866 	int *vclock_index;
867 	int i, num;
868 
869 	if (sk->sk_bound_dev_if)
870 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
871 
872 	if (!dev) {
873 		pr_err("%s: socket not bound to a device\n", __func__);
874 		return -EOPNOTSUPP;
875 	}
876 
877 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
878 	dev_put(dev);
879 
880 	for (i = 0; i < num; i++) {
881 		if (*(vclock_index + i) == phc_index) {
882 			match = true;
883 			break;
884 		}
885 	}
886 
887 	if (num > 0)
888 		kfree(vclock_index);
889 
890 	if (!match)
891 		return -EINVAL;
892 
893 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
894 
895 	return 0;
896 }
897 
898 int sock_set_timestamping(struct sock *sk, int optname,
899 			  struct so_timestamping timestamping)
900 {
901 	int val = timestamping.flags;
902 	int ret;
903 
904 	if (val & ~SOF_TIMESTAMPING_MASK)
905 		return -EINVAL;
906 
907 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
908 	    !(val & SOF_TIMESTAMPING_OPT_ID))
909 		return -EINVAL;
910 
911 	if (val & SOF_TIMESTAMPING_OPT_ID &&
912 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
913 		if (sk_is_tcp(sk)) {
914 			if ((1 << sk->sk_state) &
915 			    (TCPF_CLOSE | TCPF_LISTEN))
916 				return -EINVAL;
917 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
918 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
919 			else
920 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
921 		} else {
922 			atomic_set(&sk->sk_tskey, 0);
923 		}
924 	}
925 
926 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
927 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
928 		return -EINVAL;
929 
930 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
931 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
932 		if (ret)
933 			return ret;
934 	}
935 
936 	WRITE_ONCE(sk->sk_tsflags, val);
937 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
938 
939 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
940 		sock_enable_timestamp(sk,
941 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
942 	else
943 		sock_disable_timestamp(sk,
944 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
945 	return 0;
946 }
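/*
 * Hedged userspace example (not from this file): SO_TIMESTAMPING can be set
 * with either a plain int of SOF_* flags or the extended structure handled
 * above; the flag combination below is only an illustration:
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_SOFTWARE |
 *			 SOF_TIMESTAMPING_RX_SOFTWARE |
 *			 SOF_TIMESTAMPING_SOFTWARE,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 *
 * SOF_TIMESTAMPING_BIND_PHC additionally requires .bind_phc to name a PHC
 * vclock of the device the socket is bound to.
 */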
947 
948 void sock_set_keepalive(struct sock *sk)
949 {
950 	lock_sock(sk);
951 	if (sk->sk_prot->keepalive)
952 		sk->sk_prot->keepalive(sk, true);
953 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
954 	release_sock(sk);
955 }
956 EXPORT_SYMBOL(sock_set_keepalive);
957 
958 static void __sock_set_rcvbuf(struct sock *sk, int val)
959 {
960 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
961 	 * as a negative value.
962 	 */
963 	val = min_t(int, val, INT_MAX / 2);
964 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
965 
966 	/* We double it on the way in to account for "struct sk_buff" etc.
967 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
968 	 * will allow that much actual data to be received on that socket.
969 	 *
970 	 * Applications are unaware that "struct sk_buff" and other overheads
971 	 * allocate from the receive buffer during socket buffer allocation.
972 	 *
973 	 * And after considering the possible alternatives, returning the value
974 	 * we actually used in getsockopt is the most desirable behavior.
975 	 */
976 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
977 }
978 
979 void sock_set_rcvbuf(struct sock *sk, int val)
980 {
981 	lock_sock(sk);
982 	__sock_set_rcvbuf(sk, val);
983 	release_sock(sk);
984 }
985 EXPORT_SYMBOL(sock_set_rcvbuf);
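/*
 * Hedged userspace example (not from this file) of the doubling described in
 * __sock_set_rcvbuf() above:
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *	// val now typically reads back as 131072 (2 * 65536)
 */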
986 
987 static void __sock_set_mark(struct sock *sk, u32 val)
988 {
989 	if (val != sk->sk_mark) {
990 		WRITE_ONCE(sk->sk_mark, val);
991 		sk_dst_reset(sk);
992 	}
993 }
994 
995 void sock_set_mark(struct sock *sk, u32 val)
996 {
997 	lock_sock(sk);
998 	__sock_set_mark(sk, val);
999 	release_sock(sk);
1000 }
1001 EXPORT_SYMBOL(sock_set_mark);
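/*
 * Hedged userspace example (not from this file): SO_MARK is the externally
 * visible way to set sk_mark and requires CAP_NET_ADMIN or CAP_NET_RAW in
 * the socket's network namespace; the mark value is arbitrary:
 *
 *	unsigned int mark = 0x2a;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark));
 */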
1002 
1003 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1004 {
1005 	/* Round down bytes to multiple of pages */
1006 	bytes = round_down(bytes, PAGE_SIZE);
1007 
1008 	WARN_ON(bytes > sk->sk_reserved_mem);
1009 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1010 	sk_mem_reclaim(sk);
1011 }
1012 
1013 static int sock_reserve_memory(struct sock *sk, int bytes)
1014 {
1015 	long allocated;
1016 	bool charged;
1017 	int pages;
1018 
1019 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1020 		return -EOPNOTSUPP;
1021 
1022 	if (!bytes)
1023 		return 0;
1024 
1025 	pages = sk_mem_pages(bytes);
1026 
1027 	/* pre-charge to memcg */
1028 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1029 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1030 	if (!charged)
1031 		return -ENOMEM;
1032 
1033 	/* pre-charge to forward_alloc */
1034 	sk_memory_allocated_add(sk, pages);
1035 	allocated = sk_memory_allocated(sk);
1036 	/* If the system goes into memory pressure with this
1037 	 * precharge, give up and return an error.
1038 	 */
1039 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1040 		sk_memory_allocated_sub(sk, pages);
1041 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1042 		return -ENOMEM;
1043 	}
1044 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1045 
1046 	WRITE_ONCE(sk->sk_reserved_mem,
1047 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1048 
1049 	return 0;
1050 }
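/*
 * Hedged userspace example (not from this file): SO_RESERVE_MEM, handled in
 * sk_setsockopt() below, pre-charges roughly this many bytes (rounded to
 * pages) so later allocations do not hit the memcg/protocol limits; it only
 * works on memcg-accounted sockets:
 *
 *	int bytes = 1 << 20;	// reserve about 1 MiB
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RESERVE_MEM, &bytes, sizeof(bytes));
 */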
1051 
1052 void sockopt_lock_sock(struct sock *sk)
1053 {
1054 	/* When current->bpf_ctx is set, setsockopt() is being called from
1055 	 * a BPF program. BPF has ensured that the sk lock has been
1056 	 * acquired before calling setsockopt().
1057 	 */
1058 	if (has_current_bpf_ctx())
1059 		return;
1060 
1061 	lock_sock(sk);
1062 }
1063 EXPORT_SYMBOL(sockopt_lock_sock);
1064 
1065 void sockopt_release_sock(struct sock *sk)
1066 {
1067 	if (has_current_bpf_ctx())
1068 		return;
1069 
1070 	release_sock(sk);
1071 }
1072 EXPORT_SYMBOL(sockopt_release_sock);
1073 
1074 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1075 {
1076 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1077 }
1078 EXPORT_SYMBOL(sockopt_ns_capable);
1079 
1080 bool sockopt_capable(int cap)
1081 {
1082 	return has_current_bpf_ctx() || capable(cap);
1083 }
1084 EXPORT_SYMBOL(sockopt_capable);
1085 
1086 static int sockopt_validate_clockid(__kernel_clockid_t value)
1087 {
1088 	switch (value) {
1089 	case CLOCK_REALTIME:
1090 	case CLOCK_MONOTONIC:
1091 	case CLOCK_TAI:
1092 		return 0;
1093 	}
1094 	return -EINVAL;
1095 }
1096 
1097 /*
1098  *	This is meant for all protocols to use and covers goings on
1099  *	at the socket level. Everything here is generic.
1100  */
1101 
1102 int sk_setsockopt(struct sock *sk, int level, int optname,
1103 		  sockptr_t optval, unsigned int optlen)
1104 {
1105 	struct so_timestamping timestamping;
1106 	struct socket *sock = sk->sk_socket;
1107 	struct sock_txtime sk_txtime;
1108 	int val;
1109 	int valbool;
1110 	struct linger ling;
1111 	int ret = 0;
1112 
1113 	/*
1114 	 *	Options without arguments
1115 	 */
1116 
1117 	if (optname == SO_BINDTODEVICE)
1118 		return sock_setbindtodevice(sk, optval, optlen);
1119 
1120 	if (optlen < sizeof(int))
1121 		return -EINVAL;
1122 
1123 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1124 		return -EFAULT;
1125 
1126 	valbool = val ? 1 : 0;
1127 
1128 	/* handle options which do not require locking the socket. */
1129 	switch (optname) {
1130 	case SO_PRIORITY:
1131 		if ((val >= 0 && val <= 6) ||
1132 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1133 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1134 			sock_set_priority(sk, val);
1135 			return 0;
1136 		}
1137 		return -EPERM;
1138 	case SO_PASSSEC:
1139 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1140 		return 0;
1141 	case SO_PASSCRED:
1142 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1143 		return 0;
1144 	case SO_PASSPIDFD:
1145 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1146 		return 0;
1147 	case SO_TYPE:
1148 	case SO_PROTOCOL:
1149 	case SO_DOMAIN:
1150 	case SO_ERROR:
1151 		return -ENOPROTOOPT;
1152 #ifdef CONFIG_NET_RX_BUSY_POLL
1153 	case SO_BUSY_POLL:
1154 		if (val < 0)
1155 			return -EINVAL;
1156 		WRITE_ONCE(sk->sk_ll_usec, val);
1157 		return 0;
1158 	case SO_PREFER_BUSY_POLL:
1159 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1160 			return -EPERM;
1161 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1162 		return 0;
1163 	case SO_BUSY_POLL_BUDGET:
1164 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1165 		    !sockopt_capable(CAP_NET_ADMIN))
1166 			return -EPERM;
1167 		if (val < 0 || val > U16_MAX)
1168 			return -EINVAL;
1169 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1170 		return 0;
1171 #endif
1172 	case SO_MAX_PACING_RATE:
1173 		{
1174 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1175 		unsigned long pacing_rate;
1176 
1177 		if (sizeof(ulval) != sizeof(val) &&
1178 		    optlen >= sizeof(ulval) &&
1179 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1180 			return -EFAULT;
1181 		}
1182 		if (ulval != ~0UL)
1183 			cmpxchg(&sk->sk_pacing_status,
1184 				SK_PACING_NONE,
1185 				SK_PACING_NEEDED);
1186 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1187 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1188 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1189 		if (ulval < pacing_rate)
1190 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1191 		return 0;
1192 		}
1193 	case SO_TXREHASH:
1194 		if (val < -1 || val > 1)
1195 			return -EINVAL;
1196 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1197 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1198 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1199 		 * and sk_getsockopt().
1200 		 */
1201 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1202 		return 0;
1203 	case SO_PEEK_OFF:
1204 		{
1205 		int (*set_peek_off)(struct sock *sk, int val);
1206 
1207 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1208 		if (set_peek_off)
1209 			ret = set_peek_off(sk, val);
1210 		else
1211 			ret = -EOPNOTSUPP;
1212 		return ret;
1213 		}
1214 	}
1215 
1216 	sockopt_lock_sock(sk);
1217 
1218 	switch (optname) {
1219 	case SO_DEBUG:
1220 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1221 			ret = -EACCES;
1222 		else
1223 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1224 		break;
1225 	case SO_REUSEADDR:
1226 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1227 		break;
1228 	case SO_REUSEPORT:
1229 		sk->sk_reuseport = valbool;
1230 		break;
1231 	case SO_DONTROUTE:
1232 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1233 		sk_dst_reset(sk);
1234 		break;
1235 	case SO_BROADCAST:
1236 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1237 		break;
1238 	case SO_SNDBUF:
1239 		/* Don't return an error on this; BSD doesn't, and if you
1240 		 * think about it, this is right. Otherwise apps have to
1241 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1242 		 * are treated in BSD as hints.
1243 		 */
1244 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1245 set_sndbuf:
1246 		/* Ensure val * 2 fits into an int, to prevent max_t()
1247 		 * from treating it as a negative value.
1248 		 */
1249 		val = min_t(int, val, INT_MAX / 2);
1250 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1251 		WRITE_ONCE(sk->sk_sndbuf,
1252 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1253 		/* Wake up sending tasks if we upped the value. */
1254 		sk->sk_write_space(sk);
1255 		break;
1256 
1257 	case SO_SNDBUFFORCE:
1258 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1259 			ret = -EPERM;
1260 			break;
1261 		}
1262 
1263 		/* No negative values (to prevent underflow, as val will be
1264 		 * multiplied by 2).
1265 		 */
1266 		if (val < 0)
1267 			val = 0;
1268 		goto set_sndbuf;
1269 
1270 	case SO_RCVBUF:
1271 		/* Don't return an error on this; BSD doesn't, and if you
1272 		 * think about it, this is right. Otherwise apps have to
1273 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1274 		 * are treated in BSD as hints.
1275 		 */
1276 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1277 		break;
1278 
1279 	case SO_RCVBUFFORCE:
1280 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1281 			ret = -EPERM;
1282 			break;
1283 		}
1284 
1285 		/* No negative values (to prevent underflow, as val will be
1286 		 * multiplied by 2).
1287 		 */
1288 		__sock_set_rcvbuf(sk, max(val, 0));
1289 		break;
1290 
1291 	case SO_KEEPALIVE:
1292 		if (sk->sk_prot->keepalive)
1293 			sk->sk_prot->keepalive(sk, valbool);
1294 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1295 		break;
1296 
1297 	case SO_OOBINLINE:
1298 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1299 		break;
1300 
1301 	case SO_NO_CHECK:
1302 		sk->sk_no_check_tx = valbool;
1303 		break;
1304 
1305 	case SO_LINGER:
1306 		if (optlen < sizeof(ling)) {
1307 			ret = -EINVAL;	/* 1003.1g */
1308 			break;
1309 		}
1310 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1311 			ret = -EFAULT;
1312 			break;
1313 		}
1314 		if (!ling.l_onoff) {
1315 			sock_reset_flag(sk, SOCK_LINGER);
1316 		} else {
1317 			unsigned long t_sec = ling.l_linger;
1318 
1319 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1320 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1321 			else
1322 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1323 			sock_set_flag(sk, SOCK_LINGER);
1324 		}
1325 		break;
1326 
1327 	case SO_BSDCOMPAT:
1328 		break;
1329 
1330 	case SO_TIMESTAMP_OLD:
1331 	case SO_TIMESTAMP_NEW:
1332 	case SO_TIMESTAMPNS_OLD:
1333 	case SO_TIMESTAMPNS_NEW:
1334 		sock_set_timestamp(sk, optname, valbool);
1335 		break;
1336 
1337 	case SO_TIMESTAMPING_NEW:
1338 	case SO_TIMESTAMPING_OLD:
1339 		if (optlen == sizeof(timestamping)) {
1340 			if (copy_from_sockptr(&timestamping, optval,
1341 					      sizeof(timestamping))) {
1342 				ret = -EFAULT;
1343 				break;
1344 			}
1345 		} else {
1346 			memset(&timestamping, 0, sizeof(timestamping));
1347 			timestamping.flags = val;
1348 		}
1349 		ret = sock_set_timestamping(sk, optname, timestamping);
1350 		break;
1351 
1352 	case SO_RCVLOWAT:
1353 		{
1354 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1355 
1356 		if (val < 0)
1357 			val = INT_MAX;
1358 		if (sock)
1359 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1360 		if (set_rcvlowat)
1361 			ret = set_rcvlowat(sk, val);
1362 		else
1363 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1364 		break;
1365 		}
1366 	case SO_RCVTIMEO_OLD:
1367 	case SO_RCVTIMEO_NEW:
1368 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1369 				       optlen, optname == SO_RCVTIMEO_OLD);
1370 		break;
1371 
1372 	case SO_SNDTIMEO_OLD:
1373 	case SO_SNDTIMEO_NEW:
1374 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1375 				       optlen, optname == SO_SNDTIMEO_OLD);
1376 		break;
1377 
1378 	case SO_ATTACH_FILTER: {
1379 		struct sock_fprog fprog;
1380 
1381 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1382 		if (!ret)
1383 			ret = sk_attach_filter(&fprog, sk);
1384 		break;
1385 	}
1386 	case SO_ATTACH_BPF:
1387 		ret = -EINVAL;
1388 		if (optlen == sizeof(u32)) {
1389 			u32 ufd;
1390 
1391 			ret = -EFAULT;
1392 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1393 				break;
1394 
1395 			ret = sk_attach_bpf(ufd, sk);
1396 		}
1397 		break;
1398 
1399 	case SO_ATTACH_REUSEPORT_CBPF: {
1400 		struct sock_fprog fprog;
1401 
1402 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1403 		if (!ret)
1404 			ret = sk_reuseport_attach_filter(&fprog, sk);
1405 		break;
1406 	}
1407 	case SO_ATTACH_REUSEPORT_EBPF:
1408 		ret = -EINVAL;
1409 		if (optlen == sizeof(u32)) {
1410 			u32 ufd;
1411 
1412 			ret = -EFAULT;
1413 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1414 				break;
1415 
1416 			ret = sk_reuseport_attach_bpf(ufd, sk);
1417 		}
1418 		break;
1419 
1420 	case SO_DETACH_REUSEPORT_BPF:
1421 		ret = reuseport_detach_prog(sk);
1422 		break;
1423 
1424 	case SO_DETACH_FILTER:
1425 		ret = sk_detach_filter(sk);
1426 		break;
1427 
1428 	case SO_LOCK_FILTER:
1429 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1430 			ret = -EPERM;
1431 		else
1432 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1433 		break;
1434 
1435 	case SO_MARK:
1436 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1437 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1438 			ret = -EPERM;
1439 			break;
1440 		}
1441 
1442 		__sock_set_mark(sk, val);
1443 		break;
1444 	case SO_RCVMARK:
1445 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1446 		break;
1447 
1448 	case SO_RXQ_OVFL:
1449 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1450 		break;
1451 
1452 	case SO_WIFI_STATUS:
1453 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1454 		break;
1455 
1456 	case SO_NOFCS:
1457 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1458 		break;
1459 
1460 	case SO_SELECT_ERR_QUEUE:
1461 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1462 		break;
1463 
1464 
1465 	case SO_INCOMING_CPU:
1466 		reuseport_update_incoming_cpu(sk, val);
1467 		break;
1468 
1469 	case SO_CNX_ADVICE:
1470 		if (val == 1)
1471 			dst_negative_advice(sk);
1472 		break;
1473 
1474 	case SO_ZEROCOPY:
1475 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1476 			if (!(sk_is_tcp(sk) ||
1477 			      (sk->sk_type == SOCK_DGRAM &&
1478 			       sk->sk_protocol == IPPROTO_UDP)))
1479 				ret = -EOPNOTSUPP;
1480 		} else if (sk->sk_family != PF_RDS) {
1481 			ret = -EOPNOTSUPP;
1482 		}
1483 		if (!ret) {
1484 			if (val < 0 || val > 1)
1485 				ret = -EINVAL;
1486 			else
1487 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1488 		}
1489 		break;
1490 
1491 	case SO_TXTIME:
1492 		if (optlen != sizeof(struct sock_txtime)) {
1493 			ret = -EINVAL;
1494 			break;
1495 		} else if (copy_from_sockptr(&sk_txtime, optval,
1496 			   sizeof(struct sock_txtime))) {
1497 			ret = -EFAULT;
1498 			break;
1499 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1500 			ret = -EINVAL;
1501 			break;
1502 		}
1503 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1504 		 * scheduler has enough safeguards.
1505 		 */
1506 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1507 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1508 			ret = -EPERM;
1509 			break;
1510 		}
1511 
1512 		ret = sockopt_validate_clockid(sk_txtime.clockid);
1513 		if (ret)
1514 			break;
1515 
1516 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1517 		sk->sk_clockid = sk_txtime.clockid;
1518 		sk->sk_txtime_deadline_mode =
1519 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1520 		sk->sk_txtime_report_errors =
1521 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1522 		break;
1523 
1524 	case SO_BINDTOIFINDEX:
1525 		ret = sock_bindtoindex_locked(sk, val);
1526 		break;
1527 
1528 	case SO_BUF_LOCK:
1529 		if (val & ~SOCK_BUF_LOCK_MASK) {
1530 			ret = -EINVAL;
1531 			break;
1532 		}
1533 		sk->sk_userlocks = val | (sk->sk_userlocks &
1534 					  ~SOCK_BUF_LOCK_MASK);
1535 		break;
1536 
1537 	case SO_RESERVE_MEM:
1538 	{
1539 		int delta;
1540 
1541 		if (val < 0) {
1542 			ret = -EINVAL;
1543 			break;
1544 		}
1545 
1546 		delta = val - sk->sk_reserved_mem;
1547 		if (delta < 0)
1548 			sock_release_reserved_memory(sk, -delta);
1549 		else
1550 			ret = sock_reserve_memory(sk, delta);
1551 		break;
1552 	}
1553 
1554 	default:
1555 		ret = -ENOPROTOOPT;
1556 		break;
1557 	}
1558 	sockopt_release_sock(sk);
1559 	return ret;
1560 }
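/*
 * Hedged userspace example (not from this file): most of the options handled
 * above are plain ints, but a few take structures, e.g. SO_LINGER:
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 5 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *
 * With l_linger >= MAX_SCHEDULE_TIMEOUT / HZ the kernel clamps the linger
 * time to "forever", as handled in the SO_LINGER case above.
 */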
1561 
1562 int sock_setsockopt(struct socket *sock, int level, int optname,
1563 		    sockptr_t optval, unsigned int optlen)
1564 {
1565 	return sk_setsockopt(sock->sk, level, optname,
1566 			     optval, optlen);
1567 }
1568 EXPORT_SYMBOL(sock_setsockopt);
1569 
1570 static const struct cred *sk_get_peer_cred(struct sock *sk)
1571 {
1572 	const struct cred *cred;
1573 
1574 	spin_lock(&sk->sk_peer_lock);
1575 	cred = get_cred(sk->sk_peer_cred);
1576 	spin_unlock(&sk->sk_peer_lock);
1577 
1578 	return cred;
1579 }
1580 
1581 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1582 			  struct ucred *ucred)
1583 {
1584 	ucred->pid = pid_vnr(pid);
1585 	ucred->uid = ucred->gid = -1;
1586 	if (cred) {
1587 		struct user_namespace *current_ns = current_user_ns();
1588 
1589 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1590 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1591 	}
1592 }
1593 
1594 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1595 {
1596 	struct user_namespace *user_ns = current_user_ns();
1597 	int i;
1598 
1599 	for (i = 0; i < src->ngroups; i++) {
1600 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1601 
1602 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1603 			return -EFAULT;
1604 	}
1605 
1606 	return 0;
1607 }
1608 
1609 int sk_getsockopt(struct sock *sk, int level, int optname,
1610 		  sockptr_t optval, sockptr_t optlen)
1611 {
1612 	struct socket *sock = sk->sk_socket;
1613 
1614 	union {
1615 		int val;
1616 		u64 val64;
1617 		unsigned long ulval;
1618 		struct linger ling;
1619 		struct old_timeval32 tm32;
1620 		struct __kernel_old_timeval tm;
1621 		struct  __kernel_sock_timeval stm;
1622 		struct sock_txtime txtime;
1623 		struct so_timestamping timestamping;
1624 	} v;
1625 
1626 	int lv = sizeof(int);
1627 	int len;
1628 
1629 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1630 		return -EFAULT;
1631 	if (len < 0)
1632 		return -EINVAL;
1633 
1634 	memset(&v, 0, sizeof(v));
1635 
1636 	switch (optname) {
1637 	case SO_DEBUG:
1638 		v.val = sock_flag(sk, SOCK_DBG);
1639 		break;
1640 
1641 	case SO_DONTROUTE:
1642 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1643 		break;
1644 
1645 	case SO_BROADCAST:
1646 		v.val = sock_flag(sk, SOCK_BROADCAST);
1647 		break;
1648 
1649 	case SO_SNDBUF:
1650 		v.val = READ_ONCE(sk->sk_sndbuf);
1651 		break;
1652 
1653 	case SO_RCVBUF:
1654 		v.val = READ_ONCE(sk->sk_rcvbuf);
1655 		break;
1656 
1657 	case SO_REUSEADDR:
1658 		v.val = sk->sk_reuse;
1659 		break;
1660 
1661 	case SO_REUSEPORT:
1662 		v.val = sk->sk_reuseport;
1663 		break;
1664 
1665 	case SO_KEEPALIVE:
1666 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1667 		break;
1668 
1669 	case SO_TYPE:
1670 		v.val = sk->sk_type;
1671 		break;
1672 
1673 	case SO_PROTOCOL:
1674 		v.val = sk->sk_protocol;
1675 		break;
1676 
1677 	case SO_DOMAIN:
1678 		v.val = sk->sk_family;
1679 		break;
1680 
1681 	case SO_ERROR:
1682 		v.val = -sock_error(sk);
1683 		if (v.val == 0)
1684 			v.val = xchg(&sk->sk_err_soft, 0);
1685 		break;
1686 
1687 	case SO_OOBINLINE:
1688 		v.val = sock_flag(sk, SOCK_URGINLINE);
1689 		break;
1690 
1691 	case SO_NO_CHECK:
1692 		v.val = sk->sk_no_check_tx;
1693 		break;
1694 
1695 	case SO_PRIORITY:
1696 		v.val = READ_ONCE(sk->sk_priority);
1697 		break;
1698 
1699 	case SO_LINGER:
1700 		lv		= sizeof(v.ling);
1701 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1702 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1703 		break;
1704 
1705 	case SO_BSDCOMPAT:
1706 		break;
1707 
1708 	case SO_TIMESTAMP_OLD:
1709 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1710 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1711 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1712 		break;
1713 
1714 	case SO_TIMESTAMPNS_OLD:
1715 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1716 		break;
1717 
1718 	case SO_TIMESTAMP_NEW:
1719 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1720 		break;
1721 
1722 	case SO_TIMESTAMPNS_NEW:
1723 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1724 		break;
1725 
1726 	case SO_TIMESTAMPING_OLD:
1727 	case SO_TIMESTAMPING_NEW:
1728 		lv = sizeof(v.timestamping);
1729 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1730 		 * returning the flags when they were set through the same option.
1731 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1732 		 */
1733 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1734 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1735 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1736 		}
1737 		break;
1738 
1739 	case SO_RCVTIMEO_OLD:
1740 	case SO_RCVTIMEO_NEW:
1741 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1742 				      SO_RCVTIMEO_OLD == optname);
1743 		break;
1744 
1745 	case SO_SNDTIMEO_OLD:
1746 	case SO_SNDTIMEO_NEW:
1747 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1748 				      SO_SNDTIMEO_OLD == optname);
1749 		break;
1750 
1751 	case SO_RCVLOWAT:
1752 		v.val = READ_ONCE(sk->sk_rcvlowat);
1753 		break;
1754 
1755 	case SO_SNDLOWAT:
1756 		v.val = 1;
1757 		break;
1758 
1759 	case SO_PASSCRED:
1760 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1761 		break;
1762 
1763 	case SO_PASSPIDFD:
1764 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1765 		break;
1766 
1767 	case SO_PEERCRED:
1768 	{
1769 		struct ucred peercred;
1770 		if (len > sizeof(peercred))
1771 			len = sizeof(peercred);
1772 
1773 		spin_lock(&sk->sk_peer_lock);
1774 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1775 		spin_unlock(&sk->sk_peer_lock);
1776 
1777 		if (copy_to_sockptr(optval, &peercred, len))
1778 			return -EFAULT;
1779 		goto lenout;
1780 	}
1781 
1782 	case SO_PEERPIDFD:
1783 	{
1784 		struct pid *peer_pid;
1785 		struct file *pidfd_file = NULL;
1786 		int pidfd;
1787 
1788 		if (len > sizeof(pidfd))
1789 			len = sizeof(pidfd);
1790 
1791 		spin_lock(&sk->sk_peer_lock);
1792 		peer_pid = get_pid(sk->sk_peer_pid);
1793 		spin_unlock(&sk->sk_peer_lock);
1794 
1795 		if (!peer_pid)
1796 			return -ENODATA;
1797 
1798 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1799 		put_pid(peer_pid);
1800 		if (pidfd < 0)
1801 			return pidfd;
1802 
1803 		if (copy_to_sockptr(optval, &pidfd, len) ||
1804 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1805 			put_unused_fd(pidfd);
1806 			fput(pidfd_file);
1807 
1808 			return -EFAULT;
1809 		}
1810 
1811 		fd_install(pidfd, pidfd_file);
1812 		return 0;
1813 	}
1814 
1815 	case SO_PEERGROUPS:
1816 	{
1817 		const struct cred *cred;
1818 		int ret, n;
1819 
1820 		cred = sk_get_peer_cred(sk);
1821 		if (!cred)
1822 			return -ENODATA;
1823 
1824 		n = cred->group_info->ngroups;
1825 		if (len < n * sizeof(gid_t)) {
1826 			len = n * sizeof(gid_t);
1827 			put_cred(cred);
1828 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1829 		}
1830 		len = n * sizeof(gid_t);
1831 
1832 		ret = groups_to_user(optval, cred->group_info);
1833 		put_cred(cred);
1834 		if (ret)
1835 			return ret;
1836 		goto lenout;
1837 	}
1838 
1839 	case SO_PEERNAME:
1840 	{
1841 		struct sockaddr_storage address;
1842 
1843 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1844 		if (lv < 0)
1845 			return -ENOTCONN;
1846 		if (lv < len)
1847 			return -EINVAL;
1848 		if (copy_to_sockptr(optval, &address, len))
1849 			return -EFAULT;
1850 		goto lenout;
1851 	}
1852 
1853 	/* Dubious BSD thing... Probably nobody even uses it, but
1854 	 * the UNIX standard wants it for whatever reason... -DaveM
1855 	 */
1856 	case SO_ACCEPTCONN:
1857 		v.val = sk->sk_state == TCP_LISTEN;
1858 		break;
1859 
1860 	case SO_PASSSEC:
1861 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1862 		break;
1863 
1864 	case SO_PEERSEC:
1865 		return security_socket_getpeersec_stream(sock,
1866 							 optval, optlen, len);
1867 
1868 	case SO_MARK:
1869 		v.val = READ_ONCE(sk->sk_mark);
1870 		break;
1871 
1872 	case SO_RCVMARK:
1873 		v.val = sock_flag(sk, SOCK_RCVMARK);
1874 		break;
1875 
1876 	case SO_RXQ_OVFL:
1877 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1878 		break;
1879 
1880 	case SO_WIFI_STATUS:
1881 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1882 		break;
1883 
1884 	case SO_PEEK_OFF:
1885 		if (!READ_ONCE(sock->ops)->set_peek_off)
1886 			return -EOPNOTSUPP;
1887 
1888 		v.val = READ_ONCE(sk->sk_peek_off);
1889 		break;
1890 	case SO_NOFCS:
1891 		v.val = sock_flag(sk, SOCK_NOFCS);
1892 		break;
1893 
1894 	case SO_BINDTODEVICE:
1895 		return sock_getbindtodevice(sk, optval, optlen, len);
1896 
1897 	case SO_GET_FILTER:
1898 		len = sk_get_filter(sk, optval, len);
1899 		if (len < 0)
1900 			return len;
1901 
1902 		goto lenout;
1903 
1904 	case SO_LOCK_FILTER:
1905 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1906 		break;
1907 
1908 	case SO_BPF_EXTENSIONS:
1909 		v.val = bpf_tell_extensions();
1910 		break;
1911 
1912 	case SO_SELECT_ERR_QUEUE:
1913 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1914 		break;
1915 
1916 #ifdef CONFIG_NET_RX_BUSY_POLL
1917 	case SO_BUSY_POLL:
1918 		v.val = READ_ONCE(sk->sk_ll_usec);
1919 		break;
1920 	case SO_PREFER_BUSY_POLL:
1921 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1922 		break;
1923 #endif
1924 
1925 	case SO_MAX_PACING_RATE:
1926 		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1927 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1928 			lv = sizeof(v.ulval);
1929 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1930 		} else {
1931 			/* 32bit version */
1932 			v.val = min_t(unsigned long, ~0U,
1933 				      READ_ONCE(sk->sk_max_pacing_rate));
1934 		}
1935 		break;
1936 
1937 	case SO_INCOMING_CPU:
1938 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1939 		break;
1940 
1941 	case SO_MEMINFO:
1942 	{
1943 		u32 meminfo[SK_MEMINFO_VARS];
1944 
1945 		sk_get_meminfo(sk, meminfo);
1946 
1947 		len = min_t(unsigned int, len, sizeof(meminfo));
1948 		if (copy_to_sockptr(optval, &meminfo, len))
1949 			return -EFAULT;
1950 
1951 		goto lenout;
1952 	}
1953 
1954 #ifdef CONFIG_NET_RX_BUSY_POLL
1955 	case SO_INCOMING_NAPI_ID:
1956 		v.val = READ_ONCE(sk->sk_napi_id);
1957 
1958 		/* aggregate non-NAPI IDs down to 0 */
1959 		if (v.val < MIN_NAPI_ID)
1960 			v.val = 0;
1961 
1962 		break;
1963 #endif
1964 
1965 	case SO_COOKIE:
1966 		lv = sizeof(u64);
1967 		if (len < lv)
1968 			return -EINVAL;
1969 		v.val64 = sock_gen_cookie(sk);
1970 		break;
1971 
1972 	case SO_ZEROCOPY:
1973 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1974 		break;
1975 
1976 	case SO_TXTIME:
1977 		lv = sizeof(v.txtime);
1978 		v.txtime.clockid = sk->sk_clockid;
1979 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1980 				  SOF_TXTIME_DEADLINE_MODE : 0;
1981 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1982 				  SOF_TXTIME_REPORT_ERRORS : 0;
1983 		break;
1984 
1985 	case SO_BINDTOIFINDEX:
1986 		v.val = READ_ONCE(sk->sk_bound_dev_if);
1987 		break;
1988 
1989 	case SO_NETNS_COOKIE:
1990 		lv = sizeof(u64);
1991 		if (len != lv)
1992 			return -EINVAL;
1993 		v.val64 = sock_net(sk)->net_cookie;
1994 		break;
1995 
1996 	case SO_BUF_LOCK:
1997 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1998 		break;
1999 
2000 	case SO_RESERVE_MEM:
2001 		v.val = READ_ONCE(sk->sk_reserved_mem);
2002 		break;
2003 
2004 	case SO_TXREHASH:
2005 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2006 		v.val = READ_ONCE(sk->sk_txrehash);
2007 		break;
2008 
2009 	default:
2010 		/* We implement the SO_SNDLOWAT etc to not be settable
2011 		 * (1003.1g 7).
2012 		 */
2013 		return -ENOPROTOOPT;
2014 	}
2015 
2016 	if (len > lv)
2017 		len = lv;
2018 	if (copy_to_sockptr(optval, &v, len))
2019 		return -EFAULT;
2020 lenout:
2021 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2022 		return -EFAULT;
2023 	return 0;
2024 }
2025 
2026 /*
2027  * Initialize an sk_lock.
2028  *
2029  * (We also register the sk_lock with the lock validator.)
2030  */
2031 static inline void sock_lock_init(struct sock *sk)
2032 {
2033 	if (sk->sk_kern_sock)
2034 		sock_lock_init_class_and_name(
2035 			sk,
2036 			af_family_kern_slock_key_strings[sk->sk_family],
2037 			af_family_kern_slock_keys + sk->sk_family,
2038 			af_family_kern_key_strings[sk->sk_family],
2039 			af_family_kern_keys + sk->sk_family);
2040 	else
2041 		sock_lock_init_class_and_name(
2042 			sk,
2043 			af_family_slock_key_strings[sk->sk_family],
2044 			af_family_slock_keys + sk->sk_family,
2045 			af_family_key_strings[sk->sk_family],
2046 			af_family_keys + sk->sk_family);
2047 }
2048 
2049 /*
2050  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2051  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2052  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
2053  */
2054 static void sock_copy(struct sock *nsk, const struct sock *osk)
2055 {
2056 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2057 #ifdef CONFIG_SECURITY_NETWORK
2058 	void *sptr = nsk->sk_security;
2059 #endif
2060 
2061 	/* If we move sk_tx_queue_mapping out of the private section,
2062 	 * we must check if sk_tx_queue_clear() is called after
2063 	 * sock_copy() in sk_clone_lock().
2064 	 */
2065 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2066 		     offsetof(struct sock, sk_dontcopy_begin) ||
2067 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2068 		     offsetof(struct sock, sk_dontcopy_end));
2069 
2070 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2071 
2072 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2073 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2074 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2075 
2076 #ifdef CONFIG_SECURITY_NETWORK
2077 	nsk->sk_security = sptr;
2078 	security_sk_clone(osk, nsk);
2079 #endif
2080 }
2081 
2082 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2083 		int family)
2084 {
2085 	struct sock *sk;
2086 	struct kmem_cache *slab;
2087 
2088 	slab = prot->slab;
2089 	if (slab != NULL) {
2090 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2091 		if (!sk)
2092 			return sk;
2093 		if (want_init_on_alloc(priority))
2094 			sk_prot_clear_nulls(sk, prot->obj_size);
2095 	} else
2096 		sk = kmalloc(prot->obj_size, priority);
2097 
2098 	if (sk != NULL) {
2099 		if (security_sk_alloc(sk, family, priority))
2100 			goto out_free;
2101 
2102 		if (!try_module_get(prot->owner))
2103 			goto out_free_sec;
2104 	}
2105 
2106 	return sk;
2107 
2108 out_free_sec:
2109 	security_sk_free(sk);
2110 out_free:
2111 	if (slab != NULL)
2112 		kmem_cache_free(slab, sk);
2113 	else
2114 		kfree(sk);
2115 	return NULL;
2116 }
2117 
2118 static void sk_prot_free(struct proto *prot, struct sock *sk)
2119 {
2120 	struct kmem_cache *slab;
2121 	struct module *owner;
2122 
2123 	owner = prot->owner;
2124 	slab = prot->slab;
2125 
2126 	cgroup_sk_free(&sk->sk_cgrp_data);
2127 	mem_cgroup_sk_free(sk);
2128 	security_sk_free(sk);
2129 	if (slab != NULL)
2130 		kmem_cache_free(slab, sk);
2131 	else
2132 		kfree(sk);
2133 	module_put(owner);
2134 }
2135 
2136 /**
2137  *	sk_alloc - All socket objects are allocated here
2138  *	@net: the applicable net namespace
2139  *	@family: protocol family
2140  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2141  *	@prot: struct proto associated with this new sock instance
2142  *	@kern: is this to be a kernel socket?
2143  */
2144 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2145 		      struct proto *prot, int kern)
2146 {
2147 	struct sock *sk;
2148 
2149 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2150 	if (sk) {
2151 		sk->sk_family = family;
2152 		/*
2153 		 * See comment in struct sock definition to understand
2154 		 * why we need sk_prot_creator -acme
2155 		 */
2156 		sk->sk_prot = sk->sk_prot_creator = prot;
2157 		sk->sk_kern_sock = kern;
2158 		sock_lock_init(sk);
2159 		sk->sk_net_refcnt = kern ? 0 : 1;
2160 		if (likely(sk->sk_net_refcnt)) {
2161 			get_net_track(net, &sk->ns_tracker, priority);
2162 			sock_inuse_add(net, 1);
2163 		} else {
2164 			__netns_tracker_alloc(net, &sk->ns_tracker,
2165 					      false, priority);
2166 		}
2167 
2168 		sock_net_set(sk, net);
2169 		refcount_set(&sk->sk_wmem_alloc, 1);
2170 
2171 		mem_cgroup_sk_alloc(sk);
2172 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2173 		sock_update_classid(&sk->sk_cgrp_data);
2174 		sock_update_netprioidx(&sk->sk_cgrp_data);
2175 		sk_tx_queue_clear(sk);
2176 	}
2177 
2178 	return sk;
2179 }
2180 EXPORT_SYMBOL(sk_alloc);
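
/* Illustrative sketch, not part of this file: a protocol family's ->create()
 * handler typically pairs sk_alloc() with sock_init_data(). The names
 * example_prot and example_create below are hypothetical.
 *
 *	static int example_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_INET, GFP_KERNEL, &example_prot, kern);
 *		if (!sk)
 *			return -ENOBUFS;
 *		sock_init_data(sock, sk);
 *		sk->sk_protocol = protocol;
 *		return 0;
 *	}
 */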
2181 
2182 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2183  * grace period. This is the case for UDP sockets and TCP listeners.
2184  */
2185 static void __sk_destruct(struct rcu_head *head)
2186 {
2187 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2188 	struct sk_filter *filter;
2189 
2190 	if (sk->sk_destruct)
2191 		sk->sk_destruct(sk);
2192 
2193 	filter = rcu_dereference_check(sk->sk_filter,
2194 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2195 	if (filter) {
2196 		sk_filter_uncharge(sk, filter);
2197 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2198 	}
2199 
2200 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2201 
2202 #ifdef CONFIG_BPF_SYSCALL
2203 	bpf_sk_storage_free(sk);
2204 #endif
2205 
2206 	if (atomic_read(&sk->sk_omem_alloc))
2207 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2208 			 __func__, atomic_read(&sk->sk_omem_alloc));
2209 
2210 	if (sk->sk_frag.page) {
2211 		put_page(sk->sk_frag.page);
2212 		sk->sk_frag.page = NULL;
2213 	}
2214 
2215 	/* We do not need to acquire sk->sk_peer_lock, as we are the last user. */
2216 	put_cred(sk->sk_peer_cred);
2217 	put_pid(sk->sk_peer_pid);
2218 
2219 	if (likely(sk->sk_net_refcnt))
2220 		put_net_track(sock_net(sk), &sk->ns_tracker);
2221 	else
2222 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2223 
2224 	sk_prot_free(sk->sk_prot_creator, sk);
2225 }
2226 
2227 void sk_destruct(struct sock *sk)
2228 {
2229 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2230 
2231 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2232 		reuseport_detach_sock(sk);
2233 		use_call_rcu = true;
2234 	}
2235 
2236 	if (use_call_rcu)
2237 		call_rcu(&sk->sk_rcu, __sk_destruct);
2238 	else
2239 		__sk_destruct(&sk->sk_rcu);
2240 }
2241 
2242 static void __sk_free(struct sock *sk)
2243 {
2244 	if (likely(sk->sk_net_refcnt))
2245 		sock_inuse_add(sock_net(sk), -1);
2246 
2247 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2248 		sock_diag_broadcast_destroy(sk);
2249 	else
2250 		sk_destruct(sk);
2251 }
2252 
2253 void sk_free(struct sock *sk)
2254 {
2255 	/*
2256 	 * We subtract one from sk_wmem_alloc to learn whether
2257 	 * some packets are still in some tx queue.
2258 	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
2259 	 */
2260 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2261 		__sk_free(sk);
2262 }
2263 EXPORT_SYMBOL(sk_free);
2264 
2265 static void sk_init_common(struct sock *sk)
2266 {
2267 	skb_queue_head_init(&sk->sk_receive_queue);
2268 	skb_queue_head_init(&sk->sk_write_queue);
2269 	skb_queue_head_init(&sk->sk_error_queue);
2270 
2271 	rwlock_init(&sk->sk_callback_lock);
2272 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2273 			af_rlock_keys + sk->sk_family,
2274 			af_family_rlock_key_strings[sk->sk_family]);
2275 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2276 			af_wlock_keys + sk->sk_family,
2277 			af_family_wlock_key_strings[sk->sk_family]);
2278 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2279 			af_elock_keys + sk->sk_family,
2280 			af_family_elock_key_strings[sk->sk_family]);
2281 	if (sk->sk_kern_sock)
2282 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2283 			af_kern_callback_keys + sk->sk_family,
2284 			af_family_kern_clock_key_strings[sk->sk_family]);
2285 	else
2286 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2287 			af_callback_keys + sk->sk_family,
2288 			af_family_clock_key_strings[sk->sk_family]);
2289 }
2290 
2291 /**
2292  *	sk_clone_lock - clone a socket, and lock its clone
2293  *	@sk: the socket to clone
2294  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2295  *
2296  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2297  */
2298 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2299 {
2300 	struct proto *prot = READ_ONCE(sk->sk_prot);
2301 	struct sk_filter *filter;
2302 	bool is_charged = true;
2303 	struct sock *newsk;
2304 
2305 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2306 	if (!newsk)
2307 		goto out;
2308 
2309 	sock_copy(newsk, sk);
2310 
2311 	newsk->sk_prot_creator = prot;
2312 
2313 	/* SANITY */
2314 	if (likely(newsk->sk_net_refcnt)) {
2315 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2316 		sock_inuse_add(sock_net(newsk), 1);
2317 	} else {
2318 		/* Kernel sockets do not elevate the struct net refcount.
2319 		 * Instead, use a tracker to more easily detect if a layer
2320 		 * is not properly dismantling its kernel sockets at netns
2321 		 * destroy time.
2322 		 */
2323 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2324 				      false, priority);
2325 	}
2326 	sk_node_init(&newsk->sk_node);
2327 	sock_lock_init(newsk);
2328 	bh_lock_sock(newsk);
2329 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2330 	newsk->sk_backlog.len = 0;
2331 
2332 	atomic_set(&newsk->sk_rmem_alloc, 0);
2333 
2334 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2335 	refcount_set(&newsk->sk_wmem_alloc, 1);
2336 
2337 	atomic_set(&newsk->sk_omem_alloc, 0);
2338 	sk_init_common(newsk);
2339 
2340 	newsk->sk_dst_cache	= NULL;
2341 	newsk->sk_dst_pending_confirm = 0;
2342 	newsk->sk_wmem_queued	= 0;
2343 	newsk->sk_forward_alloc = 0;
2344 	newsk->sk_reserved_mem  = 0;
2345 	atomic_set(&newsk->sk_drops, 0);
2346 	newsk->sk_send_head	= NULL;
2347 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2348 	atomic_set(&newsk->sk_zckey, 0);
2349 
2350 	sock_reset_flag(newsk, SOCK_DONE);
2351 
2352 	/* sk->sk_memcg will be populated at accept() time */
2353 	newsk->sk_memcg = NULL;
2354 
2355 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2356 
2357 	rcu_read_lock();
2358 	filter = rcu_dereference(sk->sk_filter);
2359 	if (filter != NULL)
2360 		/* Though it's an empty new sock, the charging may fail
2361 		 * if sysctl_optmem_max was changed between the creation of
2362 		 * the original socket and the cloning.
2363 		 */
2364 		is_charged = sk_filter_charge(newsk, filter);
2365 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2366 	rcu_read_unlock();
2367 
2368 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2369 		/* We need to make sure that we don't uncharge the new
2370 		 * socket if we couldn't charge it in the first place
2371 		 * as otherwise we uncharge the parent's filter.
2372 		 */
2373 		if (!is_charged)
2374 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2375 		sk_free_unlock_clone(newsk);
2376 		newsk = NULL;
2377 		goto out;
2378 	}
2379 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2380 
2381 	if (bpf_sk_storage_clone(sk, newsk)) {
2382 		sk_free_unlock_clone(newsk);
2383 		newsk = NULL;
2384 		goto out;
2385 	}
2386 
2387 	/* Clear sk_user_data if parent had the pointer tagged
2388 	 * as not suitable for copying when cloning.
2389 	 */
2390 	if (sk_user_data_is_nocopy(newsk))
2391 		newsk->sk_user_data = NULL;
2392 
2393 	newsk->sk_err	   = 0;
2394 	newsk->sk_err_soft = 0;
2395 	newsk->sk_priority = 0;
2396 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2397 
2398 	/* Before updating sk_refcnt, we must commit prior changes to memory
2399 	 * (Documentation/RCU/rculist_nulls.rst for details)
2400 	 */
2401 	smp_wmb();
2402 	refcount_set(&newsk->sk_refcnt, 2);
2403 
2404 	sk_set_socket(newsk, NULL);
2405 	sk_tx_queue_clear(newsk);
2406 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2407 
2408 	if (newsk->sk_prot->sockets_allocated)
2409 		sk_sockets_allocated_inc(newsk);
2410 
2411 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2412 		net_enable_timestamp();
2413 out:
2414 	return newsk;
2415 }
2416 EXPORT_SYMBOL_GPL(sk_clone_lock);
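
/* Illustrative sketch, hypothetical caller: sk_clone_lock() returns the clone
 * with its bh lock held, so even on the success path the caller releases it
 * with bh_unlock_sock() once protocol specific setup is done.
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... protocol specific initialization of newsk ...
 *		bh_unlock_sock(newsk);
 *	}
 */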
2417 
2418 void sk_free_unlock_clone(struct sock *sk)
2419 {
2420 	/* It is still a raw copy of the parent, so invalidate
2421 	 * the destructor and do a plain sk_free(). */
2422 	sk->sk_destruct = NULL;
2423 	bh_unlock_sock(sk);
2424 	sk_free(sk);
2425 }
2426 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2427 
2428 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2429 {
2430 	bool is_ipv6 = false;
2431 	u32 max_size;
2432 
2433 #if IS_ENABLED(CONFIG_IPV6)
2434 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2435 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2436 #endif
2437 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2438 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2439 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2440 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2441 		max_size = GSO_LEGACY_MAX_SIZE;
2442 
2443 	return max_size - (MAX_TCP_HEADER + 1);
2444 }
2445 
2446 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2447 {
2448 	u32 max_segs = 1;
2449 
2450 	sk->sk_route_caps = dst->dev->features;
2451 	if (sk_is_tcp(sk))
2452 		sk->sk_route_caps |= NETIF_F_GSO;
2453 	if (sk->sk_route_caps & NETIF_F_GSO)
2454 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2455 	if (unlikely(sk->sk_gso_disabled))
2456 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2457 	if (sk_can_gso(sk)) {
2458 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2459 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2460 		} else {
2461 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2462 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2463 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2464 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2465 		}
2466 	}
2467 	sk->sk_gso_max_segs = max_segs;
2468 	sk_dst_set(sk, dst);
2469 }
2470 EXPORT_SYMBOL_GPL(sk_setup_caps);
2471 
2472 /*
2473  *	Simple resource managers for sockets.
2474  */
2475 
2477 /*
2478  * Write buffer destructor automatically called from kfree_skb.
2479  */
2480 void sock_wfree(struct sk_buff *skb)
2481 {
2482 	struct sock *sk = skb->sk;
2483 	unsigned int len = skb->truesize;
2484 	bool free;
2485 
2486 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2487 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2488 		    sk->sk_write_space == sock_def_write_space) {
2489 			rcu_read_lock();
2490 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2491 			sock_def_write_space_wfree(sk);
2492 			rcu_read_unlock();
2493 			if (unlikely(free))
2494 				__sk_free(sk);
2495 			return;
2496 		}
2497 
2498 		/*
2499 		 * Keep a reference on sk_wmem_alloc; it will be released
2500 		 * after the sk_write_space() call.
2501 		 */
2502 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2503 		sk->sk_write_space(sk);
2504 		len = 1;
2505 	}
2506 	/*
2507 	 * If sk_wmem_alloc reaches 0, we must finish what sk_free()
2508 	 * could not do because of in-flight packets.
2509 	 */
2510 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2511 		__sk_free(sk);
2512 }
2513 EXPORT_SYMBOL(sock_wfree);
2514 
2515 /* This variant of sock_wfree() is used by TCP,
2516  * since it sets SOCK_USE_WRITE_QUEUE.
2517  */
2518 void __sock_wfree(struct sk_buff *skb)
2519 {
2520 	struct sock *sk = skb->sk;
2521 
2522 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2523 		__sk_free(sk);
2524 }
2525 
2526 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2527 {
2528 	skb_orphan(skb);
2529 	skb->sk = sk;
2530 #ifdef CONFIG_INET
2531 	if (unlikely(!sk_fullsock(sk))) {
2532 		skb->destructor = sock_edemux;
2533 		sock_hold(sk);
2534 		return;
2535 	}
2536 #endif
2537 	skb->destructor = sock_wfree;
2538 	skb_set_hash_from_sk(skb, sk);
2539 	/*
2540 	 * We used to take a refcount on sk, but the following operation
2541 	 * is enough to guarantee sk_free() won't free this sock until
2542 	 * all in-flight packets have completed.
2543 	 */
2544 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2545 }
2546 EXPORT_SYMBOL(skb_set_owner_w);
2547 
2548 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2549 {
2550 	/* Drivers depend on in-order delivery for crypto offload;
2551 	 * a partial orphan breaks the out-of-order-OK logic.
2552 	 */
2553 	if (skb_is_decrypted(skb))
2554 		return false;
2555 
2556 	return (skb->destructor == sock_wfree ||
2557 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2558 }
2559 
2560 /* This helper is used by netem, as it can hold packets in its
2561  * delay queue. We want to allow the owner socket to send more
2562  * packets, as if they were already TX completed by a typical driver.
2563  * But we also want to keep skb->sk set because some packet schedulers
2564  * rely on it (sch_fq for example).
2565  */
2566 void skb_orphan_partial(struct sk_buff *skb)
2567 {
2568 	if (skb_is_tcp_pure_ack(skb))
2569 		return;
2570 
2571 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2572 		return;
2573 
2574 	skb_orphan(skb);
2575 }
2576 EXPORT_SYMBOL(skb_orphan_partial);
2577 
2578 /*
2579  * Read buffer destructor automatically called from kfree_skb.
2580  */
2581 void sock_rfree(struct sk_buff *skb)
2582 {
2583 	struct sock *sk = skb->sk;
2584 	unsigned int len = skb->truesize;
2585 
2586 	atomic_sub(len, &sk->sk_rmem_alloc);
2587 	sk_mem_uncharge(sk, len);
2588 }
2589 EXPORT_SYMBOL(sock_rfree);
2590 
2591 /*
2592  * Buffer destructor for skbs that are not used directly in read or write
2593  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2594  */
2595 void sock_efree(struct sk_buff *skb)
2596 {
2597 	sock_put(skb->sk);
2598 }
2599 EXPORT_SYMBOL(sock_efree);
2600 
2601 /* Buffer destructor for prefetch/receive path where reference count may
2602  * not be held, e.g. for listen sockets.
2603  */
2604 #ifdef CONFIG_INET
2605 void sock_pfree(struct sk_buff *skb)
2606 {
2607 	struct sock *sk = skb->sk;
2608 
2609 	if (!sk_is_refcounted(sk))
2610 		return;
2611 
2612 	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2613 		inet_reqsk(sk)->rsk_listener = NULL;
2614 		reqsk_free(inet_reqsk(sk));
2615 		return;
2616 	}
2617 
2618 	sock_gen_put(sk);
2619 }
2620 EXPORT_SYMBOL(sock_pfree);
2621 #endif /* CONFIG_INET */
2622 
2623 kuid_t sock_i_uid(struct sock *sk)
2624 {
2625 	kuid_t uid;
2626 
2627 	read_lock_bh(&sk->sk_callback_lock);
2628 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2629 	read_unlock_bh(&sk->sk_callback_lock);
2630 	return uid;
2631 }
2632 EXPORT_SYMBOL(sock_i_uid);
2633 
2634 unsigned long __sock_i_ino(struct sock *sk)
2635 {
2636 	unsigned long ino;
2637 
2638 	read_lock(&sk->sk_callback_lock);
2639 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2640 	read_unlock(&sk->sk_callback_lock);
2641 	return ino;
2642 }
2643 EXPORT_SYMBOL(__sock_i_ino);
2644 
2645 unsigned long sock_i_ino(struct sock *sk)
2646 {
2647 	unsigned long ino;
2648 
2649 	local_bh_disable();
2650 	ino = __sock_i_ino(sk);
2651 	local_bh_enable();
2652 	return ino;
2653 }
2654 EXPORT_SYMBOL(sock_i_ino);
2655 
2656 /*
2657  * Allocate a skb from the socket's send buffer.
2658  */
2659 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2660 			     gfp_t priority)
2661 {
2662 	if (force ||
2663 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2664 		struct sk_buff *skb = alloc_skb(size, priority);
2665 
2666 		if (skb) {
2667 			skb_set_owner_w(skb, sk);
2668 			return skb;
2669 		}
2670 	}
2671 	return NULL;
2672 }
2673 EXPORT_SYMBOL(sock_wmalloc);
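
/* Illustrative sketch, hypothetical control path: allocate a small skb that
 * is charged against the send buffer unless @force is set.
 *
 *	skb = sock_wmalloc(sk, hlen + len, 0, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOBUFS;
 *	skb_reserve(skb, hlen);
 *	... fill in len bytes and transmit; sock_wfree() uncharges on free ...
 */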
2674 
2675 static void sock_ofree(struct sk_buff *skb)
2676 {
2677 	struct sock *sk = skb->sk;
2678 
2679 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2680 }
2681 
2682 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2683 			     gfp_t priority)
2684 {
2685 	struct sk_buff *skb;
2686 
2687 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2688 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2689 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2690 		return NULL;
2691 
2692 	skb = alloc_skb(size, priority);
2693 	if (!skb)
2694 		return NULL;
2695 
2696 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2697 	skb->sk = sk;
2698 	skb->destructor = sock_ofree;
2699 	return skb;
2700 }
2701 
2702 /*
2703  * Allocate a memory block from the socket's option memory buffer.
2704  */
2705 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2706 {
2707 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2708 
2709 	if ((unsigned int)size <= optmem_max &&
2710 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2711 		void *mem;
2712 		/* First do the add, to avoid the race if kmalloc
2713 		 * might sleep.
2714 		 */
2715 		atomic_add(size, &sk->sk_omem_alloc);
2716 		mem = kmalloc(size, priority);
2717 		if (mem)
2718 			return mem;
2719 		atomic_sub(size, &sk->sk_omem_alloc);
2720 	}
2721 	return NULL;
2722 }
2723 EXPORT_SYMBOL(sock_kmalloc);
2724 
2725 /* Free an option memory block. Note that we actually want the inline
2726  * here, as it allows gcc to detect the nullify and fold away the
2727  * condition entirely.
2728  */
2729 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2730 				  const bool nullify)
2731 {
2732 	if (WARN_ON_ONCE(!mem))
2733 		return;
2734 	if (nullify)
2735 		kfree_sensitive(mem);
2736 	else
2737 		kfree(mem);
2738 	atomic_sub(size, &sk->sk_omem_alloc);
2739 }
2740 
2741 void sock_kfree_s(struct sock *sk, void *mem, int size)
2742 {
2743 	__sock_kfree_s(sk, mem, size, false);
2744 }
2745 EXPORT_SYMBOL(sock_kfree_s);
2746 
2747 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2748 {
2749 	__sock_kfree_s(sk, mem, size, true);
2750 }
2751 EXPORT_SYMBOL(sock_kzfree_s);
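
/* Illustrative sketch, hypothetical helper: option memory is charged to
 * sk_omem_alloc, so every sock_kmalloc() must be paired with a
 * sock_kfree_s() (or sock_kzfree_s() for sensitive data) of the same size.
 *
 *	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOBUFS;
 *	... use buf ...
 *	sock_kzfree_s(sk, buf, len);
 */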
2752 
2753 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2754  * I think these locks should be removed for datagram sockets.
2755  */
2756 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2757 {
2758 	DEFINE_WAIT(wait);
2759 
2760 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2761 	for (;;) {
2762 		if (!timeo)
2763 			break;
2764 		if (signal_pending(current))
2765 			break;
2766 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2767 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2768 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2769 			break;
2770 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2771 			break;
2772 		if (READ_ONCE(sk->sk_err))
2773 			break;
2774 		timeo = schedule_timeout(timeo);
2775 	}
2776 	finish_wait(sk_sleep(sk), &wait);
2777 	return timeo;
2778 }
2779 
2780 
2781 /*
2782  *	Generic send/receive buffer handlers
2783  */
2784 
2785 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2786 				     unsigned long data_len, int noblock,
2787 				     int *errcode, int max_page_order)
2788 {
2789 	struct sk_buff *skb;
2790 	long timeo;
2791 	int err;
2792 
2793 	timeo = sock_sndtimeo(sk, noblock);
2794 	for (;;) {
2795 		err = sock_error(sk);
2796 		if (err != 0)
2797 			goto failure;
2798 
2799 		err = -EPIPE;
2800 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2801 			goto failure;
2802 
2803 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2804 			break;
2805 
2806 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2807 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2808 		err = -EAGAIN;
2809 		if (!timeo)
2810 			goto failure;
2811 		if (signal_pending(current))
2812 			goto interrupted;
2813 		timeo = sock_wait_for_wmem(sk, timeo);
2814 	}
2815 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2816 				   errcode, sk->sk_allocation);
2817 	if (skb)
2818 		skb_set_owner_w(skb, sk);
2819 	return skb;
2820 
2821 interrupted:
2822 	err = sock_intr_errno(timeo);
2823 failure:
2824 	*errcode = err;
2825 	return NULL;
2826 }
2827 EXPORT_SYMBOL(sock_alloc_send_pskb);
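
/* Illustrative sketch, hypothetical datagram sendmsg path: the header length
 * goes into the linear area, the payload into page frags.
 *
 *	skb = sock_alloc_send_pskb(sk, hlen, dlen,
 *				   msg->msg_flags & MSG_DONTWAIT, &err, 0);
 *	if (!skb)
 *		return err;
 *	... build headers in the linear area, copy dlen bytes of payload ...
 */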
2828 
2829 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2830 		     struct sockcm_cookie *sockc)
2831 {
2832 	u32 tsflags;
2833 
2834 	switch (cmsg->cmsg_type) {
2835 	case SO_MARK:
2836 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2837 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2838 			return -EPERM;
2839 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2840 			return -EINVAL;
2841 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2842 		break;
2843 	case SO_TIMESTAMPING_OLD:
2844 	case SO_TIMESTAMPING_NEW:
2845 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2846 			return -EINVAL;
2847 
2848 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2849 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2850 			return -EINVAL;
2851 
2852 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2853 		sockc->tsflags |= tsflags;
2854 		break;
2855 	case SCM_TXTIME:
2856 		if (!sock_flag(sk, SOCK_TXTIME))
2857 			return -EINVAL;
2858 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2859 			return -EINVAL;
2860 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2861 		break;
2862 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2863 	case SCM_RIGHTS:
2864 	case SCM_CREDENTIALS:
2865 		break;
2866 	default:
2867 		return -EINVAL;
2868 	}
2869 	return 0;
2870 }
2871 EXPORT_SYMBOL(__sock_cmsg_send);
2872 
2873 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2874 		   struct sockcm_cookie *sockc)
2875 {
2876 	struct cmsghdr *cmsg;
2877 	int ret;
2878 
2879 	for_each_cmsghdr(cmsg, msg) {
2880 		if (!CMSG_OK(msg, cmsg))
2881 			return -EINVAL;
2882 		if (cmsg->cmsg_level != SOL_SOCKET)
2883 			continue;
2884 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2885 		if (ret)
2886 			return ret;
2887 	}
2888 	return 0;
2889 }
2890 EXPORT_SYMBOL(sock_cmsg_send);
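
/* Illustrative sketch: a sendmsg() implementation usually seeds the cookie
 * from the socket and then lets SOL_SOCKET control messages override it.
 * sockcm_init() is assumed to be available, as in other protocols.
 *
 *	struct sockcm_cookie sockc;
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 *	... honour sockc.mark, sockc.tsflags and sockc.transmit_time ...
 */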
2891 
2892 static void sk_enter_memory_pressure(struct sock *sk)
2893 {
2894 	if (!sk->sk_prot->enter_memory_pressure)
2895 		return;
2896 
2897 	sk->sk_prot->enter_memory_pressure(sk);
2898 }
2899 
2900 static void sk_leave_memory_pressure(struct sock *sk)
2901 {
2902 	if (sk->sk_prot->leave_memory_pressure) {
2903 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2904 				     tcp_leave_memory_pressure, sk);
2905 	} else {
2906 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2907 
2908 		if (memory_pressure && READ_ONCE(*memory_pressure))
2909 			WRITE_ONCE(*memory_pressure, 0);
2910 	}
2911 }
2912 
2913 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2914 
2915 /**
2916  * skb_page_frag_refill - check that a page_frag contains enough room
2917  * @sz: minimum size of the fragment we want to get
2918  * @pfrag: pointer to page_frag
2919  * @gfp: priority for memory allocation
2920  *
2921  * Note: While this allocator tries to use high order pages, there is
2922  * no guarantee that allocations succeed. Therefore, @sz MUST be
2923  * less than or equal to PAGE_SIZE.
2924  */
2925 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2926 {
2927 	if (pfrag->page) {
2928 		if (page_ref_count(pfrag->page) == 1) {
2929 			pfrag->offset = 0;
2930 			return true;
2931 		}
2932 		if (pfrag->offset + sz <= pfrag->size)
2933 			return true;
2934 		put_page(pfrag->page);
2935 	}
2936 
2937 	pfrag->offset = 0;
2938 	if (SKB_FRAG_PAGE_ORDER &&
2939 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2940 		/* Avoid direct reclaim but allow kswapd to wake */
2941 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2942 					  __GFP_COMP | __GFP_NOWARN |
2943 					  __GFP_NORETRY,
2944 					  SKB_FRAG_PAGE_ORDER);
2945 		if (likely(pfrag->page)) {
2946 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2947 			return true;
2948 		}
2949 	}
2950 	pfrag->page = alloc_page(gfp);
2951 	if (likely(pfrag->page)) {
2952 		pfrag->size = PAGE_SIZE;
2953 		return true;
2954 	}
2955 	return false;
2956 }
2957 EXPORT_SYMBOL(skb_page_frag_refill);
2958 
2959 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2960 {
2961 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2962 		return true;
2963 
2964 	sk_enter_memory_pressure(sk);
2965 	sk_stream_moderate_sndbuf(sk);
2966 	return false;
2967 }
2968 EXPORT_SYMBOL(sk_page_frag_refill);
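
/* Illustrative sketch, hypothetical transmit path: refill the per-task or
 * per-socket fragment, copy into it and advance the offset yourself.
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		return -ENOMEM;
 *	copy = min(bytes, pfrag->size - pfrag->offset);
 *	... copy @copy bytes to page_address(pfrag->page) + pfrag->offset ...
 *	pfrag->offset += copy;
 */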
2969 
2970 void __lock_sock(struct sock *sk)
2971 	__releases(&sk->sk_lock.slock)
2972 	__acquires(&sk->sk_lock.slock)
2973 {
2974 	DEFINE_WAIT(wait);
2975 
2976 	for (;;) {
2977 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2978 					TASK_UNINTERRUPTIBLE);
2979 		spin_unlock_bh(&sk->sk_lock.slock);
2980 		schedule();
2981 		spin_lock_bh(&sk->sk_lock.slock);
2982 		if (!sock_owned_by_user(sk))
2983 			break;
2984 	}
2985 	finish_wait(&sk->sk_lock.wq, &wait);
2986 }
2987 
2988 void __release_sock(struct sock *sk)
2989 	__releases(&sk->sk_lock.slock)
2990 	__acquires(&sk->sk_lock.slock)
2991 {
2992 	struct sk_buff *skb, *next;
2993 
2994 	while ((skb = sk->sk_backlog.head) != NULL) {
2995 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2996 
2997 		spin_unlock_bh(&sk->sk_lock.slock);
2998 
2999 		do {
3000 			next = skb->next;
3001 			prefetch(next);
3002 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3003 			skb_mark_not_on_list(skb);
3004 			sk_backlog_rcv(sk, skb);
3005 
3006 			cond_resched();
3007 
3008 			skb = next;
3009 		} while (skb != NULL);
3010 
3011 		spin_lock_bh(&sk->sk_lock.slock);
3012 	}
3013 
3014 	/*
3015 	 * Doing the zeroing here guarantees we cannot loop forever
3016 	 * while a wild producer attempts to flood us.
3017 	 */
3018 	sk->sk_backlog.len = 0;
3019 }
3020 
3021 void __sk_flush_backlog(struct sock *sk)
3022 {
3023 	spin_lock_bh(&sk->sk_lock.slock);
3024 	__release_sock(sk);
3025 
3026 	if (sk->sk_prot->release_cb)
3027 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3028 				     tcp_release_cb, sk);
3029 
3030 	spin_unlock_bh(&sk->sk_lock.slock);
3031 }
3032 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3033 
3034 /**
3035  * sk_wait_data - wait for data to arrive at sk_receive_queue
3036  * @sk:    sock to wait on
3037  * @timeo: for how long
3038  * @skb:   last skb seen on sk_receive_queue
3039  *
3040  * Now socket state including sk->sk_err is changed only under the lock,
3041  * hence we may omit checks after joining the wait queue.
3042  * We check the receive queue before schedule() only as an optimization;
3043  * it is very likely that release_sock() added new data.
3044  */
3045 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3046 {
3047 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3048 	int rc;
3049 
3050 	add_wait_queue(sk_sleep(sk), &wait);
3051 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3052 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3053 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3054 	remove_wait_queue(sk_sleep(sk), &wait);
3055 	return rc;
3056 }
3057 EXPORT_SYMBOL(sk_wait_data);
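
/* Illustrative sketch: with the socket lock held, a blocking receive path
 * remembers the current tail of sk_receive_queue and waits for it to change
 * (hypothetical caller, error handling elided).
 *
 *	struct sk_buff *last = skb_peek_tail(&sk->sk_receive_queue);
 *
 *	if (!copied && timeo)
 *		sk_wait_data(sk, &timeo, last);
 */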
3058 
3059 /**
3060  *	__sk_mem_raise_allocated - increase memory_allocated
3061  *	@sk: socket
3062  *	@size: memory size to allocate
3063  *	@amt: pages to allocate
3064  *	@kind: allocation type
3065  *
3066  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3067  *
3068  *	Unlike the globally shared limits among sockets under the same protocol,
3069  *	consuming the budget of a memcg won't have a direct effect on other ones.
3070  *	So be optimistic about memcg's tolerance, and leave the callers to decide
3071  *	whether or not to raise allocated through sk_under_memory_pressure() or
3072  *	its variants.
3073  */
3074 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3075 {
3076 	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3077 	struct proto *prot = sk->sk_prot;
3078 	bool charged = false;
3079 	long allocated;
3080 
3081 	sk_memory_allocated_add(sk, amt);
3082 	allocated = sk_memory_allocated(sk);
3083 
3084 	if (memcg) {
3085 		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3086 			goto suppress_allocation;
3087 		charged = true;
3088 	}
3089 
3090 	/* Under limit. */
3091 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3092 		sk_leave_memory_pressure(sk);
3093 		return 1;
3094 	}
3095 
3096 	/* Under pressure. */
3097 	if (allocated > sk_prot_mem_limits(sk, 1))
3098 		sk_enter_memory_pressure(sk);
3099 
3100 	/* Over hard limit. */
3101 	if (allocated > sk_prot_mem_limits(sk, 2))
3102 		goto suppress_allocation;
3103 
3104 	/* Guarantee minimum buffer size under pressure (either global
3105 	 * or memcg) to make sure features described in RFC 7323 (TCP
3106 	 * Extensions for High Performance) work properly.
3107 	 *
3108 	 * This rule does NOT stand when the allocation exceeds the global
3109 	 * or memcg hard limit, or else a DoS attack could take place by
3110 	 * spawning lots of sockets whose usage is under the minimum buffer size.
3111 	 */
3112 	if (kind == SK_MEM_RECV) {
3113 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3114 			return 1;
3115 
3116 	} else { /* SK_MEM_SEND */
3117 		int wmem0 = sk_get_wmem0(sk, prot);
3118 
3119 		if (sk->sk_type == SOCK_STREAM) {
3120 			if (sk->sk_wmem_queued < wmem0)
3121 				return 1;
3122 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3123 			return 1;
3124 		}
3125 	}
3126 
3127 	if (sk_has_memory_pressure(sk)) {
3128 		u64 alloc;
3129 
3130 		/* The following 'average' heuristic is within the
3131 		 * scope of global accounting, so it only makes
3132 		 * sense for global memory pressure.
3133 		 */
3134 		if (!sk_under_global_memory_pressure(sk))
3135 			return 1;
3136 
3137 		/* Try to be fair among all the sockets under global
3138 		 * pressure by allowing the ones whose usage is below
3139 		 * average to raise.
3140 		 */
3141 		alloc = sk_sockets_allocated_read_positive(sk);
3142 		if (sk_prot_mem_limits(sk, 2) > alloc *
3143 		    sk_mem_pages(sk->sk_wmem_queued +
3144 				 atomic_read(&sk->sk_rmem_alloc) +
3145 				 sk->sk_forward_alloc))
3146 			return 1;
3147 	}
3148 
3149 suppress_allocation:
3150 
3151 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3152 		sk_stream_moderate_sndbuf(sk);
3153 
3154 		/* Fail only if socket is _under_ its sndbuf.
3155 		 * In this case we cannot block, so we have to fail.
3156 		 */
3157 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3158 			/* Force charge with __GFP_NOFAIL */
3159 			if (memcg && !charged) {
3160 				mem_cgroup_charge_skmem(memcg, amt,
3161 					gfp_memcg_charge() | __GFP_NOFAIL);
3162 			}
3163 			return 1;
3164 		}
3165 	}
3166 
3167 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3168 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3169 
3170 	sk_memory_allocated_sub(sk, amt);
3171 
3172 	if (charged)
3173 		mem_cgroup_uncharge_skmem(memcg, amt);
3174 
3175 	return 0;
3176 }
3177 
3178 /**
3179  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3180  *	@sk: socket
3181  *	@size: memory size to allocate
3182  *	@kind: allocation type
3183  *
3184  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3185  *	rmem allocation. This function assumes that protocols which have
3186  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3187  */
3188 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3189 {
3190 	int ret, amt = sk_mem_pages(size);
3191 
3192 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3193 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3194 	if (!ret)
3195 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3196 	return ret;
3197 }
3198 EXPORT_SYMBOL(__sk_mem_schedule);
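
/* Illustrative sketch, hypothetical receive path: charge memory before
 * queueing an skb, using the sk_rmem_schedule()/skb_set_owner_r() helpers
 * built on top of __sk_mem_schedule().
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		return -ENOBUFS;
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 */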
3199 
3200 /**
3201  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3202  *	@sk: socket
3203  *	@amount: number of quanta
3204  *
3205  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3206  */
3207 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3208 {
3209 	sk_memory_allocated_sub(sk, amount);
3210 
3211 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3212 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3213 
3214 	if (sk_under_global_memory_pressure(sk) &&
3215 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3216 		sk_leave_memory_pressure(sk);
3217 }
3218 
3219 /**
3220  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3221  *	@sk: socket
3222  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3223  */
3224 void __sk_mem_reclaim(struct sock *sk, int amount)
3225 {
3226 	amount >>= PAGE_SHIFT;
3227 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3228 	__sk_mem_reduce_allocated(sk, amount);
3229 }
3230 EXPORT_SYMBOL(__sk_mem_reclaim);
3231 
3232 int sk_set_peek_off(struct sock *sk, int val)
3233 {
3234 	WRITE_ONCE(sk->sk_peek_off, val);
3235 	return 0;
3236 }
3237 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3238 
3239 /*
3240  * Set of default routines for initialising struct proto_ops when
3241  * the protocol does not support a particular function. In certain
3242  * cases where it makes no sense for a protocol to have a "do nothing"
3243  * function, some default processing is provided.
3244  */
3245 
3246 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3247 {
3248 	return -EOPNOTSUPP;
3249 }
3250 EXPORT_SYMBOL(sock_no_bind);
3251 
3252 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3253 		    int len, int flags)
3254 {
3255 	return -EOPNOTSUPP;
3256 }
3257 EXPORT_SYMBOL(sock_no_connect);
3258 
3259 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3260 {
3261 	return -EOPNOTSUPP;
3262 }
3263 EXPORT_SYMBOL(sock_no_socketpair);
3264 
3265 int sock_no_accept(struct socket *sock, struct socket *newsock,
3266 		   struct proto_accept_arg *arg)
3267 {
3268 	return -EOPNOTSUPP;
3269 }
3270 EXPORT_SYMBOL(sock_no_accept);
3271 
3272 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3273 		    int peer)
3274 {
3275 	return -EOPNOTSUPP;
3276 }
3277 EXPORT_SYMBOL(sock_no_getname);
3278 
3279 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3280 {
3281 	return -EOPNOTSUPP;
3282 }
3283 EXPORT_SYMBOL(sock_no_ioctl);
3284 
3285 int sock_no_listen(struct socket *sock, int backlog)
3286 {
3287 	return -EOPNOTSUPP;
3288 }
3289 EXPORT_SYMBOL(sock_no_listen);
3290 
3291 int sock_no_shutdown(struct socket *sock, int how)
3292 {
3293 	return -EOPNOTSUPP;
3294 }
3295 EXPORT_SYMBOL(sock_no_shutdown);
3296 
3297 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3298 {
3299 	return -EOPNOTSUPP;
3300 }
3301 EXPORT_SYMBOL(sock_no_sendmsg);
3302 
3303 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3304 {
3305 	return -EOPNOTSUPP;
3306 }
3307 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3308 
3309 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3310 		    int flags)
3311 {
3312 	return -EOPNOTSUPP;
3313 }
3314 EXPORT_SYMBOL(sock_no_recvmsg);
3315 
3316 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3317 {
3318 	/* Mirror missing mmap method error code */
3319 	return -ENODEV;
3320 }
3321 EXPORT_SYMBOL(sock_no_mmap);
3322 
3323 /*
3324  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3325  * various sock-based usage counts.
3326  */
3327 void __receive_sock(struct file *file)
3328 {
3329 	struct socket *sock;
3330 
3331 	sock = sock_from_file(file);
3332 	if (sock) {
3333 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3334 		sock_update_classid(&sock->sk->sk_cgrp_data);
3335 	}
3336 }
3337 
3338 /*
3339  *	Default Socket Callbacks
3340  */
3341 
3342 static void sock_def_wakeup(struct sock *sk)
3343 {
3344 	struct socket_wq *wq;
3345 
3346 	rcu_read_lock();
3347 	wq = rcu_dereference(sk->sk_wq);
3348 	if (skwq_has_sleeper(wq))
3349 		wake_up_interruptible_all(&wq->wait);
3350 	rcu_read_unlock();
3351 }
3352 
3353 static void sock_def_error_report(struct sock *sk)
3354 {
3355 	struct socket_wq *wq;
3356 
3357 	rcu_read_lock();
3358 	wq = rcu_dereference(sk->sk_wq);
3359 	if (skwq_has_sleeper(wq))
3360 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3361 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3362 	rcu_read_unlock();
3363 }
3364 
3365 void sock_def_readable(struct sock *sk)
3366 {
3367 	struct socket_wq *wq;
3368 
3369 	trace_sk_data_ready(sk);
3370 
3371 	rcu_read_lock();
3372 	wq = rcu_dereference(sk->sk_wq);
3373 	if (skwq_has_sleeper(wq))
3374 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3375 						EPOLLRDNORM | EPOLLRDBAND);
3376 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3377 	rcu_read_unlock();
3378 }
3379 
3380 static void sock_def_write_space(struct sock *sk)
3381 {
3382 	struct socket_wq *wq;
3383 
3384 	rcu_read_lock();
3385 
3386 	/* Do not wake up a writer until he can make "significant"
3387 	 * progress.  --DaveM
3388 	 */
3389 	if (sock_writeable(sk)) {
3390 		wq = rcu_dereference(sk->sk_wq);
3391 		if (skwq_has_sleeper(wq))
3392 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3393 						EPOLLWRNORM | EPOLLWRBAND);
3394 
3395 		/* Should agree with poll, otherwise some programs break */
3396 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3397 	}
3398 
3399 	rcu_read_unlock();
3400 }
3401 
3402 /* An optimised version of sock_def_write_space(); it should only be called
3403  * for SOCK_RCU_FREE sockets under an RCU read section and after putting
3404  * ->sk_wmem_alloc.
3405  */
3406 static void sock_def_write_space_wfree(struct sock *sk)
3407 {
3408 	/* Do not wake up a writer until he can make "significant"
3409 	 * progress.  --DaveM
3410 	 */
3411 	if (sock_writeable(sk)) {
3412 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3413 
3414 		/* rely on refcount_sub from sock_wfree() */
3415 		smp_mb__after_atomic();
3416 		if (wq && waitqueue_active(&wq->wait))
3417 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3418 						EPOLLWRNORM | EPOLLWRBAND);
3419 
3420 		/* Should agree with poll, otherwise some programs break */
3421 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3422 	}
3423 }
3424 
3425 static void sock_def_destruct(struct sock *sk)
3426 {
3427 }
3428 
3429 void sk_send_sigurg(struct sock *sk)
3430 {
3431 	if (sk->sk_socket && sk->sk_socket->file)
3432 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3433 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3434 }
3435 EXPORT_SYMBOL(sk_send_sigurg);
3436 
3437 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3438 		    unsigned long expires)
3439 {
3440 	if (!mod_timer(timer, expires))
3441 		sock_hold(sk);
3442 }
3443 EXPORT_SYMBOL(sk_reset_timer);
3444 
3445 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3446 {
3447 	if (del_timer(timer))
3448 		__sock_put(sk);
3449 }
3450 EXPORT_SYMBOL(sk_stop_timer);
3451 
3452 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3453 {
3454 	if (del_timer_sync(timer))
3455 		__sock_put(sk);
3456 }
3457 EXPORT_SYMBOL(sk_stop_timer_sync);
3458 
3459 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3460 {
3461 	sk_init_common(sk);
3462 	sk->sk_send_head	=	NULL;
3463 
3464 	timer_setup(&sk->sk_timer, NULL, 0);
3465 
3466 	sk->sk_allocation	=	GFP_KERNEL;
3467 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3468 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3469 	sk->sk_state		=	TCP_CLOSE;
3470 	sk->sk_use_task_frag	=	true;
3471 	sk_set_socket(sk, sock);
3472 
3473 	sock_set_flag(sk, SOCK_ZAPPED);
3474 
3475 	if (sock) {
3476 		sk->sk_type	=	sock->type;
3477 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3478 		sock->sk	=	sk;
3479 	} else {
3480 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3481 	}
3482 	sk->sk_uid	=	uid;
3483 
3484 	sk->sk_state_change	=	sock_def_wakeup;
3485 	sk->sk_data_ready	=	sock_def_readable;
3486 	sk->sk_write_space	=	sock_def_write_space;
3487 	sk->sk_error_report	=	sock_def_error_report;
3488 	sk->sk_destruct		=	sock_def_destruct;
3489 
3490 	sk->sk_frag.page	=	NULL;
3491 	sk->sk_frag.offset	=	0;
3492 	sk->sk_peek_off		=	-1;
3493 
3494 	sk->sk_peer_pid 	=	NULL;
3495 	sk->sk_peer_cred	=	NULL;
3496 	spin_lock_init(&sk->sk_peer_lock);
3497 
3498 	sk->sk_write_pending	=	0;
3499 	sk->sk_rcvlowat		=	1;
3500 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3501 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3502 
3503 	sk->sk_stamp = SK_DEFAULT_STAMP;
3504 #if BITS_PER_LONG==32
3505 	seqlock_init(&sk->sk_stamp_seq);
3506 #endif
3507 	atomic_set(&sk->sk_zckey, 0);
3508 
3509 #ifdef CONFIG_NET_RX_BUSY_POLL
3510 	sk->sk_napi_id		=	0;
3511 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3512 #endif
3513 
3514 	sk->sk_max_pacing_rate = ~0UL;
3515 	sk->sk_pacing_rate = ~0UL;
3516 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3517 	sk->sk_incoming_cpu = -1;
3518 
3519 	sk_rx_queue_clear(sk);
3520 	/*
3521 	 * Before updating sk_refcnt, we must commit prior changes to memory
3522 	 * (Documentation/RCU/rculist_nulls.rst for details)
3523 	 */
3524 	smp_wmb();
3525 	refcount_set(&sk->sk_refcnt, 1);
3526 	atomic_set(&sk->sk_drops, 0);
3527 }
3528 EXPORT_SYMBOL(sock_init_data_uid);
3529 
3530 void sock_init_data(struct socket *sock, struct sock *sk)
3531 {
3532 	kuid_t uid = sock ?
3533 		SOCK_INODE(sock)->i_uid :
3534 		make_kuid(sock_net(sk)->user_ns, 0);
3535 
3536 	sock_init_data_uid(sock, sk, uid);
3537 }
3538 EXPORT_SYMBOL(sock_init_data);
3539 
3540 void lock_sock_nested(struct sock *sk, int subclass)
3541 {
3542 	/* The sk_lock has mutex_lock() semantics here. */
3543 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3544 
3545 	might_sleep();
3546 	spin_lock_bh(&sk->sk_lock.slock);
3547 	if (sock_owned_by_user_nocheck(sk))
3548 		__lock_sock(sk);
3549 	sk->sk_lock.owned = 1;
3550 	spin_unlock_bh(&sk->sk_lock.slock);
3551 }
3552 EXPORT_SYMBOL(lock_sock_nested);
3553 
3554 void release_sock(struct sock *sk)
3555 {
3556 	spin_lock_bh(&sk->sk_lock.slock);
3557 	if (sk->sk_backlog.tail)
3558 		__release_sock(sk);
3559 
3560 	if (sk->sk_prot->release_cb)
3561 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3562 				     tcp_release_cb, sk);
3563 
3564 	sock_release_ownership(sk);
3565 	if (waitqueue_active(&sk->sk_lock.wq))
3566 		wake_up(&sk->sk_lock.wq);
3567 	spin_unlock_bh(&sk->sk_lock.slock);
3568 }
3569 EXPORT_SYMBOL(release_sock);
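
/* Illustrative sketch: the usual process context pattern pairs lock_sock()
 * with release_sock(), which also processes the backlog that accumulated
 * while the lock was owned.
 *
 *	lock_sock(sk);
 *	... modify socket state, walk queues, call sk->sk_prot methods ...
 *	release_sock(sk);
 */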
3570 
3571 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3572 {
3573 	might_sleep();
3574 	spin_lock_bh(&sk->sk_lock.slock);
3575 
3576 	if (!sock_owned_by_user_nocheck(sk)) {
3577 		/*
3578 		 * Fast path return with bottom halves disabled and
3579 		 * sock::sk_lock.slock held.
3580 		 *
3581 		 * The 'mutex' is not contended and holding
3582 		 * sock::sk_lock.slock prevents all other lockers from
3583 		 * proceeding, so the corresponding unlock_sock_fast() can
3584 		 * avoid the slow path of release_sock() completely and
3585 		 * just release slock.
3586 		 *
3587 		 * From a semantic POV this is equivalent to 'acquiring'
3588 		 * the 'mutex', hence the corresponding lockdep
3589 		 * mutex_release() has to happen in the fast path of
3590 		 * unlock_sock_fast().
3591 		 */
3592 		return false;
3593 	}
3594 
3595 	__lock_sock(sk);
3596 	sk->sk_lock.owned = 1;
3597 	__acquire(&sk->sk_lock.slock);
3598 	spin_unlock_bh(&sk->sk_lock.slock);
3599 	return true;
3600 }
3601 EXPORT_SYMBOL(__lock_sock_fast);
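
/* Illustrative sketch: the fast-lock variant reports whether the slow path
 * was taken, and that value must be handed back to unlock_sock_fast().
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... short critical section ...
 *	unlock_sock_fast(sk, slow);
 */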
3602 
3603 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3604 		   bool timeval, bool time32)
3605 {
3606 	struct sock *sk = sock->sk;
3607 	struct timespec64 ts;
3608 
3609 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3610 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3611 	if (ts.tv_sec == -1)
3612 		return -ENOENT;
3613 	if (ts.tv_sec == 0) {
3614 		ktime_t kt = ktime_get_real();
3615 		sock_write_timestamp(sk, kt);
3616 		ts = ktime_to_timespec64(kt);
3617 	}
3618 
3619 	if (timeval)
3620 		ts.tv_nsec /= 1000;
3621 
3622 #ifdef CONFIG_COMPAT_32BIT_TIME
3623 	if (time32)
3624 		return put_old_timespec32(&ts, userstamp);
3625 #endif
3626 #ifdef CONFIG_SPARC64
3627 	/* beware of padding in sparc64 timeval */
3628 	if (timeval && !in_compat_syscall()) {
3629 		struct __kernel_old_timeval __user tv = {
3630 			.tv_sec = ts.tv_sec,
3631 			.tv_usec = ts.tv_nsec,
3632 		};
3633 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3634 			return -EFAULT;
3635 		return 0;
3636 	}
3637 #endif
3638 	return put_timespec64(&ts, userstamp);
3639 }
3640 EXPORT_SYMBOL(sock_gettstamp);
3641 
3642 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3643 {
3644 	if (!sock_flag(sk, flag)) {
3645 		unsigned long previous_flags = sk->sk_flags;
3646 
3647 		sock_set_flag(sk, flag);
3648 		/*
3649 		 * We just set one of the two flags which require net
3650 		 * time stamping, but time stamping might have been on
3651 		 * already because of the other one.
3652 		 */
3653 		if (sock_needs_netstamp(sk) &&
3654 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3655 			net_enable_timestamp();
3656 	}
3657 }
3658 
3659 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3660 		       int level, int type)
3661 {
3662 	struct sock_exterr_skb *serr;
3663 	struct sk_buff *skb;
3664 	int copied, err;
3665 
3666 	err = -EAGAIN;
3667 	skb = sock_dequeue_err_skb(sk);
3668 	if (skb == NULL)
3669 		goto out;
3670 
3671 	copied = skb->len;
3672 	if (copied > len) {
3673 		msg->msg_flags |= MSG_TRUNC;
3674 		copied = len;
3675 	}
3676 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3677 	if (err)
3678 		goto out_free_skb;
3679 
3680 	sock_recv_timestamp(msg, sk, skb);
3681 
3682 	serr = SKB_EXT_ERR(skb);
3683 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3684 
3685 	msg->msg_flags |= MSG_ERRQUEUE;
3686 	err = copied;
3687 
3688 out_free_skb:
3689 	kfree_skb(skb);
3690 out:
3691 	return err;
3692 }
3693 EXPORT_SYMBOL(sock_recv_errqueue);
3694 
3695 /*
3696  *	Get a socket option on a socket.
3697  *
3698  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3699  *	asynchronous errors should be reported by getsockopt. We assume
3700  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3701  */
3702 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3703 			   char __user *optval, int __user *optlen)
3704 {
3705 	struct sock *sk = sock->sk;
3706 
3707 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3708 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3709 }
3710 EXPORT_SYMBOL(sock_common_getsockopt);
3711 
3712 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3713 			int flags)
3714 {
3715 	struct sock *sk = sock->sk;
3716 	int addr_len = 0;
3717 	int err;
3718 
3719 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3720 	if (err >= 0)
3721 		msg->msg_namelen = addr_len;
3722 	return err;
3723 }
3724 EXPORT_SYMBOL(sock_common_recvmsg);
3725 
3726 /*
3727  *	Set socket options on an inet socket.
3728  */
3729 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3730 			   sockptr_t optval, unsigned int optlen)
3731 {
3732 	struct sock *sk = sock->sk;
3733 
3734 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3735 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3736 }
3737 EXPORT_SYMBOL(sock_common_setsockopt);
3738 
3739 void sk_common_release(struct sock *sk)
3740 {
3741 	if (sk->sk_prot->destroy)
3742 		sk->sk_prot->destroy(sk);
3743 
3744 	/*
3745 	 * Observation: when sk_common_release is called, processes have
3746 	 * no access to the socket, but the network stack still does.
3747 	 * Step one, detach it from networking:
3748 	 *
3749 	 * A. Remove from hash tables.
3750 	 */
3751 
3752 	sk->sk_prot->unhash(sk);
3753 
3754 	if (sk->sk_socket)
3755 		sk->sk_socket->sk = NULL;
3756 
3757 	/*
3758 	 * At this point the socket cannot receive new packets, but it is possible
3759 	 * that some packets are in flight, because some CPU is running the receiver
3760 	 * and did the hash table lookup before we unhashed the socket. They will
3761 	 * reach the receive queue and will be purged by the socket destructor.
3762 	 *
3763 	 * Also, we still have packets pending on the receive queue and, probably,
3764 	 * our own packets waiting in device queues. sock_destroy will drain the
3765 	 * receive queue, but transmitted packets will delay socket destruction
3766 	 * until the last reference is released.
3767 	 */
3768 
3769 	sock_orphan(sk);
3770 
3771 	xfrm_sk_free_policy(sk);
3772 
3773 	sock_put(sk);
3774 }
3775 EXPORT_SYMBOL(sk_common_release);
3776 
3777 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3778 {
3779 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3780 
3781 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3782 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3783 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3784 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3785 	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3786 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3787 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3788 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3789 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3790 }
3791 
3792 #ifdef CONFIG_PROC_FS
3793 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3794 
3795 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3796 {
3797 	int cpu, idx = prot->inuse_idx;
3798 	int res = 0;
3799 
3800 	for_each_possible_cpu(cpu)
3801 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3802 
3803 	return res >= 0 ? res : 0;
3804 }
3805 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3806 
3807 int sock_inuse_get(struct net *net)
3808 {
3809 	int cpu, res = 0;
3810 
3811 	for_each_possible_cpu(cpu)
3812 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3813 
3814 	return res;
3815 }
3817 EXPORT_SYMBOL_GPL(sock_inuse_get);
3818 
3819 static int __net_init sock_inuse_init_net(struct net *net)
3820 {
3821 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3822 	if (net->core.prot_inuse == NULL)
3823 		return -ENOMEM;
3824 	return 0;
3825 }
3826 
3827 static void __net_exit sock_inuse_exit_net(struct net *net)
3828 {
3829 	free_percpu(net->core.prot_inuse);
3830 }
3831 
3832 static struct pernet_operations net_inuse_ops = {
3833 	.init = sock_inuse_init_net,
3834 	.exit = sock_inuse_exit_net,
3835 };
3836 
3837 static __init int net_inuse_init(void)
3838 {
3839 	if (register_pernet_subsys(&net_inuse_ops))
3840 		panic("Cannot initialize net inuse counters");
3841 
3842 	return 0;
3843 }
3844 
3845 core_initcall(net_inuse_init);
3846 
3847 static int assign_proto_idx(struct proto *prot)
3848 {
3849 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3850 
3851 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3852 		pr_err("PROTO_INUSE_NR exhausted\n");
3853 		return -ENOSPC;
3854 	}
3855 
3856 	set_bit(prot->inuse_idx, proto_inuse_idx);
3857 	return 0;
3858 }
3859 
3860 static void release_proto_idx(struct proto *prot)
3861 {
3862 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3863 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3864 }
3865 #else
3866 static inline int assign_proto_idx(struct proto *prot)
3867 {
3868 	return 0;
3869 }
3870 
3871 static inline void release_proto_idx(struct proto *prot)
3872 {
3873 }
3874 
3875 #endif
3876 
3877 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3878 {
3879 	if (!twsk_prot)
3880 		return;
3881 	kfree(twsk_prot->twsk_slab_name);
3882 	twsk_prot->twsk_slab_name = NULL;
3883 	kmem_cache_destroy(twsk_prot->twsk_slab);
3884 	twsk_prot->twsk_slab = NULL;
3885 }
3886 
3887 static int tw_prot_init(const struct proto *prot)
3888 {
3889 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3890 
3891 	if (!twsk_prot)
3892 		return 0;
3893 
3894 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3895 					      prot->name);
3896 	if (!twsk_prot->twsk_slab_name)
3897 		return -ENOMEM;
3898 
3899 	twsk_prot->twsk_slab =
3900 		kmem_cache_create(twsk_prot->twsk_slab_name,
3901 				  twsk_prot->twsk_obj_size, 0,
3902 				  SLAB_ACCOUNT | prot->slab_flags,
3903 				  NULL);
3904 	if (!twsk_prot->twsk_slab) {
3905 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3906 			prot->name);
3907 		return -ENOMEM;
3908 	}
3909 
3910 	return 0;
3911 }
3912 
3913 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3914 {
3915 	if (!rsk_prot)
3916 		return;
3917 	kfree(rsk_prot->slab_name);
3918 	rsk_prot->slab_name = NULL;
3919 	kmem_cache_destroy(rsk_prot->slab);
3920 	rsk_prot->slab = NULL;
3921 }
3922 
3923 static int req_prot_init(const struct proto *prot)
3924 {
3925 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3926 
3927 	if (!rsk_prot)
3928 		return 0;
3929 
3930 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3931 					prot->name);
3932 	if (!rsk_prot->slab_name)
3933 		return -ENOMEM;
3934 
3935 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3936 					   rsk_prot->obj_size, 0,
3937 					   SLAB_ACCOUNT | prot->slab_flags,
3938 					   NULL);
3939 
3940 	if (!rsk_prot->slab) {
3941 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3942 			prot->name);
3943 		return -ENOMEM;
3944 	}
3945 	return 0;
3946 }
3947 
3948 int proto_register(struct proto *prot, int alloc_slab)
3949 {
3950 	int ret = -ENOBUFS;
3951 
3952 	if (prot->memory_allocated && !prot->sysctl_mem) {
3953 		pr_err("%s: missing sysctl_mem\n", prot->name);
3954 		return -EINVAL;
3955 	}
3956 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3957 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3958 		return -EINVAL;
3959 	}
3960 	if (alloc_slab) {
3961 		prot->slab = kmem_cache_create_usercopy(prot->name,
3962 					prot->obj_size, 0,
3963 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3964 					prot->slab_flags,
3965 					prot->useroffset, prot->usersize,
3966 					NULL);
3967 
3968 		if (prot->slab == NULL) {
3969 			pr_crit("%s: Can't create sock SLAB cache!\n",
3970 				prot->name);
3971 			goto out;
3972 		}
3973 
3974 		if (req_prot_init(prot))
3975 			goto out_free_request_sock_slab;
3976 
3977 		if (tw_prot_init(prot))
3978 			goto out_free_timewait_sock_slab;
3979 	}
3980 
3981 	mutex_lock(&proto_list_mutex);
3982 	ret = assign_proto_idx(prot);
3983 	if (ret) {
3984 		mutex_unlock(&proto_list_mutex);
3985 		goto out_free_timewait_sock_slab;
3986 	}
3987 	list_add(&prot->node, &proto_list);
3988 	mutex_unlock(&proto_list_mutex);
3989 	return ret;
3990 
3991 out_free_timewait_sock_slab:
3992 	if (alloc_slab)
3993 		tw_prot_cleanup(prot->twsk_prot);
3994 out_free_request_sock_slab:
3995 	if (alloc_slab) {
3996 		req_prot_cleanup(prot->rsk_prot);
3997 
3998 		kmem_cache_destroy(prot->slab);
3999 		prot->slab = NULL;
4000 	}
4001 out:
4002 	return ret;
4003 }
4004 EXPORT_SYMBOL(proto_register);
4005 
4006 void proto_unregister(struct proto *prot)
4007 {
4008 	mutex_lock(&proto_list_mutex);
4009 	release_proto_idx(prot);
4010 	list_del(&prot->node);
4011 	mutex_unlock(&proto_list_mutex);
4012 
4013 	kmem_cache_destroy(prot->slab);
4014 	prot->slab = NULL;
4015 
4016 	req_prot_cleanup(prot->rsk_prot);
4017 	tw_prot_cleanup(prot->twsk_prot);
4018 }
4019 EXPORT_SYMBOL(proto_unregister);
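
/*
 * A minimal registration sketch, assuming a hypothetical "EXAMPLE" protocol
 * module (and the usual <linux/module.h> / <net/sock.h> includes):
 * proto_register() with alloc_slab == 1 creates a dedicated kmem_cache sized
 * by ->obj_size, and proto_unregister() tears everything down again.
 */
struct example_sock {
	struct sock	sk;
	/* protocol-private fields would follow */
};

static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct example_sock),
};

static int __init example_proto_init(void)
{
	return proto_register(&example_proto, 1);
}
module_init(example_proto_init);

static void __exit example_proto_exit(void)
{
	proto_unregister(&example_proto);
}
module_exit(example_proto_exit);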
4020 
4021 int sock_load_diag_module(int family, int protocol)
4022 {
4023 	if (!protocol) {
4024 		if (!sock_is_registered(family))
4025 			return -ENOENT;
4026 
4027 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4028 				      NETLINK_SOCK_DIAG, family);
4029 	}
4030 
4031 #ifdef CONFIG_INET
4032 	if (family == AF_INET &&
4033 	    protocol != IPPROTO_RAW &&
4034 	    protocol < MAX_INET_PROTOS &&
4035 	    !rcu_access_pointer(inet_protos[protocol]))
4036 		return -ENOENT;
4037 #endif
4038 
4039 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4040 			      NETLINK_SOCK_DIAG, family, protocol);
4041 }
4042 EXPORT_SYMBOL(sock_load_diag_module);
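
/*
 * The request_module() strings above are resolved through module aliases. A
 * diag module that wants to be auto-loaded for, say, (AF_INET, IPPROTO_TCP)
 * requests would carry an alias matching "net-pf-%d-proto-%d-type-%d-%d".
 * Illustrative only, with the constants spelled out (16 = PF_NETLINK,
 * 4 = NETLINK_SOCK_DIAG, 2 = AF_INET, 6 = IPPROTO_TCP):
 */
MODULE_ALIAS("net-pf-16-proto-4-type-2-6");	/* illustrative alias */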
4043 
4044 #ifdef CONFIG_PROC_FS
4045 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4046 	__acquires(proto_list_mutex)
4047 {
4048 	mutex_lock(&proto_list_mutex);
4049 	return seq_list_start_head(&proto_list, *pos);
4050 }
4051 
4052 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4053 {
4054 	return seq_list_next(v, &proto_list, pos);
4055 }
4056 
4057 static void proto_seq_stop(struct seq_file *seq, void *v)
4058 	__releases(proto_list_mutex)
4059 {
4060 	mutex_unlock(&proto_list_mutex);
4061 }
4062 
4063 static char proto_method_implemented(const void *method)
4064 {
4065 	return method == NULL ? 'n' : 'y';
4066 }
4067 static long sock_prot_memory_allocated(struct proto *proto)
4068 {
4069 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4070 }
4071 
4072 static const char *sock_prot_memory_pressure(struct proto *proto)
4073 {
4074 	return proto->memory_pressure != NULL ?
4075 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4076 }
4077 
4078 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4079 {
4080 
4081 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4082 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4083 		   proto->name,
4084 		   proto->obj_size,
4085 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4086 		   sock_prot_memory_allocated(proto),
4087 		   sock_prot_memory_pressure(proto),
4088 		   proto->max_header,
4089 		   proto->slab == NULL ? "no" : "yes",
4090 		   module_name(proto->owner),
4091 		   proto_method_implemented(proto->close),
4092 		   proto_method_implemented(proto->connect),
4093 		   proto_method_implemented(proto->disconnect),
4094 		   proto_method_implemented(proto->accept),
4095 		   proto_method_implemented(proto->ioctl),
4096 		   proto_method_implemented(proto->init),
4097 		   proto_method_implemented(proto->destroy),
4098 		   proto_method_implemented(proto->shutdown),
4099 		   proto_method_implemented(proto->setsockopt),
4100 		   proto_method_implemented(proto->getsockopt),
4101 		   proto_method_implemented(proto->sendmsg),
4102 		   proto_method_implemented(proto->recvmsg),
4103 		   proto_method_implemented(proto->bind),
4104 		   proto_method_implemented(proto->backlog_rcv),
4105 		   proto_method_implemented(proto->hash),
4106 		   proto_method_implemented(proto->unhash),
4107 		   proto_method_implemented(proto->get_port),
4108 		   proto_method_implemented(proto->enter_memory_pressure));
4109 }
4110 
4111 static int proto_seq_show(struct seq_file *seq, void *v)
4112 {
4113 	if (v == &proto_list)
4114 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4115 			   "protocol",
4116 			   "size",
4117 			   "sockets",
4118 			   "memory",
4119 			   "press",
4120 			   "maxhdr",
4121 			   "slab",
4122 			   "module",
4123 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4124 	else
4125 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4126 	return 0;
4127 }
4128 
4129 static const struct seq_operations proto_seq_ops = {
4130 	.start  = proto_seq_start,
4131 	.next   = proto_seq_next,
4132 	.stop   = proto_seq_stop,
4133 	.show   = proto_seq_show,
4134 };
4135 
4136 static __net_init int proto_init_net(struct net *net)
4137 {
4138 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4139 			sizeof(struct seq_net_private)))
4140 		return -ENOMEM;
4141 
4142 	return 0;
4143 }
4144 
4145 static __net_exit void proto_exit_net(struct net *net)
4146 {
4147 	remove_proc_entry("protocols", net->proc_net);
4148 }
4149 
4151 static __net_initdata struct pernet_operations proto_net_ops = {
4152 	.init = proto_init_net,
4153 	.exit = proto_exit_net,
4154 };
4155 
4156 static int __init proto_init(void)
4157 {
4158 	return register_pernet_subsys(&proto_net_ops);
4159 }
4160 
4161 subsys_initcall(proto_init);
4162 
4163 #endif /* PROC_FS */
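
/*
 * Userspace view, for reference: the seq_file above backs the per-netns
 * /proc/net/protocols file, which is plain text. A trivial (hypothetical)
 * userspace dumper would be:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[256];
 *		FILE *f = fopen("/proc/net/protocols", "r");
 *
 *		if (!f)
 *			return 1;
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);
 *		fclose(f);
 *		return 0;
 *	}
 */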
4164 
4165 #ifdef CONFIG_NET_RX_BUSY_POLL
4166 bool sk_busy_loop_end(void *p, unsigned long start_time)
4167 {
4168 	struct sock *sk = p;
4169 
4170 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4171 		return true;
4172 
4173 	if (sk_is_udp(sk) &&
4174 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4175 		return true;
4176 
4177 	return sk_busy_loop_timeout(sk, start_time);
4178 }
4179 EXPORT_SYMBOL(sk_busy_loop_end);
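
/*
 * A sketch of the typical caller, loosely following sk_busy_loop() in
 * <net/busy_poll.h>: sk_busy_loop_end() is handed to napi_busy_loop() as the
 * loop_end callback, with the socket itself as the cookie.
 */
static void example_busy_poll(struct sock *sk)
{
	unsigned int napi_id = READ_ONCE(sk->sk_napi_id);

	if (napi_id >= MIN_NAPI_ID)
		napi_busy_loop(napi_id, sk_busy_loop_end, sk,
			       READ_ONCE(sk->sk_prefer_busy_poll),
			       READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET);
}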
4180 #endif /* CONFIG_NET_RX_BUSY_POLL */
4181 
4182 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4183 {
4184 	if (!sk->sk_prot->bind_add)
4185 		return -EOPNOTSUPP;
4186 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4187 }
4188 EXPORT_SYMBOL(sock_bind_add);
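
/*
 * A minimal sketch of the callback side: a protocol that can add further
 * local addresses to an already-bound socket wires up ->bind_add in its
 * struct proto. example_bind_add() below is hypothetical.
 */
static int example_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
	if (addr_len < (int)sizeof(struct sockaddr))
		return -EINVAL;

	/* ...validate addr and record it as an extra local address... */
	return 0;
}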
4189 
4190 /* Copy 'size' bytes from userspace and copy 'size' result bytes back to userspace */
4191 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4192 		     void __user *arg, void *karg, size_t size)
4193 {
4194 	int ret;
4195 
4196 	if (copy_from_user(karg, arg, size))
4197 		return -EFAULT;
4198 
4199 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4200 	if (ret)
4201 		return ret;
4202 
4203 	if (copy_to_user(arg, karg, size))
4204 		return -EFAULT;
4205 
4206 	return 0;
4207 }
4208 EXPORT_SYMBOL(sock_ioctl_inout);
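
/*
 * A minimal sketch of the protocol side of this wrapper, assuming the
 * kernel-memory ->ioctl() prototype: the handler only ever touches *karg,
 * never a __user pointer. The SIOCINQ/SIOCOUTQ answers below are deliberate
 * simplifications, not how any in-tree protocol computes them.
 */
static int example_sk_ioctl(struct sock *sk, int cmd, int *karg)
{
	switch (cmd) {
	case SIOCINQ:
		*karg = sk_rmem_alloc_get(sk);
		return 0;
	case SIOCOUTQ:
		*karg = sk_wmem_alloc_get(sk);
		return 0;
	default:
		return -ENOIOCTLCMD;
	}
}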
4209 
4210 /* This is the most common ioctl prep function: the result (4 bytes) is
4211  * copied back to userspace if the ioctl() returns successfully. No input
4212  * is copied from userspace.
4213  */
4214 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4215 {
4216 	int ret, karg = 0;
4217 
4218 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4219 	if (ret)
4220 		return ret;
4221 
4222 	return put_user(karg, (int __user *)arg);
4223 }
4224 
4225 /* A wrapper around sock ioctls, which copies the data from userspace
4226  * (depending on the protocol/ioctl), and copies back the result to userspace.
4227  * The main motivation for this function is to pass kernel memory to the
4228  * protocol ioctl callbacks, instead of userspace memory.
4229  */
4230 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4231 {
4232 	int rc = 1;
4233 
4234 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4235 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4236 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4237 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4238 	else if (sk_is_phonet(sk))
4239 		rc = phonet_sk_ioctl(sk, cmd, arg);
4240 
4241 	/* If the ioctl was processed, return its value */
4242 	if (rc <= 0)
4243 		return rc;
4244 
4245 	/* Otherwise call the default handler */
4246 	return sock_ioctl_out(sk, cmd, arg);
4247 }
4248 EXPORT_SYMBOL(sk_ioctl);
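
/*
 * Userspace view of the sock_ioctl_out() path above, for reference: a plain
 * ioctl(2) returning a 4-byte result, e.g.
 *
 *	int pending;
 *
 *	if (ioctl(fd, SIOCINQ, &pending) == 0)
 *		printf("%d bytes readable\n", pending);
 *
 * where the value in 'pending' is written by the put_user() above.
 */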
4249 
4250 static int __init sock_struct_check(void)
4251 {
4252 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4253 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4254 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4255 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4256 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4257 
4258 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4259 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4260 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4261 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4262 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4263 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4264 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4265 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4266 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4267 
4268 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4269 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4270 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4271 
4272 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4273 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4274 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4275 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4276 
4277 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4279 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4280 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4281 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4282 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4283 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4284 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4285 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4286 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4287 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4288 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4289 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4290 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4291 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4292 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4293 
4294 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4295 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4296 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4297 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4298 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4299 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4300 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4301 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4302 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4303 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4304 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4305 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4306 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4307 	return 0;
4308 }
4309 
4310 core_initcall(sock_struct_check);
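
/*
 * A minimal sketch of the pattern asserted above, assuming a hypothetical
 * struct: members are grouped in the definition with
 * __cacheline_group_begin()/__cacheline_group_end() markers, and the
 * build-time checks (wired through an initcall) catch later reordering.
 */
struct example_layout {
	__cacheline_group_begin(example_hot);
	u32	a;
	u64	b;
	__cacheline_group_end(example_hot);
	u32	rarely_touched;
};

static int __init example_layout_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct example_layout, example_hot, a);
	CACHELINE_ASSERT_GROUP_MEMBER(struct example_layout, example_hot, b);
	return 0;
}
core_initcall(example_layout_check);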
4311