xref: /linux/net/core/sock.c (revision 73287fe228721b05690e671adbcccc6cf5435be6)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly,
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <net/proto_memory.h>
131 #include <linux/net_tstamp.h>
132 #include <net/xfrm.h>
133 #include <linux/ipsec.h>
134 #include <net/cls_cgroup.h>
135 #include <net/netprio_cgroup.h>
136 #include <linux/sock_diag.h>
137 
138 #include <linux/filter.h>
139 #include <net/sock_reuseport.h>
140 #include <net/bpf_sk_storage.h>
141 
142 #include <trace/events/sock.h>
143 
144 #include <net/tcp.h>
145 #include <net/busy_poll.h>
146 #include <net/phonet/phonet.h>
147 
148 #include <linux/ethtool.h>
149 
150 #include "dev.h"
151 
152 static DEFINE_MUTEX(proto_list_mutex);
153 static LIST_HEAD(proto_list);
154 
155 static void sock_def_write_space_wfree(struct sock *sk);
156 static void sock_def_write_space(struct sock *sk);
157 
158 /**
159  * sk_ns_capable - General socket capability test
160  * @sk: Socket to use a capability on or through
161  * @user_ns: The user namespace of the capability to use
162  * @cap: The capability to use
163  *
164  * Test to see if the opener of the socket had the capability @cap when
165  * the socket was created and that the current process has the capability
166  * @cap in the user namespace @user_ns.
167  */
168 bool sk_ns_capable(const struct sock *sk,
169 		   struct user_namespace *user_ns, int cap)
170 {
171 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
172 		ns_capable(user_ns, cap);
173 }
174 EXPORT_SYMBOL(sk_ns_capable);
175 
176 /**
177  * sk_capable - Socket global capability test
178  * @sk: Socket to use a capability on or through
179  * @cap: The global capability to use
180  *
181  * Test to see if the opener of the socket had the capability @cap when
182  * the socket was created and that the current process has the capability
183  * @cap in all user namespaces.
184  */
185 bool sk_capable(const struct sock *sk, int cap)
186 {
187 	return sk_ns_capable(sk, &init_user_ns, cap);
188 }
189 EXPORT_SYMBOL(sk_capable);
190 
191 /**
192  * sk_net_capable - Network namespace socket capability test
193  * @sk: Socket to use a capability on or through
194  * @cap: The capability to use
195  *
196  * Test to see if the opener of the socket had the capability @cap when the
197  * socket was created and that the current process has the capability @cap
198  * over the network namespace the socket is a member of.
199  */
200 bool sk_net_capable(const struct sock *sk, int cap)
201 {
202 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
203 }
204 EXPORT_SYMBOL(sk_net_capable);
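
/*
 * Example (illustrative sketch, not part of this file): a protocol's
 * setsockopt handler might gate a privileged option on these helpers,
 * roughly along the lines of:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * This requires both that the socket's opener had CAP_NET_ADMIN when the
 * socket was created and that the current caller has it over the socket's
 * network namespace.
 */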
205 
206 /*
207  * Each address family might have different locking rules, so we have
208  * one slock key per address family and separate keys for internal and
209  * userspace sockets.
210  */
211 static struct lock_class_key af_family_keys[AF_MAX];
212 static struct lock_class_key af_family_kern_keys[AF_MAX];
213 static struct lock_class_key af_family_slock_keys[AF_MAX];
214 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
215 
216 /*
217  * Make lock validator output more readable. (We pre-construct these
218  * strings at build time, so that runtime initialization of socket
219  * locks is fast):
220  */
221 
222 #define _sock_locks(x)						  \
223   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
224   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
225   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
226   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
227   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
228   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
229   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
230   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
231   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
232   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
233   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
234   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
235   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
236   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
237   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
238   x "AF_MCTP"  , \
239   x "AF_MAX"
240 
241 static const char *const af_family_key_strings[AF_MAX+1] = {
242 	_sock_locks("sk_lock-")
243 };
244 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
245 	_sock_locks("slock-")
246 };
247 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
248 	_sock_locks("clock-")
249 };
250 
251 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
252 	_sock_locks("k-sk_lock-")
253 };
254 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
255 	_sock_locks("k-slock-")
256 };
257 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
258 	_sock_locks("k-clock-")
259 };
260 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
261 	_sock_locks("rlock-")
262 };
263 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
264 	_sock_locks("wlock-")
265 };
266 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
267 	_sock_locks("elock-")
268 };
269 
270 /*
271  * sk_callback_lock and sk queues locking rules are per-address-family,
272  * so split the lock classes by using a per-AF key:
273  */
274 static struct lock_class_key af_callback_keys[AF_MAX];
275 static struct lock_class_key af_rlock_keys[AF_MAX];
276 static struct lock_class_key af_wlock_keys[AF_MAX];
277 static struct lock_class_key af_elock_keys[AF_MAX];
278 static struct lock_class_key af_kern_callback_keys[AF_MAX];
279 
280 /* Run time adjustable parameters. */
281 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
282 EXPORT_SYMBOL(sysctl_wmem_max);
283 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
284 EXPORT_SYMBOL(sysctl_rmem_max);
285 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
286 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
287 
288 int sysctl_tstamp_allow_data __read_mostly = 1;
289 
290 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
291 EXPORT_SYMBOL_GPL(memalloc_socks_key);
292 
293 /**
294  * sk_set_memalloc - sets %SOCK_MEMALLOC
295  * @sk: socket to set it on
296  *
297  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
298  * It's the responsibility of the admin to adjust min_free_kbytes
299  * to meet the requirements.
300  */
301 void sk_set_memalloc(struct sock *sk)
302 {
303 	sock_set_flag(sk, SOCK_MEMALLOC);
304 	sk->sk_allocation |= __GFP_MEMALLOC;
305 	static_branch_inc(&memalloc_socks_key);
306 }
307 EXPORT_SYMBOL_GPL(sk_set_memalloc);
308 
309 void sk_clear_memalloc(struct sock *sk)
310 {
311 	sock_reset_flag(sk, SOCK_MEMALLOC);
312 	sk->sk_allocation &= ~__GFP_MEMALLOC;
313 	static_branch_dec(&memalloc_socks_key);
314 
315 	/*
316 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
317 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
318 	 * it has rmem allocations due to the last swapfile being deactivated
319 	 * but there is a risk that the socket is unusable due to exceeding
320 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
321 	 */
322 	sk_mem_reclaim(sk);
323 }
324 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
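
/*
 * Example (hedged sketch): a swap-over-network backend would typically
 * bracket the lifetime of an active swapfile with these helpers on its
 * transport socket, e.g.:
 *
 *	sk_set_memalloc(transport_sk);
 *	... swapfile active, socket may dip into emergency reserves ...
 *	sk_clear_memalloc(transport_sk);
 *
 * "transport_sk" is a hypothetical struct sock owned by that backend.
 */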
325 
326 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
327 {
328 	int ret;
329 	unsigned int noreclaim_flag;
330 
331 	/* these should have been dropped before queueing */
332 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
333 
334 	noreclaim_flag = memalloc_noreclaim_save();
335 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
336 				 tcp_v6_do_rcv,
337 				 tcp_v4_do_rcv,
338 				 sk, skb);
339 	memalloc_noreclaim_restore(noreclaim_flag);
340 
341 	return ret;
342 }
343 EXPORT_SYMBOL(__sk_backlog_rcv);
344 
345 void sk_error_report(struct sock *sk)
346 {
347 	sk->sk_error_report(sk);
348 
349 	switch (sk->sk_family) {
350 	case AF_INET:
351 		fallthrough;
352 	case AF_INET6:
353 		trace_inet_sk_error_report(sk);
354 		break;
355 	default:
356 		break;
357 	}
358 }
359 EXPORT_SYMBOL(sk_error_report);
360 
361 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
362 {
363 	struct __kernel_sock_timeval tv;
364 
365 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
366 		tv.tv_sec = 0;
367 		tv.tv_usec = 0;
368 	} else {
369 		tv.tv_sec = timeo / HZ;
370 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
371 	}
372 
373 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
374 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
375 		*(struct old_timeval32 *)optval = tv32;
376 		return sizeof(tv32);
377 	}
378 
379 	if (old_timeval) {
380 		struct __kernel_old_timeval old_tv;
381 		old_tv.tv_sec = tv.tv_sec;
382 		old_tv.tv_usec = tv.tv_usec;
383 		*(struct __kernel_old_timeval *)optval = old_tv;
384 		return sizeof(old_tv);
385 	}
386 
387 	*(struct __kernel_sock_timeval *)optval = tv;
388 	return sizeof(tv);
389 }
390 EXPORT_SYMBOL(sock_get_timeout);
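
/*
 * Worked example (assuming HZ == 1000): a timeout of 2500 jiffies is
 * reported as tv_sec = 2500 / 1000 = 2 and
 * tv_usec = (2500 % 1000) * USEC_PER_SEC / 1000 = 500000, i.e. 2.5 s;
 * MAX_SCHEDULE_TIMEOUT ("never time out") is reported as { 0, 0 }.
 */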
391 
392 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
393 			   sockptr_t optval, int optlen, bool old_timeval)
394 {
395 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
396 		struct old_timeval32 tv32;
397 
398 		if (optlen < sizeof(tv32))
399 			return -EINVAL;
400 
401 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
402 			return -EFAULT;
403 		tv->tv_sec = tv32.tv_sec;
404 		tv->tv_usec = tv32.tv_usec;
405 	} else if (old_timeval) {
406 		struct __kernel_old_timeval old_tv;
407 
408 		if (optlen < sizeof(old_tv))
409 			return -EINVAL;
410 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
411 			return -EFAULT;
412 		tv->tv_sec = old_tv.tv_sec;
413 		tv->tv_usec = old_tv.tv_usec;
414 	} else {
415 		if (optlen < sizeof(*tv))
416 			return -EINVAL;
417 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
418 			return -EFAULT;
419 	}
420 
421 	return 0;
422 }
423 EXPORT_SYMBOL(sock_copy_user_timeval);
424 
425 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
426 			    bool old_timeval)
427 {
428 	struct __kernel_sock_timeval tv;
429 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
430 	long val;
431 
432 	if (err)
433 		return err;
434 
435 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
436 		return -EDOM;
437 
438 	if (tv.tv_sec < 0) {
439 		static int warned __read_mostly;
440 
441 		WRITE_ONCE(*timeo_p, 0);
442 		if (warned < 10 && net_ratelimit()) {
443 			warned++;
444 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
445 				__func__, current->comm, task_pid_nr(current));
446 		}
447 		return 0;
448 	}
449 	val = MAX_SCHEDULE_TIMEOUT;
450 	if ((tv.tv_sec || tv.tv_usec) &&
451 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
452 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
453 						    USEC_PER_SEC / HZ);
454 	WRITE_ONCE(*timeo_p, val);
455 	return 0;
456 }
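
/*
 * Worked example for the reverse direction (assuming HZ == 1000): a user
 * supplied { .tv_sec = 2, .tv_usec = 500000 } becomes
 * 2 * 1000 + DIV_ROUND_UP(500000, 1000) = 2500 jiffies, while { 0, 0 }
 * maps to MAX_SCHEDULE_TIMEOUT (wait forever) and a negative tv_sec is
 * clamped to 0 with a rate-limited warning.
 */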
457 
458 static bool sock_needs_netstamp(const struct sock *sk)
459 {
460 	switch (sk->sk_family) {
461 	case AF_UNSPEC:
462 	case AF_UNIX:
463 		return false;
464 	default:
465 		return true;
466 	}
467 }
468 
469 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
470 {
471 	if (sk->sk_flags & flags) {
472 		sk->sk_flags &= ~flags;
473 		if (sock_needs_netstamp(sk) &&
474 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
475 			net_disable_timestamp();
476 	}
477 }
478 
479 
480 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
481 {
482 	unsigned long flags;
483 	struct sk_buff_head *list = &sk->sk_receive_queue;
484 
485 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
486 		atomic_inc(&sk->sk_drops);
487 		trace_sock_rcvqueue_full(sk, skb);
488 		return -ENOMEM;
489 	}
490 
491 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
492 		atomic_inc(&sk->sk_drops);
493 		return -ENOBUFS;
494 	}
495 
496 	skb->dev = NULL;
497 	skb_set_owner_r(skb, sk);
498 
499 	/* We escape from the RCU-protected region; make sure we don't leak
500 	 * a non-refcounted dst.
501 	 */
502 	skb_dst_force(skb);
503 
504 	spin_lock_irqsave(&list->lock, flags);
505 	sock_skb_set_dropcount(sk, skb);
506 	__skb_queue_tail(list, skb);
507 	spin_unlock_irqrestore(&list->lock, flags);
508 
509 	if (!sock_flag(sk, SOCK_DEAD))
510 		sk->sk_data_ready(sk);
511 	return 0;
512 }
513 EXPORT_SYMBOL(__sock_queue_rcv_skb);
514 
515 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
516 			      enum skb_drop_reason *reason)
517 {
518 	enum skb_drop_reason drop_reason;
519 	int err;
520 
521 	err = sk_filter(sk, skb);
522 	if (err) {
523 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
524 		goto out;
525 	}
526 	err = __sock_queue_rcv_skb(sk, skb);
527 	switch (err) {
528 	case -ENOMEM:
529 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
530 		break;
531 	case -ENOBUFS:
532 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
533 		break;
534 	default:
535 		drop_reason = SKB_NOT_DROPPED_YET;
536 		break;
537 	}
538 out:
539 	if (reason)
540 		*reason = drop_reason;
541 	return err;
542 }
543 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
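
/*
 * Example (hedged sketch): a protocol receive path can use this helper to
 * both queue the skb and learn why queueing failed, e.g.:
 *
 *	enum skb_drop_reason reason;
 *
 *	if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0)
 *		kfree_skb_reason(skb, reason);
 *
 * The snippet is illustrative only; real callers may also bump protocol
 * specific drop counters before freeing the skb.
 */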
544 
545 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
546 		     const int nested, unsigned int trim_cap, bool refcounted)
547 {
548 	int rc = NET_RX_SUCCESS;
549 
550 	if (sk_filter_trim_cap(sk, skb, trim_cap))
551 		goto discard_and_relse;
552 
553 	skb->dev = NULL;
554 
555 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
556 		atomic_inc(&sk->sk_drops);
557 		goto discard_and_relse;
558 	}
559 	if (nested)
560 		bh_lock_sock_nested(sk);
561 	else
562 		bh_lock_sock(sk);
563 	if (!sock_owned_by_user(sk)) {
564 		/*
565 		 * trylock + unlock semantics:
566 		 */
567 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
568 
569 		rc = sk_backlog_rcv(sk, skb);
570 
571 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
572 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
573 		bh_unlock_sock(sk);
574 		atomic_inc(&sk->sk_drops);
575 		goto discard_and_relse;
576 	}
577 
578 	bh_unlock_sock(sk);
579 out:
580 	if (refcounted)
581 		sock_put(sk);
582 	return rc;
583 discard_and_relse:
584 	kfree_skb(skb);
585 	goto out;
586 }
587 EXPORT_SYMBOL(__sk_receive_skb);
588 
589 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
590 							  u32));
591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
592 							   u32));
593 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
594 {
595 	struct dst_entry *dst = __sk_dst_get(sk);
596 
597 	if (dst && dst->obsolete &&
598 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
599 			       dst, cookie) == NULL) {
600 		sk_tx_queue_clear(sk);
601 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
602 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
603 		dst_release(dst);
604 		return NULL;
605 	}
606 
607 	return dst;
608 }
609 EXPORT_SYMBOL(__sk_dst_check);
610 
611 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
612 {
613 	struct dst_entry *dst = sk_dst_get(sk);
614 
615 	if (dst && dst->obsolete &&
616 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
617 			       dst, cookie) == NULL) {
618 		sk_dst_reset(sk);
619 		dst_release(dst);
620 		return NULL;
621 	}
622 
623 	return dst;
624 }
625 EXPORT_SYMBOL(sk_dst_check);
626 
627 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
628 {
629 	int ret = -ENOPROTOOPT;
630 #ifdef CONFIG_NETDEVICES
631 	struct net *net = sock_net(sk);
632 
633 	/* Sorry... */
634 	ret = -EPERM;
635 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
636 		goto out;
637 
638 	ret = -EINVAL;
639 	if (ifindex < 0)
640 		goto out;
641 
642 	/* Paired with all READ_ONCE() done locklessly. */
643 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
644 
645 	if (sk->sk_prot->rehash)
646 		sk->sk_prot->rehash(sk);
647 	sk_dst_reset(sk);
648 
649 	ret = 0;
650 
651 out:
652 #endif
653 
654 	return ret;
655 }
656 
657 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
658 {
659 	int ret;
660 
661 	if (lock_sk)
662 		lock_sock(sk);
663 	ret = sock_bindtoindex_locked(sk, ifindex);
664 	if (lock_sk)
665 		release_sock(sk);
666 
667 	return ret;
668 }
669 EXPORT_SYMBOL(sock_bindtoindex);
670 
671 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
672 {
673 	int ret = -ENOPROTOOPT;
674 #ifdef CONFIG_NETDEVICES
675 	struct net *net = sock_net(sk);
676 	char devname[IFNAMSIZ];
677 	int index;
678 
679 	ret = -EINVAL;
680 	if (optlen < 0)
681 		goto out;
682 
683 	/* Bind this socket to a particular device like "eth0",
684 	 * as specified in the passed interface name. If the
685 	 * name is "" or the option length is zero, the socket
686 	 * is unbound (any existing device binding is removed).
687 	 */
688 	if (optlen > IFNAMSIZ - 1)
689 		optlen = IFNAMSIZ - 1;
690 	memset(devname, 0, sizeof(devname));
691 
692 	ret = -EFAULT;
693 	if (copy_from_sockptr(devname, optval, optlen))
694 		goto out;
695 
696 	index = 0;
697 	if (devname[0] != '\0') {
698 		struct net_device *dev;
699 
700 		rcu_read_lock();
701 		dev = dev_get_by_name_rcu(net, devname);
702 		if (dev)
703 			index = dev->ifindex;
704 		rcu_read_unlock();
705 		ret = -ENODEV;
706 		if (!dev)
707 			goto out;
708 	}
709 
710 	sockopt_lock_sock(sk);
711 	ret = sock_bindtoindex_locked(sk, index);
712 	sockopt_release_sock(sk);
713 out:
714 #endif
715 
716 	return ret;
717 }
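
/*
 * Userspace view (illustrative sketch): SO_BINDTODEVICE takes an interface
 * name; an empty name (or zero option length) removes the binding:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 *
 * Per sock_bindtoindex_locked() above, changing an existing binding
 * requires CAP_NET_RAW in the socket's network namespace.
 */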
718 
719 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
720 				sockptr_t optlen, int len)
721 {
722 	int ret = -ENOPROTOOPT;
723 #ifdef CONFIG_NETDEVICES
724 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
725 	struct net *net = sock_net(sk);
726 	char devname[IFNAMSIZ];
727 
728 	if (bound_dev_if == 0) {
729 		len = 0;
730 		goto zero;
731 	}
732 
733 	ret = -EINVAL;
734 	if (len < IFNAMSIZ)
735 		goto out;
736 
737 	ret = netdev_get_name(net, devname, bound_dev_if);
738 	if (ret)
739 		goto out;
740 
741 	len = strlen(devname) + 1;
742 
743 	ret = -EFAULT;
744 	if (copy_to_sockptr(optval, devname, len))
745 		goto out;
746 
747 zero:
748 	ret = -EFAULT;
749 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
750 		goto out;
751 
752 	ret = 0;
753 
754 out:
755 #endif
756 
757 	return ret;
758 }
759 
760 bool sk_mc_loop(const struct sock *sk)
761 {
762 	if (dev_recursion_level())
763 		return false;
764 	if (!sk)
765 		return true;
766 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
767 	switch (READ_ONCE(sk->sk_family)) {
768 	case AF_INET:
769 		return inet_test_bit(MC_LOOP, sk);
770 #if IS_ENABLED(CONFIG_IPV6)
771 	case AF_INET6:
772 		return inet6_test_bit(MC6_LOOP, sk);
773 #endif
774 	}
775 	WARN_ON_ONCE(1);
776 	return true;
777 }
778 EXPORT_SYMBOL(sk_mc_loop);
779 
780 void sock_set_reuseaddr(struct sock *sk)
781 {
782 	lock_sock(sk);
783 	sk->sk_reuse = SK_CAN_REUSE;
784 	release_sock(sk);
785 }
786 EXPORT_SYMBOL(sock_set_reuseaddr);
787 
788 void sock_set_reuseport(struct sock *sk)
789 {
790 	lock_sock(sk);
791 	sk->sk_reuseport = true;
792 	release_sock(sk);
793 }
794 EXPORT_SYMBOL(sock_set_reuseport);
795 
796 void sock_no_linger(struct sock *sk)
797 {
798 	lock_sock(sk);
799 	WRITE_ONCE(sk->sk_lingertime, 0);
800 	sock_set_flag(sk, SOCK_LINGER);
801 	release_sock(sk);
802 }
803 EXPORT_SYMBOL(sock_no_linger);
804 
805 void sock_set_priority(struct sock *sk, u32 priority)
806 {
807 	WRITE_ONCE(sk->sk_priority, priority);
808 }
809 EXPORT_SYMBOL(sock_set_priority);
810 
811 void sock_set_sndtimeo(struct sock *sk, s64 secs)
812 {
813 	lock_sock(sk);
814 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
815 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
816 	else
817 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
818 	release_sock(sk);
819 }
820 EXPORT_SYMBOL(sock_set_sndtimeo);
821 
822 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
823 {
824 	if (val)  {
825 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
826 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
827 		sock_set_flag(sk, SOCK_RCVTSTAMP);
828 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
829 	} else {
830 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
831 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
832 	}
833 }
834 
835 void sock_enable_timestamps(struct sock *sk)
836 {
837 	lock_sock(sk);
838 	__sock_set_timestamps(sk, true, false, true);
839 	release_sock(sk);
840 }
841 EXPORT_SYMBOL(sock_enable_timestamps);
842 
843 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
844 {
845 	switch (optname) {
846 	case SO_TIMESTAMP_OLD:
847 		__sock_set_timestamps(sk, valbool, false, false);
848 		break;
849 	case SO_TIMESTAMP_NEW:
850 		__sock_set_timestamps(sk, valbool, true, false);
851 		break;
852 	case SO_TIMESTAMPNS_OLD:
853 		__sock_set_timestamps(sk, valbool, false, true);
854 		break;
855 	case SO_TIMESTAMPNS_NEW:
856 		__sock_set_timestamps(sk, valbool, true, true);
857 		break;
858 	}
859 }
860 
861 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
862 {
863 	struct net *net = sock_net(sk);
864 	struct net_device *dev = NULL;
865 	bool match = false;
866 	int *vclock_index;
867 	int i, num;
868 
869 	if (sk->sk_bound_dev_if)
870 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
871 
872 	if (!dev) {
873 		pr_err("%s: socket is not bound to a device\n", __func__);
874 		return -EOPNOTSUPP;
875 	}
876 
877 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
878 	dev_put(dev);
879 
880 	for (i = 0; i < num; i++) {
881 		if (*(vclock_index + i) == phc_index) {
882 			match = true;
883 			break;
884 		}
885 	}
886 
887 	if (num > 0)
888 		kfree(vclock_index);
889 
890 	if (!match)
891 		return -EINVAL;
892 
893 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
894 
895 	return 0;
896 }
897 
898 int sock_set_timestamping(struct sock *sk, int optname,
899 			  struct so_timestamping timestamping)
900 {
901 	int val = timestamping.flags;
902 	int ret;
903 
904 	if (val & ~SOF_TIMESTAMPING_MASK)
905 		return -EINVAL;
906 
907 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
908 	    !(val & SOF_TIMESTAMPING_OPT_ID))
909 		return -EINVAL;
910 
911 	if (val & SOF_TIMESTAMPING_OPT_ID &&
912 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
913 		if (sk_is_tcp(sk)) {
914 			if ((1 << sk->sk_state) &
915 			    (TCPF_CLOSE | TCPF_LISTEN))
916 				return -EINVAL;
917 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
918 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
919 			else
920 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
921 		} else {
922 			atomic_set(&sk->sk_tskey, 0);
923 		}
924 	}
925 
926 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
927 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
928 		return -EINVAL;
929 
930 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
931 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
932 		if (ret)
933 			return ret;
934 	}
935 
936 	WRITE_ONCE(sk->sk_tsflags, val);
937 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
938 
939 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
940 		sock_enable_timestamp(sk,
941 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
942 	else
943 		sock_disable_timestamp(sk,
944 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
945 	return 0;
946 }
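
/*
 * Userspace view (illustrative sketch): requesting software TX/RX
 * timestamps with per-packet IDs through SO_TIMESTAMPING:
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_SOFTWARE |
 *			 SOF_TIMESTAMPING_RX_SOFTWARE |
 *			 SOF_TIMESTAMPING_SOFTWARE |
 *			 SOF_TIMESTAMPING_OPT_ID,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 *
 * SOF_TIMESTAMPING_BIND_PHC additionally requires .bind_phc to name a PHC
 * vclock of the device the socket is bound to, as validated by
 * sock_timestamping_bind_phc() above.
 */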
947 
948 void sock_set_keepalive(struct sock *sk)
949 {
950 	lock_sock(sk);
951 	if (sk->sk_prot->keepalive)
952 		sk->sk_prot->keepalive(sk, true);
953 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
954 	release_sock(sk);
955 }
956 EXPORT_SYMBOL(sock_set_keepalive);
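
/*
 * Example (hedged sketch): in-kernel socket users (e.g. RPC or storage
 * transports) configure their sockets with these helpers rather than going
 * through sk_setsockopt(), for instance:
 *
 *	sock_set_reuseaddr(sock->sk);
 *	sock_set_keepalive(sock->sk);
 *	sock_set_sndtimeo(sock->sk, 30);
 *
 * where "sock" is a hypothetical struct socket owned by that transport and
 * 30 is a send timeout in seconds.
 */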
957 
958 static void __sock_set_rcvbuf(struct sock *sk, int val)
959 {
960 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
961 	 * as a negative value.
962 	 */
963 	val = min_t(int, val, INT_MAX / 2);
964 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
965 
966 	/* We double it on the way in to account for "struct sk_buff" etc.
967 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
968 	 * will allow that much actual data to be received on that socket.
969 	 *
970 	 * Applications are unaware that "struct sk_buff" and other overheads
971 	 * allocate from the receive buffer during socket buffer allocation.
972 	 *
973 	 * And after considering the possible alternatives, returning the value
974 	 * we actually used in getsockopt is the most desirable behavior.
975 	 */
976 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
977 }
978 
979 void sock_set_rcvbuf(struct sock *sk, int val)
980 {
981 	lock_sock(sk);
982 	__sock_set_rcvbuf(sk, val);
983 	release_sock(sk);
984 }
985 EXPORT_SYMBOL(sock_set_rcvbuf);
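
/*
 * Worked example: a request of 65536 bytes (after the setsockopt() path has
 * clamped it to sysctl_rmem_max) is stored doubled, so sk_rcvbuf becomes
 * max(65536 * 2, SOCK_MIN_RCVBUF) = 131072 and getsockopt(SO_RCVBUF)
 * reports 131072.
 */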
986 
987 static void __sock_set_mark(struct sock *sk, u32 val)
988 {
989 	if (val != sk->sk_mark) {
990 		WRITE_ONCE(sk->sk_mark, val);
991 		sk_dst_reset(sk);
992 	}
993 }
994 
995 void sock_set_mark(struct sock *sk, u32 val)
996 {
997 	lock_sock(sk);
998 	__sock_set_mark(sk, val);
999 	release_sock(sk);
1000 }
1001 EXPORT_SYMBOL(sock_set_mark);
1002 
1003 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1004 {
1005 	/* Round down bytes to multiple of pages */
1006 	bytes = round_down(bytes, PAGE_SIZE);
1007 
1008 	WARN_ON(bytes > sk->sk_reserved_mem);
1009 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1010 	sk_mem_reclaim(sk);
1011 }
1012 
1013 static int sock_reserve_memory(struct sock *sk, int bytes)
1014 {
1015 	long allocated;
1016 	bool charged;
1017 	int pages;
1018 
1019 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1020 		return -EOPNOTSUPP;
1021 
1022 	if (!bytes)
1023 		return 0;
1024 
1025 	pages = sk_mem_pages(bytes);
1026 
1027 	/* pre-charge to memcg */
1028 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1029 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1030 	if (!charged)
1031 		return -ENOMEM;
1032 
1033 	/* pre-charge to forward_alloc */
1034 	sk_memory_allocated_add(sk, pages);
1035 	allocated = sk_memory_allocated(sk);
1036 	/* If the system goes into memory pressure with this
1037 	 * precharge, give up and return error.
1038 	 */
1039 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1040 		sk_memory_allocated_sub(sk, pages);
1041 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1042 		return -ENOMEM;
1043 	}
1044 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1045 
1046 	WRITE_ONCE(sk->sk_reserved_mem,
1047 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1048 
1049 	return 0;
1050 }
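
/*
 * Worked example (assuming 4 KiB pages): SO_RESERVE_MEM with val = 10000
 * pre-charges sk_mem_pages(10000) = 3 pages (12288 bytes) to the socket's
 * memcg and to forward_alloc, so getsockopt(SO_RESERVE_MEM) then reports
 * 12288.  Shrinking the reservation later releases whole pages only, via
 * sock_release_reserved_memory() above.
 */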
1051 
1052 void sockopt_lock_sock(struct sock *sk)
1053 {
1054 	/* When current->bpf_ctx is set, the setsockopt is called from
1055 	 * a bpf prog.  bpf has ensured the sk lock has been
1056 	 * acquired before calling setsockopt().
1057 	 */
1058 	if (has_current_bpf_ctx())
1059 		return;
1060 
1061 	lock_sock(sk);
1062 }
1063 EXPORT_SYMBOL(sockopt_lock_sock);
1064 
1065 void sockopt_release_sock(struct sock *sk)
1066 {
1067 	if (has_current_bpf_ctx())
1068 		return;
1069 
1070 	release_sock(sk);
1071 }
1072 EXPORT_SYMBOL(sockopt_release_sock);
1073 
1074 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1075 {
1076 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1077 }
1078 EXPORT_SYMBOL(sockopt_ns_capable);
1079 
1080 bool sockopt_capable(int cap)
1081 {
1082 	return has_current_bpf_ctx() || capable(cap);
1083 }
1084 EXPORT_SYMBOL(sockopt_capable);
1085 
1086 /*
1087  *	This is meant for all protocols to use and covers goings on
1088  *	at the socket level. Everything here is generic.
1089  */
1090 
1091 int sk_setsockopt(struct sock *sk, int level, int optname,
1092 		  sockptr_t optval, unsigned int optlen)
1093 {
1094 	struct so_timestamping timestamping;
1095 	struct socket *sock = sk->sk_socket;
1096 	struct sock_txtime sk_txtime;
1097 	int val;
1098 	int valbool;
1099 	struct linger ling;
1100 	int ret = 0;
1101 
1102 	/*
1103 	 *	Options without arguments
1104 	 */
1105 
1106 	if (optname == SO_BINDTODEVICE)
1107 		return sock_setbindtodevice(sk, optval, optlen);
1108 
1109 	if (optlen < sizeof(int))
1110 		return -EINVAL;
1111 
1112 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1113 		return -EFAULT;
1114 
1115 	valbool = val ? 1 : 0;
1116 
1117 	/* handle options which do not require locking the socket. */
1118 	switch (optname) {
1119 	case SO_PRIORITY:
1120 		if ((val >= 0 && val <= 6) ||
1121 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1122 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1123 			sock_set_priority(sk, val);
1124 			return 0;
1125 		}
1126 		return -EPERM;
1127 	case SO_PASSSEC:
1128 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1129 		return 0;
1130 	case SO_PASSCRED:
1131 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1132 		return 0;
1133 	case SO_PASSPIDFD:
1134 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1135 		return 0;
1136 	case SO_TYPE:
1137 	case SO_PROTOCOL:
1138 	case SO_DOMAIN:
1139 	case SO_ERROR:
1140 		return -ENOPROTOOPT;
1141 #ifdef CONFIG_NET_RX_BUSY_POLL
1142 	case SO_BUSY_POLL:
1143 		if (val < 0)
1144 			return -EINVAL;
1145 		WRITE_ONCE(sk->sk_ll_usec, val);
1146 		return 0;
1147 	case SO_PREFER_BUSY_POLL:
1148 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1149 			return -EPERM;
1150 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1151 		return 0;
1152 	case SO_BUSY_POLL_BUDGET:
1153 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1154 		    !sockopt_capable(CAP_NET_ADMIN))
1155 			return -EPERM;
1156 		if (val < 0 || val > U16_MAX)
1157 			return -EINVAL;
1158 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1159 		return 0;
1160 #endif
1161 	case SO_MAX_PACING_RATE:
1162 		{
1163 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1164 		unsigned long pacing_rate;
1165 
1166 		if (sizeof(ulval) != sizeof(val) &&
1167 		    optlen >= sizeof(ulval) &&
1168 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1169 			return -EFAULT;
1170 		}
1171 		if (ulval != ~0UL)
1172 			cmpxchg(&sk->sk_pacing_status,
1173 				SK_PACING_NONE,
1174 				SK_PACING_NEEDED);
1175 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1176 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1177 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1178 		if (ulval < pacing_rate)
1179 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1180 		return 0;
1181 		}
1182 	case SO_TXREHASH:
1183 		if (val < -1 || val > 1)
1184 			return -EINVAL;
1185 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1186 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1187 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1188 		 * and sk_getsockopt().
1189 		 */
1190 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1191 		return 0;
1192 	case SO_PEEK_OFF:
1193 		{
1194 		int (*set_peek_off)(struct sock *sk, int val);
1195 
1196 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1197 		if (set_peek_off)
1198 			ret = set_peek_off(sk, val);
1199 		else
1200 			ret = -EOPNOTSUPP;
1201 		return ret;
1202 		}
1203 	}
1204 
1205 	sockopt_lock_sock(sk);
1206 
1207 	switch (optname) {
1208 	case SO_DEBUG:
1209 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1210 			ret = -EACCES;
1211 		else
1212 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1213 		break;
1214 	case SO_REUSEADDR:
1215 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1216 		break;
1217 	case SO_REUSEPORT:
1218 		sk->sk_reuseport = valbool;
1219 		break;
1220 	case SO_DONTROUTE:
1221 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1222 		sk_dst_reset(sk);
1223 		break;
1224 	case SO_BROADCAST:
1225 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1226 		break;
1227 	case SO_SNDBUF:
1228 		/* Don't error on this; BSD doesn't, and if you think
1229 		 * about it, this is right. Otherwise apps have to
1230 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1231 		 * are treated in BSD as hints.
1232 		 */
1233 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1234 set_sndbuf:
1235 		/* Ensure val * 2 fits into an int, to prevent max_t()
1236 		 * from treating it as a negative value.
1237 		 */
1238 		val = min_t(int, val, INT_MAX / 2);
1239 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1240 		WRITE_ONCE(sk->sk_sndbuf,
1241 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1242 		/* Wake up sending tasks if we upped the value. */
1243 		sk->sk_write_space(sk);
1244 		break;
1245 
1246 	case SO_SNDBUFFORCE:
1247 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1248 			ret = -EPERM;
1249 			break;
1250 		}
1251 
1252 		/* No negative values (to prevent underflow, as val will be
1253 		 * multiplied by 2).
1254 		 */
1255 		if (val < 0)
1256 			val = 0;
1257 		goto set_sndbuf;
1258 
1259 	case SO_RCVBUF:
1260 		/* Don't error on this; BSD doesn't, and if you think
1261 		 * about it, this is right. Otherwise apps have to
1262 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1263 		 * are treated in BSD as hints.
1264 		 */
1265 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1266 		break;
1267 
1268 	case SO_RCVBUFFORCE:
1269 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1270 			ret = -EPERM;
1271 			break;
1272 		}
1273 
1274 		/* No negative values (to prevent underflow, as val will be
1275 		 * multiplied by 2).
1276 		 */
1277 		__sock_set_rcvbuf(sk, max(val, 0));
1278 		break;
1279 
1280 	case SO_KEEPALIVE:
1281 		if (sk->sk_prot->keepalive)
1282 			sk->sk_prot->keepalive(sk, valbool);
1283 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1284 		break;
1285 
1286 	case SO_OOBINLINE:
1287 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1288 		break;
1289 
1290 	case SO_NO_CHECK:
1291 		sk->sk_no_check_tx = valbool;
1292 		break;
1293 
1294 	case SO_LINGER:
1295 		if (optlen < sizeof(ling)) {
1296 			ret = -EINVAL;	/* 1003.1g */
1297 			break;
1298 		}
1299 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1300 			ret = -EFAULT;
1301 			break;
1302 		}
1303 		if (!ling.l_onoff) {
1304 			sock_reset_flag(sk, SOCK_LINGER);
1305 		} else {
1306 			unsigned long t_sec = ling.l_linger;
1307 
1308 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1309 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1310 			else
1311 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1312 			sock_set_flag(sk, SOCK_LINGER);
1313 		}
1314 		break;
1315 
1316 	case SO_BSDCOMPAT:
1317 		break;
1318 
1319 	case SO_TIMESTAMP_OLD:
1320 	case SO_TIMESTAMP_NEW:
1321 	case SO_TIMESTAMPNS_OLD:
1322 	case SO_TIMESTAMPNS_NEW:
1323 		sock_set_timestamp(sk, optname, valbool);
1324 		break;
1325 
1326 	case SO_TIMESTAMPING_NEW:
1327 	case SO_TIMESTAMPING_OLD:
1328 		if (optlen == sizeof(timestamping)) {
1329 			if (copy_from_sockptr(&timestamping, optval,
1330 					      sizeof(timestamping))) {
1331 				ret = -EFAULT;
1332 				break;
1333 			}
1334 		} else {
1335 			memset(&timestamping, 0, sizeof(timestamping));
1336 			timestamping.flags = val;
1337 		}
1338 		ret = sock_set_timestamping(sk, optname, timestamping);
1339 		break;
1340 
1341 	case SO_RCVLOWAT:
1342 		{
1343 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1344 
1345 		if (val < 0)
1346 			val = INT_MAX;
1347 		if (sock)
1348 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1349 		if (set_rcvlowat)
1350 			ret = set_rcvlowat(sk, val);
1351 		else
1352 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1353 		break;
1354 		}
1355 	case SO_RCVTIMEO_OLD:
1356 	case SO_RCVTIMEO_NEW:
1357 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1358 				       optlen, optname == SO_RCVTIMEO_OLD);
1359 		break;
1360 
1361 	case SO_SNDTIMEO_OLD:
1362 	case SO_SNDTIMEO_NEW:
1363 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1364 				       optlen, optname == SO_SNDTIMEO_OLD);
1365 		break;
1366 
1367 	case SO_ATTACH_FILTER: {
1368 		struct sock_fprog fprog;
1369 
1370 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1371 		if (!ret)
1372 			ret = sk_attach_filter(&fprog, sk);
1373 		break;
1374 	}
1375 	case SO_ATTACH_BPF:
1376 		ret = -EINVAL;
1377 		if (optlen == sizeof(u32)) {
1378 			u32 ufd;
1379 
1380 			ret = -EFAULT;
1381 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1382 				break;
1383 
1384 			ret = sk_attach_bpf(ufd, sk);
1385 		}
1386 		break;
1387 
1388 	case SO_ATTACH_REUSEPORT_CBPF: {
1389 		struct sock_fprog fprog;
1390 
1391 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1392 		if (!ret)
1393 			ret = sk_reuseport_attach_filter(&fprog, sk);
1394 		break;
1395 	}
1396 	case SO_ATTACH_REUSEPORT_EBPF:
1397 		ret = -EINVAL;
1398 		if (optlen == sizeof(u32)) {
1399 			u32 ufd;
1400 
1401 			ret = -EFAULT;
1402 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1403 				break;
1404 
1405 			ret = sk_reuseport_attach_bpf(ufd, sk);
1406 		}
1407 		break;
1408 
1409 	case SO_DETACH_REUSEPORT_BPF:
1410 		ret = reuseport_detach_prog(sk);
1411 		break;
1412 
1413 	case SO_DETACH_FILTER:
1414 		ret = sk_detach_filter(sk);
1415 		break;
1416 
1417 	case SO_LOCK_FILTER:
1418 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1419 			ret = -EPERM;
1420 		else
1421 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1422 		break;
1423 
1424 	case SO_MARK:
1425 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1426 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1427 			ret = -EPERM;
1428 			break;
1429 		}
1430 
1431 		__sock_set_mark(sk, val);
1432 		break;
1433 	case SO_RCVMARK:
1434 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1435 		break;
1436 
1437 	case SO_RXQ_OVFL:
1438 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1439 		break;
1440 
1441 	case SO_WIFI_STATUS:
1442 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1443 		break;
1444 
1445 	case SO_NOFCS:
1446 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1447 		break;
1448 
1449 	case SO_SELECT_ERR_QUEUE:
1450 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1451 		break;
1452 
1453 
1454 	case SO_INCOMING_CPU:
1455 		reuseport_update_incoming_cpu(sk, val);
1456 		break;
1457 
1458 	case SO_CNX_ADVICE:
1459 		if (val == 1)
1460 			dst_negative_advice(sk);
1461 		break;
1462 
1463 	case SO_ZEROCOPY:
1464 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1465 			if (!(sk_is_tcp(sk) ||
1466 			      (sk->sk_type == SOCK_DGRAM &&
1467 			       sk->sk_protocol == IPPROTO_UDP)))
1468 				ret = -EOPNOTSUPP;
1469 		} else if (sk->sk_family != PF_RDS) {
1470 			ret = -EOPNOTSUPP;
1471 		}
1472 		if (!ret) {
1473 			if (val < 0 || val > 1)
1474 				ret = -EINVAL;
1475 			else
1476 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1477 		}
1478 		break;
1479 
1480 	case SO_TXTIME:
1481 		if (optlen != sizeof(struct sock_txtime)) {
1482 			ret = -EINVAL;
1483 			break;
1484 		} else if (copy_from_sockptr(&sk_txtime, optval,
1485 			   sizeof(struct sock_txtime))) {
1486 			ret = -EFAULT;
1487 			break;
1488 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1489 			ret = -EINVAL;
1490 			break;
1491 		}
1492 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1493 		 * scheduler has enough safeguards.
1494 		 */
1495 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1496 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1497 			ret = -EPERM;
1498 			break;
1499 		}
1500 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1501 		sk->sk_clockid = sk_txtime.clockid;
1502 		sk->sk_txtime_deadline_mode =
1503 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1504 		sk->sk_txtime_report_errors =
1505 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1506 		break;
1507 
1508 	case SO_BINDTOIFINDEX:
1509 		ret = sock_bindtoindex_locked(sk, val);
1510 		break;
1511 
1512 	case SO_BUF_LOCK:
1513 		if (val & ~SOCK_BUF_LOCK_MASK) {
1514 			ret = -EINVAL;
1515 			break;
1516 		}
1517 		sk->sk_userlocks = val | (sk->sk_userlocks &
1518 					  ~SOCK_BUF_LOCK_MASK);
1519 		break;
1520 
1521 	case SO_RESERVE_MEM:
1522 	{
1523 		int delta;
1524 
1525 		if (val < 0) {
1526 			ret = -EINVAL;
1527 			break;
1528 		}
1529 
1530 		delta = val - sk->sk_reserved_mem;
1531 		if (delta < 0)
1532 			sock_release_reserved_memory(sk, -delta);
1533 		else
1534 			ret = sock_reserve_memory(sk, delta);
1535 		break;
1536 	}
1537 
1538 	default:
1539 		ret = -ENOPROTOOPT;
1540 		break;
1541 	}
1542 	sockopt_release_sock(sk);
1543 	return ret;
1544 }
1545 
1546 int sock_setsockopt(struct socket *sock, int level, int optname,
1547 		    sockptr_t optval, unsigned int optlen)
1548 {
1549 	return sk_setsockopt(sock->sk, level, optname,
1550 			     optval, optlen);
1551 }
1552 EXPORT_SYMBOL(sock_setsockopt);
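
/*
 * Userspace view (illustrative sketch): SO_PRIORITY is one of the options
 * handled above without taking the socket lock; values 0..6 need no
 * privilege, higher values need CAP_NET_RAW or CAP_NET_ADMIN in the
 * socket's user namespace:
 *
 *	int prio = 4;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));
 */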
1553 
1554 static const struct cred *sk_get_peer_cred(struct sock *sk)
1555 {
1556 	const struct cred *cred;
1557 
1558 	spin_lock(&sk->sk_peer_lock);
1559 	cred = get_cred(sk->sk_peer_cred);
1560 	spin_unlock(&sk->sk_peer_lock);
1561 
1562 	return cred;
1563 }
1564 
1565 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1566 			  struct ucred *ucred)
1567 {
1568 	ucred->pid = pid_vnr(pid);
1569 	ucred->uid = ucred->gid = -1;
1570 	if (cred) {
1571 		struct user_namespace *current_ns = current_user_ns();
1572 
1573 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1574 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1575 	}
1576 }
1577 
1578 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1579 {
1580 	struct user_namespace *user_ns = current_user_ns();
1581 	int i;
1582 
1583 	for (i = 0; i < src->ngroups; i++) {
1584 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1585 
1586 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1587 			return -EFAULT;
1588 	}
1589 
1590 	return 0;
1591 }
1592 
1593 int sk_getsockopt(struct sock *sk, int level, int optname,
1594 		  sockptr_t optval, sockptr_t optlen)
1595 {
1596 	struct socket *sock = sk->sk_socket;
1597 
1598 	union {
1599 		int val;
1600 		u64 val64;
1601 		unsigned long ulval;
1602 		struct linger ling;
1603 		struct old_timeval32 tm32;
1604 		struct __kernel_old_timeval tm;
1605 		struct  __kernel_sock_timeval stm;
1606 		struct sock_txtime txtime;
1607 		struct so_timestamping timestamping;
1608 	} v;
1609 
1610 	int lv = sizeof(int);
1611 	int len;
1612 
1613 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1614 		return -EFAULT;
1615 	if (len < 0)
1616 		return -EINVAL;
1617 
1618 	memset(&v, 0, sizeof(v));
1619 
1620 	switch (optname) {
1621 	case SO_DEBUG:
1622 		v.val = sock_flag(sk, SOCK_DBG);
1623 		break;
1624 
1625 	case SO_DONTROUTE:
1626 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1627 		break;
1628 
1629 	case SO_BROADCAST:
1630 		v.val = sock_flag(sk, SOCK_BROADCAST);
1631 		break;
1632 
1633 	case SO_SNDBUF:
1634 		v.val = READ_ONCE(sk->sk_sndbuf);
1635 		break;
1636 
1637 	case SO_RCVBUF:
1638 		v.val = READ_ONCE(sk->sk_rcvbuf);
1639 		break;
1640 
1641 	case SO_REUSEADDR:
1642 		v.val = sk->sk_reuse;
1643 		break;
1644 
1645 	case SO_REUSEPORT:
1646 		v.val = sk->sk_reuseport;
1647 		break;
1648 
1649 	case SO_KEEPALIVE:
1650 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1651 		break;
1652 
1653 	case SO_TYPE:
1654 		v.val = sk->sk_type;
1655 		break;
1656 
1657 	case SO_PROTOCOL:
1658 		v.val = sk->sk_protocol;
1659 		break;
1660 
1661 	case SO_DOMAIN:
1662 		v.val = sk->sk_family;
1663 		break;
1664 
1665 	case SO_ERROR:
1666 		v.val = -sock_error(sk);
1667 		if (v.val == 0)
1668 			v.val = xchg(&sk->sk_err_soft, 0);
1669 		break;
1670 
1671 	case SO_OOBINLINE:
1672 		v.val = sock_flag(sk, SOCK_URGINLINE);
1673 		break;
1674 
1675 	case SO_NO_CHECK:
1676 		v.val = sk->sk_no_check_tx;
1677 		break;
1678 
1679 	case SO_PRIORITY:
1680 		v.val = READ_ONCE(sk->sk_priority);
1681 		break;
1682 
1683 	case SO_LINGER:
1684 		lv		= sizeof(v.ling);
1685 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1686 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1687 		break;
1688 
1689 	case SO_BSDCOMPAT:
1690 		break;
1691 
1692 	case SO_TIMESTAMP_OLD:
1693 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1694 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1695 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1696 		break;
1697 
1698 	case SO_TIMESTAMPNS_OLD:
1699 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1700 		break;
1701 
1702 	case SO_TIMESTAMP_NEW:
1703 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1704 		break;
1705 
1706 	case SO_TIMESTAMPNS_NEW:
1707 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1708 		break;
1709 
1710 	case SO_TIMESTAMPING_OLD:
1711 	case SO_TIMESTAMPING_NEW:
1712 		lv = sizeof(v.timestamping);
1713 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1714 		 * returning the flags when they were set through the same option.
1715 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1716 		 */
1717 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1718 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1719 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1720 		}
1721 		break;
1722 
1723 	case SO_RCVTIMEO_OLD:
1724 	case SO_RCVTIMEO_NEW:
1725 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1726 				      SO_RCVTIMEO_OLD == optname);
1727 		break;
1728 
1729 	case SO_SNDTIMEO_OLD:
1730 	case SO_SNDTIMEO_NEW:
1731 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1732 				      SO_SNDTIMEO_OLD == optname);
1733 		break;
1734 
1735 	case SO_RCVLOWAT:
1736 		v.val = READ_ONCE(sk->sk_rcvlowat);
1737 		break;
1738 
1739 	case SO_SNDLOWAT:
1740 		v.val = 1;
1741 		break;
1742 
1743 	case SO_PASSCRED:
1744 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1745 		break;
1746 
1747 	case SO_PASSPIDFD:
1748 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1749 		break;
1750 
1751 	case SO_PEERCRED:
1752 	{
1753 		struct ucred peercred;
1754 		if (len > sizeof(peercred))
1755 			len = sizeof(peercred);
1756 
1757 		spin_lock(&sk->sk_peer_lock);
1758 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1759 		spin_unlock(&sk->sk_peer_lock);
1760 
1761 		if (copy_to_sockptr(optval, &peercred, len))
1762 			return -EFAULT;
1763 		goto lenout;
1764 	}
1765 
1766 	case SO_PEERPIDFD:
1767 	{
1768 		struct pid *peer_pid;
1769 		struct file *pidfd_file = NULL;
1770 		int pidfd;
1771 
1772 		if (len > sizeof(pidfd))
1773 			len = sizeof(pidfd);
1774 
1775 		spin_lock(&sk->sk_peer_lock);
1776 		peer_pid = get_pid(sk->sk_peer_pid);
1777 		spin_unlock(&sk->sk_peer_lock);
1778 
1779 		if (!peer_pid)
1780 			return -ENODATA;
1781 
1782 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1783 		put_pid(peer_pid);
1784 		if (pidfd < 0)
1785 			return pidfd;
1786 
1787 		if (copy_to_sockptr(optval, &pidfd, len) ||
1788 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1789 			put_unused_fd(pidfd);
1790 			fput(pidfd_file);
1791 
1792 			return -EFAULT;
1793 		}
1794 
1795 		fd_install(pidfd, pidfd_file);
1796 		return 0;
1797 	}
1798 
1799 	case SO_PEERGROUPS:
1800 	{
1801 		const struct cred *cred;
1802 		int ret, n;
1803 
1804 		cred = sk_get_peer_cred(sk);
1805 		if (!cred)
1806 			return -ENODATA;
1807 
1808 		n = cred->group_info->ngroups;
1809 		if (len < n * sizeof(gid_t)) {
1810 			len = n * sizeof(gid_t);
1811 			put_cred(cred);
1812 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1813 		}
1814 		len = n * sizeof(gid_t);
1815 
1816 		ret = groups_to_user(optval, cred->group_info);
1817 		put_cred(cred);
1818 		if (ret)
1819 			return ret;
1820 		goto lenout;
1821 	}
1822 
1823 	case SO_PEERNAME:
1824 	{
1825 		struct sockaddr_storage address;
1826 
1827 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1828 		if (lv < 0)
1829 			return -ENOTCONN;
1830 		if (lv < len)
1831 			return -EINVAL;
1832 		if (copy_to_sockptr(optval, &address, len))
1833 			return -EFAULT;
1834 		goto lenout;
1835 	}
1836 
1837 	/* Dubious BSD thing... Probably nobody even uses it, but
1838 	 * the UNIX standard wants it for whatever reason... -DaveM
1839 	 */
1840 	case SO_ACCEPTCONN:
1841 		v.val = sk->sk_state == TCP_LISTEN;
1842 		break;
1843 
1844 	case SO_PASSSEC:
1845 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1846 		break;
1847 
1848 	case SO_PEERSEC:
1849 		return security_socket_getpeersec_stream(sock,
1850 							 optval, optlen, len);
1851 
1852 	case SO_MARK:
1853 		v.val = READ_ONCE(sk->sk_mark);
1854 		break;
1855 
1856 	case SO_RCVMARK:
1857 		v.val = sock_flag(sk, SOCK_RCVMARK);
1858 		break;
1859 
1860 	case SO_RXQ_OVFL:
1861 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1862 		break;
1863 
1864 	case SO_WIFI_STATUS:
1865 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1866 		break;
1867 
1868 	case SO_PEEK_OFF:
1869 		if (!READ_ONCE(sock->ops)->set_peek_off)
1870 			return -EOPNOTSUPP;
1871 
1872 		v.val = READ_ONCE(sk->sk_peek_off);
1873 		break;
1874 	case SO_NOFCS:
1875 		v.val = sock_flag(sk, SOCK_NOFCS);
1876 		break;
1877 
1878 	case SO_BINDTODEVICE:
1879 		return sock_getbindtodevice(sk, optval, optlen, len);
1880 
1881 	case SO_GET_FILTER:
1882 		len = sk_get_filter(sk, optval, len);
1883 		if (len < 0)
1884 			return len;
1885 
1886 		goto lenout;
1887 
1888 	case SO_LOCK_FILTER:
1889 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1890 		break;
1891 
1892 	case SO_BPF_EXTENSIONS:
1893 		v.val = bpf_tell_extensions();
1894 		break;
1895 
1896 	case SO_SELECT_ERR_QUEUE:
1897 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1898 		break;
1899 
1900 #ifdef CONFIG_NET_RX_BUSY_POLL
1901 	case SO_BUSY_POLL:
1902 		v.val = READ_ONCE(sk->sk_ll_usec);
1903 		break;
1904 	case SO_PREFER_BUSY_POLL:
1905 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1906 		break;
1907 #endif
1908 
1909 	case SO_MAX_PACING_RATE:
1910 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
1911 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1912 			lv = sizeof(v.ulval);
1913 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1914 		} else {
1915 			/* 32bit version */
1916 			v.val = min_t(unsigned long, ~0U,
1917 				      READ_ONCE(sk->sk_max_pacing_rate));
1918 		}
1919 		break;
1920 
1921 	case SO_INCOMING_CPU:
1922 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1923 		break;
1924 
1925 	case SO_MEMINFO:
1926 	{
1927 		u32 meminfo[SK_MEMINFO_VARS];
1928 
1929 		sk_get_meminfo(sk, meminfo);
1930 
1931 		len = min_t(unsigned int, len, sizeof(meminfo));
1932 		if (copy_to_sockptr(optval, &meminfo, len))
1933 			return -EFAULT;
1934 
1935 		goto lenout;
1936 	}
1937 
1938 #ifdef CONFIG_NET_RX_BUSY_POLL
1939 	case SO_INCOMING_NAPI_ID:
1940 		v.val = READ_ONCE(sk->sk_napi_id);
1941 
1942 		/* aggregate non-NAPI IDs down to 0 */
1943 		if (v.val < MIN_NAPI_ID)
1944 			v.val = 0;
1945 
1946 		break;
1947 #endif
1948 
1949 	case SO_COOKIE:
1950 		lv = sizeof(u64);
1951 		if (len < lv)
1952 			return -EINVAL;
1953 		v.val64 = sock_gen_cookie(sk);
1954 		break;
1955 
1956 	case SO_ZEROCOPY:
1957 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1958 		break;
1959 
1960 	case SO_TXTIME:
1961 		lv = sizeof(v.txtime);
1962 		v.txtime.clockid = sk->sk_clockid;
1963 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1964 				  SOF_TXTIME_DEADLINE_MODE : 0;
1965 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1966 				  SOF_TXTIME_REPORT_ERRORS : 0;
1967 		break;
1968 
1969 	case SO_BINDTOIFINDEX:
1970 		v.val = READ_ONCE(sk->sk_bound_dev_if);
1971 		break;
1972 
1973 	case SO_NETNS_COOKIE:
1974 		lv = sizeof(u64);
1975 		if (len != lv)
1976 			return -EINVAL;
1977 		v.val64 = sock_net(sk)->net_cookie;
1978 		break;
1979 
1980 	case SO_BUF_LOCK:
1981 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1982 		break;
1983 
1984 	case SO_RESERVE_MEM:
1985 		v.val = READ_ONCE(sk->sk_reserved_mem);
1986 		break;
1987 
1988 	case SO_TXREHASH:
1989 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
1990 		v.val = READ_ONCE(sk->sk_txrehash);
1991 		break;
1992 
1993 	default:
1994 		/* We implement the SO_SNDLOWAT etc to not be settable
1995 		 * (1003.1g 7).
1996 		 */
1997 		return -ENOPROTOOPT;
1998 	}
1999 
2000 	if (len > lv)
2001 		len = lv;
2002 	if (copy_to_sockptr(optval, &v, len))
2003 		return -EFAULT;
2004 lenout:
2005 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2006 		return -EFAULT;
2007 	return 0;
2008 }
2009 
2010 /*
2011  * Initialize an sk_lock.
2012  *
2013  * (We also register the sk_lock with the lock validator.)
2014  */
2015 static inline void sock_lock_init(struct sock *sk)
2016 {
2017 	if (sk->sk_kern_sock)
2018 		sock_lock_init_class_and_name(
2019 			sk,
2020 			af_family_kern_slock_key_strings[sk->sk_family],
2021 			af_family_kern_slock_keys + sk->sk_family,
2022 			af_family_kern_key_strings[sk->sk_family],
2023 			af_family_kern_keys + sk->sk_family);
2024 	else
2025 		sock_lock_init_class_and_name(
2026 			sk,
2027 			af_family_slock_key_strings[sk->sk_family],
2028 			af_family_slock_keys + sk->sk_family,
2029 			af_family_key_strings[sk->sk_family],
2030 			af_family_keys + sk->sk_family);
2031 }
2032 
2033 /*
2034  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2035	 * even temporarily, because of RCU lookups. sk_node should also be left as is.
2036	 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
2037  */
2038 static void sock_copy(struct sock *nsk, const struct sock *osk)
2039 {
2040 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2041 #ifdef CONFIG_SECURITY_NETWORK
2042 	void *sptr = nsk->sk_security;
2043 #endif
2044 
2045 	/* If we move sk_tx_queue_mapping out of the private section,
2046 	 * we must check if sk_tx_queue_clear() is called after
2047 	 * sock_copy() in sk_clone_lock().
2048 	 */
2049 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2050 		     offsetof(struct sock, sk_dontcopy_begin) ||
2051 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2052 		     offsetof(struct sock, sk_dontcopy_end));
2053 
2054 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2055 
2056 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2057 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2058 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2059 
2060 #ifdef CONFIG_SECURITY_NETWORK
2061 	nsk->sk_security = sptr;
2062 	security_sk_clone(osk, nsk);
2063 #endif
2064 }
2065 
2066 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2067 		int family)
2068 {
2069 	struct sock *sk;
2070 	struct kmem_cache *slab;
2071 
2072 	slab = prot->slab;
2073 	if (slab != NULL) {
2074 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2075 		if (!sk)
2076 			return sk;
2077 		if (want_init_on_alloc(priority))
2078 			sk_prot_clear_nulls(sk, prot->obj_size);
2079 	} else
2080 		sk = kmalloc(prot->obj_size, priority);
2081 
2082 	if (sk != NULL) {
2083 		if (security_sk_alloc(sk, family, priority))
2084 			goto out_free;
2085 
2086 		if (!try_module_get(prot->owner))
2087 			goto out_free_sec;
2088 	}
2089 
2090 	return sk;
2091 
2092 out_free_sec:
2093 	security_sk_free(sk);
2094 out_free:
2095 	if (slab != NULL)
2096 		kmem_cache_free(slab, sk);
2097 	else
2098 		kfree(sk);
2099 	return NULL;
2100 }
2101 
2102 static void sk_prot_free(struct proto *prot, struct sock *sk)
2103 {
2104 	struct kmem_cache *slab;
2105 	struct module *owner;
2106 
2107 	owner = prot->owner;
2108 	slab = prot->slab;
2109 
2110 	cgroup_sk_free(&sk->sk_cgrp_data);
2111 	mem_cgroup_sk_free(sk);
2112 	security_sk_free(sk);
2113 	if (slab != NULL)
2114 		kmem_cache_free(slab, sk);
2115 	else
2116 		kfree(sk);
2117 	module_put(owner);
2118 }
2119 
2120 /**
2121  *	sk_alloc - All socket objects are allocated here
2122  *	@net: the applicable net namespace
2123  *	@family: protocol family
2124  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2125  *	@prot: struct proto associated with this new sock instance
2126  *	@kern: is this to be a kernel socket?
2127  */
2128 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2129 		      struct proto *prot, int kern)
2130 {
2131 	struct sock *sk;
2132 
2133 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2134 	if (sk) {
2135 		sk->sk_family = family;
2136 		/*
2137 		 * See comment in struct sock definition to understand
2138 		 * why we need sk_prot_creator -acme
2139 		 */
2140 		sk->sk_prot = sk->sk_prot_creator = prot;
2141 		sk->sk_kern_sock = kern;
2142 		sock_lock_init(sk);
2143 		sk->sk_net_refcnt = kern ? 0 : 1;
2144 		if (likely(sk->sk_net_refcnt)) {
2145 			get_net_track(net, &sk->ns_tracker, priority);
2146 			sock_inuse_add(net, 1);
2147 		} else {
2148 			__netns_tracker_alloc(net, &sk->ns_tracker,
2149 					      false, priority);
2150 		}
2151 
2152 		sock_net_set(sk, net);
2153 		refcount_set(&sk->sk_wmem_alloc, 1);
2154 
2155 		mem_cgroup_sk_alloc(sk);
2156 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2157 		sock_update_classid(&sk->sk_cgrp_data);
2158 		sock_update_netprioidx(&sk->sk_cgrp_data);
2159 		sk_tx_queue_clear(sk);
2160 	}
2161 
2162 	return sk;
2163 }
2164 EXPORT_SYMBOL(sk_alloc);
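
/*
 * Illustrative sketch (guarded out, not part of the build): the usual way a
 * protocol's ->create() hook pairs sk_alloc() with sock_init_data().
 * PF_EXAMPLE and example_proto below are placeholders, not real kernel
 * symbols.
 */
#if 0
static int example_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock_init_data(sock, sk);	/* attach sk to sock, set defaults */
	sk->sk_protocol = protocol;
	return 0;
}
#endif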
2165 
2166 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2167  * grace period. This is the case for UDP sockets and TCP listeners.
2168  */
2169 static void __sk_destruct(struct rcu_head *head)
2170 {
2171 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2172 	struct sk_filter *filter;
2173 
2174 	if (sk->sk_destruct)
2175 		sk->sk_destruct(sk);
2176 
2177 	filter = rcu_dereference_check(sk->sk_filter,
2178 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2179 	if (filter) {
2180 		sk_filter_uncharge(sk, filter);
2181 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2182 	}
2183 
2184 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2185 
2186 #ifdef CONFIG_BPF_SYSCALL
2187 	bpf_sk_storage_free(sk);
2188 #endif
2189 
2190 	if (atomic_read(&sk->sk_omem_alloc))
2191 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2192 			 __func__, atomic_read(&sk->sk_omem_alloc));
2193 
2194 	if (sk->sk_frag.page) {
2195 		put_page(sk->sk_frag.page);
2196 		sk->sk_frag.page = NULL;
2197 	}
2198 
2199 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2200 	put_cred(sk->sk_peer_cred);
2201 	put_pid(sk->sk_peer_pid);
2202 
2203 	if (likely(sk->sk_net_refcnt))
2204 		put_net_track(sock_net(sk), &sk->ns_tracker);
2205 	else
2206 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2207 
2208 	sk_prot_free(sk->sk_prot_creator, sk);
2209 }
2210 
2211 void sk_destruct(struct sock *sk)
2212 {
2213 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2214 
2215 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2216 		reuseport_detach_sock(sk);
2217 		use_call_rcu = true;
2218 	}
2219 
2220 	if (use_call_rcu)
2221 		call_rcu(&sk->sk_rcu, __sk_destruct);
2222 	else
2223 		__sk_destruct(&sk->sk_rcu);
2224 }
2225 
2226 static void __sk_free(struct sock *sk)
2227 {
2228 	if (likely(sk->sk_net_refcnt))
2229 		sock_inuse_add(sock_net(sk), -1);
2230 
2231 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2232 		sock_diag_broadcast_destroy(sk);
2233 	else
2234 		sk_destruct(sk);
2235 }
2236 
2237 void sk_free(struct sock *sk)
2238 {
2239 	/*
2240	 * We subtract one from sk_wmem_alloc; if the result is not zero,
2241	 * some packets are still in a tx queue and sock_wfree() will call
2242	 * __sk_free(sk) later.
2243 	 */
2244 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2245 		__sk_free(sk);
2246 }
2247 EXPORT_SYMBOL(sk_free);
2248 
2249 static void sk_init_common(struct sock *sk)
2250 {
2251 	skb_queue_head_init(&sk->sk_receive_queue);
2252 	skb_queue_head_init(&sk->sk_write_queue);
2253 	skb_queue_head_init(&sk->sk_error_queue);
2254 
2255 	rwlock_init(&sk->sk_callback_lock);
2256 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2257 			af_rlock_keys + sk->sk_family,
2258 			af_family_rlock_key_strings[sk->sk_family]);
2259 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2260 			af_wlock_keys + sk->sk_family,
2261 			af_family_wlock_key_strings[sk->sk_family]);
2262 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2263 			af_elock_keys + sk->sk_family,
2264 			af_family_elock_key_strings[sk->sk_family]);
2265 	if (sk->sk_kern_sock)
2266 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2267 			af_kern_callback_keys + sk->sk_family,
2268 			af_family_kern_clock_key_strings[sk->sk_family]);
2269 	else
2270 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2271 			af_callback_keys + sk->sk_family,
2272 			af_family_clock_key_strings[sk->sk_family]);
2273 }
2274 
2275 /**
2276  *	sk_clone_lock - clone a socket, and lock its clone
2277  *	@sk: the socket to clone
2278  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2279  *
2280  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2281  */
2282 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2283 {
2284 	struct proto *prot = READ_ONCE(sk->sk_prot);
2285 	struct sk_filter *filter;
2286 	bool is_charged = true;
2287 	struct sock *newsk;
2288 
2289 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2290 	if (!newsk)
2291 		goto out;
2292 
2293 	sock_copy(newsk, sk);
2294 
2295 	newsk->sk_prot_creator = prot;
2296 
2297 	/* SANITY */
2298 	if (likely(newsk->sk_net_refcnt)) {
2299 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2300 		sock_inuse_add(sock_net(newsk), 1);
2301 	} else {
2302		/* Kernel sockets do not elevate the struct net refcount.
2303 		 * Instead, use a tracker to more easily detect if a layer
2304 		 * is not properly dismantling its kernel sockets at netns
2305 		 * destroy time.
2306 		 */
2307 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2308 				      false, priority);
2309 	}
2310 	sk_node_init(&newsk->sk_node);
2311 	sock_lock_init(newsk);
2312 	bh_lock_sock(newsk);
2313 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2314 	newsk->sk_backlog.len = 0;
2315 
2316 	atomic_set(&newsk->sk_rmem_alloc, 0);
2317 
2318 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2319 	refcount_set(&newsk->sk_wmem_alloc, 1);
2320 
2321 	atomic_set(&newsk->sk_omem_alloc, 0);
2322 	sk_init_common(newsk);
2323 
2324 	newsk->sk_dst_cache	= NULL;
2325 	newsk->sk_dst_pending_confirm = 0;
2326 	newsk->sk_wmem_queued	= 0;
2327 	newsk->sk_forward_alloc = 0;
2328 	newsk->sk_reserved_mem  = 0;
2329 	atomic_set(&newsk->sk_drops, 0);
2330 	newsk->sk_send_head	= NULL;
2331 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2332 	atomic_set(&newsk->sk_zckey, 0);
2333 
2334 	sock_reset_flag(newsk, SOCK_DONE);
2335 
2336 	/* sk->sk_memcg will be populated at accept() time */
2337 	newsk->sk_memcg = NULL;
2338 
2339 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2340 
2341 	rcu_read_lock();
2342 	filter = rcu_dereference(sk->sk_filter);
2343 	if (filter != NULL)
2344		/* Though it's an empty new sock, the charging may fail
2345		 * if sysctl_optmem_max was changed between the creation of
2346		 * the original socket and the cloning.
2347 		 */
2348 		is_charged = sk_filter_charge(newsk, filter);
2349 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2350 	rcu_read_unlock();
2351 
2352 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2353 		/* We need to make sure that we don't uncharge the new
2354 		 * socket if we couldn't charge it in the first place
2355 		 * as otherwise we uncharge the parent's filter.
2356 		 */
2357 		if (!is_charged)
2358 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2359 		sk_free_unlock_clone(newsk);
2360 		newsk = NULL;
2361 		goto out;
2362 	}
2363 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2364 
2365 	if (bpf_sk_storage_clone(sk, newsk)) {
2366 		sk_free_unlock_clone(newsk);
2367 		newsk = NULL;
2368 		goto out;
2369 	}
2370 
2371 	/* Clear sk_user_data if parent had the pointer tagged
2372 	 * as not suitable for copying when cloning.
2373 	 */
2374 	if (sk_user_data_is_nocopy(newsk))
2375 		newsk->sk_user_data = NULL;
2376 
2377 	newsk->sk_err	   = 0;
2378 	newsk->sk_err_soft = 0;
2379 	newsk->sk_priority = 0;
2380 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2381 
2382 	/* Before updating sk_refcnt, we must commit prior changes to memory
2383 	 * (Documentation/RCU/rculist_nulls.rst for details)
2384 	 */
2385 	smp_wmb();
2386 	refcount_set(&newsk->sk_refcnt, 2);
2387 
2388 	sk_set_socket(newsk, NULL);
2389 	sk_tx_queue_clear(newsk);
2390 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2391 
2392 	if (newsk->sk_prot->sockets_allocated)
2393 		sk_sockets_allocated_inc(newsk);
2394 
2395 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2396 		net_enable_timestamp();
2397 out:
2398 	return newsk;
2399 }
2400 EXPORT_SYMBOL_GPL(sk_clone_lock);
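
/*
 * Illustrative sketch (guarded out): a connection-accept style caller
 * clones under GFP_ATOMIC and must drop the bh lock itself, as the comment
 * above sk_clone_lock() states.  The function name is a placeholder.
 */
#if 0
static struct sock *example_accept_clone(struct sock *listener)
{
	struct sock *newsk = sk_clone_lock(listener, GFP_ATOMIC);

	if (!newsk)
		return NULL;

	/* ... protocol specific initialisation of newsk ... */
	bh_unlock_sock(newsk);
	return newsk;
}
#endif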
2401 
2402 void sk_free_unlock_clone(struct sock *sk)
2403 {
2404	/* It is still a raw copy of the parent, so invalidate the
2405	 * destructor and do a plain sk_free(). */
2406 	sk->sk_destruct = NULL;
2407 	bh_unlock_sock(sk);
2408 	sk_free(sk);
2409 }
2410 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2411 
2412 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2413 {
2414 	bool is_ipv6 = false;
2415 	u32 max_size;
2416 
2417 #if IS_ENABLED(CONFIG_IPV6)
2418 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2419 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2420 #endif
2421 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2422 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2423 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2424 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2425 		max_size = GSO_LEGACY_MAX_SIZE;
2426 
2427 	return max_size - (MAX_TCP_HEADER + 1);
2428 }
2429 
2430 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2431 {
2432 	u32 max_segs = 1;
2433 
2434 	sk->sk_route_caps = dst->dev->features;
2435 	if (sk_is_tcp(sk))
2436 		sk->sk_route_caps |= NETIF_F_GSO;
2437 	if (sk->sk_route_caps & NETIF_F_GSO)
2438 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2439 	if (unlikely(sk->sk_gso_disabled))
2440 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2441 	if (sk_can_gso(sk)) {
2442 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2443 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2444 		} else {
2445 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2446 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2447 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2448 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2449 		}
2450 	}
2451 	sk->sk_gso_max_segs = max_segs;
2452 	sk_dst_set(sk, dst);
2453 }
2454 EXPORT_SYMBOL_GPL(sk_setup_caps);
2455 
2456 /*
2457  *	Simple resource managers for sockets.
2458  */
2459 
2460 
2461 /*
2462  * Write buffer destructor automatically called from kfree_skb.
2463  */
2464 void sock_wfree(struct sk_buff *skb)
2465 {
2466 	struct sock *sk = skb->sk;
2467 	unsigned int len = skb->truesize;
2468 	bool free;
2469 
2470 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2471 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2472 		    sk->sk_write_space == sock_def_write_space) {
2473 			rcu_read_lock();
2474 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2475 			sock_def_write_space_wfree(sk);
2476 			rcu_read_unlock();
2477 			if (unlikely(free))
2478 				__sk_free(sk);
2479 			return;
2480 		}
2481 
2482 		/*
2483		 * Keep a reference on sk_wmem_alloc; it will be released
2484		 * after the sk_write_space() call.
2485 		 */
2486 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2487 		sk->sk_write_space(sk);
2488 		len = 1;
2489 	}
2490 	/*
2491	 * If sk_wmem_alloc reaches 0, we must finish what sk_free()
2492	 * could not do because of in-flight packets.
2493 	 */
2494 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2495 		__sk_free(sk);
2496 }
2497 EXPORT_SYMBOL(sock_wfree);
2498 
2499 /* This variant of sock_wfree() is used by TCP,
2500  * since it sets SOCK_USE_WRITE_QUEUE.
2501  */
2502 void __sock_wfree(struct sk_buff *skb)
2503 {
2504 	struct sock *sk = skb->sk;
2505 
2506 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2507 		__sk_free(sk);
2508 }
2509 
2510 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2511 {
2512 	skb_orphan(skb);
2513 	skb->sk = sk;
2514 #ifdef CONFIG_INET
2515 	if (unlikely(!sk_fullsock(sk))) {
2516 		skb->destructor = sock_edemux;
2517 		sock_hold(sk);
2518 		return;
2519 	}
2520 #endif
2521 	skb->destructor = sock_wfree;
2522 	skb_set_hash_from_sk(skb, sk);
2523 	/*
2524	 * We used to take a refcount on sk, but the following operation
2525	 * is enough to guarantee sk_free() won't free this sock until
2526	 * all in-flight packets have completed.
2527 	 */
2528 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2529 }
2530 EXPORT_SYMBOL(skb_set_owner_w);
2531 
2532 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2533 {
2534 	/* Drivers depend on in-order delivery for crypto offload,
2535 	 * partial orphan breaks out-of-order-OK logic.
2536 	 */
2537 	if (skb_is_decrypted(skb))
2538 		return false;
2539 
2540 	return (skb->destructor == sock_wfree ||
2541 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2542 }
2543 
2544 /* This helper is used by netem, as it can hold packets in its
2545  * delay queue. We want to allow the owner socket to send more
2546  * packets, as if they were already TX completed by a typical driver.
2547  * But we also want to keep skb->sk set because some packet schedulers
2548  * rely on it (sch_fq for example).
2549  */
2550 void skb_orphan_partial(struct sk_buff *skb)
2551 {
2552 	if (skb_is_tcp_pure_ack(skb))
2553 		return;
2554 
2555 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2556 		return;
2557 
2558 	skb_orphan(skb);
2559 }
2560 EXPORT_SYMBOL(skb_orphan_partial);
2561 
2562 /*
2563  * Read buffer destructor automatically called from kfree_skb.
2564  */
2565 void sock_rfree(struct sk_buff *skb)
2566 {
2567 	struct sock *sk = skb->sk;
2568 	unsigned int len = skb->truesize;
2569 
2570 	atomic_sub(len, &sk->sk_rmem_alloc);
2571 	sk_mem_uncharge(sk, len);
2572 }
2573 EXPORT_SYMBOL(sock_rfree);
2574 
2575 /*
2576  * Buffer destructor for skbs that are not used directly in read or write
2577  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2578  */
2579 void sock_efree(struct sk_buff *skb)
2580 {
2581 	sock_put(skb->sk);
2582 }
2583 EXPORT_SYMBOL(sock_efree);
2584 
2585 /* Buffer destructor for prefetch/receive path where reference count may
2586  * not be held, e.g. for listen sockets.
2587  */
2588 #ifdef CONFIG_INET
2589 void sock_pfree(struct sk_buff *skb)
2590 {
2591 	struct sock *sk = skb->sk;
2592 
2593 	if (!sk_is_refcounted(sk))
2594 		return;
2595 
2596 	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2597 		inet_reqsk(sk)->rsk_listener = NULL;
2598 		reqsk_free(inet_reqsk(sk));
2599 		return;
2600 	}
2601 
2602 	sock_gen_put(sk);
2603 }
2604 EXPORT_SYMBOL(sock_pfree);
2605 #endif /* CONFIG_INET */
2606 
2607 kuid_t sock_i_uid(struct sock *sk)
2608 {
2609 	kuid_t uid;
2610 
2611 	read_lock_bh(&sk->sk_callback_lock);
2612 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2613 	read_unlock_bh(&sk->sk_callback_lock);
2614 	return uid;
2615 }
2616 EXPORT_SYMBOL(sock_i_uid);
2617 
2618 unsigned long __sock_i_ino(struct sock *sk)
2619 {
2620 	unsigned long ino;
2621 
2622 	read_lock(&sk->sk_callback_lock);
2623 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2624 	read_unlock(&sk->sk_callback_lock);
2625 	return ino;
2626 }
2627 EXPORT_SYMBOL(__sock_i_ino);
2628 
2629 unsigned long sock_i_ino(struct sock *sk)
2630 {
2631 	unsigned long ino;
2632 
2633 	local_bh_disable();
2634 	ino = __sock_i_ino(sk);
2635 	local_bh_enable();
2636 	return ino;
2637 }
2638 EXPORT_SYMBOL(sock_i_ino);
2639 
2640 /*
2641  * Allocate a skb from the socket's send buffer.
2642  */
2643 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2644 			     gfp_t priority)
2645 {
2646 	if (force ||
2647 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2648 		struct sk_buff *skb = alloc_skb(size, priority);
2649 
2650 		if (skb) {
2651 			skb_set_owner_w(skb, sk);
2652 			return skb;
2653 		}
2654 	}
2655 	return NULL;
2656 }
2657 EXPORT_SYMBOL(sock_wmalloc);
2658 
2659 static void sock_ofree(struct sk_buff *skb)
2660 {
2661 	struct sock *sk = skb->sk;
2662 
2663 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2664 }
2665 
2666 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2667 			     gfp_t priority)
2668 {
2669 	struct sk_buff *skb;
2670 
2671 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2672 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2673 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2674 		return NULL;
2675 
2676 	skb = alloc_skb(size, priority);
2677 	if (!skb)
2678 		return NULL;
2679 
2680 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2681 	skb->sk = sk;
2682 	skb->destructor = sock_ofree;
2683 	return skb;
2684 }
2685 
2686 /*
2687  * Allocate a memory block from the socket's option memory buffer.
2688  */
2689 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2690 {
2691 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2692 
2693 	if ((unsigned int)size <= optmem_max &&
2694 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2695 		void *mem;
2696 		/* First do the add, to avoid the race if kmalloc
2697 		 * might sleep.
2698 		 */
2699 		atomic_add(size, &sk->sk_omem_alloc);
2700 		mem = kmalloc(size, priority);
2701 		if (mem)
2702 			return mem;
2703 		atomic_sub(size, &sk->sk_omem_alloc);
2704 	}
2705 	return NULL;
2706 }
2707 EXPORT_SYMBOL(sock_kmalloc);
2708 
2709 /* Free an option memory block. Note, we actually want the inline
2710  * here as this allows gcc to detect the nullify and fold away the
2711  * condition entirely.
2712  */
2713 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2714 				  const bool nullify)
2715 {
2716 	if (WARN_ON_ONCE(!mem))
2717 		return;
2718 	if (nullify)
2719 		kfree_sensitive(mem);
2720 	else
2721 		kfree(mem);
2722 	atomic_sub(size, &sk->sk_omem_alloc);
2723 }
2724 
2725 void sock_kfree_s(struct sock *sk, void *mem, int size)
2726 {
2727 	__sock_kfree_s(sk, mem, size, false);
2728 }
2729 EXPORT_SYMBOL(sock_kfree_s);
2730 
2731 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2732 {
2733 	__sock_kfree_s(sk, mem, size, true);
2734 }
2735 EXPORT_SYMBOL(sock_kzfree_s);
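
/*
 * Illustrative sketch (guarded out): option memory obtained with
 * sock_kmalloc() must be released with sock_kfree_s() -- or
 * sock_kzfree_s() when it held sensitive data -- passing the same size so
 * that sk_omem_alloc stays balanced.  The function name is a placeholder.
 */
#if 0
static int example_store_key(struct sock *sk, const void *key, int len)
{
	void *copy = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!copy)
		return -ENOBUFS;

	memcpy(copy, key, len);
	/* ... use the key ... */
	sock_kzfree_s(sk, copy, len);	/* zeroes before freeing */
	return 0;
}
#endif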
2736 
2737 /* This is almost wait_for_tcp_memory() minus release_sock()/lock_sock().
2738  * I think these locks should be removed for datagram sockets.
2739  */
2740 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2741 {
2742 	DEFINE_WAIT(wait);
2743 
2744 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2745 	for (;;) {
2746 		if (!timeo)
2747 			break;
2748 		if (signal_pending(current))
2749 			break;
2750 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2751 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2752 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2753 			break;
2754 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2755 			break;
2756 		if (READ_ONCE(sk->sk_err))
2757 			break;
2758 		timeo = schedule_timeout(timeo);
2759 	}
2760 	finish_wait(sk_sleep(sk), &wait);
2761 	return timeo;
2762 }
2763 
2764 
2765 /*
2766  *	Generic send/receive buffer handlers
2767  */
2768 
2769 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2770 				     unsigned long data_len, int noblock,
2771 				     int *errcode, int max_page_order)
2772 {
2773 	struct sk_buff *skb;
2774 	long timeo;
2775 	int err;
2776 
2777 	timeo = sock_sndtimeo(sk, noblock);
2778 	for (;;) {
2779 		err = sock_error(sk);
2780 		if (err != 0)
2781 			goto failure;
2782 
2783 		err = -EPIPE;
2784 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2785 			goto failure;
2786 
2787 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2788 			break;
2789 
2790 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2791 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2792 		err = -EAGAIN;
2793 		if (!timeo)
2794 			goto failure;
2795 		if (signal_pending(current))
2796 			goto interrupted;
2797 		timeo = sock_wait_for_wmem(sk, timeo);
2798 	}
2799 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2800 				   errcode, sk->sk_allocation);
2801 	if (skb)
2802 		skb_set_owner_w(skb, sk);
2803 	return skb;
2804 
2805 interrupted:
2806 	err = sock_intr_errno(timeo);
2807 failure:
2808 	*errcode = err;
2809 	return NULL;
2810 }
2811 EXPORT_SYMBOL(sock_alloc_send_pskb);
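
/*
 * Illustrative sketch (guarded out): datagram protocols usually go through
 * the sock_alloc_send_skb() wrapper, which is sock_alloc_send_pskb() with
 * no separate data area and order-0 pages.  The helper below is a
 * placeholder, not an existing kernel function.
 */
#if 0
static struct sk_buff *example_build_dgram(struct sock *sk, size_t len,
					   int noblock, int *err)
{
	struct sk_buff *skb;

	skb = sock_alloc_send_skb(sk, len + MAX_HEADER, noblock, err);
	if (skb)
		skb_reserve(skb, MAX_HEADER);	/* room for lower layer headers */
	return skb;
}
#endif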
2812 
2813 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2814 		     struct sockcm_cookie *sockc)
2815 {
2816 	u32 tsflags;
2817 
2818 	switch (cmsg->cmsg_type) {
2819 	case SO_MARK:
2820 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2821 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2822 			return -EPERM;
2823 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2824 			return -EINVAL;
2825 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2826 		break;
2827 	case SO_TIMESTAMPING_OLD:
2828 	case SO_TIMESTAMPING_NEW:
2829 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2830 			return -EINVAL;
2831 
2832 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2833 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2834 			return -EINVAL;
2835 
2836 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2837 		sockc->tsflags |= tsflags;
2838 		break;
2839 	case SCM_TXTIME:
2840 		if (!sock_flag(sk, SOCK_TXTIME))
2841 			return -EINVAL;
2842 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2843 			return -EINVAL;
2844 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2845 		break;
2846 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2847 	case SCM_RIGHTS:
2848 	case SCM_CREDENTIALS:
2849 		break;
2850 	default:
2851 		return -EINVAL;
2852 	}
2853 	return 0;
2854 }
2855 EXPORT_SYMBOL(__sock_cmsg_send);
2856 
2857 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2858 		   struct sockcm_cookie *sockc)
2859 {
2860 	struct cmsghdr *cmsg;
2861 	int ret;
2862 
2863 	for_each_cmsghdr(cmsg, msg) {
2864 		if (!CMSG_OK(msg, cmsg))
2865 			return -EINVAL;
2866 		if (cmsg->cmsg_level != SOL_SOCKET)
2867 			continue;
2868 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2869 		if (ret)
2870 			return ret;
2871 	}
2872 	return 0;
2873 }
2874 EXPORT_SYMBOL(sock_cmsg_send);
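
/*
 * Illustrative sketch (guarded out): a sendmsg() implementation seeds the
 * sockcm_cookie from the socket's own defaults and then lets
 * sock_cmsg_send() override it from SOL_SOCKET control messages.  The
 * function name is a placeholder.
 */
#if 0
static int example_sendmsg_cookie(struct sock *sk, struct msghdr *msg,
				  struct sockcm_cookie *sockc)
{
	sockcm_init(sockc, sk);		/* per-socket defaults (tsflags, ...) */

	if (msg->msg_controllen)
		return sock_cmsg_send(sk, msg, sockc);
	return 0;
}
#endif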
2875 
2876 static void sk_enter_memory_pressure(struct sock *sk)
2877 {
2878 	if (!sk->sk_prot->enter_memory_pressure)
2879 		return;
2880 
2881 	sk->sk_prot->enter_memory_pressure(sk);
2882 }
2883 
2884 static void sk_leave_memory_pressure(struct sock *sk)
2885 {
2886 	if (sk->sk_prot->leave_memory_pressure) {
2887 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2888 				     tcp_leave_memory_pressure, sk);
2889 	} else {
2890 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2891 
2892 		if (memory_pressure && READ_ONCE(*memory_pressure))
2893 			WRITE_ONCE(*memory_pressure, 0);
2894 	}
2895 }
2896 
2897 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2898 
2899 /**
2900  * skb_page_frag_refill - check that a page_frag contains enough room
2901  * @sz: minimum size of the fragment we want to get
2902  * @pfrag: pointer to page_frag
2903  * @gfp: priority for memory allocation
2904  *
2905  * Note: While this allocator tries to use high order pages, there is
2906  * no guarantee that allocations succeed. Therefore, @sz MUST be
2907  * less than or equal to PAGE_SIZE.
2908  */
2909 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2910 {
2911 	if (pfrag->page) {
2912 		if (page_ref_count(pfrag->page) == 1) {
2913 			pfrag->offset = 0;
2914 			return true;
2915 		}
2916 		if (pfrag->offset + sz <= pfrag->size)
2917 			return true;
2918 		put_page(pfrag->page);
2919 	}
2920 
2921 	pfrag->offset = 0;
2922 	if (SKB_FRAG_PAGE_ORDER &&
2923 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2924 		/* Avoid direct reclaim but allow kswapd to wake */
2925 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2926 					  __GFP_COMP | __GFP_NOWARN |
2927 					  __GFP_NORETRY,
2928 					  SKB_FRAG_PAGE_ORDER);
2929 		if (likely(pfrag->page)) {
2930 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2931 			return true;
2932 		}
2933 	}
2934 	pfrag->page = alloc_page(gfp);
2935 	if (likely(pfrag->page)) {
2936 		pfrag->size = PAGE_SIZE;
2937 		return true;
2938 	}
2939 	return false;
2940 }
2941 EXPORT_SYMBOL(skb_page_frag_refill);
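
/*
 * Illustrative sketch (guarded out): the typical calling pattern -- make
 * sure the fragment has @sz bytes of room, copy into it, then advance the
 * offset.  @sz must not exceed PAGE_SIZE, as documented above.  The
 * function name is a placeholder.
 */
#if 0
static int example_frag_append(struct page_frag *pfrag, const void *data,
			       unsigned int sz, gfp_t gfp)
{
	if (!skb_page_frag_refill(sz, pfrag, gfp))
		return -ENOMEM;

	memcpy(page_address(pfrag->page) + pfrag->offset, data, sz);
	pfrag->offset += sz;
	return 0;
}
#endif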
2942 
2943 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2944 {
2945 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2946 		return true;
2947 
2948 	sk_enter_memory_pressure(sk);
2949 	sk_stream_moderate_sndbuf(sk);
2950 	return false;
2951 }
2952 EXPORT_SYMBOL(sk_page_frag_refill);
2953 
2954 void __lock_sock(struct sock *sk)
2955 	__releases(&sk->sk_lock.slock)
2956 	__acquires(&sk->sk_lock.slock)
2957 {
2958 	DEFINE_WAIT(wait);
2959 
2960 	for (;;) {
2961 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2962 					TASK_UNINTERRUPTIBLE);
2963 		spin_unlock_bh(&sk->sk_lock.slock);
2964 		schedule();
2965 		spin_lock_bh(&sk->sk_lock.slock);
2966 		if (!sock_owned_by_user(sk))
2967 			break;
2968 	}
2969 	finish_wait(&sk->sk_lock.wq, &wait);
2970 }
2971 
2972 void __release_sock(struct sock *sk)
2973 	__releases(&sk->sk_lock.slock)
2974 	__acquires(&sk->sk_lock.slock)
2975 {
2976 	struct sk_buff *skb, *next;
2977 
2978 	while ((skb = sk->sk_backlog.head) != NULL) {
2979 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2980 
2981 		spin_unlock_bh(&sk->sk_lock.slock);
2982 
2983 		do {
2984 			next = skb->next;
2985 			prefetch(next);
2986 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2987 			skb_mark_not_on_list(skb);
2988 			sk_backlog_rcv(sk, skb);
2989 
2990 			cond_resched();
2991 
2992 			skb = next;
2993 		} while (skb != NULL);
2994 
2995 		spin_lock_bh(&sk->sk_lock.slock);
2996 	}
2997 
2998 	/*
2999	 * Doing the zeroing here guarantees we cannot loop forever
3000	 * while a wild producer attempts to flood us.
3001 	 */
3002 	sk->sk_backlog.len = 0;
3003 }
3004 
3005 void __sk_flush_backlog(struct sock *sk)
3006 {
3007 	spin_lock_bh(&sk->sk_lock.slock);
3008 	__release_sock(sk);
3009 
3010 	if (sk->sk_prot->release_cb)
3011 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3012 				     tcp_release_cb, sk);
3013 
3014 	spin_unlock_bh(&sk->sk_lock.slock);
3015 }
3016 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3017 
3018 /**
3019  * sk_wait_data - wait for data to arrive at sk_receive_queue
3020  * @sk:    sock to wait on
3021  * @timeo: for how long
3022  * @skb:   last skb seen on sk_receive_queue
3023  *
3024 * Now socket state including sk->sk_err is changed only under the lock,
3025 * hence we may omit checks after joining the wait queue.
3026 * We check the receive queue before schedule() only as an optimization;
3027 * it is very likely that release_sock() added new data.
3028  */
3029 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3030 {
3031 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3032 	int rc;
3033 
3034 	add_wait_queue(sk_sleep(sk), &wait);
3035 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3036 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3037 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3038 	remove_wait_queue(sk_sleep(sk), &wait);
3039 	return rc;
3040 }
3041 EXPORT_SYMBOL(sk_wait_data);
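
/*
 * Illustrative sketch (guarded out): a blocking receive path calls
 * sk_wait_data() with the socket lock held; sk_wait_data() releases the
 * lock while sleeping (via sk_wait_event()) and re-takes it before
 * returning.  Error, shutdown and signal checks are omitted for brevity;
 * the function name is a placeholder.
 */
#if 0
static struct sk_buff *example_wait_for_data(struct sock *sk, int noblock,
					     int *err)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		sk_wait_data(sk, &timeo, NULL);
	}
	return skb;
}
#endif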
3042 
3043 /**
3044  *	__sk_mem_raise_allocated - increase memory_allocated
3045  *	@sk: socket
3046  *	@size: memory size to allocate
3047  *	@amt: pages to allocate
3048  *	@kind: allocation type
3049  *
3050  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3051  *
3052 *	Unlike the globally shared limits among the sockets under the same protocol,
3053 *	consuming the budget of a memcg won't have a direct effect on the others.
3054 *	So be optimistic about the memcg's tolerance, and leave it to the callers to
3055 *	decide whether or not to raise allocated, through sk_under_memory_pressure()
3056 *	or its variants.
3057  */
3058 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3059 {
3060 	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3061 	struct proto *prot = sk->sk_prot;
3062 	bool charged = false;
3063 	long allocated;
3064 
3065 	sk_memory_allocated_add(sk, amt);
3066 	allocated = sk_memory_allocated(sk);
3067 
3068 	if (memcg) {
3069 		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3070 			goto suppress_allocation;
3071 		charged = true;
3072 	}
3073 
3074 	/* Under limit. */
3075 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3076 		sk_leave_memory_pressure(sk);
3077 		return 1;
3078 	}
3079 
3080 	/* Under pressure. */
3081 	if (allocated > sk_prot_mem_limits(sk, 1))
3082 		sk_enter_memory_pressure(sk);
3083 
3084 	/* Over hard limit. */
3085 	if (allocated > sk_prot_mem_limits(sk, 2))
3086 		goto suppress_allocation;
3087 
3088 	/* Guarantee minimum buffer size under pressure (either global
3089 	 * or memcg) to make sure features described in RFC 7323 (TCP
3090 	 * Extensions for High Performance) work properly.
3091 	 *
3092	 * This rule does NOT apply when usage exceeds the global or memcg hard
3093	 * limit, or else a DoS attack could take place by spawning
3094	 * lots of sockets whose usage stays under the minimum buffer size.
3095 	 */
3096 	if (kind == SK_MEM_RECV) {
3097 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3098 			return 1;
3099 
3100 	} else { /* SK_MEM_SEND */
3101 		int wmem0 = sk_get_wmem0(sk, prot);
3102 
3103 		if (sk->sk_type == SOCK_STREAM) {
3104 			if (sk->sk_wmem_queued < wmem0)
3105 				return 1;
3106 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3107			return 1;
3108 		}
3109 	}
3110 
3111 	if (sk_has_memory_pressure(sk)) {
3112 		u64 alloc;
3113 
3114 		/* The following 'average' heuristic is within the
3115 		 * scope of global accounting, so it only makes
3116 		 * sense for global memory pressure.
3117 		 */
3118 		if (!sk_under_global_memory_pressure(sk))
3119 			return 1;
3120 
3121		 * Try to be fair among all the sockets under global
3122		 * pressure by allowing the ones whose usage is below
3123		 * average to raise it.
3124 		 */
3125 		alloc = sk_sockets_allocated_read_positive(sk);
3126 		if (sk_prot_mem_limits(sk, 2) > alloc *
3127 		    sk_mem_pages(sk->sk_wmem_queued +
3128 				 atomic_read(&sk->sk_rmem_alloc) +
3129 				 sk->sk_forward_alloc))
3130 			return 1;
3131 	}
3132 
3133 suppress_allocation:
3134 
3135 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3136 		sk_stream_moderate_sndbuf(sk);
3137 
3138		/* Fail only if the socket is _under_ its sndbuf.
3139		 * In this case we cannot block, so we have to fail.
3140 		 */
3141 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3142 			/* Force charge with __GFP_NOFAIL */
3143 			if (memcg && !charged) {
3144 				mem_cgroup_charge_skmem(memcg, amt,
3145 					gfp_memcg_charge() | __GFP_NOFAIL);
3146 			}
3147 			return 1;
3148 		}
3149 	}
3150 
3151 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3152 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3153 
3154 	sk_memory_allocated_sub(sk, amt);
3155 
3156 	if (charged)
3157 		mem_cgroup_uncharge_skmem(memcg, amt);
3158 
3159 	return 0;
3160 }
3161 
3162 /**
3163  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3164  *	@sk: socket
3165  *	@size: memory size to allocate
3166  *	@kind: allocation type
3167  *
3168  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3169  *	rmem allocation. This function assumes that protocols which have
3170  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3171  */
3172 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3173 {
3174 	int ret, amt = sk_mem_pages(size);
3175 
3176 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3177 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3178 	if (!ret)
3179 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3180 	return ret;
3181 }
3182 EXPORT_SYMBOL(__sk_mem_schedule);
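
/*
 * Illustrative sketch (guarded out): receive-side charging before queueing.
 * Protocols normally use the sk_rmem_schedule() wrapper, which falls back
 * to __sk_mem_schedule() in PAGE_SIZE quanta when sk_forward_alloc is too
 * small.  The function name is a placeholder.
 */
#if 0
static int example_charge_and_queue(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;

	skb_set_owner_r(skb, sk);	/* charges sk_rmem_alloc, consumes fwd alloc */
	skb_queue_tail(&sk->sk_receive_queue, skb);
	return 0;
}
#endif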
3183 
3184 /**
3185  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3186  *	@sk: socket
3187  *	@amount: number of quanta
3188  *
3189  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3190  */
3191 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3192 {
3193 	sk_memory_allocated_sub(sk, amount);
3194 
3195 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3196 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3197 
3198 	if (sk_under_global_memory_pressure(sk) &&
3199 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3200 		sk_leave_memory_pressure(sk);
3201 }
3202 
3203 /**
3204  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3205  *	@sk: socket
3206  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3207  */
3208 void __sk_mem_reclaim(struct sock *sk, int amount)
3209 {
3210 	amount >>= PAGE_SHIFT;
3211 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3212 	__sk_mem_reduce_allocated(sk, amount);
3213 }
3214 EXPORT_SYMBOL(__sk_mem_reclaim);
3215 
3216 int sk_set_peek_off(struct sock *sk, int val)
3217 {
3218 	WRITE_ONCE(sk->sk_peek_off, val);
3219 	return 0;
3220 }
3221 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3222 
3223 /*
3224  * Set of default routines for initialising struct proto_ops when
3225  * the protocol does not support a particular function. In certain
3226  * cases where it makes no sense for a protocol to have a "do nothing"
3227  * function, some default processing is provided.
3228  */
3229 
3230 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3231 {
3232 	return -EOPNOTSUPP;
3233 }
3234 EXPORT_SYMBOL(sock_no_bind);
3235 
3236 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3237 		    int len, int flags)
3238 {
3239 	return -EOPNOTSUPP;
3240 }
3241 EXPORT_SYMBOL(sock_no_connect);
3242 
3243 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3244 {
3245 	return -EOPNOTSUPP;
3246 }
3247 EXPORT_SYMBOL(sock_no_socketpair);
3248 
3249 int sock_no_accept(struct socket *sock, struct socket *newsock,
3250 		   struct proto_accept_arg *arg)
3251 {
3252 	return -EOPNOTSUPP;
3253 }
3254 EXPORT_SYMBOL(sock_no_accept);
3255 
3256 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3257 		    int peer)
3258 {
3259 	return -EOPNOTSUPP;
3260 }
3261 EXPORT_SYMBOL(sock_no_getname);
3262 
3263 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3264 {
3265 	return -EOPNOTSUPP;
3266 }
3267 EXPORT_SYMBOL(sock_no_ioctl);
3268 
3269 int sock_no_listen(struct socket *sock, int backlog)
3270 {
3271 	return -EOPNOTSUPP;
3272 }
3273 EXPORT_SYMBOL(sock_no_listen);
3274 
3275 int sock_no_shutdown(struct socket *sock, int how)
3276 {
3277 	return -EOPNOTSUPP;
3278 }
3279 EXPORT_SYMBOL(sock_no_shutdown);
3280 
3281 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3282 {
3283 	return -EOPNOTSUPP;
3284 }
3285 EXPORT_SYMBOL(sock_no_sendmsg);
3286 
3287 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3288 {
3289 	return -EOPNOTSUPP;
3290 }
3291 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3292 
3293 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3294 		    int flags)
3295 {
3296 	return -EOPNOTSUPP;
3297 }
3298 EXPORT_SYMBOL(sock_no_recvmsg);
3299 
3300 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3301 {
3302 	/* Mirror missing mmap method error code */
3303 	return -ENODEV;
3304 }
3305 EXPORT_SYMBOL(sock_no_mmap);
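
/*
 * Illustrative sketch (guarded out): a proto_ops table for a hypothetical
 * datagram-only family, plugging the operations it does not support with
 * the sock_no_*() stubs above.  PF_EXAMPLE and the elided handlers are
 * placeholders.
 */
#if 0
static const struct proto_ops example_dgram_ops = {
	.family		= PF_EXAMPLE,
	.owner		= THIS_MODULE,
	.listen		= sock_no_listen,
	.accept		= sock_no_accept,
	.socketpair	= sock_no_socketpair,
	.shutdown	= sock_no_shutdown,
	.mmap		= sock_no_mmap,
	/* .release, .bind, .sendmsg, .recvmsg, ... provided by the protocol */
};
#endif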
3306 
3307 /*
3308  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3309  * various sock-based usage counts.
3310  */
3311 void __receive_sock(struct file *file)
3312 {
3313 	struct socket *sock;
3314 
3315 	sock = sock_from_file(file);
3316 	if (sock) {
3317 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3318 		sock_update_classid(&sock->sk->sk_cgrp_data);
3319 	}
3320 }
3321 
3322 /*
3323  *	Default Socket Callbacks
3324  */
3325 
3326 static void sock_def_wakeup(struct sock *sk)
3327 {
3328 	struct socket_wq *wq;
3329 
3330 	rcu_read_lock();
3331 	wq = rcu_dereference(sk->sk_wq);
3332 	if (skwq_has_sleeper(wq))
3333 		wake_up_interruptible_all(&wq->wait);
3334 	rcu_read_unlock();
3335 }
3336 
3337 static void sock_def_error_report(struct sock *sk)
3338 {
3339 	struct socket_wq *wq;
3340 
3341 	rcu_read_lock();
3342 	wq = rcu_dereference(sk->sk_wq);
3343 	if (skwq_has_sleeper(wq))
3344 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3345 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3346 	rcu_read_unlock();
3347 }
3348 
3349 void sock_def_readable(struct sock *sk)
3350 {
3351 	struct socket_wq *wq;
3352 
3353 	trace_sk_data_ready(sk);
3354 
3355 	rcu_read_lock();
3356 	wq = rcu_dereference(sk->sk_wq);
3357 	if (skwq_has_sleeper(wq))
3358 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3359 						EPOLLRDNORM | EPOLLRDBAND);
3360 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3361 	rcu_read_unlock();
3362 }
3363 
3364 static void sock_def_write_space(struct sock *sk)
3365 {
3366 	struct socket_wq *wq;
3367 
3368 	rcu_read_lock();
3369 
3370 	/* Do not wake up a writer until he can make "significant"
3371 	 * progress.  --DaveM
3372 	 */
3373 	if (sock_writeable(sk)) {
3374 		wq = rcu_dereference(sk->sk_wq);
3375 		if (skwq_has_sleeper(wq))
3376 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3377 						EPOLLWRNORM | EPOLLWRBAND);
3378 
3379 		/* Should agree with poll, otherwise some programs break */
3380 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3381 	}
3382 
3383 	rcu_read_unlock();
3384 }
3385 
3386 /* An optimised version of sock_def_write_space(); it should only be called
3387  * for SOCK_RCU_FREE sockets, under an RCU read-side section, and after putting
3388  * ->sk_wmem_alloc.
3389  */
3390 static void sock_def_write_space_wfree(struct sock *sk)
3391 {
3392 	/* Do not wake up a writer until he can make "significant"
3393 	 * progress.  --DaveM
3394 	 */
3395 	if (sock_writeable(sk)) {
3396 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3397 
3398 		/* rely on refcount_sub from sock_wfree() */
3399 		smp_mb__after_atomic();
3400 		if (wq && waitqueue_active(&wq->wait))
3401 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3402 						EPOLLWRNORM | EPOLLWRBAND);
3403 
3404 		/* Should agree with poll, otherwise some programs break */
3405 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3406 	}
3407 }
3408 
3409 static void sock_def_destruct(struct sock *sk)
3410 {
3411 }
3412 
3413 void sk_send_sigurg(struct sock *sk)
3414 {
3415 	if (sk->sk_socket && sk->sk_socket->file)
3416 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3417 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3418 }
3419 EXPORT_SYMBOL(sk_send_sigurg);
3420 
3421 void sk_reset_timer(struct sock *sk, struct timer_list *timer,
3422 		    unsigned long expires)
3423 {
3424 	if (!mod_timer(timer, expires))
3425 		sock_hold(sk);
3426 }
3427 EXPORT_SYMBOL(sk_reset_timer);
3428 
3429 void sk_stop_timer(struct sock *sk, struct timer_list *timer)
3430 {
3431 	if (del_timer(timer))
3432 		__sock_put(sk);
3433 }
3434 EXPORT_SYMBOL(sk_stop_timer);
3435 
3436 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3437 {
3438 	if (del_timer_sync(timer))
3439 		__sock_put(sk);
3440 }
3441 EXPORT_SYMBOL(sk_stop_timer_sync);
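
/*
 * Illustrative sketch (guarded out): sk_reset_timer() takes a reference on
 * the socket when the timer was not already pending, and the matching
 * sk_stop_timer() (or sk_stop_timer_sync() on teardown) drops it, so the
 * socket cannot go away underneath a pending timer.  The function names
 * are placeholders.
 */
#if 0
static void example_arm_keepalive(struct sock *sk, unsigned long delay)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
}

static void example_disarm_keepalive(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}
#endif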
3442 
3443 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3444 {
3445 	sk_init_common(sk);
3446 	sk->sk_send_head	=	NULL;
3447 
3448 	timer_setup(&sk->sk_timer, NULL, 0);
3449 
3450 	sk->sk_allocation	=	GFP_KERNEL;
3451 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3452 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3453 	sk->sk_state		=	TCP_CLOSE;
3454 	sk->sk_use_task_frag	=	true;
3455 	sk_set_socket(sk, sock);
3456 
3457 	sock_set_flag(sk, SOCK_ZAPPED);
3458 
3459 	if (sock) {
3460 		sk->sk_type	=	sock->type;
3461 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3462 		sock->sk	=	sk;
3463 	} else {
3464 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3465 	}
3466 	sk->sk_uid	=	uid;
3467 
3468 	sk->sk_state_change	=	sock_def_wakeup;
3469 	sk->sk_data_ready	=	sock_def_readable;
3470 	sk->sk_write_space	=	sock_def_write_space;
3471 	sk->sk_error_report	=	sock_def_error_report;
3472 	sk->sk_destruct		=	sock_def_destruct;
3473 
3474 	sk->sk_frag.page	=	NULL;
3475 	sk->sk_frag.offset	=	0;
3476 	sk->sk_peek_off		=	-1;
3477 
3478 	sk->sk_peer_pid 	=	NULL;
3479 	sk->sk_peer_cred	=	NULL;
3480 	spin_lock_init(&sk->sk_peer_lock);
3481 
3482 	sk->sk_write_pending	=	0;
3483 	sk->sk_rcvlowat		=	1;
3484 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3485 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3486 
3487 	sk->sk_stamp = SK_DEFAULT_STAMP;
3488 #if BITS_PER_LONG==32
3489 	seqlock_init(&sk->sk_stamp_seq);
3490 #endif
3491 	atomic_set(&sk->sk_zckey, 0);
3492 
3493 #ifdef CONFIG_NET_RX_BUSY_POLL
3494 	sk->sk_napi_id		=	0;
3495 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3496 #endif
3497 
3498 	sk->sk_max_pacing_rate = ~0UL;
3499 	sk->sk_pacing_rate = ~0UL;
3500 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3501 	sk->sk_incoming_cpu = -1;
3502 
3503 	sk_rx_queue_clear(sk);
3504 	/*
3505 	 * Before updating sk_refcnt, we must commit prior changes to memory
3506 	 * (Documentation/RCU/rculist_nulls.rst for details)
3507 	 */
3508 	smp_wmb();
3509 	refcount_set(&sk->sk_refcnt, 1);
3510 	atomic_set(&sk->sk_drops, 0);
3511 }
3512 EXPORT_SYMBOL(sock_init_data_uid);
3513 
3514 void sock_init_data(struct socket *sock, struct sock *sk)
3515 {
3516 	kuid_t uid = sock ?
3517 		SOCK_INODE(sock)->i_uid :
3518 		make_kuid(sock_net(sk)->user_ns, 0);
3519 
3520 	sock_init_data_uid(sock, sk, uid);
3521 }
3522 EXPORT_SYMBOL(sock_init_data);
3523 
3524 void lock_sock_nested(struct sock *sk, int subclass)
3525 {
3526 	/* The sk_lock has mutex_lock() semantics here. */
3527 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3528 
3529 	might_sleep();
3530 	spin_lock_bh(&sk->sk_lock.slock);
3531 	if (sock_owned_by_user_nocheck(sk))
3532 		__lock_sock(sk);
3533 	sk->sk_lock.owned = 1;
3534 	spin_unlock_bh(&sk->sk_lock.slock);
3535 }
3536 EXPORT_SYMBOL(lock_sock_nested);
3537 
3538 void release_sock(struct sock *sk)
3539 {
3540 	spin_lock_bh(&sk->sk_lock.slock);
3541 	if (sk->sk_backlog.tail)
3542 		__release_sock(sk);
3543 
3544 	if (sk->sk_prot->release_cb)
3545 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3546 				     tcp_release_cb, sk);
3547 
3548 	sock_release_ownership(sk);
3549 	if (waitqueue_active(&sk->sk_lock.wq))
3550 		wake_up(&sk->sk_lock.wq);
3551 	spin_unlock_bh(&sk->sk_lock.slock);
3552 }
3553 EXPORT_SYMBOL(release_sock);
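
/*
 * Illustrative sketch (guarded out): the canonical ownership pattern.
 * lock_sock() may sleep and diverts softirq input to the backlog;
 * release_sock() then replays that backlog before waking other lockers.
 * The function name is a placeholder.
 */
#if 0
static void example_update_under_lock(struct sock *sk, u32 prio)
{
	lock_sock(sk);
	WRITE_ONCE(sk->sk_priority, prio);	/* state guarded by the owner lock */
	release_sock(sk);
}
#endif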
3554 
3555 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3556 {
3557 	might_sleep();
3558 	spin_lock_bh(&sk->sk_lock.slock);
3559 
3560 	if (!sock_owned_by_user_nocheck(sk)) {
3561 		/*
3562 		 * Fast path return with bottom halves disabled and
3563 		 * sock::sk_lock.slock held.
3564 		 *
3565 		 * The 'mutex' is not contended and holding
3566		 * sock::sk_lock.slock prevents all other lockers from
3567		 * proceeding, so the corresponding unlock_sock_fast() can
3568 		 * avoid the slow path of release_sock() completely and
3569 		 * just release slock.
3570 		 *
3571		 * From a semantic POV this is equivalent to 'acquiring'
3572 		 * the 'mutex', hence the corresponding lockdep
3573 		 * mutex_release() has to happen in the fast path of
3574 		 * unlock_sock_fast().
3575 		 */
3576 		return false;
3577 	}
3578 
3579 	__lock_sock(sk);
3580 	sk->sk_lock.owned = 1;
3581 	__acquire(&sk->sk_lock.slock);
3582 	spin_unlock_bh(&sk->sk_lock.slock);
3583 	return true;
3584 }
3585 EXPORT_SYMBOL(__lock_sock_fast);
3586 
3587 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3588 		   bool timeval, bool time32)
3589 {
3590 	struct sock *sk = sock->sk;
3591 	struct timespec64 ts;
3592 
3593 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3594 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3595 	if (ts.tv_sec == -1)
3596 		return -ENOENT;
3597 	if (ts.tv_sec == 0) {
3598 		ktime_t kt = ktime_get_real();
3599 		sock_write_timestamp(sk, kt);
3600 		ts = ktime_to_timespec64(kt);
3601 	}
3602 
3603 	if (timeval)
3604 		ts.tv_nsec /= 1000;
3605 
3606 #ifdef CONFIG_COMPAT_32BIT_TIME
3607 	if (time32)
3608 		return put_old_timespec32(&ts, userstamp);
3609 #endif
3610 #ifdef CONFIG_SPARC64
3611 	/* beware of padding in sparc64 timeval */
3612 	if (timeval && !in_compat_syscall()) {
3613 		struct __kernel_old_timeval __user tv = {
3614 			.tv_sec = ts.tv_sec,
3615 			.tv_usec = ts.tv_nsec,
3616 		};
3617 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3618 			return -EFAULT;
3619 		return 0;
3620 	}
3621 #endif
3622 	return put_timespec64(&ts, userstamp);
3623 }
3624 EXPORT_SYMBOL(sock_gettstamp);
3625 
3626 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3627 {
3628 	if (!sock_flag(sk, flag)) {
3629 		unsigned long previous_flags = sk->sk_flags;
3630 
3631 		sock_set_flag(sk, flag);
3632 		/*
3633		 * We just set one of the two flags which require net
3634		 * time stamping, but time stamping might have been on
3635		 * already because of the other one.
3636 		 */
3637 		if (sock_needs_netstamp(sk) &&
3638 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3639 			net_enable_timestamp();
3640 	}
3641 }
3642 
3643 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3644 		       int level, int type)
3645 {
3646 	struct sock_exterr_skb *serr;
3647 	struct sk_buff *skb;
3648 	int copied, err;
3649 
3650 	err = -EAGAIN;
3651 	skb = sock_dequeue_err_skb(sk);
3652 	if (skb == NULL)
3653 		goto out;
3654 
3655 	copied = skb->len;
3656 	if (copied > len) {
3657 		msg->msg_flags |= MSG_TRUNC;
3658 		copied = len;
3659 	}
3660 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3661 	if (err)
3662 		goto out_free_skb;
3663 
3664 	sock_recv_timestamp(msg, sk, skb);
3665 
3666 	serr = SKB_EXT_ERR(skb);
3667 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3668 
3669 	msg->msg_flags |= MSG_ERRQUEUE;
3670 	err = copied;
3671 
3672 out_free_skb:
3673 	kfree_skb(skb);
3674 out:
3675 	return err;
3676 }
3677 EXPORT_SYMBOL(sock_recv_errqueue);
3678 
3679 /*
3680  *	Get a socket option on a socket.
3681  *
3682  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3683  *	asynchronous errors should be reported by getsockopt. We assume
3684  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3685  */
3686 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3687 			   char __user *optval, int __user *optlen)
3688 {
3689 	struct sock *sk = sock->sk;
3690 
3691 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3692 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3693 }
3694 EXPORT_SYMBOL(sock_common_getsockopt);
3695 
3696 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3697 			int flags)
3698 {
3699 	struct sock *sk = sock->sk;
3700 	int addr_len = 0;
3701 	int err;
3702 
3703 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3704 	if (err >= 0)
3705 		msg->msg_namelen = addr_len;
3706 	return err;
3707 }
3708 EXPORT_SYMBOL(sock_common_recvmsg);
3709 
3710 /*
3711  *	Set socket options on an inet socket.
3712  */
3713 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3714 			   sockptr_t optval, unsigned int optlen)
3715 {
3716 	struct sock *sk = sock->sk;
3717 
3718 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3719 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3720 }
3721 EXPORT_SYMBOL(sock_common_setsockopt);
3722 
3723 void sk_common_release(struct sock *sk)
3724 {
3725 	if (sk->sk_prot->destroy)
3726 		sk->sk_prot->destroy(sk);
3727 
3728 	/*
3729	 * Observation: when sk_common_release() is called, processes have
3730	 * no access to the socket, but the network stack still does.
3731 	 * Step one, detach it from networking:
3732 	 *
3733 	 * A. Remove from hash tables.
3734 	 */
3735 
3736 	sk->sk_prot->unhash(sk);
3737 
3738 	/*
3739	 * At this point the socket cannot receive new packets, but it is possible
3740	 * that some packets are still in flight because some CPU ran the receiver
3741	 * and did its hash table lookup before we unhashed the socket. They will
3742	 * reach the receive queue and be purged by the socket destructor.
3743	 *
3744	 * We also still have packets pending on the receive queue and, probably,
3745	 * our own packets waiting in device queues. sock_destroy() will drain the
3746	 * receive queue, but transmitted packets will delay socket destruction
3747	 * until the last reference is released.
3748 	 */
3749 
3750 	sock_orphan(sk);
3751 
3752 	xfrm_sk_free_policy(sk);
3753 
3754 	sock_put(sk);
3755 }
3756 EXPORT_SYMBOL(sk_common_release);
3757 
3758 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3759 {
3760 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3761 
3762 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3763 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3764 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3765 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3766 	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3767 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3768 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3769 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3770 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3771 }
3772 
3773 #ifdef CONFIG_PROC_FS
3774 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3775 
3776 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3777 {
3778 	int cpu, idx = prot->inuse_idx;
3779 	int res = 0;
3780 
3781 	for_each_possible_cpu(cpu)
3782 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3783 
3784 	return res >= 0 ? res : 0;
3785 }
3786 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3787 
3788 int sock_inuse_get(struct net *net)
3789 {
3790 	int cpu, res = 0;
3791 
3792 	for_each_possible_cpu(cpu)
3793 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3794 
3795 	return res;
3796 }
3798 EXPORT_SYMBOL_GPL(sock_inuse_get);
3799 
3800 static int __net_init sock_inuse_init_net(struct net *net)
3801 {
3802 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3803 	if (net->core.prot_inuse == NULL)
3804 		return -ENOMEM;
3805 	return 0;
3806 }
3807 
3808 static void __net_exit sock_inuse_exit_net(struct net *net)
3809 {
3810 	free_percpu(net->core.prot_inuse);
3811 }
3812 
3813 static struct pernet_operations net_inuse_ops = {
3814 	.init = sock_inuse_init_net,
3815 	.exit = sock_inuse_exit_net,
3816 };
3817 
3818 static __init int net_inuse_init(void)
3819 {
3820 	if (register_pernet_subsys(&net_inuse_ops))
3821 		panic("Cannot initialize net inuse counters");
3822 
3823 	return 0;
3824 }
3825 
3826 core_initcall(net_inuse_init);
3827 
3828 static int assign_proto_idx(struct proto *prot)
3829 {
3830 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3831 
3832 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3833 		pr_err("PROTO_INUSE_NR exhausted\n");
3834 		return -ENOSPC;
3835 	}
3836 
3837 	set_bit(prot->inuse_idx, proto_inuse_idx);
3838 	return 0;
3839 }
3840 
3841 static void release_proto_idx(struct proto *prot)
3842 {
3843 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3844 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3845 }
3846 #else
3847 static inline int assign_proto_idx(struct proto *prot)
3848 {
3849 	return 0;
3850 }
3851 
3852 static inline void release_proto_idx(struct proto *prot)
3853 {
3854 }
3855 
3856 #endif
3857 
3858 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3859 {
3860 	if (!twsk_prot)
3861 		return;
3862 	kfree(twsk_prot->twsk_slab_name);
3863 	twsk_prot->twsk_slab_name = NULL;
3864 	kmem_cache_destroy(twsk_prot->twsk_slab);
3865 	twsk_prot->twsk_slab = NULL;
3866 }
3867 
3868 static int tw_prot_init(const struct proto *prot)
3869 {
3870 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3871 
3872 	if (!twsk_prot)
3873 		return 0;
3874 
3875 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3876 					      prot->name);
3877 	if (!twsk_prot->twsk_slab_name)
3878 		return -ENOMEM;
3879 
3880 	twsk_prot->twsk_slab =
3881 		kmem_cache_create(twsk_prot->twsk_slab_name,
3882 				  twsk_prot->twsk_obj_size, 0,
3883 				  SLAB_ACCOUNT | prot->slab_flags,
3884 				  NULL);
3885 	if (!twsk_prot->twsk_slab) {
3886 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3887 			prot->name);
3888 		return -ENOMEM;
3889 	}
3890 
3891 	return 0;
3892 }
3893 
3894 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3895 {
3896 	if (!rsk_prot)
3897 		return;
3898 	kfree(rsk_prot->slab_name);
3899 	rsk_prot->slab_name = NULL;
3900 	kmem_cache_destroy(rsk_prot->slab);
3901 	rsk_prot->slab = NULL;
3902 }
3903 
3904 static int req_prot_init(const struct proto *prot)
3905 {
3906 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3907 
3908 	if (!rsk_prot)
3909 		return 0;
3910 
3911 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3912 					prot->name);
3913 	if (!rsk_prot->slab_name)
3914 		return -ENOMEM;
3915 
3916 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3917 					   rsk_prot->obj_size, 0,
3918 					   SLAB_ACCOUNT | prot->slab_flags,
3919 					   NULL);
3920 
3921 	if (!rsk_prot->slab) {
3922 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3923 			prot->name);
3924 		return -ENOMEM;
3925 	}
3926 	return 0;
3927 }
3928 
3929 int proto_register(struct proto *prot, int alloc_slab)
3930 {
3931 	int ret = -ENOBUFS;
3932 
3933 	if (prot->memory_allocated && !prot->sysctl_mem) {
3934 		pr_err("%s: missing sysctl_mem\n", prot->name);
3935 		return -EINVAL;
3936 	}
3937 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3938 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3939 		return -EINVAL;
3940 	}
3941 	if (alloc_slab) {
3942 		prot->slab = kmem_cache_create_usercopy(prot->name,
3943 					prot->obj_size, 0,
3944 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3945 					prot->slab_flags,
3946 					prot->useroffset, prot->usersize,
3947 					NULL);
3948 
3949 		if (prot->slab == NULL) {
3950 			pr_crit("%s: Can't create sock SLAB cache!\n",
3951 				prot->name);
3952 			goto out;
3953 		}
3954 
3955 		if (req_prot_init(prot))
3956 			goto out_free_request_sock_slab;
3957 
3958 		if (tw_prot_init(prot))
3959 			goto out_free_timewait_sock_slab;
3960 	}
3961 
3962 	mutex_lock(&proto_list_mutex);
3963 	ret = assign_proto_idx(prot);
3964 	if (ret) {
3965 		mutex_unlock(&proto_list_mutex);
3966 		goto out_free_timewait_sock_slab;
3967 	}
3968 	list_add(&prot->node, &proto_list);
3969 	mutex_unlock(&proto_list_mutex);
3970 	return ret;
3971 
3972 out_free_timewait_sock_slab:
3973 	if (alloc_slab)
3974 		tw_prot_cleanup(prot->twsk_prot);
3975 out_free_request_sock_slab:
3976 	if (alloc_slab) {
3977 		req_prot_cleanup(prot->rsk_prot);
3978 
3979 		kmem_cache_destroy(prot->slab);
3980 		prot->slab = NULL;
3981 	}
3982 out:
3983 	return ret;
3984 }
3985 EXPORT_SYMBOL(proto_register);
3986 
3987 void proto_unregister(struct proto *prot)
3988 {
3989 	mutex_lock(&proto_list_mutex);
3990 	release_proto_idx(prot);
3991 	list_del(&prot->node);
3992 	mutex_unlock(&proto_list_mutex);
3993 
3994 	kmem_cache_destroy(prot->slab);
3995 	prot->slab = NULL;
3996 
3997 	req_prot_cleanup(prot->rsk_prot);
3998 	tw_prot_cleanup(prot->twsk_prot);
3999 }
4000 EXPORT_SYMBOL(proto_unregister);
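/*
 * Typical registration from a protocol module, as a minimal sketch (the
 * "foo_*" names are hypothetical; passing alloc_slab == 1 creates the "FOO"
 * kmem cache):
 *
 *	static struct proto foo_prot = {
 *		.name	  = "FOO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct foo_sock),
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return proto_register(&foo_prot, 1);
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		proto_unregister(&foo_prot);
 *	}
 */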
4001 
4002 int sock_load_diag_module(int family, int protocol)
4003 {
4004 	if (!protocol) {
4005 		if (!sock_is_registered(family))
4006 			return -ENOENT;
4007 
4008 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4009 				      NETLINK_SOCK_DIAG, family);
4010 	}
4011 
4012 #ifdef CONFIG_INET
4013 	if (family == AF_INET &&
4014 	    protocol != IPPROTO_RAW &&
4015 	    protocol < MAX_INET_PROTOS &&
4016 	    !rcu_access_pointer(inet_protos[protocol]))
4017 		return -ENOENT;
4018 #endif
4019 
4020 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4021 			      NETLINK_SOCK_DIAG, family, protocol);
4022 }
4023 EXPORT_SYMBOL(sock_load_diag_module);
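/* The request_module() strings above rely on *_diag modules declaring the
 * matching aliases, e.g. inet_diag answers "net-pf-16-proto-4-type-2" and
 * tcp_diag "net-pf-16-proto-4-type-2-6" (PF_NETLINK=16, NETLINK_SOCK_DIAG=4,
 * AF_INET=2, IPPROTO_TCP=6).
 */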
4024 
4025 #ifdef CONFIG_PROC_FS
4026 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4027 	__acquires(proto_list_mutex)
4028 {
4029 	mutex_lock(&proto_list_mutex);
4030 	return seq_list_start_head(&proto_list, *pos);
4031 }
4032 
4033 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4034 {
4035 	return seq_list_next(v, &proto_list, pos);
4036 }
4037 
4038 static void proto_seq_stop(struct seq_file *seq, void *v)
4039 	__releases(proto_list_mutex)
4040 {
4041 	mutex_unlock(&proto_list_mutex);
4042 }
4043 
4044 static char proto_method_implemented(const void *method)
4045 {
4046 	return method == NULL ? 'n' : 'y';
4047 }

4048 static long sock_prot_memory_allocated(struct proto *proto)
4049 {
4050 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4051 }
4052 
4053 static const char *sock_prot_memory_pressure(struct proto *proto)
4054 {
4055 	return proto->memory_pressure != NULL ?
4056 	       (proto_memory_pressure(proto) ? "yes" : "no") : "NI";
4057 }
4058 
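/* One row of /proc/net/protocols: the trailing 'y'/'n' flags come from
 * proto_method_implemented() and line up, in order, with the
 * "cl co di ac io in de sh ss gs se re bi br ha uh gp em" header emitted by
 * proto_seq_show().
 */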
4059 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4060 {
4062 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4063 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4064 		   proto->name,
4065 		   proto->obj_size,
4066 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4067 		   sock_prot_memory_allocated(proto),
4068 		   sock_prot_memory_pressure(proto),
4069 		   proto->max_header,
4070 		   proto->slab == NULL ? "no" : "yes",
4071 		   module_name(proto->owner),
4072 		   proto_method_implemented(proto->close),
4073 		   proto_method_implemented(proto->connect),
4074 		   proto_method_implemented(proto->disconnect),
4075 		   proto_method_implemented(proto->accept),
4076 		   proto_method_implemented(proto->ioctl),
4077 		   proto_method_implemented(proto->init),
4078 		   proto_method_implemented(proto->destroy),
4079 		   proto_method_implemented(proto->shutdown),
4080 		   proto_method_implemented(proto->setsockopt),
4081 		   proto_method_implemented(proto->getsockopt),
4082 		   proto_method_implemented(proto->sendmsg),
4083 		   proto_method_implemented(proto->recvmsg),
4084 		   proto_method_implemented(proto->bind),
4085 		   proto_method_implemented(proto->backlog_rcv),
4086 		   proto_method_implemented(proto->hash),
4087 		   proto_method_implemented(proto->unhash),
4088 		   proto_method_implemented(proto->get_port),
4089 		   proto_method_implemented(proto->enter_memory_pressure));
4090 }
4091 
4092 static int proto_seq_show(struct seq_file *seq, void *v)
4093 {
4094 	if (v == &proto_list)
4095 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4096 			   "protocol",
4097 			   "size",
4098 			   "sockets",
4099 			   "memory",
4100 			   "press",
4101 			   "maxhdr",
4102 			   "slab",
4103 			   "module",
4104 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4105 	else
4106 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4107 	return 0;
4108 }
4109 
4110 static const struct seq_operations proto_seq_ops = {
4111 	.start  = proto_seq_start,
4112 	.next   = proto_seq_next,
4113 	.stop   = proto_seq_stop,
4114 	.show   = proto_seq_show,
4115 };
4116 
4117 static __net_init int proto_init_net(struct net *net)
4118 {
4119 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4120 			sizeof(struct seq_net_private)))
4121 		return -ENOMEM;
4122 
4123 	return 0;
4124 }
4125 
4126 static __net_exit void proto_exit_net(struct net *net)
4127 {
4128 	remove_proc_entry("protocols", net->proc_net);
4129 }
4130
4132 static __net_initdata struct pernet_operations proto_net_ops = {
4133 	.init = proto_init_net,
4134 	.exit = proto_exit_net,
4135 };
4136 
4137 static int __init proto_init(void)
4138 {
4139 	return register_pernet_subsys(&proto_net_ops);
4140 }
4141 
4142 subsys_initcall(proto_init);
4143 
4144 #endif /* PROC_FS */
4145 
4146 #ifdef CONFIG_NET_RX_BUSY_POLL
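/* Loop-termination callback handed to napi_busy_loop() by sk_busy_loop():
 * stop busy polling once data is queued on the socket (including UDP's
 * lockless reader_queue) or the socket's busy-poll time budget has expired.
 */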
4147 bool sk_busy_loop_end(void *p, unsigned long start_time)
4148 {
4149 	struct sock *sk = p;
4150 
4151 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4152 		return true;
4153 
4154 	if (sk_is_udp(sk) &&
4155 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4156 		return true;
4157 
4158 	return sk_busy_loop_timeout(sk, start_time);
4159 }
4160 EXPORT_SYMBOL(sk_busy_loop_end);
4161 #endif /* CONFIG_NET_RX_BUSY_POLL */
4162 
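/* Bind an additional local address to an already-bound socket.  Only
 * protocols implementing ->bind_add() (e.g. SCTP) support this; all others
 * return -EOPNOTSUPP.
 */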
4163 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4164 {
4165 	if (!sk->sk_prot->bind_add)
4166 		return -EOPNOTSUPP;
4167 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4168 }
4169 EXPORT_SYMBOL(sock_bind_add);
4170 
4171 /* Copy 'size' bytes in from userspace, run the ioctl on the kernel copy, and copy 'size' bytes of result back out */
4172 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4173 		     void __user *arg, void *karg, size_t size)
4174 {
4175 	int ret;
4176 
4177 	if (copy_from_user(karg, arg, size))
4178 		return -EFAULT;
4179 
4180 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4181 	if (ret)
4182 		return ret;
4183 
4184 	if (copy_to_user(arg, karg, size))
4185 		return -EFAULT;
4186 
4187 	return 0;
4188 }
4189 EXPORT_SYMBOL(sock_ioctl_inout);
4190 
4191 /* This is the most common ioctl prep function: the result (4 bytes) is
4192  * copied back to userspace if the ioctl() succeeds; nothing is copied in
4193  * from userspace.
4194  */
4195 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4196 {
4197 	int ret, karg = 0;
4198 
4199 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4200 	if (ret)
4201 		return ret;
4202 
4203 	return put_user(karg, (int __user *)arg);
4204 }
4205 
4206 /* A wrapper around sock ioctls, which copies the data from userspace
4207  * (depending on the protocol/ioctl), and copies back the result to userspace.
4208  * The main motivation for this function is to pass kernel memory to the
4209  * protocol ioctl callbacks, instead of userspace memory.
4210  */
4211 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4212 {
4213 	int rc = 1;
4214 
4215 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4216 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4217 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4218 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4219 	else if (sk_is_phonet(sk))
4220 		rc = phonet_sk_ioctl(sk, cmd, arg);
4221 
4222 	/* If the ioctl was handled by one of the helpers above, return its result */
4223 	if (rc <= 0)
4224 		return rc;
4225 
4226 	/* Otherwise call the default handler */
4227 	return sock_ioctl_out(sk, cmd, arg);
4228 }
4229 EXPORT_SYMBOL(sk_ioctl);
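/*
 * With this wrapper, protocol ->ioctl() handlers only ever see kernel memory.
 * A minimal sketch of such a handler (the "foo_*" names are hypothetical; an
 * int-sized result is copied out to userspace by sock_ioctl_out()):
 *
 *	static int foo_ioctl(struct sock *sk, int cmd, int *karg)
 *	{
 *		switch (cmd) {
 *		case SIOCINQ:
 *			*karg = foo_readable_bytes(sk);
 *			return 0;
 *		default:
 *			return -ENOIOCTLCMD;
 *		}
 *	}
 */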
4230 
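/* Build-time layout checks: every field listed here must sit inside the
 * correspondingly named __cacheline_group_begin()/__cacheline_group_end()
 * section of struct sock, keeping fields that the RX/TX fast paths write or
 * read together on their intended cache lines.
 */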
4231 static int __init sock_struct_check(void)
4232 {
4233 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4234 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4235 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4236 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4237 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4238 
4239 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4240 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4241 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4242 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4243 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4244 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4245 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4246 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4247 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4248 
4249 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4250 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4251 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4252 
4253 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4254 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4255 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4256 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4257 
4258 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4260 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4261 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4262 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4263 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4264 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4265 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4266 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4267 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4268 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4269 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4270 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4271 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4272 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4273 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4274 
4275 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4276 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4277 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4278 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4279 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4280 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4281 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4282 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4283 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4284 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4285 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4286 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4287 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4288 	return 0;
4289 }
4290 
4291 core_initcall(sock_struct_check);
4292