xref: /linux/net/core/sock.c (revision f15e3b3ddb9fab1c1731b6154e2cd6573fb54c4d)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138 
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142 
143 #include <trace/events/sock.h>
144 
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148 
149 #include <linux/ethtool.h>
150 
151 #include "dev.h"
152 
153 static DEFINE_MUTEX(proto_list_mutex);
154 static LIST_HEAD(proto_list);
155 
156 static void sock_def_write_space_wfree(struct sock *sk);
157 static void sock_def_write_space(struct sock *sk);
158 
159 /**
160  * sk_ns_capable - General socket capability test
161  * @sk: Socket to use a capability on or through
162  * @user_ns: The user namespace of the capability to use
163  * @cap: The capability to use
164  *
165  * Test to see if the opener of the socket had when the socket was
166  * created and the current process has the capability @cap in the user
167  * namespace @user_ns.
168  */
169 bool sk_ns_capable(const struct sock *sk,
170 		   struct user_namespace *user_ns, int cap)
171 {
172 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
173 		ns_capable(user_ns, cap);
174 }
175 EXPORT_SYMBOL(sk_ns_capable);
176 
177 /**
178  * sk_capable - Socket global capability test
179  * @sk: Socket to use a capability on or through
180  * @cap: The global capability to use
181  *
182  * Test to see if the opener of the socket had when the socket was
183  * created and the current process has the capability @cap in all user
184  * namespaces.
185  */
186 bool sk_capable(const struct sock *sk, int cap)
187 {
188 	return sk_ns_capable(sk, &init_user_ns, cap);
189 }
190 EXPORT_SYMBOL(sk_capable);
191 
192 /**
193  * sk_net_capable - Network namespace socket capability test
194  * @sk: Socket to use a capability on or through
195  * @cap: The capability to use
196  *
197  * Test to see if the opener of the socket had when the socket was created
198  * and the current process has the capability @cap over the network namespace
199  * the socket is a member of.
200  */
201 bool sk_net_capable(const struct sock *sk, int cap)
202 {
203 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
204 }
205 EXPORT_SYMBOL(sk_net_capable);
206 
207 /*
208  * Each address family might have different locking rules, so we have
209  * one slock key per address family and separate keys for internal and
210  * userspace sockets.
211  */
212 static struct lock_class_key af_family_keys[AF_MAX];
213 static struct lock_class_key af_family_kern_keys[AF_MAX];
214 static struct lock_class_key af_family_slock_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
216 
217 /*
218  * Make lock validator output more readable. (we pre-construct these
219  * strings build-time, so that runtime initialization of socket
220  * locks is fast):
221  */
222 
223 #define _sock_locks(x)						  \
224   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
225   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
226   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
227   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
228   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
229   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
230   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
231   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
232   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
233   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
234   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
235   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
236   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
237   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
238   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
239   x "AF_MCTP"  , \
240   x "AF_MAX"
241 
242 static const char *const af_family_key_strings[AF_MAX+1] = {
243 	_sock_locks("sk_lock-")
244 };
245 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
246 	_sock_locks("slock-")
247 };
248 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
249 	_sock_locks("clock-")
250 };
251 
252 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
253 	_sock_locks("k-sk_lock-")
254 };
255 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
256 	_sock_locks("k-slock-")
257 };
258 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
259 	_sock_locks("k-clock-")
260 };
261 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
262 	_sock_locks("rlock-")
263 };
264 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
265 	_sock_locks("wlock-")
266 };
267 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
268 	_sock_locks("elock-")
269 };
270 
271 /*
272  * sk_callback_lock and sk queues locking rules are per-address-family,
273  * so split the lock classes by using a per-AF key:
274  */
275 static struct lock_class_key af_callback_keys[AF_MAX];
276 static struct lock_class_key af_rlock_keys[AF_MAX];
277 static struct lock_class_key af_wlock_keys[AF_MAX];
278 static struct lock_class_key af_elock_keys[AF_MAX];
279 static struct lock_class_key af_kern_callback_keys[AF_MAX];
280 
281 /* Run time adjustable parameters. */
282 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
283 EXPORT_SYMBOL(sysctl_wmem_max);
284 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
285 EXPORT_SYMBOL(sysctl_rmem_max);
286 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
287 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
288 
289 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
290 EXPORT_SYMBOL_GPL(memalloc_socks_key);
291 
292 /**
293  * sk_set_memalloc - sets %SOCK_MEMALLOC
294  * @sk: socket to set it on
295  *
296  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297  * It's the responsibility of the admin to adjust min_free_kbytes
298  * to meet the requirements
299  */
300 void sk_set_memalloc(struct sock *sk)
301 {
302 	sock_set_flag(sk, SOCK_MEMALLOC);
303 	sk->sk_allocation |= __GFP_MEMALLOC;
304 	static_branch_inc(&memalloc_socks_key);
305 }
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
307 
308 void sk_clear_memalloc(struct sock *sk)
309 {
310 	sock_reset_flag(sk, SOCK_MEMALLOC);
311 	sk->sk_allocation &= ~__GFP_MEMALLOC;
312 	static_branch_dec(&memalloc_socks_key);
313 
314 	/*
315 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 	 * it has rmem allocations due to the last swapfile being deactivated
318 	 * but there is a risk that the socket is unusable due to exceeding
319 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
320 	 */
321 	sk_mem_reclaim(sk);
322 }
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
324 
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
326 {
327 	int ret;
328 	unsigned int noreclaim_flag;
329 
330 	/* these should have been dropped before queueing */
331 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
332 
333 	noreclaim_flag = memalloc_noreclaim_save();
334 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
335 				 tcp_v6_do_rcv,
336 				 tcp_v4_do_rcv,
337 				 sk, skb);
338 	memalloc_noreclaim_restore(noreclaim_flag);
339 
340 	return ret;
341 }
342 EXPORT_SYMBOL(__sk_backlog_rcv);
343 
344 void sk_error_report(struct sock *sk)
345 {
346 	sk->sk_error_report(sk);
347 
348 	switch (sk->sk_family) {
349 	case AF_INET:
350 		fallthrough;
351 	case AF_INET6:
352 		trace_inet_sk_error_report(sk);
353 		break;
354 	default:
355 		break;
356 	}
357 }
358 EXPORT_SYMBOL(sk_error_report);
359 
360 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
361 {
362 	struct __kernel_sock_timeval tv;
363 
364 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
365 		tv.tv_sec = 0;
366 		tv.tv_usec = 0;
367 	} else {
368 		tv.tv_sec = timeo / HZ;
369 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
370 	}
371 
372 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
373 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
374 		*(struct old_timeval32 *)optval = tv32;
375 		return sizeof(tv32);
376 	}
377 
378 	if (old_timeval) {
379 		struct __kernel_old_timeval old_tv;
380 		old_tv.tv_sec = tv.tv_sec;
381 		old_tv.tv_usec = tv.tv_usec;
382 		*(struct __kernel_old_timeval *)optval = old_tv;
383 		return sizeof(old_tv);
384 	}
385 
386 	*(struct __kernel_sock_timeval *)optval = tv;
387 	return sizeof(tv);
388 }
389 EXPORT_SYMBOL(sock_get_timeout);
390 
391 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
392 			   sockptr_t optval, int optlen, bool old_timeval)
393 {
394 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
395 		struct old_timeval32 tv32;
396 
397 		if (optlen < sizeof(tv32))
398 			return -EINVAL;
399 
400 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
401 			return -EFAULT;
402 		tv->tv_sec = tv32.tv_sec;
403 		tv->tv_usec = tv32.tv_usec;
404 	} else if (old_timeval) {
405 		struct __kernel_old_timeval old_tv;
406 
407 		if (optlen < sizeof(old_tv))
408 			return -EINVAL;
409 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
410 			return -EFAULT;
411 		tv->tv_sec = old_tv.tv_sec;
412 		tv->tv_usec = old_tv.tv_usec;
413 	} else {
414 		if (optlen < sizeof(*tv))
415 			return -EINVAL;
416 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
417 			return -EFAULT;
418 	}
419 
420 	return 0;
421 }
422 EXPORT_SYMBOL(sock_copy_user_timeval);
423 
424 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
425 			    bool old_timeval)
426 {
427 	struct __kernel_sock_timeval tv;
428 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
429 	long val;
430 
431 	if (err)
432 		return err;
433 
434 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
435 		return -EDOM;
436 
437 	if (tv.tv_sec < 0) {
438 		static int warned __read_mostly;
439 
440 		WRITE_ONCE(*timeo_p, 0);
441 		if (warned < 10 && net_ratelimit()) {
442 			warned++;
443 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
444 				__func__, current->comm, task_pid_nr(current));
445 		}
446 		return 0;
447 	}
448 	val = MAX_SCHEDULE_TIMEOUT;
449 	if ((tv.tv_sec || tv.tv_usec) &&
450 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
451 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
452 						    USEC_PER_SEC / HZ);
453 	WRITE_ONCE(*timeo_p, val);
454 	return 0;
455 }
456 
457 static bool sock_needs_netstamp(const struct sock *sk)
458 {
459 	switch (sk->sk_family) {
460 	case AF_UNSPEC:
461 	case AF_UNIX:
462 		return false;
463 	default:
464 		return true;
465 	}
466 }
467 
468 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
469 {
470 	if (sk->sk_flags & flags) {
471 		sk->sk_flags &= ~flags;
472 		if (sock_needs_netstamp(sk) &&
473 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
474 			net_disable_timestamp();
475 	}
476 }
477 
478 
479 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
480 {
481 	unsigned long flags;
482 	struct sk_buff_head *list = &sk->sk_receive_queue;
483 
484 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
485 		atomic_inc(&sk->sk_drops);
486 		trace_sock_rcvqueue_full(sk, skb);
487 		return -ENOMEM;
488 	}
489 
490 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
491 		atomic_inc(&sk->sk_drops);
492 		return -ENOBUFS;
493 	}
494 
495 	skb->dev = NULL;
496 	skb_set_owner_r(skb, sk);
497 
498 	/* we escape from rcu protected region, make sure we dont leak
499 	 * a norefcounted dst
500 	 */
501 	skb_dst_force(skb);
502 
503 	spin_lock_irqsave(&list->lock, flags);
504 	sock_skb_set_dropcount(sk, skb);
505 	__skb_queue_tail(list, skb);
506 	spin_unlock_irqrestore(&list->lock, flags);
507 
508 	if (!sock_flag(sk, SOCK_DEAD))
509 		sk->sk_data_ready(sk);
510 	return 0;
511 }
512 EXPORT_SYMBOL(__sock_queue_rcv_skb);
513 
514 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
515 			      enum skb_drop_reason *reason)
516 {
517 	enum skb_drop_reason drop_reason;
518 	int err;
519 
520 	err = sk_filter(sk, skb);
521 	if (err) {
522 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
523 		goto out;
524 	}
525 	err = __sock_queue_rcv_skb(sk, skb);
526 	switch (err) {
527 	case -ENOMEM:
528 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
529 		break;
530 	case -ENOBUFS:
531 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
532 		break;
533 	default:
534 		drop_reason = SKB_NOT_DROPPED_YET;
535 		break;
536 	}
537 out:
538 	if (reason)
539 		*reason = drop_reason;
540 	return err;
541 }
542 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
543 
544 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
545 		     const int nested, unsigned int trim_cap, bool refcounted)
546 {
547 	int rc = NET_RX_SUCCESS;
548 
549 	if (sk_filter_trim_cap(sk, skb, trim_cap))
550 		goto discard_and_relse;
551 
552 	skb->dev = NULL;
553 
554 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
555 		atomic_inc(&sk->sk_drops);
556 		goto discard_and_relse;
557 	}
558 	if (nested)
559 		bh_lock_sock_nested(sk);
560 	else
561 		bh_lock_sock(sk);
562 	if (!sock_owned_by_user(sk)) {
563 		/*
564 		 * trylock + unlock semantics:
565 		 */
566 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
567 
568 		rc = sk_backlog_rcv(sk, skb);
569 
570 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
571 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
572 		bh_unlock_sock(sk);
573 		atomic_inc(&sk->sk_drops);
574 		goto discard_and_relse;
575 	}
576 
577 	bh_unlock_sock(sk);
578 out:
579 	if (refcounted)
580 		sock_put(sk);
581 	return rc;
582 discard_and_relse:
583 	kfree_skb(skb);
584 	goto out;
585 }
586 EXPORT_SYMBOL(__sk_receive_skb);
587 
588 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
589 							  u32));
590 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
591 							   u32));
592 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
593 {
594 	struct dst_entry *dst = __sk_dst_get(sk);
595 
596 	if (dst && dst->obsolete &&
597 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
598 			       dst, cookie) == NULL) {
599 		sk_tx_queue_clear(sk);
600 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
601 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
602 		dst_release(dst);
603 		return NULL;
604 	}
605 
606 	return dst;
607 }
608 EXPORT_SYMBOL(__sk_dst_check);
609 
610 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
611 {
612 	struct dst_entry *dst = sk_dst_get(sk);
613 
614 	if (dst && dst->obsolete &&
615 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
616 			       dst, cookie) == NULL) {
617 		sk_dst_reset(sk);
618 		dst_release(dst);
619 		return NULL;
620 	}
621 
622 	return dst;
623 }
624 EXPORT_SYMBOL(sk_dst_check);
625 
626 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
627 {
628 	int ret = -ENOPROTOOPT;
629 #ifdef CONFIG_NETDEVICES
630 	struct net *net = sock_net(sk);
631 
632 	/* Sorry... */
633 	ret = -EPERM;
634 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
635 		goto out;
636 
637 	ret = -EINVAL;
638 	if (ifindex < 0)
639 		goto out;
640 
641 	/* Paired with all READ_ONCE() done locklessly. */
642 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
643 
644 	if (sk->sk_prot->rehash)
645 		sk->sk_prot->rehash(sk);
646 	sk_dst_reset(sk);
647 
648 	ret = 0;
649 
650 out:
651 #endif
652 
653 	return ret;
654 }
655 
656 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
657 {
658 	int ret;
659 
660 	if (lock_sk)
661 		lock_sock(sk);
662 	ret = sock_bindtoindex_locked(sk, ifindex);
663 	if (lock_sk)
664 		release_sock(sk);
665 
666 	return ret;
667 }
668 EXPORT_SYMBOL(sock_bindtoindex);
669 
670 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
671 {
672 	int ret = -ENOPROTOOPT;
673 #ifdef CONFIG_NETDEVICES
674 	struct net *net = sock_net(sk);
675 	char devname[IFNAMSIZ];
676 	int index;
677 
678 	ret = -EINVAL;
679 	if (optlen < 0)
680 		goto out;
681 
682 	/* Bind this socket to a particular device like "eth0",
683 	 * as specified in the passed interface name. If the
684 	 * name is "" or the option length is zero the socket
685 	 * is not bound.
686 	 */
687 	if (optlen > IFNAMSIZ - 1)
688 		optlen = IFNAMSIZ - 1;
689 	memset(devname, 0, sizeof(devname));
690 
691 	ret = -EFAULT;
692 	if (copy_from_sockptr(devname, optval, optlen))
693 		goto out;
694 
695 	index = 0;
696 	if (devname[0] != '\0') {
697 		struct net_device *dev;
698 
699 		rcu_read_lock();
700 		dev = dev_get_by_name_rcu(net, devname);
701 		if (dev)
702 			index = dev->ifindex;
703 		rcu_read_unlock();
704 		ret = -ENODEV;
705 		if (!dev)
706 			goto out;
707 	}
708 
709 	sockopt_lock_sock(sk);
710 	ret = sock_bindtoindex_locked(sk, index);
711 	sockopt_release_sock(sk);
712 out:
713 #endif
714 
715 	return ret;
716 }
717 
718 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
719 				sockptr_t optlen, int len)
720 {
721 	int ret = -ENOPROTOOPT;
722 #ifdef CONFIG_NETDEVICES
723 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
724 	struct net *net = sock_net(sk);
725 	char devname[IFNAMSIZ];
726 
727 	if (bound_dev_if == 0) {
728 		len = 0;
729 		goto zero;
730 	}
731 
732 	ret = -EINVAL;
733 	if (len < IFNAMSIZ)
734 		goto out;
735 
736 	ret = netdev_get_name(net, devname, bound_dev_if);
737 	if (ret)
738 		goto out;
739 
740 	len = strlen(devname) + 1;
741 
742 	ret = -EFAULT;
743 	if (copy_to_sockptr(optval, devname, len))
744 		goto out;
745 
746 zero:
747 	ret = -EFAULT;
748 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
749 		goto out;
750 
751 	ret = 0;
752 
753 out:
754 #endif
755 
756 	return ret;
757 }
758 
759 bool sk_mc_loop(const struct sock *sk)
760 {
761 	if (dev_recursion_level())
762 		return false;
763 	if (!sk)
764 		return true;
765 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
766 	switch (READ_ONCE(sk->sk_family)) {
767 	case AF_INET:
768 		return inet_test_bit(MC_LOOP, sk);
769 #if IS_ENABLED(CONFIG_IPV6)
770 	case AF_INET6:
771 		return inet6_test_bit(MC6_LOOP, sk);
772 #endif
773 	}
774 	WARN_ON_ONCE(1);
775 	return true;
776 }
777 EXPORT_SYMBOL(sk_mc_loop);
778 
779 void sock_set_reuseaddr(struct sock *sk)
780 {
781 	lock_sock(sk);
782 	sk->sk_reuse = SK_CAN_REUSE;
783 	release_sock(sk);
784 }
785 EXPORT_SYMBOL(sock_set_reuseaddr);
786 
787 void sock_set_reuseport(struct sock *sk)
788 {
789 	lock_sock(sk);
790 	sk->sk_reuseport = true;
791 	release_sock(sk);
792 }
793 EXPORT_SYMBOL(sock_set_reuseport);
794 
795 void sock_no_linger(struct sock *sk)
796 {
797 	lock_sock(sk);
798 	WRITE_ONCE(sk->sk_lingertime, 0);
799 	sock_set_flag(sk, SOCK_LINGER);
800 	release_sock(sk);
801 }
802 EXPORT_SYMBOL(sock_no_linger);
803 
804 void sock_set_priority(struct sock *sk, u32 priority)
805 {
806 	WRITE_ONCE(sk->sk_priority, priority);
807 }
808 EXPORT_SYMBOL(sock_set_priority);
809 
810 void sock_set_sndtimeo(struct sock *sk, s64 secs)
811 {
812 	lock_sock(sk);
813 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
814 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
815 	else
816 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
817 	release_sock(sk);
818 }
819 EXPORT_SYMBOL(sock_set_sndtimeo);
820 
821 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
822 {
823 	if (val)  {
824 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
825 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
826 		sock_set_flag(sk, SOCK_RCVTSTAMP);
827 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
828 	} else {
829 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
830 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
831 	}
832 }
833 
834 void sock_enable_timestamps(struct sock *sk)
835 {
836 	lock_sock(sk);
837 	__sock_set_timestamps(sk, true, false, true);
838 	release_sock(sk);
839 }
840 EXPORT_SYMBOL(sock_enable_timestamps);
841 
842 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
843 {
844 	switch (optname) {
845 	case SO_TIMESTAMP_OLD:
846 		__sock_set_timestamps(sk, valbool, false, false);
847 		break;
848 	case SO_TIMESTAMP_NEW:
849 		__sock_set_timestamps(sk, valbool, true, false);
850 		break;
851 	case SO_TIMESTAMPNS_OLD:
852 		__sock_set_timestamps(sk, valbool, false, true);
853 		break;
854 	case SO_TIMESTAMPNS_NEW:
855 		__sock_set_timestamps(sk, valbool, true, true);
856 		break;
857 	}
858 }
859 
860 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
861 {
862 	struct net *net = sock_net(sk);
863 	struct net_device *dev = NULL;
864 	bool match = false;
865 	int *vclock_index;
866 	int i, num;
867 
868 	if (sk->sk_bound_dev_if)
869 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
870 
871 	if (!dev) {
872 		pr_err("%s: sock not bind to device\n", __func__);
873 		return -EOPNOTSUPP;
874 	}
875 
876 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
877 	dev_put(dev);
878 
879 	for (i = 0; i < num; i++) {
880 		if (*(vclock_index + i) == phc_index) {
881 			match = true;
882 			break;
883 		}
884 	}
885 
886 	if (num > 0)
887 		kfree(vclock_index);
888 
889 	if (!match)
890 		return -EINVAL;
891 
892 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
893 
894 	return 0;
895 }
896 
897 int sock_set_timestamping(struct sock *sk, int optname,
898 			  struct so_timestamping timestamping)
899 {
900 	int val = timestamping.flags;
901 	int ret;
902 
903 	if (val & ~SOF_TIMESTAMPING_MASK)
904 		return -EINVAL;
905 
906 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
907 	    !(val & SOF_TIMESTAMPING_OPT_ID))
908 		return -EINVAL;
909 
910 	if (val & SOF_TIMESTAMPING_OPT_ID &&
911 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
912 		if (sk_is_tcp(sk)) {
913 			if ((1 << sk->sk_state) &
914 			    (TCPF_CLOSE | TCPF_LISTEN))
915 				return -EINVAL;
916 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
917 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
918 			else
919 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
920 		} else {
921 			atomic_set(&sk->sk_tskey, 0);
922 		}
923 	}
924 
925 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
926 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
927 		return -EINVAL;
928 
929 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
930 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
931 		if (ret)
932 			return ret;
933 	}
934 
935 	WRITE_ONCE(sk->sk_tsflags, val);
936 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
937 
938 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
939 		sock_enable_timestamp(sk,
940 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
941 	else
942 		sock_disable_timestamp(sk,
943 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
944 	return 0;
945 }
946 
947 void sock_set_keepalive(struct sock *sk)
948 {
949 	lock_sock(sk);
950 	if (sk->sk_prot->keepalive)
951 		sk->sk_prot->keepalive(sk, true);
952 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
953 	release_sock(sk);
954 }
955 EXPORT_SYMBOL(sock_set_keepalive);
956 
957 static void __sock_set_rcvbuf(struct sock *sk, int val)
958 {
959 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
960 	 * as a negative value.
961 	 */
962 	val = min_t(int, val, INT_MAX / 2);
963 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
964 
965 	/* We double it on the way in to account for "struct sk_buff" etc.
966 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
967 	 * will allow that much actual data to be received on that socket.
968 	 *
969 	 * Applications are unaware that "struct sk_buff" and other overheads
970 	 * allocate from the receive buffer during socket buffer allocation.
971 	 *
972 	 * And after considering the possible alternatives, returning the value
973 	 * we actually used in getsockopt is the most desirable behavior.
974 	 */
975 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
976 }
977 
978 void sock_set_rcvbuf(struct sock *sk, int val)
979 {
980 	lock_sock(sk);
981 	__sock_set_rcvbuf(sk, val);
982 	release_sock(sk);
983 }
984 EXPORT_SYMBOL(sock_set_rcvbuf);
985 
986 static void __sock_set_mark(struct sock *sk, u32 val)
987 {
988 	if (val != sk->sk_mark) {
989 		WRITE_ONCE(sk->sk_mark, val);
990 		sk_dst_reset(sk);
991 	}
992 }
993 
994 void sock_set_mark(struct sock *sk, u32 val)
995 {
996 	lock_sock(sk);
997 	__sock_set_mark(sk, val);
998 	release_sock(sk);
999 }
1000 EXPORT_SYMBOL(sock_set_mark);
1001 
1002 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1003 {
1004 	/* Round down bytes to multiple of pages */
1005 	bytes = round_down(bytes, PAGE_SIZE);
1006 
1007 	WARN_ON(bytes > sk->sk_reserved_mem);
1008 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1009 	sk_mem_reclaim(sk);
1010 }
1011 
1012 static int sock_reserve_memory(struct sock *sk, int bytes)
1013 {
1014 	long allocated;
1015 	bool charged;
1016 	int pages;
1017 
1018 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1019 		return -EOPNOTSUPP;
1020 
1021 	if (!bytes)
1022 		return 0;
1023 
1024 	pages = sk_mem_pages(bytes);
1025 
1026 	/* pre-charge to memcg */
1027 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1028 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1029 	if (!charged)
1030 		return -ENOMEM;
1031 
1032 	/* pre-charge to forward_alloc */
1033 	sk_memory_allocated_add(sk, pages);
1034 	allocated = sk_memory_allocated(sk);
1035 	/* If the system goes into memory pressure with this
1036 	 * precharge, give up and return error.
1037 	 */
1038 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1039 		sk_memory_allocated_sub(sk, pages);
1040 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1041 		return -ENOMEM;
1042 	}
1043 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1044 
1045 	WRITE_ONCE(sk->sk_reserved_mem,
1046 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1047 
1048 	return 0;
1049 }
1050 
1051 #ifdef CONFIG_PAGE_POOL
1052 
1053 /* This is the number of tokens that the user can SO_DEVMEM_DONTNEED in
1054  * 1 syscall. The limit exists to limit the amount of memory the kernel
1055  * allocates to copy these tokens.
1056  */
1057 #define MAX_DONTNEED_TOKENS 128
1058 
1059 static noinline_for_stack int
1060 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1061 {
1062 	unsigned int num_tokens, i, j, k, netmem_num = 0;
1063 	struct dmabuf_token *tokens;
1064 	netmem_ref netmems[16];
1065 	int ret = 0;
1066 
1067 	if (!sk_is_tcp(sk))
1068 		return -EBADF;
1069 
1070 	if (optlen % sizeof(struct dmabuf_token) ||
1071 	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1072 		return -EINVAL;
1073 
1074 	tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL);
1075 	if (!tokens)
1076 		return -ENOMEM;
1077 
1078 	num_tokens = optlen / sizeof(struct dmabuf_token);
1079 	if (copy_from_sockptr(tokens, optval, optlen)) {
1080 		kvfree(tokens);
1081 		return -EFAULT;
1082 	}
1083 
1084 	xa_lock_bh(&sk->sk_user_frags);
1085 	for (i = 0; i < num_tokens; i++) {
1086 		for (j = 0; j < tokens[i].token_count; j++) {
1087 			netmem_ref netmem = (__force netmem_ref)__xa_erase(
1088 				&sk->sk_user_frags, tokens[i].token_start + j);
1089 
1090 			if (netmem &&
1091 			    !WARN_ON_ONCE(!netmem_is_net_iov(netmem))) {
1092 				netmems[netmem_num++] = netmem;
1093 				if (netmem_num == ARRAY_SIZE(netmems)) {
1094 					xa_unlock_bh(&sk->sk_user_frags);
1095 					for (k = 0; k < netmem_num; k++)
1096 						WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1097 					netmem_num = 0;
1098 					xa_lock_bh(&sk->sk_user_frags);
1099 				}
1100 				ret++;
1101 			}
1102 		}
1103 	}
1104 
1105 	xa_unlock_bh(&sk->sk_user_frags);
1106 	for (k = 0; k < netmem_num; k++)
1107 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1108 
1109 	kvfree(tokens);
1110 	return ret;
1111 }
1112 #endif
1113 
1114 void sockopt_lock_sock(struct sock *sk)
1115 {
1116 	/* When current->bpf_ctx is set, the setsockopt is called from
1117 	 * a bpf prog.  bpf has ensured the sk lock has been
1118 	 * acquired before calling setsockopt().
1119 	 */
1120 	if (has_current_bpf_ctx())
1121 		return;
1122 
1123 	lock_sock(sk);
1124 }
1125 EXPORT_SYMBOL(sockopt_lock_sock);
1126 
1127 void sockopt_release_sock(struct sock *sk)
1128 {
1129 	if (has_current_bpf_ctx())
1130 		return;
1131 
1132 	release_sock(sk);
1133 }
1134 EXPORT_SYMBOL(sockopt_release_sock);
1135 
1136 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1137 {
1138 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1139 }
1140 EXPORT_SYMBOL(sockopt_ns_capable);
1141 
1142 bool sockopt_capable(int cap)
1143 {
1144 	return has_current_bpf_ctx() || capable(cap);
1145 }
1146 EXPORT_SYMBOL(sockopt_capable);
1147 
1148 static int sockopt_validate_clockid(__kernel_clockid_t value)
1149 {
1150 	switch (value) {
1151 	case CLOCK_REALTIME:
1152 	case CLOCK_MONOTONIC:
1153 	case CLOCK_TAI:
1154 		return 0;
1155 	}
1156 	return -EINVAL;
1157 }
1158 
1159 /*
1160  *	This is meant for all protocols to use and covers goings on
1161  *	at the socket level. Everything here is generic.
1162  */
1163 
1164 int sk_setsockopt(struct sock *sk, int level, int optname,
1165 		  sockptr_t optval, unsigned int optlen)
1166 {
1167 	struct so_timestamping timestamping;
1168 	struct socket *sock = sk->sk_socket;
1169 	struct sock_txtime sk_txtime;
1170 	int val;
1171 	int valbool;
1172 	struct linger ling;
1173 	int ret = 0;
1174 
1175 	/*
1176 	 *	Options without arguments
1177 	 */
1178 
1179 	if (optname == SO_BINDTODEVICE)
1180 		return sock_setbindtodevice(sk, optval, optlen);
1181 
1182 	if (optlen < sizeof(int))
1183 		return -EINVAL;
1184 
1185 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1186 		return -EFAULT;
1187 
1188 	valbool = val ? 1 : 0;
1189 
1190 	/* handle options which do not require locking the socket. */
1191 	switch (optname) {
1192 	case SO_PRIORITY:
1193 		if ((val >= 0 && val <= 6) ||
1194 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1195 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1196 			sock_set_priority(sk, val);
1197 			return 0;
1198 		}
1199 		return -EPERM;
1200 	case SO_PASSSEC:
1201 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1202 		return 0;
1203 	case SO_PASSCRED:
1204 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1205 		return 0;
1206 	case SO_PASSPIDFD:
1207 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1208 		return 0;
1209 	case SO_TYPE:
1210 	case SO_PROTOCOL:
1211 	case SO_DOMAIN:
1212 	case SO_ERROR:
1213 		return -ENOPROTOOPT;
1214 #ifdef CONFIG_NET_RX_BUSY_POLL
1215 	case SO_BUSY_POLL:
1216 		if (val < 0)
1217 			return -EINVAL;
1218 		WRITE_ONCE(sk->sk_ll_usec, val);
1219 		return 0;
1220 	case SO_PREFER_BUSY_POLL:
1221 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1222 			return -EPERM;
1223 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1224 		return 0;
1225 	case SO_BUSY_POLL_BUDGET:
1226 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1227 		    !sockopt_capable(CAP_NET_ADMIN))
1228 			return -EPERM;
1229 		if (val < 0 || val > U16_MAX)
1230 			return -EINVAL;
1231 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1232 		return 0;
1233 #endif
1234 	case SO_MAX_PACING_RATE:
1235 		{
1236 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1237 		unsigned long pacing_rate;
1238 
1239 		if (sizeof(ulval) != sizeof(val) &&
1240 		    optlen >= sizeof(ulval) &&
1241 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1242 			return -EFAULT;
1243 		}
1244 		if (ulval != ~0UL)
1245 			cmpxchg(&sk->sk_pacing_status,
1246 				SK_PACING_NONE,
1247 				SK_PACING_NEEDED);
1248 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1249 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1250 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1251 		if (ulval < pacing_rate)
1252 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1253 		return 0;
1254 		}
1255 	case SO_TXREHASH:
1256 		if (val < -1 || val > 1)
1257 			return -EINVAL;
1258 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1259 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1260 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1261 		 * and sk_getsockopt().
1262 		 */
1263 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1264 		return 0;
1265 	case SO_PEEK_OFF:
1266 		{
1267 		int (*set_peek_off)(struct sock *sk, int val);
1268 
1269 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1270 		if (set_peek_off)
1271 			ret = set_peek_off(sk, val);
1272 		else
1273 			ret = -EOPNOTSUPP;
1274 		return ret;
1275 		}
1276 #ifdef CONFIG_PAGE_POOL
1277 	case SO_DEVMEM_DONTNEED:
1278 		return sock_devmem_dontneed(sk, optval, optlen);
1279 #endif
1280 	}
1281 
1282 	sockopt_lock_sock(sk);
1283 
1284 	switch (optname) {
1285 	case SO_DEBUG:
1286 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1287 			ret = -EACCES;
1288 		else
1289 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1290 		break;
1291 	case SO_REUSEADDR:
1292 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1293 		break;
1294 	case SO_REUSEPORT:
1295 		sk->sk_reuseport = valbool;
1296 		break;
1297 	case SO_DONTROUTE:
1298 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1299 		sk_dst_reset(sk);
1300 		break;
1301 	case SO_BROADCAST:
1302 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1303 		break;
1304 	case SO_SNDBUF:
1305 		/* Don't error on this BSD doesn't and if you think
1306 		 * about it this is right. Otherwise apps have to
1307 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1308 		 * are treated in BSD as hints
1309 		 */
1310 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1311 set_sndbuf:
1312 		/* Ensure val * 2 fits into an int, to prevent max_t()
1313 		 * from treating it as a negative value.
1314 		 */
1315 		val = min_t(int, val, INT_MAX / 2);
1316 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1317 		WRITE_ONCE(sk->sk_sndbuf,
1318 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1319 		/* Wake up sending tasks if we upped the value. */
1320 		sk->sk_write_space(sk);
1321 		break;
1322 
1323 	case SO_SNDBUFFORCE:
1324 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1325 			ret = -EPERM;
1326 			break;
1327 		}
1328 
1329 		/* No negative values (to prevent underflow, as val will be
1330 		 * multiplied by 2).
1331 		 */
1332 		if (val < 0)
1333 			val = 0;
1334 		goto set_sndbuf;
1335 
1336 	case SO_RCVBUF:
1337 		/* Don't error on this BSD doesn't and if you think
1338 		 * about it this is right. Otherwise apps have to
1339 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1340 		 * are treated in BSD as hints
1341 		 */
1342 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1343 		break;
1344 
1345 	case SO_RCVBUFFORCE:
1346 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1347 			ret = -EPERM;
1348 			break;
1349 		}
1350 
1351 		/* No negative values (to prevent underflow, as val will be
1352 		 * multiplied by 2).
1353 		 */
1354 		__sock_set_rcvbuf(sk, max(val, 0));
1355 		break;
1356 
1357 	case SO_KEEPALIVE:
1358 		if (sk->sk_prot->keepalive)
1359 			sk->sk_prot->keepalive(sk, valbool);
1360 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1361 		break;
1362 
1363 	case SO_OOBINLINE:
1364 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1365 		break;
1366 
1367 	case SO_NO_CHECK:
1368 		sk->sk_no_check_tx = valbool;
1369 		break;
1370 
1371 	case SO_LINGER:
1372 		if (optlen < sizeof(ling)) {
1373 			ret = -EINVAL;	/* 1003.1g */
1374 			break;
1375 		}
1376 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1377 			ret = -EFAULT;
1378 			break;
1379 		}
1380 		if (!ling.l_onoff) {
1381 			sock_reset_flag(sk, SOCK_LINGER);
1382 		} else {
1383 			unsigned long t_sec = ling.l_linger;
1384 
1385 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1386 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1387 			else
1388 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1389 			sock_set_flag(sk, SOCK_LINGER);
1390 		}
1391 		break;
1392 
1393 	case SO_BSDCOMPAT:
1394 		break;
1395 
1396 	case SO_TIMESTAMP_OLD:
1397 	case SO_TIMESTAMP_NEW:
1398 	case SO_TIMESTAMPNS_OLD:
1399 	case SO_TIMESTAMPNS_NEW:
1400 		sock_set_timestamp(sk, optname, valbool);
1401 		break;
1402 
1403 	case SO_TIMESTAMPING_NEW:
1404 	case SO_TIMESTAMPING_OLD:
1405 		if (optlen == sizeof(timestamping)) {
1406 			if (copy_from_sockptr(&timestamping, optval,
1407 					      sizeof(timestamping))) {
1408 				ret = -EFAULT;
1409 				break;
1410 			}
1411 		} else {
1412 			memset(&timestamping, 0, sizeof(timestamping));
1413 			timestamping.flags = val;
1414 		}
1415 		ret = sock_set_timestamping(sk, optname, timestamping);
1416 		break;
1417 
1418 	case SO_RCVLOWAT:
1419 		{
1420 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1421 
1422 		if (val < 0)
1423 			val = INT_MAX;
1424 		if (sock)
1425 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1426 		if (set_rcvlowat)
1427 			ret = set_rcvlowat(sk, val);
1428 		else
1429 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1430 		break;
1431 		}
1432 	case SO_RCVTIMEO_OLD:
1433 	case SO_RCVTIMEO_NEW:
1434 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1435 				       optlen, optname == SO_RCVTIMEO_OLD);
1436 		break;
1437 
1438 	case SO_SNDTIMEO_OLD:
1439 	case SO_SNDTIMEO_NEW:
1440 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1441 				       optlen, optname == SO_SNDTIMEO_OLD);
1442 		break;
1443 
1444 	case SO_ATTACH_FILTER: {
1445 		struct sock_fprog fprog;
1446 
1447 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1448 		if (!ret)
1449 			ret = sk_attach_filter(&fprog, sk);
1450 		break;
1451 	}
1452 	case SO_ATTACH_BPF:
1453 		ret = -EINVAL;
1454 		if (optlen == sizeof(u32)) {
1455 			u32 ufd;
1456 
1457 			ret = -EFAULT;
1458 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1459 				break;
1460 
1461 			ret = sk_attach_bpf(ufd, sk);
1462 		}
1463 		break;
1464 
1465 	case SO_ATTACH_REUSEPORT_CBPF: {
1466 		struct sock_fprog fprog;
1467 
1468 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1469 		if (!ret)
1470 			ret = sk_reuseport_attach_filter(&fprog, sk);
1471 		break;
1472 	}
1473 	case SO_ATTACH_REUSEPORT_EBPF:
1474 		ret = -EINVAL;
1475 		if (optlen == sizeof(u32)) {
1476 			u32 ufd;
1477 
1478 			ret = -EFAULT;
1479 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1480 				break;
1481 
1482 			ret = sk_reuseport_attach_bpf(ufd, sk);
1483 		}
1484 		break;
1485 
1486 	case SO_DETACH_REUSEPORT_BPF:
1487 		ret = reuseport_detach_prog(sk);
1488 		break;
1489 
1490 	case SO_DETACH_FILTER:
1491 		ret = sk_detach_filter(sk);
1492 		break;
1493 
1494 	case SO_LOCK_FILTER:
1495 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1496 			ret = -EPERM;
1497 		else
1498 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1499 		break;
1500 
1501 	case SO_MARK:
1502 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1503 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1504 			ret = -EPERM;
1505 			break;
1506 		}
1507 
1508 		__sock_set_mark(sk, val);
1509 		break;
1510 	case SO_RCVMARK:
1511 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1512 		break;
1513 
1514 	case SO_RXQ_OVFL:
1515 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1516 		break;
1517 
1518 	case SO_WIFI_STATUS:
1519 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1520 		break;
1521 
1522 	case SO_NOFCS:
1523 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1524 		break;
1525 
1526 	case SO_SELECT_ERR_QUEUE:
1527 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1528 		break;
1529 
1530 
1531 	case SO_INCOMING_CPU:
1532 		reuseport_update_incoming_cpu(sk, val);
1533 		break;
1534 
1535 	case SO_CNX_ADVICE:
1536 		if (val == 1)
1537 			dst_negative_advice(sk);
1538 		break;
1539 
1540 	case SO_ZEROCOPY:
1541 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1542 			if (!(sk_is_tcp(sk) ||
1543 			      (sk->sk_type == SOCK_DGRAM &&
1544 			       sk->sk_protocol == IPPROTO_UDP)))
1545 				ret = -EOPNOTSUPP;
1546 		} else if (sk->sk_family != PF_RDS) {
1547 			ret = -EOPNOTSUPP;
1548 		}
1549 		if (!ret) {
1550 			if (val < 0 || val > 1)
1551 				ret = -EINVAL;
1552 			else
1553 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1554 		}
1555 		break;
1556 
1557 	case SO_TXTIME:
1558 		if (optlen != sizeof(struct sock_txtime)) {
1559 			ret = -EINVAL;
1560 			break;
1561 		} else if (copy_from_sockptr(&sk_txtime, optval,
1562 			   sizeof(struct sock_txtime))) {
1563 			ret = -EFAULT;
1564 			break;
1565 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1566 			ret = -EINVAL;
1567 			break;
1568 		}
1569 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1570 		 * scheduler has enough safe guards.
1571 		 */
1572 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1573 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1574 			ret = -EPERM;
1575 			break;
1576 		}
1577 
1578 		ret = sockopt_validate_clockid(sk_txtime.clockid);
1579 		if (ret)
1580 			break;
1581 
1582 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1583 		sk->sk_clockid = sk_txtime.clockid;
1584 		sk->sk_txtime_deadline_mode =
1585 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1586 		sk->sk_txtime_report_errors =
1587 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1588 		break;
1589 
1590 	case SO_BINDTOIFINDEX:
1591 		ret = sock_bindtoindex_locked(sk, val);
1592 		break;
1593 
1594 	case SO_BUF_LOCK:
1595 		if (val & ~SOCK_BUF_LOCK_MASK) {
1596 			ret = -EINVAL;
1597 			break;
1598 		}
1599 		sk->sk_userlocks = val | (sk->sk_userlocks &
1600 					  ~SOCK_BUF_LOCK_MASK);
1601 		break;
1602 
1603 	case SO_RESERVE_MEM:
1604 	{
1605 		int delta;
1606 
1607 		if (val < 0) {
1608 			ret = -EINVAL;
1609 			break;
1610 		}
1611 
1612 		delta = val - sk->sk_reserved_mem;
1613 		if (delta < 0)
1614 			sock_release_reserved_memory(sk, -delta);
1615 		else
1616 			ret = sock_reserve_memory(sk, delta);
1617 		break;
1618 	}
1619 
1620 	default:
1621 		ret = -ENOPROTOOPT;
1622 		break;
1623 	}
1624 	sockopt_release_sock(sk);
1625 	return ret;
1626 }
1627 
1628 int sock_setsockopt(struct socket *sock, int level, int optname,
1629 		    sockptr_t optval, unsigned int optlen)
1630 {
1631 	return sk_setsockopt(sock->sk, level, optname,
1632 			     optval, optlen);
1633 }
1634 EXPORT_SYMBOL(sock_setsockopt);
1635 
1636 static const struct cred *sk_get_peer_cred(struct sock *sk)
1637 {
1638 	const struct cred *cred;
1639 
1640 	spin_lock(&sk->sk_peer_lock);
1641 	cred = get_cred(sk->sk_peer_cred);
1642 	spin_unlock(&sk->sk_peer_lock);
1643 
1644 	return cred;
1645 }
1646 
1647 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1648 			  struct ucred *ucred)
1649 {
1650 	ucred->pid = pid_vnr(pid);
1651 	ucred->uid = ucred->gid = -1;
1652 	if (cred) {
1653 		struct user_namespace *current_ns = current_user_ns();
1654 
1655 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1656 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1657 	}
1658 }
1659 
1660 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1661 {
1662 	struct user_namespace *user_ns = current_user_ns();
1663 	int i;
1664 
1665 	for (i = 0; i < src->ngroups; i++) {
1666 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1667 
1668 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1669 			return -EFAULT;
1670 	}
1671 
1672 	return 0;
1673 }
1674 
1675 int sk_getsockopt(struct sock *sk, int level, int optname,
1676 		  sockptr_t optval, sockptr_t optlen)
1677 {
1678 	struct socket *sock = sk->sk_socket;
1679 
1680 	union {
1681 		int val;
1682 		u64 val64;
1683 		unsigned long ulval;
1684 		struct linger ling;
1685 		struct old_timeval32 tm32;
1686 		struct __kernel_old_timeval tm;
1687 		struct  __kernel_sock_timeval stm;
1688 		struct sock_txtime txtime;
1689 		struct so_timestamping timestamping;
1690 	} v;
1691 
1692 	int lv = sizeof(int);
1693 	int len;
1694 
1695 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1696 		return -EFAULT;
1697 	if (len < 0)
1698 		return -EINVAL;
1699 
1700 	memset(&v, 0, sizeof(v));
1701 
1702 	switch (optname) {
1703 	case SO_DEBUG:
1704 		v.val = sock_flag(sk, SOCK_DBG);
1705 		break;
1706 
1707 	case SO_DONTROUTE:
1708 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1709 		break;
1710 
1711 	case SO_BROADCAST:
1712 		v.val = sock_flag(sk, SOCK_BROADCAST);
1713 		break;
1714 
1715 	case SO_SNDBUF:
1716 		v.val = READ_ONCE(sk->sk_sndbuf);
1717 		break;
1718 
1719 	case SO_RCVBUF:
1720 		v.val = READ_ONCE(sk->sk_rcvbuf);
1721 		break;
1722 
1723 	case SO_REUSEADDR:
1724 		v.val = sk->sk_reuse;
1725 		break;
1726 
1727 	case SO_REUSEPORT:
1728 		v.val = sk->sk_reuseport;
1729 		break;
1730 
1731 	case SO_KEEPALIVE:
1732 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1733 		break;
1734 
1735 	case SO_TYPE:
1736 		v.val = sk->sk_type;
1737 		break;
1738 
1739 	case SO_PROTOCOL:
1740 		v.val = sk->sk_protocol;
1741 		break;
1742 
1743 	case SO_DOMAIN:
1744 		v.val = sk->sk_family;
1745 		break;
1746 
1747 	case SO_ERROR:
1748 		v.val = -sock_error(sk);
1749 		if (v.val == 0)
1750 			v.val = xchg(&sk->sk_err_soft, 0);
1751 		break;
1752 
1753 	case SO_OOBINLINE:
1754 		v.val = sock_flag(sk, SOCK_URGINLINE);
1755 		break;
1756 
1757 	case SO_NO_CHECK:
1758 		v.val = sk->sk_no_check_tx;
1759 		break;
1760 
1761 	case SO_PRIORITY:
1762 		v.val = READ_ONCE(sk->sk_priority);
1763 		break;
1764 
1765 	case SO_LINGER:
1766 		lv		= sizeof(v.ling);
1767 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1768 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1769 		break;
1770 
1771 	case SO_BSDCOMPAT:
1772 		break;
1773 
1774 	case SO_TIMESTAMP_OLD:
1775 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1776 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1777 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1778 		break;
1779 
1780 	case SO_TIMESTAMPNS_OLD:
1781 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1782 		break;
1783 
1784 	case SO_TIMESTAMP_NEW:
1785 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1786 		break;
1787 
1788 	case SO_TIMESTAMPNS_NEW:
1789 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1790 		break;
1791 
1792 	case SO_TIMESTAMPING_OLD:
1793 	case SO_TIMESTAMPING_NEW:
1794 		lv = sizeof(v.timestamping);
1795 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1796 		 * returning the flags when they were set through the same option.
1797 		 * Don't change the beviour for the old case SO_TIMESTAMPING_OLD.
1798 		 */
1799 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1800 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1801 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1802 		}
1803 		break;
1804 
1805 	case SO_RCVTIMEO_OLD:
1806 	case SO_RCVTIMEO_NEW:
1807 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1808 				      SO_RCVTIMEO_OLD == optname);
1809 		break;
1810 
1811 	case SO_SNDTIMEO_OLD:
1812 	case SO_SNDTIMEO_NEW:
1813 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1814 				      SO_SNDTIMEO_OLD == optname);
1815 		break;
1816 
1817 	case SO_RCVLOWAT:
1818 		v.val = READ_ONCE(sk->sk_rcvlowat);
1819 		break;
1820 
1821 	case SO_SNDLOWAT:
1822 		v.val = 1;
1823 		break;
1824 
1825 	case SO_PASSCRED:
1826 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1827 		break;
1828 
1829 	case SO_PASSPIDFD:
1830 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1831 		break;
1832 
1833 	case SO_PEERCRED:
1834 	{
1835 		struct ucred peercred;
1836 		if (len > sizeof(peercred))
1837 			len = sizeof(peercred);
1838 
1839 		spin_lock(&sk->sk_peer_lock);
1840 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1841 		spin_unlock(&sk->sk_peer_lock);
1842 
1843 		if (copy_to_sockptr(optval, &peercred, len))
1844 			return -EFAULT;
1845 		goto lenout;
1846 	}
1847 
1848 	case SO_PEERPIDFD:
1849 	{
1850 		struct pid *peer_pid;
1851 		struct file *pidfd_file = NULL;
1852 		int pidfd;
1853 
1854 		if (len > sizeof(pidfd))
1855 			len = sizeof(pidfd);
1856 
1857 		spin_lock(&sk->sk_peer_lock);
1858 		peer_pid = get_pid(sk->sk_peer_pid);
1859 		spin_unlock(&sk->sk_peer_lock);
1860 
1861 		if (!peer_pid)
1862 			return -ENODATA;
1863 
1864 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1865 		put_pid(peer_pid);
1866 		if (pidfd < 0)
1867 			return pidfd;
1868 
1869 		if (copy_to_sockptr(optval, &pidfd, len) ||
1870 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1871 			put_unused_fd(pidfd);
1872 			fput(pidfd_file);
1873 
1874 			return -EFAULT;
1875 		}
1876 
1877 		fd_install(pidfd, pidfd_file);
1878 		return 0;
1879 	}
1880 
1881 	case SO_PEERGROUPS:
1882 	{
1883 		const struct cred *cred;
1884 		int ret, n;
1885 
1886 		cred = sk_get_peer_cred(sk);
1887 		if (!cred)
1888 			return -ENODATA;
1889 
1890 		n = cred->group_info->ngroups;
1891 		if (len < n * sizeof(gid_t)) {
1892 			len = n * sizeof(gid_t);
1893 			put_cred(cred);
1894 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1895 		}
1896 		len = n * sizeof(gid_t);
1897 
1898 		ret = groups_to_user(optval, cred->group_info);
1899 		put_cred(cred);
1900 		if (ret)
1901 			return ret;
1902 		goto lenout;
1903 	}
1904 
1905 	case SO_PEERNAME:
1906 	{
1907 		struct sockaddr_storage address;
1908 
1909 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1910 		if (lv < 0)
1911 			return -ENOTCONN;
1912 		if (lv < len)
1913 			return -EINVAL;
1914 		if (copy_to_sockptr(optval, &address, len))
1915 			return -EFAULT;
1916 		goto lenout;
1917 	}
1918 
1919 	/* Dubious BSD thing... Probably nobody even uses it, but
1920 	 * the UNIX standard wants it for whatever reason... -DaveM
1921 	 */
1922 	case SO_ACCEPTCONN:
1923 		v.val = sk->sk_state == TCP_LISTEN;
1924 		break;
1925 
1926 	case SO_PASSSEC:
1927 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1928 		break;
1929 
1930 	case SO_PEERSEC:
1931 		return security_socket_getpeersec_stream(sock,
1932 							 optval, optlen, len);
1933 
1934 	case SO_MARK:
1935 		v.val = READ_ONCE(sk->sk_mark);
1936 		break;
1937 
1938 	case SO_RCVMARK:
1939 		v.val = sock_flag(sk, SOCK_RCVMARK);
1940 		break;
1941 
1942 	case SO_RXQ_OVFL:
1943 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1944 		break;
1945 
1946 	case SO_WIFI_STATUS:
1947 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1948 		break;
1949 
1950 	case SO_PEEK_OFF:
1951 		if (!READ_ONCE(sock->ops)->set_peek_off)
1952 			return -EOPNOTSUPP;
1953 
1954 		v.val = READ_ONCE(sk->sk_peek_off);
1955 		break;
1956 	case SO_NOFCS:
1957 		v.val = sock_flag(sk, SOCK_NOFCS);
1958 		break;
1959 
1960 	case SO_BINDTODEVICE:
1961 		return sock_getbindtodevice(sk, optval, optlen, len);
1962 
1963 	case SO_GET_FILTER:
1964 		len = sk_get_filter(sk, optval, len);
1965 		if (len < 0)
1966 			return len;
1967 
1968 		goto lenout;
1969 
1970 	case SO_LOCK_FILTER:
1971 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1972 		break;
1973 
1974 	case SO_BPF_EXTENSIONS:
1975 		v.val = bpf_tell_extensions();
1976 		break;
1977 
1978 	case SO_SELECT_ERR_QUEUE:
1979 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1980 		break;
1981 
1982 #ifdef CONFIG_NET_RX_BUSY_POLL
1983 	case SO_BUSY_POLL:
1984 		v.val = READ_ONCE(sk->sk_ll_usec);
1985 		break;
1986 	case SO_PREFER_BUSY_POLL:
1987 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1988 		break;
1989 #endif
1990 
1991 	case SO_MAX_PACING_RATE:
1992 		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1993 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1994 			lv = sizeof(v.ulval);
1995 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1996 		} else {
1997 			/* 32bit version */
1998 			v.val = min_t(unsigned long, ~0U,
1999 				      READ_ONCE(sk->sk_max_pacing_rate));
2000 		}
2001 		break;
2002 
2003 	case SO_INCOMING_CPU:
2004 		v.val = READ_ONCE(sk->sk_incoming_cpu);
2005 		break;
2006 
2007 	case SO_MEMINFO:
2008 	{
2009 		u32 meminfo[SK_MEMINFO_VARS];
2010 
2011 		sk_get_meminfo(sk, meminfo);
2012 
2013 		len = min_t(unsigned int, len, sizeof(meminfo));
2014 		if (copy_to_sockptr(optval, &meminfo, len))
2015 			return -EFAULT;
2016 
2017 		goto lenout;
2018 	}
2019 
2020 #ifdef CONFIG_NET_RX_BUSY_POLL
2021 	case SO_INCOMING_NAPI_ID:
2022 		v.val = READ_ONCE(sk->sk_napi_id);
2023 
2024 		/* aggregate non-NAPI IDs down to 0 */
2025 		if (v.val < MIN_NAPI_ID)
2026 			v.val = 0;
2027 
2028 		break;
2029 #endif
2030 
2031 	case SO_COOKIE:
2032 		lv = sizeof(u64);
2033 		if (len < lv)
2034 			return -EINVAL;
2035 		v.val64 = sock_gen_cookie(sk);
2036 		break;
2037 
2038 	case SO_ZEROCOPY:
2039 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
2040 		break;
2041 
2042 	case SO_TXTIME:
2043 		lv = sizeof(v.txtime);
2044 		v.txtime.clockid = sk->sk_clockid;
2045 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2046 				  SOF_TXTIME_DEADLINE_MODE : 0;
2047 		v.txtime.flags |= sk->sk_txtime_report_errors ?
2048 				  SOF_TXTIME_REPORT_ERRORS : 0;
2049 		break;
2050 
2051 	case SO_BINDTOIFINDEX:
2052 		v.val = READ_ONCE(sk->sk_bound_dev_if);
2053 		break;
2054 
2055 	case SO_NETNS_COOKIE:
2056 		lv = sizeof(u64);
2057 		if (len != lv)
2058 			return -EINVAL;
2059 		v.val64 = sock_net(sk)->net_cookie;
2060 		break;
2061 
2062 	case SO_BUF_LOCK:
2063 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2064 		break;
2065 
2066 	case SO_RESERVE_MEM:
2067 		v.val = READ_ONCE(sk->sk_reserved_mem);
2068 		break;
2069 
2070 	case SO_TXREHASH:
2071 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2072 		v.val = READ_ONCE(sk->sk_txrehash);
2073 		break;
2074 
2075 	default:
2076 		/* We implement the SO_SNDLOWAT etc to not be settable
2077 		 * (1003.1g 7).
2078 		 */
2079 		return -ENOPROTOOPT;
2080 	}
2081 
2082 	if (len > lv)
2083 		len = lv;
2084 	if (copy_to_sockptr(optval, &v, len))
2085 		return -EFAULT;
2086 lenout:
2087 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2088 		return -EFAULT;
2089 	return 0;
2090 }
2091 
2092 /*
2093  * Initialize an sk_lock.
2094  *
2095  * (We also register the sk_lock with the lock validator.)
2096  */
2097 static inline void sock_lock_init(struct sock *sk)
2098 {
2099 	if (sk->sk_kern_sock)
2100 		sock_lock_init_class_and_name(
2101 			sk,
2102 			af_family_kern_slock_key_strings[sk->sk_family],
2103 			af_family_kern_slock_keys + sk->sk_family,
2104 			af_family_kern_key_strings[sk->sk_family],
2105 			af_family_kern_keys + sk->sk_family);
2106 	else
2107 		sock_lock_init_class_and_name(
2108 			sk,
2109 			af_family_slock_key_strings[sk->sk_family],
2110 			af_family_slock_keys + sk->sk_family,
2111 			af_family_key_strings[sk->sk_family],
2112 			af_family_keys + sk->sk_family);
2113 }
2114 
2115 /*
2116  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2117  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2118  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2119  */
2120 static void sock_copy(struct sock *nsk, const struct sock *osk)
2121 {
2122 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2123 #ifdef CONFIG_SECURITY_NETWORK
2124 	void *sptr = nsk->sk_security;
2125 #endif
2126 
2127 	/* If we move sk_tx_queue_mapping out of the private section,
2128 	 * we must check if sk_tx_queue_clear() is called after
2129 	 * sock_copy() in sk_clone_lock().
2130 	 */
2131 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2132 		     offsetof(struct sock, sk_dontcopy_begin) ||
2133 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2134 		     offsetof(struct sock, sk_dontcopy_end));
2135 
2136 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2137 
2138 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2139 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2140 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2141 
2142 #ifdef CONFIG_SECURITY_NETWORK
2143 	nsk->sk_security = sptr;
2144 	security_sk_clone(osk, nsk);
2145 #endif
2146 }
2147 
2148 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2149 		int family)
2150 {
2151 	struct sock *sk;
2152 	struct kmem_cache *slab;
2153 
2154 	slab = prot->slab;
2155 	if (slab != NULL) {
2156 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2157 		if (!sk)
2158 			return sk;
2159 		if (want_init_on_alloc(priority))
2160 			sk_prot_clear_nulls(sk, prot->obj_size);
2161 	} else
2162 		sk = kmalloc(prot->obj_size, priority);
2163 
2164 	if (sk != NULL) {
2165 		if (security_sk_alloc(sk, family, priority))
2166 			goto out_free;
2167 
2168 		if (!try_module_get(prot->owner))
2169 			goto out_free_sec;
2170 	}
2171 
2172 	return sk;
2173 
2174 out_free_sec:
2175 	security_sk_free(sk);
2176 out_free:
2177 	if (slab != NULL)
2178 		kmem_cache_free(slab, sk);
2179 	else
2180 		kfree(sk);
2181 	return NULL;
2182 }
2183 
2184 static void sk_prot_free(struct proto *prot, struct sock *sk)
2185 {
2186 	struct kmem_cache *slab;
2187 	struct module *owner;
2188 
2189 	owner = prot->owner;
2190 	slab = prot->slab;
2191 
2192 	cgroup_sk_free(&sk->sk_cgrp_data);
2193 	mem_cgroup_sk_free(sk);
2194 	security_sk_free(sk);
2195 	if (slab != NULL)
2196 		kmem_cache_free(slab, sk);
2197 	else
2198 		kfree(sk);
2199 	module_put(owner);
2200 }
2201 
2202 /**
2203  *	sk_alloc - All socket objects are allocated here
2204  *	@net: the applicable net namespace
2205  *	@family: protocol family
2206  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2207  *	@prot: struct proto associated with this new sock instance
2208  *	@kern: is this to be a kernel socket?
2209  */
2210 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2211 		      struct proto *prot, int kern)
2212 {
2213 	struct sock *sk;
2214 
2215 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2216 	if (sk) {
2217 		sk->sk_family = family;
2218 		/*
2219 		 * See comment in struct sock definition to understand
2220 		 * why we need sk_prot_creator -acme
2221 		 */
2222 		sk->sk_prot = sk->sk_prot_creator = prot;
2223 		sk->sk_kern_sock = kern;
2224 		sock_lock_init(sk);
2225 		sk->sk_net_refcnt = kern ? 0 : 1;
2226 		if (likely(sk->sk_net_refcnt)) {
2227 			get_net_track(net, &sk->ns_tracker, priority);
2228 			sock_inuse_add(net, 1);
2229 		} else {
2230 			__netns_tracker_alloc(net, &sk->ns_tracker,
2231 					      false, priority);
2232 		}
2233 
2234 		sock_net_set(sk, net);
2235 		refcount_set(&sk->sk_wmem_alloc, 1);
2236 
2237 		mem_cgroup_sk_alloc(sk);
2238 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2239 		sock_update_classid(&sk->sk_cgrp_data);
2240 		sock_update_netprioidx(&sk->sk_cgrp_data);
2241 		sk_tx_queue_clear(sk);
2242 	}
2243 
2244 	return sk;
2245 }
2246 EXPORT_SYMBOL(sk_alloc);
2247 
2248 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2249  * grace period. This is the case for UDP sockets and TCP listeners.
2250  */
2251 static void __sk_destruct(struct rcu_head *head)
2252 {
2253 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2254 	struct sk_filter *filter;
2255 
2256 	if (sk->sk_destruct)
2257 		sk->sk_destruct(sk);
2258 
2259 	filter = rcu_dereference_check(sk->sk_filter,
2260 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2261 	if (filter) {
2262 		sk_filter_uncharge(sk, filter);
2263 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2264 	}
2265 
2266 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2267 
2268 #ifdef CONFIG_BPF_SYSCALL
2269 	bpf_sk_storage_free(sk);
2270 #endif
2271 
2272 	if (atomic_read(&sk->sk_omem_alloc))
2273 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2274 			 __func__, atomic_read(&sk->sk_omem_alloc));
2275 
2276 	if (sk->sk_frag.page) {
2277 		put_page(sk->sk_frag.page);
2278 		sk->sk_frag.page = NULL;
2279 	}
2280 
2281 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2282 	put_cred(sk->sk_peer_cred);
2283 	put_pid(sk->sk_peer_pid);
2284 
2285 	if (likely(sk->sk_net_refcnt))
2286 		put_net_track(sock_net(sk), &sk->ns_tracker);
2287 	else
2288 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2289 
2290 	sk_prot_free(sk->sk_prot_creator, sk);
2291 }
2292 
2293 void sk_destruct(struct sock *sk)
2294 {
2295 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2296 
2297 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2298 		reuseport_detach_sock(sk);
2299 		use_call_rcu = true;
2300 	}
2301 
2302 	if (use_call_rcu)
2303 		call_rcu(&sk->sk_rcu, __sk_destruct);
2304 	else
2305 		__sk_destruct(&sk->sk_rcu);
2306 }
2307 
2308 static void __sk_free(struct sock *sk)
2309 {
2310 	if (likely(sk->sk_net_refcnt))
2311 		sock_inuse_add(sock_net(sk), -1);
2312 
2313 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2314 		sock_diag_broadcast_destroy(sk);
2315 	else
2316 		sk_destruct(sk);
2317 }
2318 
2319 void sk_free(struct sock *sk)
2320 {
2321 	/*
2322 	 * We subtract one from sk_wmem_alloc and can know if
2323 	 * some packets are still in some tx queue.
2324 	 * If not null, sock_wfree() will call __sk_free(sk) later
2325 	 */
2326 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2327 		__sk_free(sk);
2328 }
2329 EXPORT_SYMBOL(sk_free);
2330 
2331 static void sk_init_common(struct sock *sk)
2332 {
2333 	skb_queue_head_init(&sk->sk_receive_queue);
2334 	skb_queue_head_init(&sk->sk_write_queue);
2335 	skb_queue_head_init(&sk->sk_error_queue);
2336 
2337 	rwlock_init(&sk->sk_callback_lock);
2338 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2339 			af_rlock_keys + sk->sk_family,
2340 			af_family_rlock_key_strings[sk->sk_family]);
2341 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2342 			af_wlock_keys + sk->sk_family,
2343 			af_family_wlock_key_strings[sk->sk_family]);
2344 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2345 			af_elock_keys + sk->sk_family,
2346 			af_family_elock_key_strings[sk->sk_family]);
2347 	if (sk->sk_kern_sock)
2348 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2349 			af_kern_callback_keys + sk->sk_family,
2350 			af_family_kern_clock_key_strings[sk->sk_family]);
2351 	else
2352 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2353 			af_callback_keys + sk->sk_family,
2354 			af_family_clock_key_strings[sk->sk_family]);
2355 }
2356 
2357 /**
2358  *	sk_clone_lock - clone a socket, and lock its clone
2359  *	@sk: the socket to clone
2360  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2361  *
2362  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2363  */
2364 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2365 {
2366 	struct proto *prot = READ_ONCE(sk->sk_prot);
2367 	struct sk_filter *filter;
2368 	bool is_charged = true;
2369 	struct sock *newsk;
2370 
2371 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2372 	if (!newsk)
2373 		goto out;
2374 
2375 	sock_copy(newsk, sk);
2376 
2377 	newsk->sk_prot_creator = prot;
2378 
2379 	/* SANITY */
2380 	if (likely(newsk->sk_net_refcnt)) {
2381 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2382 		sock_inuse_add(sock_net(newsk), 1);
2383 	} else {
2384 		/* Kernel sockets are not elevating the struct net refcount.
2385 		 * Instead, use a tracker to more easily detect if a layer
2386 		 * is not properly dismantling its kernel sockets at netns
2387 		 * destroy time.
2388 		 */
2389 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2390 				      false, priority);
2391 	}
2392 	sk_node_init(&newsk->sk_node);
2393 	sock_lock_init(newsk);
2394 	bh_lock_sock(newsk);
2395 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2396 	newsk->sk_backlog.len = 0;
2397 
2398 	atomic_set(&newsk->sk_rmem_alloc, 0);
2399 
2400 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2401 	refcount_set(&newsk->sk_wmem_alloc, 1);
2402 
2403 	atomic_set(&newsk->sk_omem_alloc, 0);
2404 	sk_init_common(newsk);
2405 
2406 	newsk->sk_dst_cache	= NULL;
2407 	newsk->sk_dst_pending_confirm = 0;
2408 	newsk->sk_wmem_queued	= 0;
2409 	newsk->sk_forward_alloc = 0;
2410 	newsk->sk_reserved_mem  = 0;
2411 	atomic_set(&newsk->sk_drops, 0);
2412 	newsk->sk_send_head	= NULL;
2413 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2414 	atomic_set(&newsk->sk_zckey, 0);
2415 
2416 	sock_reset_flag(newsk, SOCK_DONE);
2417 
2418 	/* sk->sk_memcg will be populated at accept() time */
2419 	newsk->sk_memcg = NULL;
2420 
2421 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2422 
2423 	rcu_read_lock();
2424 	filter = rcu_dereference(sk->sk_filter);
2425 	if (filter != NULL)
2426 		/* though it's an empty new sock, the charging may fail
2427 		 * if sysctl_optmem_max was changed between creation of
2428 		 * original socket and cloning
2429 		 */
2430 		is_charged = sk_filter_charge(newsk, filter);
2431 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2432 	rcu_read_unlock();
2433 
2434 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2435 		/* We need to make sure that we don't uncharge the new
2436 		 * socket if we couldn't charge it in the first place
2437 		 * as otherwise we uncharge the parent's filter.
2438 		 */
2439 		if (!is_charged)
2440 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2441 		sk_free_unlock_clone(newsk);
2442 		newsk = NULL;
2443 		goto out;
2444 	}
2445 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2446 
2447 	if (bpf_sk_storage_clone(sk, newsk)) {
2448 		sk_free_unlock_clone(newsk);
2449 		newsk = NULL;
2450 		goto out;
2451 	}
2452 
2453 	/* Clear sk_user_data if parent had the pointer tagged
2454 	 * as not suitable for copying when cloning.
2455 	 */
2456 	if (sk_user_data_is_nocopy(newsk))
2457 		newsk->sk_user_data = NULL;
2458 
2459 	newsk->sk_err	   = 0;
2460 	newsk->sk_err_soft = 0;
2461 	newsk->sk_priority = 0;
2462 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2463 
2464 	/* Before updating sk_refcnt, we must commit prior changes to memory
2465 	 * (Documentation/RCU/rculist_nulls.rst for details)
2466 	 */
2467 	smp_wmb();
2468 	refcount_set(&newsk->sk_refcnt, 2);
2469 
2470 	sk_set_socket(newsk, NULL);
2471 	sk_tx_queue_clear(newsk);
2472 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2473 
2474 	if (newsk->sk_prot->sockets_allocated)
2475 		sk_sockets_allocated_inc(newsk);
2476 
2477 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2478 		net_enable_timestamp();
2479 out:
2480 	return newsk;
2481 }
2482 EXPORT_SYMBOL_GPL(sk_clone_lock);
2483 
2484 void sk_free_unlock_clone(struct sock *sk)
2485 {
2486 	/* It is still raw copy of parent, so invalidate
2487 	 * destructor and make plain sk_free() */
2488 	sk->sk_destruct = NULL;
2489 	bh_unlock_sock(sk);
2490 	sk_free(sk);
2491 }
2492 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2493 
2494 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2495 {
2496 	bool is_ipv6 = false;
2497 	u32 max_size;
2498 
2499 #if IS_ENABLED(CONFIG_IPV6)
2500 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2501 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2502 #endif
2503 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2504 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2505 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2506 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2507 		max_size = GSO_LEGACY_MAX_SIZE;
2508 
2509 	return max_size - (MAX_TCP_HEADER + 1);
2510 }
2511 
2512 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2513 {
2514 	u32 max_segs = 1;
2515 
2516 	sk->sk_route_caps = dst->dev->features;
2517 	if (sk_is_tcp(sk))
2518 		sk->sk_route_caps |= NETIF_F_GSO;
2519 	if (sk->sk_route_caps & NETIF_F_GSO)
2520 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2521 	if (unlikely(sk->sk_gso_disabled))
2522 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2523 	if (sk_can_gso(sk)) {
2524 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2525 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2526 		} else {
2527 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2528 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2529 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2530 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2531 		}
2532 	}
2533 	sk->sk_gso_max_segs = max_segs;
2534 	sk_dst_set(sk, dst);
2535 }
2536 EXPORT_SYMBOL_GPL(sk_setup_caps);
2537 
2538 /*
2539  *	Simple resource managers for sockets.
2540  */
2541 
2542 
2543 /*
2544  * Write buffer destructor automatically called from kfree_skb.
2545  */
2546 void sock_wfree(struct sk_buff *skb)
2547 {
2548 	struct sock *sk = skb->sk;
2549 	unsigned int len = skb->truesize;
2550 	bool free;
2551 
2552 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2553 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2554 		    sk->sk_write_space == sock_def_write_space) {
2555 			rcu_read_lock();
2556 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2557 			sock_def_write_space_wfree(sk);
2558 			rcu_read_unlock();
2559 			if (unlikely(free))
2560 				__sk_free(sk);
2561 			return;
2562 		}
2563 
2564 		/*
2565 		 * Keep a reference on sk_wmem_alloc, this will be released
2566 		 * after sk_write_space() call
2567 		 */
2568 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2569 		sk->sk_write_space(sk);
2570 		len = 1;
2571 	}
2572 	/*
2573 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2574 	 * could not do because of in-flight packets
2575 	 */
2576 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2577 		__sk_free(sk);
2578 }
2579 EXPORT_SYMBOL(sock_wfree);
2580 
2581 /* This variant of sock_wfree() is used by TCP,
2582  * since it sets SOCK_USE_WRITE_QUEUE.
2583  */
2584 void __sock_wfree(struct sk_buff *skb)
2585 {
2586 	struct sock *sk = skb->sk;
2587 
2588 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2589 		__sk_free(sk);
2590 }
2591 
2592 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2593 {
2594 	skb_orphan(skb);
2595 #ifdef CONFIG_INET
2596 	if (unlikely(!sk_fullsock(sk)))
2597 		return skb_set_owner_edemux(skb, sk);
2598 #endif
2599 	skb->sk = sk;
2600 	skb->destructor = sock_wfree;
2601 	skb_set_hash_from_sk(skb, sk);
2602 	/*
2603 	 * We used to take a refcount on sk, but following operation
2604 	 * is enough to guarantee sk_free() won't free this sock until
2605 	 * all in-flight packets are completed
2606 	 */
2607 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2608 }
2609 EXPORT_SYMBOL(skb_set_owner_w);
2610 
2611 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2612 {
2613 	/* Drivers depend on in-order delivery for crypto offload,
2614 	 * partial orphan breaks out-of-order-OK logic.
2615 	 */
2616 	if (skb_is_decrypted(skb))
2617 		return false;
2618 
2619 	return (skb->destructor == sock_wfree ||
2620 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2621 }
2622 
2623 /* This helper is used by netem, as it can hold packets in its
2624  * delay queue. We want to allow the owner socket to send more
2625  * packets, as if they were already TX completed by a typical driver.
2626  * But we also want to keep skb->sk set because some packet schedulers
2627  * rely on it (sch_fq for example).
2628  */
2629 void skb_orphan_partial(struct sk_buff *skb)
2630 {
2631 	if (skb_is_tcp_pure_ack(skb))
2632 		return;
2633 
2634 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2635 		return;
2636 
2637 	skb_orphan(skb);
2638 }
2639 EXPORT_SYMBOL(skb_orphan_partial);
2640 
2641 /*
2642  * Read buffer destructor automatically called from kfree_skb.
2643  */
2644 void sock_rfree(struct sk_buff *skb)
2645 {
2646 	struct sock *sk = skb->sk;
2647 	unsigned int len = skb->truesize;
2648 
2649 	atomic_sub(len, &sk->sk_rmem_alloc);
2650 	sk_mem_uncharge(sk, len);
2651 }
2652 EXPORT_SYMBOL(sock_rfree);
2653 
2654 /*
2655  * Buffer destructor for skbs that are not used directly in read or write
2656  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2657  */
2658 void sock_efree(struct sk_buff *skb)
2659 {
2660 	sock_put(skb->sk);
2661 }
2662 EXPORT_SYMBOL(sock_efree);
2663 
2664 /* Buffer destructor for prefetch/receive path where reference count may
2665  * not be held, e.g. for listen sockets.
2666  */
2667 #ifdef CONFIG_INET
2668 void sock_pfree(struct sk_buff *skb)
2669 {
2670 	struct sock *sk = skb->sk;
2671 
2672 	if (!sk_is_refcounted(sk))
2673 		return;
2674 
2675 	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2676 		inet_reqsk(sk)->rsk_listener = NULL;
2677 		reqsk_free(inet_reqsk(sk));
2678 		return;
2679 	}
2680 
2681 	sock_gen_put(sk);
2682 }
2683 EXPORT_SYMBOL(sock_pfree);
2684 #endif /* CONFIG_INET */
2685 
2686 kuid_t sock_i_uid(struct sock *sk)
2687 {
2688 	kuid_t uid;
2689 
2690 	read_lock_bh(&sk->sk_callback_lock);
2691 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2692 	read_unlock_bh(&sk->sk_callback_lock);
2693 	return uid;
2694 }
2695 EXPORT_SYMBOL(sock_i_uid);
2696 
2697 unsigned long __sock_i_ino(struct sock *sk)
2698 {
2699 	unsigned long ino;
2700 
2701 	read_lock(&sk->sk_callback_lock);
2702 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2703 	read_unlock(&sk->sk_callback_lock);
2704 	return ino;
2705 }
2706 EXPORT_SYMBOL(__sock_i_ino);
2707 
2708 unsigned long sock_i_ino(struct sock *sk)
2709 {
2710 	unsigned long ino;
2711 
2712 	local_bh_disable();
2713 	ino = __sock_i_ino(sk);
2714 	local_bh_enable();
2715 	return ino;
2716 }
2717 EXPORT_SYMBOL(sock_i_ino);
2718 
2719 /*
2720  * Allocate a skb from the socket's send buffer.
2721  */
2722 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2723 			     gfp_t priority)
2724 {
2725 	if (force ||
2726 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2727 		struct sk_buff *skb = alloc_skb(size, priority);
2728 
2729 		if (skb) {
2730 			skb_set_owner_w(skb, sk);
2731 			return skb;
2732 		}
2733 	}
2734 	return NULL;
2735 }
2736 EXPORT_SYMBOL(sock_wmalloc);
2737 
2738 static void sock_ofree(struct sk_buff *skb)
2739 {
2740 	struct sock *sk = skb->sk;
2741 
2742 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2743 }
2744 
2745 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2746 			     gfp_t priority)
2747 {
2748 	struct sk_buff *skb;
2749 
2750 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2751 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2752 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2753 		return NULL;
2754 
2755 	skb = alloc_skb(size, priority);
2756 	if (!skb)
2757 		return NULL;
2758 
2759 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2760 	skb->sk = sk;
2761 	skb->destructor = sock_ofree;
2762 	return skb;
2763 }
2764 
2765 /*
2766  * Allocate a memory block from the socket's option memory buffer.
2767  */
2768 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2769 {
2770 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2771 
2772 	if ((unsigned int)size <= optmem_max &&
2773 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2774 		void *mem;
2775 		/* First do the add, to avoid the race if kmalloc
2776 		 * might sleep.
2777 		 */
2778 		atomic_add(size, &sk->sk_omem_alloc);
2779 		mem = kmalloc(size, priority);
2780 		if (mem)
2781 			return mem;
2782 		atomic_sub(size, &sk->sk_omem_alloc);
2783 	}
2784 	return NULL;
2785 }
2786 EXPORT_SYMBOL(sock_kmalloc);
2787 
2788 /* Free an option memory block. Note, we actually want the inline
2789  * here as this allows gcc to detect the nullify and fold away the
2790  * condition entirely.
2791  */
2792 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2793 				  const bool nullify)
2794 {
2795 	if (WARN_ON_ONCE(!mem))
2796 		return;
2797 	if (nullify)
2798 		kfree_sensitive(mem);
2799 	else
2800 		kfree(mem);
2801 	atomic_sub(size, &sk->sk_omem_alloc);
2802 }
2803 
2804 void sock_kfree_s(struct sock *sk, void *mem, int size)
2805 {
2806 	__sock_kfree_s(sk, mem, size, false);
2807 }
2808 EXPORT_SYMBOL(sock_kfree_s);
2809 
2810 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2811 {
2812 	__sock_kfree_s(sk, mem, size, true);
2813 }
2814 EXPORT_SYMBOL(sock_kzfree_s);
2815 
2816 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2817    I think, these locks should be removed for datagram sockets.
2818  */
2819 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2820 {
2821 	DEFINE_WAIT(wait);
2822 
2823 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2824 	for (;;) {
2825 		if (!timeo)
2826 			break;
2827 		if (signal_pending(current))
2828 			break;
2829 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2830 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2831 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2832 			break;
2833 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2834 			break;
2835 		if (READ_ONCE(sk->sk_err))
2836 			break;
2837 		timeo = schedule_timeout(timeo);
2838 	}
2839 	finish_wait(sk_sleep(sk), &wait);
2840 	return timeo;
2841 }
2842 
2843 
2844 /*
2845  *	Generic send/receive buffer handlers
2846  */
2847 
2848 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2849 				     unsigned long data_len, int noblock,
2850 				     int *errcode, int max_page_order)
2851 {
2852 	struct sk_buff *skb;
2853 	long timeo;
2854 	int err;
2855 
2856 	timeo = sock_sndtimeo(sk, noblock);
2857 	for (;;) {
2858 		err = sock_error(sk);
2859 		if (err != 0)
2860 			goto failure;
2861 
2862 		err = -EPIPE;
2863 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2864 			goto failure;
2865 
2866 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2867 			break;
2868 
2869 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2870 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2871 		err = -EAGAIN;
2872 		if (!timeo)
2873 			goto failure;
2874 		if (signal_pending(current))
2875 			goto interrupted;
2876 		timeo = sock_wait_for_wmem(sk, timeo);
2877 	}
2878 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2879 				   errcode, sk->sk_allocation);
2880 	if (skb)
2881 		skb_set_owner_w(skb, sk);
2882 	return skb;
2883 
2884 interrupted:
2885 	err = sock_intr_errno(timeo);
2886 failure:
2887 	*errcode = err;
2888 	return NULL;
2889 }
2890 EXPORT_SYMBOL(sock_alloc_send_pskb);
2891 
2892 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2893 		     struct sockcm_cookie *sockc)
2894 {
2895 	u32 tsflags;
2896 
2897 	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
2898 
2899 	switch (cmsg->cmsg_type) {
2900 	case SO_MARK:
2901 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2902 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2903 			return -EPERM;
2904 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2905 			return -EINVAL;
2906 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2907 		break;
2908 	case SO_TIMESTAMPING_OLD:
2909 	case SO_TIMESTAMPING_NEW:
2910 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2911 			return -EINVAL;
2912 
2913 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2914 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2915 			return -EINVAL;
2916 
2917 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2918 		sockc->tsflags |= tsflags;
2919 		break;
2920 	case SCM_TXTIME:
2921 		if (!sock_flag(sk, SOCK_TXTIME))
2922 			return -EINVAL;
2923 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2924 			return -EINVAL;
2925 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2926 		break;
2927 	case SCM_TS_OPT_ID:
2928 		if (sk_is_tcp(sk))
2929 			return -EINVAL;
2930 		tsflags = READ_ONCE(sk->sk_tsflags);
2931 		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
2932 			return -EINVAL;
2933 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2934 			return -EINVAL;
2935 		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
2936 		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
2937 		break;
2938 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2939 	case SCM_RIGHTS:
2940 	case SCM_CREDENTIALS:
2941 		break;
2942 	default:
2943 		return -EINVAL;
2944 	}
2945 	return 0;
2946 }
2947 EXPORT_SYMBOL(__sock_cmsg_send);
2948 
2949 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2950 		   struct sockcm_cookie *sockc)
2951 {
2952 	struct cmsghdr *cmsg;
2953 	int ret;
2954 
2955 	for_each_cmsghdr(cmsg, msg) {
2956 		if (!CMSG_OK(msg, cmsg))
2957 			return -EINVAL;
2958 		if (cmsg->cmsg_level != SOL_SOCKET)
2959 			continue;
2960 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2961 		if (ret)
2962 			return ret;
2963 	}
2964 	return 0;
2965 }
2966 EXPORT_SYMBOL(sock_cmsg_send);
2967 
2968 static void sk_enter_memory_pressure(struct sock *sk)
2969 {
2970 	if (!sk->sk_prot->enter_memory_pressure)
2971 		return;
2972 
2973 	sk->sk_prot->enter_memory_pressure(sk);
2974 }
2975 
2976 static void sk_leave_memory_pressure(struct sock *sk)
2977 {
2978 	if (sk->sk_prot->leave_memory_pressure) {
2979 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2980 				     tcp_leave_memory_pressure, sk);
2981 	} else {
2982 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2983 
2984 		if (memory_pressure && READ_ONCE(*memory_pressure))
2985 			WRITE_ONCE(*memory_pressure, 0);
2986 	}
2987 }
2988 
2989 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2990 
2991 /**
2992  * skb_page_frag_refill - check that a page_frag contains enough room
2993  * @sz: minimum size of the fragment we want to get
2994  * @pfrag: pointer to page_frag
2995  * @gfp: priority for memory allocation
2996  *
2997  * Note: While this allocator tries to use high order pages, there is
2998  * no guarantee that allocations succeed. Therefore, @sz MUST be
2999  * less or equal than PAGE_SIZE.
3000  */
3001 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3002 {
3003 	if (pfrag->page) {
3004 		if (page_ref_count(pfrag->page) == 1) {
3005 			pfrag->offset = 0;
3006 			return true;
3007 		}
3008 		if (pfrag->offset + sz <= pfrag->size)
3009 			return true;
3010 		put_page(pfrag->page);
3011 	}
3012 
3013 	pfrag->offset = 0;
3014 	if (SKB_FRAG_PAGE_ORDER &&
3015 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3016 		/* Avoid direct reclaim but allow kswapd to wake */
3017 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3018 					  __GFP_COMP | __GFP_NOWARN |
3019 					  __GFP_NORETRY,
3020 					  SKB_FRAG_PAGE_ORDER);
3021 		if (likely(pfrag->page)) {
3022 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3023 			return true;
3024 		}
3025 	}
3026 	pfrag->page = alloc_page(gfp);
3027 	if (likely(pfrag->page)) {
3028 		pfrag->size = PAGE_SIZE;
3029 		return true;
3030 	}
3031 	return false;
3032 }
3033 EXPORT_SYMBOL(skb_page_frag_refill);
3034 
3035 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3036 {
3037 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3038 		return true;
3039 
3040 	sk_enter_memory_pressure(sk);
3041 	sk_stream_moderate_sndbuf(sk);
3042 	return false;
3043 }
3044 EXPORT_SYMBOL(sk_page_frag_refill);
3045 
3046 void __lock_sock(struct sock *sk)
3047 	__releases(&sk->sk_lock.slock)
3048 	__acquires(&sk->sk_lock.slock)
3049 {
3050 	DEFINE_WAIT(wait);
3051 
3052 	for (;;) {
3053 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3054 					TASK_UNINTERRUPTIBLE);
3055 		spin_unlock_bh(&sk->sk_lock.slock);
3056 		schedule();
3057 		spin_lock_bh(&sk->sk_lock.slock);
3058 		if (!sock_owned_by_user(sk))
3059 			break;
3060 	}
3061 	finish_wait(&sk->sk_lock.wq, &wait);
3062 }
3063 
3064 void __release_sock(struct sock *sk)
3065 	__releases(&sk->sk_lock.slock)
3066 	__acquires(&sk->sk_lock.slock)
3067 {
3068 	struct sk_buff *skb, *next;
3069 
3070 	while ((skb = sk->sk_backlog.head) != NULL) {
3071 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3072 
3073 		spin_unlock_bh(&sk->sk_lock.slock);
3074 
3075 		do {
3076 			next = skb->next;
3077 			prefetch(next);
3078 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3079 			skb_mark_not_on_list(skb);
3080 			sk_backlog_rcv(sk, skb);
3081 
3082 			cond_resched();
3083 
3084 			skb = next;
3085 		} while (skb != NULL);
3086 
3087 		spin_lock_bh(&sk->sk_lock.slock);
3088 	}
3089 
3090 	/*
3091 	 * Doing the zeroing here guarantee we can not loop forever
3092 	 * while a wild producer attempts to flood us.
3093 	 */
3094 	sk->sk_backlog.len = 0;
3095 }
3096 
3097 void __sk_flush_backlog(struct sock *sk)
3098 {
3099 	spin_lock_bh(&sk->sk_lock.slock);
3100 	__release_sock(sk);
3101 
3102 	if (sk->sk_prot->release_cb)
3103 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3104 				     tcp_release_cb, sk);
3105 
3106 	spin_unlock_bh(&sk->sk_lock.slock);
3107 }
3108 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3109 
3110 /**
3111  * sk_wait_data - wait for data to arrive at sk_receive_queue
3112  * @sk:    sock to wait on
3113  * @timeo: for how long
3114  * @skb:   last skb seen on sk_receive_queue
3115  *
3116  * Now socket state including sk->sk_err is changed only under lock,
3117  * hence we may omit checks after joining wait queue.
3118  * We check receive queue before schedule() only as optimization;
3119  * it is very likely that release_sock() added new data.
3120  */
3121 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3122 {
3123 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3124 	int rc;
3125 
3126 	add_wait_queue(sk_sleep(sk), &wait);
3127 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3128 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3129 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3130 	remove_wait_queue(sk_sleep(sk), &wait);
3131 	return rc;
3132 }
3133 EXPORT_SYMBOL(sk_wait_data);
3134 
3135 /**
3136  *	__sk_mem_raise_allocated - increase memory_allocated
3137  *	@sk: socket
3138  *	@size: memory size to allocate
3139  *	@amt: pages to allocate
3140  *	@kind: allocation type
3141  *
3142  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3143  *
3144  *	Unlike the globally shared limits among the sockets under same protocol,
3145  *	consuming the budget of a memcg won't have direct effect on other ones.
3146  *	So be optimistic about memcg's tolerance, and leave the callers to decide
3147  *	whether or not to raise allocated through sk_under_memory_pressure() or
3148  *	its variants.
3149  */
3150 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3151 {
3152 	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3153 	struct proto *prot = sk->sk_prot;
3154 	bool charged = false;
3155 	long allocated;
3156 
3157 	sk_memory_allocated_add(sk, amt);
3158 	allocated = sk_memory_allocated(sk);
3159 
3160 	if (memcg) {
3161 		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3162 			goto suppress_allocation;
3163 		charged = true;
3164 	}
3165 
3166 	/* Under limit. */
3167 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3168 		sk_leave_memory_pressure(sk);
3169 		return 1;
3170 	}
3171 
3172 	/* Under pressure. */
3173 	if (allocated > sk_prot_mem_limits(sk, 1))
3174 		sk_enter_memory_pressure(sk);
3175 
3176 	/* Over hard limit. */
3177 	if (allocated > sk_prot_mem_limits(sk, 2))
3178 		goto suppress_allocation;
3179 
3180 	/* Guarantee minimum buffer size under pressure (either global
3181 	 * or memcg) to make sure features described in RFC 7323 (TCP
3182 	 * Extensions for High Performance) work properly.
3183 	 *
3184 	 * This rule does NOT stand when exceeds global or memcg's hard
3185 	 * limit, or else a DoS attack can be taken place by spawning
3186 	 * lots of sockets whose usage are under minimum buffer size.
3187 	 */
3188 	if (kind == SK_MEM_RECV) {
3189 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3190 			return 1;
3191 
3192 	} else { /* SK_MEM_SEND */
3193 		int wmem0 = sk_get_wmem0(sk, prot);
3194 
3195 		if (sk->sk_type == SOCK_STREAM) {
3196 			if (sk->sk_wmem_queued < wmem0)
3197 				return 1;
3198 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3199 				return 1;
3200 		}
3201 	}
3202 
3203 	if (sk_has_memory_pressure(sk)) {
3204 		u64 alloc;
3205 
3206 		/* The following 'average' heuristic is within the
3207 		 * scope of global accounting, so it only makes
3208 		 * sense for global memory pressure.
3209 		 */
3210 		if (!sk_under_global_memory_pressure(sk))
3211 			return 1;
3212 
3213 		/* Try to be fair among all the sockets under global
3214 		 * pressure by allowing the ones that below average
3215 		 * usage to raise.
3216 		 */
3217 		alloc = sk_sockets_allocated_read_positive(sk);
3218 		if (sk_prot_mem_limits(sk, 2) > alloc *
3219 		    sk_mem_pages(sk->sk_wmem_queued +
3220 				 atomic_read(&sk->sk_rmem_alloc) +
3221 				 sk->sk_forward_alloc))
3222 			return 1;
3223 	}
3224 
3225 suppress_allocation:
3226 
3227 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3228 		sk_stream_moderate_sndbuf(sk);
3229 
3230 		/* Fail only if socket is _under_ its sndbuf.
3231 		 * In this case we cannot block, so that we have to fail.
3232 		 */
3233 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3234 			/* Force charge with __GFP_NOFAIL */
3235 			if (memcg && !charged) {
3236 				mem_cgroup_charge_skmem(memcg, amt,
3237 					gfp_memcg_charge() | __GFP_NOFAIL);
3238 			}
3239 			return 1;
3240 		}
3241 	}
3242 
3243 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3244 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3245 
3246 	sk_memory_allocated_sub(sk, amt);
3247 
3248 	if (charged)
3249 		mem_cgroup_uncharge_skmem(memcg, amt);
3250 
3251 	return 0;
3252 }
3253 
3254 /**
3255  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3256  *	@sk: socket
3257  *	@size: memory size to allocate
3258  *	@kind: allocation type
3259  *
3260  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3261  *	rmem allocation. This function assumes that protocols which have
3262  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3263  */
3264 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3265 {
3266 	int ret, amt = sk_mem_pages(size);
3267 
3268 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3269 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3270 	if (!ret)
3271 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3272 	return ret;
3273 }
3274 EXPORT_SYMBOL(__sk_mem_schedule);
3275 
3276 /**
3277  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3278  *	@sk: socket
3279  *	@amount: number of quanta
3280  *
3281  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3282  */
3283 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3284 {
3285 	sk_memory_allocated_sub(sk, amount);
3286 
3287 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3288 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3289 
3290 	if (sk_under_global_memory_pressure(sk) &&
3291 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3292 		sk_leave_memory_pressure(sk);
3293 }
3294 
3295 /**
3296  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3297  *	@sk: socket
3298  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3299  */
3300 void __sk_mem_reclaim(struct sock *sk, int amount)
3301 {
3302 	amount >>= PAGE_SHIFT;
3303 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3304 	__sk_mem_reduce_allocated(sk, amount);
3305 }
3306 EXPORT_SYMBOL(__sk_mem_reclaim);
3307 
3308 int sk_set_peek_off(struct sock *sk, int val)
3309 {
3310 	WRITE_ONCE(sk->sk_peek_off, val);
3311 	return 0;
3312 }
3313 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3314 
3315 /*
3316  * Set of default routines for initialising struct proto_ops when
3317  * the protocol does not support a particular function. In certain
3318  * cases where it makes no sense for a protocol to have a "do nothing"
3319  * function, some default processing is provided.
3320  */
3321 
3322 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3323 {
3324 	return -EOPNOTSUPP;
3325 }
3326 EXPORT_SYMBOL(sock_no_bind);
3327 
3328 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3329 		    int len, int flags)
3330 {
3331 	return -EOPNOTSUPP;
3332 }
3333 EXPORT_SYMBOL(sock_no_connect);
3334 
3335 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3336 {
3337 	return -EOPNOTSUPP;
3338 }
3339 EXPORT_SYMBOL(sock_no_socketpair);
3340 
3341 int sock_no_accept(struct socket *sock, struct socket *newsock,
3342 		   struct proto_accept_arg *arg)
3343 {
3344 	return -EOPNOTSUPP;
3345 }
3346 EXPORT_SYMBOL(sock_no_accept);
3347 
3348 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3349 		    int peer)
3350 {
3351 	return -EOPNOTSUPP;
3352 }
3353 EXPORT_SYMBOL(sock_no_getname);
3354 
3355 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3356 {
3357 	return -EOPNOTSUPP;
3358 }
3359 EXPORT_SYMBOL(sock_no_ioctl);
3360 
3361 int sock_no_listen(struct socket *sock, int backlog)
3362 {
3363 	return -EOPNOTSUPP;
3364 }
3365 EXPORT_SYMBOL(sock_no_listen);
3366 
3367 int sock_no_shutdown(struct socket *sock, int how)
3368 {
3369 	return -EOPNOTSUPP;
3370 }
3371 EXPORT_SYMBOL(sock_no_shutdown);
3372 
3373 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3374 {
3375 	return -EOPNOTSUPP;
3376 }
3377 EXPORT_SYMBOL(sock_no_sendmsg);
3378 
3379 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3380 {
3381 	return -EOPNOTSUPP;
3382 }
3383 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3384 
3385 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3386 		    int flags)
3387 {
3388 	return -EOPNOTSUPP;
3389 }
3390 EXPORT_SYMBOL(sock_no_recvmsg);
3391 
3392 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3393 {
3394 	/* Mirror missing mmap method error code */
3395 	return -ENODEV;
3396 }
3397 EXPORT_SYMBOL(sock_no_mmap);
3398 
3399 /*
3400  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3401  * various sock-based usage counts.
3402  */
3403 void __receive_sock(struct file *file)
3404 {
3405 	struct socket *sock;
3406 
3407 	sock = sock_from_file(file);
3408 	if (sock) {
3409 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3410 		sock_update_classid(&sock->sk->sk_cgrp_data);
3411 	}
3412 }
3413 
3414 /*
3415  *	Default Socket Callbacks
3416  */
3417 
3418 static void sock_def_wakeup(struct sock *sk)
3419 {
3420 	struct socket_wq *wq;
3421 
3422 	rcu_read_lock();
3423 	wq = rcu_dereference(sk->sk_wq);
3424 	if (skwq_has_sleeper(wq))
3425 		wake_up_interruptible_all(&wq->wait);
3426 	rcu_read_unlock();
3427 }
3428 
3429 static void sock_def_error_report(struct sock *sk)
3430 {
3431 	struct socket_wq *wq;
3432 
3433 	rcu_read_lock();
3434 	wq = rcu_dereference(sk->sk_wq);
3435 	if (skwq_has_sleeper(wq))
3436 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3437 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3438 	rcu_read_unlock();
3439 }
3440 
3441 void sock_def_readable(struct sock *sk)
3442 {
3443 	struct socket_wq *wq;
3444 
3445 	trace_sk_data_ready(sk);
3446 
3447 	rcu_read_lock();
3448 	wq = rcu_dereference(sk->sk_wq);
3449 	if (skwq_has_sleeper(wq))
3450 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3451 						EPOLLRDNORM | EPOLLRDBAND);
3452 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3453 	rcu_read_unlock();
3454 }
3455 
3456 static void sock_def_write_space(struct sock *sk)
3457 {
3458 	struct socket_wq *wq;
3459 
3460 	rcu_read_lock();
3461 
3462 	/* Do not wake up a writer until he can make "significant"
3463 	 * progress.  --DaveM
3464 	 */
3465 	if (sock_writeable(sk)) {
3466 		wq = rcu_dereference(sk->sk_wq);
3467 		if (skwq_has_sleeper(wq))
3468 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3469 						EPOLLWRNORM | EPOLLWRBAND);
3470 
3471 		/* Should agree with poll, otherwise some programs break */
3472 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3473 	}
3474 
3475 	rcu_read_unlock();
3476 }
3477 
3478 /* An optimised version of sock_def_write_space(), should only be called
3479  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3480  * ->sk_wmem_alloc.
3481  */
3482 static void sock_def_write_space_wfree(struct sock *sk)
3483 {
3484 	/* Do not wake up a writer until he can make "significant"
3485 	 * progress.  --DaveM
3486 	 */
3487 	if (sock_writeable(sk)) {
3488 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3489 
3490 		/* rely on refcount_sub from sock_wfree() */
3491 		smp_mb__after_atomic();
3492 		if (wq && waitqueue_active(&wq->wait))
3493 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3494 						EPOLLWRNORM | EPOLLWRBAND);
3495 
3496 		/* Should agree with poll, otherwise some programs break */
3497 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3498 	}
3499 }
3500 
3501 static void sock_def_destruct(struct sock *sk)
3502 {
3503 }
3504 
3505 void sk_send_sigurg(struct sock *sk)
3506 {
3507 	if (sk->sk_socket && sk->sk_socket->file)
3508 		if (send_sigurg(sk->sk_socket->file))
3509 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3510 }
3511 EXPORT_SYMBOL(sk_send_sigurg);
3512 
3513 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3514 		    unsigned long expires)
3515 {
3516 	if (!mod_timer(timer, expires))
3517 		sock_hold(sk);
3518 }
3519 EXPORT_SYMBOL(sk_reset_timer);
3520 
3521 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3522 {
3523 	if (del_timer(timer))
3524 		__sock_put(sk);
3525 }
3526 EXPORT_SYMBOL(sk_stop_timer);
3527 
3528 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3529 {
3530 	if (del_timer_sync(timer))
3531 		__sock_put(sk);
3532 }
3533 EXPORT_SYMBOL(sk_stop_timer_sync);
3534 
3535 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3536 {
3537 	sk_init_common(sk);
3538 	sk->sk_send_head	=	NULL;
3539 
3540 	timer_setup(&sk->sk_timer, NULL, 0);
3541 
3542 	sk->sk_allocation	=	GFP_KERNEL;
3543 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3544 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3545 	sk->sk_state		=	TCP_CLOSE;
3546 	sk->sk_use_task_frag	=	true;
3547 	sk_set_socket(sk, sock);
3548 
3549 	sock_set_flag(sk, SOCK_ZAPPED);
3550 
3551 	if (sock) {
3552 		sk->sk_type	=	sock->type;
3553 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3554 		sock->sk	=	sk;
3555 	} else {
3556 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3557 	}
3558 	sk->sk_uid	=	uid;
3559 
3560 	sk->sk_state_change	=	sock_def_wakeup;
3561 	sk->sk_data_ready	=	sock_def_readable;
3562 	sk->sk_write_space	=	sock_def_write_space;
3563 	sk->sk_error_report	=	sock_def_error_report;
3564 	sk->sk_destruct		=	sock_def_destruct;
3565 
3566 	sk->sk_frag.page	=	NULL;
3567 	sk->sk_frag.offset	=	0;
3568 	sk->sk_peek_off		=	-1;
3569 
3570 	sk->sk_peer_pid 	=	NULL;
3571 	sk->sk_peer_cred	=	NULL;
3572 	spin_lock_init(&sk->sk_peer_lock);
3573 
3574 	sk->sk_write_pending	=	0;
3575 	sk->sk_rcvlowat		=	1;
3576 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3577 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3578 
3579 	sk->sk_stamp = SK_DEFAULT_STAMP;
3580 #if BITS_PER_LONG==32
3581 	seqlock_init(&sk->sk_stamp_seq);
3582 #endif
3583 	atomic_set(&sk->sk_zckey, 0);
3584 
3585 #ifdef CONFIG_NET_RX_BUSY_POLL
3586 	sk->sk_napi_id		=	0;
3587 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3588 #endif
3589 
3590 	sk->sk_max_pacing_rate = ~0UL;
3591 	sk->sk_pacing_rate = ~0UL;
3592 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3593 	sk->sk_incoming_cpu = -1;
3594 
3595 	sk_rx_queue_clear(sk);
3596 	/*
3597 	 * Before updating sk_refcnt, we must commit prior changes to memory
3598 	 * (Documentation/RCU/rculist_nulls.rst for details)
3599 	 */
3600 	smp_wmb();
3601 	refcount_set(&sk->sk_refcnt, 1);
3602 	atomic_set(&sk->sk_drops, 0);
3603 }
3604 EXPORT_SYMBOL(sock_init_data_uid);
3605 
3606 void sock_init_data(struct socket *sock, struct sock *sk)
3607 {
3608 	kuid_t uid = sock ?
3609 		SOCK_INODE(sock)->i_uid :
3610 		make_kuid(sock_net(sk)->user_ns, 0);
3611 
3612 	sock_init_data_uid(sock, sk, uid);
3613 }
3614 EXPORT_SYMBOL(sock_init_data);
3615 
3616 void lock_sock_nested(struct sock *sk, int subclass)
3617 {
3618 	/* The sk_lock has mutex_lock() semantics here. */
3619 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3620 
3621 	might_sleep();
3622 	spin_lock_bh(&sk->sk_lock.slock);
3623 	if (sock_owned_by_user_nocheck(sk))
3624 		__lock_sock(sk);
3625 	sk->sk_lock.owned = 1;
3626 	spin_unlock_bh(&sk->sk_lock.slock);
3627 }
3628 EXPORT_SYMBOL(lock_sock_nested);
3629 
3630 void release_sock(struct sock *sk)
3631 {
3632 	spin_lock_bh(&sk->sk_lock.slock);
3633 	if (sk->sk_backlog.tail)
3634 		__release_sock(sk);
3635 
3636 	if (sk->sk_prot->release_cb)
3637 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3638 				     tcp_release_cb, sk);
3639 
3640 	sock_release_ownership(sk);
3641 	if (waitqueue_active(&sk->sk_lock.wq))
3642 		wake_up(&sk->sk_lock.wq);
3643 	spin_unlock_bh(&sk->sk_lock.slock);
3644 }
3645 EXPORT_SYMBOL(release_sock);
3646 
3647 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3648 {
3649 	might_sleep();
3650 	spin_lock_bh(&sk->sk_lock.slock);
3651 
3652 	if (!sock_owned_by_user_nocheck(sk)) {
3653 		/*
3654 		 * Fast path return with bottom halves disabled and
3655 		 * sock::sk_lock.slock held.
3656 		 *
3657 		 * The 'mutex' is not contended and holding
3658 		 * sock::sk_lock.slock prevents all other lockers to
3659 		 * proceed so the corresponding unlock_sock_fast() can
3660 		 * avoid the slow path of release_sock() completely and
3661 		 * just release slock.
3662 		 *
3663 		 * From a semantical POV this is equivalent to 'acquiring'
3664 		 * the 'mutex', hence the corresponding lockdep
3665 		 * mutex_release() has to happen in the fast path of
3666 		 * unlock_sock_fast().
3667 		 */
3668 		return false;
3669 	}
3670 
3671 	__lock_sock(sk);
3672 	sk->sk_lock.owned = 1;
3673 	__acquire(&sk->sk_lock.slock);
3674 	spin_unlock_bh(&sk->sk_lock.slock);
3675 	return true;
3676 }
3677 EXPORT_SYMBOL(__lock_sock_fast);
3678 
3679 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3680 		   bool timeval, bool time32)
3681 {
3682 	struct sock *sk = sock->sk;
3683 	struct timespec64 ts;
3684 
3685 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3686 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3687 	if (ts.tv_sec == -1)
3688 		return -ENOENT;
3689 	if (ts.tv_sec == 0) {
3690 		ktime_t kt = ktime_get_real();
3691 		sock_write_timestamp(sk, kt);
3692 		ts = ktime_to_timespec64(kt);
3693 	}
3694 
3695 	if (timeval)
3696 		ts.tv_nsec /= 1000;
3697 
3698 #ifdef CONFIG_COMPAT_32BIT_TIME
3699 	if (time32)
3700 		return put_old_timespec32(&ts, userstamp);
3701 #endif
3702 #ifdef CONFIG_SPARC64
3703 	/* beware of padding in sparc64 timeval */
3704 	if (timeval && !in_compat_syscall()) {
3705 		struct __kernel_old_timeval __user tv = {
3706 			.tv_sec = ts.tv_sec,
3707 			.tv_usec = ts.tv_nsec,
3708 		};
3709 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3710 			return -EFAULT;
3711 		return 0;
3712 	}
3713 #endif
3714 	return put_timespec64(&ts, userstamp);
3715 }
3716 EXPORT_SYMBOL(sock_gettstamp);
3717 
3718 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3719 {
3720 	if (!sock_flag(sk, flag)) {
3721 		unsigned long previous_flags = sk->sk_flags;
3722 
3723 		sock_set_flag(sk, flag);
3724 		/*
3725 		 * we just set one of the two flags which require net
3726 		 * time stamping, but time stamping might have been on
3727 		 * already because of the other one
3728 		 */
3729 		if (sock_needs_netstamp(sk) &&
3730 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3731 			net_enable_timestamp();
3732 	}
3733 }
3734 
3735 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3736 		       int level, int type)
3737 {
3738 	struct sock_exterr_skb *serr;
3739 	struct sk_buff *skb;
3740 	int copied, err;
3741 
3742 	err = -EAGAIN;
3743 	skb = sock_dequeue_err_skb(sk);
3744 	if (skb == NULL)
3745 		goto out;
3746 
3747 	copied = skb->len;
3748 	if (copied > len) {
3749 		msg->msg_flags |= MSG_TRUNC;
3750 		copied = len;
3751 	}
3752 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3753 	if (err)
3754 		goto out_free_skb;
3755 
3756 	sock_recv_timestamp(msg, sk, skb);
3757 
3758 	serr = SKB_EXT_ERR(skb);
3759 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3760 
3761 	msg->msg_flags |= MSG_ERRQUEUE;
3762 	err = copied;
3763 
3764 out_free_skb:
3765 	kfree_skb(skb);
3766 out:
3767 	return err;
3768 }
3769 EXPORT_SYMBOL(sock_recv_errqueue);
3770 
3771 /*
3772  *	Get a socket option on an socket.
3773  *
3774  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3775  *	asynchronous errors should be reported by getsockopt. We assume
3776  *	this means if you specify SO_ERROR (otherwise what is the point of it).
3777  */
3778 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3779 			   char __user *optval, int __user *optlen)
3780 {
3781 	struct sock *sk = sock->sk;
3782 
3783 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3784 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3785 }
3786 EXPORT_SYMBOL(sock_common_getsockopt);
3787 
3788 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3789 			int flags)
3790 {
3791 	struct sock *sk = sock->sk;
3792 	int addr_len = 0;
3793 	int err;
3794 
3795 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3796 	if (err >= 0)
3797 		msg->msg_namelen = addr_len;
3798 	return err;
3799 }
3800 EXPORT_SYMBOL(sock_common_recvmsg);
3801 
3802 /*
3803  *	Set socket options on an inet socket.
3804  */
3805 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3806 			   sockptr_t optval, unsigned int optlen)
3807 {
3808 	struct sock *sk = sock->sk;
3809 
3810 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3811 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3812 }
3813 EXPORT_SYMBOL(sock_common_setsockopt);
3814 
3815 void sk_common_release(struct sock *sk)
3816 {
3817 	if (sk->sk_prot->destroy)
3818 		sk->sk_prot->destroy(sk);
3819 
3820 	/*
3821 	 * Observation: when sk_common_release is called, processes have
3822 	 * no access to socket. But net still has.
3823 	 * Step one, detach it from networking:
3824 	 *
3825 	 * A. Remove from hash tables.
3826 	 */
3827 
3828 	sk->sk_prot->unhash(sk);
3829 
3830 	if (sk->sk_socket)
3831 		sk->sk_socket->sk = NULL;
3832 
3833 	/*
3834 	 * In this point socket cannot receive new packets, but it is possible
3835 	 * that some packets are in flight because some CPU runs receiver and
3836 	 * did hash table lookup before we unhashed socket. They will achieve
3837 	 * receive queue and will be purged by socket destructor.
3838 	 *
3839 	 * Also we still have packets pending on receive queue and probably,
3840 	 * our own packets waiting in device queues. sock_destroy will drain
3841 	 * receive queue, but transmitted packets will delay socket destruction
3842 	 * until the last reference will be released.
3843 	 */
3844 
3845 	sock_orphan(sk);
3846 
3847 	xfrm_sk_free_policy(sk);
3848 
3849 	sock_put(sk);
3850 }
3851 EXPORT_SYMBOL(sk_common_release);
3852 
3853 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3854 {
3855 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3856 
3857 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3858 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3859 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3860 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3861 	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3862 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3863 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3864 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3865 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3866 }
3867 
3868 #ifdef CONFIG_PROC_FS
3869 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3870 
3871 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3872 {
3873 	int cpu, idx = prot->inuse_idx;
3874 	int res = 0;
3875 
3876 	for_each_possible_cpu(cpu)
3877 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3878 
3879 	return res >= 0 ? res : 0;
3880 }
3881 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3882 
3883 int sock_inuse_get(struct net *net)
3884 {
3885 	int cpu, res = 0;
3886 
3887 	for_each_possible_cpu(cpu)
3888 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3889 
3890 	return res;
3891 }
3892 
3893 EXPORT_SYMBOL_GPL(sock_inuse_get);
3894 
3895 static int __net_init sock_inuse_init_net(struct net *net)
3896 {
3897 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3898 	if (net->core.prot_inuse == NULL)
3899 		return -ENOMEM;
3900 	return 0;
3901 }
3902 
3903 static void __net_exit sock_inuse_exit_net(struct net *net)
3904 {
3905 	free_percpu(net->core.prot_inuse);
3906 }
3907 
3908 static struct pernet_operations net_inuse_ops = {
3909 	.init = sock_inuse_init_net,
3910 	.exit = sock_inuse_exit_net,
3911 };
3912 
3913 static __init int net_inuse_init(void)
3914 {
3915 	if (register_pernet_subsys(&net_inuse_ops))
3916 		panic("Cannot initialize net inuse counters");
3917 
3918 	return 0;
3919 }
3920 
3921 core_initcall(net_inuse_init);
3922 
3923 static int assign_proto_idx(struct proto *prot)
3924 {
3925 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3926 
3927 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3928 		pr_err("PROTO_INUSE_NR exhausted\n");
3929 		return -ENOSPC;
3930 	}
3931 
3932 	set_bit(prot->inuse_idx, proto_inuse_idx);
3933 	return 0;
3934 }
3935 
3936 static void release_proto_idx(struct proto *prot)
3937 {
3938 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3939 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3940 }
3941 #else
3942 static inline int assign_proto_idx(struct proto *prot)
3943 {
3944 	return 0;
3945 }
3946 
3947 static inline void release_proto_idx(struct proto *prot)
3948 {
3949 }
3950 
3951 #endif
3952 
3953 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3954 {
3955 	if (!twsk_prot)
3956 		return;
3957 	kfree(twsk_prot->twsk_slab_name);
3958 	twsk_prot->twsk_slab_name = NULL;
3959 	kmem_cache_destroy(twsk_prot->twsk_slab);
3960 	twsk_prot->twsk_slab = NULL;
3961 }
3962 
3963 static int tw_prot_init(const struct proto *prot)
3964 {
3965 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3966 
3967 	if (!twsk_prot)
3968 		return 0;
3969 
3970 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3971 					      prot->name);
3972 	if (!twsk_prot->twsk_slab_name)
3973 		return -ENOMEM;
3974 
3975 	twsk_prot->twsk_slab =
3976 		kmem_cache_create(twsk_prot->twsk_slab_name,
3977 				  twsk_prot->twsk_obj_size, 0,
3978 				  SLAB_ACCOUNT | prot->slab_flags,
3979 				  NULL);
3980 	if (!twsk_prot->twsk_slab) {
3981 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3982 			prot->name);
3983 		return -ENOMEM;
3984 	}
3985 
3986 	return 0;
3987 }
3988 
3989 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3990 {
3991 	if (!rsk_prot)
3992 		return;
3993 	kfree(rsk_prot->slab_name);
3994 	rsk_prot->slab_name = NULL;
3995 	kmem_cache_destroy(rsk_prot->slab);
3996 	rsk_prot->slab = NULL;
3997 }
3998 
3999 static int req_prot_init(const struct proto *prot)
4000 {
4001 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
4002 
4003 	if (!rsk_prot)
4004 		return 0;
4005 
4006 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4007 					prot->name);
4008 	if (!rsk_prot->slab_name)
4009 		return -ENOMEM;
4010 
4011 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4012 					   rsk_prot->obj_size, 0,
4013 					   SLAB_ACCOUNT | prot->slab_flags,
4014 					   NULL);
4015 
4016 	if (!rsk_prot->slab) {
4017 		pr_crit("%s: Can't create request sock SLAB cache!\n",
4018 			prot->name);
4019 		return -ENOMEM;
4020 	}
4021 	return 0;
4022 }
4023 
4024 int proto_register(struct proto *prot, int alloc_slab)
4025 {
4026 	int ret = -ENOBUFS;
4027 
4028 	if (prot->memory_allocated && !prot->sysctl_mem) {
4029 		pr_err("%s: missing sysctl_mem\n", prot->name);
4030 		return -EINVAL;
4031 	}
4032 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4033 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4034 		return -EINVAL;
4035 	}
4036 	if (alloc_slab) {
4037 		prot->slab = kmem_cache_create_usercopy(prot->name,
4038 					prot->obj_size, 0,
4039 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4040 					prot->slab_flags,
4041 					prot->useroffset, prot->usersize,
4042 					NULL);
4043 
4044 		if (prot->slab == NULL) {
4045 			pr_crit("%s: Can't create sock SLAB cache!\n",
4046 				prot->name);
4047 			goto out;
4048 		}
4049 
4050 		if (req_prot_init(prot))
4051 			goto out_free_request_sock_slab;
4052 
4053 		if (tw_prot_init(prot))
4054 			goto out_free_timewait_sock_slab;
4055 	}
4056 
4057 	mutex_lock(&proto_list_mutex);
4058 	ret = assign_proto_idx(prot);
4059 	if (ret) {
4060 		mutex_unlock(&proto_list_mutex);
4061 		goto out_free_timewait_sock_slab;
4062 	}
4063 	list_add(&prot->node, &proto_list);
4064 	mutex_unlock(&proto_list_mutex);
4065 	return ret;
4066 
4067 out_free_timewait_sock_slab:
4068 	if (alloc_slab)
4069 		tw_prot_cleanup(prot->twsk_prot);
4070 out_free_request_sock_slab:
4071 	if (alloc_slab) {
4072 		req_prot_cleanup(prot->rsk_prot);
4073 
4074 		kmem_cache_destroy(prot->slab);
4075 		prot->slab = NULL;
4076 	}
4077 out:
4078 	return ret;
4079 }
4080 EXPORT_SYMBOL(proto_register);
4081 
4082 void proto_unregister(struct proto *prot)
4083 {
4084 	mutex_lock(&proto_list_mutex);
4085 	release_proto_idx(prot);
4086 	list_del(&prot->node);
4087 	mutex_unlock(&proto_list_mutex);
4088 
4089 	kmem_cache_destroy(prot->slab);
4090 	prot->slab = NULL;
4091 
4092 	req_prot_cleanup(prot->rsk_prot);
4093 	tw_prot_cleanup(prot->twsk_prot);
4094 }
4095 EXPORT_SYMBOL(proto_unregister);
4096 
4097 int sock_load_diag_module(int family, int protocol)
4098 {
4099 	if (!protocol) {
4100 		if (!sock_is_registered(family))
4101 			return -ENOENT;
4102 
4103 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4104 				      NETLINK_SOCK_DIAG, family);
4105 	}
4106 
4107 #ifdef CONFIG_INET
4108 	if (family == AF_INET &&
4109 	    protocol != IPPROTO_RAW &&
4110 	    protocol < MAX_INET_PROTOS &&
4111 	    !rcu_access_pointer(inet_protos[protocol]))
4112 		return -ENOENT;
4113 #endif
4114 
4115 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4116 			      NETLINK_SOCK_DIAG, family, protocol);
4117 }
4118 EXPORT_SYMBOL(sock_load_diag_module);
4119 
4120 #ifdef CONFIG_PROC_FS
4121 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4122 	__acquires(proto_list_mutex)
4123 {
4124 	mutex_lock(&proto_list_mutex);
4125 	return seq_list_start_head(&proto_list, *pos);
4126 }
4127 
4128 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4129 {
4130 	return seq_list_next(v, &proto_list, pos);
4131 }
4132 
4133 static void proto_seq_stop(struct seq_file *seq, void *v)
4134 	__releases(proto_list_mutex)
4135 {
4136 	mutex_unlock(&proto_list_mutex);
4137 }
4138 
4139 static char proto_method_implemented(const void *method)
4140 {
4141 	return method == NULL ? 'n' : 'y';
4142 }
4143 static long sock_prot_memory_allocated(struct proto *proto)
4144 {
4145 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4146 }
4147 
4148 static const char *sock_prot_memory_pressure(struct proto *proto)
4149 {
4150 	return proto->memory_pressure != NULL ?
4151 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4152 }
4153 
4154 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4155 {
4156 
4157 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4158 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4159 		   proto->name,
4160 		   proto->obj_size,
4161 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4162 		   sock_prot_memory_allocated(proto),
4163 		   sock_prot_memory_pressure(proto),
4164 		   proto->max_header,
4165 		   proto->slab == NULL ? "no" : "yes",
4166 		   module_name(proto->owner),
4167 		   proto_method_implemented(proto->close),
4168 		   proto_method_implemented(proto->connect),
4169 		   proto_method_implemented(proto->disconnect),
4170 		   proto_method_implemented(proto->accept),
4171 		   proto_method_implemented(proto->ioctl),
4172 		   proto_method_implemented(proto->init),
4173 		   proto_method_implemented(proto->destroy),
4174 		   proto_method_implemented(proto->shutdown),
4175 		   proto_method_implemented(proto->setsockopt),
4176 		   proto_method_implemented(proto->getsockopt),
4177 		   proto_method_implemented(proto->sendmsg),
4178 		   proto_method_implemented(proto->recvmsg),
4179 		   proto_method_implemented(proto->bind),
4180 		   proto_method_implemented(proto->backlog_rcv),
4181 		   proto_method_implemented(proto->hash),
4182 		   proto_method_implemented(proto->unhash),
4183 		   proto_method_implemented(proto->get_port),
4184 		   proto_method_implemented(proto->enter_memory_pressure));
4185 }
4186 
4187 static int proto_seq_show(struct seq_file *seq, void *v)
4188 {
4189 	if (v == &proto_list)
4190 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4191 			   "protocol",
4192 			   "size",
4193 			   "sockets",
4194 			   "memory",
4195 			   "press",
4196 			   "maxhdr",
4197 			   "slab",
4198 			   "module",
4199 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4200 	else
4201 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4202 	return 0;
4203 }
4204 
4205 static const struct seq_operations proto_seq_ops = {
4206 	.start  = proto_seq_start,
4207 	.next   = proto_seq_next,
4208 	.stop   = proto_seq_stop,
4209 	.show   = proto_seq_show,
4210 };
4211 
4212 static __net_init int proto_init_net(struct net *net)
4213 {
4214 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4215 			sizeof(struct seq_net_private)))
4216 		return -ENOMEM;
4217 
4218 	return 0;
4219 }
4220 
4221 static __net_exit void proto_exit_net(struct net *net)
4222 {
4223 	remove_proc_entry("protocols", net->proc_net);
4224 }
4225 
4226 
4227 static __net_initdata struct pernet_operations proto_net_ops = {
4228 	.init = proto_init_net,
4229 	.exit = proto_exit_net,
4230 };
4231 
4232 static int __init proto_init(void)
4233 {
4234 	return register_pernet_subsys(&proto_net_ops);
4235 }
4236 
4237 subsys_initcall(proto_init);
4238 
4239 #endif /* PROC_FS */
4240 
4241 #ifdef CONFIG_NET_RX_BUSY_POLL
4242 bool sk_busy_loop_end(void *p, unsigned long start_time)
4243 {
4244 	struct sock *sk = p;
4245 
4246 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4247 		return true;
4248 
4249 	if (sk_is_udp(sk) &&
4250 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4251 		return true;
4252 
4253 	return sk_busy_loop_timeout(sk, start_time);
4254 }
4255 EXPORT_SYMBOL(sk_busy_loop_end);
4256 #endif /* CONFIG_NET_RX_BUSY_POLL */
4257 
4258 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4259 {
4260 	if (!sk->sk_prot->bind_add)
4261 		return -EOPNOTSUPP;
4262 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4263 }
4264 EXPORT_SYMBOL(sock_bind_add);
4265 
4266 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4267 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4268 		     void __user *arg, void *karg, size_t size)
4269 {
4270 	int ret;
4271 
4272 	if (copy_from_user(karg, arg, size))
4273 		return -EFAULT;
4274 
4275 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4276 	if (ret)
4277 		return ret;
4278 
4279 	if (copy_to_user(arg, karg, size))
4280 		return -EFAULT;
4281 
4282 	return 0;
4283 }
4284 EXPORT_SYMBOL(sock_ioctl_inout);
4285 
4286 /* This is the most common ioctl prep function, where the result (4 bytes) is
4287  * copied back to userspace if the ioctl() returns successfully. No input is
4288  * copied from userspace as input argument.
4289  */
4290 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4291 {
4292 	int ret, karg = 0;
4293 
4294 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4295 	if (ret)
4296 		return ret;
4297 
4298 	return put_user(karg, (int __user *)arg);
4299 }
4300 
4301 /* A wrapper around sock ioctls, which copies the data from userspace
4302  * (depending on the protocol/ioctl), and copies back the result to userspace.
4303  * The main motivation for this function is to pass kernel memory to the
4304  * protocol ioctl callbacks, instead of userspace memory.
4305  */
4306 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4307 {
4308 	int rc = 1;
4309 
4310 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4311 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4312 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4313 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4314 	else if (sk_is_phonet(sk))
4315 		rc = phonet_sk_ioctl(sk, cmd, arg);
4316 
4317 	/* If ioctl was processed, returns its value */
4318 	if (rc <= 0)
4319 		return rc;
4320 
4321 	/* Otherwise call the default handler */
4322 	return sock_ioctl_out(sk, cmd, arg);
4323 }
4324 EXPORT_SYMBOL(sk_ioctl);
4325 
4326 static int __init sock_struct_check(void)
4327 {
4328 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4329 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4330 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4331 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4332 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4333 
4334 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4335 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4336 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4337 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4338 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4339 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4340 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4341 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4342 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4343 
4344 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4345 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4346 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4347 
4348 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4349 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4350 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4351 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4352 
4353 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4354 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4355 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4356 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4357 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4358 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4359 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4360 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4361 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4362 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4363 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4364 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4365 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4366 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4367 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4368 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4369 
4370 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4371 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4372 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4373 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4374 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4375 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4376 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4377 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4378 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4379 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4380 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4381 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4382 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4383 	return 0;
4384 }
4385 
4386 core_initcall(sock_struct_check);
4387