xref: /linux/net/core/sock.c (revision 05e352444b2430de4b183b4a988085381e5fd6ad)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138 
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142 
143 #include <trace/events/sock.h>
144 
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148 
149 #include <linux/ethtool.h>
150 
151 #include <uapi/linux/pidfd.h>
152 
153 #include "dev.h"
154 
/* Mutex and list head for protocol bookkeeping. Going by the names, the
 * mutex presumably guards proto_list -- confirm against the users further
 * down in this file.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/* Forward declarations; the definitions are not in this part of the file. */
static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc);
static void sock_def_write_space(struct sock *sk);
160 
161 /**
162  * sk_ns_capable - General socket capability test
163  * @sk: Socket to use a capability on or through
164  * @user_ns: The user namespace of the capability to use
165  * @cap: The capability to use
166  *
167  * Test to see if the opener of the socket had when the socket was
168  * created and the current process has the capability @cap in the user
169  * namespace @user_ns.
170  */
171 bool sk_ns_capable(const struct sock *sk,
172 		   struct user_namespace *user_ns, int cap)
173 {
174 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
175 		ns_capable(user_ns, cap);
176 }
177 EXPORT_SYMBOL(sk_ns_capable);
178 
179 /**
180  * sk_capable - Socket global capability test
181  * @sk: Socket to use a capability on or through
182  * @cap: The global capability to use
183  *
184  * Test to see if the opener of the socket had when the socket was
185  * created and the current process has the capability @cap in all user
186  * namespaces.
187  */
188 bool sk_capable(const struct sock *sk, int cap)
189 {
190 	return sk_ns_capable(sk, &init_user_ns, cap);
191 }
192 EXPORT_SYMBOL(sk_capable);
193 
194 /**
195  * sk_net_capable - Network namespace socket capability test
196  * @sk: Socket to use a capability on or through
197  * @cap: The capability to use
198  *
199  * Test to see if the opener of the socket had when the socket was created
200  * and the current process has the capability @cap over the network namespace
201  * the socket is a member of.
202  */
203 bool sk_net_capable(const struct sock *sk, int cap)
204 {
205 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
206 }
207 EXPORT_SYMBOL(sk_net_capable);
208 
/*
 * Each address family might have different locking rules, so we have
 * one key and one slock key per address family, with separate sets of
 * keys for internal (kernel, "kern") and userspace sockets.
 * Every array is indexed by address family and has AF_MAX entries.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
218 
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

/*
 * _sock_locks(x) expands to a comma-separated list of string literals.
 * Each entry is the prefix literal x concatenated with an address family
 * name, and the last entry is x "AF_MAX". It is used below as the
 * initializer of name tables declared with AF_MAX + 1 elements.
 * NOTE(review): the entries are presumably listed in address-family
 * numeric order so the tables can be indexed by family -- confirm when
 * adding a new family.
 */
#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MCTP"  , \
  x "AF_MAX"
243 
/*
 * Name tables built from _sock_locks(): one string per address family
 * plus a trailing "AF_MAX" entry, each table using its own prefix.
 * The tables with a "k-" prefix are the "kern" variants.
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};
272 
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key. Each array has one
 * entry per address family (AF_MAX entries); only the callback key has a
 * separate "kern" variant here.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
282 
/* Run time adjustable parameters. */
/* Both maxima default to 4 << 20 (4 * 2^20); presumably bytes -- confirm. */
__u32 sysctl_wmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_rmem_max);
/* Defaults come from SK_WMEM_DEFAULT / SK_RMEM_DEFAULT; these two are not exported. */
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT;

/*
 * Static key, initially false. sk_set_memalloc() increments it and
 * sk_clear_memalloc() decrements it.
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
293 
294 /**
295  * sk_set_memalloc - sets %SOCK_MEMALLOC
296  * @sk: socket to set it on
297  *
298  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
299  * It's the responsibility of the admin to adjust min_free_kbytes
300  * to meet the requirements
301  */
302 void sk_set_memalloc(struct sock *sk)
303 {
304 	sock_set_flag(sk, SOCK_MEMALLOC);
305 	sk->sk_allocation |= __GFP_MEMALLOC;
306 	static_branch_inc(&memalloc_socks_key);
307 }
308 EXPORT_SYMBOL_GPL(sk_set_memalloc);
309 
310 void sk_clear_memalloc(struct sock *sk)
311 {
312 	sock_reset_flag(sk, SOCK_MEMALLOC);
313 	sk->sk_allocation &= ~__GFP_MEMALLOC;
314 	static_branch_dec(&memalloc_socks_key);
315 
316 	/*
317 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
318 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
319 	 * it has rmem allocations due to the last swapfile being deactivated
320 	 * but there is a risk that the socket is unusable due to exceeding
321 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
322 	 */
323 	sk_mem_reclaim(sk);
324 }
325 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
326 
327 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
328 {
329 	int ret;
330 	unsigned int noreclaim_flag;
331 
332 	/* these should have been dropped before queueing */
333 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
334 
335 	noreclaim_flag = memalloc_noreclaim_save();
336 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
337 				 tcp_v6_do_rcv,
338 				 tcp_v4_do_rcv,
339 				 sk, skb);
340 	memalloc_noreclaim_restore(noreclaim_flag);
341 
342 	return ret;
343 }
344 EXPORT_SYMBOL(__sk_backlog_rcv);
345 
346 void sk_error_report(struct sock *sk)
347 {
348 	sk->sk_error_report(sk);
349 
350 	switch (sk->sk_family) {
351 	case AF_INET:
352 		fallthrough;
353 	case AF_INET6:
354 		trace_inet_sk_error_report(sk);
355 		break;
356 	default:
357 		break;
358 	}
359 }
360 EXPORT_SYMBOL(sk_error_report);
361 
362 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
363 {
364 	struct __kernel_sock_timeval tv;
365 
366 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
367 		tv.tv_sec = 0;
368 		tv.tv_usec = 0;
369 	} else {
370 		tv.tv_sec = timeo / HZ;
371 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
372 	}
373 
374 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
375 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
376 		*(struct old_timeval32 *)optval = tv32;
377 		return sizeof(tv32);
378 	}
379 
380 	if (old_timeval) {
381 		struct __kernel_old_timeval old_tv;
382 		old_tv.tv_sec = tv.tv_sec;
383 		old_tv.tv_usec = tv.tv_usec;
384 		*(struct __kernel_old_timeval *)optval = old_tv;
385 		return sizeof(old_tv);
386 	}
387 
388 	*(struct __kernel_sock_timeval *)optval = tv;
389 	return sizeof(tv);
390 }
391 EXPORT_SYMBOL(sock_get_timeout);
392 
393 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
394 			   sockptr_t optval, int optlen, bool old_timeval)
395 {
396 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
397 		struct old_timeval32 tv32;
398 
399 		if (optlen < sizeof(tv32))
400 			return -EINVAL;
401 
402 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
403 			return -EFAULT;
404 		tv->tv_sec = tv32.tv_sec;
405 		tv->tv_usec = tv32.tv_usec;
406 	} else if (old_timeval) {
407 		struct __kernel_old_timeval old_tv;
408 
409 		if (optlen < sizeof(old_tv))
410 			return -EINVAL;
411 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
412 			return -EFAULT;
413 		tv->tv_sec = old_tv.tv_sec;
414 		tv->tv_usec = old_tv.tv_usec;
415 	} else {
416 		if (optlen < sizeof(*tv))
417 			return -EINVAL;
418 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
419 			return -EFAULT;
420 	}
421 
422 	return 0;
423 }
424 EXPORT_SYMBOL(sock_copy_user_timeval);
425 
426 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
427 			    bool old_timeval)
428 {
429 	struct __kernel_sock_timeval tv;
430 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
431 	long val;
432 
433 	if (err)
434 		return err;
435 
436 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
437 		return -EDOM;
438 
439 	if (tv.tv_sec < 0) {
440 		static int warned __read_mostly;
441 
442 		WRITE_ONCE(*timeo_p, 0);
443 		if (warned < 10 && net_ratelimit()) {
444 			warned++;
445 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
446 				__func__, current->comm, task_pid_nr(current));
447 		}
448 		return 0;
449 	}
450 	val = MAX_SCHEDULE_TIMEOUT;
451 	if ((tv.tv_sec || tv.tv_usec) &&
452 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
453 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
454 						    USEC_PER_SEC / HZ);
455 	WRITE_ONCE(*timeo_p, val);
456 	return 0;
457 }
458 
459 static bool sk_set_prio_allowed(const struct sock *sk, int val)
460 {
461 	return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
462 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
463 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
464 }
465 
466 static bool sock_needs_netstamp(const struct sock *sk)
467 {
468 	switch (sk->sk_family) {
469 	case AF_UNSPEC:
470 	case AF_UNIX:
471 		return false;
472 	default:
473 		return true;
474 	}
475 }
476 
477 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
478 {
479 	if (sk->sk_flags & flags) {
480 		sk->sk_flags &= ~flags;
481 		if (sock_needs_netstamp(sk) &&
482 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
483 			net_disable_timestamp();
484 	}
485 }
486 
487 
488 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
489 {
490 	unsigned long flags;
491 	struct sk_buff_head *list = &sk->sk_receive_queue;
492 
493 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
494 		sk_drops_inc(sk);
495 		trace_sock_rcvqueue_full(sk, skb);
496 		return -ENOMEM;
497 	}
498 
499 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
500 		sk_drops_inc(sk);
501 		return -ENOBUFS;
502 	}
503 
504 	skb->dev = NULL;
505 	skb_set_owner_r(skb, sk);
506 
507 	/* we escape from rcu protected region, make sure we dont leak
508 	 * a norefcounted dst
509 	 */
510 	skb_dst_force(skb);
511 
512 	spin_lock_irqsave(&list->lock, flags);
513 	sock_skb_set_dropcount(sk, skb);
514 	__skb_queue_tail(list, skb);
515 	spin_unlock_irqrestore(&list->lock, flags);
516 
517 	if (!sock_flag(sk, SOCK_DEAD))
518 		sk->sk_data_ready(sk);
519 	return 0;
520 }
521 EXPORT_SYMBOL(__sock_queue_rcv_skb);
522 
523 enum skb_drop_reason
524 sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb)
525 {
526 	enum skb_drop_reason drop_reason;
527 	int err;
528 
529 	drop_reason = sk_filter_reason(sk, skb);
530 	if (drop_reason)
531 		return drop_reason;
532 
533 	err = __sock_queue_rcv_skb(sk, skb);
534 	switch (err) {
535 	case -ENOMEM:
536 		return SKB_DROP_REASON_SOCKET_RCVBUFF;
537 	case -ENOBUFS:
538 		return SKB_DROP_REASON_PROTO_MEM;
539 	}
540 	return SKB_NOT_DROPPED_YET;
541 }
542 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
543 
/*
 * Filter @skb and hand it to the socket: either process it directly via
 * sk_backlog_rcv() or, when the socket is owned by a user context, append
 * it to the backlog.
 *
 * @nested selects bh_lock_sock_nested() instead of bh_lock_sock().
 * @trim_cap is passed through to sk_filter_trim_cap().
 * @refcounted: when true, sock_put() is called on @sk before returning,
 *		on every path.
 *
 * Returns NET_RX_SUCCESS unless the skb was processed directly, in which
 * case the return value of sk_backlog_rcv() is returned. Dropped skbs are
 * released with sk_skb_reason_drop() and still yield NET_RX_SUCCESS.
 */
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	enum skb_drop_reason reason;
	int rc = NET_RX_SUCCESS;
	int err;

	/* A non-zero result is the drop reason reported for the skb. */
	reason = sk_filter_trim_cap(sk, skb, trim_cap);
	if (reason)
		goto discard_and_relse;

	skb->dev = NULL;

	/* Checked before taking the socket spinlock. */
	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
		sk_drops_inc(sk);
		reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
	} else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) {
		/* Backlog refused the skb: unlock first, then drop it. */
		bh_unlock_sock(sk);
		if (err == -ENOMEM)
			reason = SKB_DROP_REASON_PFMEMALLOC;
		if (err == -ENOBUFS)
			reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		sk_drops_inc(sk);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	/* Reached with the socket spinlock not held. */
	sk_skb_reason_drop(sk, skb, reason);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);
595 
596 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
597 							  u32));
598 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
599 							   u32));
600 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
601 {
602 	struct dst_entry *dst = __sk_dst_get(sk);
603 
604 	if (dst && READ_ONCE(dst->obsolete) &&
605 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
606 			       dst, cookie) == NULL) {
607 		sk_tx_queue_clear(sk);
608 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
609 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
610 		dst_release(dst);
611 		return NULL;
612 	}
613 
614 	return dst;
615 }
616 EXPORT_SYMBOL(__sk_dst_check);
617 
618 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
619 {
620 	struct dst_entry *dst = sk_dst_get(sk);
621 
622 	if (dst && READ_ONCE(dst->obsolete) &&
623 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
624 			       dst, cookie) == NULL) {
625 		sk_dst_reset(sk);
626 		dst_release(dst);
627 		return NULL;
628 	}
629 
630 	return dst;
631 }
632 EXPORT_SYMBOL(sk_dst_check);
633 
634 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
635 {
636 	int ret = -ENOPROTOOPT;
637 #ifdef CONFIG_NETDEVICES
638 	struct net *net = sock_net(sk);
639 
640 	/* Sorry... */
641 	ret = -EPERM;
642 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
643 		goto out;
644 
645 	ret = -EINVAL;
646 	if (ifindex < 0)
647 		goto out;
648 
649 	/* Paired with all READ_ONCE() done locklessly. */
650 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
651 
652 	if (sk->sk_prot->rehash)
653 		sk->sk_prot->rehash(sk);
654 	sk_dst_reset(sk);
655 
656 	ret = 0;
657 
658 out:
659 #endif
660 
661 	return ret;
662 }
663 
664 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
665 {
666 	int ret;
667 
668 	if (lock_sk)
669 		lock_sock(sk);
670 	ret = sock_bindtoindex_locked(sk, ifindex);
671 	if (lock_sk)
672 		release_sock(sk);
673 
674 	return ret;
675 }
676 EXPORT_SYMBOL(sock_bindtoindex);
677 
678 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
679 {
680 	int ret = -ENOPROTOOPT;
681 #ifdef CONFIG_NETDEVICES
682 	struct net *net = sock_net(sk);
683 	char devname[IFNAMSIZ];
684 	int index;
685 
686 	ret = -EINVAL;
687 	if (optlen < 0)
688 		goto out;
689 
690 	/* Bind this socket to a particular device like "eth0",
691 	 * as specified in the passed interface name. If the
692 	 * name is "" or the option length is zero the socket
693 	 * is not bound.
694 	 */
695 	if (optlen > IFNAMSIZ - 1)
696 		optlen = IFNAMSIZ - 1;
697 	memset(devname, 0, sizeof(devname));
698 
699 	ret = -EFAULT;
700 	if (copy_from_sockptr(devname, optval, optlen))
701 		goto out;
702 
703 	index = 0;
704 	if (devname[0] != '\0') {
705 		struct net_device *dev;
706 
707 		rcu_read_lock();
708 		dev = dev_get_by_name_rcu(net, devname);
709 		if (dev)
710 			index = dev->ifindex;
711 		rcu_read_unlock();
712 		ret = -ENODEV;
713 		if (!dev)
714 			goto out;
715 	}
716 
717 	sockopt_lock_sock(sk);
718 	ret = sock_bindtoindex_locked(sk, index);
719 	sockopt_release_sock(sk);
720 out:
721 #endif
722 
723 	return ret;
724 }
725 
726 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
727 				sockptr_t optlen, int len)
728 {
729 	int ret = -ENOPROTOOPT;
730 #ifdef CONFIG_NETDEVICES
731 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
732 	struct net *net = sock_net(sk);
733 	char devname[IFNAMSIZ];
734 
735 	if (bound_dev_if == 0) {
736 		len = 0;
737 		goto zero;
738 	}
739 
740 	ret = -EINVAL;
741 	if (len < IFNAMSIZ)
742 		goto out;
743 
744 	ret = netdev_get_name(net, devname, bound_dev_if);
745 	if (ret)
746 		goto out;
747 
748 	len = strlen(devname) + 1;
749 
750 	ret = -EFAULT;
751 	if (copy_to_sockptr(optval, devname, len))
752 		goto out;
753 
754 zero:
755 	ret = -EFAULT;
756 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
757 		goto out;
758 
759 	ret = 0;
760 
761 out:
762 #endif
763 
764 	return ret;
765 }
766 
767 bool sk_mc_loop(const struct sock *sk)
768 {
769 	if (dev_recursion_level())
770 		return false;
771 	if (!sk)
772 		return true;
773 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
774 	switch (READ_ONCE(sk->sk_family)) {
775 	case AF_INET:
776 		return inet_test_bit(MC_LOOP, sk);
777 #if IS_ENABLED(CONFIG_IPV6)
778 	case AF_INET6:
779 		return inet6_test_bit(MC6_LOOP, sk);
780 #endif
781 	}
782 	WARN_ON_ONCE(1);
783 	return true;
784 }
785 EXPORT_SYMBOL(sk_mc_loop);
786 
/* Set sk_reuse to SK_CAN_REUSE while holding the socket lock. */
void sock_set_reuseaddr(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuse = SK_CAN_REUSE;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);
794 
/* Set sk_reuseport to true while holding the socket lock. */
void sock_set_reuseport(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuseport = true;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);
802 
/*
 * Under the socket lock, set the linger time to 0 and set the
 * SOCK_LINGER flag.
 */
void sock_no_linger(struct sock *sk)
{
	lock_sock(sk);
	WRITE_ONCE(sk->sk_lingertime, 0);
	sock_set_flag(sk, SOCK_LINGER);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);
811 
/*
 * Store @priority in sk_priority with WRITE_ONCE(). This function does
 * not take the socket lock.
 */
void sock_set_priority(struct sock *sk, u32 priority)
{
	WRITE_ONCE(sk->sk_priority, priority);
}
EXPORT_SYMBOL(sock_set_priority);
817 
818 void sock_set_sndtimeo(struct sock *sk, s64 secs)
819 {
820 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
821 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
822 	else
823 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
824 }
825 EXPORT_SYMBOL(sock_set_sndtimeo);
826 
827 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
828 {
829 	sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
830 	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
831 	if (val)  {
832 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
833 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
834 	}
835 }
836 
837 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
838 {
839 	switch (optname) {
840 	case SO_TIMESTAMP_OLD:
841 		__sock_set_timestamps(sk, valbool, false, false);
842 		break;
843 	case SO_TIMESTAMP_NEW:
844 		__sock_set_timestamps(sk, valbool, true, false);
845 		break;
846 	case SO_TIMESTAMPNS_OLD:
847 		__sock_set_timestamps(sk, valbool, false, true);
848 		break;
849 	case SO_TIMESTAMPNS_NEW:
850 		__sock_set_timestamps(sk, valbool, true, true);
851 		break;
852 	}
853 }
854 
855 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
856 {
857 	struct net *net = sock_net(sk);
858 	struct net_device *dev = NULL;
859 	bool match = false;
860 	int *vclock_index;
861 	int i, num;
862 
863 	if (sk->sk_bound_dev_if)
864 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
865 
866 	if (!dev) {
867 		pr_err("%s: sock not bind to device\n", __func__);
868 		return -EOPNOTSUPP;
869 	}
870 
871 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
872 	dev_put(dev);
873 
874 	for (i = 0; i < num; i++) {
875 		if (*(vclock_index + i) == phc_index) {
876 			match = true;
877 			break;
878 		}
879 	}
880 
881 	if (num > 0)
882 		kfree(vclock_index);
883 
884 	if (!match)
885 		return -EINVAL;
886 
887 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
888 
889 	return 0;
890 }
891 
892 int sock_set_timestamping(struct sock *sk, int optname,
893 			  struct so_timestamping timestamping)
894 {
895 	int val = timestamping.flags;
896 	int ret;
897 
898 	if (val & ~SOF_TIMESTAMPING_MASK)
899 		return -EINVAL;
900 
901 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
902 	    !(val & SOF_TIMESTAMPING_OPT_ID))
903 		return -EINVAL;
904 
905 	if (val & SOF_TIMESTAMPING_OPT_ID &&
906 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
907 		if (sk_is_tcp(sk)) {
908 			if ((1 << sk->sk_state) &
909 			    (TCPF_CLOSE | TCPF_LISTEN))
910 				return -EINVAL;
911 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
912 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
913 			else
914 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
915 		} else {
916 			atomic_set(&sk->sk_tskey, 0);
917 		}
918 	}
919 
920 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
921 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
922 		return -EINVAL;
923 
924 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
925 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
926 		if (ret)
927 			return ret;
928 	}
929 
930 	WRITE_ONCE(sk->sk_tsflags, val);
931 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
932 	sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));
933 
934 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
935 		sock_enable_timestamp(sk,
936 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
937 	else
938 		sock_disable_timestamp(sk,
939 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
940 	return 0;
941 }
942 
#if defined(CONFIG_CGROUP_BPF)
/*
 * Run the cgroup sock_ops BPF programs attached for @sk with operation
 * @op, on a bpf_sock_ops_kern initialised from @sk and @skb.
 */
void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
{
	struct bpf_sock_ops_kern skops;

	/* Only the fields ahead of 'temp' are zeroed. */
	memset(&skops, 0, offsetof(struct bpf_sock_ops_kern, temp));
	skops.sk = sk;
	skops.is_fullsock = 1;
	skops.op = op;
	bpf_skops_init_skb(&skops, skb, 0);

	__cgroup_bpf_run_filter_sock_ops(sk, &skops, CGROUP_SOCK_OPS);
}
#endif
956 
/*
 * Enable keepalive on @sk under the socket lock: call the protocol's
 * ->keepalive() hook with true if the protocol provides one, then set
 * SOCK_KEEPOPEN.
 */
void sock_set_keepalive(struct sock *sk)
{
	lock_sock(sk);
	if (sk->sk_prot->keepalive)
		sk->sk_prot->keepalive(sk, true);
	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);
966 
967 static void __sock_set_rcvbuf(struct sock *sk, int val)
968 {
969 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
970 	 * as a negative value.
971 	 */
972 	val = min_t(int, val, INT_MAX / 2);
973 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
974 
975 	/* We double it on the way in to account for "struct sk_buff" etc.
976 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
977 	 * will allow that much actual data to be received on that socket.
978 	 *
979 	 * Applications are unaware that "struct sk_buff" and other overheads
980 	 * allocate from the receive buffer during socket buffer allocation.
981 	 *
982 	 * And after considering the possible alternatives, returning the value
983 	 * we actually used in getsockopt is the most desirable behavior.
984 	 */
985 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
986 }
987 
/* Locked wrapper: run __sock_set_rcvbuf() while holding the socket lock. */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
995 
996 static void __sock_set_mark(struct sock *sk, u32 val)
997 {
998 	if (val != sk->sk_mark) {
999 		WRITE_ONCE(sk->sk_mark, val);
1000 		sk_dst_reset(sk);
1001 	}
1002 }
1003 
/* Locked wrapper: run __sock_set_mark() while holding the socket lock. */
void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);
	__sock_set_mark(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);
1011 
1012 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1013 {
1014 	/* Round down bytes to multiple of pages */
1015 	bytes = round_down(bytes, PAGE_SIZE);
1016 
1017 	WARN_ON(bytes > sk->sk_reserved_mem);
1018 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1019 	sk_mem_reclaim(sk);
1020 }
1021 
1022 static int sock_reserve_memory(struct sock *sk, int bytes)
1023 {
1024 	long allocated;
1025 	bool charged;
1026 	int pages;
1027 
1028 	if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk))
1029 		return -EOPNOTSUPP;
1030 
1031 	if (!bytes)
1032 		return 0;
1033 
1034 	pages = sk_mem_pages(bytes);
1035 
1036 	/* pre-charge to memcg */
1037 	charged = mem_cgroup_sk_charge(sk, pages,
1038 				       GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1039 	if (!charged)
1040 		return -ENOMEM;
1041 
1042 	if (sk->sk_bypass_prot_mem)
1043 		goto success;
1044 
1045 	/* pre-charge to forward_alloc */
1046 	sk_memory_allocated_add(sk, pages);
1047 	allocated = sk_memory_allocated(sk);
1048 
1049 	/* If the system goes into memory pressure with this
1050 	 * precharge, give up and return error.
1051 	 */
1052 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1053 		sk_memory_allocated_sub(sk, pages);
1054 		mem_cgroup_sk_uncharge(sk, pages);
1055 		return -ENOMEM;
1056 	}
1057 
1058 success:
1059 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1060 
1061 	WRITE_ONCE(sk->sk_reserved_mem,
1062 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1063 
1064 	return 0;
1065 }
1066 
1067 #ifdef CONFIG_PAGE_POOL
1068 
1069 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
1070  * in 1 syscall. The limit exists to limit the amount of memory the kernel
1071  * allocates to copy these tokens, and to prevent looping over the frags for
1072  * too long.
1073  */
1074 #define MAX_DONTNEED_TOKENS 128
1075 #define MAX_DONTNEED_FRAGS 1024
1076 
/*
 * Handle SO_DEVMEM_DONTNEED: release frags that user space is done with.
 *
 * @optval points to an array of struct dmabuf_token and @optlen is its
 * size in bytes. Each token names a range of token_count consecutive
 * indices starting at token_start in sk->sk_user_frags.
 *
 * Returns the number of frags that were found and erased from
 * sk->sk_user_frags, or a negative errno:
 *   -EBADF   the socket is not TCP
 *   -EINVAL  @optlen is not a multiple of the token size or describes more
 *            than MAX_DONTNEED_TOKENS tokens
 *   -ENOMEM  the temporary token array could not be allocated
 *   -EFAULT  the tokens could not be copied from @optval
 */
static noinline_for_stack int
sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
{
	unsigned int num_tokens, i, j, k, netmem_num = 0;
	struct dmabuf_token *tokens;
	int ret = 0, num_frags = 0;
	/* Erased entries are collected here and put in batches. */
	netmem_ref netmems[16];

	if (!sk_is_tcp(sk))
		return -EBADF;

	if (optlen % sizeof(*tokens) ||
	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
		return -EINVAL;

	num_tokens = optlen / sizeof(*tokens);
	tokens = kvmalloc_objs(*tokens, num_tokens);
	if (!tokens)
		return -ENOMEM;

	if (copy_from_sockptr(tokens, optval, optlen)) {
		kvfree(tokens);
		return -EFAULT;
	}

	xa_lock_bh(&sk->sk_user_frags);
	for (i = 0; i < num_tokens; i++) {
		for (j = 0; j < tokens[i].token_count; j++) {
			/* The limit counts every index visited, whether or
			 * not an entry is found there.
			 */
			if (++num_frags > MAX_DONTNEED_FRAGS)
				goto frag_limit_reached;

			netmem_ref netmem = (__force netmem_ref)__xa_erase(
				&sk->sk_user_frags, tokens[i].token_start + j);

			/* Skip indices with no entry (and warn once on an
			 * entry that is not a net_iov).
			 */
			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
				continue;

			netmems[netmem_num++] = netmem;
			if (netmem_num == ARRAY_SIZE(netmems)) {
				/* Batch is full: drop the xarray lock while
				 * the collected entries are put, then retake
				 * it to continue erasing.
				 */
				xa_unlock_bh(&sk->sk_user_frags);
				for (k = 0; k < netmem_num; k++)
					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
				netmem_num = 0;
				xa_lock_bh(&sk->sk_user_frags);
			}
			ret++;
		}
	}

frag_limit_reached:
	xa_unlock_bh(&sk->sk_user_frags);
	/* Put whatever is left in the final, partially filled batch. */
	for (k = 0; k < netmem_num; k++)
		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));

	kvfree(tokens);
	return ret;
}
1134 #endif
1135 
/*
 * Take the socket lock on behalf of a sockopt handler, unless the handler
 * is running from a bpf program that already holds it.
 *
 * Pairs with sockopt_release_sock().
 */
void sockopt_lock_sock(struct sock *sk)
{
	/* When current->bpf_ctx is set, the setsockopt is called from
	 * a bpf prog.  bpf has ensured the sk lock has been
	 * acquired before calling setsockopt().
	 */
	if (has_current_bpf_ctx())
		return;

	lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1148 
/*
 * Counterpart of sockopt_lock_sock(): release the socket lock unless
 * has_current_bpf_ctx() reports a bpf context, in which case
 * sockopt_lock_sock() did not take the lock either.
 */
void sockopt_release_sock(struct sock *sk)
{
	if (has_current_bpf_ctx())
		return;

	release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);
1157 
/*
 * Capability check for sockopt handlers, relative to user namespace @ns.
 *
 * Returns true when has_current_bpf_ctx() is true (ns_capable() is then not
 * consulted), otherwise the result of ns_capable(@ns, @cap).
 */
bool sockopt_ns_capable(struct user_namespace *ns, int cap)
{
	return has_current_bpf_ctx() || ns_capable(ns, cap);
}
EXPORT_SYMBOL(sockopt_ns_capable);
1163 
/*
 * Capability check for sockopt handlers.
 *
 * Returns true when has_current_bpf_ctx() is true (capable() is then not
 * consulted), otherwise the result of capable(@cap).
 */
bool sockopt_capable(int cap)
{
	return has_current_bpf_ctx() || capable(cap);
}
EXPORT_SYMBOL(sockopt_capable);
1169 
1170 static int sockopt_validate_clockid(__kernel_clockid_t value)
1171 {
1172 	switch (value) {
1173 	case CLOCK_REALTIME:
1174 	case CLOCK_MONOTONIC:
1175 	case CLOCK_TAI:
1176 		return 0;
1177 	}
1178 	return -EINVAL;
1179 }
1180 
/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 *
 *	@sk:      socket to configure
 *	@level:   not examined by this function
 *	@optname: SO_* option to set
 *	@optval:  option value (user or kernel pointer, see sockptr_t)
 *	@optlen:  size of the value in bytes
 *
 *	Except for SO_BINDTODEVICE, every option must supply at least
 *	sizeof(int) bytes; the leading int is read into 'val' and its
 *	truth value into 'valbool'. Options in the first switch return
 *	directly without taking the socket lock; the remaining ones run
 *	between sockopt_lock_sock() and sockopt_release_sock().
 *
 *	Returns 0 on success or a negative errno. SO_DEVMEM_DONTNEED
 *	returns whatever sock_devmem_dontneed() returns.
 */

int sk_setsockopt(struct sock *sk, int level, int optname,
		  sockptr_t optval, unsigned int optlen)
{
	struct so_timestamping timestamping;
	struct socket *sock = sk->sk_socket;
	struct sock_txtime sk_txtime;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	SO_BINDTODEVICE is dispatched before the generic
	 *	int-sized argument is checked and copied below.
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (copy_from_sockptr(&val, optval, sizeof(val)))
		return -EFAULT;

	valbool = val ? 1 : 0;

	/* handle options which do not require locking the socket. */
	switch (optname) {
	case SO_PRIORITY:
		if (sk_set_prio_allowed(sk, val)) {
			sock_set_priority(sk, val);
			return 0;
		}
		return -EPERM;
	/* These options can only be read, not set. */
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		return -ENOPROTOOPT;
#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		if (val < 0)
			return -EINVAL;
		WRITE_ONCE(sk->sk_ll_usec, val);
		return 0;
	case SO_PREFER_BUSY_POLL:
		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
			return -EPERM;
		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
		return 0;
	case SO_BUSY_POLL_BUDGET:
		/* Raising the budget above its current value needs
		 * CAP_NET_ADMIN; lowering it does not.
		 */
		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
		    !sockopt_capable(CAP_NET_ADMIN))
			return -EPERM;
		if (val < 0 || val > U16_MAX)
			return -EINVAL;
		WRITE_ONCE(sk->sk_busy_poll_budget, val);
		return 0;
#endif
	case SO_MAX_PACING_RATE:
		{
		/* A 32-bit all-ones value is widened to ~0UL. */
		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
		unsigned long pacing_rate;

		/* Where unsigned long is wider than int and the caller
		 * supplied enough bytes, re-read the value at full width.
		 */
		if (sizeof(ulval) != sizeof(val) &&
		    optlen >= sizeof(ulval) &&
		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
			return -EFAULT;
		}
		if (ulval != ~0UL)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		/* Pairs with READ_ONCE() from sk_getsockopt() */
		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
		/* Lower the current pacing rate if it exceeds the new cap. */
		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
		if (ulval < pacing_rate)
			WRITE_ONCE(sk->sk_pacing_rate, ulval);
		return 0;
		}
	case SO_TXREHASH:
		if (!sk_is_tcp(sk))
			return -EOPNOTSUPP;
		if (val < -1 || val > 1)
			return -EINVAL;
		/* The "default" value is replaced by the netns sysctl. */
		if ((u8)val == SOCK_TXREHASH_DEFAULT)
			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
		/* Paired with READ_ONCE() in tcp_rtx_synack()
		 * and sk_getsockopt().
		 */
		WRITE_ONCE(sk->sk_txrehash, (u8)val);
		return 0;
	case SO_PEEK_OFF:
		{
		int (*set_peek_off)(struct sock *sk, int val);

		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
		if (set_peek_off)
			ret = set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		return ret;
		}
#ifdef CONFIG_PAGE_POOL
	case SO_DEVMEM_DONTNEED:
		return sock_devmem_dontneed(sk, optval, optlen);
#endif
	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		return sock_set_timeout(&sk->sk_sndtimeo, optval,
					optlen, optname == SO_SNDTIMEO_OLD);
	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		return sock_set_timeout(&sk->sk_rcvtimeo, optval,
					optlen, optname == SO_RCVTIMEO_OLD);
	}

	/* Everything below runs with the socket lock held (or with the
	 * lock already held by the calling bpf program).
	 */
	sockopt_lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !sockopt_capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		if (valbool && !sk_is_inet(sk))
			ret = -EOPNOTSUPP;
		else
			sk->sk_reuseport = valbool;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		sk_dst_reset(sk);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
		/* Ensure val * 2 fits into an int, to prevent max_t()
		 * from treating it as a negative value.
		 */
		val = min_t(int, val, INT_MAX / 2);
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		WRITE_ONCE(sk->sk_sndbuf,
			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		/* Same as SO_SNDBUF but privileged and without the
		 * sysctl_wmem_max clamp (it jumps past it to set_sndbuf).
		 */
		if (!sockopt_capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		if (val < 0)
			val = 0;
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
		break;

	case SO_RCVBUFFORCE:
		if (!sockopt_capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		__sock_set_rcvbuf(sk, max(val, 0));
		break;

	case SO_KEEPALIVE:
		/* Let the protocol react first, if it has a hook. */
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff) {
			sock_reset_flag(sk, SOCK_LINGER);
		} else {
			unsigned long t_sec = ling.l_linger;

			/* Saturate instead of overflowing when the
			 * requested seconds are converted to jiffies.
			 */
			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
			else
				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		/* Accepted, but has no effect. */
		break;

	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		sock_set_timestamp(sk, optname, valbool);
		break;

	case SO_TIMESTAMPING_NEW:
	case SO_TIMESTAMPING_OLD:
		/* Either a full struct so_timestamping is supplied, or
		 * only the flags as a plain int (already in val).
		 */
		if (optlen == sizeof(timestamping)) {
			if (copy_from_sockptr(&timestamping, optval,
					      sizeof(timestamping))) {
				ret = -EFAULT;
				break;
			}
		} else {
			memset(&timestamping, 0, sizeof(timestamping));
			timestamping.flags = val;
		}
		ret = sock_set_timestamping(sk, optname, timestamping);
		break;

	case SO_RCVLOWAT:
		{
		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;

		/* Negative requests become INT_MAX. */
		if (val < 0)
			val = INT_MAX;
		if (sock)
			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
		if (set_rcvlowat)
			ret = set_rcvlowat(sk, val);
		else
			/* Without a protocol hook, store val; 0 becomes 1. */
			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
		break;
		}
	case SO_ATTACH_FILTER: {
		struct sock_fprog fprog;

		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
		if (!ret)
			ret = sk_attach_filter(&fprog, sk);
		break;
	}
	case SO_ATTACH_BPF:
		/* The argument must be exactly one u32 (passed on as ufd). */
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF: {
		struct sock_fprog fprog;

		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
		if (!ret)
			ret = sk_reuseport_attach_filter(&fprog, sk);
		break;
	}
	case SO_ATTACH_REUSEPORT_EBPF:
		/* The argument must be exactly one u32 (passed on as ufd). */
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_REUSEPORT_BPF:
		ret = reuseport_detach_prog(sk);
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		/* Once set, the filter lock cannot be cleared again. */
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_MARK:
		/* Either CAP_NET_RAW or CAP_NET_ADMIN in the socket's
		 * netns user namespace is sufficient.
		 */
		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		__sock_set_mark(sk, val);
		break;
	case SO_RCVMARK:
		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
		break;

	case SO_RCVPRIORITY:
		sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

	case SO_PASSCRED:
		if (sk_may_scm_recv(sk))
			sk->sk_scm_credentials = valbool;
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_PASSSEC:
		if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
			sk->sk_scm_security = valbool;
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_PASSPIDFD:
		if (sk_is_unix(sk))
			sk->sk_scm_pidfd = valbool;
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_PASSRIGHTS:
		if (sk_is_unix(sk))
			sk->sk_scm_rights = valbool;
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_INCOMING_CPU:
		reuseport_update_incoming_cpu(sk, val);
		break;

	case SO_CNX_ADVICE:
		/* Only the value 1 has an effect. */
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		/* Allowed for TCP and UDP sockets of PF_INET/PF_INET6,
		 * and for PF_RDS; everything else is rejected.
		 */
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (!(sk_is_tcp(sk) ||
			      (sk->sk_type == SOCK_DGRAM &&
			       sk->sk_protocol == IPPROTO_UDP)))
				ret = -EOPNOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -EOPNOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		}
		break;

	case SO_TXTIME:
		if (optlen != sizeof(struct sock_txtime)) {
			ret = -EINVAL;
			break;
		} else if (copy_from_sockptr(&sk_txtime, optval,
			   sizeof(struct sock_txtime))) {
			ret = -EFAULT;
			break;
		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
			ret = -EINVAL;
			break;
		}
		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
		 * scheduler has enough safe guards.
		 */
		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		ret = sockopt_validate_clockid(sk_txtime.clockid);
		if (ret)
			break;

		sock_valbool_flag(sk, SOCK_TXTIME, true);
		sk->sk_clockid = sk_txtime.clockid;
		sk->sk_txtime_deadline_mode =
			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
		sk->sk_txtime_report_errors =
			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
		break;

	case SO_BINDTOIFINDEX:
		ret = sock_bindtoindex_locked(sk, val);
		break;

	case SO_BUF_LOCK:
		if (val & ~SOCK_BUF_LOCK_MASK) {
			ret = -EINVAL;
			break;
		}
		/* Replace only the bits covered by SOCK_BUF_LOCK_MASK. */
		sk->sk_userlocks = val | (sk->sk_userlocks &
					  ~SOCK_BUF_LOCK_MASK);
		break;

	case SO_RESERVE_MEM:
	{
		int delta;

		if (val < 0) {
			ret = -EINVAL;
			break;
		}

		/* val is the new target; grow or shrink the current
		 * reservation by the difference.
		 */
		delta = val - sk->sk_reserved_mem;
		if (delta < 0)
			sock_release_reserved_memory(sk, -delta);
		else
			ret = sock_reserve_memory(sk, delta);
		break;
	}

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	sockopt_release_sock(sk);
	return ret;
}
1670 
/*
 * struct socket flavoured entry point: forwards to sk_setsockopt() with
 * sock->sk and the remaining arguments unchanged, and returns its result.
 */
int sock_setsockopt(struct socket *sock, int level, int optname,
		    sockptr_t optval, unsigned int optlen)
{
	return sk_setsockopt(sock->sk, level, optname,
			     optval, optlen);
}
EXPORT_SYMBOL(sock_setsockopt);
1678 
/*
 * Fetch sk->sk_peer_cred through get_cred() while holding sk_peer_lock.
 *
 * The SO_PEERGROUPS handler in sk_getsockopt() treats a NULL result as
 * "no peer credentials" and calls put_cred() on a non-NULL result once it
 * is done with it.
 */
static const struct cred *sk_get_peer_cred(struct sock *sk)
{
	const struct cred *cred;

	spin_lock(&sk->sk_peer_lock);
	cred = get_cred(sk->sk_peer_cred);
	spin_unlock(&sk->sk_peer_lock);

	return cred;
}
1689 
1690 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1691 			  struct ucred *ucred)
1692 {
1693 	ucred->pid = pid_vnr(pid);
1694 	ucred->uid = ucred->gid = -1;
1695 	if (cred) {
1696 		struct user_namespace *current_ns = current_user_ns();
1697 
1698 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1699 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1700 	}
1701 }
1702 
1703 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1704 {
1705 	struct user_namespace *user_ns = current_user_ns();
1706 	int i;
1707 
1708 	for (i = 0; i < src->ngroups; i++) {
1709 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1710 
1711 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1712 			return -EFAULT;
1713 	}
1714 
1715 	return 0;
1716 }
1717 
/*
 * Generic SOL_SOCKET-style option reader.
 *
 * @sk:      socket to query
 * @level:   not examined by this function
 * @optname: SO_* option to read
 * @optval:  destination buffer for the value
 * @optlen:  in: size of @optval as an int; out: number of bytes written
 *
 * The caller's length is read first and rejected with -EINVAL if negative.
 * Most options fill the zero-initialised union 'v' and set 'lv' to the
 * size of the result (sizeof(int) by default); after the switch at most
 * min(len, lv) bytes of 'v' are copied to @optval and that length is
 * written back through @optlen. A few options copy their result themselves
 * and either jump to 'lenout' or return directly.
 *
 * Returns 0 on success or a negative errno.
 */
int sk_getsockopt(struct sock *sk, int level, int optname,
		  sockptr_t optval, sockptr_t optlen)
{
	struct socket *sock = sk->sk_socket;

	/* Scratch storage for every result copied out by the common tail. */
	union {
		int val;
		u64 val64;
		unsigned long ulval;
		struct linger ling;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct  __kernel_sock_timeval stm;
		struct sock_txtime txtime;
		struct so_timestamping timestamping;
	} v;

	int lv = sizeof(int);
	int len;

	if (copy_from_sockptr(&len, optlen, sizeof(int)))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = READ_ONCE(sk->sk_sndbuf);
		break;

	case SO_RCVBUF:
		v.val = READ_ONCE(sk->sk_rcvbuf);
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		/* Report the pending error as a positive value; if there is
		 * none, fetch and clear sk_err_soft instead.
		 */
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = READ_ONCE(sk->sk_priority);
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
		break;

	case SO_BSDCOMPAT:
		/* Always reads back as 0 (v was zeroed above). */
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPING_OLD:
	case SO_TIMESTAMPING_NEW:
		lv = sizeof(v.timestamping);
		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
		 * returning the flags when they were set through the same option.
		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
		 */
		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
		}
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
				      SO_RCVTIMEO_OLD == optname);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
				      SO_SNDTIMEO_OLD == optname);
		break;

	case SO_RCVLOWAT:
		v.val = READ_ONCE(sk->sk_rcvlowat);
		break;

	case SO_SNDLOWAT:
		/* Fixed value; there is no matching case in sk_setsockopt(). */
		v.val = 1;
		break;

	case SO_PASSCRED:
		if (!sk_may_scm_recv(sk))
			return -EOPNOTSUPP;

		v.val = sk->sk_scm_credentials;
		break;

	case SO_PASSPIDFD:
		if (!sk_is_unix(sk))
			return -EOPNOTSUPP;

		v.val = sk->sk_scm_pidfd;
		break;

	case SO_PASSRIGHTS:
		if (!sk_is_unix(sk))
			return -EOPNOTSUPP;

		v.val = sk->sk_scm_rights;
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		/* Truncate to what the caller asked for, at most a ucred. */
		if (len > sizeof(peercred))
			len = sizeof(peercred);

		spin_lock(&sk->sk_peer_lock);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		spin_unlock(&sk->sk_peer_lock);

		if (copy_to_sockptr(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERPIDFD:
	{
		struct pid *peer_pid;
		struct file *pidfd_file = NULL;
		unsigned int flags = 0;
		int pidfd;

		if (len > sizeof(pidfd))
			len = sizeof(pidfd);

		spin_lock(&sk->sk_peer_lock);
		peer_pid = get_pid(sk->sk_peer_pid);
		spin_unlock(&sk->sk_peer_lock);

		if (!peer_pid)
			return -ENODATA;

		/* The use of PIDFD_STALE requires stashing of struct pid
		 * on pidfs with pidfs_register_pid() and only AF_UNIX
		 * were prepared for this.
		 */
		if (sk->sk_family == AF_UNIX)
			flags = PIDFD_STALE;

		pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
		put_pid(peer_pid);
		if (pidfd < 0)
			return pidfd;

		/* The fd is installed only after both copies succeeded;
		 * on failure the reserved fd number and the file are
		 * released again.
		 */
		if (copy_to_sockptr(optval, &pidfd, len) ||
		    copy_to_sockptr(optlen, &len, sizeof(int))) {
			put_unused_fd(pidfd);
			fput(pidfd_file);

			return -EFAULT;
		}

		fd_install(pidfd, pidfd_file);
		return 0;
	}

	case SO_PEERGROUPS:
	{
		const struct cred *cred;
		int ret, n;

		cred = sk_get_peer_cred(sk);
		if (!cred)
			return -ENODATA;

		n = cred->group_info->ngroups;
		/* Buffer too small: report the required size through
		 * optlen and fail with -ERANGE.
		 */
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			put_cred(cred);
			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user(optval, cred->group_info);
		put_cred(cred);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		struct sockaddr_storage address;

		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
		if (lv < 0)
			return -ENOTCONN;
		/* The caller may not ask for more than getname() produced. */
		if (lv < len)
			return -EINVAL;
		if (copy_to_sockptr(optval, &address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
			return -EOPNOTSUPP;

		v.val = sk->sk_scm_security;
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock,
							 optval, optlen, len);

	case SO_MARK:
		v.val = READ_ONCE(sk->sk_mark);
		break;

	case SO_RCVMARK:
		v.val = sock_flag(sk, SOCK_RCVMARK);
		break;

	case SO_RCVPRIORITY:
		v.val = sock_flag(sk, SOCK_RCVPRIORITY);
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!READ_ONCE(sock->ops)->set_peek_off)
			return -EOPNOTSUPP;

		v.val = READ_ONCE(sk->sk_peek_off);
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		/* sk_get_filter() writes to optval itself and returns the
		 * length to report, or a negative errno.
		 */
		len = sk_get_filter(sk, optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_ll_usec);
		break;
	case SO_PREFER_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
		break;
#endif

	case SO_MAX_PACING_RATE:
		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
			lv = sizeof(v.ulval);
			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
		} else {
			/* 32bit version */
			v.val = min_t(unsigned long, ~0U,
				      READ_ONCE(sk->sk_max_pacing_rate));
		}
		break;

	case SO_INCOMING_CPU:
		v.val = READ_ONCE(sk->sk_incoming_cpu);
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_sockptr(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (!napi_id_valid(v.val))
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		/* Needs room for a full u64. */
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	case SO_BINDTOIFINDEX:
		v.val = READ_ONCE(sk->sk_bound_dev_if);
		break;

	case SO_NETNS_COOKIE:
		/* Unlike SO_COOKIE, the length must be exactly a u64. */
		lv = sizeof(u64);
		if (len != lv)
			return -EINVAL;
		v.val64 = sock_net(sk)->net_cookie;
		break;

	case SO_BUF_LOCK:
		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
		break;

	case SO_RESERVE_MEM:
		v.val = READ_ONCE(sk->sk_reserved_mem);
		break;

	case SO_TXREHASH:
		if (!sk_is_tcp(sk))
			return -EOPNOTSUPP;

		/* Paired with WRITE_ONCE() in sk_setsockopt() */
		v.val = READ_ONCE(sk->sk_txrehash);
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	/* Common tail: copy out at most lv bytes of the result. */
	if (len > lv)
		len = lv;
	if (copy_to_sockptr(optval, &v, len))
		return -EFAULT;
lenout:
	/* Tell the caller how many bytes were produced. */
	if (copy_to_sockptr(optlen, &len, sizeof(int)))
		return -EFAULT;
	return 0;
}
2165 
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 *
 * The lock class keys and names are looked up by sk->sk_family; sockets
 * with sk->sk_kern_sock set use the separate af_family_kern_* tables.
 */
static inline void sock_lock_init(struct sock *sk)
{
	sk_owner_clear(sk);

	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}
2190 
/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 *
 * The copy is therefore done in two pieces: everything before
 * sk_dontcopy_begin, then everything from sk_dontcopy_end up to the
 * protocol's obj_size. With CONFIG_SECURITY_NETWORK, nsk's own sk_security
 * pointer is saved before the copy and restored afterwards, and
 * security_sk_clone() is then called for the pair.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
	const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
	/* Remember nsk's pointer; the second copy below overwrites it. */
	void *sptr = nsk->sk_security;
#endif

	/* If we move sk_tx_queue_mapping out of the private section,
	 * we must check if sk_tx_queue_clear() is called after
	 * sock_copy() in sk_clone_lock().
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
		     offsetof(struct sock, sk_dontcopy_begin) ||
		     offsetof(struct sock, sk_tx_queue_mapping) >=
		     offsetof(struct sock, sk_dontcopy_end));

	/* First piece: start of the struct up to sk_dontcopy_begin. */
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	/* Second piece: sk_dontcopy_end up to the end of the object. */
	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
		      /* alloc is larger than struct, see sk_prot_alloc() */);

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}
2223 
2224 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2225 		int family)
2226 {
2227 	struct sock *sk;
2228 	struct kmem_cache *slab;
2229 
2230 	slab = prot->slab;
2231 	if (slab != NULL) {
2232 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2233 		if (!sk)
2234 			return sk;
2235 		if (want_init_on_alloc(priority))
2236 			sk_prot_clear_nulls(sk, prot->obj_size);
2237 	} else
2238 		sk = kmalloc(prot->obj_size, priority);
2239 
2240 	if (sk != NULL) {
2241 		if (security_sk_alloc(sk, family, priority))
2242 			goto out_free;
2243 
2244 		if (!try_module_get(prot->owner))
2245 			goto out_free_sec;
2246 	}
2247 
2248 	return sk;
2249 
2250 out_free_sec:
2251 	security_sk_free(sk);
2252 out_free:
2253 	if (slab != NULL)
2254 		kmem_cache_free(slab, sk);
2255 	else
2256 		kfree(sk);
2257 	return NULL;
2258 }
2259 
2260 static void sk_prot_free(struct proto *prot, struct sock *sk)
2261 {
2262 	struct kmem_cache *slab;
2263 	struct module *owner;
2264 
2265 	owner = prot->owner;
2266 	slab = prot->slab;
2267 
2268 	cgroup_sk_free(&sk->sk_cgrp_data);
2269 	mem_cgroup_sk_free(sk);
2270 	security_sk_free(sk);
2271 
2272 	sk_owner_put(sk);
2273 
2274 	if (slab != NULL)
2275 		kmem_cache_free(slab, sk);
2276 	else
2277 		kfree(sk);
2278 	module_put(owner);
2279 }
2280 
2281 /**
2282  *	sk_alloc - All socket objects are allocated here
2283  *	@net: the applicable net namespace
2284  *	@family: protocol family
2285  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2286  *	@prot: struct proto associated with this new sock instance
2287  *	@kern: is this to be a kernel socket?
2288  */
2289 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2290 		      struct proto *prot, int kern)
2291 {
2292 	struct sock *sk;
2293 
2294 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2295 	if (sk) {
2296 		sk->sk_family = family;
2297 		/*
2298 		 * See comment in struct sock definition to understand
2299 		 * why we need sk_prot_creator -acme
2300 		 */
2301 		sk->sk_prot = sk->sk_prot_creator = prot;
2302 
2303 		if (READ_ONCE(net->core.sysctl_bypass_prot_mem))
2304 			sk->sk_bypass_prot_mem = 1;
2305 
2306 		sk->sk_kern_sock = kern;
2307 		sock_lock_init(sk);
2308 
2309 		sk->sk_net_refcnt = kern ? 0 : 1;
2310 		if (likely(sk->sk_net_refcnt)) {
2311 			get_net_track(net, &sk->ns_tracker, priority);
2312 			sock_inuse_add(net, 1);
2313 		} else {
2314 			net_passive_inc(net);
2315 			__netns_tracker_alloc(net, &sk->ns_tracker,
2316 					      false, priority);
2317 		}
2318 
2319 		sock_net_set(sk, net);
2320 		refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
2321 
2322 		mem_cgroup_sk_alloc(sk);
2323 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2324 		sock_update_classid(&sk->sk_cgrp_data);
2325 		sock_update_netprioidx(&sk->sk_cgrp_data);
2326 		sk_tx_queue_clear(sk);
2327 	}
2328 
2329 	return sk;
2330 }
2331 EXPORT_SYMBOL(sk_alloc);
2332 
2333 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2334  * grace period. This is the case for UDP sockets and TCP listeners.
2335  */
2336 static void __sk_destruct(struct rcu_head *head)
2337 {
2338 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2339 	struct net *net = sock_net(sk);
2340 	struct sk_filter *filter;
2341 
2342 	if (sk->sk_destruct)
2343 		sk->sk_destruct(sk);
2344 
2345 	filter = rcu_dereference_check(sk->sk_filter,
2346 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2347 	if (filter) {
2348 		sk_filter_uncharge(sk, filter);
2349 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2350 	}
2351 
2352 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2353 
2354 #ifdef CONFIG_BPF_SYSCALL
2355 	bpf_sk_storage_free(sk);
2356 #endif
2357 
2358 	if (atomic_read(&sk->sk_omem_alloc))
2359 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2360 			 __func__, atomic_read(&sk->sk_omem_alloc));
2361 
2362 	if (sk->sk_frag.page) {
2363 		put_page(sk->sk_frag.page);
2364 		sk->sk_frag.page = NULL;
2365 	}
2366 
2367 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2368 	put_cred(sk->sk_peer_cred);
2369 	put_pid(sk->sk_peer_pid);
2370 
2371 	if (likely(sk->sk_net_refcnt)) {
2372 		put_net_track(net, &sk->ns_tracker);
2373 	} else {
2374 		__netns_tracker_free(net, &sk->ns_tracker, false);
2375 		net_passive_dec(net);
2376 	}
2377 	sk_prot_free(sk->sk_prot_creator, sk);
2378 }
2379 
2380 void sk_net_refcnt_upgrade(struct sock *sk)
2381 {
2382 	struct net *net = sock_net(sk);
2383 
2384 	WARN_ON_ONCE(sk->sk_net_refcnt);
2385 	__netns_tracker_free(net, &sk->ns_tracker, false);
2386 	net_passive_dec(net);
2387 	sk->sk_net_refcnt = 1;
2388 	get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
2389 	sock_inuse_add(net, 1);
2390 }
2391 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
2392 
2393 void sk_destruct(struct sock *sk)
2394 {
2395 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2396 
2397 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2398 		reuseport_detach_sock(sk);
2399 		use_call_rcu = true;
2400 	}
2401 
2402 	if (use_call_rcu)
2403 		call_rcu(&sk->sk_rcu, __sk_destruct);
2404 	else
2405 		__sk_destruct(&sk->sk_rcu);
2406 }
2407 
2408 static void __sk_free(struct sock *sk)
2409 {
2410 	if (likely(sk->sk_net_refcnt))
2411 		sock_inuse_add(sock_net(sk), -1);
2412 
2413 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2414 		sock_diag_broadcast_destroy(sk);
2415 	else
2416 		sk_destruct(sk);
2417 }
2418 
2419 void sk_free(struct sock *sk)
2420 {
2421 	/*
2422 	 * We subtract one from sk_wmem_alloc and can know if
2423 	 * some packets are still in some tx queue.
2424 	 * If not null, sock_wfree() will call __sk_free(sk) later
2425 	 */
2426 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2427 		__sk_free(sk);
2428 }
2429 EXPORT_SYMBOL(sk_free);
2430 
2431 static void sk_init_common(struct sock *sk)
2432 {
2433 	skb_queue_head_init(&sk->sk_receive_queue);
2434 	skb_queue_head_init(&sk->sk_write_queue);
2435 	skb_queue_head_init(&sk->sk_error_queue);
2436 
2437 	rwlock_init(&sk->sk_callback_lock);
2438 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2439 			af_rlock_keys + sk->sk_family,
2440 			af_family_rlock_key_strings[sk->sk_family]);
2441 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2442 			af_wlock_keys + sk->sk_family,
2443 			af_family_wlock_key_strings[sk->sk_family]);
2444 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2445 			af_elock_keys + sk->sk_family,
2446 			af_family_elock_key_strings[sk->sk_family]);
2447 	if (sk->sk_kern_sock)
2448 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2449 			af_kern_callback_keys + sk->sk_family,
2450 			af_family_kern_clock_key_strings[sk->sk_family]);
2451 	else
2452 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2453 			af_callback_keys + sk->sk_family,
2454 			af_family_clock_key_strings[sk->sk_family]);
2455 }
2456 
/**
 * sk_clone - clone a socket
 * @sk: the socket to clone
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @lock: if true, lock the cloned sk
 *
 * If @lock is true, the clone is locked by bh_lock_sock(), and
 * caller must unlock socket even in error path by bh_unlock_sock().
 *
 * Return: the new socket with sk_refcnt set to 2, or NULL if the
 * allocation or one of the cloning steps failed. On an internal failure
 * the clone is unlocked (when @lock is true) and freed before returning.
 */
struct sock *sk_clone(const struct sock *sk, const gfp_t priority,
		      bool lock)
{
	struct proto *prot = READ_ONCE(sk->sk_prot);
	struct sk_filter *filter;
	bool is_charged = true;
	struct sock *newsk;

	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	if (!newsk)
		goto out;

	sock_copy(newsk, sk);

	newsk->sk_prot_creator = prot;

	/* SANITY */
	if (likely(newsk->sk_net_refcnt)) {
		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
		sock_inuse_add(sock_net(newsk), 1);
	} else {
		/* Kernel sockets are not elevating the struct net refcount.
		 * Instead, use a tracker to more easily detect if a layer
		 * is not properly dismantling its kernel sockets at netns
		 * destroy time.
		 */
		net_passive_inc(sock_net(newsk));
		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
				      false, priority);
	}

	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);

	if (lock)
		bh_lock_sock(newsk);

	/* The clone starts with an empty backlog and no memory accounted. */
	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
	newsk->sk_backlog.len = 0;

	atomic_set(&newsk->sk_rmem_alloc, 0);

	refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);

	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);

	newsk->sk_dst_cache	= NULL;
	newsk->sk_dst_pending_confirm = 0;
	newsk->sk_wmem_queued	= 0;
	newsk->sk_forward_alloc = 0;
	newsk->sk_reserved_mem  = 0;
	DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters);
	sk_drops_reset(newsk);
	newsk->sk_send_head	= NULL;
	/* Every user lock of the parent is inherited except
	 * SOCK_BINDPORT_LOCK.
	 */
	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
	atomic_set(&newsk->sk_zckey, 0);

	sock_reset_flag(newsk, SOCK_DONE);

#ifdef CONFIG_MEMCG
	/* sk->sk_memcg will be populated at accept() time */
	newsk->sk_memcg = NULL;
#endif

	cgroup_sk_clone(&newsk->sk_cgrp_data);

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		/* though it's an empty new sock, the charging may fail
		 * if sysctl_optmem_max was changed between creation of
		 * original socket and cloning
		 */
		is_charged = sk_filter_charge(newsk, filter);
	RCU_INIT_POINTER(newsk->sk_filter, filter);
	rcu_read_unlock();

	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
		/* We need to make sure that we don't uncharge the new
		 * socket if we couldn't charge it in the first place
		 * as otherwise we uncharge the parent's filter.
		 */
		if (!is_charged)
			RCU_INIT_POINTER(newsk->sk_filter, NULL);

		goto free;
	}

	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

	if (bpf_sk_storage_clone(sk, newsk))
		goto free;

	/* Clear sk_user_data if parent had the pointer tagged
	 * as not suitable for copying when cloning.
	 */
	if (sk_user_data_is_nocopy(newsk))
		newsk->sk_user_data = NULL;

	newsk->sk_err	   = 0;
	newsk->sk_err_soft = 0;
	newsk->sk_priority = 0;
	newsk->sk_incoming_cpu = raw_smp_processor_id();

	/* Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&newsk->sk_refcnt, 2);

	/* The clone is not attached to any struct socket or wait queue yet. */
	sk_set_socket(newsk, NULL);
	sk_tx_queue_clear(newsk);
	sk_rx_queue_clear(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, NULL);

	if (newsk->sk_prot->sockets_allocated)
		sk_sockets_allocated_inc(newsk);

	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
		net_enable_timestamp();
out:
	return newsk;
free:
	/* It is still raw copy of parent, so invalidate
	 * destructor and make plain sk_free()
	 */
	newsk->sk_destruct = NULL;
	if (lock)
		bh_unlock_sock(newsk);
	sk_free(newsk);
	newsk = NULL;
	goto out;
}
EXPORT_SYMBOL_GPL(sk_clone);
2601 
2602 static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev)
2603 {
2604 	bool is_ipv6 = false;
2605 	u32 max_size;
2606 
2607 #if IS_ENABLED(CONFIG_IPV6)
2608 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2609 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2610 #endif
2611 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2612 	max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) :
2613 			READ_ONCE(dev->gso_ipv4_max_size);
2614 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2615 		max_size = GSO_LEGACY_MAX_SIZE;
2616 
2617 	return max_size - (MAX_TCP_HEADER + 1);
2618 }
2619 
2620 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2621 {
2622 	const struct net_device *dev;
2623 	u32 max_segs = 1;
2624 
2625 	rcu_read_lock();
2626 	dev = dst_dev_rcu(dst);
2627 	sk->sk_route_caps = dev->features;
2628 	if (sk_is_tcp(sk)) {
2629 		struct inet_connection_sock *icsk = inet_csk(sk);
2630 
2631 		sk->sk_route_caps |= NETIF_F_GSO;
2632 		icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
2633 	}
2634 	if (sk->sk_route_caps & NETIF_F_GSO)
2635 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2636 	if (unlikely(sk->sk_gso_disabled))
2637 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2638 	if (sk_can_gso(sk)) {
2639 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2640 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2641 		} else {
2642 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2643 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev);
2644 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2645 			max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1);
2646 		}
2647 	}
2648 	sk->sk_gso_max_segs = max_segs;
2649 	sk_dst_set(sk, dst);
2650 	rcu_read_unlock();
2651 }
2652 EXPORT_SYMBOL_GPL(sk_setup_caps);
2653 
2654 /*
2655  *	Simple resource managers for sockets.
2656  */
2657 
2658 
/*
 * Write buffer destructor automatically called from kfree_skb.
 *
 * Gives skb->truesize back to sk->sk_wmem_alloc. Unless the socket has
 * SOCK_USE_WRITE_QUEUE set, the writer is also notified, either through
 * sk->sk_write_space() or, on the SOCK_RCU_FREE path below, through
 * sock_def_write_space_wfree(). When the counter reaches zero the socket
 * is released with __sk_free().
 */
void sock_wfree(struct sk_buff *skb)
{
	unsigned int len = skb->truesize;
	struct sock *sk = skb->sk;
	bool free;
	int old;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/* SOCK_RCU_FREE socket that still uses the default
		 * write_space callback: subtract and notify inside a single
		 * RCU read-side section, then free outside of it if needed.
		 */
		if (sock_flag(sk, SOCK_RCU_FREE) &&
		    sk->sk_write_space == sock_def_write_space) {
			rcu_read_lock();
			free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc,
						       &old);
			sock_def_write_space_wfree(sk, old - len);
			rcu_read_unlock();
			if (unlikely(free))
				__sk_free(sk);
			return;
		}

		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		/* Only the single unit kept above is left to give back. */
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
2698 
2699 /* This variant of sock_wfree() is used by TCP,
2700  * since it sets SOCK_USE_WRITE_QUEUE.
2701  */
2702 void __sock_wfree(struct sk_buff *skb)
2703 {
2704 	struct sock *sk = skb->sk;
2705 
2706 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2707 		__sk_free(sk);
2708 }
2709 
2710 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2711 {
2712 	int old_wmem;
2713 
2714 	skb_orphan(skb);
2715 #ifdef CONFIG_INET
2716 	if (unlikely(!sk_fullsock(sk)))
2717 		return skb_set_owner_edemux(skb, sk);
2718 #endif
2719 	skb->sk = sk;
2720 	skb->destructor = sock_wfree;
2721 	skb_set_hash_from_sk(skb, sk);
2722 	/*
2723 	 * We used to take a refcount on sk, but following operation
2724 	 * is enough to guarantee sk_free() won't free this sock until
2725 	 * all in-flight packets are completed
2726 	 */
2727 	__refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem);
2728 
2729 	/* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket
2730 	 * is in a host queue (qdisc, NIC queue).
2731 	 * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue
2732 	 * based on XPS for better performance.
2733 	 * Otherwise clear ooo_okay to not risk Out Of Order delivery.
2734 	 */
2735 	skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS);
2736 }
2737 EXPORT_SYMBOL(skb_set_owner_w);
2738 
2739 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2740 {
2741 	/* Drivers depend on in-order delivery for crypto offload,
2742 	 * partial orphan breaks out-of-order-OK logic.
2743 	 */
2744 	if (skb_is_decrypted(skb))
2745 		return false;
2746 
2747 	return (skb->destructor == sock_wfree ||
2748 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2749 }
2750 
2751 /* This helper is used by netem, as it can hold packets in its
2752  * delay queue. We want to allow the owner socket to send more
2753  * packets, as if they were already TX completed by a typical driver.
2754  * But we also want to keep skb->sk set because some packet schedulers
2755  * rely on it (sch_fq for example).
2756  */
2757 void skb_orphan_partial(struct sk_buff *skb)
2758 {
2759 	if (skb_is_tcp_pure_ack(skb))
2760 		return;
2761 
2762 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2763 		return;
2764 
2765 	skb_orphan(skb);
2766 }
2767 EXPORT_SYMBOL(skb_orphan_partial);
2768 
2769 /*
2770  * Read buffer destructor automatically called from kfree_skb.
2771  */
2772 void sock_rfree(struct sk_buff *skb)
2773 {
2774 	struct sock *sk = skb->sk;
2775 	unsigned int len = skb->truesize;
2776 
2777 	atomic_sub(len, &sk->sk_rmem_alloc);
2778 	sk_mem_uncharge(sk, len);
2779 }
2780 EXPORT_SYMBOL(sock_rfree);
2781 
2782 /*
2783  * Buffer destructor for skbs that are not used directly in read or write
2784  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2785  */
2786 void sock_efree(struct sk_buff *skb)
2787 {
2788 	sock_put(skb->sk);
2789 }
2790 EXPORT_SYMBOL(sock_efree);
2791 
/* Destructor for the prefetch/receive path, where a reference count may
 * not be held on the socket, e.g. for listen sockets. Nothing is done for
 * sockets that are not refcounted.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
		struct request_sock *req = inet_reqsk(sk);

		/* Syncookie request: clear the listener pointer and free
		 * the request directly.
		 */
		req->rsk_listener = NULL;
		reqsk_free(req);
		return;
	}

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2813 
2814 /*
2815  * Allocate a skb from the socket's send buffer.
2816  */
2817 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2818 			     gfp_t priority)
2819 {
2820 	if (force ||
2821 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2822 		struct sk_buff *skb = alloc_skb(size, priority);
2823 
2824 		if (skb) {
2825 			skb_set_owner_w(skb, sk);
2826 			return skb;
2827 		}
2828 	}
2829 	return NULL;
2830 }
2831 EXPORT_SYMBOL(sock_wmalloc);
2832 
2833 static void sock_ofree(struct sk_buff *skb)
2834 {
2835 	struct sock *sk = skb->sk;
2836 
2837 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2838 }
2839 
2840 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2841 			     gfp_t priority)
2842 {
2843 	struct sk_buff *skb;
2844 
2845 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2846 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2847 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2848 		return NULL;
2849 
2850 	skb = alloc_skb(size, priority);
2851 	if (!skb)
2852 		return NULL;
2853 
2854 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2855 	skb->sk = sk;
2856 	skb->destructor = sock_ofree;
2857 	return skb;
2858 }
2859 
2860 /*
2861  * Allocate a memory block from the socket's option memory buffer.
2862  */
2863 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2864 {
2865 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2866 
2867 	if ((unsigned int)size <= optmem_max &&
2868 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2869 		void *mem;
2870 		/* First do the add, to avoid the race if kmalloc
2871 		 * might sleep.
2872 		 */
2873 		atomic_add(size, &sk->sk_omem_alloc);
2874 		mem = kmalloc(size, priority);
2875 		if (mem)
2876 			return mem;
2877 		atomic_sub(size, &sk->sk_omem_alloc);
2878 	}
2879 	return NULL;
2880 }
2881 EXPORT_SYMBOL(sock_kmalloc);
2882 
2883 /*
2884  * Duplicate the input "src" memory block using the socket's
2885  * option memory buffer.
2886  */
2887 void *sock_kmemdup(struct sock *sk, const void *src,
2888 		   int size, gfp_t priority)
2889 {
2890 	void *mem;
2891 
2892 	mem = sock_kmalloc(sk, size, priority);
2893 	if (mem)
2894 		memcpy(mem, src, size);
2895 	return mem;
2896 }
2897 EXPORT_SYMBOL(sock_kmemdup);
2898 
2899 /* Free an option memory block. Note, we actually want the inline
2900  * here as this allows gcc to detect the nullify and fold away the
2901  * condition entirely.
2902  */
2903 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2904 				  const bool nullify)
2905 {
2906 	if (WARN_ON_ONCE(!mem))
2907 		return;
2908 	if (nullify)
2909 		kfree_sensitive(mem);
2910 	else
2911 		kfree(mem);
2912 	atomic_sub(size, &sk->sk_omem_alloc);
2913 }
2914 
/* Free an option memory block of @size bytes with kfree() and subtract
 * @size from the socket's sk_omem_alloc.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);
2920 
/* Same as sock_kfree_s(), except that the block is released with
 * kfree_sensitive().
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
2926 
2927 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2928    I think, these locks should be removed for datagram sockets.
2929  */
2930 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2931 {
2932 	DEFINE_WAIT(wait);
2933 
2934 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2935 	for (;;) {
2936 		if (!timeo)
2937 			break;
2938 		if (signal_pending(current))
2939 			break;
2940 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2941 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2942 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2943 			break;
2944 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2945 			break;
2946 		if (READ_ONCE(sk->sk_err))
2947 			break;
2948 		timeo = schedule_timeout(timeo);
2949 	}
2950 	finish_wait(sk_sleep(sk), &wait);
2951 	return timeo;
2952 }
2953 
2954 
2955 /*
2956  *	Generic send/receive buffer handlers
2957  */
2958 
2959 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2960 				     unsigned long data_len, int noblock,
2961 				     int *errcode, int max_page_order)
2962 {
2963 	struct sk_buff *skb;
2964 	long timeo;
2965 	int err;
2966 
2967 	timeo = sock_sndtimeo(sk, noblock);
2968 	for (;;) {
2969 		err = sock_error(sk);
2970 		if (err != 0)
2971 			goto failure;
2972 
2973 		err = -EPIPE;
2974 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2975 			goto failure;
2976 
2977 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2978 			break;
2979 
2980 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2981 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2982 		err = -EAGAIN;
2983 		if (!timeo)
2984 			goto failure;
2985 		if (signal_pending(current))
2986 			goto interrupted;
2987 		timeo = sock_wait_for_wmem(sk, timeo);
2988 	}
2989 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2990 				   errcode, sk->sk_allocation);
2991 	if (skb)
2992 		skb_set_owner_w(skb, sk);
2993 	return skb;
2994 
2995 interrupted:
2996 	err = sock_intr_errno(timeo);
2997 failure:
2998 	*errcode = err;
2999 	return NULL;
3000 }
3001 EXPORT_SYMBOL(sock_alloc_send_pskb);
3002 
3003 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
3004 		     struct sockcm_cookie *sockc)
3005 {
3006 	u32 tsflags;
3007 
3008 	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
3009 
3010 	switch (cmsg->cmsg_type) {
3011 	case SO_MARK:
3012 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
3013 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3014 			return -EPERM;
3015 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3016 			return -EINVAL;
3017 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
3018 		break;
3019 	case SO_TIMESTAMPING_OLD:
3020 	case SO_TIMESTAMPING_NEW:
3021 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3022 			return -EINVAL;
3023 
3024 		tsflags = *(u32 *)CMSG_DATA(cmsg);
3025 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
3026 			return -EINVAL;
3027 
3028 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
3029 		sockc->tsflags |= tsflags;
3030 		break;
3031 	case SCM_TXTIME:
3032 		if (!sock_flag(sk, SOCK_TXTIME))
3033 			return -EINVAL;
3034 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
3035 			return -EINVAL;
3036 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
3037 		break;
3038 	case SCM_TS_OPT_ID:
3039 		if (sk_is_tcp(sk))
3040 			return -EINVAL;
3041 		tsflags = READ_ONCE(sk->sk_tsflags);
3042 		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
3043 			return -EINVAL;
3044 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3045 			return -EINVAL;
3046 		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
3047 		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
3048 		break;
3049 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
3050 	case SCM_RIGHTS:
3051 	case SCM_CREDENTIALS:
3052 		break;
3053 	case SO_PRIORITY:
3054 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3055 			return -EINVAL;
3056 		if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
3057 			return -EPERM;
3058 		sockc->priority = *(u32 *)CMSG_DATA(cmsg);
3059 		break;
3060 	case SCM_DEVMEM_DMABUF:
3061 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3062 			return -EINVAL;
3063 		sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
3064 		break;
3065 	default:
3066 		return -EINVAL;
3067 	}
3068 	return 0;
3069 }
3070 EXPORT_SYMBOL(__sock_cmsg_send);
3071 
3072 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
3073 		   struct sockcm_cookie *sockc)
3074 {
3075 	struct cmsghdr *cmsg;
3076 	int ret;
3077 
3078 	for_each_cmsghdr(cmsg, msg) {
3079 		if (!CMSG_OK(msg, cmsg))
3080 			return -EINVAL;
3081 		if (cmsg->cmsg_level != SOL_SOCKET)
3082 			continue;
3083 		ret = __sock_cmsg_send(sk, cmsg, sockc);
3084 		if (ret)
3085 			return ret;
3086 	}
3087 	return 0;
3088 }
3089 EXPORT_SYMBOL(sock_cmsg_send);
3090 
3091 static void sk_enter_memory_pressure(struct sock *sk)
3092 {
3093 	if (!sk->sk_prot->enter_memory_pressure)
3094 		return;
3095 
3096 	sk->sk_prot->enter_memory_pressure(sk);
3097 }
3098 
3099 static void sk_leave_memory_pressure(struct sock *sk)
3100 {
3101 	if (sk->sk_prot->leave_memory_pressure) {
3102 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
3103 				     tcp_leave_memory_pressure, sk);
3104 	} else {
3105 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
3106 
3107 		if (memory_pressure && READ_ONCE(*memory_pressure))
3108 			WRITE_ONCE(*memory_pressure, 0);
3109 	}
3110 }
3111 
/* Static key, defined in the false state. While it is enabled,
 * skb_page_frag_refill() skips its high-order allocation attempt and
 * falls straight back to single pages.
 */
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
3113 
3114 /**
3115  * skb_page_frag_refill - check that a page_frag contains enough room
3116  * @sz: minimum size of the fragment we want to get
3117  * @pfrag: pointer to page_frag
3118  * @gfp: priority for memory allocation
3119  *
3120  * Note: While this allocator tries to use high order pages, there is
3121  * no guarantee that allocations succeed. Therefore, @sz MUST be
3122  * less or equal than PAGE_SIZE.
3123  */
3124 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3125 {
3126 	if (pfrag->page) {
3127 		if (page_ref_count(pfrag->page) == 1) {
3128 			pfrag->offset = 0;
3129 			return true;
3130 		}
3131 		if (pfrag->offset + sz <= pfrag->size)
3132 			return true;
3133 		put_page(pfrag->page);
3134 	}
3135 
3136 	pfrag->offset = 0;
3137 	if (SKB_FRAG_PAGE_ORDER &&
3138 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3139 		/* Avoid direct reclaim but allow kswapd to wake */
3140 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3141 					  __GFP_COMP | __GFP_NOWARN |
3142 					  __GFP_NORETRY,
3143 					  SKB_FRAG_PAGE_ORDER);
3144 		if (likely(pfrag->page)) {
3145 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3146 			return true;
3147 		}
3148 	}
3149 	pfrag->page = alloc_page(gfp);
3150 	if (likely(pfrag->page)) {
3151 		pfrag->size = PAGE_SIZE;
3152 		return true;
3153 	}
3154 	return false;
3155 }
3156 EXPORT_SYMBOL(skb_page_frag_refill);
3157 
3158 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3159 {
3160 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3161 		return true;
3162 
3163 	if (!sk->sk_bypass_prot_mem)
3164 		sk_enter_memory_pressure(sk);
3165 
3166 	sk_stream_moderate_sndbuf(sk);
3167 
3168 	return false;
3169 }
3170 EXPORT_SYMBOL(sk_page_frag_refill);
3171 
3172 static void __lock_sock(struct sock *sk)
3173 	__releases(&sk->sk_lock.slock)
3174 	__acquires(&sk->sk_lock.slock)
3175 {
3176 	DEFINE_WAIT(wait);
3177 
3178 	for (;;) {
3179 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3180 					TASK_UNINTERRUPTIBLE);
3181 		spin_unlock_bh(&sk->sk_lock.slock);
3182 		schedule();
3183 		spin_lock_bh(&sk->sk_lock.slock);
3184 		if (!sock_owned_by_user(sk))
3185 			break;
3186 	}
3187 	finish_wait(&sk->sk_lock.wq, &wait);
3188 }
3189 
3190 void __release_sock(struct sock *sk)
3191 	__releases(&sk->sk_lock.slock)
3192 	__acquires(&sk->sk_lock.slock)
3193 {
3194 	struct sk_buff *skb, *next;
3195 	int nb = 0;
3196 
3197 	while ((skb = sk->sk_backlog.head) != NULL) {
3198 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3199 
3200 		spin_unlock_bh(&sk->sk_lock.slock);
3201 
3202 		while (1) {
3203 			next = skb->next;
3204 			prefetch(next);
3205 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3206 			skb_mark_not_on_list(skb);
3207 			sk_backlog_rcv(sk, skb);
3208 
3209 			skb = next;
3210 			if (!skb)
3211 				break;
3212 
3213 			if (!(++nb & 15))
3214 				cond_resched();
3215 		}
3216 
3217 		spin_lock_bh(&sk->sk_lock.slock);
3218 	}
3219 
3220 	/*
3221 	 * Doing the zeroing here guarantee we can not loop forever
3222 	 * while a wild producer attempts to flood us.
3223 	 */
3224 	sk->sk_backlog.len = 0;
3225 }
3226 
3227 void __sk_flush_backlog(struct sock *sk)
3228 {
3229 	spin_lock_bh(&sk->sk_lock.slock);
3230 	__release_sock(sk);
3231 
3232 	if (sk->sk_prot->release_cb)
3233 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3234 				     tcp_release_cb, sk);
3235 
3236 	spin_unlock_bh(&sk->sk_lock.slock);
3237 }
3238 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3239 
3240 /**
3241  * sk_wait_data - wait for data to arrive at sk_receive_queue
3242  * @sk:    sock to wait on
3243  * @timeo: for how long
3244  * @skb:   last skb seen on sk_receive_queue
3245  *
3246  * Now socket state including sk->sk_err is changed only under lock,
3247  * hence we may omit checks after joining wait queue.
3248  * We check receive queue before schedule() only as optimization;
3249  * it is very likely that release_sock() added new data.
3250  */
3251 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3252 {
3253 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3254 	int rc;
3255 
3256 	add_wait_queue(sk_sleep(sk), &wait);
3257 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3258 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3259 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3260 	remove_wait_queue(sk_sleep(sk), &wait);
3261 	return rc;
3262 }
3263 EXPORT_SYMBOL(sk_wait_data);
3264 
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
 *
 *	Unlike the globally shared limits among the sockets under same protocol,
 *	consuming the budget of a memcg won't have direct effect on other ones.
 *	So be optimistic about memcg's tolerance, and leave the callers to decide
 *	whether or not to raise allocated through sk_under_memory_pressure() or
 *	its variants.
 *
 *	Return: 1 when the allocation is accepted, 0 when it is suppressed
 *	(in which case the accounting done on entry has been undone).
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	bool memcg_enabled = false, charged = false;
	struct proto *prot = sk->sk_prot;
	long allocated = 0;

	/* Sockets with sk_bypass_prot_mem set leave 'allocated' at 0. */
	if (!sk->sk_bypass_prot_mem) {
		sk_memory_allocated_add(sk, amt);
		allocated = sk_memory_allocated(sk);
	}

	if (mem_cgroup_sk_enabled(sk)) {
		memcg_enabled = true;
		charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
		if (!charged)
			goto suppress_allocation;
	}

	/* Nothing to compare against the protocol limits: accept. */
	if (!allocated)
		return 1;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* Guarantee minimum buffer size under pressure (either global
	 * or memcg) to make sure features described in RFC 7323 (TCP
	 * Extensions for High Performance) work properly.
	 *
	 * This rule does NOT stand when exceeds global or memcg's hard
	 * limit, or else a DoS attack can be taken place by spawning
	 * lots of sockets whose usage are under minimum buffer size.
	 */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		/* Stream sockets are judged on sk_wmem_queued, others on sk_wmem_alloc. */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
				return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		/* The following 'average' heuristic is within the
		 * scope of global accounting, so it only makes
		 * sense for global memory pressure.
		 */
		if (!sk_under_global_memory_pressure(sk))
			return 1;

		/* Try to be fair among all the sockets under global
		 * pressure by allowing the ones that below average
		 * usage to raise.
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
			/* Force charge with __GFP_NOFAIL */
			if (memcg_enabled && !charged)
				mem_cgroup_sk_charge(sk, amt,
						     gfp_memcg_charge() | __GFP_NOFAIL);
			return 1;
		}
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	/* Refused: roll back whatever was accounted on entry. */
	if (allocated)
		sk_memory_allocated_sub(sk, amt);

	if (charged)
		mem_cgroup_sk_uncharge(sk, amt);

	return 0;
}
3387 
3388 /**
3389  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3390  *	@sk: socket
3391  *	@size: memory size to allocate
3392  *	@kind: allocation type
3393  *
3394  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3395  *	rmem allocation. This function assumes that protocols which have
3396  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3397  */
3398 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3399 {
3400 	int ret, amt = sk_mem_pages(size);
3401 
3402 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3403 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3404 	if (!ret)
3405 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3406 	return ret;
3407 }
3408 EXPORT_SYMBOL(__sk_mem_schedule);
3409 
3410 /**
3411  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3412  *	@sk: socket
3413  *	@amount: number of quanta
3414  *
3415  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3416  */
3417 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3418 {
3419 	if (mem_cgroup_sk_enabled(sk))
3420 		mem_cgroup_sk_uncharge(sk, amount);
3421 
3422 	if (sk->sk_bypass_prot_mem)
3423 		return;
3424 
3425 	sk_memory_allocated_sub(sk, amount);
3426 
3427 	if (sk_under_global_memory_pressure(sk) &&
3428 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3429 		sk_leave_memory_pressure(sk);
3430 }
3431 
3432 /**
3433  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3434  *	@sk: socket
3435  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3436  */
3437 void __sk_mem_reclaim(struct sock *sk, int amount)
3438 {
3439 	amount >>= PAGE_SHIFT;
3440 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3441 	__sk_mem_reduce_allocated(sk, amount);
3442 }
3443 EXPORT_SYMBOL(__sk_mem_reclaim);
3444 
3445 void __sk_charge(struct sock *sk, gfp_t gfp)
3446 {
3447 	int amt;
3448 
3449 	gfp |= __GFP_NOFAIL;
3450 	if (mem_cgroup_from_sk(sk)) {
3451 		/* The socket has not been accepted yet, no need
3452 		 * to look at newsk->sk_wmem_queued.
3453 		 */
3454 		amt = sk_mem_pages(sk->sk_forward_alloc +
3455 				   atomic_read(&sk->sk_rmem_alloc));
3456 		if (amt)
3457 			mem_cgroup_sk_charge(sk, amt, gfp);
3458 	}
3459 
3460 	kmem_cache_charge(sk, gfp);
3461 }
3462 
/* Store @val in sk->sk_peek_off with WRITE_ONCE(). Always returns 0. */
int sk_set_peek_off(struct sock *sk, int val)
{
	WRITE_ONCE(sk->sk_peek_off, val);
	return 0;
}
3468 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3469 
3470 /*
3471  * Set of default routines for initialising struct proto_ops when
3472  * the protocol does not support a particular function. In certain
3473  * cases where it makes no sense for a protocol to have a "do nothing"
3474  * function, some default processing is provided.
3475  */
3476 
/* Default bind handler for protocols without bind support: -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len)
{
	return -EOPNOTSUPP;
}
3481 EXPORT_SYMBOL(sock_no_bind);
3482 
/* Default connect handler for protocols without connect support: -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
3488 EXPORT_SYMBOL(sock_no_connect);
3489 
/* Default socketpair handler for protocols that cannot pair sockets: -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
3494 EXPORT_SYMBOL(sock_no_socketpair);
3495 
/* Default accept handler for protocols without accept support: -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock,
		   struct proto_accept_arg *arg)
{
	return -EOPNOTSUPP;
}
3501 EXPORT_SYMBOL(sock_no_accept);
3502 
/* Default getname handler for protocols without address lookup: -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
3508 EXPORT_SYMBOL(sock_no_getname);
3509 
/* Default ioctl handler for protocols without ioctl support: -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
3514 EXPORT_SYMBOL(sock_no_ioctl);
3515 
/* Default listen handler for protocols without listen support: -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
3520 EXPORT_SYMBOL(sock_no_listen);
3521 
/* Default shutdown handler for protocols without shutdown support: -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
3526 EXPORT_SYMBOL(sock_no_shutdown);
3527 
/* Default sendmsg handler for protocols that cannot send: -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
3532 EXPORT_SYMBOL(sock_no_sendmsg);
3533 
/* Stub sendmsg_locked handler taking a struct sock: always -EOPNOTSUPP. */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
3538 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3539 
/* Default recvmsg handler for protocols that cannot receive: -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
3545 EXPORT_SYMBOL(sock_no_recvmsg);
3546 
/*
 * Default mmap handler for protocols without mmap support. Unlike the other
 * sock_no_*() stubs this one returns -ENODEV rather than -EOPNOTSUPP.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
3552 EXPORT_SYMBOL(sock_no_mmap);
3553 
3554 /*
3555  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3556  * various sock-based usage counts.
3557  */
3558 void __receive_sock(struct file *file)
3559 {
3560 	struct socket *sock;
3561 
3562 	sock = sock_from_file(file);
3563 	if (sock) {
3564 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3565 		sock_update_classid(&sock->sk->sk_cgrp_data);
3566 	}
3567 }
3568 
3569 /*
3570  *	Default Socket Callbacks
3571  */
3572 
/*
 * Default sk_state_change callback (installed by sock_init_data_uid()):
 * wake all interruptible sleepers on the socket wait queue, if any.
 */
static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	/* sk->sk_wq is RCU protected; it is only dereferenced inside this section. */
	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}
3583 
/*
 * Default sk_error_report callback (installed by sock_init_data_uid()):
 * wake pollers with EPOLLERR and send the SOCK_WAKE_IO/POLL_ERR async
 * notification.
 */
static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
	/* The async notification is sent whether or not anyone was sleeping. */
	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}
3595 
/*
 * Default sk_data_ready callback (installed by sock_init_data_uid()):
 * emit the sk_data_ready trace event, wake pollers waiting for readable
 * events and send the SOCK_WAKE_WAITD/POLL_IN async notification.
 */
void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	trace_sk_data_ready(sk);

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	/* The async notification is sent whether or not anyone was sleeping. */
	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}
3610 
/*
 * Default sk_write_space callback (installed by sock_init_data_uid()):
 * once sock_writeable() holds, wake pollers waiting for writable events
 * and send the SOCK_WAKE_SPACE/POLL_OUT async notification.
 */
static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if (sock_writeable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}
3632 
/* An optimised version of sock_def_write_space(), should only be called
 * for SOCK_RCU_FREE sockets under RCU read section and after putting
 * ->sk_wmem_alloc.
 *
 * @wmem_alloc is handed to __sock_writeable() for the writeability test.
 * No rcu_read_lock() is taken here; the caller provides the read section.
 */
static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc)
{
	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if (__sock_writeable(sk, wmem_alloc)) {
		struct socket_wq *wq = rcu_dereference(sk->sk_wq);

		/* rely on refcount_sub from sock_wfree() */
		smp_mb__after_atomic();
		/* The barrier above precedes the waitqueue_active() check. */
		if (wq && waitqueue_active(&wq->wait))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
}
3655 
/* Default sk_destruct callback (installed by sock_init_data_uid()): does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
3659 
3660 void sk_send_sigurg(struct sock *sk)
3661 {
3662 	if (sk->sk_socket && sk->sk_socket->file)
3663 		if (send_sigurg(sk->sk_socket->file))
3664 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3665 }
3666 EXPORT_SYMBOL(sk_send_sigurg);
3667 
/*
 * (Re)arm @timer to fire at @expires. A reference on @sk is taken only
 * when mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
3674 EXPORT_SYMBOL(sk_reset_timer);
3675 
/*
 * Delete @timer. A reference on @sk is dropped with __sock_put() only
 * when timer_delete() returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!timer_delete(timer))
		return;

	__sock_put(sk);
}
3681 EXPORT_SYMBOL(sk_stop_timer);
3682 
/*
 * Same as sk_stop_timer() but built on timer_delete_sync(): the reference
 * on @sk is dropped only when that call returns non-zero.
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (!timer_delete_sync(timer))
		return;

	__sock_put(sk);
}
3688 EXPORT_SYMBOL(sk_stop_timer_sync);
3689 
/*
 * Initialise the generic fields of @sk, attach it to @sock (which may be
 * NULL) and record @uid as its owner. On return sk_refcnt is 1, the state
 * is TCP_CLOSE, SOCK_ZAPPED is set and the sock_def_*() callbacks are
 * installed.
 */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	/* Buffer sizes start from the rmem/wmem default sysctls. */
	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_use_task_frag	=	true;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	/* With a socket, share its wait queue and link both directions. */
	if (sock) {
		sk->sk_type	=	sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk	=	sk;
	} else {
		RCU_INIT_POINTER(sk->sk_wq, NULL);
	}
	sk->sk_uid	=	uid;

	/* Default callbacks. */
	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	spin_lock_init(&sk->sk_peer_lock);

	/* Send and receive timeouts start at MAX_SCHEDULE_TIMEOUT. */
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
#endif

	/* Pacing rates start at ~0UL. */
	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	sk_drops_reset(sk);
}
3759 EXPORT_SYMBOL(sock_init_data_uid);
3760 
3761 void sock_init_data(struct socket *sock, struct sock *sk)
3762 {
3763 	kuid_t uid = sock ?
3764 		SOCK_INODE(sock)->i_uid :
3765 		make_kuid(sock_net(sk)->user_ns, 0);
3766 
3767 	sock_init_data_uid(sock, sk, uid);
3768 }
3769 EXPORT_SYMBOL(sock_init_data);
3770 
/*
 * Take ownership of the socket lock (sk_lock.owned = 1), using lockdep
 * subclass @subclass. May sleep.
 */
void noinline lock_sock_nested(struct sock *sk, int subclass)
{
	/* The sk_lock has mutex_lock() semantics here. */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

	might_sleep();
#ifdef CONFIG_64BIT
	/*
	 * Fast path: when {slock, owned} fits in one long, try a single
	 * cmpxchg of the combined word from "unlocked, not owned" to
	 * "unlocked, owned" and return on success.
	 */
	if (sizeof(struct slock_owned) == sizeof(long)) {
		socket_lock_t tmp = {
			.slock = __SPIN_LOCK_UNLOCKED(tmp.slock),
			.owned = 1,
		};
		socket_lock_t old = {
			.slock = __SPIN_LOCK_UNLOCKED(old.slock),
			.owned = 0,
		};

		if (likely(try_cmpxchg(&sk->sk_lock.combined,
				       &old.combined, tmp.combined)))
			return;
	}
#endif
	/* Slow path: take the spinlock and call __lock_sock() if already owned. */
	spin_lock_bh(&sk->sk_lock.slock);
	if (unlikely(sock_owned_by_user_nocheck(sk)))
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock_bh(&sk->sk_lock.slock);
}
3799 EXPORT_SYMBOL(lock_sock_nested);
3800 
/*
 * Release ownership of the socket lock. Under sk_lock.slock this runs
 * __release_sock() when sk_backlog.tail is set, invokes the protocol's
 * release_cb, drops ownership and wakes anyone waiting on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);

	if (unlikely(sk->sk_backlog.tail))
		__release_sock(sk);

	if (sk->sk_prot->release_cb) {
		/*
		 * release_cb is skipped when tcp_release_cb_cond() returns true.
		 * NOTE(review): presumably that helper already handled the TCP
		 * case itself - confirm against its definition.
		 */
		if (!tcp_release_cb_cond(sk))
			sk->sk_prot->release_cb(sk);
	}
	sock_release_ownership(sk);
	if (unlikely(waitqueue_active(&sk->sk_lock.wq)))
		wake_up(&sk->sk_lock.wq);

	spin_unlock_bh(&sk->sk_lock.slock);
}
3818 EXPORT_SYMBOL(release_sock);
3819 
/*
 * Fast variant of socket locking. May sleep.
 *
 * Returns false on the fast path, with bottom halves disabled and
 * sk_lock.slock still held. Returns true on the slow path, where the
 * socket is owned (sk_lock.owned = 1) and slock has been released.
 */
bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (likely(!sock_owned_by_user_nocheck(sk))) {
		/*
		 * Fast path return with bottom halves disabled and
		 * sock::sk_lock.slock held.
		 *
		 * The 'mutex' is not contended and holding
		 * sock::sk_lock.slock prevents all other lockers to
		 * proceed so the corresponding unlock_sock_fast() can
		 * avoid the slow path of release_sock() completely and
		 * just release slock.
		 *
		 * From a semantical POV this is equivalent to 'acquiring'
		 * the 'mutex', hence the corresponding lockdep
		 * mutex_release() has to happen in the fast path of
		 * unlock_sock_fast().
		 */
		return false;
	}

	/* Slow path: already owned, so go through __lock_sock() and take ownership. */
	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	/* Balances the __acquires() annotation for the path that drops slock. */
	__acquire(&sk->sk_lock.slock);
	spin_unlock_bh(&sk->sk_lock.slock);
	return true;
}
3850 EXPORT_SYMBOL(__lock_sock_fast);
3851 
3852 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3853 		   bool timeval, bool time32)
3854 {
3855 	struct sock *sk = sock->sk;
3856 	struct timespec64 ts;
3857 
3858 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3859 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3860 	if (ts.tv_sec == -1)
3861 		return -ENOENT;
3862 	if (ts.tv_sec == 0) {
3863 		ktime_t kt = ktime_get_real();
3864 		sock_write_timestamp(sk, kt);
3865 		ts = ktime_to_timespec64(kt);
3866 	}
3867 
3868 	if (timeval)
3869 		ts.tv_nsec /= 1000;
3870 
3871 #ifdef CONFIG_COMPAT_32BIT_TIME
3872 	if (time32)
3873 		return put_old_timespec32(&ts, userstamp);
3874 #endif
3875 #ifdef CONFIG_SPARC64
3876 	/* beware of padding in sparc64 timeval */
3877 	if (timeval && !in_compat_syscall()) {
3878 		struct __kernel_old_timeval __user tv = {
3879 			.tv_sec = ts.tv_sec,
3880 			.tv_usec = ts.tv_nsec,
3881 		};
3882 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3883 			return -EFAULT;
3884 		return 0;
3885 	}
3886 #endif
3887 	return put_timespec64(&ts, userstamp);
3888 }
3889 EXPORT_SYMBOL(sock_gettstamp);
3890 
3891 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3892 {
3893 	if (!sock_flag(sk, flag)) {
3894 		unsigned long previous_flags = sk->sk_flags;
3895 
3896 		sock_set_flag(sk, flag);
3897 		/*
3898 		 * we just set one of the two flags which require net
3899 		 * time stamping, but time stamping might have been on
3900 		 * already because of the other one
3901 		 */
3902 		if (sock_needs_netstamp(sk) &&
3903 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3904 			net_enable_timestamp();
3905 	}
3906 }
3907 
3908 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3909 		       int level, int type)
3910 {
3911 	struct sock_extended_err ee;
3912 	struct sk_buff *skb;
3913 	int copied, err;
3914 
3915 	err = -EAGAIN;
3916 	skb = sock_dequeue_err_skb(sk);
3917 	if (skb == NULL)
3918 		goto out;
3919 
3920 	copied = skb->len;
3921 	if (copied > len) {
3922 		msg->msg_flags |= MSG_TRUNC;
3923 		copied = len;
3924 	}
3925 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3926 	if (err)
3927 		goto out_free_skb;
3928 
3929 	sock_recv_timestamp(msg, sk, skb);
3930 
3931 	/* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */
3932 	ee = SKB_EXT_ERR(skb)->ee;
3933 	put_cmsg(msg, level, type, sizeof(ee), &ee);
3934 
3935 	msg->msg_flags |= MSG_ERRQUEUE;
3936 	err = copied;
3937 
3938 out_free_skb:
3939 	kfree_skb(skb);
3940 out:
3941 	return err;
3942 }
3943 EXPORT_SYMBOL(sock_recv_errqueue);
3944 
3945 /*
3946  *	Get a socket option on an socket.
3947  *
3948  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3949  *	asynchronous errors should be reported by getsockopt. We assume
3950  *	this means if you specify SO_ERROR (otherwise what is the point of it).
3951  */
3952 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3953 			   char __user *optval, int __user *optlen)
3954 {
3955 	struct sock *sk = sock->sk;
3956 
3957 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3958 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3959 }
3960 EXPORT_SYMBOL(sock_common_getsockopt);
3961 
3962 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3963 			int flags)
3964 {
3965 	struct sock *sk = sock->sk;
3966 
3967 	return sk->sk_prot->recvmsg(sk, msg, size, flags);
3968 }
3969 EXPORT_SYMBOL(sock_common_recvmsg);
3970 
3971 /*
3972  *	Set socket options on an inet socket.
3973  */
3974 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3975 			   sockptr_t optval, unsigned int optlen)
3976 {
3977 	struct sock *sk = sock->sk;
3978 
3979 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3980 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3981 }
3982 EXPORT_SYMBOL(sock_common_setsockopt);
3983 
/*
 * Common release path: call the protocol's destroy hook if it has one,
 * unhash the sock, orphan it, free its xfrm policy and drop a reference.
 */
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket. But the network stack still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but some
	 * packets may still be in flight because another CPU running the
	 * receive path did its hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and be purged by the
	 * socket destructor.
	 *
	 * We also still have packets pending on the receive queue and,
	 * probably, our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sock_put(sk);
}
4017 EXPORT_SYMBOL(sk_common_release);
4018 
4019 void sk_get_meminfo(const struct sock *sk, u32 *mem)
4020 {
4021 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
4022 
4023 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
4024 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
4025 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
4026 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
4027 	mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
4028 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
4029 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
4030 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
4031 	mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
4032 }
4033 
4034 #ifdef CONFIG_PROC_FS
4035 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
4036 
4037 int sock_prot_inuse_get(struct net *net, struct proto *prot)
4038 {
4039 	int cpu, idx = prot->inuse_idx;
4040 	int res = 0;
4041 
4042 	for_each_possible_cpu(cpu)
4043 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
4044 
4045 	return res >= 0 ? res : 0;
4046 }
4047 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
4048 
4049 int sock_inuse_get(struct net *net)
4050 {
4051 	int cpu, res = 0;
4052 
4053 	for_each_possible_cpu(cpu)
4054 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
4055 
4056 	return res;
4057 }
4058 
4059 EXPORT_SYMBOL_GPL(sock_inuse_get);
4060 
4061 static int __net_init sock_inuse_init_net(struct net *net)
4062 {
4063 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
4064 	if (net->core.prot_inuse == NULL)
4065 		return -ENOMEM;
4066 	return 0;
4067 }
4068 
/* Per-netns exit: free the per-cpu prot_inuse counters of @net. */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
}
4073 
/* Per-netns hooks that allocate and free the prot_inuse counters. */
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
4078 
4079 static __init int net_inuse_init(void)
4080 {
4081 	if (register_pernet_subsys(&net_inuse_ops))
4082 		panic("Cannot initialize net inuse counters");
4083 
4084 	return 0;
4085 }
4086 
4087 core_initcall(net_inuse_init);
4088 
4089 static int assign_proto_idx(struct proto *prot)
4090 {
4091 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
4092 
4093 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
4094 		pr_err("PROTO_INUSE_NR exhausted\n");
4095 		return -ENOSPC;
4096 	}
4097 
4098 	set_bit(prot->inuse_idx, proto_inuse_idx);
4099 	return 0;
4100 }
4101 
4102 static void release_proto_idx(struct proto *prot)
4103 {
4104 	if (prot->inuse_idx != PROTO_INUSE_NR)
4105 		clear_bit(prot->inuse_idx, proto_inuse_idx);
4106 }
4107 #else
/* Without CONFIG_PROC_FS there is no in-use index to assign: always 0. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}
4112 
/* Without CONFIG_PROC_FS there is no in-use index to release: no-op. */
static inline void release_proto_idx(struct proto *prot)
{
}
4116 
4117 #endif
4118 
4119 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
4120 {
4121 	if (!twsk_prot)
4122 		return;
4123 	kfree(twsk_prot->twsk_slab_name);
4124 	twsk_prot->twsk_slab_name = NULL;
4125 	kmem_cache_destroy(twsk_prot->twsk_slab);
4126 	twsk_prot->twsk_slab = NULL;
4127 }
4128 
4129 static int tw_prot_init(const struct proto *prot)
4130 {
4131 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
4132 
4133 	if (!twsk_prot)
4134 		return 0;
4135 
4136 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
4137 					      prot->name);
4138 	if (!twsk_prot->twsk_slab_name)
4139 		return -ENOMEM;
4140 
4141 	twsk_prot->twsk_slab =
4142 		kmem_cache_create(twsk_prot->twsk_slab_name,
4143 				  twsk_prot->twsk_obj_size, 0,
4144 				  SLAB_ACCOUNT | prot->slab_flags,
4145 				  NULL);
4146 	if (!twsk_prot->twsk_slab) {
4147 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
4148 			prot->name);
4149 		return -ENOMEM;
4150 	}
4151 
4152 	return 0;
4153 }
4154 
4155 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
4156 {
4157 	if (!rsk_prot)
4158 		return;
4159 	kfree(rsk_prot->slab_name);
4160 	rsk_prot->slab_name = NULL;
4161 	kmem_cache_destroy(rsk_prot->slab);
4162 	rsk_prot->slab = NULL;
4163 }
4164 
4165 static int req_prot_init(const struct proto *prot)
4166 {
4167 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
4168 
4169 	if (!rsk_prot)
4170 		return 0;
4171 
4172 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4173 					prot->name);
4174 	if (!rsk_prot->slab_name)
4175 		return -ENOMEM;
4176 
4177 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4178 					   rsk_prot->obj_size, 0,
4179 					   SLAB_ACCOUNT | prot->slab_flags,
4180 					   NULL);
4181 
4182 	if (!rsk_prot->slab) {
4183 		pr_crit("%s: Can't create request sock SLAB cache!\n",
4184 			prot->name);
4185 		return -ENOMEM;
4186 	}
4187 	return 0;
4188 }
4189 
/**
 * proto_register - add a protocol to the global protocol list
 * @prot: protocol to register
 * @alloc_slab: when non-zero, create the sock slab cache for @prot along
 *	with its request-sock and timewait-sock caches
 *
 * Return: 0 on success; -EINVAL when @prot sets memory_allocated without
 * sysctl_mem or per_cpu_fw_alloc; -ENOBUFS when one of the slab caches
 * cannot be set up; otherwise the error from assign_proto_idx().
 */
int proto_register(struct proto *prot, int alloc_slab)
{
	/* Slab setup failures below all report this default. */
	int ret = -ENOBUFS;

	if (prot->memory_allocated && !prot->sysctl_mem) {
		pr_err("%s: missing sysctl_mem\n", prot->name);
		return -EINVAL;
	}
	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
		return -EINVAL;
	}
	if (alloc_slab) {
		struct kmem_cache_args args = {
			.useroffset	= prot->useroffset,
			.usersize	= prot->usersize,
			.freeptr_offset = prot->freeptr_offset,
			.use_freeptr_offset = !!prot->freeptr_offset,
		};

		prot->slab = kmem_cache_create(prot->name, prot->obj_size,
					&args,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags);
		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		/* The helpers' own error codes are dropped; ret stays -ENOBUFS. */
		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	/* Index assignment and list insertion happen under proto_list_mutex. */
	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

	/* Unwind in reverse order of setup; each label falls through to the next. */
out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
4250 EXPORT_SYMBOL(proto_register);
4251 
/**
 * proto_unregister - remove a protocol from the global protocol list
 * @prot: protocol to unregister
 *
 * Releases the in-use index and unlinks @prot under proto_list_mutex,
 * then destroys its sock slab cache and cleans up the request-sock and
 * timewait-sock caches.
 */
void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	/* Cache teardown is done after the mutex has been dropped. */
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
4265 EXPORT_SYMBOL(proto_unregister);
4266 
4267 int sock_load_diag_module(int family, int protocol)
4268 {
4269 	if (!protocol) {
4270 		if (!sock_is_registered(family))
4271 			return -ENOENT;
4272 
4273 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4274 				      NETLINK_SOCK_DIAG, family);
4275 	}
4276 
4277 #ifdef CONFIG_INET
4278 	if (family == AF_INET &&
4279 	    protocol != IPPROTO_RAW &&
4280 	    protocol < MAX_INET_PROTOS &&
4281 	    !rcu_access_pointer(inet_protos[protocol]))
4282 		return -ENOENT;
4283 #endif
4284 
4285 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4286 			      NETLINK_SOCK_DIAG, family, protocol);
4287 }
4288 EXPORT_SYMBOL(sock_load_diag_module);
4289 
4290 #ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take proto_list_mutex and position the iterator at
 * *pos in proto_list, with the list head itself as the first element.
 * The mutex is released by proto_seq_stop().
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}
4297 
/* seq_file ->next: advance from @v to the following entry of proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}
4302 
/* seq_file ->stop: release proto_list_mutex taken by proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}
4308 
/* Map a method pointer to the flag character shown in the listing: 'y' if set, 'n' if NULL. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
4313 static long sock_prot_memory_allocated(struct proto *proto)
4314 {
4315 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4316 }
4317 
4318 static const char *sock_prot_memory_pressure(struct proto *proto)
4319 {
4320 	return proto->memory_pressure != NULL ?
4321 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4322 }
4323 
/*
 * Print one line for @proto: name, object size, sockets in use in the
 * seq_file's network namespace, allocated memory, memory pressure, max
 * header, whether a slab cache exists, the owning module, and one y/n
 * flag per protocol method in the order of the column header printed by
 * proto_seq_show().
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   /* Eighteen method flags follow, matching the header columns. */
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
4356 
4357 static int proto_seq_show(struct seq_file *seq, void *v)
4358 {
4359 	if (v == &proto_list)
4360 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4361 			   "protocol",
4362 			   "size",
4363 			   "sockets",
4364 			   "memory",
4365 			   "press",
4366 			   "maxhdr",
4367 			   "slab",
4368 			   "module",
4369 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4370 	else
4371 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4372 	return 0;
4373 }
4374 
4375 static const struct seq_operations proto_seq_ops = {
4376 	.start  = proto_seq_start,
4377 	.next   = proto_seq_next,
4378 	.stop   = proto_seq_stop,
4379 	.show   = proto_seq_show,
4380 };
4381 
4382 static __net_init int proto_init_net(struct net *net)
4383 {
4384 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4385 			sizeof(struct seq_net_private)))
4386 		return -ENOMEM;
4387 
4388 	return 0;
4389 }
4390 
4391 static __net_exit void proto_exit_net(struct net *net)
4392 {
4393 	remove_proc_entry("protocols", net->proc_net);
4394 }
4395 
4396 
4397 static __net_initdata struct pernet_operations proto_net_ops = {
4398 	.init = proto_init_net,
4399 	.exit = proto_exit_net,
4400 };
4401 
4402 static int __init proto_init(void)
4403 {
4404 	return register_pernet_subsys(&proto_net_ops);
4405 }
4406 
4407 subsys_initcall(proto_init);
4408 
4409 #endif /* PROC_FS */
4410 
4411 #ifdef CONFIG_NET_RX_BUSY_POLL
4412 bool sk_busy_loop_end(void *p, unsigned long start_time)
4413 {
4414 	struct sock *sk = p;
4415 
4416 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4417 		return true;
4418 
4419 	if (sk_is_udp(sk) &&
4420 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4421 		return true;
4422 
4423 	return sk_busy_loop_timeout(sk, start_time);
4424 }
4425 EXPORT_SYMBOL(sk_busy_loop_end);
4426 #endif /* CONFIG_NET_RX_BUSY_POLL */
4427 
4428 int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len)
4429 {
4430 	if (!sk->sk_prot->bind_add)
4431 		return -EOPNOTSUPP;
4432 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4433 }
4434 EXPORT_SYMBOL(sock_bind_add);
4435 
4436 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4437 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4438 		     void __user *arg, void *karg, size_t size)
4439 {
4440 	int ret;
4441 
4442 	if (copy_from_user(karg, arg, size))
4443 		return -EFAULT;
4444 
4445 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4446 	if (ret)
4447 		return ret;
4448 
4449 	if (copy_to_user(arg, karg, size))
4450 		return -EFAULT;
4451 
4452 	return 0;
4453 }
4454 EXPORT_SYMBOL(sock_ioctl_inout);
4455 
4456 /* This is the most common ioctl prep function, where the result (4 bytes) is
4457  * copied back to userspace if the ioctl() returns successfully. No input is
4458  * copied from userspace as input argument.
4459  */
4460 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4461 {
4462 	int ret, karg = 0;
4463 
4464 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4465 	if (ret)
4466 		return ret;
4467 
4468 	return put_user(karg, (int __user *)arg);
4469 }
4470 
4471 /* A wrapper around sock ioctls, which copies the data from userspace
4472  * (depending on the protocol/ioctl), and copies back the result to userspace.
4473  * The main motivation for this function is to pass kernel memory to the
4474  * protocol ioctl callbacks, instead of userspace memory.
4475  */
4476 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4477 {
4478 	int rc = 1;
4479 
4480 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4481 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4482 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4483 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4484 	else if (sk_is_phonet(sk))
4485 		rc = phonet_sk_ioctl(sk, cmd, arg);
4486 
4487 	/* If ioctl was processed, returns its value */
4488 	if (rc <= 0)
4489 		return rc;
4490 
4491 	/* Otherwise call the default handler */
4492 	return sock_ioctl_out(sk, cmd, arg);
4493 }
4494 EXPORT_SYMBOL(sk_ioctl);
4495 
4496 static int __init sock_struct_check(void)
4497 {
4498 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4499 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4500 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4501 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4502 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4503 
4504 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4505 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4506 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4507 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4508 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4509 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4510 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4511 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4512 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4513 
4514 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4515 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4516 #ifdef CONFIG_MEMCG
4517 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4518 #endif
4519 
4520 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4521 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4522 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4523 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4524 
4525 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4526 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4527 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft);
4528 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4529 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4530 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4531 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4532 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4533 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4534 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4535 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4536 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4537 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4538 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4539 
4540 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm);
4541 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status);
4542 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4543 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4544 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4545 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4546 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid);
4547 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol);
4548 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4549 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4550 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4551 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4552 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4553 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4554 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf);
4555 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4556 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4557 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4558 	return 0;
4559 }
4560 
4561 core_initcall(sock_struct_check);
4562