xref: /linux/net/core/sock.c (revision 1bc80d673087e5704adbb3ee8e4b785c14899cce)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138 
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142 
143 #include <trace/events/sock.h>
144 
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148 
149 #include <linux/ethtool.h>
150 
151 #include <uapi/linux/pidfd.h>
152 
153 #include "dev.h"
154 
155 static DEFINE_MUTEX(proto_list_mutex);
156 static LIST_HEAD(proto_list);
157 
158 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc);
159 static void sock_def_write_space(struct sock *sk);
160 
161 /**
162  * sk_ns_capable - General socket capability test
163  * @sk: Socket to use a capability on or through
164  * @user_ns: The user namespace of the capability to use
165  * @cap: The capability to use
166  *
167  * Test to see if the opener of the socket had the capability @cap
168  * when the socket was created and that the current process has the
169  * capability @cap in the user namespace @user_ns.
170  */
171 bool sk_ns_capable(const struct sock *sk,
172 		   struct user_namespace *user_ns, int cap)
173 {
174 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
175 		ns_capable(user_ns, cap);
176 }
177 EXPORT_SYMBOL(sk_ns_capable);
178 
179 /**
180  * sk_capable - Socket global capability test
181  * @sk: Socket to use a capability on or through
182  * @cap: The global capability to use
183  *
184  * Test to see if the opener of the socket had the capability @cap
185  * when the socket was created and that the current process has the
186  * capability @cap in all user namespaces.
187  */
188 bool sk_capable(const struct sock *sk, int cap)
189 {
190 	return sk_ns_capable(sk, &init_user_ns, cap);
191 }
192 EXPORT_SYMBOL(sk_capable);
193 
194 /**
195  * sk_net_capable - Network namespace socket capability test
196  * @sk: Socket to use a capability on or through
197  * @cap: The capability to use
198  *
199  * Test to see if the opener of the socket had the capability @cap when
200  * the socket was created and that the current process has the capability
201  * @cap over the network namespace the socket is a member of.
202  */
203 bool sk_net_capable(const struct sock *sk, int cap)
204 {
205 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
206 }
207 EXPORT_SYMBOL(sk_net_capable);
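
/*
 * Editorial example (not part of the original file): a hedged sketch of how a
 * protocol handler might gate a privileged operation on the socket opener's
 * capabilities; the function name is hypothetical.
 *
 *	static int proto_set_privileged_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		// ... apply the option ...
 *		return 0;
 *	}
 *
 * sk_capable() runs the same test against &init_user_ns, while
 * sk_ns_capable() lets the caller name the user namespace explicitly.
 */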
208 
209 /*
210  * Each address family might have different locking rules, so we have
211  * one slock key per address family and separate keys for internal and
212  * userspace sockets.
213  */
214 static struct lock_class_key af_family_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_keys[AF_MAX];
216 static struct lock_class_key af_family_slock_keys[AF_MAX];
217 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
218 
219 /*
220  * Make lock validator output more readable. (We pre-construct these
221  * strings at build time, so that runtime initialization of socket
222  * locks is fast):
223  */
224 
225 #define _sock_locks(x)						  \
226   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
227   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
228   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
229   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
230   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
231   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
232   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
233   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
234   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
235   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
236   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
237   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
238   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
239   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
240   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
241   x "AF_MCTP"  , \
242   x "AF_MAX"
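
/*
 * For readability: with the "sk_lock-" prefix, the macro above expands to the
 * comma-separated string literals
 *
 *	"sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX", "sk_lock-AF_INET", ...,
 *	"sk_lock-AF_MCTP", "sk_lock-AF_MAX"
 *
 * which is how the lockdep class-name tables below are built at compile time.
 */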
243 
244 static const char *const af_family_key_strings[AF_MAX+1] = {
245 	_sock_locks("sk_lock-")
246 };
247 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
248 	_sock_locks("slock-")
249 };
250 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
251 	_sock_locks("clock-")
252 };
253 
254 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
255 	_sock_locks("k-sk_lock-")
256 };
257 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
258 	_sock_locks("k-slock-")
259 };
260 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
261 	_sock_locks("k-clock-")
262 };
263 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
264 	_sock_locks("rlock-")
265 };
266 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
267 	_sock_locks("wlock-")
268 };
269 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
270 	_sock_locks("elock-")
271 };
272 
273 /*
274  * sk_callback_lock and sk queues locking rules are per-address-family,
275  * so split the lock classes by using a per-AF key:
276  */
277 static struct lock_class_key af_callback_keys[AF_MAX];
278 static struct lock_class_key af_rlock_keys[AF_MAX];
279 static struct lock_class_key af_wlock_keys[AF_MAX];
280 static struct lock_class_key af_elock_keys[AF_MAX];
281 static struct lock_class_key af_kern_callback_keys[AF_MAX];
282 
283 /* Run time adjustable parameters. */
284 __u32 sysctl_wmem_max __read_mostly = 4 << 20;
285 EXPORT_SYMBOL(sysctl_wmem_max);
286 __u32 sysctl_rmem_max __read_mostly = 4 << 20;
287 EXPORT_SYMBOL(sysctl_rmem_max);
288 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT;
289 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT;
290 
291 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
292 EXPORT_SYMBOL_GPL(memalloc_socks_key);
293 
294 /**
295  * sk_set_memalloc - sets %SOCK_MEMALLOC
296  * @sk: socket to set it on
297  *
298  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
299  * It's the responsibility of the admin to adjust min_free_kbytes
300  * to meet the requirements.
301  */
302 void sk_set_memalloc(struct sock *sk)
303 {
304 	sock_set_flag(sk, SOCK_MEMALLOC);
305 	sk->sk_allocation |= __GFP_MEMALLOC;
306 	static_branch_inc(&memalloc_socks_key);
307 }
308 EXPORT_SYMBOL_GPL(sk_set_memalloc);
309 
310 void sk_clear_memalloc(struct sock *sk)
311 {
312 	sock_reset_flag(sk, SOCK_MEMALLOC);
313 	sk->sk_allocation &= ~__GFP_MEMALLOC;
314 	static_branch_dec(&memalloc_socks_key);
315 
316 	/*
317 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
318 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
319 	 * it has rmem allocations due to the last swapfile being deactivated
320 	 * but there is a risk that the socket is unusable due to exceeding
321 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
322 	 */
323 	sk_mem_reclaim(sk);
324 }
325 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
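
/*
 * Editorial sketch (not from this file): a transport that carries swap
 * traffic, e.g. a SUNRPC socket used for swap-over-NFS, is the intended
 * caller here, roughly:
 *
 *	sk_set_memalloc(xprt_sk);	// swap file activated; may use reserves
 *	...
 *	sk_clear_memalloc(xprt_sk);	// last swap file deactivated
 *
 * xprt_sk is a placeholder for the transport's struct sock.
 */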
326 
327 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
328 {
329 	int ret;
330 	unsigned int noreclaim_flag;
331 
332 	/* these should have been dropped before queueing */
333 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
334 
335 	noreclaim_flag = memalloc_noreclaim_save();
336 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
337 				 tcp_v6_do_rcv,
338 				 tcp_v4_do_rcv,
339 				 sk, skb);
340 	memalloc_noreclaim_restore(noreclaim_flag);
341 
342 	return ret;
343 }
344 EXPORT_SYMBOL(__sk_backlog_rcv);
345 
346 void sk_error_report(struct sock *sk)
347 {
348 	sk->sk_error_report(sk);
349 
350 	switch (sk->sk_family) {
351 	case AF_INET:
352 		fallthrough;
353 	case AF_INET6:
354 		trace_inet_sk_error_report(sk);
355 		break;
356 	default:
357 		break;
358 	}
359 }
360 EXPORT_SYMBOL(sk_error_report);
361 
362 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
363 {
364 	struct __kernel_sock_timeval tv;
365 
366 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
367 		tv.tv_sec = 0;
368 		tv.tv_usec = 0;
369 	} else {
370 		tv.tv_sec = timeo / HZ;
371 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
372 	}
373 
374 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
375 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
376 		*(struct old_timeval32 *)optval = tv32;
377 		return sizeof(tv32);
378 	}
379 
380 	if (old_timeval) {
381 		struct __kernel_old_timeval old_tv;
382 		old_tv.tv_sec = tv.tv_sec;
383 		old_tv.tv_usec = tv.tv_usec;
384 		*(struct __kernel_old_timeval *)optval = old_tv;
385 		return sizeof(old_tv);
386 	}
387 
388 	*(struct __kernel_sock_timeval *)optval = tv;
389 	return sizeof(tv);
390 }
391 EXPORT_SYMBOL(sock_get_timeout);
392 
393 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
394 			   sockptr_t optval, int optlen, bool old_timeval)
395 {
396 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
397 		struct old_timeval32 tv32;
398 
399 		if (optlen < sizeof(tv32))
400 			return -EINVAL;
401 
402 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
403 			return -EFAULT;
404 		tv->tv_sec = tv32.tv_sec;
405 		tv->tv_usec = tv32.tv_usec;
406 	} else if (old_timeval) {
407 		struct __kernel_old_timeval old_tv;
408 
409 		if (optlen < sizeof(old_tv))
410 			return -EINVAL;
411 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
412 			return -EFAULT;
413 		tv->tv_sec = old_tv.tv_sec;
414 		tv->tv_usec = old_tv.tv_usec;
415 	} else {
416 		if (optlen < sizeof(*tv))
417 			return -EINVAL;
418 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
419 			return -EFAULT;
420 	}
421 
422 	return 0;
423 }
424 EXPORT_SYMBOL(sock_copy_user_timeval);
425 
426 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
427 			    bool old_timeval)
428 {
429 	struct __kernel_sock_timeval tv;
430 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
431 	long val;
432 
433 	if (err)
434 		return err;
435 
436 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
437 		return -EDOM;
438 
439 	if (tv.tv_sec < 0) {
440 		static int warned __read_mostly;
441 
442 		WRITE_ONCE(*timeo_p, 0);
443 		if (warned < 10 && net_ratelimit()) {
444 			warned++;
445 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
446 				__func__, current->comm, task_pid_nr(current));
447 		}
448 		return 0;
449 	}
450 	val = MAX_SCHEDULE_TIMEOUT;
451 	if ((tv.tv_sec || tv.tv_usec) &&
452 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
453 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
454 						    USEC_PER_SEC / HZ);
455 	WRITE_ONCE(*timeo_p, val);
456 	return 0;
457 }
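
/*
 * Worked example (editorial, assuming HZ == 1000): an SO_RCVTIMEO_NEW value
 * of { .tv_sec = 2, .tv_usec = 500000 } is converted above to
 *
 *	val = 2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) = 2500 jiffies
 *
 * while { 0, 0 } maps to MAX_SCHEDULE_TIMEOUT, i.e. "wait forever".
 */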
458 
459 static bool sk_set_prio_allowed(const struct sock *sk, int val)
460 {
461 	return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
462 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
463 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
464 }
465 
466 static bool sock_needs_netstamp(const struct sock *sk)
467 {
468 	switch (sk->sk_family) {
469 	case AF_UNSPEC:
470 	case AF_UNIX:
471 		return false;
472 	default:
473 		return true;
474 	}
475 }
476 
477 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
478 {
479 	if (sk->sk_flags & flags) {
480 		sk->sk_flags &= ~flags;
481 		if (sock_needs_netstamp(sk) &&
482 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
483 			net_disable_timestamp();
484 	}
485 }
486 
487 
488 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
489 {
490 	unsigned long flags;
491 	struct sk_buff_head *list = &sk->sk_receive_queue;
492 
493 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
494 		sk_drops_inc(sk);
495 		trace_sock_rcvqueue_full(sk, skb);
496 		return -ENOMEM;
497 	}
498 
499 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
500 		sk_drops_inc(sk);
501 		return -ENOBUFS;
502 	}
503 
504 	skb->dev = NULL;
505 	skb_set_owner_r(skb, sk);
506 
507 	/* We escape from the RCU-protected region; make sure we don't leak
508 	 * a non-refcounted dst.
509 	 */
510 	skb_dst_force(skb);
511 
512 	spin_lock_irqsave(&list->lock, flags);
513 	sock_skb_set_dropcount(sk, skb);
514 	__skb_queue_tail(list, skb);
515 	spin_unlock_irqrestore(&list->lock, flags);
516 
517 	if (!sock_flag(sk, SOCK_DEAD))
518 		sk->sk_data_ready(sk);
519 	return 0;
520 }
521 EXPORT_SYMBOL(__sock_queue_rcv_skb);
522 
523 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
524 			      enum skb_drop_reason *reason)
525 {
526 	enum skb_drop_reason drop_reason;
527 	int err;
528 
529 	err = sk_filter_reason(sk, skb, &drop_reason);
530 	if (err)
531 		goto out;
532 
533 	err = __sock_queue_rcv_skb(sk, skb);
534 	switch (err) {
535 	case -ENOMEM:
536 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
537 		break;
538 	case -ENOBUFS:
539 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
540 		break;
541 	default:
542 		drop_reason = SKB_NOT_DROPPED_YET;
543 		break;
544 	}
545 out:
546 	if (reason)
547 		*reason = drop_reason;
548 	return err;
549 }
550 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
551 
552 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
553 		     const int nested, unsigned int trim_cap, bool refcounted)
554 {
555 	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
556 	int rc = NET_RX_SUCCESS;
557 	int err;
558 
559 	if (sk_filter_trim_cap(sk, skb, trim_cap, &reason))
560 		goto discard_and_relse;
561 
562 	skb->dev = NULL;
563 
564 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
565 		sk_drops_inc(sk);
566 		reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
567 		goto discard_and_relse;
568 	}
569 	if (nested)
570 		bh_lock_sock_nested(sk);
571 	else
572 		bh_lock_sock(sk);
573 	if (!sock_owned_by_user(sk)) {
574 		/*
575 		 * trylock + unlock semantics:
576 		 */
577 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
578 
579 		rc = sk_backlog_rcv(sk, skb);
580 
581 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
582 	} else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) {
583 		bh_unlock_sock(sk);
584 		if (err == -ENOMEM)
585 			reason = SKB_DROP_REASON_PFMEMALLOC;
586 		if (err == -ENOBUFS)
587 			reason = SKB_DROP_REASON_SOCKET_BACKLOG;
588 		sk_drops_inc(sk);
589 		goto discard_and_relse;
590 	}
591 
592 	bh_unlock_sock(sk);
593 out:
594 	if (refcounted)
595 		sock_put(sk);
596 	return rc;
597 discard_and_relse:
598 	sk_skb_reason_drop(sk, skb, reason);
599 	goto out;
600 }
601 EXPORT_SYMBOL(__sk_receive_skb);
602 
603 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
604 							  u32));
605 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
606 							   u32));
607 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
608 {
609 	struct dst_entry *dst = __sk_dst_get(sk);
610 
611 	if (dst && READ_ONCE(dst->obsolete) &&
612 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
613 			       dst, cookie) == NULL) {
614 		sk_tx_queue_clear(sk);
615 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
616 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
617 		dst_release(dst);
618 		return NULL;
619 	}
620 
621 	return dst;
622 }
623 EXPORT_SYMBOL(__sk_dst_check);
624 
625 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
626 {
627 	struct dst_entry *dst = sk_dst_get(sk);
628 
629 	if (dst && READ_ONCE(dst->obsolete) &&
630 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
631 			       dst, cookie) == NULL) {
632 		sk_dst_reset(sk);
633 		dst_release(dst);
634 		return NULL;
635 	}
636 
637 	return dst;
638 }
639 EXPORT_SYMBOL(sk_dst_check);
640 
641 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
642 {
643 	int ret = -ENOPROTOOPT;
644 #ifdef CONFIG_NETDEVICES
645 	struct net *net = sock_net(sk);
646 
647 	/* Sorry... */
648 	ret = -EPERM;
649 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
650 		goto out;
651 
652 	ret = -EINVAL;
653 	if (ifindex < 0)
654 		goto out;
655 
656 	/* Paired with all READ_ONCE() done locklessly. */
657 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
658 
659 	if (sk->sk_prot->rehash)
660 		sk->sk_prot->rehash(sk);
661 	sk_dst_reset(sk);
662 
663 	ret = 0;
664 
665 out:
666 #endif
667 
668 	return ret;
669 }
670 
671 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
672 {
673 	int ret;
674 
675 	if (lock_sk)
676 		lock_sock(sk);
677 	ret = sock_bindtoindex_locked(sk, ifindex);
678 	if (lock_sk)
679 		release_sock(sk);
680 
681 	return ret;
682 }
683 EXPORT_SYMBOL(sock_bindtoindex);
684 
685 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
686 {
687 	int ret = -ENOPROTOOPT;
688 #ifdef CONFIG_NETDEVICES
689 	struct net *net = sock_net(sk);
690 	char devname[IFNAMSIZ];
691 	int index;
692 
693 	ret = -EINVAL;
694 	if (optlen < 0)
695 		goto out;
696 
697 	/* Bind this socket to a particular device like "eth0",
698 	 * as specified in the passed interface name. If the
699 	 * name is "" or the option length is zero the socket
700 	 * is not bound.
701 	 */
702 	if (optlen > IFNAMSIZ - 1)
703 		optlen = IFNAMSIZ - 1;
704 	memset(devname, 0, sizeof(devname));
705 
706 	ret = -EFAULT;
707 	if (copy_from_sockptr(devname, optval, optlen))
708 		goto out;
709 
710 	index = 0;
711 	if (devname[0] != '\0') {
712 		struct net_device *dev;
713 
714 		rcu_read_lock();
715 		dev = dev_get_by_name_rcu(net, devname);
716 		if (dev)
717 			index = dev->ifindex;
718 		rcu_read_unlock();
719 		ret = -ENODEV;
720 		if (!dev)
721 			goto out;
722 	}
723 
724 	sockopt_lock_sock(sk);
725 	ret = sock_bindtoindex_locked(sk, index);
726 	sockopt_release_sock(sk);
727 out:
728 #endif
729 
730 	return ret;
731 }
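
/*
 * Editorial userspace usage sketch (the interface name "eth0" is only an
 * example):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0") + 1);
 *
 * An empty name (or zero option length) removes the binding; changing an
 * existing binding requires CAP_NET_RAW in the socket's network namespace,
 * as enforced in sock_bindtoindex_locked() above.
 */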
732 
733 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
734 				sockptr_t optlen, int len)
735 {
736 	int ret = -ENOPROTOOPT;
737 #ifdef CONFIG_NETDEVICES
738 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
739 	struct net *net = sock_net(sk);
740 	char devname[IFNAMSIZ];
741 
742 	if (bound_dev_if == 0) {
743 		len = 0;
744 		goto zero;
745 	}
746 
747 	ret = -EINVAL;
748 	if (len < IFNAMSIZ)
749 		goto out;
750 
751 	ret = netdev_get_name(net, devname, bound_dev_if);
752 	if (ret)
753 		goto out;
754 
755 	len = strlen(devname) + 1;
756 
757 	ret = -EFAULT;
758 	if (copy_to_sockptr(optval, devname, len))
759 		goto out;
760 
761 zero:
762 	ret = -EFAULT;
763 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
764 		goto out;
765 
766 	ret = 0;
767 
768 out:
769 #endif
770 
771 	return ret;
772 }
773 
774 bool sk_mc_loop(const struct sock *sk)
775 {
776 	if (dev_recursion_level())
777 		return false;
778 	if (!sk)
779 		return true;
780 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
781 	switch (READ_ONCE(sk->sk_family)) {
782 	case AF_INET:
783 		return inet_test_bit(MC_LOOP, sk);
784 #if IS_ENABLED(CONFIG_IPV6)
785 	case AF_INET6:
786 		return inet6_test_bit(MC6_LOOP, sk);
787 #endif
788 	}
789 	WARN_ON_ONCE(1);
790 	return true;
791 }
792 EXPORT_SYMBOL(sk_mc_loop);
793 
794 void sock_set_reuseaddr(struct sock *sk)
795 {
796 	lock_sock(sk);
797 	sk->sk_reuse = SK_CAN_REUSE;
798 	release_sock(sk);
799 }
800 EXPORT_SYMBOL(sock_set_reuseaddr);
801 
802 void sock_set_reuseport(struct sock *sk)
803 {
804 	lock_sock(sk);
805 	sk->sk_reuseport = true;
806 	release_sock(sk);
807 }
808 EXPORT_SYMBOL(sock_set_reuseport);
809 
810 void sock_no_linger(struct sock *sk)
811 {
812 	lock_sock(sk);
813 	WRITE_ONCE(sk->sk_lingertime, 0);
814 	sock_set_flag(sk, SOCK_LINGER);
815 	release_sock(sk);
816 }
817 EXPORT_SYMBOL(sock_no_linger);
818 
819 void sock_set_priority(struct sock *sk, u32 priority)
820 {
821 	WRITE_ONCE(sk->sk_priority, priority);
822 }
823 EXPORT_SYMBOL(sock_set_priority);
824 
825 void sock_set_sndtimeo(struct sock *sk, s64 secs)
826 {
827 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
828 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
829 	else
830 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
831 }
832 EXPORT_SYMBOL(sock_set_sndtimeo);
833 
834 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
835 {
836 	sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
837 	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
838 	if (val)  {
839 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
840 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
841 	}
842 }
843 
844 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
845 {
846 	switch (optname) {
847 	case SO_TIMESTAMP_OLD:
848 		__sock_set_timestamps(sk, valbool, false, false);
849 		break;
850 	case SO_TIMESTAMP_NEW:
851 		__sock_set_timestamps(sk, valbool, true, false);
852 		break;
853 	case SO_TIMESTAMPNS_OLD:
854 		__sock_set_timestamps(sk, valbool, false, true);
855 		break;
856 	case SO_TIMESTAMPNS_NEW:
857 		__sock_set_timestamps(sk, valbool, true, true);
858 		break;
859 	}
860 }
861 
862 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
863 {
864 	struct net *net = sock_net(sk);
865 	struct net_device *dev = NULL;
866 	bool match = false;
867 	int *vclock_index;
868 	int i, num;
869 
870 	if (sk->sk_bound_dev_if)
871 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
872 
873 	if (!dev) {
874 		pr_err("%s: socket is not bound to a device\n", __func__);
875 		return -EOPNOTSUPP;
876 	}
877 
878 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
879 	dev_put(dev);
880 
881 	for (i = 0; i < num; i++) {
882 		if (*(vclock_index + i) == phc_index) {
883 			match = true;
884 			break;
885 		}
886 	}
887 
888 	if (num > 0)
889 		kfree(vclock_index);
890 
891 	if (!match)
892 		return -EINVAL;
893 
894 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
895 
896 	return 0;
897 }
898 
899 int sock_set_timestamping(struct sock *sk, int optname,
900 			  struct so_timestamping timestamping)
901 {
902 	int val = timestamping.flags;
903 	int ret;
904 
905 	if (val & ~SOF_TIMESTAMPING_MASK)
906 		return -EINVAL;
907 
908 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
909 	    !(val & SOF_TIMESTAMPING_OPT_ID))
910 		return -EINVAL;
911 
912 	if (val & SOF_TIMESTAMPING_OPT_ID &&
913 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
914 		if (sk_is_tcp(sk)) {
915 			if ((1 << sk->sk_state) &
916 			    (TCPF_CLOSE | TCPF_LISTEN))
917 				return -EINVAL;
918 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
919 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
920 			else
921 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
922 		} else {
923 			atomic_set(&sk->sk_tskey, 0);
924 		}
925 	}
926 
927 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
928 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
929 		return -EINVAL;
930 
931 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
932 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
933 		if (ret)
934 			return ret;
935 	}
936 
937 	WRITE_ONCE(sk->sk_tsflags, val);
938 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
939 	sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));
940 
941 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
942 		sock_enable_timestamp(sk,
943 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
944 	else
945 		sock_disable_timestamp(sk,
946 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
947 	return 0;
948 }
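
/*
 * Editorial userspace usage sketch for the PHC-binding path handled above
 * (the vclock index 1 is only an example, and the socket must already be
 * bound to a device):
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_RX_HARDWARE |
 *			 SOF_TIMESTAMPING_RAW_HARDWARE |
 *			 SOF_TIMESTAMPING_BIND_PHC,
 *		.bind_phc = 1,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 */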
949 
950 #if defined(CONFIG_CGROUP_BPF)
951 void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
952 {
953 	struct bpf_sock_ops_kern sock_ops;
954 
955 	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
956 	sock_ops.op = op;
957 	sock_ops.is_fullsock = 1;
958 	sock_ops.sk = sk;
959 	bpf_skops_init_skb(&sock_ops, skb, 0);
960 	__cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
961 }
962 #endif
963 
964 void sock_set_keepalive(struct sock *sk)
965 {
966 	lock_sock(sk);
967 	if (sk->sk_prot->keepalive)
968 		sk->sk_prot->keepalive(sk, true);
969 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
970 	release_sock(sk);
971 }
972 EXPORT_SYMBOL(sock_set_keepalive);
973 
974 static void __sock_set_rcvbuf(struct sock *sk, int val)
975 {
976 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
977 	 * as a negative value.
978 	 */
979 	val = min_t(int, val, INT_MAX / 2);
980 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
981 
982 	/* We double it on the way in to account for "struct sk_buff" etc.
983 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
984 	 * will allow that much actual data to be received on that socket.
985 	 *
986 	 * Applications are unaware that "struct sk_buff" and other overheads
987 	 * allocate from the receive buffer during socket buffer allocation.
988 	 *
989 	 * And after considering the possible alternatives, returning the value
990 	 * we actually used in getsockopt is the most desirable behavior.
991 	 */
992 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
993 }
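
/*
 * Worked example (editorial): a userspace setsockopt(fd, SOL_SOCKET,
 * SO_RCVBUF, &(int){ 65536 }, sizeof(int)) reaches here with val = 65536
 * (after clamping to sysctl_rmem_max), so sk->sk_rcvbuf becomes 131072, and
 * that doubled value is what a later getsockopt(SO_RCVBUF) reports.
 */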
994 
995 void sock_set_rcvbuf(struct sock *sk, int val)
996 {
997 	lock_sock(sk);
998 	__sock_set_rcvbuf(sk, val);
999 	release_sock(sk);
1000 }
1001 EXPORT_SYMBOL(sock_set_rcvbuf);
1002 
1003 static void __sock_set_mark(struct sock *sk, u32 val)
1004 {
1005 	if (val != sk->sk_mark) {
1006 		WRITE_ONCE(sk->sk_mark, val);
1007 		sk_dst_reset(sk);
1008 	}
1009 }
1010 
1011 void sock_set_mark(struct sock *sk, u32 val)
1012 {
1013 	lock_sock(sk);
1014 	__sock_set_mark(sk, val);
1015 	release_sock(sk);
1016 }
1017 EXPORT_SYMBOL(sock_set_mark);
1018 
1019 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1020 {
1021 	/* Round down bytes to multiple of pages */
1022 	bytes = round_down(bytes, PAGE_SIZE);
1023 
1024 	WARN_ON(bytes > sk->sk_reserved_mem);
1025 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1026 	sk_mem_reclaim(sk);
1027 }
1028 
1029 static int sock_reserve_memory(struct sock *sk, int bytes)
1030 {
1031 	long allocated;
1032 	bool charged;
1033 	int pages;
1034 
1035 	if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk))
1036 		return -EOPNOTSUPP;
1037 
1038 	if (!bytes)
1039 		return 0;
1040 
1041 	pages = sk_mem_pages(bytes);
1042 
1043 	/* pre-charge to memcg */
1044 	charged = mem_cgroup_sk_charge(sk, pages,
1045 				       GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1046 	if (!charged)
1047 		return -ENOMEM;
1048 
1049 	if (sk->sk_bypass_prot_mem)
1050 		goto success;
1051 
1052 	/* pre-charge to forward_alloc */
1053 	sk_memory_allocated_add(sk, pages);
1054 	allocated = sk_memory_allocated(sk);
1055 
1056 	/* If the system goes into memory pressure with this
1057 	 * precharge, give up and return an error.
1058 	 */
1059 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1060 		sk_memory_allocated_sub(sk, pages);
1061 		mem_cgroup_sk_uncharge(sk, pages);
1062 		return -ENOMEM;
1063 	}
1064 
1065 success:
1066 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1067 
1068 	WRITE_ONCE(sk->sk_reserved_mem,
1069 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1070 
1071 	return 0;
1072 }
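
/*
 * Editorial usage sketch: SO_RESERVE_MEM pre-charges memory to the socket so
 * that later allocations up to the reserve do not have to go through the
 * global protocol limits, e.g.
 *
 *	int bytes = 1 << 20;	// request ~1 MiB, accounted in whole pages
 *	setsockopt(fd, SOL_SOCKET, SO_RESERVE_MEM, &bytes, sizeof(bytes));
 *
 * As checked above, it only works for memcg-accounted sockets with memory
 * accounting and returns -EOPNOTSUPP otherwise.
 */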
1073 
1074 #ifdef CONFIG_PAGE_POOL
1075 
1076 /* This is the maximum number of tokens and frags that the user can pass to
1077  * SO_DEVMEM_DONTNEED in one syscall. The limit bounds the amount of memory
1078  * the kernel allocates to copy these tokens and prevents looping over the
1079  * frags for too long.
1080  */
1081 #define MAX_DONTNEED_TOKENS 128
1082 #define MAX_DONTNEED_FRAGS 1024
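
/*
 * Editorial usage sketch: once an application is done with devmem frags it
 * received, it returns the tokens in batches, e.g.
 *
 *	struct dmabuf_token tok = {
 *		.token_start = first_token,	// as reported in the rx cmsg
 *		.token_count = nr_frags,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED, &tok, sizeof(tok));
 *
 * first_token and nr_frags are placeholders for values obtained at receive
 * time.
 */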
1083 
1084 static noinline_for_stack int
1085 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1086 {
1087 	unsigned int num_tokens, i, j, k, netmem_num = 0;
1088 	struct dmabuf_token *tokens;
1089 	int ret = 0, num_frags = 0;
1090 	netmem_ref netmems[16];
1091 
1092 	if (!sk_is_tcp(sk))
1093 		return -EBADF;
1094 
1095 	if (optlen % sizeof(*tokens) ||
1096 	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1097 		return -EINVAL;
1098 
1099 	num_tokens = optlen / sizeof(*tokens);
1100 	tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1101 	if (!tokens)
1102 		return -ENOMEM;
1103 
1104 	if (copy_from_sockptr(tokens, optval, optlen)) {
1105 		kvfree(tokens);
1106 		return -EFAULT;
1107 	}
1108 
1109 	xa_lock_bh(&sk->sk_user_frags);
1110 	for (i = 0; i < num_tokens; i++) {
1111 		for (j = 0; j < tokens[i].token_count; j++) {
1112 			if (++num_frags > MAX_DONTNEED_FRAGS)
1113 				goto frag_limit_reached;
1114 
1115 			netmem_ref netmem = (__force netmem_ref)__xa_erase(
1116 				&sk->sk_user_frags, tokens[i].token_start + j);
1117 
1118 			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1119 				continue;
1120 
1121 			netmems[netmem_num++] = netmem;
1122 			if (netmem_num == ARRAY_SIZE(netmems)) {
1123 				xa_unlock_bh(&sk->sk_user_frags);
1124 				for (k = 0; k < netmem_num; k++)
1125 					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1126 				netmem_num = 0;
1127 				xa_lock_bh(&sk->sk_user_frags);
1128 			}
1129 			ret++;
1130 		}
1131 	}
1132 
1133 frag_limit_reached:
1134 	xa_unlock_bh(&sk->sk_user_frags);
1135 	for (k = 0; k < netmem_num; k++)
1136 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1137 
1138 	kvfree(tokens);
1139 	return ret;
1140 }
1141 #endif
1142 
1143 void sockopt_lock_sock(struct sock *sk)
1144 {
1145 	/* When current->bpf_ctx is set, setsockopt() is being called from
1146 	 * a BPF program, which has already ensured that the sk lock is
1147 	 * acquired before calling setsockopt().
1148 	 */
1149 	if (has_current_bpf_ctx())
1150 		return;
1151 
1152 	lock_sock(sk);
1153 }
1154 EXPORT_SYMBOL(sockopt_lock_sock);
1155 
1156 void sockopt_release_sock(struct sock *sk)
1157 {
1158 	if (has_current_bpf_ctx())
1159 		return;
1160 
1161 	release_sock(sk);
1162 }
1163 EXPORT_SYMBOL(sockopt_release_sock);
1164 
1165 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1166 {
1167 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1168 }
1169 EXPORT_SYMBOL(sockopt_ns_capable);
1170 
1171 bool sockopt_capable(int cap)
1172 {
1173 	return has_current_bpf_ctx() || capable(cap);
1174 }
1175 EXPORT_SYMBOL(sockopt_capable);
1176 
1177 static int sockopt_validate_clockid(__kernel_clockid_t value)
1178 {
1179 	switch (value) {
1180 	case CLOCK_REALTIME:
1181 	case CLOCK_MONOTONIC:
1182 	case CLOCK_TAI:
1183 		return 0;
1184 	}
1185 	return -EINVAL;
1186 }
1187 
1188 /*
1189  *	This is meant for all protocols to use and covers goings on
1190  *	at the socket level. Everything here is generic.
1191  */
1192 
1193 int sk_setsockopt(struct sock *sk, int level, int optname,
1194 		  sockptr_t optval, unsigned int optlen)
1195 {
1196 	struct so_timestamping timestamping;
1197 	struct socket *sock = sk->sk_socket;
1198 	struct sock_txtime sk_txtime;
1199 	int val;
1200 	int valbool;
1201 	struct linger ling;
1202 	int ret = 0;
1203 
1204 	/*
1205 	 *	Options without arguments
1206 	 */
1207 
1208 	if (optname == SO_BINDTODEVICE)
1209 		return sock_setbindtodevice(sk, optval, optlen);
1210 
1211 	if (optlen < sizeof(int))
1212 		return -EINVAL;
1213 
1214 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1215 		return -EFAULT;
1216 
1217 	valbool = val ? 1 : 0;
1218 
1219 	/* handle options which do not require locking the socket. */
1220 	switch (optname) {
1221 	case SO_PRIORITY:
1222 		if (sk_set_prio_allowed(sk, val)) {
1223 			sock_set_priority(sk, val);
1224 			return 0;
1225 		}
1226 		return -EPERM;
1227 	case SO_TYPE:
1228 	case SO_PROTOCOL:
1229 	case SO_DOMAIN:
1230 	case SO_ERROR:
1231 		return -ENOPROTOOPT;
1232 #ifdef CONFIG_NET_RX_BUSY_POLL
1233 	case SO_BUSY_POLL:
1234 		if (val < 0)
1235 			return -EINVAL;
1236 		WRITE_ONCE(sk->sk_ll_usec, val);
1237 		return 0;
1238 	case SO_PREFER_BUSY_POLL:
1239 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1240 			return -EPERM;
1241 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1242 		return 0;
1243 	case SO_BUSY_POLL_BUDGET:
1244 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1245 		    !sockopt_capable(CAP_NET_ADMIN))
1246 			return -EPERM;
1247 		if (val < 0 || val > U16_MAX)
1248 			return -EINVAL;
1249 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1250 		return 0;
1251 #endif
1252 	case SO_MAX_PACING_RATE:
1253 		{
1254 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1255 		unsigned long pacing_rate;
1256 
1257 		if (sizeof(ulval) != sizeof(val) &&
1258 		    optlen >= sizeof(ulval) &&
1259 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1260 			return -EFAULT;
1261 		}
1262 		if (ulval != ~0UL)
1263 			cmpxchg(&sk->sk_pacing_status,
1264 				SK_PACING_NONE,
1265 				SK_PACING_NEEDED);
1266 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1267 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1268 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1269 		if (ulval < pacing_rate)
1270 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1271 		return 0;
1272 		}
1273 	case SO_TXREHASH:
1274 		if (!sk_is_tcp(sk))
1275 			return -EOPNOTSUPP;
1276 		if (val < -1 || val > 1)
1277 			return -EINVAL;
1278 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1279 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1280 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1281 		 * and sk_getsockopt().
1282 		 */
1283 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1284 		return 0;
1285 	case SO_PEEK_OFF:
1286 		{
1287 		int (*set_peek_off)(struct sock *sk, int val);
1288 
1289 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1290 		if (set_peek_off)
1291 			ret = set_peek_off(sk, val);
1292 		else
1293 			ret = -EOPNOTSUPP;
1294 		return ret;
1295 		}
1296 #ifdef CONFIG_PAGE_POOL
1297 	case SO_DEVMEM_DONTNEED:
1298 		return sock_devmem_dontneed(sk, optval, optlen);
1299 #endif
1300 	case SO_SNDTIMEO_OLD:
1301 	case SO_SNDTIMEO_NEW:
1302 		return sock_set_timeout(&sk->sk_sndtimeo, optval,
1303 					optlen, optname == SO_SNDTIMEO_OLD);
1304 	case SO_RCVTIMEO_OLD:
1305 	case SO_RCVTIMEO_NEW:
1306 		return sock_set_timeout(&sk->sk_rcvtimeo, optval,
1307 					optlen, optname == SO_RCVTIMEO_OLD);
1308 	}
1309 
1310 	sockopt_lock_sock(sk);
1311 
1312 	switch (optname) {
1313 	case SO_DEBUG:
1314 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1315 			ret = -EACCES;
1316 		else
1317 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1318 		break;
1319 	case SO_REUSEADDR:
1320 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1321 		break;
1322 	case SO_REUSEPORT:
1323 		if (valbool && !sk_is_inet(sk))
1324 			ret = -EOPNOTSUPP;
1325 		else
1326 			sk->sk_reuseport = valbool;
1327 		break;
1328 	case SO_DONTROUTE:
1329 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1330 		sk_dst_reset(sk);
1331 		break;
1332 	case SO_BROADCAST:
1333 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1334 		break;
1335 	case SO_SNDBUF:
1336 		/* Don't error on this; BSD doesn't, and if you think
1337 		 * about it this is right. Otherwise apps have to
1338 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1339 		 * are treated in BSD as hints.
1340 		 */
1341 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1342 set_sndbuf:
1343 		/* Ensure val * 2 fits into an int, to prevent max_t()
1344 		 * from treating it as a negative value.
1345 		 */
1346 		val = min_t(int, val, INT_MAX / 2);
1347 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1348 		WRITE_ONCE(sk->sk_sndbuf,
1349 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1350 		/* Wake up sending tasks if we upped the value. */
1351 		sk->sk_write_space(sk);
1352 		break;
1353 
1354 	case SO_SNDBUFFORCE:
1355 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1356 			ret = -EPERM;
1357 			break;
1358 		}
1359 
1360 		/* No negative values (to prevent underflow, as val will be
1361 		 * multiplied by 2).
1362 		 */
1363 		if (val < 0)
1364 			val = 0;
1365 		goto set_sndbuf;
1366 
1367 	case SO_RCVBUF:
1368 		/* Don't error on this; BSD doesn't, and if you think
1369 		 * about it this is right. Otherwise apps have to
1370 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1371 		 * are treated in BSD as hints.
1372 		 */
1373 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1374 		break;
1375 
1376 	case SO_RCVBUFFORCE:
1377 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1378 			ret = -EPERM;
1379 			break;
1380 		}
1381 
1382 		/* No negative values (to prevent underflow, as val will be
1383 		 * multiplied by 2).
1384 		 */
1385 		__sock_set_rcvbuf(sk, max(val, 0));
1386 		break;
1387 
1388 	case SO_KEEPALIVE:
1389 		if (sk->sk_prot->keepalive)
1390 			sk->sk_prot->keepalive(sk, valbool);
1391 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1392 		break;
1393 
1394 	case SO_OOBINLINE:
1395 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1396 		break;
1397 
1398 	case SO_NO_CHECK:
1399 		sk->sk_no_check_tx = valbool;
1400 		break;
1401 
1402 	case SO_LINGER:
1403 		if (optlen < sizeof(ling)) {
1404 			ret = -EINVAL;	/* 1003.1g */
1405 			break;
1406 		}
1407 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1408 			ret = -EFAULT;
1409 			break;
1410 		}
1411 		if (!ling.l_onoff) {
1412 			sock_reset_flag(sk, SOCK_LINGER);
1413 		} else {
1414 			unsigned long t_sec = ling.l_linger;
1415 
1416 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1417 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1418 			else
1419 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1420 			sock_set_flag(sk, SOCK_LINGER);
1421 		}
1422 		break;
1423 
1424 	case SO_BSDCOMPAT:
1425 		break;
1426 
1427 	case SO_TIMESTAMP_OLD:
1428 	case SO_TIMESTAMP_NEW:
1429 	case SO_TIMESTAMPNS_OLD:
1430 	case SO_TIMESTAMPNS_NEW:
1431 		sock_set_timestamp(sk, optname, valbool);
1432 		break;
1433 
1434 	case SO_TIMESTAMPING_NEW:
1435 	case SO_TIMESTAMPING_OLD:
1436 		if (optlen == sizeof(timestamping)) {
1437 			if (copy_from_sockptr(&timestamping, optval,
1438 					      sizeof(timestamping))) {
1439 				ret = -EFAULT;
1440 				break;
1441 			}
1442 		} else {
1443 			memset(&timestamping, 0, sizeof(timestamping));
1444 			timestamping.flags = val;
1445 		}
1446 		ret = sock_set_timestamping(sk, optname, timestamping);
1447 		break;
1448 
1449 	case SO_RCVLOWAT:
1450 		{
1451 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1452 
1453 		if (val < 0)
1454 			val = INT_MAX;
1455 		if (sock)
1456 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1457 		if (set_rcvlowat)
1458 			ret = set_rcvlowat(sk, val);
1459 		else
1460 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1461 		break;
1462 		}
1463 	case SO_ATTACH_FILTER: {
1464 		struct sock_fprog fprog;
1465 
1466 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1467 		if (!ret)
1468 			ret = sk_attach_filter(&fprog, sk);
1469 		break;
1470 	}
1471 	case SO_ATTACH_BPF:
1472 		ret = -EINVAL;
1473 		if (optlen == sizeof(u32)) {
1474 			u32 ufd;
1475 
1476 			ret = -EFAULT;
1477 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1478 				break;
1479 
1480 			ret = sk_attach_bpf(ufd, sk);
1481 		}
1482 		break;
1483 
1484 	case SO_ATTACH_REUSEPORT_CBPF: {
1485 		struct sock_fprog fprog;
1486 
1487 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1488 		if (!ret)
1489 			ret = sk_reuseport_attach_filter(&fprog, sk);
1490 		break;
1491 	}
1492 	case SO_ATTACH_REUSEPORT_EBPF:
1493 		ret = -EINVAL;
1494 		if (optlen == sizeof(u32)) {
1495 			u32 ufd;
1496 
1497 			ret = -EFAULT;
1498 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1499 				break;
1500 
1501 			ret = sk_reuseport_attach_bpf(ufd, sk);
1502 		}
1503 		break;
1504 
1505 	case SO_DETACH_REUSEPORT_BPF:
1506 		ret = reuseport_detach_prog(sk);
1507 		break;
1508 
1509 	case SO_DETACH_FILTER:
1510 		ret = sk_detach_filter(sk);
1511 		break;
1512 
1513 	case SO_LOCK_FILTER:
1514 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1515 			ret = -EPERM;
1516 		else
1517 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1518 		break;
1519 
1520 	case SO_MARK:
1521 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1522 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1523 			ret = -EPERM;
1524 			break;
1525 		}
1526 
1527 		__sock_set_mark(sk, val);
1528 		break;
1529 	case SO_RCVMARK:
1530 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1531 		break;
1532 
1533 	case SO_RCVPRIORITY:
1534 		sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
1535 		break;
1536 
1537 	case SO_RXQ_OVFL:
1538 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1539 		break;
1540 
1541 	case SO_WIFI_STATUS:
1542 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1543 		break;
1544 
1545 	case SO_NOFCS:
1546 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1547 		break;
1548 
1549 	case SO_SELECT_ERR_QUEUE:
1550 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1551 		break;
1552 
1553 	case SO_PASSCRED:
1554 		if (sk_may_scm_recv(sk))
1555 			sk->sk_scm_credentials = valbool;
1556 		else
1557 			ret = -EOPNOTSUPP;
1558 		break;
1559 
1560 	case SO_PASSSEC:
1561 		if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
1562 			sk->sk_scm_security = valbool;
1563 		else
1564 			ret = -EOPNOTSUPP;
1565 		break;
1566 
1567 	case SO_PASSPIDFD:
1568 		if (sk_is_unix(sk))
1569 			sk->sk_scm_pidfd = valbool;
1570 		else
1571 			ret = -EOPNOTSUPP;
1572 		break;
1573 
1574 	case SO_PASSRIGHTS:
1575 		if (sk_is_unix(sk))
1576 			sk->sk_scm_rights = valbool;
1577 		else
1578 			ret = -EOPNOTSUPP;
1579 		break;
1580 
1581 	case SO_INCOMING_CPU:
1582 		reuseport_update_incoming_cpu(sk, val);
1583 		break;
1584 
1585 	case SO_CNX_ADVICE:
1586 		if (val == 1)
1587 			dst_negative_advice(sk);
1588 		break;
1589 
1590 	case SO_ZEROCOPY:
1591 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1592 			if (!(sk_is_tcp(sk) ||
1593 			      (sk->sk_type == SOCK_DGRAM &&
1594 			       sk->sk_protocol == IPPROTO_UDP)))
1595 				ret = -EOPNOTSUPP;
1596 		} else if (sk->sk_family != PF_RDS) {
1597 			ret = -EOPNOTSUPP;
1598 		}
1599 		if (!ret) {
1600 			if (val < 0 || val > 1)
1601 				ret = -EINVAL;
1602 			else
1603 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1604 		}
1605 		break;
1606 
1607 	case SO_TXTIME:
1608 		if (optlen != sizeof(struct sock_txtime)) {
1609 			ret = -EINVAL;
1610 			break;
1611 		} else if (copy_from_sockptr(&sk_txtime, optval,
1612 			   sizeof(struct sock_txtime))) {
1613 			ret = -EFAULT;
1614 			break;
1615 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1616 			ret = -EINVAL;
1617 			break;
1618 		}
1619 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1620 		 * scheduler has enough safeguards.
1621 		 */
1622 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1623 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1624 			ret = -EPERM;
1625 			break;
1626 		}
1627 
1628 		ret = sockopt_validate_clockid(sk_txtime.clockid);
1629 		if (ret)
1630 			break;
1631 
1632 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1633 		sk->sk_clockid = sk_txtime.clockid;
1634 		sk->sk_txtime_deadline_mode =
1635 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1636 		sk->sk_txtime_report_errors =
1637 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1638 		break;
1639 
1640 	case SO_BINDTOIFINDEX:
1641 		ret = sock_bindtoindex_locked(sk, val);
1642 		break;
1643 
1644 	case SO_BUF_LOCK:
1645 		if (val & ~SOCK_BUF_LOCK_MASK) {
1646 			ret = -EINVAL;
1647 			break;
1648 		}
1649 		sk->sk_userlocks = val | (sk->sk_userlocks &
1650 					  ~SOCK_BUF_LOCK_MASK);
1651 		break;
1652 
1653 	case SO_RESERVE_MEM:
1654 	{
1655 		int delta;
1656 
1657 		if (val < 0) {
1658 			ret = -EINVAL;
1659 			break;
1660 		}
1661 
1662 		delta = val - sk->sk_reserved_mem;
1663 		if (delta < 0)
1664 			sock_release_reserved_memory(sk, -delta);
1665 		else
1666 			ret = sock_reserve_memory(sk, delta);
1667 		break;
1668 	}
1669 
1670 	default:
1671 		ret = -ENOPROTOOPT;
1672 		break;
1673 	}
1674 	sockopt_release_sock(sk);
1675 	return ret;
1676 }
1677 
1678 int sock_setsockopt(struct socket *sock, int level, int optname,
1679 		    sockptr_t optval, unsigned int optlen)
1680 {
1681 	return sk_setsockopt(sock->sk, level, optname,
1682 			     optval, optlen);
1683 }
1684 EXPORT_SYMBOL(sock_setsockopt);
1685 
1686 static const struct cred *sk_get_peer_cred(struct sock *sk)
1687 {
1688 	const struct cred *cred;
1689 
1690 	spin_lock(&sk->sk_peer_lock);
1691 	cred = get_cred(sk->sk_peer_cred);
1692 	spin_unlock(&sk->sk_peer_lock);
1693 
1694 	return cred;
1695 }
1696 
1697 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1698 			  struct ucred *ucred)
1699 {
1700 	ucred->pid = pid_vnr(pid);
1701 	ucred->uid = ucred->gid = -1;
1702 	if (cred) {
1703 		struct user_namespace *current_ns = current_user_ns();
1704 
1705 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1706 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1707 	}
1708 }
1709 
1710 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1711 {
1712 	struct user_namespace *user_ns = current_user_ns();
1713 	int i;
1714 
1715 	for (i = 0; i < src->ngroups; i++) {
1716 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1717 
1718 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1719 			return -EFAULT;
1720 	}
1721 
1722 	return 0;
1723 }
1724 
1725 int sk_getsockopt(struct sock *sk, int level, int optname,
1726 		  sockptr_t optval, sockptr_t optlen)
1727 {
1728 	struct socket *sock = sk->sk_socket;
1729 
1730 	union {
1731 		int val;
1732 		u64 val64;
1733 		unsigned long ulval;
1734 		struct linger ling;
1735 		struct old_timeval32 tm32;
1736 		struct __kernel_old_timeval tm;
1737 		struct  __kernel_sock_timeval stm;
1738 		struct sock_txtime txtime;
1739 		struct so_timestamping timestamping;
1740 	} v;
1741 
1742 	int lv = sizeof(int);
1743 	int len;
1744 
1745 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1746 		return -EFAULT;
1747 	if (len < 0)
1748 		return -EINVAL;
1749 
1750 	memset(&v, 0, sizeof(v));
1751 
1752 	switch (optname) {
1753 	case SO_DEBUG:
1754 		v.val = sock_flag(sk, SOCK_DBG);
1755 		break;
1756 
1757 	case SO_DONTROUTE:
1758 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1759 		break;
1760 
1761 	case SO_BROADCAST:
1762 		v.val = sock_flag(sk, SOCK_BROADCAST);
1763 		break;
1764 
1765 	case SO_SNDBUF:
1766 		v.val = READ_ONCE(sk->sk_sndbuf);
1767 		break;
1768 
1769 	case SO_RCVBUF:
1770 		v.val = READ_ONCE(sk->sk_rcvbuf);
1771 		break;
1772 
1773 	case SO_REUSEADDR:
1774 		v.val = sk->sk_reuse;
1775 		break;
1776 
1777 	case SO_REUSEPORT:
1778 		v.val = sk->sk_reuseport;
1779 		break;
1780 
1781 	case SO_KEEPALIVE:
1782 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1783 		break;
1784 
1785 	case SO_TYPE:
1786 		v.val = sk->sk_type;
1787 		break;
1788 
1789 	case SO_PROTOCOL:
1790 		v.val = sk->sk_protocol;
1791 		break;
1792 
1793 	case SO_DOMAIN:
1794 		v.val = sk->sk_family;
1795 		break;
1796 
1797 	case SO_ERROR:
1798 		v.val = -sock_error(sk);
1799 		if (v.val == 0)
1800 			v.val = xchg(&sk->sk_err_soft, 0);
1801 		break;
1802 
1803 	case SO_OOBINLINE:
1804 		v.val = sock_flag(sk, SOCK_URGINLINE);
1805 		break;
1806 
1807 	case SO_NO_CHECK:
1808 		v.val = sk->sk_no_check_tx;
1809 		break;
1810 
1811 	case SO_PRIORITY:
1812 		v.val = READ_ONCE(sk->sk_priority);
1813 		break;
1814 
1815 	case SO_LINGER:
1816 		lv		= sizeof(v.ling);
1817 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1818 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1819 		break;
1820 
1821 	case SO_BSDCOMPAT:
1822 		break;
1823 
1824 	case SO_TIMESTAMP_OLD:
1825 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1826 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1827 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1828 		break;
1829 
1830 	case SO_TIMESTAMPNS_OLD:
1831 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1832 		break;
1833 
1834 	case SO_TIMESTAMP_NEW:
1835 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1836 		break;
1837 
1838 	case SO_TIMESTAMPNS_NEW:
1839 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1840 		break;
1841 
1842 	case SO_TIMESTAMPING_OLD:
1843 	case SO_TIMESTAMPING_NEW:
1844 		lv = sizeof(v.timestamping);
1845 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1846 		 * returning the flags when they were set through the same option.
1847 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1848 		 */
1849 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1850 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1851 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1852 		}
1853 		break;
1854 
1855 	case SO_RCVTIMEO_OLD:
1856 	case SO_RCVTIMEO_NEW:
1857 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1858 				      SO_RCVTIMEO_OLD == optname);
1859 		break;
1860 
1861 	case SO_SNDTIMEO_OLD:
1862 	case SO_SNDTIMEO_NEW:
1863 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1864 				      SO_SNDTIMEO_OLD == optname);
1865 		break;
1866 
1867 	case SO_RCVLOWAT:
1868 		v.val = READ_ONCE(sk->sk_rcvlowat);
1869 		break;
1870 
1871 	case SO_SNDLOWAT:
1872 		v.val = 1;
1873 		break;
1874 
1875 	case SO_PASSCRED:
1876 		if (!sk_may_scm_recv(sk))
1877 			return -EOPNOTSUPP;
1878 
1879 		v.val = sk->sk_scm_credentials;
1880 		break;
1881 
1882 	case SO_PASSPIDFD:
1883 		if (!sk_is_unix(sk))
1884 			return -EOPNOTSUPP;
1885 
1886 		v.val = sk->sk_scm_pidfd;
1887 		break;
1888 
1889 	case SO_PASSRIGHTS:
1890 		if (!sk_is_unix(sk))
1891 			return -EOPNOTSUPP;
1892 
1893 		v.val = sk->sk_scm_rights;
1894 		break;
1895 
1896 	case SO_PEERCRED:
1897 	{
1898 		struct ucred peercred;
1899 		if (len > sizeof(peercred))
1900 			len = sizeof(peercred);
1901 
1902 		spin_lock(&sk->sk_peer_lock);
1903 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1904 		spin_unlock(&sk->sk_peer_lock);
1905 
1906 		if (copy_to_sockptr(optval, &peercred, len))
1907 			return -EFAULT;
1908 		goto lenout;
1909 	}
1910 
1911 	case SO_PEERPIDFD:
1912 	{
1913 		struct pid *peer_pid;
1914 		struct file *pidfd_file = NULL;
1915 		unsigned int flags = 0;
1916 		int pidfd;
1917 
1918 		if (len > sizeof(pidfd))
1919 			len = sizeof(pidfd);
1920 
1921 		spin_lock(&sk->sk_peer_lock);
1922 		peer_pid = get_pid(sk->sk_peer_pid);
1923 		spin_unlock(&sk->sk_peer_lock);
1924 
1925 		if (!peer_pid)
1926 			return -ENODATA;
1927 
1928 		/* The use of PIDFD_STALE requires stashing of struct pid
1929 		 * on pidfs with pidfs_register_pid(), and only AF_UNIX
1930 		 * sockets were prepared for this.
1931 		 */
1932 		if (sk->sk_family == AF_UNIX)
1933 			flags = PIDFD_STALE;
1934 
1935 		pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
1936 		put_pid(peer_pid);
1937 		if (pidfd < 0)
1938 			return pidfd;
1939 
1940 		if (copy_to_sockptr(optval, &pidfd, len) ||
1941 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1942 			put_unused_fd(pidfd);
1943 			fput(pidfd_file);
1944 
1945 			return -EFAULT;
1946 		}
1947 
1948 		fd_install(pidfd, pidfd_file);
1949 		return 0;
1950 	}
1951 
1952 	case SO_PEERGROUPS:
1953 	{
1954 		const struct cred *cred;
1955 		int ret, n;
1956 
1957 		cred = sk_get_peer_cred(sk);
1958 		if (!cred)
1959 			return -ENODATA;
1960 
1961 		n = cred->group_info->ngroups;
1962 		if (len < n * sizeof(gid_t)) {
1963 			len = n * sizeof(gid_t);
1964 			put_cred(cred);
1965 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1966 		}
1967 		len = n * sizeof(gid_t);
1968 
1969 		ret = groups_to_user(optval, cred->group_info);
1970 		put_cred(cred);
1971 		if (ret)
1972 			return ret;
1973 		goto lenout;
1974 	}
1975 
1976 	case SO_PEERNAME:
1977 	{
1978 		struct sockaddr_storage address;
1979 
1980 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1981 		if (lv < 0)
1982 			return -ENOTCONN;
1983 		if (lv < len)
1984 			return -EINVAL;
1985 		if (copy_to_sockptr(optval, &address, len))
1986 			return -EFAULT;
1987 		goto lenout;
1988 	}
1989 
1990 	/* Dubious BSD thing... Probably nobody even uses it, but
1991 	 * the UNIX standard wants it for whatever reason... -DaveM
1992 	 */
1993 	case SO_ACCEPTCONN:
1994 		v.val = sk->sk_state == TCP_LISTEN;
1995 		break;
1996 
1997 	case SO_PASSSEC:
1998 		if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
1999 			return -EOPNOTSUPP;
2000 
2001 		v.val = sk->sk_scm_security;
2002 		break;
2003 
2004 	case SO_PEERSEC:
2005 		return security_socket_getpeersec_stream(sock,
2006 							 optval, optlen, len);
2007 
2008 	case SO_MARK:
2009 		v.val = READ_ONCE(sk->sk_mark);
2010 		break;
2011 
2012 	case SO_RCVMARK:
2013 		v.val = sock_flag(sk, SOCK_RCVMARK);
2014 		break;
2015 
2016 	case SO_RCVPRIORITY:
2017 		v.val = sock_flag(sk, SOCK_RCVPRIORITY);
2018 		break;
2019 
2020 	case SO_RXQ_OVFL:
2021 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
2022 		break;
2023 
2024 	case SO_WIFI_STATUS:
2025 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
2026 		break;
2027 
2028 	case SO_PEEK_OFF:
2029 		if (!READ_ONCE(sock->ops)->set_peek_off)
2030 			return -EOPNOTSUPP;
2031 
2032 		v.val = READ_ONCE(sk->sk_peek_off);
2033 		break;
2034 	case SO_NOFCS:
2035 		v.val = sock_flag(sk, SOCK_NOFCS);
2036 		break;
2037 
2038 	case SO_BINDTODEVICE:
2039 		return sock_getbindtodevice(sk, optval, optlen, len);
2040 
2041 	case SO_GET_FILTER:
2042 		len = sk_get_filter(sk, optval, len);
2043 		if (len < 0)
2044 			return len;
2045 
2046 		goto lenout;
2047 
2048 	case SO_LOCK_FILTER:
2049 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
2050 		break;
2051 
2052 	case SO_BPF_EXTENSIONS:
2053 		v.val = bpf_tell_extensions();
2054 		break;
2055 
2056 	case SO_SELECT_ERR_QUEUE:
2057 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
2058 		break;
2059 
2060 #ifdef CONFIG_NET_RX_BUSY_POLL
2061 	case SO_BUSY_POLL:
2062 		v.val = READ_ONCE(sk->sk_ll_usec);
2063 		break;
2064 	case SO_PREFER_BUSY_POLL:
2065 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
2066 		break;
2067 #endif
2068 
2069 	case SO_MAX_PACING_RATE:
2070 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
2071 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2072 			lv = sizeof(v.ulval);
2073 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2074 		} else {
2075 			/* 32bit version */
2076 			v.val = min_t(unsigned long, ~0U,
2077 				      READ_ONCE(sk->sk_max_pacing_rate));
2078 		}
2079 		break;
2080 
2081 	case SO_INCOMING_CPU:
2082 		v.val = READ_ONCE(sk->sk_incoming_cpu);
2083 		break;
2084 
2085 	case SO_MEMINFO:
2086 	{
2087 		u32 meminfo[SK_MEMINFO_VARS];
2088 
2089 		sk_get_meminfo(sk, meminfo);
2090 
2091 		len = min_t(unsigned int, len, sizeof(meminfo));
2092 		if (copy_to_sockptr(optval, &meminfo, len))
2093 			return -EFAULT;
2094 
2095 		goto lenout;
2096 	}
2097 
2098 #ifdef CONFIG_NET_RX_BUSY_POLL
2099 	case SO_INCOMING_NAPI_ID:
2100 		v.val = READ_ONCE(sk->sk_napi_id);
2101 
2102 		/* aggregate non-NAPI IDs down to 0 */
2103 		if (!napi_id_valid(v.val))
2104 			v.val = 0;
2105 
2106 		break;
2107 #endif
2108 
2109 	case SO_COOKIE:
2110 		lv = sizeof(u64);
2111 		if (len < lv)
2112 			return -EINVAL;
2113 		v.val64 = sock_gen_cookie(sk);
2114 		break;
2115 
2116 	case SO_ZEROCOPY:
2117 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
2118 		break;
2119 
2120 	case SO_TXTIME:
2121 		lv = sizeof(v.txtime);
2122 		v.txtime.clockid = sk->sk_clockid;
2123 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2124 				  SOF_TXTIME_DEADLINE_MODE : 0;
2125 		v.txtime.flags |= sk->sk_txtime_report_errors ?
2126 				  SOF_TXTIME_REPORT_ERRORS : 0;
2127 		break;
2128 
2129 	case SO_BINDTOIFINDEX:
2130 		v.val = READ_ONCE(sk->sk_bound_dev_if);
2131 		break;
2132 
2133 	case SO_NETNS_COOKIE:
2134 		lv = sizeof(u64);
2135 		if (len != lv)
2136 			return -EINVAL;
2137 		v.val64 = sock_net(sk)->net_cookie;
2138 		break;
2139 
2140 	case SO_BUF_LOCK:
2141 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2142 		break;
2143 
2144 	case SO_RESERVE_MEM:
2145 		v.val = READ_ONCE(sk->sk_reserved_mem);
2146 		break;
2147 
2148 	case SO_TXREHASH:
2149 		if (!sk_is_tcp(sk))
2150 			return -EOPNOTSUPP;
2151 
2152 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2153 		v.val = READ_ONCE(sk->sk_txrehash);
2154 		break;
2155 
2156 	default:
2157 		/* We implement the SO_SNDLOWAT etc to not be settable
2158 		 * (1003.1g 7).
2159 		 */
2160 		return -ENOPROTOOPT;
2161 	}
2162 
2163 	if (len > lv)
2164 		len = lv;
2165 	if (copy_to_sockptr(optval, &v, len))
2166 		return -EFAULT;
2167 lenout:
2168 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2169 		return -EFAULT;
2170 	return 0;
2171 }
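
/* Illustrative sketch, not part of the kernel build: the copy-out logic
 * above truncates the returned value to min(len, lv) and writes the
 * resulting length back through @optlen.  From user space this looks
 * roughly like ("fd" is a hypothetical connected socket):
 *
 *	int mark;
 *	socklen_t len = sizeof(mark);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_MARK, &mark, &len) == 0)
 *		printf("SO_MARK=%d, len=%u\n", mark, (unsigned int)len);
 *
 * Options with a fixed width, such as SO_COOKIE, additionally require
 * len to be at least sizeof(u64), see the checks above.
 */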
2172 
2173 /*
2174  * Initialize an sk_lock.
2175  *
2176  * (We also register the sk_lock with the lock validator.)
2177  */
2178 static inline void sock_lock_init(struct sock *sk)
2179 {
2180 	sk_owner_clear(sk);
2181 
2182 	if (sk->sk_kern_sock)
2183 		sock_lock_init_class_and_name(
2184 			sk,
2185 			af_family_kern_slock_key_strings[sk->sk_family],
2186 			af_family_kern_slock_keys + sk->sk_family,
2187 			af_family_kern_key_strings[sk->sk_family],
2188 			af_family_kern_keys + sk->sk_family);
2189 	else
2190 		sock_lock_init_class_and_name(
2191 			sk,
2192 			af_family_slock_key_strings[sk->sk_family],
2193 			af_family_slock_keys + sk->sk_family,
2194 			af_family_key_strings[sk->sk_family],
2195 			af_family_keys + sk->sk_family);
2196 }
2197 
2198 /*
2199  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2200  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2201  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
2202  */
2203 static void sock_copy(struct sock *nsk, const struct sock *osk)
2204 {
2205 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2206 #ifdef CONFIG_SECURITY_NETWORK
2207 	void *sptr = nsk->sk_security;
2208 #endif
2209 
2210 	/* If we move sk_tx_queue_mapping out of the private section,
2211 	 * we must check if sk_tx_queue_clear() is called after
2212 	 * sock_copy() in sk_clone_lock().
2213 	 */
2214 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2215 		     offsetof(struct sock, sk_dontcopy_begin) ||
2216 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2217 		     offsetof(struct sock, sk_dontcopy_end));
2218 
2219 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2220 
2221 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2222 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2223 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2224 
2225 #ifdef CONFIG_SECURITY_NETWORK
2226 	nsk->sk_security = sptr;
2227 	security_sk_clone(osk, nsk);
2228 #endif
2229 }
2230 
2231 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2232 		int family)
2233 {
2234 	struct sock *sk;
2235 	struct kmem_cache *slab;
2236 
2237 	slab = prot->slab;
2238 	if (slab != NULL) {
2239 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2240 		if (!sk)
2241 			return sk;
2242 		if (want_init_on_alloc(priority))
2243 			sk_prot_clear_nulls(sk, prot->obj_size);
2244 	} else
2245 		sk = kmalloc(prot->obj_size, priority);
2246 
2247 	if (sk != NULL) {
2248 		if (security_sk_alloc(sk, family, priority))
2249 			goto out_free;
2250 
2251 		if (!try_module_get(prot->owner))
2252 			goto out_free_sec;
2253 	}
2254 
2255 	return sk;
2256 
2257 out_free_sec:
2258 	security_sk_free(sk);
2259 out_free:
2260 	if (slab != NULL)
2261 		kmem_cache_free(slab, sk);
2262 	else
2263 		kfree(sk);
2264 	return NULL;
2265 }
2266 
2267 static void sk_prot_free(struct proto *prot, struct sock *sk)
2268 {
2269 	struct kmem_cache *slab;
2270 	struct module *owner;
2271 
2272 	owner = prot->owner;
2273 	slab = prot->slab;
2274 
2275 	cgroup_sk_free(&sk->sk_cgrp_data);
2276 	mem_cgroup_sk_free(sk);
2277 	security_sk_free(sk);
2278 
2279 	sk_owner_put(sk);
2280 
2281 	if (slab != NULL)
2282 		kmem_cache_free(slab, sk);
2283 	else
2284 		kfree(sk);
2285 	module_put(owner);
2286 }
2287 
2288 /**
2289  *	sk_alloc - All socket objects are allocated here
2290  *	@net: the applicable net namespace
2291  *	@family: protocol family
2292  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2293  *	@prot: struct proto associated with this new sock instance
2294  *	@kern: is this to be a kernel socket?
2295  */
2296 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2297 		      struct proto *prot, int kern)
2298 {
2299 	struct sock *sk;
2300 
2301 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2302 	if (sk) {
2303 		sk->sk_family = family;
2304 		/*
2305 		 * See comment in struct sock definition to understand
2306 		 * why we need sk_prot_creator -acme
2307 		 */
2308 		sk->sk_prot = sk->sk_prot_creator = prot;
2309 
2310 		if (READ_ONCE(net->core.sysctl_bypass_prot_mem))
2311 			sk->sk_bypass_prot_mem = 1;
2312 
2313 		sk->sk_kern_sock = kern;
2314 		sock_lock_init(sk);
2315 
2316 		sk->sk_net_refcnt = kern ? 0 : 1;
2317 		if (likely(sk->sk_net_refcnt)) {
2318 			get_net_track(net, &sk->ns_tracker, priority);
2319 			sock_inuse_add(net, 1);
2320 		} else {
2321 			net_passive_inc(net);
2322 			__netns_tracker_alloc(net, &sk->ns_tracker,
2323 					      false, priority);
2324 		}
2325 
2326 		sock_net_set(sk, net);
2327 		refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
2328 
2329 		mem_cgroup_sk_alloc(sk);
2330 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2331 		sock_update_classid(&sk->sk_cgrp_data);
2332 		sock_update_netprioidx(&sk->sk_cgrp_data);
2333 		sk_tx_queue_clear(sk);
2334 	}
2335 
2336 	return sk;
2337 }
2338 EXPORT_SYMBOL(sk_alloc);
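
/* Illustrative sketch of a typical caller, not taken from a specific
 * protocol: a .create() handler pairs sk_alloc() with sock_init_data()
 * and uses sk_free() on its error path ("example_proto" is a
 * hypothetical struct proto used only for the example):
 *
 *	struct sock *sk;
 *
 *	sk = sk_alloc(net, family, GFP_KERNEL, &example_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	...
 *	sk_free(sk);	(on a later error, before the sock is hashed)
 */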
2339 
2340 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2341  * grace period. This is the case for UDP sockets and TCP listeners.
2342  */
2343 static void __sk_destruct(struct rcu_head *head)
2344 {
2345 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2346 	struct net *net = sock_net(sk);
2347 	struct sk_filter *filter;
2348 
2349 	if (sk->sk_destruct)
2350 		sk->sk_destruct(sk);
2351 
2352 	filter = rcu_dereference_check(sk->sk_filter,
2353 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2354 	if (filter) {
2355 		sk_filter_uncharge(sk, filter);
2356 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2357 	}
2358 
2359 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2360 
2361 #ifdef CONFIG_BPF_SYSCALL
2362 	bpf_sk_storage_free(sk);
2363 #endif
2364 
2365 	if (atomic_read(&sk->sk_omem_alloc))
2366 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2367 			 __func__, atomic_read(&sk->sk_omem_alloc));
2368 
2369 	if (sk->sk_frag.page) {
2370 		put_page(sk->sk_frag.page);
2371 		sk->sk_frag.page = NULL;
2372 	}
2373 
2374 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2375 	put_cred(sk->sk_peer_cred);
2376 	put_pid(sk->sk_peer_pid);
2377 
2378 	if (likely(sk->sk_net_refcnt)) {
2379 		put_net_track(net, &sk->ns_tracker);
2380 	} else {
2381 		__netns_tracker_free(net, &sk->ns_tracker, false);
2382 		net_passive_dec(net);
2383 	}
2384 	sk_prot_free(sk->sk_prot_creator, sk);
2385 }
2386 
2387 void sk_net_refcnt_upgrade(struct sock *sk)
2388 {
2389 	struct net *net = sock_net(sk);
2390 
2391 	WARN_ON_ONCE(sk->sk_net_refcnt);
2392 	__netns_tracker_free(net, &sk->ns_tracker, false);
2393 	net_passive_dec(net);
2394 	sk->sk_net_refcnt = 1;
2395 	get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
2396 	sock_inuse_add(net, 1);
2397 }
2398 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
2399 
2400 void sk_destruct(struct sock *sk)
2401 {
2402 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2403 
2404 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2405 		reuseport_detach_sock(sk);
2406 		use_call_rcu = true;
2407 	}
2408 
2409 	if (use_call_rcu)
2410 		call_rcu(&sk->sk_rcu, __sk_destruct);
2411 	else
2412 		__sk_destruct(&sk->sk_rcu);
2413 }
2414 
2415 static void __sk_free(struct sock *sk)
2416 {
2417 	if (likely(sk->sk_net_refcnt))
2418 		sock_inuse_add(sock_net(sk), -1);
2419 
2420 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2421 		sock_diag_broadcast_destroy(sk);
2422 	else
2423 		sk_destruct(sk);
2424 }
2425 
2426 void sk_free(struct sock *sk)
2427 {
2428 	/*
2429 	 * We subtract one from sk_wmem_alloc; if the result is not zero,
2430 	 * some packets are still in a tx queue and sock_wfree()
2431 	 * will call __sk_free(sk) later, once they are freed.
2432 	 */
2433 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2434 		__sk_free(sk);
2435 }
2436 EXPORT_SYMBOL(sk_free);
2437 
2438 static void sk_init_common(struct sock *sk)
2439 {
2440 	skb_queue_head_init(&sk->sk_receive_queue);
2441 	skb_queue_head_init(&sk->sk_write_queue);
2442 	skb_queue_head_init(&sk->sk_error_queue);
2443 
2444 	rwlock_init(&sk->sk_callback_lock);
2445 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2446 			af_rlock_keys + sk->sk_family,
2447 			af_family_rlock_key_strings[sk->sk_family]);
2448 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2449 			af_wlock_keys + sk->sk_family,
2450 			af_family_wlock_key_strings[sk->sk_family]);
2451 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2452 			af_elock_keys + sk->sk_family,
2453 			af_family_elock_key_strings[sk->sk_family]);
2454 	if (sk->sk_kern_sock)
2455 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2456 			af_kern_callback_keys + sk->sk_family,
2457 			af_family_kern_clock_key_strings[sk->sk_family]);
2458 	else
2459 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2460 			af_callback_keys + sk->sk_family,
2461 			af_family_clock_key_strings[sk->sk_family]);
2462 }
2463 
2464 /**
2465  *	sk_clone_lock - clone a socket, and lock its clone
2466  *	@sk: the socket to clone
2467  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2468  *
2469  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2470  */
2471 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2472 {
2473 	struct proto *prot = READ_ONCE(sk->sk_prot);
2474 	struct sk_filter *filter;
2475 	bool is_charged = true;
2476 	struct sock *newsk;
2477 
2478 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2479 	if (!newsk)
2480 		goto out;
2481 
2482 	sock_copy(newsk, sk);
2483 
2484 	newsk->sk_prot_creator = prot;
2485 
2486 	/* SANITY */
2487 	if (likely(newsk->sk_net_refcnt)) {
2488 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2489 		sock_inuse_add(sock_net(newsk), 1);
2490 	} else {
2491 		/* Kernel sockets are not elevating the struct net refcount.
2492 		 * Instead, use a tracker to more easily detect if a layer
2493 		 * is not properly dismantling its kernel sockets at netns
2494 		 * destroy time.
2495 		 */
2496 		net_passive_inc(sock_net(newsk));
2497 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2498 				      false, priority);
2499 	}
2500 	sk_node_init(&newsk->sk_node);
2501 	sock_lock_init(newsk);
2502 	bh_lock_sock(newsk);
2503 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2504 	newsk->sk_backlog.len = 0;
2505 
2506 	atomic_set(&newsk->sk_rmem_alloc, 0);
2507 
2508 	refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
2509 
2510 	atomic_set(&newsk->sk_omem_alloc, 0);
2511 	sk_init_common(newsk);
2512 
2513 	newsk->sk_dst_cache	= NULL;
2514 	newsk->sk_dst_pending_confirm = 0;
2515 	newsk->sk_wmem_queued	= 0;
2516 	newsk->sk_forward_alloc = 0;
2517 	newsk->sk_reserved_mem  = 0;
2518 	DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters);
2519 	sk_drops_reset(newsk);
2520 	newsk->sk_send_head	= NULL;
2521 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2522 	atomic_set(&newsk->sk_zckey, 0);
2523 
2524 	sock_reset_flag(newsk, SOCK_DONE);
2525 
2526 #ifdef CONFIG_MEMCG
2527 	/* sk->sk_memcg will be populated at accept() time */
2528 	newsk->sk_memcg = NULL;
2529 #endif
2530 
2531 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2532 
2533 	rcu_read_lock();
2534 	filter = rcu_dereference(sk->sk_filter);
2535 	if (filter != NULL)
2536 		/* Though it's an empty new sock, the charging may fail
2537 		 * if sysctl_optmem_max was changed between creation of the
2538 		 * original socket and cloning.
2539 		 */
2540 		is_charged = sk_filter_charge(newsk, filter);
2541 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2542 	rcu_read_unlock();
2543 
2544 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2545 		/* We need to make sure that we don't uncharge the new
2546 		 * socket if we couldn't charge it in the first place
2547 		 * as otherwise we uncharge the parent's filter.
2548 		 */
2549 		if (!is_charged)
2550 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2551 
2552 		goto free;
2553 	}
2554 
2555 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2556 
2557 	if (bpf_sk_storage_clone(sk, newsk))
2558 		goto free;
2559 
2560 	/* Clear sk_user_data if parent had the pointer tagged
2561 	 * as not suitable for copying when cloning.
2562 	 */
2563 	if (sk_user_data_is_nocopy(newsk))
2564 		newsk->sk_user_data = NULL;
2565 
2566 	newsk->sk_err	   = 0;
2567 	newsk->sk_err_soft = 0;
2568 	newsk->sk_priority = 0;
2569 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2570 
2571 	/* Before updating sk_refcnt, we must commit prior changes to memory
2572 	 * (Documentation/RCU/rculist_nulls.rst for details)
2573 	 */
2574 	smp_wmb();
2575 	refcount_set(&newsk->sk_refcnt, 2);
2576 
2577 	sk_set_socket(newsk, NULL);
2578 	sk_tx_queue_clear(newsk);
2579 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2580 
2581 	if (newsk->sk_prot->sockets_allocated)
2582 		sk_sockets_allocated_inc(newsk);
2583 
2584 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2585 		net_enable_timestamp();
2586 out:
2587 	return newsk;
2588 free:
2589 	/* It is still a raw copy of the parent, so invalidate
2590 	 * the destructor and do a plain sk_free().
2591 	 */
2592 	newsk->sk_destruct = NULL;
2593 	bh_unlock_sock(newsk);
2594 	sk_free(newsk);
2595 	newsk = NULL;
2596 	goto out;
2597 }
2598 EXPORT_SYMBOL_GPL(sk_clone_lock);
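
/* Illustrative sketch (hypothetical caller): sk_clone_lock() returns the
 * clone with its bh spinlock held, so the caller has to drop it with
 * bh_unlock_sock() once the protocol specific setup is done:
 *
 *	struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *
 *	if (newsk) {
 *		... protocol specific initialisation of newsk ...
 *		bh_unlock_sock(newsk);
 *	}
 */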
2599 
2600 static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev)
2601 {
2602 	bool is_ipv6 = false;
2603 	u32 max_size;
2604 
2605 #if IS_ENABLED(CONFIG_IPV6)
2606 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2607 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2608 #endif
2609 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2610 	max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) :
2611 			READ_ONCE(dev->gso_ipv4_max_size);
2612 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2613 		max_size = GSO_LEGACY_MAX_SIZE;
2614 
2615 	return max_size - (MAX_TCP_HEADER + 1);
2616 }
2617 
2618 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2619 {
2620 	const struct net_device *dev;
2621 	u32 max_segs = 1;
2622 
2623 	rcu_read_lock();
2624 	dev = dst_dev_rcu(dst);
2625 	sk->sk_route_caps = dev->features;
2626 	if (sk_is_tcp(sk)) {
2627 		struct inet_connection_sock *icsk = inet_csk(sk);
2628 
2629 		sk->sk_route_caps |= NETIF_F_GSO;
2630 		icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
2631 	}
2632 	if (sk->sk_route_caps & NETIF_F_GSO)
2633 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2634 	if (unlikely(sk->sk_gso_disabled))
2635 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2636 	if (sk_can_gso(sk)) {
2637 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2638 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2639 		} else {
2640 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2641 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev);
2642 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2643 			max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1);
2644 		}
2645 	}
2646 	sk->sk_gso_max_segs = max_segs;
2647 	sk_dst_set(sk, dst);
2648 	rcu_read_unlock();
2649 }
2650 EXPORT_SYMBOL_GPL(sk_setup_caps);
2651 
2652 /*
2653  *	Simple resource managers for sockets.
2654  */
2655 
2656 
2657 /*
2658  * Write buffer destructor automatically called from kfree_skb.
2659  */
2660 void sock_wfree(struct sk_buff *skb)
2661 {
2662 	unsigned int len = skb->truesize;
2663 	struct sock *sk = skb->sk;
2664 	bool free;
2665 	int old;
2666 
2667 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2668 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2669 		    sk->sk_write_space == sock_def_write_space) {
2670 			rcu_read_lock();
2671 			free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc,
2672 						       &old);
2673 			sock_def_write_space_wfree(sk, old - len);
2674 			rcu_read_unlock();
2675 			if (unlikely(free))
2676 				__sk_free(sk);
2677 			return;
2678 		}
2679 
2680 		/*
2681 		 * Keep a reference on sk_wmem_alloc; it will be released
2682 		 * after the sk_write_space() call.
2683 		 */
2684 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2685 		sk->sk_write_space(sk);
2686 		len = 1;
2687 	}
2688 	/*
2689 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2690 	 * could not do because of in-flight packets
2691 	 */
2692 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2693 		__sk_free(sk);
2694 }
2695 EXPORT_SYMBOL(sock_wfree);
2696 
2697 /* This variant of sock_wfree() is used by TCP,
2698  * since it sets SOCK_USE_WRITE_QUEUE.
2699  */
2700 void __sock_wfree(struct sk_buff *skb)
2701 {
2702 	struct sock *sk = skb->sk;
2703 
2704 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2705 		__sk_free(sk);
2706 }
2707 
2708 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2709 {
2710 	int old_wmem;
2711 
2712 	skb_orphan(skb);
2713 #ifdef CONFIG_INET
2714 	if (unlikely(!sk_fullsock(sk)))
2715 		return skb_set_owner_edemux(skb, sk);
2716 #endif
2717 	skb->sk = sk;
2718 	skb->destructor = sock_wfree;
2719 	skb_set_hash_from_sk(skb, sk);
2720 	/*
2721 	 * We used to take a refcount on sk, but the following operation
2722 	 * is enough to guarantee that sk_free() won't free this sock until
2723 	 * all in-flight packets have completed.
2724 	 */
2725 	__refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem);
2726 
2727 	/* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket
2728 	 * is in a host queue (qdisc, NIC queue).
2729 	 * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue
2730 	 * based on XPS for better performance.
2731 	 * Otherwise clear ooo_okay to not risk Out Of Order delivery.
2732 	 */
2733 	skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS);
2734 }
2735 EXPORT_SYMBOL(skb_set_owner_w);
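
/* Illustrative sketch: a transmit path that builds its own skb can
 * attribute it to a socket with skb_set_owner_w(), which charges
 * skb->truesize to sk_wmem_alloc and installs sock_wfree() as the
 * destructor; sock_wmalloc() below wraps the same pattern behind an
 * sk_sndbuf check ("len" is a placeholder size):
 *
 *	struct sk_buff *skb = alloc_skb(len, GFP_KERNEL);
 *
 *	if (skb) {
 *		skb_set_owner_w(skb, sk);
 *		...
 *		kfree_skb(skb);	(sock_wfree() uncharges the socket)
 *	}
 */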
2736 
2737 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2738 {
2739 	/* Drivers depend on in-order delivery for crypto offload;
2740 	 * a partial orphan breaks the out-of-order-OK logic.
2741 	 */
2742 	if (skb_is_decrypted(skb))
2743 		return false;
2744 
2745 	return (skb->destructor == sock_wfree ||
2746 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2747 }
2748 
2749 /* This helper is used by netem, as it can hold packets in its
2750  * delay queue. We want to allow the owner socket to send more
2751  * packets, as if they were already TX completed by a typical driver.
2752  * But we also want to keep skb->sk set because some packet schedulers
2753  * rely on it (sch_fq for example).
2754  */
2755 void skb_orphan_partial(struct sk_buff *skb)
2756 {
2757 	if (skb_is_tcp_pure_ack(skb))
2758 		return;
2759 
2760 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2761 		return;
2762 
2763 	skb_orphan(skb);
2764 }
2765 EXPORT_SYMBOL(skb_orphan_partial);
2766 
2767 /*
2768  * Read buffer destructor automatically called from kfree_skb.
2769  */
2770 void sock_rfree(struct sk_buff *skb)
2771 {
2772 	struct sock *sk = skb->sk;
2773 	unsigned int len = skb->truesize;
2774 
2775 	atomic_sub(len, &sk->sk_rmem_alloc);
2776 	sk_mem_uncharge(sk, len);
2777 }
2778 EXPORT_SYMBOL(sock_rfree);
2779 
2780 /*
2781  * Buffer destructor for skbs that are not used directly in read or write
2782  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2783  */
2784 void sock_efree(struct sk_buff *skb)
2785 {
2786 	sock_put(skb->sk);
2787 }
2788 EXPORT_SYMBOL(sock_efree);
2789 
2790 /* Buffer destructor for prefetch/receive path where reference count may
2791  * not be held, e.g. for listen sockets.
2792  */
2793 #ifdef CONFIG_INET
2794 void sock_pfree(struct sk_buff *skb)
2795 {
2796 	struct sock *sk = skb->sk;
2797 
2798 	if (!sk_is_refcounted(sk))
2799 		return;
2800 
2801 	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2802 		inet_reqsk(sk)->rsk_listener = NULL;
2803 		reqsk_free(inet_reqsk(sk));
2804 		return;
2805 	}
2806 
2807 	sock_gen_put(sk);
2808 }
2809 EXPORT_SYMBOL(sock_pfree);
2810 #endif /* CONFIG_INET */
2811 
2812 /*
2813  * Allocate a skb from the socket's send buffer.
2814  */
2815 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2816 			     gfp_t priority)
2817 {
2818 	if (force ||
2819 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2820 		struct sk_buff *skb = alloc_skb(size, priority);
2821 
2822 		if (skb) {
2823 			skb_set_owner_w(skb, sk);
2824 			return skb;
2825 		}
2826 	}
2827 	return NULL;
2828 }
2829 EXPORT_SYMBOL(sock_wmalloc);
2830 
2831 static void sock_ofree(struct sk_buff *skb)
2832 {
2833 	struct sock *sk = skb->sk;
2834 
2835 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2836 }
2837 
2838 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2839 			     gfp_t priority)
2840 {
2841 	struct sk_buff *skb;
2842 
2843 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2844 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2845 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2846 		return NULL;
2847 
2848 	skb = alloc_skb(size, priority);
2849 	if (!skb)
2850 		return NULL;
2851 
2852 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2853 	skb->sk = sk;
2854 	skb->destructor = sock_ofree;
2855 	return skb;
2856 }
2857 
2858 /*
2859  * Allocate a memory block from the socket's option memory buffer.
2860  */
2861 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2862 {
2863 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2864 
2865 	if ((unsigned int)size <= optmem_max &&
2866 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2867 		void *mem;
2868 		/* First do the add, to avoid the race if kmalloc
2869 		 * might sleep.
2870 		 */
2871 		atomic_add(size, &sk->sk_omem_alloc);
2872 		mem = kmalloc(size, priority);
2873 		if (mem)
2874 			return mem;
2875 		atomic_sub(size, &sk->sk_omem_alloc);
2876 	}
2877 	return NULL;
2878 }
2879 EXPORT_SYMBOL(sock_kmalloc);
2880 
2881 /*
2882  * Duplicate the input "src" memory block using the socket's
2883  * option memory buffer.
2884  */
2885 void *sock_kmemdup(struct sock *sk, const void *src,
2886 		   int size, gfp_t priority)
2887 {
2888 	void *mem;
2889 
2890 	mem = sock_kmalloc(sk, size, priority);
2891 	if (mem)
2892 		memcpy(mem, src, size);
2893 	return mem;
2894 }
2895 EXPORT_SYMBOL(sock_kmemdup);
2896 
2897 /* Free an option memory block. Note, we actually want the inline
2898  * here as this allows gcc to detect the nullify and fold away the
2899  * condition entirely.
2900  */
2901 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2902 				  const bool nullify)
2903 {
2904 	if (WARN_ON_ONCE(!mem))
2905 		return;
2906 	if (nullify)
2907 		kfree_sensitive(mem);
2908 	else
2909 		kfree(mem);
2910 	atomic_sub(size, &sk->sk_omem_alloc);
2911 }
2912 
2913 void sock_kfree_s(struct sock *sk, void *mem, int size)
2914 {
2915 	__sock_kfree_s(sk, mem, size, false);
2916 }
2917 EXPORT_SYMBOL(sock_kfree_s);
2918 
2919 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2920 {
2921 	__sock_kfree_s(sk, mem, size, true);
2922 }
2923 EXPORT_SYMBOL(sock_kzfree_s);
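
/* Illustrative sketch: option memory is accounted in sk_omem_alloc, so a
 * sock_kmalloc() must be paired with sock_kfree_s() (or sock_kzfree_s()
 * for sensitive data) quoting the same size ("example_opt" is a
 * hypothetical per-socket option blob):
 *
 *	struct example_opt *opt;
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */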
2924 
2925 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2926  * I think these locks should be removed for datagram sockets.
2927  */
2928 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2929 {
2930 	DEFINE_WAIT(wait);
2931 
2932 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2933 	for (;;) {
2934 		if (!timeo)
2935 			break;
2936 		if (signal_pending(current))
2937 			break;
2938 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2939 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2940 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2941 			break;
2942 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2943 			break;
2944 		if (READ_ONCE(sk->sk_err))
2945 			break;
2946 		timeo = schedule_timeout(timeo);
2947 	}
2948 	finish_wait(sk_sleep(sk), &wait);
2949 	return timeo;
2950 }
2951 
2952 
2953 /*
2954  *	Generic send/receive buffer handlers
2955  */
2956 
2957 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2958 				     unsigned long data_len, int noblock,
2959 				     int *errcode, int max_page_order)
2960 {
2961 	struct sk_buff *skb;
2962 	long timeo;
2963 	int err;
2964 
2965 	timeo = sock_sndtimeo(sk, noblock);
2966 	for (;;) {
2967 		err = sock_error(sk);
2968 		if (err != 0)
2969 			goto failure;
2970 
2971 		err = -EPIPE;
2972 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2973 			goto failure;
2974 
2975 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2976 			break;
2977 
2978 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2979 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2980 		err = -EAGAIN;
2981 		if (!timeo)
2982 			goto failure;
2983 		if (signal_pending(current))
2984 			goto interrupted;
2985 		timeo = sock_wait_for_wmem(sk, timeo);
2986 	}
2987 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2988 				   errcode, sk->sk_allocation);
2989 	if (skb)
2990 		skb_set_owner_w(skb, sk);
2991 	return skb;
2992 
2993 interrupted:
2994 	err = sock_intr_errno(timeo);
2995 failure:
2996 	*errcode = err;
2997 	return NULL;
2998 }
2999 EXPORT_SYMBOL(sock_alloc_send_pskb);
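
/* Illustrative sketch (hypothetical datagram sendmsg path): callers
 * usually derive "noblock" from MSG_DONTWAIT and propagate the error
 * code written through @errcode; "hlen" and "dlen" are placeholders:
 *
 *	struct sk_buff *skb;
 *	int err;
 *
 *	skb = sock_alloc_send_pskb(sk, hlen, dlen,
 *				   msg->msg_flags & MSG_DONTWAIT, &err, 0);
 *	if (!skb)
 *		return err;
 */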
3000 
3001 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
3002 		     struct sockcm_cookie *sockc)
3003 {
3004 	u32 tsflags;
3005 
3006 	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
3007 
3008 	switch (cmsg->cmsg_type) {
3009 	case SO_MARK:
3010 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
3011 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3012 			return -EPERM;
3013 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3014 			return -EINVAL;
3015 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
3016 		break;
3017 	case SO_TIMESTAMPING_OLD:
3018 	case SO_TIMESTAMPING_NEW:
3019 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3020 			return -EINVAL;
3021 
3022 		tsflags = *(u32 *)CMSG_DATA(cmsg);
3023 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
3024 			return -EINVAL;
3025 
3026 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
3027 		sockc->tsflags |= tsflags;
3028 		break;
3029 	case SCM_TXTIME:
3030 		if (!sock_flag(sk, SOCK_TXTIME))
3031 			return -EINVAL;
3032 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
3033 			return -EINVAL;
3034 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
3035 		break;
3036 	case SCM_TS_OPT_ID:
3037 		if (sk_is_tcp(sk))
3038 			return -EINVAL;
3039 		tsflags = READ_ONCE(sk->sk_tsflags);
3040 		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
3041 			return -EINVAL;
3042 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3043 			return -EINVAL;
3044 		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
3045 		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
3046 		break;
3047 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
3048 	case SCM_RIGHTS:
3049 	case SCM_CREDENTIALS:
3050 		break;
3051 	case SO_PRIORITY:
3052 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3053 			return -EINVAL;
3054 		if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
3055 			return -EPERM;
3056 		sockc->priority = *(u32 *)CMSG_DATA(cmsg);
3057 		break;
3058 	case SCM_DEVMEM_DMABUF:
3059 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3060 			return -EINVAL;
3061 		sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
3062 		break;
3063 	default:
3064 		return -EINVAL;
3065 	}
3066 	return 0;
3067 }
3068 EXPORT_SYMBOL(__sock_cmsg_send);
3069 
3070 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
3071 		   struct sockcm_cookie *sockc)
3072 {
3073 	struct cmsghdr *cmsg;
3074 	int ret;
3075 
3076 	for_each_cmsghdr(cmsg, msg) {
3077 		if (!CMSG_OK(msg, cmsg))
3078 			return -EINVAL;
3079 		if (cmsg->cmsg_level != SOL_SOCKET)
3080 			continue;
3081 		ret = __sock_cmsg_send(sk, cmsg, sockc);
3082 		if (ret)
3083 			return ret;
3084 	}
3085 	return 0;
3086 }
3087 EXPORT_SYMBOL(sock_cmsg_send);
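
/* Illustrative sketch: a sendmsg() implementation seeds the cookie from
 * the socket defaults with sockcm_init() and then lets sock_cmsg_send()
 * override it from SOL_SOCKET control messages (the surrounding code is
 * a hypothetical UDP-like caller):
 *
 *	struct sockcm_cookie sockc;
 *	int err;
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (err)
 *			return err;
 *	}
 */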
3088 
3089 static void sk_enter_memory_pressure(struct sock *sk)
3090 {
3091 	if (!sk->sk_prot->enter_memory_pressure)
3092 		return;
3093 
3094 	sk->sk_prot->enter_memory_pressure(sk);
3095 }
3096 
3097 static void sk_leave_memory_pressure(struct sock *sk)
3098 {
3099 	if (sk->sk_prot->leave_memory_pressure) {
3100 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
3101 				     tcp_leave_memory_pressure, sk);
3102 	} else {
3103 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
3104 
3105 		if (memory_pressure && READ_ONCE(*memory_pressure))
3106 			WRITE_ONCE(*memory_pressure, 0);
3107 	}
3108 }
3109 
3110 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
3111 
3112 /**
3113  * skb_page_frag_refill - check that a page_frag contains enough room
3114  * @sz: minimum size of the fragment we want to get
3115  * @pfrag: pointer to page_frag
3116  * @gfp: priority for memory allocation
3117  *
3118  * Note: While this allocator tries to use high order pages, there is
3119  * no guarantee that allocations succeed. Therefore, @sz MUST be
3120  * less than or equal to PAGE_SIZE.
3121  */
3122 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3123 {
3124 	if (pfrag->page) {
3125 		if (page_ref_count(pfrag->page) == 1) {
3126 			pfrag->offset = 0;
3127 			return true;
3128 		}
3129 		if (pfrag->offset + sz <= pfrag->size)
3130 			return true;
3131 		put_page(pfrag->page);
3132 	}
3133 
3134 	pfrag->offset = 0;
3135 	if (SKB_FRAG_PAGE_ORDER &&
3136 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3137 		/* Avoid direct reclaim but allow kswapd to wake */
3138 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3139 					  __GFP_COMP | __GFP_NOWARN |
3140 					  __GFP_NORETRY,
3141 					  SKB_FRAG_PAGE_ORDER);
3142 		if (likely(pfrag->page)) {
3143 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3144 			return true;
3145 		}
3146 	}
3147 	pfrag->page = alloc_page(gfp);
3148 	if (likely(pfrag->page)) {
3149 		pfrag->size = PAGE_SIZE;
3150 		return true;
3151 	}
3152 	return false;
3153 }
3154 EXPORT_SYMBOL(skb_page_frag_refill);
3155 
3156 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3157 {
3158 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3159 		return true;
3160 
3161 	if (!sk->sk_bypass_prot_mem)
3162 		sk_enter_memory_pressure(sk);
3163 
3164 	sk_stream_moderate_sndbuf(sk);
3165 
3166 	return false;
3167 }
3168 EXPORT_SYMBOL(sk_page_frag_refill);
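
/* Illustrative sketch: stream protocols typically copy user data into the
 * per-task or per-socket page frag, refilling it on demand; the label is
 * hypothetical:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *
 *	copy up to pfrag->size - pfrag->offset bytes starting at
 *	page_address(pfrag->page) + pfrag->offset, then advance
 *	pfrag->offset by the amount actually copied.
 */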
3169 
3170 void __lock_sock(struct sock *sk)
3171 	__releases(&sk->sk_lock.slock)
3172 	__acquires(&sk->sk_lock.slock)
3173 {
3174 	DEFINE_WAIT(wait);
3175 
3176 	for (;;) {
3177 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3178 					TASK_UNINTERRUPTIBLE);
3179 		spin_unlock_bh(&sk->sk_lock.slock);
3180 		schedule();
3181 		spin_lock_bh(&sk->sk_lock.slock);
3182 		if (!sock_owned_by_user(sk))
3183 			break;
3184 	}
3185 	finish_wait(&sk->sk_lock.wq, &wait);
3186 }
3187 
3188 void __release_sock(struct sock *sk)
3189 	__releases(&sk->sk_lock.slock)
3190 	__acquires(&sk->sk_lock.slock)
3191 {
3192 	struct sk_buff *skb, *next;
3193 	int nb = 0;
3194 
3195 	while ((skb = sk->sk_backlog.head) != NULL) {
3196 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3197 
3198 		spin_unlock_bh(&sk->sk_lock.slock);
3199 
3200 		while (1) {
3201 			next = skb->next;
3202 			prefetch(next);
3203 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3204 			skb_mark_not_on_list(skb);
3205 			sk_backlog_rcv(sk, skb);
3206 
3207 			skb = next;
3208 			if (!skb)
3209 				break;
3210 
3211 			if (!(++nb & 15))
3212 				cond_resched();
3213 		}
3214 
3215 		spin_lock_bh(&sk->sk_lock.slock);
3216 	}
3217 
3218 	/*
3219 	 * Doing the zeroing here guarantees we cannot loop forever
3220 	 * while a wild producer attempts to flood us.
3221 	 */
3222 	sk->sk_backlog.len = 0;
3223 }
3224 
3225 void __sk_flush_backlog(struct sock *sk)
3226 {
3227 	spin_lock_bh(&sk->sk_lock.slock);
3228 	__release_sock(sk);
3229 
3230 	if (sk->sk_prot->release_cb)
3231 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3232 				     tcp_release_cb, sk);
3233 
3234 	spin_unlock_bh(&sk->sk_lock.slock);
3235 }
3236 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3237 
3238 /**
3239  * sk_wait_data - wait for data to arrive at sk_receive_queue
3240  * @sk:    sock to wait on
3241  * @timeo: for how long
3242  * @skb:   last skb seen on sk_receive_queue
3243  *
3244  * Now socket state including sk->sk_err is changed only under lock,
3245  * hence we may omit checks after joining wait queue.
3246  * hence we may omit checks after joining the wait queue.
3247  * We check the receive queue before schedule() only as an optimization;
3248  */
3249 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3250 {
3251 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3252 	int rc;
3253 
3254 	add_wait_queue(sk_sleep(sk), &wait);
3255 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3256 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3257 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3258 	remove_wait_queue(sk_sleep(sk), &wait);
3259 	return rc;
3260 }
3261 EXPORT_SYMBOL(sk_wait_data);
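
/* Illustrative sketch (hypothetical recvmsg loop): the timeout comes from
 * sock_rcvtimeo() and the last skb already seen is passed in so a wakeup
 * for old data is not mistaken for new data; real callers hold the socket
 * lock and also check sk_err/sk_shutdown:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	struct sk_buff *skb;
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */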
3262 
3263 /**
3264  *	__sk_mem_raise_allocated - increase memory_allocated
3265  *	@sk: socket
3266  *	@size: memory size to allocate
3267  *	@amt: pages to allocate
3268  *	@kind: allocation type
3269  *
3270  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3271  *
3272  *	Unlike the globally shared limits among the sockets under the same protocol,
3273  *	consuming the budget of a memcg won't have a direct effect on other ones.
3274  *	So be optimistic about memcg's tolerance, and leave the callers to decide
3275  *	whether or not to raise allocated through sk_under_memory_pressure() or
3276  *	its variants.
3277  */
3278 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3279 {
3280 	bool memcg_enabled = false, charged = false;
3281 	struct proto *prot = sk->sk_prot;
3282 	long allocated = 0;
3283 
3284 	if (!sk->sk_bypass_prot_mem) {
3285 		sk_memory_allocated_add(sk, amt);
3286 		allocated = sk_memory_allocated(sk);
3287 	}
3288 
3289 	if (mem_cgroup_sk_enabled(sk)) {
3290 		memcg_enabled = true;
3291 		charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
3292 		if (!charged)
3293 			goto suppress_allocation;
3294 	}
3295 
3296 	if (!allocated)
3297 		return 1;
3298 
3299 	/* Under limit. */
3300 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3301 		sk_leave_memory_pressure(sk);
3302 		return 1;
3303 	}
3304 
3305 	/* Under pressure. */
3306 	if (allocated > sk_prot_mem_limits(sk, 1))
3307 		sk_enter_memory_pressure(sk);
3308 
3309 	/* Over hard limit. */
3310 	if (allocated > sk_prot_mem_limits(sk, 2))
3311 		goto suppress_allocation;
3312 
3313 	/* Guarantee minimum buffer size under pressure (either global
3314 	 * or memcg) to make sure features described in RFC 7323 (TCP
3315 	 * Extensions for High Performance) work properly.
3316 	 *
3317 	 * This rule does NOT apply when the global or memcg hard limit is
3318 	 * exceeded, or else a DoS attack could take place by spawning
3319 	 * lots of sockets whose usage is under the minimum buffer size.
3320 	 */
3321 	if (kind == SK_MEM_RECV) {
3322 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3323 			return 1;
3324 
3325 	} else { /* SK_MEM_SEND */
3326 		int wmem0 = sk_get_wmem0(sk, prot);
3327 
3328 		if (sk->sk_type == SOCK_STREAM) {
3329 			if (sk->sk_wmem_queued < wmem0)
3330 				return 1;
3331 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3332 			return 1;
3333 		}
3334 	}
3335 
3336 	if (sk_has_memory_pressure(sk)) {
3337 		u64 alloc;
3338 
3339 		/* The following 'average' heuristic is within the
3340 		 * scope of global accounting, so it only makes
3341 		 * sense for global memory pressure.
3342 		 */
3343 		if (!sk_under_global_memory_pressure(sk))
3344 			return 1;
3345 
3346 		/* Try to be fair among all the sockets under global
3347 		 * pressure by allowing the ones that are below average
3348 		 * usage to raise their allocation.
3349 		 */
3350 		alloc = sk_sockets_allocated_read_positive(sk);
3351 		if (sk_prot_mem_limits(sk, 2) > alloc *
3352 		    sk_mem_pages(sk->sk_wmem_queued +
3353 				 atomic_read(&sk->sk_rmem_alloc) +
3354 				 sk->sk_forward_alloc))
3355 			return 1;
3356 	}
3357 
3358 suppress_allocation:
3359 
3360 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3361 		sk_stream_moderate_sndbuf(sk);
3362 
3363 		/* Fail only if socket is _under_ its sndbuf.
3364 		 * In this case we cannot block, so we have to fail.
3365 		 */
3366 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3367 			/* Force charge with __GFP_NOFAIL */
3368 			if (memcg_enabled && !charged)
3369 				mem_cgroup_sk_charge(sk, amt,
3370 						     gfp_memcg_charge() | __GFP_NOFAIL);
3371 			return 1;
3372 		}
3373 	}
3374 
3375 	trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3376 
3377 	if (allocated)
3378 		sk_memory_allocated_sub(sk, amt);
3379 
3380 	if (charged)
3381 		mem_cgroup_sk_uncharge(sk, amt);
3382 
3383 	return 0;
3384 }
3385 
3386 /**
3387  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3388  *	@sk: socket
3389  *	@size: memory size to allocate
3390  *	@kind: allocation type
3391  *
3392  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3393  *	rmem allocation. This function assumes that protocols which have
3394  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3395  */
3396 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3397 {
3398 	int ret, amt = sk_mem_pages(size);
3399 
3400 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3401 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3402 	if (!ret)
3403 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3404 	return ret;
3405 }
3406 EXPORT_SYMBOL(__sk_mem_schedule);
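
/* Illustrative sketch: protocols normally go through the sk_wmem_schedule()
 * and sk_rmem_schedule() wrappers in include/net/sock.h, which only fall
 * back to __sk_mem_schedule() when sk_forward_alloc cannot already cover
 * the request; the drop path shown is a hypothetical caller's:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 *		kfree_skb(skb);
 *		return -ENOBUFS;
 *	}
 *	skb_set_owner_r(skb, sk);	(charges truesize via sk_mem_charge())
 */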
3407 
3408 /**
3409  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3410  *	@sk: socket
3411  *	@amount: number of quanta
3412  *
3413  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3414  */
3415 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3416 {
3417 	if (mem_cgroup_sk_enabled(sk))
3418 		mem_cgroup_sk_uncharge(sk, amount);
3419 
3420 	if (sk->sk_bypass_prot_mem)
3421 		return;
3422 
3423 	sk_memory_allocated_sub(sk, amount);
3424 
3425 	if (sk_under_global_memory_pressure(sk) &&
3426 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3427 		sk_leave_memory_pressure(sk);
3428 }
3429 
3430 /**
3431  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3432  *	@sk: socket
3433  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3434  */
3435 void __sk_mem_reclaim(struct sock *sk, int amount)
3436 {
3437 	amount >>= PAGE_SHIFT;
3438 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3439 	__sk_mem_reduce_allocated(sk, amount);
3440 }
3441 EXPORT_SYMBOL(__sk_mem_reclaim);
3442 
3443 int sk_set_peek_off(struct sock *sk, int val)
3444 {
3445 	WRITE_ONCE(sk->sk_peek_off, val);
3446 	return 0;
3447 }
3448 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3449 
3450 /*
3451  * Set of default routines for initialising struct proto_ops when
3452  * the protocol does not support a particular function. In certain
3453  * cases where it makes no sense for a protocol to have a "do nothing"
3454  * function, some default processing is provided.
3455  */
3456 
3457 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3458 {
3459 	return -EOPNOTSUPP;
3460 }
3461 EXPORT_SYMBOL(sock_no_bind);
3462 
3463 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3464 		    int len, int flags)
3465 {
3466 	return -EOPNOTSUPP;
3467 }
3468 EXPORT_SYMBOL(sock_no_connect);
3469 
3470 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3471 {
3472 	return -EOPNOTSUPP;
3473 }
3474 EXPORT_SYMBOL(sock_no_socketpair);
3475 
3476 int sock_no_accept(struct socket *sock, struct socket *newsock,
3477 		   struct proto_accept_arg *arg)
3478 {
3479 	return -EOPNOTSUPP;
3480 }
3481 EXPORT_SYMBOL(sock_no_accept);
3482 
3483 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3484 		    int peer)
3485 {
3486 	return -EOPNOTSUPP;
3487 }
3488 EXPORT_SYMBOL(sock_no_getname);
3489 
3490 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3491 {
3492 	return -EOPNOTSUPP;
3493 }
3494 EXPORT_SYMBOL(sock_no_ioctl);
3495 
3496 int sock_no_listen(struct socket *sock, int backlog)
3497 {
3498 	return -EOPNOTSUPP;
3499 }
3500 EXPORT_SYMBOL(sock_no_listen);
3501 
3502 int sock_no_shutdown(struct socket *sock, int how)
3503 {
3504 	return -EOPNOTSUPP;
3505 }
3506 EXPORT_SYMBOL(sock_no_shutdown);
3507 
3508 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3509 {
3510 	return -EOPNOTSUPP;
3511 }
3512 EXPORT_SYMBOL(sock_no_sendmsg);
3513 
3514 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3515 {
3516 	return -EOPNOTSUPP;
3517 }
3518 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3519 
3520 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3521 		    int flags)
3522 {
3523 	return -EOPNOTSUPP;
3524 }
3525 EXPORT_SYMBOL(sock_no_recvmsg);
3526 
3527 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3528 {
3529 	/* Mirror missing mmap method error code */
3530 	return -ENODEV;
3531 }
3532 EXPORT_SYMBOL(sock_no_mmap);
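
/* Illustrative sketch: a protocol that does not implement a given
 * operation points the corresponding proto_ops entry at one of the
 * stubs above; "example_dgram_ops" and PF_EXAMPLE are hypothetical:
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.listen		= sock_no_listen,
 *		.accept		= sock_no_accept,
 *		.mmap		= sock_no_mmap,
 *		...
 *	};
 */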
3533 
3534 /*
3535  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3536  * various sock-based usage counts.
3537  */
3538 void __receive_sock(struct file *file)
3539 {
3540 	struct socket *sock;
3541 
3542 	sock = sock_from_file(file);
3543 	if (sock) {
3544 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3545 		sock_update_classid(&sock->sk->sk_cgrp_data);
3546 	}
3547 }
3548 
3549 /*
3550  *	Default Socket Callbacks
3551  */
3552 
3553 static void sock_def_wakeup(struct sock *sk)
3554 {
3555 	struct socket_wq *wq;
3556 
3557 	rcu_read_lock();
3558 	wq = rcu_dereference(sk->sk_wq);
3559 	if (skwq_has_sleeper(wq))
3560 		wake_up_interruptible_all(&wq->wait);
3561 	rcu_read_unlock();
3562 }
3563 
3564 static void sock_def_error_report(struct sock *sk)
3565 {
3566 	struct socket_wq *wq;
3567 
3568 	rcu_read_lock();
3569 	wq = rcu_dereference(sk->sk_wq);
3570 	if (skwq_has_sleeper(wq))
3571 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3572 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3573 	rcu_read_unlock();
3574 }
3575 
3576 void sock_def_readable(struct sock *sk)
3577 {
3578 	struct socket_wq *wq;
3579 
3580 	trace_sk_data_ready(sk);
3581 
3582 	rcu_read_lock();
3583 	wq = rcu_dereference(sk->sk_wq);
3584 	if (skwq_has_sleeper(wq))
3585 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3586 						EPOLLRDNORM | EPOLLRDBAND);
3587 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3588 	rcu_read_unlock();
3589 }
3590 
3591 static void sock_def_write_space(struct sock *sk)
3592 {
3593 	struct socket_wq *wq;
3594 
3595 	rcu_read_lock();
3596 
3597 	/* Do not wake up a writer until he can make "significant"
3598 	 * progress.  --DaveM
3599 	 */
3600 	if (sock_writeable(sk)) {
3601 		wq = rcu_dereference(sk->sk_wq);
3602 		if (skwq_has_sleeper(wq))
3603 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3604 						EPOLLWRNORM | EPOLLWRBAND);
3605 
3606 		/* Should agree with poll, otherwise some programs break */
3607 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3608 	}
3609 
3610 	rcu_read_unlock();
3611 }
3612 
3613 /* An optimised version of sock_def_write_space(); it should only be
3614  * called for SOCK_RCU_FREE sockets inside an RCU read-side section,
3615  * and after putting ->sk_wmem_alloc.
3616  */
3617 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc)
3618 {
3619 	/* Do not wake up a writer until he can make "significant"
3620 	 * progress.  --DaveM
3621 	 */
3622 	if (__sock_writeable(sk, wmem_alloc)) {
3623 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3624 
3625 		/* rely on refcount_sub from sock_wfree() */
3626 		smp_mb__after_atomic();
3627 		if (wq && waitqueue_active(&wq->wait))
3628 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3629 						EPOLLWRNORM | EPOLLWRBAND);
3630 
3631 		/* Should agree with poll, otherwise some programs break */
3632 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3633 	}
3634 }
3635 
3636 static void sock_def_destruct(struct sock *sk)
3637 {
3638 }
3639 
3640 void sk_send_sigurg(struct sock *sk)
3641 {
3642 	if (sk->sk_socket && sk->sk_socket->file)
3643 		if (send_sigurg(sk->sk_socket->file))
3644 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3645 }
3646 EXPORT_SYMBOL(sk_send_sigurg);
3647 
3648 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3649 		    unsigned long expires)
3650 {
3651 	if (!mod_timer(timer, expires))
3652 		sock_hold(sk);
3653 }
3654 EXPORT_SYMBOL(sk_reset_timer);
3655 
3656 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3657 {
3658 	if (timer_delete(timer))
3659 		__sock_put(sk);
3660 }
3661 EXPORT_SYMBOL(sk_stop_timer);
3662 
3663 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3664 {
3665 	if (timer_delete_sync(timer))
3666 		__sock_put(sk);
3667 }
3668 EXPORT_SYMBOL(sk_stop_timer_sync);
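
/* Illustrative sketch: the timer helpers keep a socket reference while a
 * timer is pending, so arming and stopping must stay balanced;
 * "some_timeout" is a placeholder for a protocol specific value:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + some_timeout);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);	(drops the reference again)
 */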
3669 
3670 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3671 {
3672 	sk_init_common(sk);
3673 	sk->sk_send_head	=	NULL;
3674 
3675 	timer_setup(&sk->sk_timer, NULL, 0);
3676 
3677 	sk->sk_allocation	=	GFP_KERNEL;
3678 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3679 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3680 	sk->sk_state		=	TCP_CLOSE;
3681 	sk->sk_use_task_frag	=	true;
3682 	sk_set_socket(sk, sock);
3683 
3684 	sock_set_flag(sk, SOCK_ZAPPED);
3685 
3686 	if (sock) {
3687 		sk->sk_type	=	sock->type;
3688 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3689 		sock->sk	=	sk;
3690 	} else {
3691 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3692 	}
3693 	sk->sk_uid	=	uid;
3694 
3695 	sk->sk_state_change	=	sock_def_wakeup;
3696 	sk->sk_data_ready	=	sock_def_readable;
3697 	sk->sk_write_space	=	sock_def_write_space;
3698 	sk->sk_error_report	=	sock_def_error_report;
3699 	sk->sk_destruct		=	sock_def_destruct;
3700 
3701 	sk->sk_frag.page	=	NULL;
3702 	sk->sk_frag.offset	=	0;
3703 	sk->sk_peek_off		=	-1;
3704 
3705 	sk->sk_peer_pid 	=	NULL;
3706 	sk->sk_peer_cred	=	NULL;
3707 	spin_lock_init(&sk->sk_peer_lock);
3708 
3709 	sk->sk_write_pending	=	0;
3710 	sk->sk_rcvlowat		=	1;
3711 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3712 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3713 
3714 	sk->sk_stamp = SK_DEFAULT_STAMP;
3715 #if BITS_PER_LONG==32
3716 	seqlock_init(&sk->sk_stamp_seq);
3717 #endif
3718 	atomic_set(&sk->sk_zckey, 0);
3719 
3720 #ifdef CONFIG_NET_RX_BUSY_POLL
3721 	sk->sk_napi_id		=	0;
3722 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3723 #endif
3724 
3725 	sk->sk_max_pacing_rate = ~0UL;
3726 	sk->sk_pacing_rate = ~0UL;
3727 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3728 	sk->sk_incoming_cpu = -1;
3729 
3730 	sk_rx_queue_clear(sk);
3731 	/*
3732 	 * Before updating sk_refcnt, we must commit prior changes to memory
3733 	 * (Documentation/RCU/rculist_nulls.rst for details)
3734 	 */
3735 	smp_wmb();
3736 	refcount_set(&sk->sk_refcnt, 1);
3737 	sk_drops_reset(sk);
3738 }
3739 EXPORT_SYMBOL(sock_init_data_uid);
3740 
3741 void sock_init_data(struct socket *sock, struct sock *sk)
3742 {
3743 	kuid_t uid = sock ?
3744 		SOCK_INODE(sock)->i_uid :
3745 		make_kuid(sock_net(sk)->user_ns, 0);
3746 
3747 	sock_init_data_uid(sock, sk, uid);
3748 }
3749 EXPORT_SYMBOL(sock_init_data);
3750 
3751 void lock_sock_nested(struct sock *sk, int subclass)
3752 {
3753 	/* The sk_lock has mutex_lock() semantics here. */
3754 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3755 
3756 	might_sleep();
3757 	spin_lock_bh(&sk->sk_lock.slock);
3758 	if (sock_owned_by_user_nocheck(sk))
3759 		__lock_sock(sk);
3760 	sk->sk_lock.owned = 1;
3761 	spin_unlock_bh(&sk->sk_lock.slock);
3762 }
3763 EXPORT_SYMBOL(lock_sock_nested);
3764 
3765 void release_sock(struct sock *sk)
3766 {
3767 	spin_lock_bh(&sk->sk_lock.slock);
3768 	if (sk->sk_backlog.tail)
3769 		__release_sock(sk);
3770 
3771 	if (sk->sk_prot->release_cb)
3772 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3773 				     tcp_release_cb, sk);
3774 
3775 	sock_release_ownership(sk);
3776 	if (waitqueue_active(&sk->sk_lock.wq))
3777 		wake_up(&sk->sk_lock.wq);
3778 	spin_unlock_bh(&sk->sk_lock.slock);
3779 }
3780 EXPORT_SYMBOL(release_sock);
3781 
3782 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3783 {
3784 	might_sleep();
3785 	spin_lock_bh(&sk->sk_lock.slock);
3786 
3787 	if (!sock_owned_by_user_nocheck(sk)) {
3788 		/*
3789 		 * Fast path return with bottom halves disabled and
3790 		 * sock::sk_lock.slock held.
3791 		 *
3792 		 * The 'mutex' is not contended and holding
3793 		 * sock::sk_lock.slock prevents all other lockers to
3794 		 * proceed so the corresponding unlock_sock_fast() can
3795 		 * avoid the slow path of release_sock() completely and
3796 		 * just release slock.
3797 		 *
3798 		 * From a semantical POV this is equivalent to 'acquiring'
3799 		 * the 'mutex', hence the corresponding lockdep
3800 		 * mutex_release() has to happen in the fast path of
3801 		 * unlock_sock_fast().
3802 		 */
3803 		return false;
3804 	}
3805 
3806 	__lock_sock(sk);
3807 	sk->sk_lock.owned = 1;
3808 	__acquire(&sk->sk_lock.slock);
3809 	spin_unlock_bh(&sk->sk_lock.slock);
3810 	return true;
3811 }
3812 EXPORT_SYMBOL(__lock_sock_fast);
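
/* Illustrative sketch: callers normally use the lock_sock_fast() and
 * unlock_sock_fast() wrappers from include/net/sock.h rather than this
 * function directly:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... short critical section ...
 *	unlock_sock_fast(sk, slow);
 */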
3813 
3814 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3815 		   bool timeval, bool time32)
3816 {
3817 	struct sock *sk = sock->sk;
3818 	struct timespec64 ts;
3819 
3820 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3821 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3822 	if (ts.tv_sec == -1)
3823 		return -ENOENT;
3824 	if (ts.tv_sec == 0) {
3825 		ktime_t kt = ktime_get_real();
3826 		sock_write_timestamp(sk, kt);
3827 		ts = ktime_to_timespec64(kt);
3828 	}
3829 
3830 	if (timeval)
3831 		ts.tv_nsec /= 1000;
3832 
3833 #ifdef CONFIG_COMPAT_32BIT_TIME
3834 	if (time32)
3835 		return put_old_timespec32(&ts, userstamp);
3836 #endif
3837 #ifdef CONFIG_SPARC64
3838 	/* beware of padding in sparc64 timeval */
3839 	if (timeval && !in_compat_syscall()) {
3840 		struct __kernel_old_timeval __user tv = {
3841 			.tv_sec = ts.tv_sec,
3842 			.tv_usec = ts.tv_nsec,
3843 		};
3844 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3845 			return -EFAULT;
3846 		return 0;
3847 	}
3848 #endif
3849 	return put_timespec64(&ts, userstamp);
3850 }
3851 EXPORT_SYMBOL(sock_gettstamp);
3852 
3853 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3854 {
3855 	if (!sock_flag(sk, flag)) {
3856 		unsigned long previous_flags = sk->sk_flags;
3857 
3858 		sock_set_flag(sk, flag);
3859 		/*
3860 		 * we just set one of the two flags which require net
3861 		 * time stamping, but time stamping might have been on
3862 		 * already because of the other one
3863 		 */
3864 		if (sock_needs_netstamp(sk) &&
3865 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3866 			net_enable_timestamp();
3867 	}
3868 }
3869 
3870 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3871 		       int level, int type)
3872 {
3873 	struct sock_exterr_skb *serr;
3874 	struct sk_buff *skb;
3875 	int copied, err;
3876 
3877 	err = -EAGAIN;
3878 	skb = sock_dequeue_err_skb(sk);
3879 	if (skb == NULL)
3880 		goto out;
3881 
3882 	copied = skb->len;
3883 	if (copied > len) {
3884 		msg->msg_flags |= MSG_TRUNC;
3885 		copied = len;
3886 	}
3887 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3888 	if (err)
3889 		goto out_free_skb;
3890 
3891 	sock_recv_timestamp(msg, sk, skb);
3892 
3893 	serr = SKB_EXT_ERR(skb);
3894 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3895 
3896 	msg->msg_flags |= MSG_ERRQUEUE;
3897 	err = copied;
3898 
3899 out_free_skb:
3900 	kfree_skb(skb);
3901 out:
3902 	return err;
3903 }
3904 EXPORT_SYMBOL(sock_recv_errqueue);
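
#if 0
/*
 * Userspace-side sketch (illustrative only, not kernel code): draining a
 * socket error queue with MSG_ERRQUEUE, which is the kind of request the
 * helper above serves for protocols that use it.  Error queueing must have
 * been enabled first (e.g. via IP_RECVERR).
 */
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/errqueue.h>

static int drain_errqueue(int fd)
{
	char data[256], control[512];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = control,
		.msg_controllen = sizeof(control),
	};
	struct cmsghdr *cmsg;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		return -1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		const struct sock_extended_err *ee =
			(const struct sock_extended_err *)CMSG_DATA(cmsg);

		/* inspect ee->ee_errno, ee->ee_origin, ... */
		(void)ee;
	}
	return 0;
}
#endif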
3905 
3906 /*
3907  *	Get a socket option on a socket.
3908  *
3909  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3910  *	asynchronous errors should be reported by getsockopt. We assume
3911  *	this means if you specify SO_ERROR (otherwise what is the point of it).
3912  */
3913 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3914 			   char __user *optval, int __user *optlen)
3915 {
3916 	struct sock *sk = sock->sk;
3917 
3918 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3919 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3920 }
3921 EXPORT_SYMBOL(sock_common_getsockopt);
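
#if 0
/*
 * Userspace-side sketch (illustrative only, not kernel code): the usual
 * way asynchronous errors are collected, per the comment above, is to
 * query SO_ERROR, e.g. once a non-blocking connect() is reported writable.
 */
#include <sys/socket.h>

static int pending_sock_error(int fd)
{
	int err = 0;
	socklen_t len = sizeof(err);

	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0)
		return -1;
	return err;	/* 0 if no error is pending */
}
#endif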
3922 
3923 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3924 			int flags)
3925 {
3926 	struct sock *sk = sock->sk;
3927 	int addr_len = 0;
3928 	int err;
3929 
3930 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3931 	if (err >= 0)
3932 		msg->msg_namelen = addr_len;
3933 	return err;
3934 }
3935 EXPORT_SYMBOL(sock_common_recvmsg);
3936 
3937 /*
3938  *	Set socket options on an inet socket.
3939  */
3940 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3941 			   sockptr_t optval, unsigned int optlen)
3942 {
3943 	struct sock *sk = sock->sk;
3944 
3945 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3946 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3947 }
3948 EXPORT_SYMBOL(sock_common_setsockopt);
3949 
3950 void sk_common_release(struct sock *sk)
3951 {
3952 	if (sk->sk_prot->destroy)
3953 		sk->sk_prot->destroy(sk);
3954 
3955 	/*
3956 	 * Observation: when sk_common_release() is called, processes no
3957 	 * longer have access to the socket, but the network stack still does.
3958 	 * Step one, detach it from networking:
3959 	 *
3960 	 * A. Remove from hash tables.
3961 	 */
3962 
3963 	sk->sk_prot->unhash(sk);
3964 
3965 	/*
3966 	 * At this point the socket cannot receive new packets, but some
3967 	 * packets may still be in flight because another CPU ran the receiver
3968 	 * and did the hash table lookup before we unhashed the socket. They
3969 	 * will reach the receive queue and be purged by the socket destructor.
3970 	 *
3971 	 * We may also still have packets pending on the receive queue and,
3972 	 * probably, our own packets waiting in device queues. sock_destroy()
3973 	 * drains the receive queue, but transmitted packets delay socket
3974 	 * destruction until the last reference is released.
3975 	 */
3976 
3977 	sock_orphan(sk);
3978 
3979 	xfrm_sk_free_policy(sk);
3980 
3981 	sock_put(sk);
3982 }
3983 EXPORT_SYMBOL(sk_common_release);
3984 
3985 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3986 {
3987 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3988 
3989 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3990 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3991 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3992 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3993 	mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
3994 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3995 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3996 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3997 	mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
3998 }
3999 
4000 #ifdef CONFIG_PROC_FS
4001 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
4002 
4003 int sock_prot_inuse_get(struct net *net, struct proto *prot)
4004 {
4005 	int cpu, idx = prot->inuse_idx;
4006 	int res = 0;
4007 
4008 	for_each_possible_cpu(cpu)
4009 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
4010 
4011 	return res >= 0 ? res : 0;
4012 }
4013 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
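
#if 0
/*
 * Illustrative sketch only: the per-cpu counters summed above are updated
 * by the protocols themselves, typically with sock_prot_inuse_add() from
 * their hash/unhash paths.  'example_hash' is a made-up callback.
 */
static void example_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup structures ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}
#endif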
4014 
4015 int sock_inuse_get(struct net *net)
4016 {
4017 	int cpu, res = 0;
4018 
4019 	for_each_possible_cpu(cpu)
4020 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
4021 
4022 	return res;
4023 }
4025 EXPORT_SYMBOL_GPL(sock_inuse_get);
4026 
4027 static int __net_init sock_inuse_init_net(struct net *net)
4028 {
4029 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
4030 	if (net->core.prot_inuse == NULL)
4031 		return -ENOMEM;
4032 	return 0;
4033 }
4034 
4035 static void __net_exit sock_inuse_exit_net(struct net *net)
4036 {
4037 	free_percpu(net->core.prot_inuse);
4038 }
4039 
4040 static struct pernet_operations net_inuse_ops = {
4041 	.init = sock_inuse_init_net,
4042 	.exit = sock_inuse_exit_net,
4043 };
4044 
4045 static __init int net_inuse_init(void)
4046 {
4047 	if (register_pernet_subsys(&net_inuse_ops))
4048 		panic("Cannot initialize net inuse counters");
4049 
4050 	return 0;
4051 }
4052 
4053 core_initcall(net_inuse_init);
4054 
4055 static int assign_proto_idx(struct proto *prot)
4056 {
4057 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
4058 
4059 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
4060 		pr_err("PROTO_INUSE_NR exhausted\n");
4061 		return -ENOSPC;
4062 	}
4063 
4064 	set_bit(prot->inuse_idx, proto_inuse_idx);
4065 	return 0;
4066 }
4067 
4068 static void release_proto_idx(struct proto *prot)
4069 {
4070 	if (prot->inuse_idx != PROTO_INUSE_NR)
4071 		clear_bit(prot->inuse_idx, proto_inuse_idx);
4072 }
4073 #else
4074 static inline int assign_proto_idx(struct proto *prot)
4075 {
4076 	return 0;
4077 }
4078 
4079 static inline void release_proto_idx(struct proto *prot)
4080 {
4081 }
4082 
4083 #endif
4084 
4085 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
4086 {
4087 	if (!twsk_prot)
4088 		return;
4089 	kfree(twsk_prot->twsk_slab_name);
4090 	twsk_prot->twsk_slab_name = NULL;
4091 	kmem_cache_destroy(twsk_prot->twsk_slab);
4092 	twsk_prot->twsk_slab = NULL;
4093 }
4094 
4095 static int tw_prot_init(const struct proto *prot)
4096 {
4097 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
4098 
4099 	if (!twsk_prot)
4100 		return 0;
4101 
4102 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
4103 					      prot->name);
4104 	if (!twsk_prot->twsk_slab_name)
4105 		return -ENOMEM;
4106 
4107 	twsk_prot->twsk_slab =
4108 		kmem_cache_create(twsk_prot->twsk_slab_name,
4109 				  twsk_prot->twsk_obj_size, 0,
4110 				  SLAB_ACCOUNT | prot->slab_flags,
4111 				  NULL);
4112 	if (!twsk_prot->twsk_slab) {
4113 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
4114 			prot->name);
4115 		return -ENOMEM;
4116 	}
4117 
4118 	return 0;
4119 }
4120 
4121 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
4122 {
4123 	if (!rsk_prot)
4124 		return;
4125 	kfree(rsk_prot->slab_name);
4126 	rsk_prot->slab_name = NULL;
4127 	kmem_cache_destroy(rsk_prot->slab);
4128 	rsk_prot->slab = NULL;
4129 }
4130 
4131 static int req_prot_init(const struct proto *prot)
4132 {
4133 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
4134 
4135 	if (!rsk_prot)
4136 		return 0;
4137 
4138 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4139 					prot->name);
4140 	if (!rsk_prot->slab_name)
4141 		return -ENOMEM;
4142 
4143 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4144 					   rsk_prot->obj_size, 0,
4145 					   SLAB_ACCOUNT | prot->slab_flags,
4146 					   NULL);
4147 
4148 	if (!rsk_prot->slab) {
4149 		pr_crit("%s: Can't create request sock SLAB cache!\n",
4150 			prot->name);
4151 		return -ENOMEM;
4152 	}
4153 	return 0;
4154 }
4155 
4156 int proto_register(struct proto *prot, int alloc_slab)
4157 {
4158 	int ret = -ENOBUFS;
4159 
4160 	if (prot->memory_allocated && !prot->sysctl_mem) {
4161 		pr_err("%s: missing sysctl_mem\n", prot->name);
4162 		return -EINVAL;
4163 	}
4164 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4165 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4166 		return -EINVAL;
4167 	}
4168 	if (alloc_slab) {
4169 		prot->slab = kmem_cache_create_usercopy(prot->name,
4170 					prot->obj_size, 0,
4171 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4172 					prot->slab_flags,
4173 					prot->useroffset, prot->usersize,
4174 					NULL);
4175 
4176 		if (prot->slab == NULL) {
4177 			pr_crit("%s: Can't create sock SLAB cache!\n",
4178 				prot->name);
4179 			goto out;
4180 		}
4181 
4182 		if (req_prot_init(prot))
4183 			goto out_free_request_sock_slab;
4184 
4185 		if (tw_prot_init(prot))
4186 			goto out_free_timewait_sock_slab;
4187 	}
4188 
4189 	mutex_lock(&proto_list_mutex);
4190 	ret = assign_proto_idx(prot);
4191 	if (ret) {
4192 		mutex_unlock(&proto_list_mutex);
4193 		goto out_free_timewait_sock_slab;
4194 	}
4195 	list_add(&prot->node, &proto_list);
4196 	mutex_unlock(&proto_list_mutex);
4197 	return ret;
4198 
4199 out_free_timewait_sock_slab:
4200 	if (alloc_slab)
4201 		tw_prot_cleanup(prot->twsk_prot);
4202 out_free_request_sock_slab:
4203 	if (alloc_slab) {
4204 		req_prot_cleanup(prot->rsk_prot);
4205 
4206 		kmem_cache_destroy(prot->slab);
4207 		prot->slab = NULL;
4208 	}
4209 out:
4210 	return ret;
4211 }
4212 EXPORT_SYMBOL(proto_register);
4213 
4214 void proto_unregister(struct proto *prot)
4215 {
4216 	mutex_lock(&proto_list_mutex);
4217 	release_proto_idx(prot);
4218 	list_del(&prot->node);
4219 	mutex_unlock(&proto_list_mutex);
4220 
4221 	kmem_cache_destroy(prot->slab);
4222 	prot->slab = NULL;
4223 
4224 	req_prot_cleanup(prot->rsk_prot);
4225 	tw_prot_cleanup(prot->twsk_prot);
4226 }
4227 EXPORT_SYMBOL(proto_unregister);
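
#if 0
/*
 * Illustrative sketch only: how a protocol module typically pairs
 * proto_register() and proto_unregister().  'example_proto' and its module
 * hooks are made up; a real protocol also registers itself with its
 * address family, which is outside the scope of this file.
 */
static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init example_proto_init(void)
{
	/* Second argument requests a dedicated slab cache for the sockets. */
	return proto_register(&example_proto, 1);
}

static void __exit example_proto_exit(void)
{
	proto_unregister(&example_proto);
}

module_init(example_proto_init);
module_exit(example_proto_exit);
#endif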
4228 
4229 int sock_load_diag_module(int family, int protocol)
4230 {
4231 	if (!protocol) {
4232 		if (!sock_is_registered(family))
4233 			return -ENOENT;
4234 
4235 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4236 				      NETLINK_SOCK_DIAG, family);
4237 	}
4238 
4239 #ifdef CONFIG_INET
4240 	if (family == AF_INET &&
4241 	    protocol != IPPROTO_RAW &&
4242 	    protocol < MAX_INET_PROTOS &&
4243 	    !rcu_access_pointer(inet_protos[protocol]))
4244 		return -ENOENT;
4245 #endif
4246 
4247 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4248 			      NETLINK_SOCK_DIAG, family, protocol);
4249 }
4250 EXPORT_SYMBOL(sock_load_diag_module);
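
#if 0
/*
 * Illustrative example only: for an AF_INET/IPPROTO_TCP socket the call
 * below expands the second format string above to the module alias
 * "net-pf-16-proto-4-type-2-6" (PF_NETLINK = 16, NETLINK_SOCK_DIAG = 4,
 * AF_INET = 2, IPPROTO_TCP = 6), which the matching diag module advertises.
 */
static int example_load_tcp_diag(void)
{
	return sock_load_diag_module(AF_INET, IPPROTO_TCP);
}
#endif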
4251 
4252 #ifdef CONFIG_PROC_FS
4253 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4254 	__acquires(proto_list_mutex)
4255 {
4256 	mutex_lock(&proto_list_mutex);
4257 	return seq_list_start_head(&proto_list, *pos);
4258 }
4259 
4260 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4261 {
4262 	return seq_list_next(v, &proto_list, pos);
4263 }
4264 
4265 static void proto_seq_stop(struct seq_file *seq, void *v)
4266 	__releases(proto_list_mutex)
4267 {
4268 	mutex_unlock(&proto_list_mutex);
4269 }
4270 
4271 static char proto_method_implemented(const void *method)
4272 {
4273 	return method == NULL ? 'n' : 'y';
4274 }
4275 static long sock_prot_memory_allocated(struct proto *proto)
4276 {
4277 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4278 }
4279 
4280 static const char *sock_prot_memory_pressure(struct proto *proto)
4281 {
4282 	return proto->memory_pressure != NULL ?
4283 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4284 }
4285 
4286 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4287 {
4288 
4290 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4291 		   proto->name,
4292 		   proto->obj_size,
4293 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4294 		   sock_prot_memory_allocated(proto),
4295 		   sock_prot_memory_pressure(proto),
4296 		   proto->max_header,
4297 		   proto->slab == NULL ? "no" : "yes",
4298 		   module_name(proto->owner),
4299 		   proto_method_implemented(proto->close),
4300 		   proto_method_implemented(proto->connect),
4301 		   proto_method_implemented(proto->disconnect),
4302 		   proto_method_implemented(proto->accept),
4303 		   proto_method_implemented(proto->ioctl),
4304 		   proto_method_implemented(proto->init),
4305 		   proto_method_implemented(proto->destroy),
4306 		   proto_method_implemented(proto->shutdown),
4307 		   proto_method_implemented(proto->setsockopt),
4308 		   proto_method_implemented(proto->getsockopt),
4309 		   proto_method_implemented(proto->sendmsg),
4310 		   proto_method_implemented(proto->recvmsg),
4311 		   proto_method_implemented(proto->bind),
4312 		   proto_method_implemented(proto->backlog_rcv),
4313 		   proto_method_implemented(proto->hash),
4314 		   proto_method_implemented(proto->unhash),
4315 		   proto_method_implemented(proto->get_port),
4316 		   proto_method_implemented(proto->enter_memory_pressure));
4317 }
4318 
4319 static int proto_seq_show(struct seq_file *seq, void *v)
4320 {
4321 	if (v == &proto_list)
4322 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4323 			   "protocol",
4324 			   "size",
4325 			   "sockets",
4326 			   "memory",
4327 			   "press",
4328 			   "maxhdr",
4329 			   "slab",
4330 			   "module",
4331 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4332 	else
4333 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4334 	return 0;
4335 }
4336 
4337 static const struct seq_operations proto_seq_ops = {
4338 	.start  = proto_seq_start,
4339 	.next   = proto_seq_next,
4340 	.stop   = proto_seq_stop,
4341 	.show   = proto_seq_show,
4342 };
4343 
4344 static __net_init int proto_init_net(struct net *net)
4345 {
4346 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4347 			sizeof(struct seq_net_private)))
4348 		return -ENOMEM;
4349 
4350 	return 0;
4351 }
4352 
4353 static __net_exit void proto_exit_net(struct net *net)
4354 {
4355 	remove_proc_entry("protocols", net->proc_net);
4356 }
4357 
4359 static __net_initdata struct pernet_operations proto_net_ops = {
4360 	.init = proto_init_net,
4361 	.exit = proto_exit_net,
4362 };
4363 
4364 static int __init proto_init(void)
4365 {
4366 	return register_pernet_subsys(&proto_net_ops);
4367 }
4368 
4369 subsys_initcall(proto_init);
4370 
4371 #endif /* PROC_FS */
4372 
4373 #ifdef CONFIG_NET_RX_BUSY_POLL
4374 bool sk_busy_loop_end(void *p, unsigned long start_time)
4375 {
4376 	struct sock *sk = p;
4377 
4378 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4379 		return true;
4380 
4381 	if (sk_is_udp(sk) &&
4382 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4383 		return true;
4384 
4385 	return sk_busy_loop_timeout(sk, start_time);
4386 }
4387 EXPORT_SYMBOL(sk_busy_loop_end);
4388 #endif /* CONFIG_NET_RX_BUSY_POLL */
4389 
4390 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4391 {
4392 	if (!sk->sk_prot->bind_add)
4393 		return -EOPNOTSUPP;
4394 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4395 }
4396 EXPORT_SYMBOL(sock_bind_add);
4397 
4398 /* Copy 'size' bytes from userspace, run the ioctl, and copy 'size' bytes back */
4399 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4400 		     void __user *arg, void *karg, size_t size)
4401 {
4402 	int ret;
4403 
4404 	if (copy_from_user(karg, arg, size))
4405 		return -EFAULT;
4406 
4407 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4408 	if (ret)
4409 		return ret;
4410 
4411 	if (copy_to_user(arg, karg, size))
4412 		return -EFAULT;
4413 
4414 	return 0;
4415 }
4416 EXPORT_SYMBOL(sock_ioctl_inout);
4417 
4418 /* This is the most common ioctl prep function, where the result (4 bytes) is
4419  * copied back to userspace if the ioctl() returns successfully. No input is
4420  * copied from userspace.
4421  */
4422 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4423 {
4424 	int ret, karg = 0;
4425 
4426 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4427 	if (ret)
4428 		return ret;
4429 
4430 	return put_user(karg, (int __user *)arg);
4431 }
4432 
4433 /* A wrapper around sock ioctls, which copies the data from userspace
4434  * (depending on the protocol/ioctl), and copies the result back to userspace.
4435  * The main motivation for this function is to pass kernel memory to the
4436  * protocol ioctl callbacks, instead of userspace memory.
4437  */
4438 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4439 {
4440 	int rc = 1;
4441 
4442 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4443 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4444 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4445 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4446 	else if (sk_is_phonet(sk))
4447 		rc = phonet_sk_ioctl(sk, cmd, arg);
4448 
4449 	/* If the ioctl was processed, return its value */
4450 	if (rc <= 0)
4451 		return rc;
4452 
4453 	/* Otherwise call the default handler */
4454 	return sock_ioctl_out(sk, cmd, arg);
4455 }
4456 EXPORT_SYMBOL(sk_ioctl);
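
#if 0
/*
 * Illustrative sketch only: with sk_ioctl() in place, a protocol's ->ioctl
 * callback operates on the kernel copy ('karg') prepared by the wrappers
 * above instead of a userspace pointer.  'example_ioctl' and its behaviour
 * are made up for the example.
 */
static int example_ioctl(struct sock *sk, int cmd, int *karg)
{
	if (cmd != SIOCINQ)
		return -ENOIOCTLCMD;

	/* Report receive-queue memory; real protocols return readable bytes. */
	*karg = sk_rmem_alloc_get(sk);
	return 0;
}
#endif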
4457 
4458 static int __init sock_struct_check(void)
4459 {
4460 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4461 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4462 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4463 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4464 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4465 
4466 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4467 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4468 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4469 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4470 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4471 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4472 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4473 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4474 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4475 
4476 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4477 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4478 #ifdef CONFIG_MEMCG
4479 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4480 #endif
4481 
4482 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4483 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4484 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4485 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4486 
4487 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4489 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft);
4490 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4491 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4492 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4493 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4494 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4495 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4496 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4497 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4498 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4499 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4500 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4501 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4502 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4503 
4504 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4505 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4506 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4507 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4508 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid);
4509 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol);
4510 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4511 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4512 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4513 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4514 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4515 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4516 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf);
4517 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4518 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4519 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4520 	return 0;
4521 }
4522 
4523 core_initcall(sock_struct_check);
4524