xref: /linux/net/core/sock.c (revision 09d7ff0694ea133c50ad905fd6e548c13f8af458)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138 
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142 
143 #include <trace/events/sock.h>
144 
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148 
149 #include <linux/ethtool.h>
150 
151 #include "dev.h"
152 
153 static DEFINE_MUTEX(proto_list_mutex);
154 static LIST_HEAD(proto_list);
155 
156 static void sock_def_write_space_wfree(struct sock *sk);
157 static void sock_def_write_space(struct sock *sk);
158 
159 /**
160  * sk_ns_capable - General socket capability test
161  * @sk: Socket to use a capability on or through
162  * @user_ns: The user namespace of the capability to use
163  * @cap: The capability to use
164  *
165  * Test to see if the opener of the socket had when the socket was
166  * created and the current process has the capability @cap in the user
167  * namespace @user_ns.
168  */
169 bool sk_ns_capable(const struct sock *sk,
170 		   struct user_namespace *user_ns, int cap)
171 {
172 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
173 		ns_capable(user_ns, cap);
174 }
175 EXPORT_SYMBOL(sk_ns_capable);
176 
177 /**
178  * sk_capable - Socket global capability test
179  * @sk: Socket to use a capability on or through
180  * @cap: The global capability to use
181  *
182  * Test to see if the opener of the socket had when the socket was
183  * created and the current process has the capability @cap in all user
184  * namespaces.
185  */
186 bool sk_capable(const struct sock *sk, int cap)
187 {
188 	return sk_ns_capable(sk, &init_user_ns, cap);
189 }
190 EXPORT_SYMBOL(sk_capable);
191 
192 /**
193  * sk_net_capable - Network namespace socket capability test
194  * @sk: Socket to use a capability on or through
195  * @cap: The capability to use
196  *
197  * Test to see if the opener of the socket had when the socket was created
198  * and the current process has the capability @cap over the network namespace
199  * the socket is a member of.
200  */
201 bool sk_net_capable(const struct sock *sk, int cap)
202 {
203 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
204 }
205 EXPORT_SYMBOL(sk_net_capable);
206 
207 /*
208  * Each address family might have different locking rules, so we have
209  * one slock key per address family and separate keys for internal and
210  * userspace sockets.
211  */
212 static struct lock_class_key af_family_keys[AF_MAX];
213 static struct lock_class_key af_family_kern_keys[AF_MAX];
214 static struct lock_class_key af_family_slock_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
216 
217 /*
218  * Make lock validator output more readable. (we pre-construct these
219  * strings build-time, so that runtime initialization of socket
220  * locks is fast):
221  */
222 
223 #define _sock_locks(x)						  \
224   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
225   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
226   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
227   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
228   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
229   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
230   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
231   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
232   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
233   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
234   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
235   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
236   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
237   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
238   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
239   x "AF_MCTP"  , \
240   x "AF_MAX"
241 
242 static const char *const af_family_key_strings[AF_MAX+1] = {
243 	_sock_locks("sk_lock-")
244 };
245 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
246 	_sock_locks("slock-")
247 };
248 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
249 	_sock_locks("clock-")
250 };
251 
252 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
253 	_sock_locks("k-sk_lock-")
254 };
255 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
256 	_sock_locks("k-slock-")
257 };
258 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
259 	_sock_locks("k-clock-")
260 };
261 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
262 	_sock_locks("rlock-")
263 };
264 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
265 	_sock_locks("wlock-")
266 };
267 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
268 	_sock_locks("elock-")
269 };
270 
271 /*
272  * sk_callback_lock and sk queues locking rules are per-address-family,
273  * so split the lock classes by using a per-AF key:
274  */
275 static struct lock_class_key af_callback_keys[AF_MAX];
276 static struct lock_class_key af_rlock_keys[AF_MAX];
277 static struct lock_class_key af_wlock_keys[AF_MAX];
278 static struct lock_class_key af_elock_keys[AF_MAX];
279 static struct lock_class_key af_kern_callback_keys[AF_MAX];
280 
281 /* Run time adjustable parameters. */
282 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
283 EXPORT_SYMBOL(sysctl_wmem_max);
284 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
285 EXPORT_SYMBOL(sysctl_rmem_max);
286 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
287 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
288 
289 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
290 EXPORT_SYMBOL_GPL(memalloc_socks_key);
291 
292 /**
293  * sk_set_memalloc - sets %SOCK_MEMALLOC
294  * @sk: socket to set it on
295  *
296  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297  * It's the responsibility of the admin to adjust min_free_kbytes
298  * to meet the requirements
299  */
300 void sk_set_memalloc(struct sock *sk)
301 {
302 	sock_set_flag(sk, SOCK_MEMALLOC);
303 	sk->sk_allocation |= __GFP_MEMALLOC;
304 	static_branch_inc(&memalloc_socks_key);
305 }
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
307 
308 void sk_clear_memalloc(struct sock *sk)
309 {
310 	sock_reset_flag(sk, SOCK_MEMALLOC);
311 	sk->sk_allocation &= ~__GFP_MEMALLOC;
312 	static_branch_dec(&memalloc_socks_key);
313 
314 	/*
315 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 	 * it has rmem allocations due to the last swapfile being deactivated
318 	 * but there is a risk that the socket is unusable due to exceeding
319 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
320 	 */
321 	sk_mem_reclaim(sk);
322 }
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
324 
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
326 {
327 	int ret;
328 	unsigned int noreclaim_flag;
329 
330 	/* these should have been dropped before queueing */
331 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
332 
333 	noreclaim_flag = memalloc_noreclaim_save();
334 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
335 				 tcp_v6_do_rcv,
336 				 tcp_v4_do_rcv,
337 				 sk, skb);
338 	memalloc_noreclaim_restore(noreclaim_flag);
339 
340 	return ret;
341 }
342 EXPORT_SYMBOL(__sk_backlog_rcv);
343 
344 void sk_error_report(struct sock *sk)
345 {
346 	sk->sk_error_report(sk);
347 
348 	switch (sk->sk_family) {
349 	case AF_INET:
350 		fallthrough;
351 	case AF_INET6:
352 		trace_inet_sk_error_report(sk);
353 		break;
354 	default:
355 		break;
356 	}
357 }
358 EXPORT_SYMBOL(sk_error_report);
359 
360 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
361 {
362 	struct __kernel_sock_timeval tv;
363 
364 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
365 		tv.tv_sec = 0;
366 		tv.tv_usec = 0;
367 	} else {
368 		tv.tv_sec = timeo / HZ;
369 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
370 	}
371 
372 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
373 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
374 		*(struct old_timeval32 *)optval = tv32;
375 		return sizeof(tv32);
376 	}
377 
378 	if (old_timeval) {
379 		struct __kernel_old_timeval old_tv;
380 		old_tv.tv_sec = tv.tv_sec;
381 		old_tv.tv_usec = tv.tv_usec;
382 		*(struct __kernel_old_timeval *)optval = old_tv;
383 		return sizeof(old_tv);
384 	}
385 
386 	*(struct __kernel_sock_timeval *)optval = tv;
387 	return sizeof(tv);
388 }
389 EXPORT_SYMBOL(sock_get_timeout);
390 
391 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
392 			   sockptr_t optval, int optlen, bool old_timeval)
393 {
394 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
395 		struct old_timeval32 tv32;
396 
397 		if (optlen < sizeof(tv32))
398 			return -EINVAL;
399 
400 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
401 			return -EFAULT;
402 		tv->tv_sec = tv32.tv_sec;
403 		tv->tv_usec = tv32.tv_usec;
404 	} else if (old_timeval) {
405 		struct __kernel_old_timeval old_tv;
406 
407 		if (optlen < sizeof(old_tv))
408 			return -EINVAL;
409 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
410 			return -EFAULT;
411 		tv->tv_sec = old_tv.tv_sec;
412 		tv->tv_usec = old_tv.tv_usec;
413 	} else {
414 		if (optlen < sizeof(*tv))
415 			return -EINVAL;
416 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
417 			return -EFAULT;
418 	}
419 
420 	return 0;
421 }
422 EXPORT_SYMBOL(sock_copy_user_timeval);
423 
424 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
425 			    bool old_timeval)
426 {
427 	struct __kernel_sock_timeval tv;
428 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
429 	long val;
430 
431 	if (err)
432 		return err;
433 
434 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
435 		return -EDOM;
436 
437 	if (tv.tv_sec < 0) {
438 		static int warned __read_mostly;
439 
440 		WRITE_ONCE(*timeo_p, 0);
441 		if (warned < 10 && net_ratelimit()) {
442 			warned++;
443 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
444 				__func__, current->comm, task_pid_nr(current));
445 		}
446 		return 0;
447 	}
448 	val = MAX_SCHEDULE_TIMEOUT;
449 	if ((tv.tv_sec || tv.tv_usec) &&
450 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
451 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
452 						    USEC_PER_SEC / HZ);
453 	WRITE_ONCE(*timeo_p, val);
454 	return 0;
455 }
456 
457 static bool sk_set_prio_allowed(const struct sock *sk, int val)
458 {
459 	return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
460 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
461 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
462 }
463 
464 static bool sock_needs_netstamp(const struct sock *sk)
465 {
466 	switch (sk->sk_family) {
467 	case AF_UNSPEC:
468 	case AF_UNIX:
469 		return false;
470 	default:
471 		return true;
472 	}
473 }
474 
475 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
476 {
477 	if (sk->sk_flags & flags) {
478 		sk->sk_flags &= ~flags;
479 		if (sock_needs_netstamp(sk) &&
480 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
481 			net_disable_timestamp();
482 	}
483 }
484 
485 
486 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
487 {
488 	unsigned long flags;
489 	struct sk_buff_head *list = &sk->sk_receive_queue;
490 
491 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
492 		atomic_inc(&sk->sk_drops);
493 		trace_sock_rcvqueue_full(sk, skb);
494 		return -ENOMEM;
495 	}
496 
497 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
498 		atomic_inc(&sk->sk_drops);
499 		return -ENOBUFS;
500 	}
501 
502 	skb->dev = NULL;
503 	skb_set_owner_r(skb, sk);
504 
505 	/* we escape from rcu protected region, make sure we dont leak
506 	 * a norefcounted dst
507 	 */
508 	skb_dst_force(skb);
509 
510 	spin_lock_irqsave(&list->lock, flags);
511 	sock_skb_set_dropcount(sk, skb);
512 	__skb_queue_tail(list, skb);
513 	spin_unlock_irqrestore(&list->lock, flags);
514 
515 	if (!sock_flag(sk, SOCK_DEAD))
516 		sk->sk_data_ready(sk);
517 	return 0;
518 }
519 EXPORT_SYMBOL(__sock_queue_rcv_skb);
520 
521 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
522 			      enum skb_drop_reason *reason)
523 {
524 	enum skb_drop_reason drop_reason;
525 	int err;
526 
527 	err = sk_filter(sk, skb);
528 	if (err) {
529 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
530 		goto out;
531 	}
532 	err = __sock_queue_rcv_skb(sk, skb);
533 	switch (err) {
534 	case -ENOMEM:
535 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
536 		break;
537 	case -ENOBUFS:
538 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
539 		break;
540 	default:
541 		drop_reason = SKB_NOT_DROPPED_YET;
542 		break;
543 	}
544 out:
545 	if (reason)
546 		*reason = drop_reason;
547 	return err;
548 }
549 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
550 
551 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
552 		     const int nested, unsigned int trim_cap, bool refcounted)
553 {
554 	int rc = NET_RX_SUCCESS;
555 
556 	if (sk_filter_trim_cap(sk, skb, trim_cap))
557 		goto discard_and_relse;
558 
559 	skb->dev = NULL;
560 
561 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
562 		atomic_inc(&sk->sk_drops);
563 		goto discard_and_relse;
564 	}
565 	if (nested)
566 		bh_lock_sock_nested(sk);
567 	else
568 		bh_lock_sock(sk);
569 	if (!sock_owned_by_user(sk)) {
570 		/*
571 		 * trylock + unlock semantics:
572 		 */
573 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
574 
575 		rc = sk_backlog_rcv(sk, skb);
576 
577 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
578 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
579 		bh_unlock_sock(sk);
580 		atomic_inc(&sk->sk_drops);
581 		goto discard_and_relse;
582 	}
583 
584 	bh_unlock_sock(sk);
585 out:
586 	if (refcounted)
587 		sock_put(sk);
588 	return rc;
589 discard_and_relse:
590 	kfree_skb(skb);
591 	goto out;
592 }
593 EXPORT_SYMBOL(__sk_receive_skb);
594 
595 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
596 							  u32));
597 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
598 							   u32));
599 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
600 {
601 	struct dst_entry *dst = __sk_dst_get(sk);
602 
603 	if (dst && dst->obsolete &&
604 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
605 			       dst, cookie) == NULL) {
606 		sk_tx_queue_clear(sk);
607 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
608 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
609 		dst_release(dst);
610 		return NULL;
611 	}
612 
613 	return dst;
614 }
615 EXPORT_SYMBOL(__sk_dst_check);
616 
617 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
618 {
619 	struct dst_entry *dst = sk_dst_get(sk);
620 
621 	if (dst && dst->obsolete &&
622 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
623 			       dst, cookie) == NULL) {
624 		sk_dst_reset(sk);
625 		dst_release(dst);
626 		return NULL;
627 	}
628 
629 	return dst;
630 }
631 EXPORT_SYMBOL(sk_dst_check);
632 
633 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
634 {
635 	int ret = -ENOPROTOOPT;
636 #ifdef CONFIG_NETDEVICES
637 	struct net *net = sock_net(sk);
638 
639 	/* Sorry... */
640 	ret = -EPERM;
641 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
642 		goto out;
643 
644 	ret = -EINVAL;
645 	if (ifindex < 0)
646 		goto out;
647 
648 	/* Paired with all READ_ONCE() done locklessly. */
649 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
650 
651 	if (sk->sk_prot->rehash)
652 		sk->sk_prot->rehash(sk);
653 	sk_dst_reset(sk);
654 
655 	ret = 0;
656 
657 out:
658 #endif
659 
660 	return ret;
661 }
662 
663 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
664 {
665 	int ret;
666 
667 	if (lock_sk)
668 		lock_sock(sk);
669 	ret = sock_bindtoindex_locked(sk, ifindex);
670 	if (lock_sk)
671 		release_sock(sk);
672 
673 	return ret;
674 }
675 EXPORT_SYMBOL(sock_bindtoindex);
676 
677 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
678 {
679 	int ret = -ENOPROTOOPT;
680 #ifdef CONFIG_NETDEVICES
681 	struct net *net = sock_net(sk);
682 	char devname[IFNAMSIZ];
683 	int index;
684 
685 	ret = -EINVAL;
686 	if (optlen < 0)
687 		goto out;
688 
689 	/* Bind this socket to a particular device like "eth0",
690 	 * as specified in the passed interface name. If the
691 	 * name is "" or the option length is zero the socket
692 	 * is not bound.
693 	 */
694 	if (optlen > IFNAMSIZ - 1)
695 		optlen = IFNAMSIZ - 1;
696 	memset(devname, 0, sizeof(devname));
697 
698 	ret = -EFAULT;
699 	if (copy_from_sockptr(devname, optval, optlen))
700 		goto out;
701 
702 	index = 0;
703 	if (devname[0] != '\0') {
704 		struct net_device *dev;
705 
706 		rcu_read_lock();
707 		dev = dev_get_by_name_rcu(net, devname);
708 		if (dev)
709 			index = dev->ifindex;
710 		rcu_read_unlock();
711 		ret = -ENODEV;
712 		if (!dev)
713 			goto out;
714 	}
715 
716 	sockopt_lock_sock(sk);
717 	ret = sock_bindtoindex_locked(sk, index);
718 	sockopt_release_sock(sk);
719 out:
720 #endif
721 
722 	return ret;
723 }
724 
725 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
726 				sockptr_t optlen, int len)
727 {
728 	int ret = -ENOPROTOOPT;
729 #ifdef CONFIG_NETDEVICES
730 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
731 	struct net *net = sock_net(sk);
732 	char devname[IFNAMSIZ];
733 
734 	if (bound_dev_if == 0) {
735 		len = 0;
736 		goto zero;
737 	}
738 
739 	ret = -EINVAL;
740 	if (len < IFNAMSIZ)
741 		goto out;
742 
743 	ret = netdev_get_name(net, devname, bound_dev_if);
744 	if (ret)
745 		goto out;
746 
747 	len = strlen(devname) + 1;
748 
749 	ret = -EFAULT;
750 	if (copy_to_sockptr(optval, devname, len))
751 		goto out;
752 
753 zero:
754 	ret = -EFAULT;
755 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
756 		goto out;
757 
758 	ret = 0;
759 
760 out:
761 #endif
762 
763 	return ret;
764 }
765 
766 bool sk_mc_loop(const struct sock *sk)
767 {
768 	if (dev_recursion_level())
769 		return false;
770 	if (!sk)
771 		return true;
772 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
773 	switch (READ_ONCE(sk->sk_family)) {
774 	case AF_INET:
775 		return inet_test_bit(MC_LOOP, sk);
776 #if IS_ENABLED(CONFIG_IPV6)
777 	case AF_INET6:
778 		return inet6_test_bit(MC6_LOOP, sk);
779 #endif
780 	}
781 	WARN_ON_ONCE(1);
782 	return true;
783 }
784 EXPORT_SYMBOL(sk_mc_loop);
785 
786 void sock_set_reuseaddr(struct sock *sk)
787 {
788 	lock_sock(sk);
789 	sk->sk_reuse = SK_CAN_REUSE;
790 	release_sock(sk);
791 }
792 EXPORT_SYMBOL(sock_set_reuseaddr);
793 
794 void sock_set_reuseport(struct sock *sk)
795 {
796 	lock_sock(sk);
797 	sk->sk_reuseport = true;
798 	release_sock(sk);
799 }
800 EXPORT_SYMBOL(sock_set_reuseport);
801 
802 void sock_no_linger(struct sock *sk)
803 {
804 	lock_sock(sk);
805 	WRITE_ONCE(sk->sk_lingertime, 0);
806 	sock_set_flag(sk, SOCK_LINGER);
807 	release_sock(sk);
808 }
809 EXPORT_SYMBOL(sock_no_linger);
810 
811 void sock_set_priority(struct sock *sk, u32 priority)
812 {
813 	WRITE_ONCE(sk->sk_priority, priority);
814 }
815 EXPORT_SYMBOL(sock_set_priority);
816 
817 void sock_set_sndtimeo(struct sock *sk, s64 secs)
818 {
819 	lock_sock(sk);
820 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
821 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
822 	else
823 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
824 	release_sock(sk);
825 }
826 EXPORT_SYMBOL(sock_set_sndtimeo);
827 
828 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
829 {
830 	sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
831 	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
832 	if (val)  {
833 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
834 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
835 	}
836 }
837 
838 void sock_enable_timestamps(struct sock *sk)
839 {
840 	lock_sock(sk);
841 	__sock_set_timestamps(sk, true, false, true);
842 	release_sock(sk);
843 }
844 EXPORT_SYMBOL(sock_enable_timestamps);
845 
846 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
847 {
848 	switch (optname) {
849 	case SO_TIMESTAMP_OLD:
850 		__sock_set_timestamps(sk, valbool, false, false);
851 		break;
852 	case SO_TIMESTAMP_NEW:
853 		__sock_set_timestamps(sk, valbool, true, false);
854 		break;
855 	case SO_TIMESTAMPNS_OLD:
856 		__sock_set_timestamps(sk, valbool, false, true);
857 		break;
858 	case SO_TIMESTAMPNS_NEW:
859 		__sock_set_timestamps(sk, valbool, true, true);
860 		break;
861 	}
862 }
863 
864 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
865 {
866 	struct net *net = sock_net(sk);
867 	struct net_device *dev = NULL;
868 	bool match = false;
869 	int *vclock_index;
870 	int i, num;
871 
872 	if (sk->sk_bound_dev_if)
873 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
874 
875 	if (!dev) {
876 		pr_err("%s: sock not bind to device\n", __func__);
877 		return -EOPNOTSUPP;
878 	}
879 
880 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
881 	dev_put(dev);
882 
883 	for (i = 0; i < num; i++) {
884 		if (*(vclock_index + i) == phc_index) {
885 			match = true;
886 			break;
887 		}
888 	}
889 
890 	if (num > 0)
891 		kfree(vclock_index);
892 
893 	if (!match)
894 		return -EINVAL;
895 
896 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
897 
898 	return 0;
899 }
900 
901 int sock_set_timestamping(struct sock *sk, int optname,
902 			  struct so_timestamping timestamping)
903 {
904 	int val = timestamping.flags;
905 	int ret;
906 
907 	if (val & ~SOF_TIMESTAMPING_MASK)
908 		return -EINVAL;
909 
910 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
911 	    !(val & SOF_TIMESTAMPING_OPT_ID))
912 		return -EINVAL;
913 
914 	if (val & SOF_TIMESTAMPING_OPT_ID &&
915 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
916 		if (sk_is_tcp(sk)) {
917 			if ((1 << sk->sk_state) &
918 			    (TCPF_CLOSE | TCPF_LISTEN))
919 				return -EINVAL;
920 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
921 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
922 			else
923 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
924 		} else {
925 			atomic_set(&sk->sk_tskey, 0);
926 		}
927 	}
928 
929 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
930 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
931 		return -EINVAL;
932 
933 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
934 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
935 		if (ret)
936 			return ret;
937 	}
938 
939 	WRITE_ONCE(sk->sk_tsflags, val);
940 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
941 	sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));
942 
943 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
944 		sock_enable_timestamp(sk,
945 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
946 	else
947 		sock_disable_timestamp(sk,
948 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
949 	return 0;
950 }
951 
952 #if defined(CONFIG_CGROUP_BPF)
953 void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
954 {
955 	struct bpf_sock_ops_kern sock_ops;
956 
957 	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
958 	sock_ops.op = op;
959 	sock_ops.is_fullsock = 1;
960 	sock_ops.sk = sk;
961 	bpf_skops_init_skb(&sock_ops, skb, 0);
962 	__cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
963 }
964 #endif
965 
966 void sock_set_keepalive(struct sock *sk)
967 {
968 	lock_sock(sk);
969 	if (sk->sk_prot->keepalive)
970 		sk->sk_prot->keepalive(sk, true);
971 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
972 	release_sock(sk);
973 }
974 EXPORT_SYMBOL(sock_set_keepalive);
975 
976 static void __sock_set_rcvbuf(struct sock *sk, int val)
977 {
978 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
979 	 * as a negative value.
980 	 */
981 	val = min_t(int, val, INT_MAX / 2);
982 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
983 
984 	/* We double it on the way in to account for "struct sk_buff" etc.
985 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
986 	 * will allow that much actual data to be received on that socket.
987 	 *
988 	 * Applications are unaware that "struct sk_buff" and other overheads
989 	 * allocate from the receive buffer during socket buffer allocation.
990 	 *
991 	 * And after considering the possible alternatives, returning the value
992 	 * we actually used in getsockopt is the most desirable behavior.
993 	 */
994 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
995 }
996 
997 void sock_set_rcvbuf(struct sock *sk, int val)
998 {
999 	lock_sock(sk);
1000 	__sock_set_rcvbuf(sk, val);
1001 	release_sock(sk);
1002 }
1003 EXPORT_SYMBOL(sock_set_rcvbuf);
1004 
1005 static void __sock_set_mark(struct sock *sk, u32 val)
1006 {
1007 	if (val != sk->sk_mark) {
1008 		WRITE_ONCE(sk->sk_mark, val);
1009 		sk_dst_reset(sk);
1010 	}
1011 }
1012 
1013 void sock_set_mark(struct sock *sk, u32 val)
1014 {
1015 	lock_sock(sk);
1016 	__sock_set_mark(sk, val);
1017 	release_sock(sk);
1018 }
1019 EXPORT_SYMBOL(sock_set_mark);
1020 
1021 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1022 {
1023 	/* Round down bytes to multiple of pages */
1024 	bytes = round_down(bytes, PAGE_SIZE);
1025 
1026 	WARN_ON(bytes > sk->sk_reserved_mem);
1027 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1028 	sk_mem_reclaim(sk);
1029 }
1030 
1031 static int sock_reserve_memory(struct sock *sk, int bytes)
1032 {
1033 	long allocated;
1034 	bool charged;
1035 	int pages;
1036 
1037 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1038 		return -EOPNOTSUPP;
1039 
1040 	if (!bytes)
1041 		return 0;
1042 
1043 	pages = sk_mem_pages(bytes);
1044 
1045 	/* pre-charge to memcg */
1046 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1047 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1048 	if (!charged)
1049 		return -ENOMEM;
1050 
1051 	/* pre-charge to forward_alloc */
1052 	sk_memory_allocated_add(sk, pages);
1053 	allocated = sk_memory_allocated(sk);
1054 	/* If the system goes into memory pressure with this
1055 	 * precharge, give up and return error.
1056 	 */
1057 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1058 		sk_memory_allocated_sub(sk, pages);
1059 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1060 		return -ENOMEM;
1061 	}
1062 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1063 
1064 	WRITE_ONCE(sk->sk_reserved_mem,
1065 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1066 
1067 	return 0;
1068 }
1069 
1070 #ifdef CONFIG_PAGE_POOL
1071 
1072 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
1073  * in 1 syscall. The limit exists to limit the amount of memory the kernel
1074  * allocates to copy these tokens, and to prevent looping over the frags for
1075  * too long.
1076  */
1077 #define MAX_DONTNEED_TOKENS 128
1078 #define MAX_DONTNEED_FRAGS 1024
1079 
1080 static noinline_for_stack int
1081 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1082 {
1083 	unsigned int num_tokens, i, j, k, netmem_num = 0;
1084 	struct dmabuf_token *tokens;
1085 	int ret = 0, num_frags = 0;
1086 	netmem_ref netmems[16];
1087 
1088 	if (!sk_is_tcp(sk))
1089 		return -EBADF;
1090 
1091 	if (optlen % sizeof(*tokens) ||
1092 	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1093 		return -EINVAL;
1094 
1095 	num_tokens = optlen / sizeof(*tokens);
1096 	tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1097 	if (!tokens)
1098 		return -ENOMEM;
1099 
1100 	if (copy_from_sockptr(tokens, optval, optlen)) {
1101 		kvfree(tokens);
1102 		return -EFAULT;
1103 	}
1104 
1105 	xa_lock_bh(&sk->sk_user_frags);
1106 	for (i = 0; i < num_tokens; i++) {
1107 		for (j = 0; j < tokens[i].token_count; j++) {
1108 			if (++num_frags > MAX_DONTNEED_FRAGS)
1109 				goto frag_limit_reached;
1110 
1111 			netmem_ref netmem = (__force netmem_ref)__xa_erase(
1112 				&sk->sk_user_frags, tokens[i].token_start + j);
1113 
1114 			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1115 				continue;
1116 
1117 			netmems[netmem_num++] = netmem;
1118 			if (netmem_num == ARRAY_SIZE(netmems)) {
1119 				xa_unlock_bh(&sk->sk_user_frags);
1120 				for (k = 0; k < netmem_num; k++)
1121 					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1122 				netmem_num = 0;
1123 				xa_lock_bh(&sk->sk_user_frags);
1124 			}
1125 			ret++;
1126 		}
1127 	}
1128 
1129 frag_limit_reached:
1130 	xa_unlock_bh(&sk->sk_user_frags);
1131 	for (k = 0; k < netmem_num; k++)
1132 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1133 
1134 	kvfree(tokens);
1135 	return ret;
1136 }
1137 #endif
1138 
1139 void sockopt_lock_sock(struct sock *sk)
1140 {
1141 	/* When current->bpf_ctx is set, the setsockopt is called from
1142 	 * a bpf prog.  bpf has ensured the sk lock has been
1143 	 * acquired before calling setsockopt().
1144 	 */
1145 	if (has_current_bpf_ctx())
1146 		return;
1147 
1148 	lock_sock(sk);
1149 }
1150 EXPORT_SYMBOL(sockopt_lock_sock);
1151 
1152 void sockopt_release_sock(struct sock *sk)
1153 {
1154 	if (has_current_bpf_ctx())
1155 		return;
1156 
1157 	release_sock(sk);
1158 }
1159 EXPORT_SYMBOL(sockopt_release_sock);
1160 
1161 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1162 {
1163 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1164 }
1165 EXPORT_SYMBOL(sockopt_ns_capable);
1166 
1167 bool sockopt_capable(int cap)
1168 {
1169 	return has_current_bpf_ctx() || capable(cap);
1170 }
1171 EXPORT_SYMBOL(sockopt_capable);
1172 
1173 static int sockopt_validate_clockid(__kernel_clockid_t value)
1174 {
1175 	switch (value) {
1176 	case CLOCK_REALTIME:
1177 	case CLOCK_MONOTONIC:
1178 	case CLOCK_TAI:
1179 		return 0;
1180 	}
1181 	return -EINVAL;
1182 }
1183 
1184 /*
1185  *	This is meant for all protocols to use and covers goings on
1186  *	at the socket level. Everything here is generic.
1187  */
1188 
1189 int sk_setsockopt(struct sock *sk, int level, int optname,
1190 		  sockptr_t optval, unsigned int optlen)
1191 {
1192 	struct so_timestamping timestamping;
1193 	struct socket *sock = sk->sk_socket;
1194 	struct sock_txtime sk_txtime;
1195 	int val;
1196 	int valbool;
1197 	struct linger ling;
1198 	int ret = 0;
1199 
1200 	/*
1201 	 *	Options without arguments
1202 	 */
1203 
1204 	if (optname == SO_BINDTODEVICE)
1205 		return sock_setbindtodevice(sk, optval, optlen);
1206 
1207 	if (optlen < sizeof(int))
1208 		return -EINVAL;
1209 
1210 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1211 		return -EFAULT;
1212 
1213 	valbool = val ? 1 : 0;
1214 
1215 	/* handle options which do not require locking the socket. */
1216 	switch (optname) {
1217 	case SO_PRIORITY:
1218 		if (sk_set_prio_allowed(sk, val)) {
1219 			sock_set_priority(sk, val);
1220 			return 0;
1221 		}
1222 		return -EPERM;
1223 	case SO_TYPE:
1224 	case SO_PROTOCOL:
1225 	case SO_DOMAIN:
1226 	case SO_ERROR:
1227 		return -ENOPROTOOPT;
1228 #ifdef CONFIG_NET_RX_BUSY_POLL
1229 	case SO_BUSY_POLL:
1230 		if (val < 0)
1231 			return -EINVAL;
1232 		WRITE_ONCE(sk->sk_ll_usec, val);
1233 		return 0;
1234 	case SO_PREFER_BUSY_POLL:
1235 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1236 			return -EPERM;
1237 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1238 		return 0;
1239 	case SO_BUSY_POLL_BUDGET:
1240 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1241 		    !sockopt_capable(CAP_NET_ADMIN))
1242 			return -EPERM;
1243 		if (val < 0 || val > U16_MAX)
1244 			return -EINVAL;
1245 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1246 		return 0;
1247 #endif
1248 	case SO_MAX_PACING_RATE:
1249 		{
1250 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1251 		unsigned long pacing_rate;
1252 
1253 		if (sizeof(ulval) != sizeof(val) &&
1254 		    optlen >= sizeof(ulval) &&
1255 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1256 			return -EFAULT;
1257 		}
1258 		if (ulval != ~0UL)
1259 			cmpxchg(&sk->sk_pacing_status,
1260 				SK_PACING_NONE,
1261 				SK_PACING_NEEDED);
1262 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1263 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1264 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1265 		if (ulval < pacing_rate)
1266 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1267 		return 0;
1268 		}
1269 	case SO_TXREHASH:
1270 		if (!sk_is_tcp(sk))
1271 			return -EOPNOTSUPP;
1272 		if (val < -1 || val > 1)
1273 			return -EINVAL;
1274 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1275 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1276 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1277 		 * and sk_getsockopt().
1278 		 */
1279 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1280 		return 0;
1281 	case SO_PEEK_OFF:
1282 		{
1283 		int (*set_peek_off)(struct sock *sk, int val);
1284 
1285 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1286 		if (set_peek_off)
1287 			ret = set_peek_off(sk, val);
1288 		else
1289 			ret = -EOPNOTSUPP;
1290 		return ret;
1291 		}
1292 #ifdef CONFIG_PAGE_POOL
1293 	case SO_DEVMEM_DONTNEED:
1294 		return sock_devmem_dontneed(sk, optval, optlen);
1295 #endif
1296 	}
1297 
1298 	sockopt_lock_sock(sk);
1299 
1300 	switch (optname) {
1301 	case SO_DEBUG:
1302 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1303 			ret = -EACCES;
1304 		else
1305 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1306 		break;
1307 	case SO_REUSEADDR:
1308 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1309 		break;
1310 	case SO_REUSEPORT:
1311 		if (valbool && !sk_is_inet(sk))
1312 			ret = -EOPNOTSUPP;
1313 		else
1314 			sk->sk_reuseport = valbool;
1315 		break;
1316 	case SO_DONTROUTE:
1317 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1318 		sk_dst_reset(sk);
1319 		break;
1320 	case SO_BROADCAST:
1321 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1322 		break;
1323 	case SO_SNDBUF:
1324 		/* Don't error on this BSD doesn't and if you think
1325 		 * about it this is right. Otherwise apps have to
1326 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1327 		 * are treated in BSD as hints
1328 		 */
1329 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1330 set_sndbuf:
1331 		/* Ensure val * 2 fits into an int, to prevent max_t()
1332 		 * from treating it as a negative value.
1333 		 */
1334 		val = min_t(int, val, INT_MAX / 2);
1335 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1336 		WRITE_ONCE(sk->sk_sndbuf,
1337 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1338 		/* Wake up sending tasks if we upped the value. */
1339 		sk->sk_write_space(sk);
1340 		break;
1341 
1342 	case SO_SNDBUFFORCE:
1343 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1344 			ret = -EPERM;
1345 			break;
1346 		}
1347 
1348 		/* No negative values (to prevent underflow, as val will be
1349 		 * multiplied by 2).
1350 		 */
1351 		if (val < 0)
1352 			val = 0;
1353 		goto set_sndbuf;
1354 
1355 	case SO_RCVBUF:
1356 		/* Don't error on this BSD doesn't and if you think
1357 		 * about it this is right. Otherwise apps have to
1358 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1359 		 * are treated in BSD as hints
1360 		 */
1361 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1362 		break;
1363 
1364 	case SO_RCVBUFFORCE:
1365 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1366 			ret = -EPERM;
1367 			break;
1368 		}
1369 
1370 		/* No negative values (to prevent underflow, as val will be
1371 		 * multiplied by 2).
1372 		 */
1373 		__sock_set_rcvbuf(sk, max(val, 0));
1374 		break;
1375 
1376 	case SO_KEEPALIVE:
1377 		if (sk->sk_prot->keepalive)
1378 			sk->sk_prot->keepalive(sk, valbool);
1379 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1380 		break;
1381 
1382 	case SO_OOBINLINE:
1383 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1384 		break;
1385 
1386 	case SO_NO_CHECK:
1387 		sk->sk_no_check_tx = valbool;
1388 		break;
1389 
1390 	case SO_LINGER:
1391 		if (optlen < sizeof(ling)) {
1392 			ret = -EINVAL;	/* 1003.1g */
1393 			break;
1394 		}
1395 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1396 			ret = -EFAULT;
1397 			break;
1398 		}
1399 		if (!ling.l_onoff) {
1400 			sock_reset_flag(sk, SOCK_LINGER);
1401 		} else {
1402 			unsigned long t_sec = ling.l_linger;
1403 
1404 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1405 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1406 			else
1407 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1408 			sock_set_flag(sk, SOCK_LINGER);
1409 		}
1410 		break;
1411 
1412 	case SO_BSDCOMPAT:
1413 		break;
1414 
1415 	case SO_TIMESTAMP_OLD:
1416 	case SO_TIMESTAMP_NEW:
1417 	case SO_TIMESTAMPNS_OLD:
1418 	case SO_TIMESTAMPNS_NEW:
1419 		sock_set_timestamp(sk, optname, valbool);
1420 		break;
1421 
1422 	case SO_TIMESTAMPING_NEW:
1423 	case SO_TIMESTAMPING_OLD:
1424 		if (optlen == sizeof(timestamping)) {
1425 			if (copy_from_sockptr(&timestamping, optval,
1426 					      sizeof(timestamping))) {
1427 				ret = -EFAULT;
1428 				break;
1429 			}
1430 		} else {
1431 			memset(&timestamping, 0, sizeof(timestamping));
1432 			timestamping.flags = val;
1433 		}
1434 		ret = sock_set_timestamping(sk, optname, timestamping);
1435 		break;
1436 
1437 	case SO_RCVLOWAT:
1438 		{
1439 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1440 
1441 		if (val < 0)
1442 			val = INT_MAX;
1443 		if (sock)
1444 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1445 		if (set_rcvlowat)
1446 			ret = set_rcvlowat(sk, val);
1447 		else
1448 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1449 		break;
1450 		}
1451 	case SO_RCVTIMEO_OLD:
1452 	case SO_RCVTIMEO_NEW:
1453 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1454 				       optlen, optname == SO_RCVTIMEO_OLD);
1455 		break;
1456 
1457 	case SO_SNDTIMEO_OLD:
1458 	case SO_SNDTIMEO_NEW:
1459 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1460 				       optlen, optname == SO_SNDTIMEO_OLD);
1461 		break;
1462 
1463 	case SO_ATTACH_FILTER: {
1464 		struct sock_fprog fprog;
1465 
1466 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1467 		if (!ret)
1468 			ret = sk_attach_filter(&fprog, sk);
1469 		break;
1470 	}
1471 	case SO_ATTACH_BPF:
1472 		ret = -EINVAL;
1473 		if (optlen == sizeof(u32)) {
1474 			u32 ufd;
1475 
1476 			ret = -EFAULT;
1477 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1478 				break;
1479 
1480 			ret = sk_attach_bpf(ufd, sk);
1481 		}
1482 		break;
1483 
1484 	case SO_ATTACH_REUSEPORT_CBPF: {
1485 		struct sock_fprog fprog;
1486 
1487 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1488 		if (!ret)
1489 			ret = sk_reuseport_attach_filter(&fprog, sk);
1490 		break;
1491 	}
1492 	case SO_ATTACH_REUSEPORT_EBPF:
1493 		ret = -EINVAL;
1494 		if (optlen == sizeof(u32)) {
1495 			u32 ufd;
1496 
1497 			ret = -EFAULT;
1498 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1499 				break;
1500 
1501 			ret = sk_reuseport_attach_bpf(ufd, sk);
1502 		}
1503 		break;
1504 
1505 	case SO_DETACH_REUSEPORT_BPF:
1506 		ret = reuseport_detach_prog(sk);
1507 		break;
1508 
1509 	case SO_DETACH_FILTER:
1510 		ret = sk_detach_filter(sk);
1511 		break;
1512 
1513 	case SO_LOCK_FILTER:
1514 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1515 			ret = -EPERM;
1516 		else
1517 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1518 		break;
1519 
1520 	case SO_MARK:
1521 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1522 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1523 			ret = -EPERM;
1524 			break;
1525 		}
1526 
1527 		__sock_set_mark(sk, val);
1528 		break;
1529 	case SO_RCVMARK:
1530 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1531 		break;
1532 
1533 	case SO_RCVPRIORITY:
1534 		sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
1535 		break;
1536 
1537 	case SO_RXQ_OVFL:
1538 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1539 		break;
1540 
1541 	case SO_WIFI_STATUS:
1542 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1543 		break;
1544 
1545 	case SO_NOFCS:
1546 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1547 		break;
1548 
1549 	case SO_SELECT_ERR_QUEUE:
1550 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1551 		break;
1552 
1553 	case SO_PASSCRED:
1554 		if (sk_may_scm_recv(sk))
1555 			sk->sk_scm_credentials = valbool;
1556 		else
1557 			ret = -EOPNOTSUPP;
1558 		break;
1559 
1560 	case SO_PASSSEC:
1561 		if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
1562 			sk->sk_scm_security = valbool;
1563 		else
1564 			ret = -EOPNOTSUPP;
1565 		break;
1566 
1567 	case SO_PASSPIDFD:
1568 		if (sk_is_unix(sk))
1569 			sk->sk_scm_pidfd = valbool;
1570 		else
1571 			ret = -EOPNOTSUPP;
1572 		break;
1573 
1574 	case SO_PASSRIGHTS:
1575 		if (sk_is_unix(sk))
1576 			sk->sk_scm_rights = valbool;
1577 		else
1578 			ret = -EOPNOTSUPP;
1579 		break;
1580 
1581 	case SO_INCOMING_CPU:
1582 		reuseport_update_incoming_cpu(sk, val);
1583 		break;
1584 
1585 	case SO_CNX_ADVICE:
1586 		if (val == 1)
1587 			dst_negative_advice(sk);
1588 		break;
1589 
1590 	case SO_ZEROCOPY:
1591 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1592 			if (!(sk_is_tcp(sk) ||
1593 			      (sk->sk_type == SOCK_DGRAM &&
1594 			       sk->sk_protocol == IPPROTO_UDP)))
1595 				ret = -EOPNOTSUPP;
1596 		} else if (sk->sk_family != PF_RDS) {
1597 			ret = -EOPNOTSUPP;
1598 		}
1599 		if (!ret) {
1600 			if (val < 0 || val > 1)
1601 				ret = -EINVAL;
1602 			else
1603 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1604 		}
1605 		break;
1606 
1607 	case SO_TXTIME:
1608 		if (optlen != sizeof(struct sock_txtime)) {
1609 			ret = -EINVAL;
1610 			break;
1611 		} else if (copy_from_sockptr(&sk_txtime, optval,
1612 			   sizeof(struct sock_txtime))) {
1613 			ret = -EFAULT;
1614 			break;
1615 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1616 			ret = -EINVAL;
1617 			break;
1618 		}
1619 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1620 		 * scheduler has enough safe guards.
1621 		 */
1622 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1623 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1624 			ret = -EPERM;
1625 			break;
1626 		}
1627 
1628 		ret = sockopt_validate_clockid(sk_txtime.clockid);
1629 		if (ret)
1630 			break;
1631 
1632 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1633 		sk->sk_clockid = sk_txtime.clockid;
1634 		sk->sk_txtime_deadline_mode =
1635 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1636 		sk->sk_txtime_report_errors =
1637 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1638 		break;
1639 
1640 	case SO_BINDTOIFINDEX:
1641 		ret = sock_bindtoindex_locked(sk, val);
1642 		break;
1643 
1644 	case SO_BUF_LOCK:
1645 		if (val & ~SOCK_BUF_LOCK_MASK) {
1646 			ret = -EINVAL;
1647 			break;
1648 		}
1649 		sk->sk_userlocks = val | (sk->sk_userlocks &
1650 					  ~SOCK_BUF_LOCK_MASK);
1651 		break;
1652 
1653 	case SO_RESERVE_MEM:
1654 	{
1655 		int delta;
1656 
1657 		if (val < 0) {
1658 			ret = -EINVAL;
1659 			break;
1660 		}
1661 
1662 		delta = val - sk->sk_reserved_mem;
1663 		if (delta < 0)
1664 			sock_release_reserved_memory(sk, -delta);
1665 		else
1666 			ret = sock_reserve_memory(sk, delta);
1667 		break;
1668 	}
1669 
1670 	default:
1671 		ret = -ENOPROTOOPT;
1672 		break;
1673 	}
1674 	sockopt_release_sock(sk);
1675 	return ret;
1676 }
1677 
1678 int sock_setsockopt(struct socket *sock, int level, int optname,
1679 		    sockptr_t optval, unsigned int optlen)
1680 {
1681 	return sk_setsockopt(sock->sk, level, optname,
1682 			     optval, optlen);
1683 }
1684 EXPORT_SYMBOL(sock_setsockopt);
1685 
1686 static const struct cred *sk_get_peer_cred(struct sock *sk)
1687 {
1688 	const struct cred *cred;
1689 
1690 	spin_lock(&sk->sk_peer_lock);
1691 	cred = get_cred(sk->sk_peer_cred);
1692 	spin_unlock(&sk->sk_peer_lock);
1693 
1694 	return cred;
1695 }
1696 
1697 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1698 			  struct ucred *ucred)
1699 {
1700 	ucred->pid = pid_vnr(pid);
1701 	ucred->uid = ucred->gid = -1;
1702 	if (cred) {
1703 		struct user_namespace *current_ns = current_user_ns();
1704 
1705 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1706 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1707 	}
1708 }
1709 
1710 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1711 {
1712 	struct user_namespace *user_ns = current_user_ns();
1713 	int i;
1714 
1715 	for (i = 0; i < src->ngroups; i++) {
1716 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1717 
1718 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1719 			return -EFAULT;
1720 	}
1721 
1722 	return 0;
1723 }
1724 
1725 int sk_getsockopt(struct sock *sk, int level, int optname,
1726 		  sockptr_t optval, sockptr_t optlen)
1727 {
1728 	struct socket *sock = sk->sk_socket;
1729 
1730 	union {
1731 		int val;
1732 		u64 val64;
1733 		unsigned long ulval;
1734 		struct linger ling;
1735 		struct old_timeval32 tm32;
1736 		struct __kernel_old_timeval tm;
1737 		struct  __kernel_sock_timeval stm;
1738 		struct sock_txtime txtime;
1739 		struct so_timestamping timestamping;
1740 	} v;
1741 
1742 	int lv = sizeof(int);
1743 	int len;
1744 
1745 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1746 		return -EFAULT;
1747 	if (len < 0)
1748 		return -EINVAL;
1749 
1750 	memset(&v, 0, sizeof(v));
1751 
1752 	switch (optname) {
1753 	case SO_DEBUG:
1754 		v.val = sock_flag(sk, SOCK_DBG);
1755 		break;
1756 
1757 	case SO_DONTROUTE:
1758 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1759 		break;
1760 
1761 	case SO_BROADCAST:
1762 		v.val = sock_flag(sk, SOCK_BROADCAST);
1763 		break;
1764 
1765 	case SO_SNDBUF:
1766 		v.val = READ_ONCE(sk->sk_sndbuf);
1767 		break;
1768 
1769 	case SO_RCVBUF:
1770 		v.val = READ_ONCE(sk->sk_rcvbuf);
1771 		break;
1772 
1773 	case SO_REUSEADDR:
1774 		v.val = sk->sk_reuse;
1775 		break;
1776 
1777 	case SO_REUSEPORT:
1778 		v.val = sk->sk_reuseport;
1779 		break;
1780 
1781 	case SO_KEEPALIVE:
1782 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1783 		break;
1784 
1785 	case SO_TYPE:
1786 		v.val = sk->sk_type;
1787 		break;
1788 
1789 	case SO_PROTOCOL:
1790 		v.val = sk->sk_protocol;
1791 		break;
1792 
1793 	case SO_DOMAIN:
1794 		v.val = sk->sk_family;
1795 		break;
1796 
1797 	case SO_ERROR:
1798 		v.val = -sock_error(sk);
1799 		if (v.val == 0)
1800 			v.val = xchg(&sk->sk_err_soft, 0);
1801 		break;
1802 
1803 	case SO_OOBINLINE:
1804 		v.val = sock_flag(sk, SOCK_URGINLINE);
1805 		break;
1806 
1807 	case SO_NO_CHECK:
1808 		v.val = sk->sk_no_check_tx;
1809 		break;
1810 
1811 	case SO_PRIORITY:
1812 		v.val = READ_ONCE(sk->sk_priority);
1813 		break;
1814 
1815 	case SO_LINGER:
1816 		lv		= sizeof(v.ling);
1817 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1818 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1819 		break;
1820 
1821 	case SO_BSDCOMPAT:
1822 		break;
1823 
1824 	case SO_TIMESTAMP_OLD:
1825 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1826 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1827 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1828 		break;
1829 
1830 	case SO_TIMESTAMPNS_OLD:
1831 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1832 		break;
1833 
1834 	case SO_TIMESTAMP_NEW:
1835 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1836 		break;
1837 
1838 	case SO_TIMESTAMPNS_NEW:
1839 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1840 		break;
1841 
1842 	case SO_TIMESTAMPING_OLD:
1843 	case SO_TIMESTAMPING_NEW:
1844 		lv = sizeof(v.timestamping);
1845 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1846 		 * returning the flags when they were set through the same option.
1847 		 * Don't change the beviour for the old case SO_TIMESTAMPING_OLD.
1848 		 */
1849 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1850 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1851 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1852 		}
1853 		break;
1854 
1855 	case SO_RCVTIMEO_OLD:
1856 	case SO_RCVTIMEO_NEW:
1857 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1858 				      SO_RCVTIMEO_OLD == optname);
1859 		break;
1860 
1861 	case SO_SNDTIMEO_OLD:
1862 	case SO_SNDTIMEO_NEW:
1863 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1864 				      SO_SNDTIMEO_OLD == optname);
1865 		break;
1866 
1867 	case SO_RCVLOWAT:
1868 		v.val = READ_ONCE(sk->sk_rcvlowat);
1869 		break;
1870 
1871 	case SO_SNDLOWAT:
1872 		v.val = 1;
1873 		break;
1874 
1875 	case SO_PASSCRED:
1876 		if (!sk_may_scm_recv(sk))
1877 			return -EOPNOTSUPP;
1878 
1879 		v.val = sk->sk_scm_credentials;
1880 		break;
1881 
1882 	case SO_PASSPIDFD:
1883 		if (!sk_is_unix(sk))
1884 			return -EOPNOTSUPP;
1885 
1886 		v.val = sk->sk_scm_pidfd;
1887 		break;
1888 
1889 	case SO_PASSRIGHTS:
1890 		if (!sk_is_unix(sk))
1891 			return -EOPNOTSUPP;
1892 
1893 		v.val = sk->sk_scm_rights;
1894 		break;
1895 
1896 	case SO_PEERCRED:
1897 	{
1898 		struct ucred peercred;
1899 		if (len > sizeof(peercred))
1900 			len = sizeof(peercred);
1901 
1902 		spin_lock(&sk->sk_peer_lock);
1903 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1904 		spin_unlock(&sk->sk_peer_lock);
1905 
1906 		if (copy_to_sockptr(optval, &peercred, len))
1907 			return -EFAULT;
1908 		goto lenout;
1909 	}
1910 
1911 	case SO_PEERPIDFD:
1912 	{
1913 		struct pid *peer_pid;
1914 		struct file *pidfd_file = NULL;
1915 		int pidfd;
1916 
1917 		if (len > sizeof(pidfd))
1918 			len = sizeof(pidfd);
1919 
1920 		spin_lock(&sk->sk_peer_lock);
1921 		peer_pid = get_pid(sk->sk_peer_pid);
1922 		spin_unlock(&sk->sk_peer_lock);
1923 
1924 		if (!peer_pid)
1925 			return -ENODATA;
1926 
1927 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1928 		put_pid(peer_pid);
1929 		if (pidfd < 0)
1930 			return pidfd;
1931 
1932 		if (copy_to_sockptr(optval, &pidfd, len) ||
1933 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1934 			put_unused_fd(pidfd);
1935 			fput(pidfd_file);
1936 
1937 			return -EFAULT;
1938 		}
1939 
1940 		fd_install(pidfd, pidfd_file);
1941 		return 0;
1942 	}
1943 
1944 	case SO_PEERGROUPS:
1945 	{
1946 		const struct cred *cred;
1947 		int ret, n;
1948 
1949 		cred = sk_get_peer_cred(sk);
1950 		if (!cred)
1951 			return -ENODATA;
1952 
1953 		n = cred->group_info->ngroups;
1954 		if (len < n * sizeof(gid_t)) {
1955 			len = n * sizeof(gid_t);
1956 			put_cred(cred);
1957 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1958 		}
1959 		len = n * sizeof(gid_t);
1960 
1961 		ret = groups_to_user(optval, cred->group_info);
1962 		put_cred(cred);
1963 		if (ret)
1964 			return ret;
1965 		goto lenout;
1966 	}
1967 
1968 	case SO_PEERNAME:
1969 	{
1970 		struct sockaddr_storage address;
1971 
1972 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1973 		if (lv < 0)
1974 			return -ENOTCONN;
1975 		if (lv < len)
1976 			return -EINVAL;
1977 		if (copy_to_sockptr(optval, &address, len))
1978 			return -EFAULT;
1979 		goto lenout;
1980 	}
1981 
1982 	/* Dubious BSD thing... Probably nobody even uses it, but
1983 	 * the UNIX standard wants it for whatever reason... -DaveM
1984 	 */
1985 	case SO_ACCEPTCONN:
1986 		v.val = sk->sk_state == TCP_LISTEN;
1987 		break;
1988 
1989 	case SO_PASSSEC:
1990 		if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
1991 			return -EOPNOTSUPP;
1992 
1993 		v.val = sk->sk_scm_security;
1994 		break;
1995 
1996 	case SO_PEERSEC:
1997 		return security_socket_getpeersec_stream(sock,
1998 							 optval, optlen, len);
1999 
2000 	case SO_MARK:
2001 		v.val = READ_ONCE(sk->sk_mark);
2002 		break;
2003 
2004 	case SO_RCVMARK:
2005 		v.val = sock_flag(sk, SOCK_RCVMARK);
2006 		break;
2007 
2008 	case SO_RCVPRIORITY:
2009 		v.val = sock_flag(sk, SOCK_RCVPRIORITY);
2010 		break;
2011 
2012 	case SO_RXQ_OVFL:
2013 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
2014 		break;
2015 
2016 	case SO_WIFI_STATUS:
2017 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
2018 		break;
2019 
2020 	case SO_PEEK_OFF:
2021 		if (!READ_ONCE(sock->ops)->set_peek_off)
2022 			return -EOPNOTSUPP;
2023 
2024 		v.val = READ_ONCE(sk->sk_peek_off);
2025 		break;
2026 	case SO_NOFCS:
2027 		v.val = sock_flag(sk, SOCK_NOFCS);
2028 		break;
2029 
2030 	case SO_BINDTODEVICE:
2031 		return sock_getbindtodevice(sk, optval, optlen, len);
2032 
2033 	case SO_GET_FILTER:
2034 		len = sk_get_filter(sk, optval, len);
2035 		if (len < 0)
2036 			return len;
2037 
2038 		goto lenout;
2039 
2040 	case SO_LOCK_FILTER:
2041 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
2042 		break;
2043 
2044 	case SO_BPF_EXTENSIONS:
2045 		v.val = bpf_tell_extensions();
2046 		break;
2047 
2048 	case SO_SELECT_ERR_QUEUE:
2049 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
2050 		break;
2051 
2052 #ifdef CONFIG_NET_RX_BUSY_POLL
2053 	case SO_BUSY_POLL:
2054 		v.val = READ_ONCE(sk->sk_ll_usec);
2055 		break;
2056 	case SO_PREFER_BUSY_POLL:
2057 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
2058 		break;
2059 #endif
2060 
2061 	case SO_MAX_PACING_RATE:
2062 		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
2063 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2064 			lv = sizeof(v.ulval);
2065 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2066 		} else {
2067 			/* 32bit version */
2068 			v.val = min_t(unsigned long, ~0U,
2069 				      READ_ONCE(sk->sk_max_pacing_rate));
2070 		}
2071 		break;
2072 
2073 	case SO_INCOMING_CPU:
2074 		v.val = READ_ONCE(sk->sk_incoming_cpu);
2075 		break;
2076 
2077 	case SO_MEMINFO:
2078 	{
2079 		u32 meminfo[SK_MEMINFO_VARS];
2080 
2081 		sk_get_meminfo(sk, meminfo);
2082 
2083 		len = min_t(unsigned int, len, sizeof(meminfo));
2084 		if (copy_to_sockptr(optval, &meminfo, len))
2085 			return -EFAULT;
2086 
2087 		goto lenout;
2088 	}
2089 
2090 #ifdef CONFIG_NET_RX_BUSY_POLL
2091 	case SO_INCOMING_NAPI_ID:
2092 		v.val = READ_ONCE(sk->sk_napi_id);
2093 
2094 		/* aggregate non-NAPI IDs down to 0 */
2095 		if (!napi_id_valid(v.val))
2096 			v.val = 0;
2097 
2098 		break;
2099 #endif
2100 
2101 	case SO_COOKIE:
2102 		lv = sizeof(u64);
2103 		if (len < lv)
2104 			return -EINVAL;
2105 		v.val64 = sock_gen_cookie(sk);
2106 		break;
2107 
2108 	case SO_ZEROCOPY:
2109 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
2110 		break;
2111 
2112 	case SO_TXTIME:
2113 		lv = sizeof(v.txtime);
2114 		v.txtime.clockid = sk->sk_clockid;
2115 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2116 				  SOF_TXTIME_DEADLINE_MODE : 0;
2117 		v.txtime.flags |= sk->sk_txtime_report_errors ?
2118 				  SOF_TXTIME_REPORT_ERRORS : 0;
2119 		break;
2120 
2121 	case SO_BINDTOIFINDEX:
2122 		v.val = READ_ONCE(sk->sk_bound_dev_if);
2123 		break;
2124 
2125 	case SO_NETNS_COOKIE:
2126 		lv = sizeof(u64);
2127 		if (len != lv)
2128 			return -EINVAL;
2129 		v.val64 = sock_net(sk)->net_cookie;
2130 		break;
2131 
2132 	case SO_BUF_LOCK:
2133 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2134 		break;
2135 
2136 	case SO_RESERVE_MEM:
2137 		v.val = READ_ONCE(sk->sk_reserved_mem);
2138 		break;
2139 
2140 	case SO_TXREHASH:
2141 		if (!sk_is_tcp(sk))
2142 			return -EOPNOTSUPP;
2143 
2144 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2145 		v.val = READ_ONCE(sk->sk_txrehash);
2146 		break;
2147 
2148 	default:
2149 		/* We implement the SO_SNDLOWAT etc to not be settable
2150 		 * (1003.1g 7).
2151 		 */
2152 		return -ENOPROTOOPT;
2153 	}
2154 
2155 	if (len > lv)
2156 		len = lv;
2157 	if (copy_to_sockptr(optval, &v, len))
2158 		return -EFAULT;
2159 lenout:
2160 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2161 		return -EFAULT;
2162 	return 0;
2163 }
2164 
2165 /*
2166  * Initialize an sk_lock.
2167  *
2168  * (We also register the sk_lock with the lock validator.)
2169  */
2170 static inline void sock_lock_init(struct sock *sk)
2171 {
2172 	sk_owner_clear(sk);
2173 
2174 	if (sk->sk_kern_sock)
2175 		sock_lock_init_class_and_name(
2176 			sk,
2177 			af_family_kern_slock_key_strings[sk->sk_family],
2178 			af_family_kern_slock_keys + sk->sk_family,
2179 			af_family_kern_key_strings[sk->sk_family],
2180 			af_family_kern_keys + sk->sk_family);
2181 	else
2182 		sock_lock_init_class_and_name(
2183 			sk,
2184 			af_family_slock_key_strings[sk->sk_family],
2185 			af_family_slock_keys + sk->sk_family,
2186 			af_family_key_strings[sk->sk_family],
2187 			af_family_keys + sk->sk_family);
2188 }
2189 
2190 /*
2191  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2192  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2193  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2194  */
2195 static void sock_copy(struct sock *nsk, const struct sock *osk)
2196 {
2197 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2198 #ifdef CONFIG_SECURITY_NETWORK
2199 	void *sptr = nsk->sk_security;
2200 #endif
2201 
2202 	/* If we move sk_tx_queue_mapping out of the private section,
2203 	 * we must check if sk_tx_queue_clear() is called after
2204 	 * sock_copy() in sk_clone_lock().
2205 	 */
2206 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2207 		     offsetof(struct sock, sk_dontcopy_begin) ||
2208 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2209 		     offsetof(struct sock, sk_dontcopy_end));
2210 
2211 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2212 
2213 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2214 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2215 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2216 
2217 #ifdef CONFIG_SECURITY_NETWORK
2218 	nsk->sk_security = sptr;
2219 	security_sk_clone(osk, nsk);
2220 #endif
2221 }
2222 
2223 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2224 		int family)
2225 {
2226 	struct sock *sk;
2227 	struct kmem_cache *slab;
2228 
2229 	slab = prot->slab;
2230 	if (slab != NULL) {
2231 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2232 		if (!sk)
2233 			return sk;
2234 		if (want_init_on_alloc(priority))
2235 			sk_prot_clear_nulls(sk, prot->obj_size);
2236 	} else
2237 		sk = kmalloc(prot->obj_size, priority);
2238 
2239 	if (sk != NULL) {
2240 		if (security_sk_alloc(sk, family, priority))
2241 			goto out_free;
2242 
2243 		if (!try_module_get(prot->owner))
2244 			goto out_free_sec;
2245 	}
2246 
2247 	return sk;
2248 
2249 out_free_sec:
2250 	security_sk_free(sk);
2251 out_free:
2252 	if (slab != NULL)
2253 		kmem_cache_free(slab, sk);
2254 	else
2255 		kfree(sk);
2256 	return NULL;
2257 }
2258 
2259 static void sk_prot_free(struct proto *prot, struct sock *sk)
2260 {
2261 	struct kmem_cache *slab;
2262 	struct module *owner;
2263 
2264 	owner = prot->owner;
2265 	slab = prot->slab;
2266 
2267 	cgroup_sk_free(&sk->sk_cgrp_data);
2268 	mem_cgroup_sk_free(sk);
2269 	security_sk_free(sk);
2270 
2271 	sk_owner_put(sk);
2272 
2273 	if (slab != NULL)
2274 		kmem_cache_free(slab, sk);
2275 	else
2276 		kfree(sk);
2277 	module_put(owner);
2278 }
2279 
2280 /**
2281  *	sk_alloc - All socket objects are allocated here
2282  *	@net: the applicable net namespace
2283  *	@family: protocol family
2284  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2285  *	@prot: struct proto associated with this new sock instance
2286  *	@kern: is this to be a kernel socket?
2287  */
2288 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2289 		      struct proto *prot, int kern)
2290 {
2291 	struct sock *sk;
2292 
2293 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2294 	if (sk) {
2295 		sk->sk_family = family;
2296 		/*
2297 		 * See comment in struct sock definition to understand
2298 		 * why we need sk_prot_creator -acme
2299 		 */
2300 		sk->sk_prot = sk->sk_prot_creator = prot;
2301 		sk->sk_kern_sock = kern;
2302 		sock_lock_init(sk);
2303 		sk->sk_net_refcnt = kern ? 0 : 1;
2304 		if (likely(sk->sk_net_refcnt)) {
2305 			get_net_track(net, &sk->ns_tracker, priority);
2306 			sock_inuse_add(net, 1);
2307 		} else {
2308 			net_passive_inc(net);
2309 			__netns_tracker_alloc(net, &sk->ns_tracker,
2310 					      false, priority);
2311 		}
2312 
2313 		sock_net_set(sk, net);
2314 		refcount_set(&sk->sk_wmem_alloc, 1);
2315 
2316 		mem_cgroup_sk_alloc(sk);
2317 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2318 		sock_update_classid(&sk->sk_cgrp_data);
2319 		sock_update_netprioidx(&sk->sk_cgrp_data);
2320 		sk_tx_queue_clear(sk);
2321 	}
2322 
2323 	return sk;
2324 }
2325 EXPORT_SYMBOL(sk_alloc);
2326 
2327 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2328  * grace period. This is the case for UDP sockets and TCP listeners.
2329  */
2330 static void __sk_destruct(struct rcu_head *head)
2331 {
2332 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2333 	struct net *net = sock_net(sk);
2334 	struct sk_filter *filter;
2335 
2336 	if (sk->sk_destruct)
2337 		sk->sk_destruct(sk);
2338 
2339 	filter = rcu_dereference_check(sk->sk_filter,
2340 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2341 	if (filter) {
2342 		sk_filter_uncharge(sk, filter);
2343 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2344 	}
2345 
2346 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2347 
2348 #ifdef CONFIG_BPF_SYSCALL
2349 	bpf_sk_storage_free(sk);
2350 #endif
2351 
2352 	if (atomic_read(&sk->sk_omem_alloc))
2353 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2354 			 __func__, atomic_read(&sk->sk_omem_alloc));
2355 
2356 	if (sk->sk_frag.page) {
2357 		put_page(sk->sk_frag.page);
2358 		sk->sk_frag.page = NULL;
2359 	}
2360 
2361 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2362 	put_cred(sk->sk_peer_cred);
2363 	put_pid(sk->sk_peer_pid);
2364 
2365 	if (likely(sk->sk_net_refcnt)) {
2366 		put_net_track(net, &sk->ns_tracker);
2367 	} else {
2368 		__netns_tracker_free(net, &sk->ns_tracker, false);
2369 		net_passive_dec(net);
2370 	}
2371 	sk_prot_free(sk->sk_prot_creator, sk);
2372 }
2373 
2374 void sk_net_refcnt_upgrade(struct sock *sk)
2375 {
2376 	struct net *net = sock_net(sk);
2377 
2378 	WARN_ON_ONCE(sk->sk_net_refcnt);
2379 	__netns_tracker_free(net, &sk->ns_tracker, false);
2380 	net_passive_dec(net);
2381 	sk->sk_net_refcnt = 1;
2382 	get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
2383 	sock_inuse_add(net, 1);
2384 }
2385 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
2386 
2387 void sk_destruct(struct sock *sk)
2388 {
2389 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2390 
2391 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2392 		reuseport_detach_sock(sk);
2393 		use_call_rcu = true;
2394 	}
2395 
2396 	if (use_call_rcu)
2397 		call_rcu(&sk->sk_rcu, __sk_destruct);
2398 	else
2399 		__sk_destruct(&sk->sk_rcu);
2400 }
2401 
2402 static void __sk_free(struct sock *sk)
2403 {
2404 	if (likely(sk->sk_net_refcnt))
2405 		sock_inuse_add(sock_net(sk), -1);
2406 
2407 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2408 		sock_diag_broadcast_destroy(sk);
2409 	else
2410 		sk_destruct(sk);
2411 }
2412 
2413 void sk_free(struct sock *sk)
2414 {
2415 	/*
2416 	 * We subtract one from sk_wmem_alloc and can know if
2417 	 * some packets are still in some tx queue.
2418 	 * If not null, sock_wfree() will call __sk_free(sk) later
2419 	 */
2420 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2421 		__sk_free(sk);
2422 }
2423 EXPORT_SYMBOL(sk_free);
2424 
2425 static void sk_init_common(struct sock *sk)
2426 {
2427 	skb_queue_head_init(&sk->sk_receive_queue);
2428 	skb_queue_head_init(&sk->sk_write_queue);
2429 	skb_queue_head_init(&sk->sk_error_queue);
2430 
2431 	rwlock_init(&sk->sk_callback_lock);
2432 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2433 			af_rlock_keys + sk->sk_family,
2434 			af_family_rlock_key_strings[sk->sk_family]);
2435 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2436 			af_wlock_keys + sk->sk_family,
2437 			af_family_wlock_key_strings[sk->sk_family]);
2438 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2439 			af_elock_keys + sk->sk_family,
2440 			af_family_elock_key_strings[sk->sk_family]);
2441 	if (sk->sk_kern_sock)
2442 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2443 			af_kern_callback_keys + sk->sk_family,
2444 			af_family_kern_clock_key_strings[sk->sk_family]);
2445 	else
2446 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2447 			af_callback_keys + sk->sk_family,
2448 			af_family_clock_key_strings[sk->sk_family]);
2449 }
2450 
2451 /**
2452  *	sk_clone_lock - clone a socket, and lock its clone
2453  *	@sk: the socket to clone
2454  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2455  *
2456  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2457  */
2458 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2459 {
2460 	struct proto *prot = READ_ONCE(sk->sk_prot);
2461 	struct sk_filter *filter;
2462 	bool is_charged = true;
2463 	struct sock *newsk;
2464 
2465 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2466 	if (!newsk)
2467 		goto out;
2468 
2469 	sock_copy(newsk, sk);
2470 
2471 	newsk->sk_prot_creator = prot;
2472 
2473 	/* SANITY */
2474 	if (likely(newsk->sk_net_refcnt)) {
2475 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2476 		sock_inuse_add(sock_net(newsk), 1);
2477 	} else {
2478 		/* Kernel sockets are not elevating the struct net refcount.
2479 		 * Instead, use a tracker to more easily detect if a layer
2480 		 * is not properly dismantling its kernel sockets at netns
2481 		 * destroy time.
2482 		 */
2483 		net_passive_inc(sock_net(newsk));
2484 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2485 				      false, priority);
2486 	}
2487 	sk_node_init(&newsk->sk_node);
2488 	sock_lock_init(newsk);
2489 	bh_lock_sock(newsk);
2490 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2491 	newsk->sk_backlog.len = 0;
2492 
2493 	atomic_set(&newsk->sk_rmem_alloc, 0);
2494 
2495 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2496 	refcount_set(&newsk->sk_wmem_alloc, 1);
2497 
2498 	atomic_set(&newsk->sk_omem_alloc, 0);
2499 	sk_init_common(newsk);
2500 
2501 	newsk->sk_dst_cache	= NULL;
2502 	newsk->sk_dst_pending_confirm = 0;
2503 	newsk->sk_wmem_queued	= 0;
2504 	newsk->sk_forward_alloc = 0;
2505 	newsk->sk_reserved_mem  = 0;
2506 	atomic_set(&newsk->sk_drops, 0);
2507 	newsk->sk_send_head	= NULL;
2508 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2509 	atomic_set(&newsk->sk_zckey, 0);
2510 
2511 	sock_reset_flag(newsk, SOCK_DONE);
2512 
2513 	/* sk->sk_memcg will be populated at accept() time */
2514 	newsk->sk_memcg = NULL;
2515 
2516 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2517 
2518 	rcu_read_lock();
2519 	filter = rcu_dereference(sk->sk_filter);
2520 	if (filter != NULL)
2521 		/* though it's an empty new sock, the charging may fail
2522 		 * if sysctl_optmem_max was changed between creation of
2523 		 * original socket and cloning
2524 		 */
2525 		is_charged = sk_filter_charge(newsk, filter);
2526 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2527 	rcu_read_unlock();
2528 
2529 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2530 		/* We need to make sure that we don't uncharge the new
2531 		 * socket if we couldn't charge it in the first place
2532 		 * as otherwise we uncharge the parent's filter.
2533 		 */
2534 		if (!is_charged)
2535 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2536 
2537 		goto free;
2538 	}
2539 
2540 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2541 
2542 	if (bpf_sk_storage_clone(sk, newsk))
2543 		goto free;
2544 
2545 	/* Clear sk_user_data if parent had the pointer tagged
2546 	 * as not suitable for copying when cloning.
2547 	 */
2548 	if (sk_user_data_is_nocopy(newsk))
2549 		newsk->sk_user_data = NULL;
2550 
2551 	newsk->sk_err	   = 0;
2552 	newsk->sk_err_soft = 0;
2553 	newsk->sk_priority = 0;
2554 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2555 
2556 	/* Before updating sk_refcnt, we must commit prior changes to memory
2557 	 * (Documentation/RCU/rculist_nulls.rst for details)
2558 	 */
2559 	smp_wmb();
2560 	refcount_set(&newsk->sk_refcnt, 2);
2561 
2562 	sk_set_socket(newsk, NULL);
2563 	sk_tx_queue_clear(newsk);
2564 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2565 
2566 	if (newsk->sk_prot->sockets_allocated)
2567 		sk_sockets_allocated_inc(newsk);
2568 
2569 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2570 		net_enable_timestamp();
2571 out:
2572 	return newsk;
2573 free:
2574 	/* It is still raw copy of parent, so invalidate
2575 	 * destructor and make plain sk_free()
2576 	 */
2577 	newsk->sk_destruct = NULL;
2578 	bh_unlock_sock(newsk);
2579 	sk_free(newsk);
2580 	newsk = NULL;
2581 	goto out;
2582 }
2583 EXPORT_SYMBOL_GPL(sk_clone_lock);
2584 
2585 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2586 {
2587 	bool is_ipv6 = false;
2588 	u32 max_size;
2589 
2590 #if IS_ENABLED(CONFIG_IPV6)
2591 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2592 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2593 #endif
2594 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2595 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2596 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2597 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2598 		max_size = GSO_LEGACY_MAX_SIZE;
2599 
2600 	return max_size - (MAX_TCP_HEADER + 1);
2601 }
2602 
2603 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2604 {
2605 	u32 max_segs = 1;
2606 
2607 	sk->sk_route_caps = dst->dev->features;
2608 	if (sk_is_tcp(sk)) {
2609 		struct inet_connection_sock *icsk = inet_csk(sk);
2610 
2611 		sk->sk_route_caps |= NETIF_F_GSO;
2612 		icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
2613 	}
2614 	if (sk->sk_route_caps & NETIF_F_GSO)
2615 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2616 	if (unlikely(sk->sk_gso_disabled))
2617 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2618 	if (sk_can_gso(sk)) {
2619 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2620 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2621 		} else {
2622 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2623 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2624 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2625 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2626 		}
2627 	}
2628 	sk->sk_gso_max_segs = max_segs;
2629 	sk_dst_set(sk, dst);
2630 }
2631 EXPORT_SYMBOL_GPL(sk_setup_caps);
2632 
2633 /*
2634  *	Simple resource managers for sockets.
2635  */
2636 
2637 
2638 /*
2639  * Write buffer destructor automatically called from kfree_skb.
2640  */
2641 void sock_wfree(struct sk_buff *skb)
2642 {
2643 	struct sock *sk = skb->sk;
2644 	unsigned int len = skb->truesize;
2645 	bool free;
2646 
2647 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2648 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2649 		    sk->sk_write_space == sock_def_write_space) {
2650 			rcu_read_lock();
2651 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2652 			sock_def_write_space_wfree(sk);
2653 			rcu_read_unlock();
2654 			if (unlikely(free))
2655 				__sk_free(sk);
2656 			return;
2657 		}
2658 
2659 		/*
2660 		 * Keep a reference on sk_wmem_alloc, this will be released
2661 		 * after sk_write_space() call
2662 		 */
2663 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2664 		sk->sk_write_space(sk);
2665 		len = 1;
2666 	}
2667 	/*
2668 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2669 	 * could not do because of in-flight packets
2670 	 */
2671 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2672 		__sk_free(sk);
2673 }
2674 EXPORT_SYMBOL(sock_wfree);
2675 
2676 /* This variant of sock_wfree() is used by TCP,
2677  * since it sets SOCK_USE_WRITE_QUEUE.
2678  */
2679 void __sock_wfree(struct sk_buff *skb)
2680 {
2681 	struct sock *sk = skb->sk;
2682 
2683 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2684 		__sk_free(sk);
2685 }
2686 
2687 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2688 {
2689 	skb_orphan(skb);
2690 #ifdef CONFIG_INET
2691 	if (unlikely(!sk_fullsock(sk)))
2692 		return skb_set_owner_edemux(skb, sk);
2693 #endif
2694 	skb->sk = sk;
2695 	skb->destructor = sock_wfree;
2696 	skb_set_hash_from_sk(skb, sk);
2697 	/*
2698 	 * We used to take a refcount on sk, but following operation
2699 	 * is enough to guarantee sk_free() won't free this sock until
2700 	 * all in-flight packets are completed
2701 	 */
2702 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2703 }
2704 EXPORT_SYMBOL(skb_set_owner_w);
2705 
2706 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2707 {
2708 	/* Drivers depend on in-order delivery for crypto offload,
2709 	 * partial orphan breaks out-of-order-OK logic.
2710 	 */
2711 	if (skb_is_decrypted(skb))
2712 		return false;
2713 
2714 	return (skb->destructor == sock_wfree ||
2715 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2716 }
2717 
2718 /* This helper is used by netem, as it can hold packets in its
2719  * delay queue. We want to allow the owner socket to send more
2720  * packets, as if they were already TX completed by a typical driver.
2721  * But we also want to keep skb->sk set because some packet schedulers
2722  * rely on it (sch_fq for example).
2723  */
2724 void skb_orphan_partial(struct sk_buff *skb)
2725 {
2726 	if (skb_is_tcp_pure_ack(skb))
2727 		return;
2728 
2729 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2730 		return;
2731 
2732 	skb_orphan(skb);
2733 }
2734 EXPORT_SYMBOL(skb_orphan_partial);
2735 
2736 /*
2737  * Read buffer destructor automatically called from kfree_skb.
2738  */
2739 void sock_rfree(struct sk_buff *skb)
2740 {
2741 	struct sock *sk = skb->sk;
2742 	unsigned int len = skb->truesize;
2743 
2744 	atomic_sub(len, &sk->sk_rmem_alloc);
2745 	sk_mem_uncharge(sk, len);
2746 }
2747 EXPORT_SYMBOL(sock_rfree);
2748 
2749 /*
2750  * Buffer destructor for skbs that are not used directly in read or write
2751  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2752  */
2753 void sock_efree(struct sk_buff *skb)
2754 {
2755 	sock_put(skb->sk);
2756 }
2757 EXPORT_SYMBOL(sock_efree);
2758 
2759 /* Buffer destructor for prefetch/receive path where reference count may
2760  * not be held, e.g. for listen sockets.
2761  */
2762 #ifdef CONFIG_INET
2763 void sock_pfree(struct sk_buff *skb)
2764 {
2765 	struct sock *sk = skb->sk;
2766 
2767 	if (!sk_is_refcounted(sk))
2768 		return;
2769 
2770 	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2771 		inet_reqsk(sk)->rsk_listener = NULL;
2772 		reqsk_free(inet_reqsk(sk));
2773 		return;
2774 	}
2775 
2776 	sock_gen_put(sk);
2777 }
2778 EXPORT_SYMBOL(sock_pfree);
2779 #endif /* CONFIG_INET */
2780 
2781 kuid_t sock_i_uid(struct sock *sk)
2782 {
2783 	kuid_t uid;
2784 
2785 	read_lock_bh(&sk->sk_callback_lock);
2786 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2787 	read_unlock_bh(&sk->sk_callback_lock);
2788 	return uid;
2789 }
2790 EXPORT_SYMBOL(sock_i_uid);
2791 
2792 unsigned long __sock_i_ino(struct sock *sk)
2793 {
2794 	unsigned long ino;
2795 
2796 	read_lock(&sk->sk_callback_lock);
2797 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2798 	read_unlock(&sk->sk_callback_lock);
2799 	return ino;
2800 }
2801 EXPORT_SYMBOL(__sock_i_ino);
2802 
2803 unsigned long sock_i_ino(struct sock *sk)
2804 {
2805 	unsigned long ino;
2806 
2807 	local_bh_disable();
2808 	ino = __sock_i_ino(sk);
2809 	local_bh_enable();
2810 	return ino;
2811 }
2812 EXPORT_SYMBOL(sock_i_ino);
2813 
2814 /*
2815  * Allocate a skb from the socket's send buffer.
2816  */
2817 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2818 			     gfp_t priority)
2819 {
2820 	if (force ||
2821 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2822 		struct sk_buff *skb = alloc_skb(size, priority);
2823 
2824 		if (skb) {
2825 			skb_set_owner_w(skb, sk);
2826 			return skb;
2827 		}
2828 	}
2829 	return NULL;
2830 }
2831 EXPORT_SYMBOL(sock_wmalloc);
2832 
2833 static void sock_ofree(struct sk_buff *skb)
2834 {
2835 	struct sock *sk = skb->sk;
2836 
2837 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2838 }
2839 
2840 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2841 			     gfp_t priority)
2842 {
2843 	struct sk_buff *skb;
2844 
2845 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2846 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2847 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2848 		return NULL;
2849 
2850 	skb = alloc_skb(size, priority);
2851 	if (!skb)
2852 		return NULL;
2853 
2854 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2855 	skb->sk = sk;
2856 	skb->destructor = sock_ofree;
2857 	return skb;
2858 }
2859 
2860 /*
2861  * Allocate a memory block from the socket's option memory buffer.
2862  */
2863 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2864 {
2865 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2866 
2867 	if ((unsigned int)size <= optmem_max &&
2868 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2869 		void *mem;
2870 		/* First do the add, to avoid the race if kmalloc
2871 		 * might sleep.
2872 		 */
2873 		atomic_add(size, &sk->sk_omem_alloc);
2874 		mem = kmalloc(size, priority);
2875 		if (mem)
2876 			return mem;
2877 		atomic_sub(size, &sk->sk_omem_alloc);
2878 	}
2879 	return NULL;
2880 }
2881 EXPORT_SYMBOL(sock_kmalloc);
2882 
2883 /*
2884  * Duplicate the input "src" memory block using the socket's
2885  * option memory buffer.
2886  */
2887 void *sock_kmemdup(struct sock *sk, const void *src,
2888 		   int size, gfp_t priority)
2889 {
2890 	void *mem;
2891 
2892 	mem = sock_kmalloc(sk, size, priority);
2893 	if (mem)
2894 		memcpy(mem, src, size);
2895 	return mem;
2896 }
2897 EXPORT_SYMBOL(sock_kmemdup);
2898 
2899 /* Free an option memory block. Note, we actually want the inline
2900  * here as this allows gcc to detect the nullify and fold away the
2901  * condition entirely.
2902  */
2903 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2904 				  const bool nullify)
2905 {
2906 	if (WARN_ON_ONCE(!mem))
2907 		return;
2908 	if (nullify)
2909 		kfree_sensitive(mem);
2910 	else
2911 		kfree(mem);
2912 	atomic_sub(size, &sk->sk_omem_alloc);
2913 }
2914 
2915 void sock_kfree_s(struct sock *sk, void *mem, int size)
2916 {
2917 	__sock_kfree_s(sk, mem, size, false);
2918 }
2919 EXPORT_SYMBOL(sock_kfree_s);
2920 
2921 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2922 {
2923 	__sock_kfree_s(sk, mem, size, true);
2924 }
2925 EXPORT_SYMBOL(sock_kzfree_s);
2926 
2927 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2928    I think, these locks should be removed for datagram sockets.
2929  */
2930 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2931 {
2932 	DEFINE_WAIT(wait);
2933 
2934 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2935 	for (;;) {
2936 		if (!timeo)
2937 			break;
2938 		if (signal_pending(current))
2939 			break;
2940 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2941 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2942 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2943 			break;
2944 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2945 			break;
2946 		if (READ_ONCE(sk->sk_err))
2947 			break;
2948 		timeo = schedule_timeout(timeo);
2949 	}
2950 	finish_wait(sk_sleep(sk), &wait);
2951 	return timeo;
2952 }
2953 
2954 
2955 /*
2956  *	Generic send/receive buffer handlers
2957  */
2958 
2959 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2960 				     unsigned long data_len, int noblock,
2961 				     int *errcode, int max_page_order)
2962 {
2963 	struct sk_buff *skb;
2964 	long timeo;
2965 	int err;
2966 
2967 	timeo = sock_sndtimeo(sk, noblock);
2968 	for (;;) {
2969 		err = sock_error(sk);
2970 		if (err != 0)
2971 			goto failure;
2972 
2973 		err = -EPIPE;
2974 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2975 			goto failure;
2976 
2977 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2978 			break;
2979 
2980 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2981 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2982 		err = -EAGAIN;
2983 		if (!timeo)
2984 			goto failure;
2985 		if (signal_pending(current))
2986 			goto interrupted;
2987 		timeo = sock_wait_for_wmem(sk, timeo);
2988 	}
2989 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2990 				   errcode, sk->sk_allocation);
2991 	if (skb)
2992 		skb_set_owner_w(skb, sk);
2993 	return skb;
2994 
2995 interrupted:
2996 	err = sock_intr_errno(timeo);
2997 failure:
2998 	*errcode = err;
2999 	return NULL;
3000 }
3001 EXPORT_SYMBOL(sock_alloc_send_pskb);
3002 
3003 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
3004 		     struct sockcm_cookie *sockc)
3005 {
3006 	u32 tsflags;
3007 
3008 	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
3009 
3010 	switch (cmsg->cmsg_type) {
3011 	case SO_MARK:
3012 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
3013 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3014 			return -EPERM;
3015 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3016 			return -EINVAL;
3017 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
3018 		break;
3019 	case SO_TIMESTAMPING_OLD:
3020 	case SO_TIMESTAMPING_NEW:
3021 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3022 			return -EINVAL;
3023 
3024 		tsflags = *(u32 *)CMSG_DATA(cmsg);
3025 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
3026 			return -EINVAL;
3027 
3028 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
3029 		sockc->tsflags |= tsflags;
3030 		break;
3031 	case SCM_TXTIME:
3032 		if (!sock_flag(sk, SOCK_TXTIME))
3033 			return -EINVAL;
3034 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
3035 			return -EINVAL;
3036 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
3037 		break;
3038 	case SCM_TS_OPT_ID:
3039 		if (sk_is_tcp(sk))
3040 			return -EINVAL;
3041 		tsflags = READ_ONCE(sk->sk_tsflags);
3042 		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
3043 			return -EINVAL;
3044 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3045 			return -EINVAL;
3046 		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
3047 		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
3048 		break;
3049 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
3050 	case SCM_RIGHTS:
3051 	case SCM_CREDENTIALS:
3052 		break;
3053 	case SO_PRIORITY:
3054 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3055 			return -EINVAL;
3056 		if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
3057 			return -EPERM;
3058 		sockc->priority = *(u32 *)CMSG_DATA(cmsg);
3059 		break;
3060 	case SCM_DEVMEM_DMABUF:
3061 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3062 			return -EINVAL;
3063 		sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
3064 		break;
3065 	default:
3066 		return -EINVAL;
3067 	}
3068 	return 0;
3069 }
3070 EXPORT_SYMBOL(__sock_cmsg_send);
3071 
3072 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
3073 		   struct sockcm_cookie *sockc)
3074 {
3075 	struct cmsghdr *cmsg;
3076 	int ret;
3077 
3078 	for_each_cmsghdr(cmsg, msg) {
3079 		if (!CMSG_OK(msg, cmsg))
3080 			return -EINVAL;
3081 		if (cmsg->cmsg_level != SOL_SOCKET)
3082 			continue;
3083 		ret = __sock_cmsg_send(sk, cmsg, sockc);
3084 		if (ret)
3085 			return ret;
3086 	}
3087 	return 0;
3088 }
3089 EXPORT_SYMBOL(sock_cmsg_send);
3090 
3091 static void sk_enter_memory_pressure(struct sock *sk)
3092 {
3093 	if (!sk->sk_prot->enter_memory_pressure)
3094 		return;
3095 
3096 	sk->sk_prot->enter_memory_pressure(sk);
3097 }
3098 
3099 static void sk_leave_memory_pressure(struct sock *sk)
3100 {
3101 	if (sk->sk_prot->leave_memory_pressure) {
3102 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
3103 				     tcp_leave_memory_pressure, sk);
3104 	} else {
3105 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
3106 
3107 		if (memory_pressure && READ_ONCE(*memory_pressure))
3108 			WRITE_ONCE(*memory_pressure, 0);
3109 	}
3110 }
3111 
3112 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
3113 
3114 /**
3115  * skb_page_frag_refill - check that a page_frag contains enough room
3116  * @sz: minimum size of the fragment we want to get
3117  * @pfrag: pointer to page_frag
3118  * @gfp: priority for memory allocation
3119  *
3120  * Note: While this allocator tries to use high order pages, there is
3121  * no guarantee that allocations succeed. Therefore, @sz MUST be
3122  * less or equal than PAGE_SIZE.
3123  */
3124 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3125 {
3126 	if (pfrag->page) {
3127 		if (page_ref_count(pfrag->page) == 1) {
3128 			pfrag->offset = 0;
3129 			return true;
3130 		}
3131 		if (pfrag->offset + sz <= pfrag->size)
3132 			return true;
3133 		put_page(pfrag->page);
3134 	}
3135 
3136 	pfrag->offset = 0;
3137 	if (SKB_FRAG_PAGE_ORDER &&
3138 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3139 		/* Avoid direct reclaim but allow kswapd to wake */
3140 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3141 					  __GFP_COMP | __GFP_NOWARN |
3142 					  __GFP_NORETRY,
3143 					  SKB_FRAG_PAGE_ORDER);
3144 		if (likely(pfrag->page)) {
3145 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3146 			return true;
3147 		}
3148 	}
3149 	pfrag->page = alloc_page(gfp);
3150 	if (likely(pfrag->page)) {
3151 		pfrag->size = PAGE_SIZE;
3152 		return true;
3153 	}
3154 	return false;
3155 }
3156 EXPORT_SYMBOL(skb_page_frag_refill);
3157 
3158 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3159 {
3160 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3161 		return true;
3162 
3163 	sk_enter_memory_pressure(sk);
3164 	sk_stream_moderate_sndbuf(sk);
3165 	return false;
3166 }
3167 EXPORT_SYMBOL(sk_page_frag_refill);
3168 
3169 void __lock_sock(struct sock *sk)
3170 	__releases(&sk->sk_lock.slock)
3171 	__acquires(&sk->sk_lock.slock)
3172 {
3173 	DEFINE_WAIT(wait);
3174 
3175 	for (;;) {
3176 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3177 					TASK_UNINTERRUPTIBLE);
3178 		spin_unlock_bh(&sk->sk_lock.slock);
3179 		schedule();
3180 		spin_lock_bh(&sk->sk_lock.slock);
3181 		if (!sock_owned_by_user(sk))
3182 			break;
3183 	}
3184 	finish_wait(&sk->sk_lock.wq, &wait);
3185 }
3186 
3187 void __release_sock(struct sock *sk)
3188 	__releases(&sk->sk_lock.slock)
3189 	__acquires(&sk->sk_lock.slock)
3190 {
3191 	struct sk_buff *skb, *next;
3192 
3193 	while ((skb = sk->sk_backlog.head) != NULL) {
3194 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3195 
3196 		spin_unlock_bh(&sk->sk_lock.slock);
3197 
3198 		do {
3199 			next = skb->next;
3200 			prefetch(next);
3201 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3202 			skb_mark_not_on_list(skb);
3203 			sk_backlog_rcv(sk, skb);
3204 
3205 			cond_resched();
3206 
3207 			skb = next;
3208 		} while (skb != NULL);
3209 
3210 		spin_lock_bh(&sk->sk_lock.slock);
3211 	}
3212 
3213 	/*
3214 	 * Doing the zeroing here guarantee we can not loop forever
3215 	 * while a wild producer attempts to flood us.
3216 	 */
3217 	sk->sk_backlog.len = 0;
3218 }
3219 
3220 void __sk_flush_backlog(struct sock *sk)
3221 {
3222 	spin_lock_bh(&sk->sk_lock.slock);
3223 	__release_sock(sk);
3224 
3225 	if (sk->sk_prot->release_cb)
3226 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3227 				     tcp_release_cb, sk);
3228 
3229 	spin_unlock_bh(&sk->sk_lock.slock);
3230 }
3231 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3232 
3233 /**
3234  * sk_wait_data - wait for data to arrive at sk_receive_queue
3235  * @sk:    sock to wait on
3236  * @timeo: for how long
3237  * @skb:   last skb seen on sk_receive_queue
3238  *
3239  * Now socket state including sk->sk_err is changed only under lock,
3240  * hence we may omit checks after joining wait queue.
3241  * We check receive queue before schedule() only as optimization;
3242  * it is very likely that release_sock() added new data.
3243  */
3244 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3245 {
3246 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3247 	int rc;
3248 
3249 	add_wait_queue(sk_sleep(sk), &wait);
3250 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3251 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3252 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3253 	remove_wait_queue(sk_sleep(sk), &wait);
3254 	return rc;
3255 }
3256 EXPORT_SYMBOL(sk_wait_data);
3257 
3258 /**
3259  *	__sk_mem_raise_allocated - increase memory_allocated
3260  *	@sk: socket
3261  *	@size: memory size to allocate
3262  *	@amt: pages to allocate
3263  *	@kind: allocation type
3264  *
3265  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3266  *
3267  *	Unlike the globally shared limits among the sockets under same protocol,
3268  *	consuming the budget of a memcg won't have direct effect on other ones.
3269  *	So be optimistic about memcg's tolerance, and leave the callers to decide
3270  *	whether or not to raise allocated through sk_under_memory_pressure() or
3271  *	its variants.
3272  */
3273 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3274 {
3275 	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3276 	struct proto *prot = sk->sk_prot;
3277 	bool charged = false;
3278 	long allocated;
3279 
3280 	sk_memory_allocated_add(sk, amt);
3281 	allocated = sk_memory_allocated(sk);
3282 
3283 	if (memcg) {
3284 		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3285 			goto suppress_allocation;
3286 		charged = true;
3287 	}
3288 
3289 	/* Under limit. */
3290 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3291 		sk_leave_memory_pressure(sk);
3292 		return 1;
3293 	}
3294 
3295 	/* Under pressure. */
3296 	if (allocated > sk_prot_mem_limits(sk, 1))
3297 		sk_enter_memory_pressure(sk);
3298 
3299 	/* Over hard limit. */
3300 	if (allocated > sk_prot_mem_limits(sk, 2))
3301 		goto suppress_allocation;
3302 
3303 	/* Guarantee minimum buffer size under pressure (either global
3304 	 * or memcg) to make sure features described in RFC 7323 (TCP
3305 	 * Extensions for High Performance) work properly.
3306 	 *
3307 	 * This rule does NOT stand when exceeds global or memcg's hard
3308 	 * limit, or else a DoS attack can be taken place by spawning
3309 	 * lots of sockets whose usage are under minimum buffer size.
3310 	 */
3311 	if (kind == SK_MEM_RECV) {
3312 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3313 			return 1;
3314 
3315 	} else { /* SK_MEM_SEND */
3316 		int wmem0 = sk_get_wmem0(sk, prot);
3317 
3318 		if (sk->sk_type == SOCK_STREAM) {
3319 			if (sk->sk_wmem_queued < wmem0)
3320 				return 1;
3321 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3322 				return 1;
3323 		}
3324 	}
3325 
3326 	if (sk_has_memory_pressure(sk)) {
3327 		u64 alloc;
3328 
3329 		/* The following 'average' heuristic is within the
3330 		 * scope of global accounting, so it only makes
3331 		 * sense for global memory pressure.
3332 		 */
3333 		if (!sk_under_global_memory_pressure(sk))
3334 			return 1;
3335 
3336 		/* Try to be fair among all the sockets under global
3337 		 * pressure by allowing the ones that below average
3338 		 * usage to raise.
3339 		 */
3340 		alloc = sk_sockets_allocated_read_positive(sk);
3341 		if (sk_prot_mem_limits(sk, 2) > alloc *
3342 		    sk_mem_pages(sk->sk_wmem_queued +
3343 				 atomic_read(&sk->sk_rmem_alloc) +
3344 				 sk->sk_forward_alloc))
3345 			return 1;
3346 	}
3347 
3348 suppress_allocation:
3349 
3350 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3351 		sk_stream_moderate_sndbuf(sk);
3352 
3353 		/* Fail only if socket is _under_ its sndbuf.
3354 		 * In this case we cannot block, so that we have to fail.
3355 		 */
3356 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3357 			/* Force charge with __GFP_NOFAIL */
3358 			if (memcg && !charged) {
3359 				mem_cgroup_charge_skmem(memcg, amt,
3360 					gfp_memcg_charge() | __GFP_NOFAIL);
3361 			}
3362 			return 1;
3363 		}
3364 	}
3365 
3366 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3367 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3368 
3369 	sk_memory_allocated_sub(sk, amt);
3370 
3371 	if (charged)
3372 		mem_cgroup_uncharge_skmem(memcg, amt);
3373 
3374 	return 0;
3375 }
3376 
3377 /**
3378  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3379  *	@sk: socket
3380  *	@size: memory size to allocate
3381  *	@kind: allocation type
3382  *
3383  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3384  *	rmem allocation. This function assumes that protocols which have
3385  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3386  */
3387 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3388 {
3389 	int ret, amt = sk_mem_pages(size);
3390 
3391 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3392 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3393 	if (!ret)
3394 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3395 	return ret;
3396 }
3397 EXPORT_SYMBOL(__sk_mem_schedule);
3398 
3399 /**
3400  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3401  *	@sk: socket
3402  *	@amount: number of quanta
3403  *
3404  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3405  */
3406 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3407 {
3408 	sk_memory_allocated_sub(sk, amount);
3409 
3410 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3411 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3412 
3413 	if (sk_under_global_memory_pressure(sk) &&
3414 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3415 		sk_leave_memory_pressure(sk);
3416 }
3417 
3418 /**
3419  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3420  *	@sk: socket
3421  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3422  */
3423 void __sk_mem_reclaim(struct sock *sk, int amount)
3424 {
3425 	amount >>= PAGE_SHIFT;
3426 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3427 	__sk_mem_reduce_allocated(sk, amount);
3428 }
3429 EXPORT_SYMBOL(__sk_mem_reclaim);
3430 
3431 int sk_set_peek_off(struct sock *sk, int val)
3432 {
3433 	WRITE_ONCE(sk->sk_peek_off, val);
3434 	return 0;
3435 }
3436 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3437 
3438 /*
3439  * Set of default routines for initialising struct proto_ops when
3440  * the protocol does not support a particular function. In certain
3441  * cases where it makes no sense for a protocol to have a "do nothing"
3442  * function, some default processing is provided.
3443  */
3444 
3445 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3446 {
3447 	return -EOPNOTSUPP;
3448 }
3449 EXPORT_SYMBOL(sock_no_bind);
3450 
3451 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3452 		    int len, int flags)
3453 {
3454 	return -EOPNOTSUPP;
3455 }
3456 EXPORT_SYMBOL(sock_no_connect);
3457 
3458 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3459 {
3460 	return -EOPNOTSUPP;
3461 }
3462 EXPORT_SYMBOL(sock_no_socketpair);
3463 
3464 int sock_no_accept(struct socket *sock, struct socket *newsock,
3465 		   struct proto_accept_arg *arg)
3466 {
3467 	return -EOPNOTSUPP;
3468 }
3469 EXPORT_SYMBOL(sock_no_accept);
3470 
3471 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3472 		    int peer)
3473 {
3474 	return -EOPNOTSUPP;
3475 }
3476 EXPORT_SYMBOL(sock_no_getname);
3477 
3478 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3479 {
3480 	return -EOPNOTSUPP;
3481 }
3482 EXPORT_SYMBOL(sock_no_ioctl);
3483 
3484 int sock_no_listen(struct socket *sock, int backlog)
3485 {
3486 	return -EOPNOTSUPP;
3487 }
3488 EXPORT_SYMBOL(sock_no_listen);
3489 
3490 int sock_no_shutdown(struct socket *sock, int how)
3491 {
3492 	return -EOPNOTSUPP;
3493 }
3494 EXPORT_SYMBOL(sock_no_shutdown);
3495 
3496 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3497 {
3498 	return -EOPNOTSUPP;
3499 }
3500 EXPORT_SYMBOL(sock_no_sendmsg);
3501 
3502 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3503 {
3504 	return -EOPNOTSUPP;
3505 }
3506 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3507 
3508 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3509 		    int flags)
3510 {
3511 	return -EOPNOTSUPP;
3512 }
3513 EXPORT_SYMBOL(sock_no_recvmsg);
3514 
3515 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3516 {
3517 	/* Mirror missing mmap method error code */
3518 	return -ENODEV;
3519 }
3520 EXPORT_SYMBOL(sock_no_mmap);
3521 
3522 /*
3523  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3524  * various sock-based usage counts.
3525  */
3526 void __receive_sock(struct file *file)
3527 {
3528 	struct socket *sock;
3529 
3530 	sock = sock_from_file(file);
3531 	if (sock) {
3532 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3533 		sock_update_classid(&sock->sk->sk_cgrp_data);
3534 	}
3535 }
3536 
3537 /*
3538  *	Default Socket Callbacks
3539  */
3540 
3541 static void sock_def_wakeup(struct sock *sk)
3542 {
3543 	struct socket_wq *wq;
3544 
3545 	rcu_read_lock();
3546 	wq = rcu_dereference(sk->sk_wq);
3547 	if (skwq_has_sleeper(wq))
3548 		wake_up_interruptible_all(&wq->wait);
3549 	rcu_read_unlock();
3550 }
3551 
3552 static void sock_def_error_report(struct sock *sk)
3553 {
3554 	struct socket_wq *wq;
3555 
3556 	rcu_read_lock();
3557 	wq = rcu_dereference(sk->sk_wq);
3558 	if (skwq_has_sleeper(wq))
3559 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3560 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3561 	rcu_read_unlock();
3562 }
3563 
3564 void sock_def_readable(struct sock *sk)
3565 {
3566 	struct socket_wq *wq;
3567 
3568 	trace_sk_data_ready(sk);
3569 
3570 	rcu_read_lock();
3571 	wq = rcu_dereference(sk->sk_wq);
3572 	if (skwq_has_sleeper(wq))
3573 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3574 						EPOLLRDNORM | EPOLLRDBAND);
3575 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3576 	rcu_read_unlock();
3577 }
3578 
3579 static void sock_def_write_space(struct sock *sk)
3580 {
3581 	struct socket_wq *wq;
3582 
3583 	rcu_read_lock();
3584 
3585 	/* Do not wake up a writer until he can make "significant"
3586 	 * progress.  --DaveM
3587 	 */
3588 	if (sock_writeable(sk)) {
3589 		wq = rcu_dereference(sk->sk_wq);
3590 		if (skwq_has_sleeper(wq))
3591 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3592 						EPOLLWRNORM | EPOLLWRBAND);
3593 
3594 		/* Should agree with poll, otherwise some programs break */
3595 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3596 	}
3597 
3598 	rcu_read_unlock();
3599 }
3600 
3601 /* An optimised version of sock_def_write_space(), should only be called
3602  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3603  * ->sk_wmem_alloc.
3604  */
3605 static void sock_def_write_space_wfree(struct sock *sk)
3606 {
3607 	/* Do not wake up a writer until he can make "significant"
3608 	 * progress.  --DaveM
3609 	 */
3610 	if (sock_writeable(sk)) {
3611 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3612 
3613 		/* rely on refcount_sub from sock_wfree() */
3614 		smp_mb__after_atomic();
3615 		if (wq && waitqueue_active(&wq->wait))
3616 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3617 						EPOLLWRNORM | EPOLLWRBAND);
3618 
3619 		/* Should agree with poll, otherwise some programs break */
3620 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3621 	}
3622 }
3623 
3624 static void sock_def_destruct(struct sock *sk)
3625 {
3626 }
3627 
3628 void sk_send_sigurg(struct sock *sk)
3629 {
3630 	if (sk->sk_socket && sk->sk_socket->file)
3631 		if (send_sigurg(sk->sk_socket->file))
3632 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3633 }
3634 EXPORT_SYMBOL(sk_send_sigurg);
3635 
3636 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3637 		    unsigned long expires)
3638 {
3639 	if (!mod_timer(timer, expires))
3640 		sock_hold(sk);
3641 }
3642 EXPORT_SYMBOL(sk_reset_timer);
3643 
3644 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3645 {
3646 	if (timer_delete(timer))
3647 		__sock_put(sk);
3648 }
3649 EXPORT_SYMBOL(sk_stop_timer);
3650 
3651 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3652 {
3653 	if (timer_delete_sync(timer))
3654 		__sock_put(sk);
3655 }
3656 EXPORT_SYMBOL(sk_stop_timer_sync);
3657 
3658 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3659 {
3660 	sk_init_common(sk);
3661 	sk->sk_send_head	=	NULL;
3662 
3663 	timer_setup(&sk->sk_timer, NULL, 0);
3664 
3665 	sk->sk_allocation	=	GFP_KERNEL;
3666 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3667 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3668 	sk->sk_state		=	TCP_CLOSE;
3669 	sk->sk_use_task_frag	=	true;
3670 	sk_set_socket(sk, sock);
3671 
3672 	sock_set_flag(sk, SOCK_ZAPPED);
3673 
3674 	if (sock) {
3675 		sk->sk_type	=	sock->type;
3676 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3677 		sock->sk	=	sk;
3678 	} else {
3679 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3680 	}
3681 	sk->sk_uid	=	uid;
3682 
3683 	sk->sk_state_change	=	sock_def_wakeup;
3684 	sk->sk_data_ready	=	sock_def_readable;
3685 	sk->sk_write_space	=	sock_def_write_space;
3686 	sk->sk_error_report	=	sock_def_error_report;
3687 	sk->sk_destruct		=	sock_def_destruct;
3688 
3689 	sk->sk_frag.page	=	NULL;
3690 	sk->sk_frag.offset	=	0;
3691 	sk->sk_peek_off		=	-1;
3692 
3693 	sk->sk_peer_pid 	=	NULL;
3694 	sk->sk_peer_cred	=	NULL;
3695 	spin_lock_init(&sk->sk_peer_lock);
3696 
3697 	sk->sk_write_pending	=	0;
3698 	sk->sk_rcvlowat		=	1;
3699 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3700 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3701 
3702 	sk->sk_stamp = SK_DEFAULT_STAMP;
3703 #if BITS_PER_LONG==32
3704 	seqlock_init(&sk->sk_stamp_seq);
3705 #endif
3706 	atomic_set(&sk->sk_zckey, 0);
3707 
3708 #ifdef CONFIG_NET_RX_BUSY_POLL
3709 	sk->sk_napi_id		=	0;
3710 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3711 #endif
3712 
3713 	sk->sk_max_pacing_rate = ~0UL;
3714 	sk->sk_pacing_rate = ~0UL;
3715 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3716 	sk->sk_incoming_cpu = -1;
3717 
3718 	sk_rx_queue_clear(sk);
3719 	/*
3720 	 * Before updating sk_refcnt, we must commit prior changes to memory
3721 	 * (Documentation/RCU/rculist_nulls.rst for details)
3722 	 */
3723 	smp_wmb();
3724 	refcount_set(&sk->sk_refcnt, 1);
3725 	atomic_set(&sk->sk_drops, 0);
3726 }
3727 EXPORT_SYMBOL(sock_init_data_uid);
3728 
3729 void sock_init_data(struct socket *sock, struct sock *sk)
3730 {
3731 	kuid_t uid = sock ?
3732 		SOCK_INODE(sock)->i_uid :
3733 		make_kuid(sock_net(sk)->user_ns, 0);
3734 
3735 	sock_init_data_uid(sock, sk, uid);
3736 }
3737 EXPORT_SYMBOL(sock_init_data);
3738 
3739 void lock_sock_nested(struct sock *sk, int subclass)
3740 {
3741 	/* The sk_lock has mutex_lock() semantics here. */
3742 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3743 
3744 	might_sleep();
3745 	spin_lock_bh(&sk->sk_lock.slock);
3746 	if (sock_owned_by_user_nocheck(sk))
3747 		__lock_sock(sk);
3748 	sk->sk_lock.owned = 1;
3749 	spin_unlock_bh(&sk->sk_lock.slock);
3750 }
3751 EXPORT_SYMBOL(lock_sock_nested);
3752 
3753 void release_sock(struct sock *sk)
3754 {
3755 	spin_lock_bh(&sk->sk_lock.slock);
3756 	if (sk->sk_backlog.tail)
3757 		__release_sock(sk);
3758 
3759 	if (sk->sk_prot->release_cb)
3760 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3761 				     tcp_release_cb, sk);
3762 
3763 	sock_release_ownership(sk);
3764 	if (waitqueue_active(&sk->sk_lock.wq))
3765 		wake_up(&sk->sk_lock.wq);
3766 	spin_unlock_bh(&sk->sk_lock.slock);
3767 }
3768 EXPORT_SYMBOL(release_sock);
3769 
3770 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3771 {
3772 	might_sleep();
3773 	spin_lock_bh(&sk->sk_lock.slock);
3774 
3775 	if (!sock_owned_by_user_nocheck(sk)) {
3776 		/*
3777 		 * Fast path return with bottom halves disabled and
3778 		 * sock::sk_lock.slock held.
3779 		 *
3780 		 * The 'mutex' is not contended and holding
3781 		 * sock::sk_lock.slock prevents all other lockers to
3782 		 * proceed so the corresponding unlock_sock_fast() can
3783 		 * avoid the slow path of release_sock() completely and
3784 		 * just release slock.
3785 		 *
3786 		 * From a semantical POV this is equivalent to 'acquiring'
3787 		 * the 'mutex', hence the corresponding lockdep
3788 		 * mutex_release() has to happen in the fast path of
3789 		 * unlock_sock_fast().
3790 		 */
3791 		return false;
3792 	}
3793 
3794 	__lock_sock(sk);
3795 	sk->sk_lock.owned = 1;
3796 	__acquire(&sk->sk_lock.slock);
3797 	spin_unlock_bh(&sk->sk_lock.slock);
3798 	return true;
3799 }
3800 EXPORT_SYMBOL(__lock_sock_fast);
3801 
3802 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3803 		   bool timeval, bool time32)
3804 {
3805 	struct sock *sk = sock->sk;
3806 	struct timespec64 ts;
3807 
3808 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3809 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3810 	if (ts.tv_sec == -1)
3811 		return -ENOENT;
3812 	if (ts.tv_sec == 0) {
3813 		ktime_t kt = ktime_get_real();
3814 		sock_write_timestamp(sk, kt);
3815 		ts = ktime_to_timespec64(kt);
3816 	}
3817 
3818 	if (timeval)
3819 		ts.tv_nsec /= 1000;
3820 
3821 #ifdef CONFIG_COMPAT_32BIT_TIME
3822 	if (time32)
3823 		return put_old_timespec32(&ts, userstamp);
3824 #endif
3825 #ifdef CONFIG_SPARC64
3826 	/* beware of padding in sparc64 timeval */
3827 	if (timeval && !in_compat_syscall()) {
3828 		struct __kernel_old_timeval __user tv = {
3829 			.tv_sec = ts.tv_sec,
3830 			.tv_usec = ts.tv_nsec,
3831 		};
3832 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3833 			return -EFAULT;
3834 		return 0;
3835 	}
3836 #endif
3837 	return put_timespec64(&ts, userstamp);
3838 }
3839 EXPORT_SYMBOL(sock_gettstamp);
3840 
3841 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3842 {
3843 	if (!sock_flag(sk, flag)) {
3844 		unsigned long previous_flags = sk->sk_flags;
3845 
3846 		sock_set_flag(sk, flag);
3847 		/*
3848 		 * we just set one of the two flags which require net
3849 		 * time stamping, but time stamping might have been on
3850 		 * already because of the other one
3851 		 */
3852 		if (sock_needs_netstamp(sk) &&
3853 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3854 			net_enable_timestamp();
3855 	}
3856 }
3857 
3858 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3859 		       int level, int type)
3860 {
3861 	struct sock_exterr_skb *serr;
3862 	struct sk_buff *skb;
3863 	int copied, err;
3864 
3865 	err = -EAGAIN;
3866 	skb = sock_dequeue_err_skb(sk);
3867 	if (skb == NULL)
3868 		goto out;
3869 
3870 	copied = skb->len;
3871 	if (copied > len) {
3872 		msg->msg_flags |= MSG_TRUNC;
3873 		copied = len;
3874 	}
3875 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3876 	if (err)
3877 		goto out_free_skb;
3878 
3879 	sock_recv_timestamp(msg, sk, skb);
3880 
3881 	serr = SKB_EXT_ERR(skb);
3882 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3883 
3884 	msg->msg_flags |= MSG_ERRQUEUE;
3885 	err = copied;
3886 
3887 out_free_skb:
3888 	kfree_skb(skb);
3889 out:
3890 	return err;
3891 }
3892 EXPORT_SYMBOL(sock_recv_errqueue);
3893 
3894 /*
3895  *	Get a socket option on an socket.
3896  *
3897  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3898  *	asynchronous errors should be reported by getsockopt. We assume
3899  *	this means if you specify SO_ERROR (otherwise what is the point of it).
3900  */
3901 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3902 			   char __user *optval, int __user *optlen)
3903 {
3904 	struct sock *sk = sock->sk;
3905 
3906 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3907 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3908 }
3909 EXPORT_SYMBOL(sock_common_getsockopt);
3910 
3911 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3912 			int flags)
3913 {
3914 	struct sock *sk = sock->sk;
3915 	int addr_len = 0;
3916 	int err;
3917 
3918 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3919 	if (err >= 0)
3920 		msg->msg_namelen = addr_len;
3921 	return err;
3922 }
3923 EXPORT_SYMBOL(sock_common_recvmsg);
3924 
3925 /*
3926  *	Set socket options on an inet socket.
3927  */
3928 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3929 			   sockptr_t optval, unsigned int optlen)
3930 {
3931 	struct sock *sk = sock->sk;
3932 
3933 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3934 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3935 }
3936 EXPORT_SYMBOL(sock_common_setsockopt);
3937 
3938 void sk_common_release(struct sock *sk)
3939 {
3940 	if (sk->sk_prot->destroy)
3941 		sk->sk_prot->destroy(sk);
3942 
3943 	/*
3944 	 * Observation: when sk_common_release is called, processes have
3945 	 * no access to socket. But net still has.
3946 	 * Step one, detach it from networking:
3947 	 *
3948 	 * A. Remove from hash tables.
3949 	 */
3950 
3951 	sk->sk_prot->unhash(sk);
3952 
3953 	/*
3954 	 * In this point socket cannot receive new packets, but it is possible
3955 	 * that some packets are in flight because some CPU runs receiver and
3956 	 * did hash table lookup before we unhashed socket. They will achieve
3957 	 * receive queue and will be purged by socket destructor.
3958 	 *
3959 	 * Also we still have packets pending on receive queue and probably,
3960 	 * our own packets waiting in device queues. sock_destroy will drain
3961 	 * receive queue, but transmitted packets will delay socket destruction
3962 	 * until the last reference will be released.
3963 	 */
3964 
3965 	sock_orphan(sk);
3966 
3967 	xfrm_sk_free_policy(sk);
3968 
3969 	sock_put(sk);
3970 }
3971 EXPORT_SYMBOL(sk_common_release);
3972 
3973 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3974 {
3975 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3976 
3977 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3978 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3979 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3980 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3981 	mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
3982 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3983 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3984 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3985 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3986 }
3987 
3988 #ifdef CONFIG_PROC_FS
3989 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3990 
3991 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3992 {
3993 	int cpu, idx = prot->inuse_idx;
3994 	int res = 0;
3995 
3996 	for_each_possible_cpu(cpu)
3997 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3998 
3999 	return res >= 0 ? res : 0;
4000 }
4001 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
4002 
4003 int sock_inuse_get(struct net *net)
4004 {
4005 	int cpu, res = 0;
4006 
4007 	for_each_possible_cpu(cpu)
4008 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
4009 
4010 	return res;
4011 }
4012 
4013 EXPORT_SYMBOL_GPL(sock_inuse_get);
4014 
4015 static int __net_init sock_inuse_init_net(struct net *net)
4016 {
4017 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
4018 	if (net->core.prot_inuse == NULL)
4019 		return -ENOMEM;
4020 	return 0;
4021 }
4022 
4023 static void __net_exit sock_inuse_exit_net(struct net *net)
4024 {
4025 	free_percpu(net->core.prot_inuse);
4026 }
4027 
4028 static struct pernet_operations net_inuse_ops = {
4029 	.init = sock_inuse_init_net,
4030 	.exit = sock_inuse_exit_net,
4031 };
4032 
4033 static __init int net_inuse_init(void)
4034 {
4035 	if (register_pernet_subsys(&net_inuse_ops))
4036 		panic("Cannot initialize net inuse counters");
4037 
4038 	return 0;
4039 }
4040 
4041 core_initcall(net_inuse_init);
4042 
4043 static int assign_proto_idx(struct proto *prot)
4044 {
4045 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
4046 
4047 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
4048 		pr_err("PROTO_INUSE_NR exhausted\n");
4049 		return -ENOSPC;
4050 	}
4051 
4052 	set_bit(prot->inuse_idx, proto_inuse_idx);
4053 	return 0;
4054 }
4055 
4056 static void release_proto_idx(struct proto *prot)
4057 {
4058 	if (prot->inuse_idx != PROTO_INUSE_NR)
4059 		clear_bit(prot->inuse_idx, proto_inuse_idx);
4060 }
4061 #else
4062 static inline int assign_proto_idx(struct proto *prot)
4063 {
4064 	return 0;
4065 }
4066 
4067 static inline void release_proto_idx(struct proto *prot)
4068 {
4069 }
4070 
4071 #endif
4072 
4073 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
4074 {
4075 	if (!twsk_prot)
4076 		return;
4077 	kfree(twsk_prot->twsk_slab_name);
4078 	twsk_prot->twsk_slab_name = NULL;
4079 	kmem_cache_destroy(twsk_prot->twsk_slab);
4080 	twsk_prot->twsk_slab = NULL;
4081 }
4082 
4083 static int tw_prot_init(const struct proto *prot)
4084 {
4085 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
4086 
4087 	if (!twsk_prot)
4088 		return 0;
4089 
4090 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
4091 					      prot->name);
4092 	if (!twsk_prot->twsk_slab_name)
4093 		return -ENOMEM;
4094 
4095 	twsk_prot->twsk_slab =
4096 		kmem_cache_create(twsk_prot->twsk_slab_name,
4097 				  twsk_prot->twsk_obj_size, 0,
4098 				  SLAB_ACCOUNT | prot->slab_flags,
4099 				  NULL);
4100 	if (!twsk_prot->twsk_slab) {
4101 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
4102 			prot->name);
4103 		return -ENOMEM;
4104 	}
4105 
4106 	return 0;
4107 }
4108 
4109 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
4110 {
4111 	if (!rsk_prot)
4112 		return;
4113 	kfree(rsk_prot->slab_name);
4114 	rsk_prot->slab_name = NULL;
4115 	kmem_cache_destroy(rsk_prot->slab);
4116 	rsk_prot->slab = NULL;
4117 }
4118 
4119 static int req_prot_init(const struct proto *prot)
4120 {
4121 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
4122 
4123 	if (!rsk_prot)
4124 		return 0;
4125 
4126 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4127 					prot->name);
4128 	if (!rsk_prot->slab_name)
4129 		return -ENOMEM;
4130 
4131 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4132 					   rsk_prot->obj_size, 0,
4133 					   SLAB_ACCOUNT | prot->slab_flags,
4134 					   NULL);
4135 
4136 	if (!rsk_prot->slab) {
4137 		pr_crit("%s: Can't create request sock SLAB cache!\n",
4138 			prot->name);
4139 		return -ENOMEM;
4140 	}
4141 	return 0;
4142 }
4143 
4144 int proto_register(struct proto *prot, int alloc_slab)
4145 {
4146 	int ret = -ENOBUFS;
4147 
4148 	if (prot->memory_allocated && !prot->sysctl_mem) {
4149 		pr_err("%s: missing sysctl_mem\n", prot->name);
4150 		return -EINVAL;
4151 	}
4152 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4153 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4154 		return -EINVAL;
4155 	}
4156 	if (alloc_slab) {
4157 		prot->slab = kmem_cache_create_usercopy(prot->name,
4158 					prot->obj_size, 0,
4159 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4160 					prot->slab_flags,
4161 					prot->useroffset, prot->usersize,
4162 					NULL);
4163 
4164 		if (prot->slab == NULL) {
4165 			pr_crit("%s: Can't create sock SLAB cache!\n",
4166 				prot->name);
4167 			goto out;
4168 		}
4169 
4170 		if (req_prot_init(prot))
4171 			goto out_free_request_sock_slab;
4172 
4173 		if (tw_prot_init(prot))
4174 			goto out_free_timewait_sock_slab;
4175 	}
4176 
4177 	mutex_lock(&proto_list_mutex);
4178 	ret = assign_proto_idx(prot);
4179 	if (ret) {
4180 		mutex_unlock(&proto_list_mutex);
4181 		goto out_free_timewait_sock_slab;
4182 	}
4183 	list_add(&prot->node, &proto_list);
4184 	mutex_unlock(&proto_list_mutex);
4185 	return ret;
4186 
4187 out_free_timewait_sock_slab:
4188 	if (alloc_slab)
4189 		tw_prot_cleanup(prot->twsk_prot);
4190 out_free_request_sock_slab:
4191 	if (alloc_slab) {
4192 		req_prot_cleanup(prot->rsk_prot);
4193 
4194 		kmem_cache_destroy(prot->slab);
4195 		prot->slab = NULL;
4196 	}
4197 out:
4198 	return ret;
4199 }
4200 EXPORT_SYMBOL(proto_register);
4201 
4202 void proto_unregister(struct proto *prot)
4203 {
4204 	mutex_lock(&proto_list_mutex);
4205 	release_proto_idx(prot);
4206 	list_del(&prot->node);
4207 	mutex_unlock(&proto_list_mutex);
4208 
4209 	kmem_cache_destroy(prot->slab);
4210 	prot->slab = NULL;
4211 
4212 	req_prot_cleanup(prot->rsk_prot);
4213 	tw_prot_cleanup(prot->twsk_prot);
4214 }
4215 EXPORT_SYMBOL(proto_unregister);
4216 
4217 int sock_load_diag_module(int family, int protocol)
4218 {
4219 	if (!protocol) {
4220 		if (!sock_is_registered(family))
4221 			return -ENOENT;
4222 
4223 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4224 				      NETLINK_SOCK_DIAG, family);
4225 	}
4226 
4227 #ifdef CONFIG_INET
4228 	if (family == AF_INET &&
4229 	    protocol != IPPROTO_RAW &&
4230 	    protocol < MAX_INET_PROTOS &&
4231 	    !rcu_access_pointer(inet_protos[protocol]))
4232 		return -ENOENT;
4233 #endif
4234 
4235 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4236 			      NETLINK_SOCK_DIAG, family, protocol);
4237 }
4238 EXPORT_SYMBOL(sock_load_diag_module);
4239 
4240 #ifdef CONFIG_PROC_FS
4241 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4242 	__acquires(proto_list_mutex)
4243 {
4244 	mutex_lock(&proto_list_mutex);
4245 	return seq_list_start_head(&proto_list, *pos);
4246 }
4247 
4248 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4249 {
4250 	return seq_list_next(v, &proto_list, pos);
4251 }
4252 
4253 static void proto_seq_stop(struct seq_file *seq, void *v)
4254 	__releases(proto_list_mutex)
4255 {
4256 	mutex_unlock(&proto_list_mutex);
4257 }
4258 
4259 static char proto_method_implemented(const void *method)
4260 {
4261 	return method == NULL ? 'n' : 'y';
4262 }
4263 static long sock_prot_memory_allocated(struct proto *proto)
4264 {
4265 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4266 }
4267 
4268 static const char *sock_prot_memory_pressure(struct proto *proto)
4269 {
4270 	return proto->memory_pressure != NULL ?
4271 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4272 }
4273 
4274 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4275 {
4276 
4277 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4278 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4279 		   proto->name,
4280 		   proto->obj_size,
4281 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4282 		   sock_prot_memory_allocated(proto),
4283 		   sock_prot_memory_pressure(proto),
4284 		   proto->max_header,
4285 		   proto->slab == NULL ? "no" : "yes",
4286 		   module_name(proto->owner),
4287 		   proto_method_implemented(proto->close),
4288 		   proto_method_implemented(proto->connect),
4289 		   proto_method_implemented(proto->disconnect),
4290 		   proto_method_implemented(proto->accept),
4291 		   proto_method_implemented(proto->ioctl),
4292 		   proto_method_implemented(proto->init),
4293 		   proto_method_implemented(proto->destroy),
4294 		   proto_method_implemented(proto->shutdown),
4295 		   proto_method_implemented(proto->setsockopt),
4296 		   proto_method_implemented(proto->getsockopt),
4297 		   proto_method_implemented(proto->sendmsg),
4298 		   proto_method_implemented(proto->recvmsg),
4299 		   proto_method_implemented(proto->bind),
4300 		   proto_method_implemented(proto->backlog_rcv),
4301 		   proto_method_implemented(proto->hash),
4302 		   proto_method_implemented(proto->unhash),
4303 		   proto_method_implemented(proto->get_port),
4304 		   proto_method_implemented(proto->enter_memory_pressure));
4305 }
4306 
4307 static int proto_seq_show(struct seq_file *seq, void *v)
4308 {
4309 	if (v == &proto_list)
4310 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4311 			   "protocol",
4312 			   "size",
4313 			   "sockets",
4314 			   "memory",
4315 			   "press",
4316 			   "maxhdr",
4317 			   "slab",
4318 			   "module",
4319 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4320 	else
4321 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4322 	return 0;
4323 }
4324 
4325 static const struct seq_operations proto_seq_ops = {
4326 	.start  = proto_seq_start,
4327 	.next   = proto_seq_next,
4328 	.stop   = proto_seq_stop,
4329 	.show   = proto_seq_show,
4330 };
4331 
4332 static __net_init int proto_init_net(struct net *net)
4333 {
4334 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4335 			sizeof(struct seq_net_private)))
4336 		return -ENOMEM;
4337 
4338 	return 0;
4339 }
4340 
4341 static __net_exit void proto_exit_net(struct net *net)
4342 {
4343 	remove_proc_entry("protocols", net->proc_net);
4344 }
4345 
4346 
4347 static __net_initdata struct pernet_operations proto_net_ops = {
4348 	.init = proto_init_net,
4349 	.exit = proto_exit_net,
4350 };
4351 
4352 static int __init proto_init(void)
4353 {
4354 	return register_pernet_subsys(&proto_net_ops);
4355 }
4356 
4357 subsys_initcall(proto_init);
4358 
4359 #endif /* PROC_FS */
4360 
4361 #ifdef CONFIG_NET_RX_BUSY_POLL
4362 bool sk_busy_loop_end(void *p, unsigned long start_time)
4363 {
4364 	struct sock *sk = p;
4365 
4366 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4367 		return true;
4368 
4369 	if (sk_is_udp(sk) &&
4370 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4371 		return true;
4372 
4373 	return sk_busy_loop_timeout(sk, start_time);
4374 }
4375 EXPORT_SYMBOL(sk_busy_loop_end);
4376 #endif /* CONFIG_NET_RX_BUSY_POLL */
4377 
4378 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4379 {
4380 	if (!sk->sk_prot->bind_add)
4381 		return -EOPNOTSUPP;
4382 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4383 }
4384 EXPORT_SYMBOL(sock_bind_add);
4385 
4386 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4387 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4388 		     void __user *arg, void *karg, size_t size)
4389 {
4390 	int ret;
4391 
4392 	if (copy_from_user(karg, arg, size))
4393 		return -EFAULT;
4394 
4395 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4396 	if (ret)
4397 		return ret;
4398 
4399 	if (copy_to_user(arg, karg, size))
4400 		return -EFAULT;
4401 
4402 	return 0;
4403 }
4404 EXPORT_SYMBOL(sock_ioctl_inout);
4405 
4406 /* This is the most common ioctl prep function, where the result (4 bytes) is
4407  * copied back to userspace if the ioctl() returns successfully. No input is
4408  * copied from userspace as input argument.
4409  */
4410 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4411 {
4412 	int ret, karg = 0;
4413 
4414 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4415 	if (ret)
4416 		return ret;
4417 
4418 	return put_user(karg, (int __user *)arg);
4419 }
4420 
4421 /* A wrapper around sock ioctls, which copies the data from userspace
4422  * (depending on the protocol/ioctl), and copies back the result to userspace.
4423  * The main motivation for this function is to pass kernel memory to the
4424  * protocol ioctl callbacks, instead of userspace memory.
4425  */
4426 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4427 {
4428 	int rc = 1;
4429 
4430 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4431 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4432 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4433 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4434 	else if (sk_is_phonet(sk))
4435 		rc = phonet_sk_ioctl(sk, cmd, arg);
4436 
4437 	/* If ioctl was processed, returns its value */
4438 	if (rc <= 0)
4439 		return rc;
4440 
4441 	/* Otherwise call the default handler */
4442 	return sock_ioctl_out(sk, cmd, arg);
4443 }
4444 EXPORT_SYMBOL(sk_ioctl);
4445 
4446 static int __init sock_struct_check(void)
4447 {
4448 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4449 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4450 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4451 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4452 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4453 
4454 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4455 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4456 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4457 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4458 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4459 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4460 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4461 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4462 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4463 
4464 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4465 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4466 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4467 
4468 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4469 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4470 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4471 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4472 
4473 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4474 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4475 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4476 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4477 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4478 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4479 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4480 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4481 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4482 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4483 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4484 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4485 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4486 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4487 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4488 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4489 
4490 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4491 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4492 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4493 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4494 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4495 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4496 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4497 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4498 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4499 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4500 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4501 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4502 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4503 	return 0;
4504 }
4505 
4506 core_initcall(sock_struct_check);
4507